// restaurant_choice.wppl (forked from agentmodels/webppl-agents)
// make sure the code runs both in the browser and on the command line:
var inBrowser = typeof(argv) === 'undefined',
_argv = inBrowser ? {} : argv;
// agent parameters:
var params = extend({
maxLambda: 0.99, // upper bound on local relative aspiration in each step (must be minLambda...1)
minLambda: 0.01, // lower bound on local relative aspiration in each step (must be 0...maxLambda)
lossCoeff4variance: 100.0, // weight of variance in loss function, must be >= 0
lossCoeff4KLdiv: 0.0, // weight of KL divergence in loss function, must be >= 0
lossCoeff4entropy: 1.0, // weight of entropy in loss function, must be >= 0
lossCoeff4random: 0.0, // weight of random noise in loss function, must be >= 0
softmaxInvTemp: 1.0, // inverse temperature of softmax mixture of actions, must be > 0
rescaling4Successors: 1.0, // degree (0...1) of aspiration rescaling. (expectation is only preserved if this is 1.0)
}, _argv);
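// (A hedged usage sketch: when run from the command line, any of the parameters
// above can presumably be overridden via webppl's argv mechanism, e.g.
//   webppl restaurant_choice.wppl -- --minLambda 0.05 --softmaxInvTemp 2.0
// where the flag values are merely illustrative.)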
// simulation options:
var options = extend({
aleph0: [1.2, 1.4], // initial global aspiration interval (desired range of expected return)
debug: false, // if true, print debug messages
}, _argv),
aleph0 = typeof(options.aleph0Lo) === 'undefined' ? options.aleph0 : [options.aleph0Lo, options.aleph0Hi];
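// (Likewise, the aspiration interval can be set via the two scalar flags read above,
// e.g. with the hypothetical values
//   webppl restaurant_choice.wppl -- --aleph0Lo 1.0 --aleph0Hi 1.5
// which would yield aleph0 = [1.0, 1.5].)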
// fold: restaurant constants, tableToIndicatorIncrementFct
var ___ = ' ';
var G1 = { name : 'Donut N' };
var DS = { name : 'Donut S' };
var V = { name : 'Veg' };
var N = { name : 'Noodle' };
// turn a lookup table into an increment function (state, action) -> increment:
var tableToIndicatorIncrementFct = function(table, feature) {
return function(state, action) {
// the state's feature is either a restaurant object (looked up by name)
// or a plain road character (looked up via its first character):
var f = feature(state),
stateFeatureName = f.name,
inc = stateFeatureName ? table[stateFeatureName] : table[f['0']];
return inc;
};
};
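// (Illustration with hypothetical states vegState, whose feature is the Veg
// restaurant object, and roadState, whose feature is the road character 's',
// using the feature function defined further below:
//   var inc = tableToIndicatorIncrementFct({'Veg': 3, 's': -0.1}, feature);
//   inc(vegState, 'l');  // --> 3    (looked up via f.name)
//   inc(roadState, 'l'); // --> -0.1 (looked up via the character itself)
// )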
//
// Construct world
var grid = [
['#', '#', '#', '#', V , '#'],
['#', '#', '#', ___, ___, ___],
['#', '#', G1 , 's', '#', ___],
['#', '#', '#', 's', '#', ___],
['#', '#', '#', ___, ___, ___],
['#', '#', '#', ___, '#', N ],
[___, ___, ___, ___, '#', '#'],
[DS , '#', '#', ___, '#', '#']
];
var mdp = makeGridWorldMDP({
grid,
start: [3, 1],
totalTime: 9
});
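// (Note: following the agentmodels gridworld convention, start is presumably
// [x, y] with [0, 0] at the bottom-left cell of the grid.)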
var world = mdp.world;
var transition = world.transition;
var stateToActions = world.stateToActions;
// Specify distribution of indicator increments (via expected value and variance)
// (Note: under a "maximization" paradigm, this would be called "reward" or "utility")
var expectedIndicatorIncrementTable = {
'Donut S': 1,
'Donut N': 1,
'Veg': 3,
'Noodle': 2,
's': -0.1, // sand roads take time
' ': -0.1
};
var varianceOfIndicatorIncrementTable = {
'Donut S': 0,
'Donut N': 0,
'Veg': 0,
'Noodle': 0,
's': 0, // sand roads have a deterministic cost
' ': 0.01 // normal roads might have traffic jams
};
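// (Worked example: the per-step variance of 0.01 on normal roads amounts to a
// standard deviation of sqrt(0.01) = 0.1 per step around the expected -0.1.)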
var feature = world.feature; // returns a restaurant object or a road character
var expectedDelta = tableToIndicatorIncrementFct(expectedIndicatorIncrementTable, feature),
varianceOfDelta = tableToIndicatorIncrementFct(varianceOfIndicatorIncrementTable, feature);
// uninformed policy, used in the Shannon entropy term:
var uninformedPolicy = function(state) {
return Categorical({vs: ["u", "d", "l", "r"], ps: [0.25, 0.25, 0.25, 0.25]});
};
// (Note: when an action a is refined into variants a' and a'', the probability of a
// under uninformedPolicy should be split additively between a' and a'', since that
// leaves behavior invariant. In particular, cloning an action and splitting its
// uninformedPolicy probability in any way should not change the agent's behavior.)
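// (A minimal sketch of that invariance, with a hypothetical refinement of 'u'
// into variants 'u1' and 'u2' splitting its probability 0.25 additively:
//   var refinedUninformedPolicy = function(state) {
//     return Categorical({vs: ["u1", "u2", "d", "l", "r"],
//                         ps: [0.125, 0.125, 0.25, 0.25, 0.25]});
//   };
// )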
// Reference policy for use in KL divergence
// (can be used to steer agent towards certain actions):
var refPolicy = function(state) {
return Categorical({vs: ["u", "d", "l", "r"], ps: [0.7, 0.1, 0.1, 0.1]});
};
// initialize the agent
var agent = makeMDPAgentSatisfia(extend(params, {
expectedIndicatorIncrement: expectedDelta, varianceOfIndicatorIncrement: varianceOfDelta,
uninformedPolicy, refPolicy,
options
}), world);
// extract its methods:
var getLocalPolicy = agent.getLocalPolicy, propagateAspiration = agent.propagateAspiration,
getAspiration4state = agent.getAspiration4state,
V = agent.V, V2 = agent.V2,
entropy = agent.entropy, KLdiv = agent.KLdiv;
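// (Note: V here shadows the Veg restaurant constant defined above; this is safe
// because the grid has already captured that object. getLocalPolicy(state, aleph)
// returns a distribution over actions that sample() can draw from, as used below.)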
// Recursively simulate a trajectory from a state and an aspiration:
var simulate = function(state, aleph) {
if (options.debug) console.log("simulate", state, aleph);
var localPolicy = getLocalPolicy(state, aleph),
action = sample(localPolicy),
r = expectedDelta(state, action),
r2 = squared(r) + varianceOfDelta(state, action);
var sa = [state, action];
if (state.terminateAfterAction) {
return {
trajectory: [sa], // sequence of [state, action] pairs
conditionalExpectedIndicator: r, // expected indicator conditional on this trajectory
conditionalExpectedSquaredIndicator: r2 // expected squared indicator conditional on this trajectory
};
} else {
var nextState = transition(state, action),
nextAleph = propagateAspiration(state, aleph, action, r, nextState),
nextOut = simulate(nextState, nextAleph);
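// The squared-indicator recursion below uses, conditional on the trajectory,
//   E[(Delta + X)^2] = E[Delta^2] + 2 E[Delta] E[X] + E[X^2]
//                    = r2 + 2*r*E[X] + E[X^2],
// assuming this step's increment Delta and the future return X are independent
// given the trajectory (r = E[Delta], r2 = E[Delta^2] as computed above).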
return {
trajectory: [sa].concat(nextOut.trajectory),
conditionalExpectedIndicator: r + nextOut.conditionalExpectedIndicator,
conditionalExpectedSquaredIndicator: r2 + 2*r*nextOut.conditionalExpectedIndicator + nextOut.conditionalExpectedSquaredIndicator
};
}
};
// calculate the expected return and its std. dev. using the built-in expectation function:
var expectedIndicator = expectation(Infer({ model() {
return simulate(mdp.startState, aleph0).conditionalExpectedIndicator;
}}));
console.log("in expectation, we desired indicator", aleph0, "and actually get", expectedIndicator);
var expectedSquaredIndicator = expectation(Infer({ model() {
return simulate(mdp.startState, aleph0).conditionalExpectedSquaredIndicator;
}})),
stddev = Math.sqrt(expectedSquaredIndicator - Math.pow(expectedIndicator,2));
console.log("indicator has a std.dev. of", stddev,
"which should equal", Math.sqrt(V2(mdp.startState, aleph0) - Math.pow(V(mdp.startState, aleph0),2)));
console.log("Entropy of policy is", entropy(mdp.startState, aleph0));
console.log("KL divergence from reference policy is", KLdiv(mdp.startState, aleph0));
if (inBrowser) {
// simulate and show a single trajectory:
var out = simulate(mdp.startState, aleph0);
viz.gridworld(world, { trajectory: map(first, out.trajectory) });
}
if (false) { // flip to true to append results to a JSON file
// (this presumably requires running webppl with --require webppl-json)
// filename for the JSON output:
var filename = 'output_rotateminlambda.json';
// read existing JSON data from the file (if it exists):
var existingData = json.read(filename) || [];
// collect the quantities of interest into a single record:
var data = {
expectedIndicator: expectedIndicator,
aleph0: aleph0,
minLambda: params.minLambda
};
// append the new record and write the updated data back to the JSON file
// (concat rather than push, since WebPPL discourages mutation):
json.write(filename, existingData.concat([data]));
}