// simple_mdp.wppl (forked from agentmodels/webppl-agents)
// run via webppl --require webppl-dp --require . simple_mdp.wppl
// find out about environment:
var env = getEnv();
// agent parameters:
var params = extend({
// some env-specific default parameter settings
}, env.argv);
// simulation options:
var options = extend({
pPotHiEasy: 0.99, // probability of potting a ball if in easy position and focusing on potting
pPotMidEasy: 0.97, // probability of potting a ball if in easy position and not focusing on anything
pPotLoEasy: 0.96, // probability of potting a ball if in easy position and focusing on continuation
pPotHiHard: 0.90, // probability of potting a ball if in hard position and focusing on potting
pPotMidHard: 0.86, // probability of potting a ball if in hard position and not focusing on anything
pPotLoHard: 0.84, // probability of potting a ball if in hard position and focusing on continuation
pEasyHiEasy: 0.90, // probability of getting an easy position if in easy position and focusing on continuation
pEasyMidEasy: 0.80, // probability of getting an easy position if in easy position and not focusing on anything
pEasyLoEasy: 0.75, // probability of getting an easy position if in easy position and focusing on potting
pEasyHiHard: 0.80, // probability of getting an easy position if in hard position and focusing on continuation
pEasyMidHard: 0.70, // probability of getting an easy position if in hard position and not focusing on anything
pEasyLoHard: 0.65, // probability of getting an easy position if in hard position and focusing on potting
aleph0: [0.8, 0.9], // initial global aspiration interval (desired expected return)
debug: false, // if true, print debug messages
}, env.argv),
aleph0 = typeof(options.aleph0Lo) === 'undefined' ? options.aleph0 : [options.aleph0Lo, options.aleph0Hi]; // flags aleph0Lo/aleph0Hi, if given, override the default aspiration interval
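// Example invocation (a sketch; getEnv() is assumed to come from one of the
// required packages and to expose parsed command-line flags as env.argv, so
// the exact flag syntax below is an assumption):
//   webppl --require webppl-dp --require . simple_mdp.wppl --aleph0Lo 0.7 --aleph0Hi 0.8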
// hand-coded world:
var startState = {
code: [1, 1, 1, 1, 0], // [match, frame, visit, ball, position], where position is 0 (hard to pot) or 1 (easy to pot)
terminateAfterAction: false
};
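// the same three action options are available in every state: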
var stateToActions = function(state) {
return [
'.', // don't try particularly hard
'p', // focus on potting
'c' // focus on getting a good position for continuation
];
};
var transition = function(state, action) {
var code = state.code,
match = code[0], // matches 1+2 are the group phase, 3 is the semifinals, 4 is the finals
frame = code[1], // a match consists of at most 2 frames
visit = code[2], // a frame consists of at most 2 visits
ball = code[3], // in each visit, the player first attempts to pot a red (1) and then the black (2)
easy = (code[4]==1);
var potted = sample(Bernoulli({
p: action == 'p' ? (easy ? options.pPotHiEasy : options.pPotHiHard) // if focusing on potting, potting is more likely
: action == '.' ? (easy ? options.pPotMidEasy : options.pPotMidHard) // if not focusing on anything, potting is less likely
: (easy ? options.pPotLoEasy : options.pPotLoHard) // if focusing on continuation, potting is least likely
}));
if (!potted) {
// after failing to pot some ball in the 1st visit, the 2nd visit starts:
if (visit == 1) return { code: [match, frame, 2, 1, 0], terminateAfterAction: false };
// after failing to pot some ball in the 2nd visit of some frame of the 1st match of the group phase,
// that frame and thus that match is lost and the 2nd match starts:
if ((match == 1) && (visit == 2)) return { code: [2, 1, 1, 1, 0], terminateAfterAction: false };
// after failing to pot some ball in the 2nd visit of some frame of the 2nd match of the group phase
// or of the semifinals or finals, that match is lost as well and the player exits the tournament:
return { code: 'lost', terminateAfterAction: true };
} else {
// after potting the 1st ball in some visit, the 2nd ball can be attempted:
if (ball == 1) {
var nextEasy = sample(Bernoulli({
p: action == 'c' ? (easy ? options.pEasyHiEasy : options.pEasyHiHard) // if focusing on continuation, the 2nd ball is more likely to be easy
: action == '.' ? (easy ? options.pEasyMidEasy : options.pEasyMidHard) // if not focusing on anything, the 2nd ball is less likely to be easy
: (easy ? options.pEasyLoEasy : options.pEasyLoHard) // if focusing on potting, the 2nd ball is least likely to be easy
}));
return { code: [match, frame, visit, 2, nextEasy ? 1 : 0], terminateAfterAction: false };
}
// after potting the 2nd ball in some visit in the 1st frame of a match,
// that frame is won and the player made it into the 2nd frame:
if ((frame == 1) && (ball == 2)) return { code: [match, 2, 1, 1, 0], terminateAfterAction: false };
// after winning the semifinals, the player made it into the finals:
if ((match == 3) && (frame == 2) && (ball == 2)) return { code: [4, 1, 1, 1, 0], terminateAfterAction: false };
// after winning the finals, the tournament is won:
if ((match == 4) && (frame == 2) && (ball == 2)) return { code: 'won', terminateAfterAction: true };
// after winning the 2nd frame in some match of the group phase (match 2 is only
// played after losing match 1, so winning either group match suffices),
// that match is won and the player made it into the semifinals:
return { code: [3, 1, 1, 1, 0], terminateAfterAction: false };
}
}
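// Worked example (one possible stochastic outcome): from the start state
// [1, 1, 1, 1, 0] (match 1, frame 1, visit 1, red ball, hard position),
// action 'p' pots the red with probability pPotHiHard = 0.90; if the
// follow-up position then turns out easy (probability pEasyLoHard = 0.65),
// the next state code is [1, 1, 1, 2, 1] and the player attempts the black.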
var expectedDelta = function(state, action) {
return state.code == 'won' ? 1 : 0;
};
var varianceOfDelta = function(state, action) {
return 0;
};
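// Since delta is 1 exactly in the terminal 'won' state and 0 everywhere else,
// and is deterministic given the state, the total return of a trajectory is
// the indicator of winning the tournament, and varianceOfDelta is 0.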
var world = { stateToActions, transition }, mdp = { world, startState };
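// reference policy against which the agent's KL divergence is measured (see behaviorKLdiv_state below):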
var refPolicy = function(state) {
return Categorical({vs: ['.', 'p', 'c'], ps: [.8, .1, .1]});
};
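// uninformed policy: uniform over the three actions (Categorical accepts
// unnormalized ps, so [1, 1, 1] means equal probabilities):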
var uninformedPolicy = function(state) {
return Categorical({vs: ['.', 'p', 'c'], ps: [1, 1, 1]});
};
// initialize the agent
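// (makeMDPAgentSatisfia is assumed to be provided by one of the packages loaded via --require):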
var agent = makeMDPAgentSatisfia(extend(params, {
expectedDelta: expectedDelta, varianceOfDelta: varianceOfDelta,
uninformedPolicy,
refPolicy,
options
}), world);
// extract its methods:
var localPolicy = agent.localPolicy, propagateAspiration = agent.propagateAspiration,
aspiration4state = agent.aspiration4state,
V = agent.V, V2 = agent.V2,
behaviorEntropy_state = agent.behaviorEntropy_state, behaviorKLdiv_state = agent.behaviorKLdiv_state;
// Generate and draw a trajectory:
var simulate = function(state, aleph) {
if (options.debug) console.log("simulate", state, aleph);
var pol = localPolicy(state, aleph), // local name chosen to avoid shadowing the agent's localPolicy method
action = sample(pol),
r = expectedDelta(state, action),
r2 = squared(r) + varianceOfDelta(state, action);
var sa = [state, action];
if (state.terminateAfterAction) {
if (options.debug) console.log("...terminal state");
return {
trajectory: [sa], // sequence of [state, action] pairs
conditionalExpectedIndicator: r, // expected indicator conditional on this trajectory
conditionalExpectedSquaredIndicator: r2 // expected squared indicator conditional on this trajectory
};
} else {
var nextState = transition(state, action),
nextAleph = propagateAspiration(state, aleph, action, r, nextState),
nextOut = simulate(nextState, nextAleph);
return {
trajectory: [sa].concat(nextOut.trajectory),
conditionalExpectedIndicator: r + nextOut.conditionalExpectedIndicator,
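// second moment via the identity (delta + G)^2 = delta^2 + 2*delta*G + G^2,
// taken in expectation conditional on this trajectory: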
conditionalExpectedSquaredIndicator: r2 + 2*r*nextOut.conditionalExpectedIndicator + nextOut.conditionalExpectedSquaredIndicator
};
}
};
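// The markers below act as comment toggles: with a space before the final
// slash, a marker opens a block comment that disables everything up to the
// next closing marker; deleting the space turns it into an empty comment and
// enables the block. Note the stddev block depends on expectedIndicator from
// the block before it, so enable both together.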
/** /
// calculate expected return and return std.dev. by using the inbuilt expectation function:
var expectedIndicator = expectation(Infer({ model() {
return simulate(mdp.startState, aleph0).conditionalExpectedIndicator;
}}));
console.log("in expectation, we desired indicator", aleph0, "and actually get", expectedIndicator);
/** /
var expectedSquaredIndicator = expectation(Infer({ model() {
return simulate(mdp.startState, aleph0).conditionalExpectedSquaredIndicator;
}})),
stddev = Math.sqrt(expectedSquaredIndicator - Math.pow(expectedIndicator,2));
console.log("indicator has a std.dev. of", stddev,
"which should equal", Math.sqrt(V2(mdp.startState, aleph0) - Math.pow(V(mdp.startState, aleph0),2)));
/**/
console.log("Entropy of policy is", behaviorEntropy_state(mdp.startState, aleph0));
console.log("KL divergence from reference policy is", behaviorKLdiv_state(mdp.startState, aleph0));
/**/
// simulate and show a single trajectory:
var out = simulate(mdp.startState, aleph0);
console.log("trajectory", out.trajectory);
var lpd = agent.localPolicyData;
console.log(lpd, lpd.cache);