// simple_mdp.wppl (forked from agentmodels/webppl-agents)
// run via webppl --require webppl-dp --require . simple_mdp.wppl
// find out about environment:
var env = getEnv();
// agent parameters:
var params = extend({
// some env-specific default parameter settings
}, env.argv);
// simulation options:
var options = extend({
pPotHiEasy: 0.99, // probability of potting a ball if in easy position and focusing on potting
pPotMidEasy: 0.97, // probability of potting a ball if in easy position and not focusing on anything
pPotLoEasy: 0.96, // probability of potting a ball if in easy position and focusing on continuation
pPotHiHard: 0.90, // probability of potting a ball if in hard position and focusing on potting
pPotMidHard: 0.86, // probability of potting a ball if in hard position and not focusing on anything
pPotLoHard: 0.84, // probability of potting a ball if in hard position and focusing on continuation
pEasyHiEasy: 0.90, // probability of getting an easy position if in easy position and focusing on continuation
pEasyMidEasy: 0.80, // probability of getting an easy position if in easy position and not focusing on anything
pEasyLoEasy: 0.75, // probability of getting an easy position if in easy position and focusing on potting
pEasyHiHard: 0.80, // probability of getting an easy position if in hard position and focusing on continuation
pEasyMidHard: 0.70, // probability of getting an easy position if in hard position and not focusing on anything
pEasyLoHard: 0.65, // probability of getting an easy position if in hard position and focusing on potting
aleph0: [0.8, 0.9], // initial global aspiration interval (desired expected return)
debug: false, // if true, print debug messages
}, env.argv),
aleph0 = typeof(options.aleph0Lo) === 'undefined' ? options.aleph0 : [options.aleph0Lo, options.aleph0Hi]; // flags aleph0Lo/aleph0Hi, if given, override the default aspiration interval
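// Example invocation (a sketch; getEnv() is assumed to come from one of the
// required packages and to expose parsed command-line flags as env.argv, so
// the exact flag syntax below is an assumption):
//   webppl --require webppl-dp --require . simple_mdp.wppl --aleph0Lo 0.7 --aleph0Hi 0.8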
// hand-coded world:
var startState = {
code: [1, 1, 1, 1, 0], // [match, frame, visit, ball, position], where position is 0 (hard to pot) or 1 (easy to pot)
terminateAfterAction: false
};
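// the same three action options are available in every state: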
var stateToActions = function(state) {
return [
'.', // don't try particularly hard
'p', // focus on potting
'c' // focus on getting a good position for continuation
];
};
var transition = function(state, action) {
var code = state.code,
match = code[0], // matches 1+2 are the group phase, 3 is the semifinals, 4 is the finals
frame = code[1], // a match consists of at most 2 frames
visit = code[2], // a frame consists of at most 2 visits
ball = code[3], // in each visit, the player first attempts to pot a red (1) and then the black (2)
easy = (code[4]==1);
var potted = sample(Bernoulli({
p: action == 'p' ? (easy ? options.pPotHiEasy : options.pPotHiHard) // if focusing on potting, potting is more likely
: action == '.' ? (easy ? options.pPotMidEasy : options.pPotMidHard) // if not focusing on anything, potting is less likely
: (easy ? options.pPotLoEasy : options.pPotLoHard) // if focusing on continuation, potting is least likely
}));
if (!potted) {
// after failing to pot some ball in the 1st visit, the 2nd visit starts:
if (visit == 1) return { code: [match, frame, 2, 1, 0], terminateAfterAction: false };
// after failing to pot some ball in the 2nd visit of some frame of the 1st match of the group phase,
// that frame and thus that match is lost and the 2nd match starts:
if ((match == 1) && (visit == 2)) return { code: [2, 1, 1, 1, 0], terminateAfterAction: false };
// after failing to pot some ball in the 2nd visit of some frame of the 2nd match of the group phase
// or of the semifinals or finals, that match is lost as well and the player exits the tournament:
return { code: 'lost', terminateAfterAction: true };
} else {
// after potting the 1st ball in some visit, the 2nd ball can be attempted:
if (ball == 1) {
var nextEasy = sample(Bernoulli({
p: action == 'c' ? (easy ? options.pEasyHiEasy : options.pEasyHiHard) // if focusing on continuation, the 2nd ball is more likely to be easy
: action == '.' ? (easy ? options.pEasyMidEasy : options.pEasyMidHard) // if not focusing on anything, the 2nd ball is less likely to be easy
: (easy ? options.pEasyLoEasy : options.pEasyLoHard) // if focusing on potting, the 2nd ball is least likely to be easy
}));
return { code: [match, frame, visit, 2, nextEasy ? 1 : 0], terminateAfterAction: false };
}
// after potting the 2nd ball in some visit in the 1st frame of a match,
// that frame is won and the player made it into the 2nd frame:
if ((frame == 1) && (ball == 2)) return { code: [match, 2, 1, 1, 0], terminateAfterAction: false };
// after winning the semifinals, the player made it into the finals:
if ((match == 3) && (frame == 2) && (ball == 2)) return { code: [4, 1, 1, 1, 0], terminateAfterAction: false };
// after winning the finals, the tournament is won:
if ((match == 4) && (frame == 2) && (ball == 2)) return { code: 'won', terminateAfterAction: true };
// after winning the 2nd frame in some match of the group phase (match 2 is only
// played after losing match 1, so winning either group match suffices),
// that match is won and the player made it into the semifinals:
return { code: [3, 1, 1, 1, 0], terminateAfterAction: false };
}
}
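// Worked example (one possible stochastic outcome): from the start state
// [1, 1, 1, 1, 0] (match 1, frame 1, visit 1, red ball, hard position),
// action 'p' pots the red with probability pPotHiHard = 0.90; if the
// follow-up position then turns out easy (probability pEasyLoHard = 0.65),
// the next state code is [1, 1, 1, 2, 1] and the player attempts the black.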
var expectedDelta = function(state, action) {
return state.code == 'won' ? 1 : 0;
};
var varianceOfDelta = function(state, action) {
return 0;
};
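// Since delta is 1 exactly in the terminal 'won' state and 0 everywhere else,
// and is deterministic given the state, the total return of a trajectory is
// the indicator of winning the tournament, and varianceOfDelta is 0.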
var world = { stateToActions, transition }, mdp = { world, startState };
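// reference policy against which the agent's KL divergence is measured (see behaviorKLdiv_state below):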
var refPolicy = function(state) {
return Categorical({vs: ['.', 'p', 'c'], ps: [.8, .1, .1]});
};
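// uninformed policy: uniform over the three actions (Categorical accepts
// unnormalized ps, so [1, 1, 1] means equal probabilities):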
var uninformedPolicy = function(state) {
return Categorical({vs: ['.', 'p', 'c'], ps: [1, 1, 1]});
};
// initialize the agent
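// (makeMDPAgentSatisfia is assumed to be provided by one of the packages loaded via --require):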
var agent = makeMDPAgentSatisfia(extend(params, {
expectedDelta: expectedDelta, varianceOfDelta: varianceOfDelta,
uninformedPolicy,
refPolicy,
options
}), world);
// extract its methods:
var localPolicy = agent.localPolicy, propagateAspiration = agent.propagateAspiration,
aspiration4state = agent.aspiration4state,
V = agent.V, V2 = agent.V2,
behaviorEntropy_state = agent.behaviorEntropy_state, behaviorKLdiv_state = agent.behaviorKLdiv_state;
// Generate and draw a trajectory:
var simulate = function(state, aleph) {
if (options.debug) console.log("simulate", state, aleph);
var pol = localPolicy(state, aleph), // local name chosen to avoid shadowing the agent's localPolicy method
action = sample(pol),
r = expectedDelta(state, action),
r2 = squared(r) + varianceOfDelta(state, action);
var sa = [state, action];
if (state.terminateAfterAction) {
if (options.debug) console.log("...terminal state");
return {
trajectory: [sa], // sequence of [state, action] pairs
conditionalExpectedIndicator: r, // expected indicator conditional on this trajectory
conditionalExpectedSquaredIndicator: r2 // expected squared indicator conditional on this trajectory
};
} else {
var nextState = transition(state, action),
nextAleph = propagateAspiration(state, aleph, action, r, nextState),
nextOut = simulate(nextState, nextAleph);
return {
trajectory: [sa].concat(nextOut.trajectory),
conditionalExpectedIndicator: r + nextOut.conditionalExpectedIndicator,
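// second moment via the identity (delta + G)^2 = delta^2 + 2*delta*G + G^2,
// taken in expectation conditional on this trajectory: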
conditionalExpectedSquaredIndicator: r2 + 2*r*nextOut.conditionalExpectedIndicator + nextOut.conditionalExpectedSquaredIndicator
};
}
};
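// The markers below act as comment toggles: with a space before the final
// slash, a marker opens a block comment that disables everything up to the
// next closing marker; deleting the space turns it into an empty comment and
// enables the block. Note the stddev block depends on expectedIndicator from
// the block before it, so enable both together.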
/** /
// calculate expected return and return std.dev. by using the inbuilt expectation function:
var expectedIndicator = expectation(Infer({ model() {
return simulate(mdp.startState, aleph0).conditionalExpectedIndicator;
}}));
console.log("in expectation, we desired indicator", aleph0, "and actually get", expectedIndicator);
/** /
var expectedSquaredIndicator = expectation(Infer({ model() {
return simulate(mdp.startState, aleph0).conditionalExpectedSquaredIndicator;
}})),
stddev = Math.sqrt(expectedSquaredIndicator - Math.pow(expectedIndicator,2));
console.log("indicator has a std.dev. of", stddev,
"which should equal", Math.sqrt(V2(mdp.startState, aleph0) - Math.pow(V(mdp.startState, aleph0),2)));
/**/
console.log("Entropy of policy is", behaviorEntropy_state(mdp.startState, aleph0));
console.log("KL divergence from reference policy is", behaviorKLdiv_state(mdp.startState, aleph0));
/**/
// simulate and show a single trajectory:
var out = simulate(mdp.startState, aleph0);
console.log("trajectory", out.trajectory);
var lpd = agent.localPolicyData;
console.log(lpd, lpd.cache);