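"""Step 6.2: pricing in a non-stationary environment.

Compares UCB1, SW-UCB, CD-UCB (CUSUM-based change detection), and EXP3
against a clairvoyant baseline over a horizon of T rounds, averaging
instantaneous reward and regret across n_experiments independent runs.
"""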
import warnings

import matplotlib.pyplot as plt
import numpy as np
from tqdm.contrib.concurrent import process_map

from environments.NSClairvoyantAlgorithm import NSClairvoyantAlgorithm
from environments.NSEnvironment import NSEnvironment
from learners.CDUCBLearner import CDUCBLearner
from learners.EXP3Learner import EXP3Learner
from learners.SWUCB1Learner import SWUCB1Learner
from learners.UCB1Learner import UCB1Learner
from utils import plot_statistics

warnings.filterwarnings(action='ignore', category=FutureWarning)

# Simulation parameters
T = 365  # n.b.: T must match the horizon parameter in the JSON file
n_experiments = 5000
window_size = 100  # sliding-window length for SW-UCB

# CUSUM (change-detection) parameters
M = 50
eps = 0.5
h = 40
alpha = 0.1

# EXP3 parameter
gamma = 0.1
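# A note on the parameters above, assuming CDUCBLearner and EXP3Learner
# follow the standard CUSUM-UCB and EXP3 formulations:
#   M     - initial samples per arm used to estimate the reference mean
#   eps   - tolerance around the reference mean before deviations accumulate
#   h     - CUSUM threshold: a change is flagged when the cumulative deviation exceeds h
#   alpha - probability of pulling a uniformly random arm (forced exploration)
#   gamma - fraction of probability mass EXP3 spreads uniformly across arms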


def run_experiment(_):
    # For every experiment, create a fresh environment and fresh learners
    env = NSEnvironment.from_json('data/NS_environment_step6_2.json')

    # Clairvoyant
    clairvoyant = NSClairvoyantAlgorithm(env)

    # Learners
    ucb1_learner = UCB1Learner(len(env.prices))
    sw_ucb_learner = SWUCB1Learner(len(env.prices), windows_size=window_size)
    cd_ucb_learner = CDUCBLearner(len(env.prices), M, eps, h, alpha)
    exp3_learner = EXP3Learner(len(env.prices), gamma)

    # Data structures
    instantaneous_reward_clairvoyant = np.zeros(T)
    instantaneous_reward_exp3 = np.zeros(T)
    instantaneous_reward_ucb1 = np.zeros(T)
    instantaneous_reward_sw_ucb = np.zeros(T)
    instantaneous_reward_cd_ucb = np.zeros(T)

    instantaneous_regret_clairvoyant = np.zeros(T)
    instantaneous_regret_exp3 = np.zeros(T)
    instantaneous_regret_ucb1 = np.zeros(T)
    instantaneous_regret_sw_ucb = np.zeros(T)
    instantaneous_regret_cd_ucb = np.zeros(T)

    for t in range(T):
        # Clairvoyant algorithm
        current_phase = (t // env.phase_length) % env.n_phases
        opt_reward = clairvoyant.optimal_rewards[current_phase]
        opt_bid_id = clairvoyant.optimal_bids_id[current_phase]
        instantaneous_reward_clairvoyant[t] = opt_reward
        instantaneous_regret_clairvoyant[t] = 0
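
        # Each learner below follows the same pattern: pull an arm (a price),
        # observe a Bernoulli conversion from the environment, update the
        # learner with conversion * price, then score the round with
        # compute_reward under the clairvoyant's optimal bid. The learners
        # only choose the price; the bid is fixed to the per-phase optimum.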

        # EXP3 learner
        pulled_arm = exp3_learner.pull_arm()
        bernoulli_reward = env.round(pulled_arm, exp3_learner.t)
        exp3_learner.update(pulled_arm, float(bernoulli_reward * env.prices[pulled_arm]))
        total_reward = env.compute_reward(pulled_arm, opt_bid_id, user_class=0, phase=current_phase)
        instantaneous_reward_exp3[t] = total_reward
        regret = opt_reward - total_reward
        instantaneous_regret_exp3[t] = regret

        # UCB1 learner
        pulled_arm = ucb1_learner.pull_arm()
        bernoulli_reward = env.round(pulled_arm, ucb1_learner.t)
        ucb1_learner.update(pulled_arm, float(bernoulli_reward * env.prices[pulled_arm]))
        total_reward = env.compute_reward(pulled_arm, opt_bid_id, user_class=0, phase=current_phase)
        instantaneous_reward_ucb1[t] = total_reward
        regret = opt_reward - total_reward
        instantaneous_regret_ucb1[t] = regret

        # SW-UCB learner
        pulled_arm = sw_ucb_learner.pull_arm()
        bernoulli_reward = env.round(pulled_arm, sw_ucb_learner.t)
        sw_ucb_learner.update(pulled_arm, float(bernoulli_reward * env.prices[pulled_arm]))
        total_reward = env.compute_reward(pulled_arm, opt_bid_id, user_class=0, phase=current_phase)
        instantaneous_reward_sw_ucb[t] = total_reward
        regret = opt_reward - total_reward
        instantaneous_regret_sw_ucb[t] = regret

        # CD-UCB learner
        pulled_arm = cd_ucb_learner.pull_arm()
        bernoulli_reward = env.round(pulled_arm, cd_ucb_learner.t)
        cd_ucb_learner.update(pulled_arm, float(bernoulli_reward * env.prices[pulled_arm]))
        total_reward = env.compute_reward(pulled_arm, opt_bid_id, user_class=0, phase=current_phase)
        instantaneous_reward_cd_ucb[t] = total_reward
        regret = opt_reward - total_reward
        instantaneous_regret_cd_ucb[t] = regret

    # n.b.: the tuple order here must match the index-extraction order in __main__
    return instantaneous_reward_clairvoyant, instantaneous_reward_exp3, instantaneous_reward_ucb1, \
        instantaneous_reward_sw_ucb, instantaneous_reward_cd_ucb, instantaneous_regret_clairvoyant, \
        instantaneous_regret_exp3, instantaneous_regret_ucb1, instantaneous_regret_sw_ucb, instantaneous_regret_cd_ucb


if __name__ == '__main__':
    # Run the experiments in parallel
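    # process_map (from tqdm.contrib.concurrent) maps run_experiment over a
    # pool of worker processes and shows a progress bar; chunksize=1 keeps
    # the bar granular at the cost of some inter-process overhead.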
    results_list = process_map(run_experiment, range(n_experiments), max_workers=10, chunksize=1)

    # Array of shape (n_experiments, n_learners * 2, T)
    results_array = np.array(results_list)

    # Extract the results into multiple arrays of shape (n_experiments, T)
    inst_reward_clairvoyant = results_array[:, 0, :]
    inst_reward_exp3 = results_array[:, 1, :]
    inst_reward_ucb1 = results_array[:, 2, :]
    inst_reward_sw_ucb = results_array[:, 3, :]
    inst_reward_cd_ucb = results_array[:, 4, :]
    inst_regret_clairvoyant = results_array[:, 5, :]
    inst_regret_exp3 = results_array[:, 6, :]
    inst_regret_ucb1 = results_array[:, 7, :]
    inst_regret_sw_ucb = results_array[:, 8, :]
    inst_regret_cd_ucb = results_array[:, 9, :]

    # Generate plots of the mean and standard deviation of the results
    plot_statistics(inst_reward_clairvoyant, inst_regret_clairvoyant, 'Clairvoyant', 'Step 6.2')
    plot_statistics(inst_reward_exp3, inst_regret_exp3, 'EXP3', 'Step 6.2')
    plot_statistics(inst_reward_ucb1, inst_regret_ucb1, 'UCB1', 'Step 6.2')
    plot_statistics(inst_reward_sw_ucb, inst_regret_sw_ucb, 'SW-UCB', 'Step 6.2')
    plot_statistics(inst_reward_cd_ucb, inst_regret_cd_ucb, 'CD-UCB', 'Step 6.2')
    plt.tight_layout()
    plt.show()