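"""step2.py: learning the optimal bid when the optimal price is known.

Assuming clairvoyant pricing (the optimal conversion rate and price are given),
this script learns the best bid with two Gaussian-process bandit learners,
GP-UCB and GP-TS, over a horizon of T rounds. The n_experiments independent
runs are executed in parallel, and the mean and standard deviation of the
instantaneous rewards and regrets are plotted against the clairvoyant baseline.
"""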
import warnings

import matplotlib.pyplot as plt
import numpy as np
from sklearn.exceptions import ConvergenceWarning
from tqdm.contrib.concurrent import process_map

from environments.ClairvoyantAlgorithm import ClairvoyantAlgorithm
from environments.Environment import Environment
from learners.GPTSLearner import GPTSLearner
from learners.GPUCBLearner import GPUCBLearner
from utils import plot_statistics

warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Simulation parameters
T = 365
n_experiments = 250
def run_experiment(_):
    # For every experiment, define a new environment and new learners
    env = Environment.from_json('data/environment.json')

    # Pricing values (optimal conversion rate and price) are known
    clairvoyant = ClairvoyantAlgorithm(env)
    opt_reward = clairvoyant.optimal_rewards[0]
    opt_price_id = clairvoyant.optimal_prices_id[0]

    # Learners
    gp_ucb_learner = GPUCBLearner(len(env.bids), env.bids)
    gp_ts_learner = GPTSLearner(len(env.bids), env.bids)

    # Data structures
    instantaneous_reward_clairvoyant = np.zeros(T)
    instantaneous_reward_ucb1 = np.zeros(T)
    instantaneous_reward_ts = np.zeros(T)

    instantaneous_regret_clairvoyant = np.zeros(T)
    instantaneous_regret_ucb1 = np.zeros(T)
    instantaneous_regret_ts = np.zeros(T)

    for t in range(T):
        # Clairvoyant algorithm: always collects the optimal reward
        instantaneous_reward_clairvoyant[t] = opt_reward
        instantaneous_regret_clairvoyant[t] = 0

        # GP-UCB learner: the price is fixed to the optimum, only the bid is learned
        pulled_arm = gp_ucb_learner.pull_arm()
        reward = env.compute_reward(opt_price_id, pulled_arm, user_class=0)
        gp_ucb_learner.update(pulled_arm, reward)

        instantaneous_reward_ucb1[t] = reward
        regret = opt_reward - reward
        instantaneous_regret_ucb1[t] = regret

        # GP Thompson Sampling learner
        pulled_arm = gp_ts_learner.pull_arm()
        reward = env.compute_reward(opt_price_id, pulled_arm, user_class=0)
        gp_ts_learner.update(pulled_arm, reward)

        instantaneous_reward_ts[t] = reward
        regret = opt_reward - reward
        instantaneous_regret_ts[t] = regret

    return instantaneous_reward_clairvoyant, instantaneous_reward_ucb1, instantaneous_reward_ts, \
        instantaneous_regret_clairvoyant, instantaneous_regret_ucb1, instantaneous_regret_ts
if __name__ == '__main__':
    # Run the experiments in parallel
    results_list = process_map(run_experiment, range(n_experiments), max_workers=10, chunksize=1)

    # Array of shape (n_experiments, n_learners * 2, T)
    results_array = np.array(results_list)

    # Extract the results into multiple arrays of shape (n_experiments, T)
    inst_reward_clairvoyant = results_array[:, 0, :]
    inst_reward_ucb1 = results_array[:, 1, :]
    inst_reward_ts = results_array[:, 2, :]

    inst_regret_clairvoyant = results_array[:, 3, :]
    inst_regret_ucb1 = results_array[:, 4, :]
    inst_regret_ts = results_array[:, 5, :]

    # Generate plots of the mean and standard deviation of the results
    plot_statistics(inst_reward_clairvoyant, inst_regret_clairvoyant, 'Clairvoyant', 'Step 2')
    plot_statistics(inst_reward_ucb1, inst_regret_ucb1, 'GP-UCB', 'Step 2')
    plot_statistics(inst_reward_ts, inst_regret_ts, 'GP-TS', 'Step 2')

    plt.tight_layout()
    plt.show()