Skip to content

Commit

Permalink
Add package nectar
Browse files Browse the repository at this point in the history
  • Loading branch information
rahulptel committed Feb 27, 2020
1 parent 578b4f5 commit 7e5b6ef
Show file tree
Hide file tree
Showing 26 changed files with 2,619 additions and 0 deletions.
10 changes: 10 additions & 0 deletions nectar/config/cflp.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[Problem]
n_facility = 10
n_client = 10
n_scenario = 50
extensive_optimality_gap = 0.02
# Time limit in seconds to solve the extensive form
extensive_time_limit = 600
surrogate_optimality_gap = 0.001
# Time limit in seconds to solve the surrogate form
surrogate_time_limit = 300
29 changes: 29 additions & 0 deletions nectar/config/meta.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Configurations for data_manager package

# All the sections and corresponding keys in this file are mandatory,
# i.e., you may change their values but must not rename them.

[Run]
# `problem` value should match with the folder containing data
# management scripts for a given problem type.
# For example, the data-management scripts for the Stochastic
# Capacitated Facility Location Problem (S-CFLP) live inside the `cflp`
# folder. Hence, for data management of S-CFLP we set the `problem` key
# to `cflp`.
problem = cflp
# Number of processes to run in parallel
n_worker = 4
from_pid = 0
to_pid = 100

# Values in Directory and File section are optional. If nothing is passed,
# we will automatically set the default values.
[Directory]
data = data
result_extensive = result_ext
result_xi = result_xi
[File]
instance = instances.pkl
result_extensive = result_ext.pkl
result_xi = result_xi.pkl

Empty file added nectar/data_manager/__init__.py
Empty file.
86 changes: 86 additions & 0 deletions nectar/data_manager/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Dataset management.

The data manager comprises the following modules:
1. generate_instance.py
2. generate_optimal_sol.py
3. generate_xi_hat.py (with different heuristics)
4. improve_xi_hat.py
5. generate_dataset.py (responsible for creating the dataset for ML models)

Run modules 1 to 5, in that order, to create the dataset for the ML model.
"""
from argparse import ArgumentParser
from configparser import ConfigParser
from importlib import import_module
from pathlib import Path


def main():
    """Run the data_manager stage(s) selected with ``--run``.

    Reads ``config/meta.ini`` and ``config/<problem>.ini`` from the package
    root, builds the dictionary of data/result paths for the configured
    problem, and dispatches to the matching stage function of
    ``nectar.data_manager.<problem>``.

    ``--run`` accepts ``inst``, ``opt``, ``repr``, ``imp``, ``dataset`` or
    ``all`` (default ``inst``); ``all`` runs every stage in that order.
    """
    # Parse the command line first so that ``--help`` and an invalid ``--run``
    # value are handled before any configuration or problem module is loaded.
    parser = ArgumentParser()
    parser.add_argument('--run', type=str,
                        choices=('inst', 'opt', 'repr', 'imp', 'dataset', 'all'),
                        help='specify the data_manager module to execute. '
                             'inst: to generate instances '
                             'opt: to generate optimal solution '
                             'repr: to find a representative scenario '
                             'imp: to improve a representative scenario '
                             'dataset: to create dataset for ML '
                             'all: to run all module one after the other ',
                        default='inst')
    args = parser.parse_args()

    # Load and set configuration
    meta_config, problem_config = ConfigParser(), ConfigParser()
    ROOT = Path(__file__).parent.parent

    # Meta config. The [Directory] and [File] entries are documented as
    # optional in meta.ini, so fall back to the values shipped in that file.
    meta_config.read(ROOT / "config" / "meta.ini")
    problem = meta_config.get('Run', 'problem')
    data_dir = meta_config.get('Directory', 'data', fallback='data')
    instance_file = meta_config.get('File', 'instance',
                                    fallback='instances.pkl')
    result_ext_file = meta_config.get('File', 'result_extensive',
                                      fallback='result_ext.pkl')
    result_xi_file = meta_config.get('File', 'result_xi',
                                     fallback='result_xi.pkl')

    # Problem config
    problem_config.read(ROOT / "config" / ".".join([problem, "ini"]))
    problem_path = ".".join(["nectar.data_manager", problem])
    get_problem_identifier = getattr(import_module(
        "nectar.utils.combinatorics." + problem
    ), "get_problem_identifier")
    identifier = get_problem_identifier(problem_config)

    # Set path
    data_dir_path = ROOT / data_dir / "_".join([problem, identifier])
    path = {
        "data": data_dir_path,
        "result_xi": data_dir_path / result_xi_file,
        "result_ext": data_dir_path / result_ext_file,
        "instance": data_dir_path / instance_file
    }

    def load_stage(name):
        # Every stage module exposes a callable named after the module itself.
        # Importing lazily keeps unselected stages (and their dependencies)
        # from being loaded.
        return getattr(import_module(".".join([problem_path, name])), name)

    run_all = args.run == "all"

    if run_all or args.run == "inst":
        load_stage("generate_instance")(meta_config, problem_config, path)

    if run_all or args.run == "opt":
        load_stage("generate_optimal_sol")(meta_config, problem_config, path)

    if run_all or args.run == "repr":
        generate_xi_hat = load_stage("generate_xi_hat")
        # One representative-scenario run per section of <problem>/runs.ini.
        runs = ConfigParser()
        runs.read(Path(__file__).parent / problem / "runs.ini")
        for idx in runs.sections():
            generate_xi_hat(meta_config, runs[idx], path)

    if run_all or args.run == "imp":
        load_stage("improve_xi_hat")(meta_config, path)

    if run_all or args.run == "dataset":
        load_stage("generate_dataset")(path)


if __name__ == "__main__":
main()
Empty file.
138 changes: 138 additions & 0 deletions nectar/data_manager/cflp/generate_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import random
from collections import defaultdict
import time
import numpy as np

from ...utils import load_pickle

# Fix the seeds of both random number generators at import time.
# `random` drives the train/test shuffle in `generate_dataset`.
np.random.seed(7)
random.seed(11)

# Lower/upper bounds used by `generate_dataset` to rescale the `c_f` and
# `c_v` entries of an instance to the interval [-1, 1].
MIN_C_F, MAX_C_F = 15, 19
MIN_C_V, MAX_C_V = 5, 9

# NOTE(review): despite the MEAN_ prefix these evaluate to half of the range
# (MAX - MIN) / 2, not the midpoint (MAX + MIN) / 2. They are not referenced
# in the visible part of this file -- confirm the intended meaning.
MEAN_C_F = (MAX_C_F - MIN_C_F) / 2
MEAN_C_V = (MAX_C_V - MIN_C_V) / 2


def fetch_scenario(idxs, data):
    """Gather the ``'scenario'`` entry of ``data[idx]`` for each ``idx``.

    The entries are stacked, in the order given by ``idxs``, into a single
    NumPy array which is returned.
    """
    return np.asarray([data[pid]['scenario'] for pid in idxs])


def normalize_scenario(scenario, MIN_SCE, MAX_SCE):
    """Min-max rescale ``scenario`` so that [MIN_SCE, MAX_SCE] maps to [-1, 1].

    Args:
        scenario: Array-like of values to rescale.
        MIN_SCE: Lower bound(s), broadcastable against ``scenario``.
        MAX_SCE: Upper bound(s), broadcastable against ``scenario``.

    Returns:
        ``2 * (scenario - MIN_SCE) / (MAX_SCE - MIN_SCE) - 1``. Where
        ``MAX_SCE == MIN_SCE`` the span is treated as 1, so a value equal to
        the bound maps to -1 instead of producing NaN from 0/0.
    """
    span = np.subtract(MAX_SCE, MIN_SCE)
    # A constant feature has zero range; dividing by it would yield NaN/inf.
    span = np.where(span == 0, 1, span)

    scenario_diff = np.subtract(scenario, MIN_SCE)
    scenario_scaled = np.divide(scenario_diff, span)
    scenario_scaled = (scenario_scaled * 2) - 1

    return scenario_scaled


def extract_scenario_features(scenario):
    """Build a flat feature vector from a 2-D ``scenario`` array.

    The vector holds, in this order:
      * column-wise max, min, median, 0.75-quantile, 0.25-quantile, mean, std;
      * for each k in [0.9, 1, 1.1, 1.2, 1.5]: for every column, the fraction
        of rows in which ``(1 + k) * column`` is >= every other column,
        followed by, for every column, the fraction of rows in which the
        column is <= ``(1 + k)`` times every other column.

    Returns:
        Tuple ``(features, total_time)`` with the features as a NumPy array
        and the wall-clock seconds spent computing them.
    """
    tic = time.time()
    n_rows, n_cols = scenario.shape[0], scenario.shape[1]

    column_stats = (
        np.max(scenario, axis=0),
        np.min(scenario, axis=0),
        np.median(scenario, axis=0),
        np.quantile(scenario, 0.75, axis=0),
        np.quantile(scenario, 0.25, axis=0),
        np.mean(scenario, axis=0),
        np.std(scenario, axis=0),
    )
    features = [value for stat in column_stats for value in stat]

    for k in [0.9, 1, 1.1, 1.2, 1.5]:
        factor = 1 + k
        dominates, dominated = [], []
        for col in range(n_cols):
            own = scenario[:, [col]]
            # Every column except `col`; with a single column this is empty
            # and np.all over it is True for every row.
            others = np.delete(scenario, col, axis=1)

            row_dominates = np.all(factor * own >= others, axis=1)
            row_dominated = np.all(own <= factor * others, axis=1)

            dominates.append(np.count_nonzero(row_dominates) / n_rows)
            dominated.append(np.count_nonzero(row_dominated) / n_rows)

        features.extend(dominates)
        features.extend(dominated)

    elapsed = time.time() - tic

    return np.asarray(features), elapsed


def create_model_input(idxs, instance, cost_normalized, scenarios_normalized):
    """Assemble one model-input record per problem id in ``idxs``.

    Each record is a copy of ``instance[idx]`` extended with the keys
    ``pid``, ``c_f_normalized``, ``c_v_normalized``, ``scenario_normalized``
    and ``scenario_features``. ``scenarios_normalized`` must have one leading
    entry per element of ``idxs``, in the same order.

    Returns:
        Dict with ``"input"`` (NumPy array of the records) and
        ``"total_time"`` (summed feature-extraction time).
    """
    assert len(idxs) == scenarios_normalized.shape[0]

    records = []
    elapsed = 0
    for pid, scenario in zip(idxs, scenarios_normalized):
        record = dict(instance[pid])
        record["pid"] = pid

        costs = cost_normalized[pid]
        record["c_f_normalized"] = costs['c_f']
        record["c_v_normalized"] = costs['c_v']

        record["scenario_normalized"] = scenario
        features, item_time = extract_scenario_features(scenario)
        record["scenario_features"] = features

        elapsed += item_time
        records.append(record)

    return {"input": np.asarray(records), "total_time": elapsed}


def generate_dataset(path, train_test_split=0.7):
    """Create and save the raw train/test dataset for the ML model.

    Loads the instances and representative-scenario results, keeps the
    problems whose result has a truthy ``"solved_xi"``, normalizes costs and
    scenarios to [-1, 1], extracts scenario features and writes
    ``x_train_raw.npy``, ``y_train_raw.npy``, ``x_test_raw.npy``,
    ``y_test_raw.npy`` and ``preprocessing_time.npy`` into ``path["data"]``.

    Args:
        path: Mapping with the keys ``"instance"``, ``"result_xi"`` (pickle
            files to load) and ``"data"`` (output directory).
        train_test_split: Fraction of the solved problems used for training.

    Raises:
        ValueError: If the training split is empty, since the scenario
            bounds are computed from the training scenarios.
    """
    instance = load_pickle(path["instance"])
    result_xi = load_pickle(path["result_xi"])
    total_time = 0

    # Find problems for which we have a representative scenario
    solved = [pid for pid, res in result_xi.items() if res["solved_xi"]]

    # Normalize cost
    cost_normalized = defaultdict(dict)
    start_time = time.time()
    for idx in solved:
        cost_normalized[idx]['c_f'] = (((instance[idx]['c_f'] - MIN_C_F) / (MAX_C_F - MIN_C_F)) * 2) - 1
        cost_normalized[idx]['c_v'] = (((instance[idx]['c_v'] - MIN_C_V) / (MAX_C_V - MIN_C_V)) * 2) - 1
    total_time += (time.time() - start_time)

    # Shuffle and split into train and test
    random.shuffle(solved)
    n_train = int(train_test_split * len(solved))
    train_idxs, test_idxs = solved[:n_train], solved[n_train:]
    if not train_idxs:
        # Without this check the failure surfaces later as an opaque NumPy
        # "zero-size array to reduction operation" error.
        raise ValueError(
            "Cannot build dataset: training split is empty "
            "({} solved problem(s), train_test_split={}).".format(
                len(solved), train_test_split)
        )

    # Normalize scenarios, using bounds taken from the training split only
    train_scenarios = fetch_scenario(train_idxs, instance)
    test_scenarios = fetch_scenario(test_idxs, instance)
    start_time = time.time()
    MAX_SCE = np.max(train_scenarios, axis=0)
    MIN_SCE = np.min(train_scenarios, axis=0)
    train_scenarios_normalized = normalize_scenario(train_scenarios, MIN_SCE, MAX_SCE)
    test_scenarios_normalized = normalize_scenario(test_scenarios, MIN_SCE, MAX_SCE)
    total_time += (time.time() - start_time)

    # Prepare training and test samples
    result = create_model_input(train_idxs, instance, cost_normalized, train_scenarios_normalized)
    x_train, total_time_train = result["input"], result["total_time"]

    result = create_model_input(test_idxs, instance, cost_normalized, test_scenarios_normalized)
    x_test, total_time_test = result["input"], result["total_time"]

    total_time += (total_time_train + total_time_test)

    y_train = np.asarray([{"pid": pid, "xi_hat": result_xi[pid]["xi_hat"]}
                          for pid in train_idxs])
    y_test = np.asarray([{"pid": pid, "xi_hat": result_xi[pid]["xi_hat"]}
                         for pid in test_idxs])

    np.save(path["data"] / "x_train_raw.npy", x_train)
    np.save(path["data"] / "y_train_raw.npy", y_train)
    np.save(path["data"] / "x_test_raw.npy", x_test)
    np.save(path["data"] / "y_test_raw.npy", y_test)
    # Average preprocessing time per solved problem.
    np.save(path["data"] / "preprocessing_time.npy", [total_time / len(solved)])
Loading

0 comments on commit 7e5b6ef

Please sign in to comment.