Skip to content

Commit

Permalink
test: add datagen driver
Browse files — browse the repository at this point in the history
  • Loading branch information
michiboo committed Sep 3, 2023
1 parent 3b33740 commit c9f4028
Show file tree
Hide file tree
Showing 21 changed files with 182 additions and 184 deletions.
10 changes: 8 additions & 2 deletions python/tests/e2e_v2/assert_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from numpy.testing import assert_allclose, assert_almost_equal
from vw_executor.vw import ExecutionStatus
import vowpalwabbit as vw
from test_helper import get_function_object
from test_helper import get_function_object, datagen_driver


def remove_non_digits(string):
Expand Down Expand Up @@ -82,6 +82,7 @@ def assert_loss_below(job, **kwargs):


def assert_prediction_with_generated_data(job, **kwargs):

assert job.status == ExecutionStatus.Success, "job should be successful"
expected_class = []
trained_model = vw.Workspace(f"-i {job[0].model9('-f').path} --quiet")
Expand All @@ -102,7 +103,12 @@ def assert_prediction_with_generated_data(job, **kwargs):
break
except:
pass
dataFile = data_func_obj(**kwargs["data_func"]["params"])
script_directory = os.path.dirname(os.path.realpath(__file__))
dataFile = datagen_driver(
os.path.join(script_directory, subdir_name),
data_func_obj,
**kwargs["data_func"]["params"],
)
with open(dataFile, "r") as f:
for line in f.readlines():
expected_class.append(line.split("|")[0].strip())
Expand Down
70 changes: 34 additions & 36 deletions python/tests/e2e_v2/cb/data_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def random_number_items(items):


def generate_cb_data(
f,
num_examples,
num_features,
num_actions,
Expand All @@ -20,7 +21,6 @@ def generate_cb_data(
seed=random.randint(0, 100),
):
random.seed(seed)
dataFile = f"cb_test_{num_examples}_{num_actions}_{num_features}.txt"

reward_function_obj = get_function_object(
"cb.reward_functions", reward_function["name"]
Expand All @@ -29,43 +29,41 @@ def generate_cb_data(
"cb.logging_policies", logging_policy["name"]
)
features = [f"feature{index}" for index in range(1, num_features + 1)]
with open(os.path.join(script_directory, dataFile), "w") as f:
for _ in range(num_examples):
no_context = len(context_name)
if no_context > 1:
context = random.randint(1, no_context)
else:
context = 1
for _ in range(num_examples):
no_context = len(context_name)
if no_context > 1:
context = random.randint(1, no_context)
else:
context = 1

def return_cost_probability(chosen_action, context=1):
cost = -reward_function_obj(
chosen_action, context, **reward_function["params"]
)
if "params" not in logging_policy:
logging_policy["params"] = {}
logging_policy["params"]["chosen_action"] = chosen_action
logging_policy["params"]["num_actions"] = num_actions
probability = logging_policy_obj(**logging_policy["params"])
return cost, probability
def return_cost_probability(chosen_action, context=1):
cost = -reward_function_obj(
chosen_action, context, **reward_function["params"]
)
if "params" not in logging_policy:
logging_policy["params"] = {}
logging_policy["params"]["chosen_action"] = chosen_action
logging_policy["params"]["num_actions"] = num_actions
probability = logging_policy_obj(**logging_policy["params"])
return cost, probability

chosen_action = random.randint(1, num_actions)
if no_context > 1:
f.write(f"shared | User s_{context_name[context-1]}\n")
for action in range(1, num_actions + 1):
chosen_action = random.randint(1, num_actions)
if no_context > 1:
f.write(f"shared | User s_{context_name[context-1]}\n")
for action in range(1, num_actions + 1):

cost, probability = return_cost_probability(action, context)
if action == chosen_action:
f.write(
f'{action}:{cost}:{probability} | {" ".join(random_number_items(features))}\n'
)
else:
f.write(f'| {" ".join(random_number_items(features))}\n')
cost, probability = return_cost_probability(action, context)
if action == chosen_action:
f.write(
f'{action}:{cost}:{probability} | {" ".join(random_number_items(features))}\n'
)
else:
f.write(f'| {" ".join(random_number_items(features))}\n')

else:
else:

cost, probability = return_cost_probability(chosen_action)
f.write(
f'{chosen_action}:{cost}:{probability} | {" ".join(random_number_items(features))}\n'
)
f.write("\n")
return os.path.join(script_directory, dataFile)
cost, probability = return_cost_probability(chosen_action)
f.write(
f'{chosen_action}:{cost}:{probability} | {" ".join(random_number_items(features))}\n'
)
f.write("\n")
5 changes: 2 additions & 3 deletions python/tests/e2e_v2/cb/logging_policies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
def constant_probability(chosen_action, **kwargs):
def constant_probability(chosen_action):
    """Degenerate logging policy: the chosen action is always logged with probability 1."""
    probability = 1
    return probability


def even_probability(chosen_action, **kwargs):
num_actions = kwargs["num_actions"]
def even_probability(chosen_action, num_actions):
    """Uniform logging policy: each of *num_actions* actions is equally likely.

    The probability is rounded to two decimal places.
    """
    uniform = 1 / num_actions
    return round(uniform, 2)
7 changes: 3 additions & 4 deletions python/tests/e2e_v2/cb/reward_functions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
def fixed_reward(chosen_action, context, **kwargs):
def fixed_reward(chosen_action, context):
    """Constant reward function: every (action, context) pair earns exactly 1."""
    reward = 1
    return reward


def constant_reward(chosen_action, context, **kwargs):
reward = kwargs["reward"]
def constant_reward(chosen_action, context, reward):
    """Return the reward configured for *chosen_action*.

    *reward* is a per-action sequence indexed by the 1-based action id;
    *context* is accepted for signature compatibility but not used here.
    """
    action_index = chosen_action - 1
    return reward[action_index]


def fixed_reward_two_action(chosen_action, context, **kwargs):
def fixed_reward_two_action(chosen_action, context):
if context == 1 and chosen_action == 2:
return 1
elif context == 2 and chosen_action == 2:
Expand Down
63 changes: 31 additions & 32 deletions python/tests/e2e_v2/cb_cont/data_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ def random_number_items(items):


def generate_cb_data(
f,
num_examples,
num_features,
action_range,
Expand All @@ -21,7 +22,6 @@ def generate_cb_data(
):
random.seed(seed)
num_actions = int(abs(action_range[1] - action_range[0]))
dataFile = f"cb_cont_test_{num_examples}_{num_actions}_{num_features}.txt"

reward_function_obj = get_function_object(
"cb_cont.reward_functions", reward_function["name"]
Expand All @@ -30,34 +30,33 @@ def generate_cb_data(
"cb_cont.logging_policies", logging_policy["name"]
)
features = [f"feature{index}" for index in range(1, num_features + 1)]
with open(os.path.join(script_directory, dataFile), "w") as f:
for _ in range(num_examples):
no_context = len(context_name)
if no_context > 1:
context = random.randint(1, no_context)
else:
context = 1

def return_cost_probability(chosen_action, context):
cost = -reward_function_obj(
chosen_action, context, **reward_function["params"]
)
if "params" not in logging_policy:
logging_policy["params"] = {}
logging_policy["params"]["chosen_action"] = chosen_action
logging_policy["params"]["num_actions"] = num_actions
probability = logging_policy_obj(**logging_policy["params"])
return cost, probability

chosen_action = round(random.uniform(0, num_actions), 2)
cost, probability = return_cost_probability(chosen_action, context)
if no_context == 1:
f.write(
f'ca {chosen_action}:{cost}:{probability} | {" ".join(random_number_items(features))}\n'
)
else:
f.write(
f'ca {chosen_action}:{cost}:{probability} | {"s_" + context_name[context-1]} {" ".join(random_number_items(features))}\n'
)
f.write("\n")
return os.path.join(script_directory, dataFile)

for _ in range(num_examples):
no_context = len(context_name)
if no_context > 1:
context = random.randint(1, no_context)
else:
context = 1

def return_cost_probability(chosen_action, context):
cost = -reward_function_obj(
chosen_action, context, **reward_function["params"]
)
if "params" not in logging_policy:
logging_policy["params"] = {}
logging_policy["params"]["chosen_action"] = chosen_action
logging_policy["params"]["num_actions"] = num_actions
probability = logging_policy_obj(**logging_policy["params"])
return cost, probability

chosen_action = round(random.uniform(0, num_actions), 2)
cost, probability = return_cost_probability(chosen_action, context)
if no_context == 1:
f.write(
f'ca {chosen_action}:{cost}:{probability} | {" ".join(random_number_items(features))}\n'
)
else:
f.write(
f'ca {chosen_action}:{cost}:{probability} | {"s_" + context_name[context-1]} {" ".join(random_number_items(features))}\n'
)
f.write("\n")
5 changes: 2 additions & 3 deletions python/tests/e2e_v2/cb_cont/logging_policies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
def constant_probability(chosen_action, **kwargs):
def constant_probability(chosen_action):
    """Logging policy that reports probability 1 regardless of the action."""
    always = 1
    return always


def even_probability(chosen_action, **kwargs):
num_actions = kwargs["num_actions"]
def even_probability(chosen_action, num_actions):
    """Uniform logging policy over *num_actions* actions, rounded to 2 decimals."""
    p = 1 / num_actions
    return round(p, 2)
7 changes: 3 additions & 4 deletions python/tests/e2e_v2/cb_cont/reward_functions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
def fixed_reward(chosen_action, context, **kwargs):
def fixed_reward(chosen_action, context):
    """Reward function that pays 1 for any action in any context."""
    payout = 1
    return payout


def piecewise_constant(chosen_action, context, **kwargs):
reward = kwargs["reward"]
def piecewise_constant(chosen_action, context, reward):
    """Bucket a continuous action into a reward entry.

    The action is truncated to an integer and treated as a 1-based index
    into the *reward* sequence; *context* is unused here.
    """
    bucket = int(chosen_action) - 1
    return reward[bucket]


def fixed_reward_two_action(chosen_action, context, **kwargs):
def fixed_reward_two_action(chosen_action, context):
if context == 1 and chosen_action >= 2:
return 1
elif context == 2 and chosen_action < 2 and chosen_action >= 1:
Expand Down
21 changes: 8 additions & 13 deletions python/tests/e2e_v2/classification/data_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,23 @@


def generate_classification_data(
    f,
    num_example,
    num_classes,
    num_features,
    classify_func,
    seed=None,
    bounds=None,
):
    """Write *num_example* VW-format classification examples to the open file *f*.

    Each example draws *num_features* features uniformly within per-feature
    *bounds* (defaulting to [0, 1] for every feature) and labels it with the
    classification function described by *classify_func* (a dict with "name"
    and "params", resolved via get_function_object).

    NOTE(review): *num_classes* is not referenced in this body — presumably
    the class count is implied by the classify function; confirm with callers.
    """
    # Fix: a default of `seed=random.randint(0, 100)` is evaluated once at
    # import time, so every unseeded call would reuse the same seed. Draw it
    # per call instead to keep the intended "random unless pinned" behavior.
    if seed is None:
        seed = random.randint(0, 100)
    random.seed(seed)
    classify_func_obj = get_function_object(
        "classification.classification_functions", classify_func["name"]
    )
    if not bounds:
        bounds = [[0, 1] for _ in range(num_features)]
    for _ in range(num_example):
        x = [
            random.uniform(bounds[index][0], bounds[index][1])
            for index in range(num_features)
        ]
        y = classify_func_obj(x, **classify_func["params"])
        f.write(f"{y} |f {' '.join([f'x{i}:{x[i]}' for i in range(num_features)])}\n")
2 changes: 0 additions & 2 deletions python/tests/e2e_v2/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pytest

# conftest.py
def pytest_addoption(parser):
parser.addoption(
Expand Down
13 changes: 4 additions & 9 deletions python/tests/e2e_v2/regression/data_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,9 @@


def constant_function(
    f, no_sample, constant, x_lower_bound, x_upper_bound, seed=None
):
    """Write *no_sample* constant-target regression examples to the open file *f*.

    Each line has the VW form ``<constant> |f x:<x>`` where x is drawn
    uniformly from [x_lower_bound, x_upper_bound].
    """
    # Fix: a default of `seed=random.randint(0, 100)` is evaluated once at
    # import time, so every unseeded call would reuse the same seed. Draw it
    # per call instead to keep the intended "random unless pinned" behavior.
    if seed is None:
        seed = random.randint(0, 100)
    random.seed(seed)
    for _ in range(no_sample):
        x = random.uniform(x_lower_bound, x_upper_bound)
        f.write(f"{constant} |f x:{x}\n")
8 changes: 3 additions & 5 deletions python/tests/e2e_v2/slate/action_space.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
def new_action_after_threshold(iteration, threshold, before, after):
    """Return *before* until *iteration* exceeds *threshold*, then *after*.

    Models a change in the action space over time — e.g. it is sunny before
    the threshold iteration and raining after it.
    """
    # Strictly greater-than: the threshold iteration itself still uses `before`.
    if iteration > threshold:
        return after
    return before
1 change: 0 additions & 1 deletion python/tests/e2e_v2/slate/assert_job.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from numpy.testing import assert_allclose, assert_almost_equal
from vw_executor.vw import ExecutionStatus
import numpy as np

Expand Down
Loading

0 comments on commit c9f4028

Please sign in to comment.