Merge pull request #8 from JetBrains-Research/ci-fix-bechmark

Ci fix bechmark

galtimur authored Jun 5, 2024
2 parents 99b54ca + 16b6869 commit 6d841da
Showing 10 changed files with 671 additions and 87 deletions.
58 changes: 58 additions & 0 deletions ci-fixing/ci-fixing-benchmark/README.md
@@ -0,0 +1,58 @@
## Config

To initialize the benchmark, you need to pass the path to a config file with the following fields (see the example in `benchmark.yaml`):

**repos_folder**: the folder where the cloned repos will be stored
**out_folder**: the folder where the result files will be stored
**data_cache_dir**: the folder where the cached dataset will be stored
**username_gh**: your GitHub username
**test_username**: optional. The username that will be displayed in the benchmark. If omitted, `username_gh` is used, which is how we prefer it.
**language**: the dataset language (currently only Python is available)

## Benchmark usage

For an example of how to use the benchmark, see the `run_benchmark.py` script.

To use the benchmark, you need to pass a function (`fix_repo_function`) that fixes the repo according to
the repo state on a local machine and the logs and metadata of the failed workflows.

It should have the following (all optional) arguments:
(datapoint, repo_path, repo, out_folder)

**datapoint**: a datapoint from the dataset (its structure is described below)
**repo_path**: path to the repo on the user's machine
**repo**: a git.Repo object from the GitPython library
**out_folder**: folder for the benchmark results output

For now, only two functions have been implemented (a sketch of a custom function follows the list):

`fix_none` - does nothing
`fix_apply_diff` - applies the diff that fixed the issue in the original repo
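
If you want to write your own, here is a minimal sketch of a custom `fix_repo_function`; the body is a placeholder for illustration, and the exact fields of `datapoint` are an assumption:

```python
import git  # GitPython


def fix_repo_trivial(datapoint, repo_path, repo: git.Repo, out_folder):
    # Placeholder fix: log which datapoint is being processed and leave
    # the repo untouched. A real implementation would edit files under
    # `repo_path` based on the failed-workflow logs carried by `datapoint`.
    print(f"Fixing datapoint {datapoint.get('id', '<unknown>')} in {repo_path}")
```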

## Evaluate dataset

The method `CIFixBenchmark.eval_dataset(fix_repo_function)` evaluates the dataset:

1. Downloads the dataset (from https://huggingface.co/datasets/JetBrains-Research/lca-ci-fixing)
2. Sends the datapoints to GitHub to run the workflows
3. Requests the results from GitHub
4. Analyzes the results and prints them.

In the future, we may duplicate the request part on our side.
For debugging, please limit yourself to a small number of datapoints (argument `num_dp=num_dp`); see the sketch below.
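
A minimal usage sketch, assuming the GitHub token is stored in a local text file and that `fix_apply_diff` is importable from `benhmark_functions` (both are assumptions; adjust to your setup):

```python
from benchmark import CIFixBenchmark
from benhmark_functions import fix_apply_diff  # assumed import location

# Assumption: your GitHub token sits in a local text file.
with open("token_gh.txt") as f:
    token_gh = f.read().strip()

bench = CIFixBenchmark(model_name="diff", config_path="benchmark.yaml", token_gh=token_gh)
bench.eval_dataset(fix_apply_diff, num_dp=5)  # keep num_dp small while debugging
```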

## Outputs

The method's outputs:

1. `jobs_ids.jsonl` - identifiers of the jobs that were sent to GitHub; used for the further evaluation.
2. `jobs_results.jsonl` - results of each job.
3. `jobs_awaiting.jsonl` - list of awaiting jobs (normally should be empty).
4. `jobs_invalid.jsonl` - list of invalid jobs (normally should be empty).

Examples of these files can be found in the `/examples` folder.

You can also evaluate your results using the method `CIFixBenchmark.eval_jobs(result_filename=result_filename)`,
passing the `jobs_ids.jsonl` file.

You can download the dataset using the `CIFixBenchmark.get_dataset()` method (an example is at the end of the file). A short sketch of both calls follows.
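
Continuing the sketch above, both calls might look like this (the file paths are assumptions based on the default `jobs_ids_{model_name}.jsonl` naming):

```python
# Re-evaluate previously submitted jobs from the saved identifiers file.
results = bench.eval_jobs(
    job_ids_file="out/jobs_ids_diff.jsonl",
    result_filename="jobs_results_diff.jsonl",
)

# Download (or load from cache) the dataset without running anything.
dataset = bench.get_dataset(num_dp=10)
```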
112 changes: 83 additions & 29 deletions ci-fixing/ci-fixing-benchmark/benchmark.py
@@ -1,77 +1,102 @@
import json
import os
import time

import pandas as pd
from datasets import load_dataset
from omegaconf import OmegaConf
from tqdm import tqdm
from typing import List

from benchmark_utils import read_jsonl, save_jsonl
from benhmark_functions import get_results, process_datapoint


def filter_files(directory, files):
return [file for file in files if file != "meta_info.json"]

def filter_by_id(example, ids):
return example['id'] in ids

class CIFixBenchmark:
def __init__(self, model_name, config_path, token_gh):
# languages = ["Python", "Kotlin", "Rust", "C++", "Java"]
benchmark_owner = "LCA-CI-fix-benchmark"
self.config = OmegaConf.load(config_path)
if not "test_username" in self.config:
self.config.test_username = self.config.username_gh
language = self.config.language
self.credentials = {"username": self.config.username, "token": token_gh, "model": model_name}
# TODO parents=True (??)
self.credentials = {
"username": self.config.username_gh,
"token": token_gh,
"model": model_name,
}

os.makedirs(self.config.out_folder, exist_ok=True)
os.makedirs(self.config.repos_folder, exist_ok=True)
self.dataset_id = f"JetBrains-Research/lca-ci-fixing"
OmegaConf.update(self.config, "benchmark_owner", benchmark_owner, force_add=True)
OmegaConf.update(
self.config, "benchmark_owner", benchmark_owner, force_add=True
)
if hasattr(self.config, "data_cache_dir"):
self.cache_dir = self.config.data_cache_dir
else:
self.cache_dir = None
self.model_name = model_name

def get_dataset(
self, num_dp=None, force_download=False, dataset_folder=None
):

if dataset_folder is not None:
self.dataset = load_dataset(path=dataset_folder)["train"]
# TODO: needs refactoring
if num_dp is not None:
self.dataset = self.dataset.select(range(num_dp))
return self.dataset
if force_download:
download_mode = "force_redownload"
else:
download_mode = None
self.dataset = load_dataset(
self.dataset_id,
cache_dir=self.cache_dir,
download_mode=download_mode,
split="test"
)
if num_dp is not None:
self.dataset = self.dataset.select(range(num_dp))

return self.dataset

# TODO remove test_dataset argument after debug
def run_dataset(self, fix_repo_function, test_dataset=None):
if test_dataset is None:
test_dataset = self.dataset
self.jobs_ids = []
jobs_ids_file_path = os.path.join(
self.config.out_folder, f"jobs_ids_{self.model_name}.jsonl"
)
with open(jobs_ids_file_path, "w") as writer:
for datapoint in tqdm(test_dataset):
job_identificator = process_datapoint(
datapoint, fix_repo_function, self.config, self.credentials
)
self.jobs_ids.append(job_identificator)
json.dump(job_identificator, writer)
writer.write("\n")
return self.jobs_ids

# TODO remove jobs_ids argument after debug
def eval_jobs(self, jobs_ids=None, job_ids_file=None, result_filename=None):
if result_filename is None:
result_filename = f"jobs_results_{self.model_name}.jsonl"
# Maybe we need to make some pause
jobs_results_file_path = os.path.join(self.config.out_folder, result_filename)
jobs_awaiting_file_path = os.path.join(
self.config.out_folder, f"jobs_awaiting_{self.model_name}.jsonl"
)
jobs_invalid_file_path = os.path.join(
self.config.out_folder, f"jobs_invalid_{self.model_name}.jsonl"
)
result_file = open(jobs_results_file_path, "w")
if job_ids_file is not None:
jobs_ids = read_jsonl(job_ids_file)
@@ -102,8 +127,10 @@ def eval_jobs(self, jobs_ids=None, job_ids_file=None, result_filename=None):
result_file.close()
save_jsonl(jobs_awaiting_file_path, jobs_ids_await)
save_jsonl(jobs_invalid_file_path, jobs_ids_invalid)
print(f"Waiting 300 s to next request of evaluation. {len(jobs_ids_await)} jobs in waiting list.")
time.sleep(300)
print(
f"Waiting 360 s to next request of evaluation. {len(jobs_ids_await)} jobs in waiting list."
)
time.sleep(360)
result_file = open(jobs_results_file_path, "a")

n_attempts += 1
@@ -116,17 +143,34 @@ def eval_jobs(self, jobs_ids=None, job_ids_file=None, result_filename=None):
self.jobs_results = jobs_results
return jobs_results

def get_results(self, job_ids_file=None, result_filename=None):

if job_ids_file is None:
job_ids_file = os.path.join(
self.config.out_folder, f"jobs_ids_{self.model_name}.jsonl"
)

self.eval_jobs(job_ids_file=job_ids_file, result_filename=result_filename)
if result_filename is None:
result_filename = f"jobs_results_{self.model_name}.jsonl"
result_file = os.path.join(self.config.out_folder, result_filename)
self.analyze_results(jobs_results_file=result_file)

def analyze_results(self, jobs_results=None, jobs_results_file=None):
if jobs_results_file is not None:
jobs_results = read_jsonl(jobs_results_file)
if jobs_results is None:
jobs_results = self.jobs_ids

results_df = pd.DataFrame(jobs_results)
# %%
total_counts = results_df["conclusion"].value_counts()
total_ratio = total_counts / len(results_df)
difficulty_counts = (
results_df.groupby("difficulty")["conclusion"]
.value_counts()
.unstack()
.fillna(0)
)
difficulty_ratios = difficulty_counts.div(difficulty_counts.sum(axis=1), axis=0)

print("Overall results")
@@ -141,14 +185,20 @@ def analyze_results(self, jobs_results=None, jobs_results_file=None):
def eval_dataset(
self,
fix_repo_function,
num_dp: int = None,
ids_list: List = None,
force_download=False,
result_filename=None,
dataset_folder=None,
):
print("---------------- Downloading data -------------------")
self.get_dataset(
num_dp=num_dp,
force_download=force_download,
dataset_folder=dataset_folder,
)
if ids_list is not None:
self.dataset = self.dataset.filter(lambda example: filter_by_id(example, ids_list))
print(f"Got {len(self.dataset)} datapoints")
print("---------------- Running datapoints -------------------")
self.run_dataset(fix_repo_function)
@@ -158,9 +208,13 @@ def eval_dataset(

def run_datapoint(self, datapoint, fix_repo_function):
# This method exists for debugging purposes
jobs_ids_file_path = os.path.join(
self.config.out_folder, f"jobs_ids_{self.model_name}.jsonl"
)
with open(jobs_ids_file_path, "w") as writer:
job_identificator = process_datapoint(
datapoint, fix_repo_function, self.config, self.credentials
)
json.dump(job_identificator, writer)
writer.write("\n")
return job_identificator
4 changes: 2 additions & 2 deletions ci-fixing/ci-fixing-benchmark/benchmark.yaml
@@ -1,6 +1,6 @@
repos_folder: /mnt/data/shared-data/lca/CI-fix-benchmark/repos # the cloned repos will be stored here
out_folder: /mnt/data/galimzyanov/data/LCA/benchmark/out # the result files will be stored here
data_cache_dir: /mnt/data/galimzyanov/data/LCA/temp # the cached dataset will be stored here
username_gh: timur-for-test # your GitHub username
# test_username: test_user # username that will be displayed in the benchmark. Optional. If omitted, username_gh will be used
language: Python # dataset language (now only Python is available)
33 changes: 12 additions & 21 deletions ci-fixing/ci-fixing-benchmark/benchmark_utils.py
@@ -1,7 +1,6 @@
import json
import os
import shutil


def read_jsonl(file_path):
@@ -11,29 +10,13 @@ def read_jsonl(file_path):
data.append(json.loads(line))
return data


def save_jsonl(file_path, data):
with open(file_path, "w") as f:
for entry in data:
json.dump(entry, f)
f.write("\n")


def filter_out_res(data_folder, out_folder):
"""
@@ -46,8 +29,16 @@ def filter_out_res(data_folder, out_folder):
orig_path = os.path.join(data_folder, "datapoints_json_verified")
filtered_path = os.path.join(data_folder, "datapoints_json_filtered")
os.makedirs(filtered_path, exist_ok=True)
original_sha = {
result["sha_original"][:7]
for result in results_none
if result["conclusion"] == "failure"
}
fixed_sha = {
result["sha_original"][:7]
for result in results_diff
if result["conclusion"] == "success"
}

sha_valid = original_sha.intersection(fixed_sha)

