diff --git a/code_completion/eval/config/config.yaml b/code_completion/eval/config/config.yaml
index 70941e9..9d0a7b6 100644
--- a/code_completion/eval/config/config.yaml
+++ b/code_completion/eval/config/config.yaml
@@ -1,8 +1,8 @@
 defaults:
   - params: codellama7b
   - composers_config: python_standard
+  - dataset: small
 
-dataset: "JetBrains-Research/lca-codegen-small"
 language: 'python'
 artifacts_dir: "data/code_generation/artifacts/"
diff --git a/code_completion/eval/config/dataset/huge.yaml b/code_completion/eval/config/dataset/huge.yaml
new file mode 100644
index 0000000..c253ab3
--- /dev/null
+++ b/code_completion/eval/config/dataset/huge.yaml
@@ -0,0 +1,2 @@
+path: "JetBrains-Research/lca-project-level-code-completion"
+name: "huge_context"
diff --git a/code_completion/eval/config/dataset/large.yaml b/code_completion/eval/config/dataset/large.yaml
new file mode 100644
index 0000000..d1f50cc
--- /dev/null
+++ b/code_completion/eval/config/dataset/large.yaml
@@ -0,0 +1,2 @@
+path: "JetBrains-Research/lca-project-level-code-completion"
+name: "large_context"
diff --git a/code_completion/eval/config/dataset/medium.yaml b/code_completion/eval/config/dataset/medium.yaml
new file mode 100644
index 0000000..118a1d5
--- /dev/null
+++ b/code_completion/eval/config/dataset/medium.yaml
@@ -0,0 +1,2 @@
+path: "JetBrains-Research/lca-project-level-code-completion"
+name: "medium_context"
diff --git a/code_completion/eval/config/dataset/small.yaml b/code_completion/eval/config/dataset/small.yaml
new file mode 100644
index 0000000..6e9adad
--- /dev/null
+++ b/code_completion/eval/config/dataset/small.yaml
@@ -0,0 +1,2 @@
+path: "JetBrains-Research/lca-project-level-code-completion"
+name: "small_context"
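With the dataset promoted to a Hydra config group, a run selects one of the four sizes by name instead of hard-coding a Hugging Face path. A minimal sketch of how the group resolves, assuming the snippet lives next to the `config/` directory added above (the override value `medium` is one of the four new files):

```python
# Sketch only: resolve the new `dataset` group via Hydra's compose API.
from hydra import compose, initialize

with initialize(version_base=None, config_path="config"):
    cfg = compose(config_name="config", overrides=["dataset=medium"])
    # The group file contributes a DictConfig with `path` and `name` keys,
    # replacing the old flat string `dataset` entry.
    print(cfg.dataset.path)  # JetBrains-Research/lca-project-level-code-completion
    print(cfg.dataset.name)  # medium_context
```

The same override works from the command line, e.g. `python eval_pipeline.py dataset=large`.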
diff --git a/code_completion/eval/eval_pipeline.py b/code_completion/eval/eval_pipeline.py
index 387be25..bfc448b 100644
--- a/code_completion/eval/eval_pipeline.py
+++ b/code_completion/eval/eval_pipeline.py
@@ -2,6 +2,7 @@
 import warnings
 
 import hydra
+import omegaconf
 import torch.cuda
 from omegaconf import DictConfig
 
@@ -22,7 +23,7 @@
 @dataclass
 class PreprocessConfig:
     model: str  # One of PREPROCESSORS from lca.code_generation.eval.preprocess
-    dataset: str  # Path to dataset
+    dataset: str | omegaconf.dictconfig.DictConfig  # Path to dataset or dictionary with `path`, `name` keys
     tokenizer: str  # Path to tokenizer
     # config_path: str  # Path to composer configs
     composers: str  # One of COMPOSERS from lca.code_generation.eval.preprocess
@@ -75,7 +76,10 @@ def __init__(self, config, composers=COMPOSERS):
             warnings.warn(f'Model and Tokenizer have different paths')
 
         # preprocess_params.dataset = config.dataset
-        self.dataset_name = config.dataset.split('/')[-1].replace('-', '_')
+        if isinstance(config.dataset, str):
+            self.dataset_name = config.dataset.split('/')[-1].replace('-', '_')
+        elif isinstance(config.dataset, omegaconf.dictconfig.DictConfig):
+            self.dataset_name = config.dataset['name']
         dataset_out_dir = os.path.join(config.artifacts_dir, config.language, inference_params['model'],
                                        self.dataset_name)
         # preprocess_params['out_dir'] = os.path.join(dataset_out_dir, 'in')
@@ -152,7 +156,7 @@ def run(self):
 
         gen_scores, gen_results, em_difference, line_counts = evaluate_generation(self.generator_config)
         wb_run.log(gen_scores | {'EM_difference': em_difference, 'Line Counts': line_counts,
-                                 "dataset": self.config.dataset, "model": self.inference_args.model})
+                                 "dataset": self.dataset_name, "model": self.inference_args.model})
         wb_run.finish()
         with open(os.path.join(self.out_dir, 'generation_scores.json'), 'w') as f:
             json.dump(gen_results, f, indent=4)
@@ -180,7 +184,7 @@ def run_zero_context(self):
 
         print(">>Evaluation...")
         mean_ppl = evaluate(self.eval_args)
-        return {"perplexity": mean_ppl, "context": 0, "composer": "zero", "dataset": self.config.dataset,
+        return {"perplexity": mean_ppl, "context": 0, "composer": "zero", "dataset": self.dataset_name,
                 "model": self.inference_args.model} | lost_tokens
 
     def run_composer(self, composer, results):
@@ -208,7 +212,7 @@ def run_composer(self, composer, results):
         print(">>>>>>Evaluation...")
         mean_ppl = evaluate(self.eval_args)
         results.append({"perplexity": mean_ppl, "context": self.inference_args.context_max,
-                        "composer": composer, "dataset": self.config.dataset,
+                        "composer": composer, "dataset": self.dataset_name,
                         "model": self.inference_args.model} | lost_tokens)
         print(results[-1])
         wb_run.log(results[-1])
diff --git a/code_completion/eval/preprocess.py b/code_completion/eval/preprocess.py
index 6b1066e..e217f27 100644
--- a/code_completion/eval/preprocess.py
+++ b/code_completion/eval/preprocess.py
@@ -52,7 +52,7 @@ def preprocess(args, composer_args):
     prepared_dataset_path = os.path.join(resolve_directories(args), f'model_inputs_composer_{args.composers}.json')
 
     preprocessor = get_preprocessor(args)(
-        filepath=args.dataset,
+        dataset_params=args.dataset,
         tokenizer_path=args.tokenizer,
         context_len_char=args.context_len_char,
         **composers
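The branching in `__init__` keeps the legacy flat string config working while reading the name directly from the new group config. A quick illustration of what each branch yields, with values taken from this diff:

```python
# Legacy flat config: the name is derived from the HF dataset path.
legacy = "JetBrains-Research/lca-codegen-small"
assert legacy.split('/')[-1].replace('-', '_') == "lca_codegen_small"

# New group config: the name comes straight from the `name` key (small.yaml above).
group = {"path": "JetBrains-Research/lca-project-level-code-completion",
         "name": "small_context"}
assert group["name"] == "small_context"
```

Since `dataset_out_dir` is built from `self.dataset_name`, artifacts for the new datasets land under directories such as `.../small_context` rather than `.../lca_codegen_small`, and the W&B runs now log this same name instead of the raw config value.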
diff --git a/code_completion/eval/preprocessors.py b/code_completion/eval/preprocessors.py
index 50d801c..f6836cf 100644
--- a/code_completion/eval/preprocessors.py
+++ b/code_completion/eval/preprocessors.py
@@ -3,6 +3,8 @@
 import os.path
 from typing import Dict, List, Any, Optional, Callable
 from dataclasses import dataclass
+
+import omegaconf
 import torch
 from datasets import load_dataset
 from joblib import Parallel, delayed
@@ -21,15 +23,15 @@ class TokenizerOutput:
 
 class PreprocessorBase:
     def __init__(self,
-                 filepath: str,
+                 dataset_params: str | dict,
                  tokenizer_path: str | None = None,
                  context_len_char: int = 60_000,
                  context_composer: Callable[[Dict[str, Any]], str] | None = None,
                  completion_composer: Callable[[Dict[str, Any]], str] | None = None,
                  data_source: str = 'hf',
                  ):
-        self.filepath = filepath
-        self.data: list[DatapointBase] = self._load_data(filepath)
+        self.dataset_params = dataset_params
+        self.data: list[DatapointBase] = self._load_data(dataset_params)
         self.prepared_data: Optional[List[Dict[str, Any]]] = None
         self.tokenizer_path = tokenizer_path
         self.context_composer = context_composer
@@ -124,10 +126,16 @@ def save_model_inputs(self, filepath='lca/code_generation/data/model_inputs.json
         with open(filepath, 'w') as f:
             json.dump(self.prepared_data, f)
 
-    def _load_data(self, path: str) -> list[DatapointBase]:
+    def _load_data(self, dataset_params: str | dict) -> list[DatapointBase]:
         if True: #self.data_source == 'hf':
             data = list()
-            hf_data = load_dataset(path, split='test')
+            if isinstance(dataset_params, str):
+                hf_data = load_dataset(dataset_params, split='test')
+            elif isinstance(dataset_params, omegaconf.dictconfig.DictConfig):
+                hf_data = load_dataset(split='test', **dataset_params)
+            else:
+                raise ValueError('check `config.dataset`, it must be a string or a dictionary')
+
             repos_list = list(set([hf_dp['repo'] for hf_dp in hf_data]))
             repos_map = {repo: repo_num for repo_num, repo in enumerate(repos_list)}
@@ -153,8 +161,8 @@ def _load_data(self, path: str) -> list[DatapointBase]:
 import youtokentome as yttm
 
 
 class FLPythonPreprocessor(PreprocessorBase):
-    def __init__(self, filepath, tokenizer_path=None, context_len_char=60_000, **composers):
-        super().__init__(filepath, tokenizer_path, context_len_char, **composers)
+    def __init__(self, dataset_params, tokenizer_path=None, context_len_char=60_000, **composers):
+        super().__init__(dataset_params, tokenizer_path, context_len_char, **composers)
         self.lang_sep_symbol = '₣'
         self.meta_info_sep_symbol = '𐌼'
         self.extension = '.py'
@@ -192,8 +200,8 @@ def _load_tokenizer(self, path):
 from transformers import AutoTokenizer
 
 
 class HFPreprocessor(PreprocessorBase):
-    def __init__(self, filepath, tokenizer_path, context_len_char=60_000, **composers):
-        super().__init__(filepath, tokenizer_path, context_len_char, **composers)
+    def __init__(self, dataset_params, tokenizer_path, context_len_char=60_000, **composers):
+        super().__init__(dataset_params, tokenizer_path, context_len_char, **composers)
         self.lang_sep_symbol = ''
         self.meta_info_sep_symbol = 'METASEP'
         self.extension = ''
@@ -222,8 +230,8 @@ def _load_tokenizer(self, path):
 
 
 class StarcoderPreprocessor(HFPreprocessor):
-    def __init__(self, filepath, tokenizer_path="bigcode/starcoder", context_len_char=60_000, **composers):
-        super().__init__(filepath, tokenizer_path, context_len_char, **composers)
+    def __init__(self, dataset_params, tokenizer_path="bigcode/starcoder", context_len_char=60_000, **composers):
+        super().__init__(dataset_params, tokenizer_path, context_len_char, **composers)
         self.lang_sep_symbol = 'LANGSEP'
         self.meta_info_sep_symbol = 'METASEP'
         self.extension = '.py'
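`_load_data` now dispatches on the type of `dataset_params`; the two call shapes it reduces to are sketched below. Keyword unpacking of a `DictConfig` works because it implements the `Mapping` protocol, so the second call is equivalent to `load_dataset(path=..., name=..., split='test')`:

```python
# Minimal sketch of the two call shapes _load_data dispatches between,
# using the dataset values from this diff.
from datasets import load_dataset
from omegaconf import OmegaConf

# String form (legacy): the positional Hugging Face dataset path.
hf_data = load_dataset("JetBrains-Research/lca-codegen-small", split="test")

# DictConfig form (new): `path` and `name` are forwarded as keyword arguments.
params = OmegaConf.create({
    "path": "JetBrains-Research/lca-project-level-code-completion",
    "name": "small_context",
})
hf_data = load_dataset(split="test", **params)
```

One caveat worth noting: the annotation says `str | dict`, but the `elif` only accepts an omegaconf `DictConfig`, so a plain `dict` built by hand falls through to the `ValueError`; widening the check to `isinstance(dataset_params, (dict, omegaconf.dictconfig.DictConfig))` would match the annotation.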