Support new dataset format
evgngl committed Jun 4, 2024
1 parent d03c290 commit ad44c80
Showing 8 changed files with 38 additions and 18 deletions.
2 changes: 1 addition & 1 deletion code_completion/eval/config/config.yaml
@@ -1,8 +1,8 @@
 defaults:
   - params: codellama7b
   - composers_config: python_standard
+  - dataset: small
 
-dataset: "JetBrains-Research/lca-codegen-small"
 language: 'python'
 
 artifacts_dir: "data/code_generation/artifacts/"
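Note: with the `dataset` entry moved into a Hydra defaults group, `cfg.dataset` is composed from one of the new files under config/dataset/ and arrives as a DictConfig with `path` and `name` keys instead of the old plain string. A minimal sketch of what that composition yields, assuming the standard Hydra compose API and that it is run from code_completion/eval/ (the override value is illustrative):

from hydra import compose, initialize

# Compose the eval config offline; "dataset=medium" picks config/dataset/medium.yaml
# from the new config group added in this commit.
with initialize(version_base=None, config_path="config"):
    cfg = compose(config_name="config", overrides=["dataset=medium"])

# cfg.dataset is now a DictConfig, e.g.
# {'path': 'JetBrains-Research/lca-project-level-code-completion', 'name': 'medium_context'}
print(cfg.dataset.path, cfg.dataset.name)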
2 changes: 2 additions & 0 deletions code_completion/eval/config/dataset/huge.yaml
@@ -0,0 +1,2 @@
path: "JetBrains-Research/lca-project-level-code-completion"
name: "huge_context"
2 changes: 2 additions & 0 deletions code_completion/eval/config/dataset/large.yaml
@@ -0,0 +1,2 @@
path: "JetBrains-Research/lca-project-level-code-completion"
name: "large_context"
2 changes: 2 additions & 0 deletions code_completion/eval/config/dataset/medium.yaml
@@ -0,0 +1,2 @@
path: "JetBrains-Research/lca-project-level-code-completion"
name: "medium_context"
2 changes: 2 additions & 0 deletions code_completion/eval/config/dataset/small.yaml
@@ -0,0 +1,2 @@
path: "JetBrains-Research/lca-project-level-code-completion"
name: "small_context"
14 changes: 9 additions & 5 deletions code_completion/eval/eval_pipeline.py
@@ -2,6 +2,7 @@
 import warnings
 
 import hydra
+import omegaconf
 import torch.cuda
 from omegaconf import DictConfig
 
@@ -22,7 +23,7 @@
 @dataclass
 class PreprocessConfig:
     model: str # One of PREPROCESSORS from lca.code_generation.eval.preprocess
-    dataset: str # Path to dataset
+    dataset: str | omegaconf.dictconfig.DictConfig # Path to dataset or dictionary with `path`, `name` keys
     tokenizer: str # Path to tokenizer
     # config_path: str # Path to composer configs
     composers: str # One of COMPOSERS from lca.code_generation.eval.preprocess
@@ -75,7 +76,10 @@ def __init__(self, config, composers=COMPOSERS):
             warnings.warn(f'Model and Tokenizer have different paths')
 
         # preprocess_params.dataset = config.dataset
-        self.dataset_name = config.dataset.split('/')[-1].replace('-', '_')
+        if isinstance(config.dataset, str):
+            self.dataset_name = config.dataset.split('/')[-1].replace('-', '_')
+        elif isinstance(config.dataset, omegaconf.dictconfig.DictConfig):
+            self.dataset_name = config.dataset['name']
         dataset_out_dir = os.path.join(config.artifacts_dir, config.language, inference_params['model'],
                                        self.dataset_name)
         # preprocess_params['out_dir'] = os.path.join(dataset_out_dir, 'in')
@@ -152,7 +156,7 @@ def run(self):
         gen_scores, gen_results, em_difference, line_counts = evaluate_generation(self.generator_config)
 
         wb_run.log(gen_scores | {'EM_difference': em_difference, 'Line Counts': line_counts,
-                                 "dataset": self.config.dataset, "model": self.inference_args.model})
+                                 "dataset": self.dataset_name, "model": self.inference_args.model})
         wb_run.finish()
         with open(os.path.join(self.out_dir, 'generation_scores.json'), 'w') as f:
             json.dump(gen_results, f, indent=4)
@@ -180,7 +184,7 @@ def run_zero_context(self):
print(">>Evaluation...")
mean_ppl = evaluate(self.eval_args)

return {"perplexity": mean_ppl, "context": 0, "composer": "zero", "dataset": self.config.dataset,
return {"perplexity": mean_ppl, "context": 0, "composer": "zero", "dataset": self.dataset_name,
"model": self.inference_args.model} | lost_tokens

def run_composer(self, composer, results):
@@ -208,7 +212,7 @@ def run_composer(self, composer, results):
print(">>>>>>Evaluation...")
mean_ppl = evaluate(self.eval_args)
results.append({"perplexity": mean_ppl, "context": self.inference_args.context_max,
"composer": composer, "dataset": self.config.dataset,
"composer": composer, "dataset": self.dataset_name,
"model": self.inference_args.model} | lost_tokens)
print(results[-1])
wb_run.log(results[-1])
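W&B logging and output-directory naming above now go through `self.dataset_name` instead of the raw `config.dataset`, so both dataset formats reduce to a plain, directory-safe name. A self-contained sketch of that resolution logic (the function name is illustrative; the branching mirrors the __init__ change above):

import omegaconf

def resolve_dataset_name(dataset):
    # Old format: an HF path string such as "JetBrains-Research/lca-codegen-small".
    if isinstance(dataset, str):
        return dataset.split('/')[-1].replace('-', '_')
    # New format: a dataset config group with `path` and `name` keys.
    elif isinstance(dataset, omegaconf.dictconfig.DictConfig):
        return dataset['name']
    raise ValueError('`config.dataset` must be a string or a `path`/`name` config group')

assert resolve_dataset_name("JetBrains-Research/lca-codegen-small") == "lca_codegen_small"
assert resolve_dataset_name(omegaconf.OmegaConf.create(
    {"path": "JetBrains-Research/lca-project-level-code-completion", "name": "small_context"}
)) == "small_context"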
2 changes: 1 addition & 1 deletion code_completion/eval/preprocess.py
@@ -52,7 +52,7 @@ def preprocess(args, composer_args):
     prepared_dataset_path = os.path.join(resolve_directories(args), f'model_inputs_composer_{args.composers}.json')
 
     preprocessor = get_preprocessor(args)(
-        filepath=args.dataset,
+        dataset_params=args.dataset,
         tokenizer_path=args.tokenizer,
         context_len_char=args.context_len_char,
         **composers
30 changes: 19 additions & 11 deletions code_completion/eval/preprocessors.py
@@ -3,6 +3,8 @@
 import os.path
 from typing import Dict, List, Any, Optional, Callable
 from dataclasses import dataclass
+
+import omegaconf
 import torch
 from datasets import load_dataset
 from joblib import Parallel, delayed
@@ -21,15 +23,15 @@ class TokenizerOutput:

 class PreprocessorBase:
     def __init__(self,
-                 filepath: str,
+                 dataset_params: str | dict,
                  tokenizer_path: str | None = None,
                  context_len_char: int = 60_000,
                  context_composer: Callable[[Dict[str, Any]], str] | None = None,
                  completion_composer: Callable[[Dict[str, Any]], str] | None = None,
                  data_source: str = 'hf',
                  ):
-        self.filepath = filepath
-        self.data: list[DatapointBase] = self._load_data(filepath)
+        self.dataset_params = dataset_params
+        self.data: list[DatapointBase] = self._load_data(dataset_params)
         self.prepared_data: Optional[List[Dict[str, Any]]] = None
         self.tokenizer_path = tokenizer_path
         self.context_composer = context_composer
@@ -124,10 +126,16 @@ def save_model_inputs(self, filepath='lca/code_generation/data/model_inputs.json
         with open(filepath, 'w') as f:
             json.dump(self.prepared_data, f)
 
-    def _load_data(self, path: str) -> list[DatapointBase]:
+    def _load_data(self, dataset_params: str | dict) -> list[DatapointBase]:
         if True: #self.data_source == 'hf':
             data = list()
-            hf_data = load_dataset(path, split='test')
+            if isinstance(dataset_params, str):
+                hf_data = load_dataset(dataset_params, split='test')
+            elif isinstance(dataset_params, omegaconf.dictconfig.DictConfig):
+                hf_data = load_dataset(split='test', **dataset_params)
+            else:
+                raise ValueError('check `config.dataset`, it must be string or dictionary')
+
             repos_list = list(set([hf_dp['repo'] for hf_dp in hf_data]))
             repos_map = {repo: repo_num for repo_num, repo in enumerate(repos_list)}
 
@@ -153,8 +161,8 @@ def _load_data(self, path: str) -> list[DatapointBase]:

 import youtokentome as yttm
 class FLPythonPreprocessor(PreprocessorBase):
-    def __init__(self, filepath, tokenizer_path=None, context_len_char=60_000, **composers):
-        super().__init__(filepath, tokenizer_path, context_len_char, **composers)
+    def __init__(self, dataset_params, tokenizer_path=None, context_len_char=60_000, **composers):
+        super().__init__(dataset_params, tokenizer_path, context_len_char, **composers)
         self.lang_sep_symbol = '₣'
         self.meta_info_sep_symbol = '𐌼'
         self.extension = '.py'
@@ -192,8 +200,8 @@ def _load_tokenizer(self, path):

 from transformers import AutoTokenizer
 class HFPreprocessor(PreprocessorBase):
-    def __init__(self, filepath, tokenizer_path, context_len_char=60_000, **composers):
-        super().__init__(filepath, tokenizer_path, context_len_char, **composers)
+    def __init__(self, dataset_params, tokenizer_path, context_len_char=60_000, **composers):
+        super().__init__(dataset_params, tokenizer_path, context_len_char, **composers)
         self.lang_sep_symbol = ''
         self.meta_info_sep_symbol = 'METASEP'
         self.extension = ''
@@ -222,8 +230,8 @@ def _load_tokenizer(self, path):


 class StarcoderPreprocessor(HFPreprocessor):
-    def __init__(self, filepath, tokenizer_path="bigcode/starcoder", context_len_char=60_000, **composers):
-        super().__init__(filepath, tokenizer_path, context_len_char, **composers)
+    def __init__(self, dataset_params, tokenizer_path="bigcode/starcoder", context_len_char=60_000, **composers):
+        super().__init__(dataset_params, tokenizer_path, context_len_char, **composers)
         self.lang_sep_symbol = 'LANGSEP'
         self.meta_info_sep_symbol = 'METASEP'
         self.extension = '.py'
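With the rename from `filepath` to `dataset_params`, every preprocessor subclass now accepts either the old dataset path string or the new `path`/`name` config group. A hypothetical usage sketch (the import path, tokenizer, and omission of composer kwargs are assumptions; instantiation downloads the `test` split):

from omegaconf import OmegaConf
from eval.preprocessors import HFPreprocessor

# Old-style: a plain Hugging Face dataset path.
preprocessor = HFPreprocessor(
    dataset_params="JetBrains-Research/lca-codegen-small",
    tokenizer_path="bigcode/starcoder",
)

# New-style: a config group with `path` and `name`, as produced by the dataset yamls above.
preprocessor = HFPreprocessor(
    dataset_params=OmegaConf.create({
        "path": "JetBrains-Research/lca-project-level-code-completion",
        "name": "small_context",
    }),
    tokenizer_path="bigcode/starcoder",
)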
