Skip to content

Commit

Permalink
Fix wandb logging, add new models
Browse files Browse the repository at this point in the history
  • Loading branch information
evgngl committed Jun 4, 2024
1 parent dcce154 commit d03c290
Show file tree
Hide file tree
Showing 11 changed files with 237 additions and 133 deletions.
26 changes: 23 additions & 3 deletions code_completion/eval/composers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
'naive': {'module': 'eval.composers', 'name': 'DummyComposer'},
# 'alphabetical': {'module': 'lca.code_generation.eval.composers', 'name': 'AlphabeticalComposer'},
'path_distance': {'module': 'eval.composers', 'name': 'PathDistanceComposer'},
'file_length': {'module': 'eval.composers', 'name': 'FileLengthComposer'},
# 'file_length': {'module': 'eval.composers', 'name': 'FileLengthComposer'},
'half_memory': {'module': 'eval.composers', 'name': 'HalfMemoryComposer'},
'function_class_mask_half': {'module': 'eval.composers', 'name': 'FuncClassComposer'},
'half_memory_path': {'module': 'eval.composers', 'name': 'HalfMemoryPathComposer'},
# 'function_class_mask_half': {'module': 'eval.composers', 'name': 'FuncClassComposer'},
'function_class_mask_one': {'module': 'eval.composers', 'name': 'FuncClassComposerOne'},
'imports_first': {'module': 'eval.composers', 'name': 'ImportsFirstComposer'},
# 'imports_first': {'module': 'eval.composers', 'name': 'ImportsFirstComposer'},
}


Expand Down Expand Up @@ -188,6 +189,25 @@ def context_composer(self, datapoint: DatapointBase) -> str:
return repo_metainfo + self.lang_sep_symbol.join(composed_content)


class HalfMemoryPathComposer(PathDistanceComposer, HalfMemoryComposer):
    """Composer that orders context files by path distance (via
    PathDistanceComposer) and halves each file's content (via
    HalfMemoryComposer) before concatenation.
    """

    def context_composer(self, datapoint: DatapointBase) -> str:
        """Build the prompt context string for a single datapoint.

        Context files are sorted by path distance to the completion file,
        each truncated by ``_forget_half``, and joined with the language
        separator; the completion file path (with no content) is appended
        last.  NOTE(review): the order is reversed — assumes
        ``_sort_filepathes`` returns closest-first, so the closest file
        ends up adjacent to the completion path; confirm against parent.
        """
        context = datapoint.get_context()
        completion = datapoint.get_completion()
        assert len(completion) == 1, 'Only one file should be completed'

        target_path = next(iter(completion))
        ordered_paths = self._sort_filepathes(target_path, list(context))

        parts = []
        for path in reversed(ordered_paths):
            parts.append(path + self.meta_info_sep_symbol + self._forget_half(context[path]))
        # The file to be completed goes last, with its path only.
        parts.append(target_path + self.meta_info_sep_symbol)

        header = f"{self.extension}{self.lang_sep_symbol}{datapoint.repo_name}{self.meta_info_sep_symbol}"
        return header + self.lang_sep_symbol.join(parts)


class FuncClassComposer(PathDistanceComposer):
@staticmethod
def _filter_func_class(code: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion code_completion/eval/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ language: 'python'

artifacts_dir: "data/code_generation/artifacts/"

wandb_project_name: ???
wandb_project_name: 'LCA_context_composer_choice'

do_generation: True
seed: 42
Expand Down
13 changes: 13 additions & 0 deletions code_completion/eval/config/params/deepseekcoder1b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Parameters for evaluating DeepSeek-Coder 1.3B (base).
# NOTE(review): nesting restored — with all keys at top level the duplicate
# `model:` entries make this invalid YAML; the *_params keys must own their
# sub-keys as mappings (matching how eval_pipeline.py unpacks them).
preprocess_params:
  model: "huggingface"                                # preprocessor family
  composers: "none"
  tokenizer: "deepseek-ai/deepseek-coder-1.3b-base"   # HF tokenizer checkpoint

inference_params:
  model: "deepseek-coder-1b"   # key into model_hub/model_registry.py
  seq_max_len: 16000           # max sequence length in tokens; context_len_char = 5 * this
  input_data_path: ""          # filled in at runtime by the pipeline
  context_max: -1              # presumably -1 means "unbounded context" — TODO confirm

eval_params:
  device: cuda
13 changes: 13 additions & 0 deletions code_completion/eval/config/params/deepseekcoder7b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Parameters for evaluating DeepSeek-Coder 6.7B (base).
# NOTE(review): nesting restored — with all keys at top level the duplicate
# `model:` entries make this invalid YAML; the *_params keys must own their
# sub-keys as mappings (matching how eval_pipeline.py unpacks them).
preprocess_params:
  model: "huggingface"                                # preprocessor family
  composers: "none"
  tokenizer: "deepseek-ai/deepseek-coder-6.7b-base"   # HF tokenizer checkpoint

inference_params:
  model: "deepseek-coder-7b"   # key into model_hub/model_registry.py
  seq_max_len: 16000           # max sequence length in tokens; context_len_char = 5 * this
  input_data_path: ""          # filled in at runtime by the pipeline
  context_max: -1              # presumably -1 means "unbounded context" — TODO confirm

eval_params:
  device: cuda
13 changes: 13 additions & 0 deletions code_completion/eval/config/params/starcoder2_3b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Parameters for evaluating StarCoder2 3B.
# NOTE(review): nesting restored — with all keys at top level the duplicate
# `model:` entries make this invalid YAML; the *_params keys must own their
# sub-keys as mappings (matching how eval_pipeline.py unpacks them).
preprocess_params:
  model: "huggingface"              # preprocessor family
  composers: "none"
  tokenizer: "bigcode/starcoder2-3b"  # HF tokenizer checkpoint

inference_params:
  model: "starcoder2-3b"       # key into model_hub/model_registry.py
  seq_max_len: 16000           # max sequence length in tokens; context_len_char = 5 * this
  input_data_path: ""          # filled in at runtime by the pipeline
  context_max: -1              # presumably -1 means "unbounded context" — TODO confirm

eval_params:
  device: cuda
15 changes: 10 additions & 5 deletions code_completion/eval/eval_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class PreprocessConfig:
# config_path: str # Path to composer configs
composers: str # One of COMPOSERS from lca.code_generation.eval.preprocess
out_dir: str # Where to save preprocessed dataset
context_len_char: int # How much do we need to crop context string 5*seq_max_len by default


@dataclass
Expand Down Expand Up @@ -83,6 +84,7 @@ def __init__(self, config, composers=COMPOSERS):

# eval_params["dataset_dir"] = inference_params["out_dir"]
self.preprocess_args = PreprocessConfig(dataset=config.dataset, out_dir=os.path.join(dataset_out_dir, 'in'),
context_len_char=5 * inference_params['seq_max_len'],
**preprocess_params)
self.inference_args = InferenceConfig(out_dir=os.path.join(dataset_out_dir, 'out'), **inference_params)
self.eval_args = EvalConfig(dataset_dir=self.inference_args.out_dir,
Expand All @@ -105,7 +107,7 @@ def run(self):
do_generation = self.config.do_generation
seed = self.config.seed
# Run Zero context scenario
wb_run = wandb.init(project=self.project_name, group=f"zero_context", )
wb_run = wandb.init(project=self.project_name, group=f"zero_context", name=f"zero_context")
results = list()
result_0 = self.run_zero_context()
results.append(result_0)
Expand Down Expand Up @@ -149,7 +151,8 @@ def run(self):
)
gen_scores, gen_results, em_difference, line_counts = evaluate_generation(self.generator_config)

wb_run.log(gen_scores | {'EM_difference': em_difference, 'Line Counts': line_counts})
wb_run.log(gen_scores | {'EM_difference': em_difference, 'Line Counts': line_counts,
"dataset": self.config.dataset, "model": self.inference_args.model})
wb_run.finish()
with open(os.path.join(self.out_dir, 'generation_scores.json'), 'w') as f:
json.dump(gen_results, f, indent=4)
Expand Down Expand Up @@ -177,10 +180,11 @@ def run_zero_context(self):
print(">>Evaluation...")
mean_ppl = evaluate(self.eval_args)

return {"perplexity": mean_ppl, "context": 0, "composer": "naive"} | lost_tokens
return {"perplexity": mean_ppl, "context": 0, "composer": "zero", "dataset": self.config.dataset,
"model": self.inference_args.model} | lost_tokens

def run_composer(self, composer, results):
wb_run = wandb.init(project=self.project_name, group=f"{composer} composer", )
wb_run = wandb.init(project=self.project_name, group=f"{composer} composer", name=f"{composer} composer")
self.preprocess_args.composers = composer
print(f'>>Preprocessing for {composer} composer...')
prepared_dataset_path = preprocess(self.preprocess_args, self.config.composers_config)
Expand All @@ -204,7 +208,8 @@ def run_composer(self, composer, results):
print(">>>>>>Evaluation...")
mean_ppl = evaluate(self.eval_args)
results.append({"perplexity": mean_ppl, "context": self.inference_args.context_max,
"composer": composer} | lost_tokens)
"composer": composer, "dataset": self.config.dataset,
"model": self.inference_args.model} | lost_tokens)
print(results[-1])
wb_run.log(results[-1])

Expand Down
1 change: 1 addition & 0 deletions code_completion/eval/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def preprocess(args, composer_args):
preprocessor = get_preprocessor(args)(
filepath=args.dataset,
tokenizer_path=args.tokenizer,
context_len_char=args.context_len_char,
**composers
)
preprocessor.prepare_model_input_parallel(dataset_path=prepared_dataset_path, num_workers=1) # Don't change num_workers
Expand Down
16 changes: 9 additions & 7 deletions code_completion/eval/preprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class PreprocessorBase:
def __init__(self,
filepath: str,
tokenizer_path: str | None = None,
context_len_char: int = 60_000,
context_composer: Callable[[Dict[str, Any]], str] | None = None,
completion_composer: Callable[[Dict[str, Any]], str] | None = None,
data_source: str = 'hf',
Expand All @@ -34,6 +35,7 @@ def __init__(self,
self.context_composer = context_composer
self.completion_composer = completion_composer
self.data_source = data_source
self.context_len_char = context_len_char

def compose_context(self, context: Dict[str, str]) -> str:
raise NotImplementedError
Expand Down Expand Up @@ -100,7 +102,7 @@ def tokenize(self, text) -> List[int]:
def tokenize_datapoint(self, datapoint: DatapointBase) -> TokenizerOutput:
# print(len(datapoint.context), len(datapoint.completion))
chunk_size = 1000 # size in lines
cropped_context = datapoint.context[-60_000:] # TODO: connect this to max_seq_len
cropped_context = datapoint.context[-self.context_len_char:] # TODO: connect this to max_seq_len
# context_lines = cropped_context.split('\n')
# context_chunks_by_lines = [context_lines[i:i+chunk_size] for i in range(len(context_lines)//chunk_size)]
# context_chunks = ['\n'.join(lines_chunk) for lines_chunk in context_chunks_by_lines]
Expand Down Expand Up @@ -151,8 +153,8 @@ def _load_data(self, path: str) -> list[DatapointBase]:

import youtokentome as yttm
class FLPythonPreprocessor(PreprocessorBase):
def __init__(self, filepath, tokenizer_path=None, **composers):
super().__init__(filepath, tokenizer_path, **composers)
def __init__(self, filepath, tokenizer_path=None, context_len_char=60_000, **composers):
super().__init__(filepath, tokenizer_path, context_len_char, **composers)
self.lang_sep_symbol = '₣'
self.meta_info_sep_symbol = '𐌼'
self.extension = '.py'
Expand Down Expand Up @@ -190,8 +192,8 @@ def _load_tokenizer(self, path):

from transformers import AutoTokenizer
class HFPreprocessor(PreprocessorBase):
def __init__(self, filepath, tokenizer_path, **composers):
super().__init__(filepath, tokenizer_path, **composers)
def __init__(self, filepath, tokenizer_path, context_len_char=60_000, **composers):
super().__init__(filepath, tokenizer_path, context_len_char, **composers)
self.lang_sep_symbol = ''
self.meta_info_sep_symbol = 'METASEP'
self.extension = ''
Expand Down Expand Up @@ -220,8 +222,8 @@ def _load_tokenizer(self, path):


class StarcoderPreprocessor(HFPreprocessor):
def __init__(self, filepath, tokenizer_path="bigcode/starcoder", **composers):
super().__init__(filepath, tokenizer_path, **composers)
def __init__(self, filepath, tokenizer_path="bigcode/starcoder", context_len_char=60_000, **composers):
super().__init__(filepath, tokenizer_path, context_len_char, **composers)
self.lang_sep_symbol = 'LANGSEP'
self.meta_info_sep_symbol = 'METASEP'
self.extension = '.py'
Expand Down
3 changes: 3 additions & 0 deletions code_completion/model_hub/model_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class ModelMetainfo:
'starcoderbase-7b': ModelMetainfo(builder=HFModelBuilder, checkpoint="bigcode/starcoderbase-7b"),
'starcoderbase': ModelMetainfo(builder=HFModelBuilder, checkpoint="bigcode/starcoderbase"),

'starcoder2-3b': ModelMetainfo(builder=HFModelBuilder, checkpoint="bigcode/starcoder2-3b"),

'starcoderbase-1b-4bit': ModelMetainfo(builder=HFModelBuilder4bit, checkpoint="bigcode/starcoderbase-1b"),
'starcoderbase-3b-4bit': ModelMetainfo(builder=HFModelBuilder4bit, checkpoint="bigcode/starcoderbase-3b"),
'starcoderbase-7b-4bit': ModelMetainfo(builder=HFModelBuilder4bit, checkpoint="bigcode/starcoderbase-7b"),
Expand All @@ -30,4 +32,5 @@ class ModelMetainfo:
'codellama-34b-4bit': ModelMetainfo(builder=HFModelBuilder4bit, checkpoint="codellama/CodeLlama-34b-hf"),

'deepseek-coder-1b': ModelMetainfo(builder=HFModelBuilder, checkpoint="deepseek-ai/deepseek-coder-1.3b-base"),
'deepseek-coder-7b': ModelMetainfo(builder=HFModelBuilder, checkpoint="deepseek-ai/deepseek-coder-6.7b-base"),
}
Loading

0 comments on commit d03c290

Please sign in to comment.