diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py
index 59a20e5e530fb..e3b51d2a9257c 100644
--- a/examples/llm/sft/hf.py
+++ b/examples/llm/sft/hf.py
@@ -45,8 +45,10 @@ def _create_dataloader(self, dataset, **kwargs) -> DataLoader:
             **kwargs1,
         )
 
+
 def mk_hf_dataset(tokenizer):
-    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
+    EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN
+
     def formatting_prompts_func(examples):
         alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
@@ -59,20 +61,25 @@ def formatting_prompts_func(examples):
 
 ### Instruction:
 {}
 
 ### Input:
 {}
 
 ### Response:
 {}"""
         instruction = examples["context"]
-        input = examples["question"]
-        output = examples["answers"]['text']
+        input = examples["question"]
+        output = examples["answers"]['text']
         if isinstance(output, list):
             output = output[0]
         text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
         ans = tokenizer(text)
         tokens = ans['input_ids']
-        return {'tokens': tokens, 'labels': tokens[1:] + [tokens[-1]], }
+        return {
+            'tokens': tokens,
+            'labels': tokens[1:] + [tokens[-1]],
+        }
 
     from datasets import load_dataset
-    dataset = load_dataset("rajpurkar/squad", split = "train")
-    dataset = dataset.map(formatting_prompts_func, batched = False, batch_size = 2)
+
+    dataset = load_dataset("rajpurkar/squad", split="train")
+    dataset = dataset.map(formatting_prompts_func, batched=False, batch_size=2)
     return dataset
 
+
 def squad(tokenizer) -> pl.LightningDataModule:
     return SquadDataModuleWithMbs(
         tokenizer=tokenizer,
@@ -83,13 +90,16 @@ def squad(tokenizer) -> pl.LightningDataModule:
         sanity_check_dist_workers=False,
     )
 
+
 class HfAutoModelPeft(llm.HfAutoModel):
     def configure_model(self):
         super().configure_model()
         self.model.eval()
         from lora import apply_lora_to_model
+
         apply_lora_to_model(self.model)
 
+
 if __name__ == '__main__':
     import argparse
 
@@ -119,8 +129,7 @@ def configure_model(self):
     llm.api.finetune(
         model=HfAutoModelPeft(args.model),
         data=llm.HfDatasetDataModule(
-            mk_hf_dataset(tokenizer.tokenizer),
-            pad_token_id=tokenizer.tokenizer.eos_token_id
+            mk_hf_dataset(tokenizer.tokenizer), pad_token_id=tokenizer.tokenizer.eos_token_id
         ),
         trainer=nl.Trainer(
             devices=args.devices,
diff --git a/examples/llm/sft/lora.py b/examples/llm/sft/lora.py
index 92dace0d73c81..0d590ed8d95b7 100644
--- a/examples/llm/sft/lora.py
+++ b/examples/llm/sft/lora.py
@@ -1,7 +1,9 @@
-import torch.nn as nn
-import torch
 import math
 
+import torch
+import torch.nn as nn
+
+
 class LoraLinear(nn.Module):
     def __init__(self, orig_linear, r=8, lora_alpha=32, lora_dropout=0.1):
         super(LoraLinear, self).__init__()
@@ -31,6 +33,7 @@ def forward(self, x):
         lora_res = lora_res @ self.lora_b.t()
         return res + lora_res * self.scale
 
+
 # Helper funcs
 def get_parent_module(model, module_name):
     print('get_parent_module module_name= ' + str(module_name))
@@ -40,6 +43,7 @@ def get_parent_module(model, module_name):
         parent = getattr(parent, name)
     return parent
 
+
 def apply_lora_to_model(model, r=8, lora_alpha=32, lora_dropout=0.1):
     for name, module in model.named_modules():
         if isinstance(module, nn.Linear) and '_proj' in name:
@@ -47,4 +51,4 @@ def apply_lora_to_model(model, r=8, lora_alpha=32, lora_dropout=0.1):
             target_attr = name.split('.')[-1]
             orig_lin = getattr(parent_module, target_attr)
             lora_linear = LoraLinear(orig_lin, r, lora_alpha, lora_dropout)
-            setattr(parent_module, target_attr, lora_linear)
\ No newline at end of file
+            setattr(parent_module, target_attr, lora_linear)
diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py
index ec3c0252f6555..6dde88079567a 100644
--- a/nemo/collections/llm/__init__.py
+++ b/nemo/collections/llm/__init__.py
@@ -21,10 +21,10 @@
 from nemo.collections.llm.gpt.data import (
     DollyDataModule,
     FineTuningDataModule,
+    HfDatasetDataModule,
     MockDataModule,
     PreTrainingDataModule,
     SquadDataModule,
-    HfDatasetDataModule,
 )
 from nemo.collections.llm.gpt.data.api import dolly, mock, squad
 from nemo.collections.llm.gpt.model import (
diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py
index 4d3d21fb654b2..f4e97d91e5cd5 100644
--- a/nemo/collections/llm/gpt/data/__init__.py
+++ b/nemo/collections/llm/gpt/data/__init__.py
@@ -14,10 +14,10 @@
 
 from nemo.collections.llm.gpt.data.dolly import DollyDataModule
 from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule
+from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
 from nemo.collections.llm.gpt.data.mock import MockDataModule
 from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule
 from nemo.collections.llm.gpt.data.squad import SquadDataModule
-from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
 
 __all__ = [
     "FineTuningDataModule",
@@ -25,5 +25,5 @@
     "DollyDataModule",
     "MockDataModule",
     "PreTrainingDataModule",
-    "HfDatasetDataModule"
+    "HfDatasetDataModule",
 ]
diff --git a/nemo/collections/llm/gpt/data/hf_dataset.py b/nemo/collections/llm/gpt/data/hf_dataset.py
index 083f5fbc52352..7e70a970913e6 100644
--- a/nemo/collections/llm/gpt/data/hf_dataset.py
+++ b/nemo/collections/llm/gpt/data/hf_dataset.py
@@ -21,14 +21,14 @@ class HfDatasetDataModule(pl.LightningDataModule):
     def __init__(
         self,
         dataset,
-        num_workers = 2,
-        pin_memory = True,
-        persistent_workers = True,
-        micro_batch_size = 2,
-        global_batch_size = 2,
-        pad_token_id = 0,
-        use_mcore_sampler = False,
-        mcore_dataloader_type = 'cyclic',
+        num_workers=2,
+        pin_memory=True,
+        persistent_workers=True,
+        micro_batch_size=2,
+        global_batch_size=2,
+        pad_token_id=0,
+        use_mcore_sampler=False,
+        mcore_dataloader_type='cyclic',
     ) -> None:
         super().__init__()
         assert pad_token_id is not None
@@ -56,10 +56,8 @@ def extract_key_from_dicts(batch, key):
 
         def pad_within_micro(batch, pad_token_id):
             max_len = max(map(len, batch))
-            return [
-                item + [pad_token_id] * (max_len - len(item))
-                for item in batch
-            ]
+            return [item + [pad_token_id] * (max_len - len(item)) for item in batch]
+
         return {
             key: batchify(
                 torch.LongTensor(
@@ -103,4 +101,3 @@ def train_dataloader(self, collate_fn=None):
             rank=rank,
             world_size=world_size,
         )
-
diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
index a9f53ea4158f9..f29756dc05a7a 100644
--- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
+++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
@@ -18,8 +18,8 @@
 from transformers import AutoModelForCausalLM
 
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
-from nemo.lightning import io
 from nemo.collections.llm import fn
+from nemo.lightning import io
 
 
 def _extract_non_bias_params(model):
@@ -66,6 +66,7 @@ def configure_model(self):
             self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype='auto')
         else:
             from transformers import AutoConfig
+
             config = AutoConfig.from_pretained(self.model_name)
             self.model = AutoModelForCausalLM.from_config(config)
         self.model.train()
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 18db9b164d6aa..dfc91eb764234 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import re
-import torch
 import math
+import re
 from dataclasses import dataclass, field
 from typing import List, Literal
 
+import torch
 from megatron.core import parallel_state
 from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear
 from torch import nn
@@ -107,6 +107,7 @@ def forward(self, x):
         lora_res = self.dropout(lora_res)
         return res + lora_res
 
+
 @dataclass
 class LoRA(PEFT):
     """