[GenAI] pack GenAI core package (#7246)

* update * enable llama3_2 * fix tests * pack GenAI core
dotnet · Sep 27, 2024 · be1e428 · be1e428
1 parent 817a77f
commit be1e428
Show file tree

Hide file tree

Showing 15 changed files with 591 additions and 21 deletions.
diff --git a/...rosoft.ML.GenAI.Samples/Llama/LLaMA3_1.cs → ...oft.ML.GenAI.Samples/Llama/LlamaSample.cs b/...rosoft.ML.GenAI.Samples/Llama/LLaMA3_1.cs → ...oft.ML.GenAI.Samples/Llama/LlamaSample.cs
@@ -16,26 +16,25 @@ namespace Microsoft.ML.GenAI.Samples.Llama;
 
 internal class LlamaSample
 {
-    public static async void Run()
+    public static async Task RunLlama(string weightFolder, string checkPointName = "model.safetensors.index.json")
     {
         var device = "cuda";
         if (device == "cuda")
         {
             torch.InitializeDeviceType(DeviceType.CUDA);
         }
 
-        var defaultType = ScalarType.Float16;
+        var defaultType = ScalarType.BFloat16;
         torch.manual_seed(1);
         torch.set_default_dtype(defaultType);
-        var weightFolder = @"C:\Users\xiaoyuz\source\repos\Meta-Llama-3.1-8B-Instruct";
         var configName = "config.json";
         var originalWeightFolder = Path.Combine(weightFolder, "original");
 
         Console.WriteLine("Loading Llama from huggingface model weight folder");
         var stopWatch = System.Diagnostics.Stopwatch.StartNew();
         stopWatch.Start();
         var tokenizer = LlamaTokenizerHelper.FromPretrained(originalWeightFolder);
-        var model = LlamaForCausalLM.FromPretrained(weightFolder, configName, layersOnTargetDevice: -1);
+        var model = LlamaForCausalLM.FromPretrained(weightFolder, configName, checkPointName: checkPointName, layersOnTargetDevice: 26, quantizeToInt8: true);
 
         var pipeline = new CausalLMPipeline<TiktokenTokenizer, LlamaForCausalLM>(tokenizer, model, device);
 

diff --git a/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs b/docs/samples/Microsoft.ML.GenAI.Samples/Program.cs
@@ -1,5 +1,4 @@
 // See https://aka.ms/new-console-template for more information
-using Microsoft.ML.GenAI.Samples.Mistral;
-using Microsoft.ML.GenAI.Samples.Phi3Mini;
+using Microsoft.ML.GenAI.Samples.Llama;
 
-await Mistral_7B_Instruct.WeatherChatAsync();
+await LlamaSample.RunLlama(@"C:\Users\xiaoyuz\source\repos\Llama-3.2-3B-Instruct");
diff --git a/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj b/src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
@@ -2,7 +2,7 @@
 
   <PropertyGroup>
     <TargetFrameworks>net6.0;net8.0</TargetFrameworks>
-    <IsPackable>false</IsPackable>
+    <IsPackable>true</IsPackable>
     <Nullable>enable</Nullable>
     <LangVersion>preview</LangVersion>
   </PropertyGroup>

diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaConfig.cs
@@ -45,11 +45,15 @@ static LlamaConfig()
         var llama3_1_8b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-8B-Instruct.json");
         var llama3_1_70b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-70B-Instruct.json");
         var llama3_1_405b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.1-405B-Instruct.json");
+        var llama3_2_1b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.2-1B-Instruct.json");
+        var llama3_2_3b_content = Utils.GetEmbeddedResource("Microsoft.ML.GenAI.LLaMA.Resource.Config.meta-llama-3.2-3B-Instruct.json");
 #pragma warning restore MSML_ParameterLocalVarName // Parameter or local variable name not standard
 
         Llama3_1_8B_Instruct = JsonSerializer.Deserialize<LlamaConfig>(llama3_1_8b_content) ?? throw new ArgumentNullException(nameof(llama3_1_8b_content));
         Llama3_1_70B_Instruct = JsonSerializer.Deserialize<LlamaConfig>(llama3_1_70b_content) ?? throw new ArgumentNullException(nameof(llama3_1_70b_content));
         Llama3_1_405B_Instruct = JsonSerializer.Deserialize<LlamaConfig>(llama3_1_405b_content) ?? throw new ArgumentNullException(nameof(llama3_1_405b_content));
+        Llama3_2_1B_Instruct = JsonSerializer.Deserialize<LlamaConfig>(llama3_2_1b_content) ?? throw new ArgumentNullException(nameof(llama3_2_1b_content));
+        Llama_3_2_3B_Instruct = JsonSerializer.Deserialize<LlamaConfig>(llama3_2_3b_content) ?? throw new ArgumentNullException(nameof(llama3_2_3b_content));
     }
 
 #pragma warning disable MSML_GeneralName // This name should be PascalCased
@@ -67,6 +71,16 @@ static LlamaConfig()
     /// The llama-3.1-405B-Instruct configuration created from https://huggingface.co/meta-llama/Meta-Llama-3.1-405B.
     /// </summary>
     public static LlamaConfig Llama3_1_405B_Instruct { get; }
+
+    /// <summary>
+    /// The llama-3.2-3B-Instruct configuration created from https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct.
+    /// </summary>
+    public static LlamaConfig Llama_3_2_3B_Instruct { get; }
+
+    /// <summary>
+    /// The llama-3.2-1B-Instruct configuration created from https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct.
+    /// </summary>
+    public static LlamaConfig Llama3_2_1B_Instruct { get; }
 #pragma warning restore MSML_GeneralName // This name should be PascalCased
 
     [JsonPropertyName("attention_bias")]

diff --git a/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs b/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs
@@ -8,6 +8,7 @@
 using Microsoft.ML.GenAI.Core.Extension;
 using Microsoft.ML.GenAI.LLaMA.Module;
 using TorchSharp;
+using TorchSharp.Modules;
 using TorchSharp.PyBridge;
 using static TorchSharp.torch;
 
@@ -19,7 +20,7 @@ public class LlamaForCausalLM : nn.Module<CausalLMModelInput, CausalLMModelOutpu
     private readonly int _vocabSize;
 
 #pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
-    private readonly GenAILinear lm_head;
+    private readonly Linear lm_head;
     private readonly LlamaModel model;
 #pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format
 
@@ -30,9 +31,29 @@ public LlamaForCausalLM(LlamaConfig config, string? device = null)
         _vocabSize = config.VocabSize;
 
         model = new LlamaModel(config, device);
-        lm_head = new GenAILinear(config.HiddenSize, config.VocabSize, hasBias: false);
 
-        this.RegisterComponents();
+        // When tie word embeddings is true, lm_head uses the same weight as the embedding layer.
+        // Therefore, lm_head is created after RegisterComponents() in that case, and its weight
+        // is loaded from the embedding layer after the model is loaded.
+        if (config.TieWordEmbeddings)
+        {
+            this.RegisterComponents();
+            lm_head = nn.Linear(config.HiddenSize, config.VocabSize, hasBias: false, dtype: config.DType);
+        }
+        else
+        {
+            lm_head = nn.Linear(config.HiddenSize, config.VocabSize, hasBias: false, dtype: config.DType);
+            this.RegisterComponents();
+        }
+
+    }
+
+    private void TieWordEmbeddings()
+    {
+        var embeddingWeight = model.Embedding.state_dict();
+        this.lm_head.load_state_dict(embeddingWeight);
+
+        this.lm_head.to(device: model.Embedding.weight!.device);
     }
 
 #pragma warning disable MSML_GeneralName // This name should be PascalCased
@@ -61,6 +82,11 @@ public static LlamaForCausalLM FromPretrained(
 
         model.LoadSafeTensors(modelFolder, checkPointName);
         model = model.to(device);
+        if (modelConfig.TieWordEmbeddings)
+        {
+            model.TieWordEmbeddings();
+        }
+
 
         return model;
     }
@@ -107,8 +133,22 @@ public static LlamaForCausalLM FromPretrained(
 
         model.LoadSafeTensors(modelFolder, checkPointName);
 
+        if (quantizeToInt8)
+        {
+            model.ToInt8QuantizeModule();
+        }
+        else if (quantizeToInt4)
+        {
+            model.ToInt4QuantizeModule();
+        }
+
         model = model.ToDynamicLoadingModel(deviceMap, targetDevice);
 
+        if (modelConfig.TieWordEmbeddings)
+        {
+            model.TieWordEmbeddings();
+        }
+
         torch.set_default_device(originalDefaultDevice);
 
         return model;

diff --git a/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj b/src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj
@@ -13,13 +13,16 @@
   </ItemGroup>
 
   <ItemGroup>
-    <ProjectReference Include="..\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj" PrivateAssets="all" />
-    <ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" PrivateAssets="all" />
-    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
+    <ProjectReference Include="..\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj" />
   </ItemGroup>
 
   <ItemGroup>
     <EmbeddedResource Include="Resource\Config\*.json" />
   </ItemGroup>
 
+  <ItemGroup>
+    <None Remove="Resource\Config\meta-llama-3.2-1B-Instruct.json" />
+    <None Remove="Resource\Config\meta-llama-3.2-3B-Instruct.json" />
+  </ItemGroup>
+
 </Project>
diff --git a/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs b/src/Microsoft.ML.GenAI.LLaMA/Module/LlamaModel.cs
@@ -30,7 +30,7 @@ public LlamaModel(LlamaConfig config, string? device = null)
         this._paddingIdx = config.PadTokenId;
         this._vocabSize = config.VocabSize;
         var headDim = config.HiddenSize / config.NumAttentionHeads;
-        this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType, device: device);
+        this.embed_tokens = nn.Embedding(config.VocabSize, config.HiddenSize, padding_idx: this._paddingIdx, dtype: config.DType);
         this.layers = new ModuleList<LlamaDecoderLayer>();
 
         for (int i = 0; i < config.NumHiddenLayers; i++)
@@ -47,6 +47,8 @@ public LlamaModel(LlamaConfig config, string? device = null)
         };
     }
 
+    public Embedding Embedding => this.embed_tokens;
+
 #pragma warning disable MSML_GeneralName // This name should be PascalCased
     public override CausalLMModelOutput forward(CausalLMModelInput input)
 #pragma warning restore MSML_GeneralName // This name should be PascalCased

diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.2-1B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.2-1B-Instruct.json
@@ -0,0 +1,35 @@
+{
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.2-3B-Instruct.json b/src/Microsoft.ML.GenAI.LLaMA/Resource/Config/meta-llama-3.2-3B-Instruct.json
@@ -0,0 +1,35 @@
+{
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 3072,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 24,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 32.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "use_cache": true,
+  "vocab_size": 128256
+}
diff --git a/src/Microsoft.ML.GenAI.Mistral/Microsoft.ML.GenAI.Mistral.csproj b/src/Microsoft.ML.GenAI.Mistral/Microsoft.ML.GenAI.Mistral.csproj
@@ -13,9 +13,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <ProjectReference Include="..\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj" PrivateAssets="all" />
-    <ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" PrivateAssets="all" />
-    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
+    <ProjectReference Include="..\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj" />
   </ItemGroup>
 
   <ItemGroup>

diff --git a/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj b/src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj
@@ -13,9 +13,7 @@
   </ItemGroup>
 
   <ItemGroup>
-    <ProjectReference Include="..\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj" PrivateAssets="all" />
-    <ProjectReference Include="..\Microsoft.ML.Core\Microsoft.ML.Core.csproj" PrivateAssets="all" />
-    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
+    <ProjectReference Include="..\Microsoft.ML.GenAI.Core\Microsoft.ML.GenAI.Core.csproj" />
   </ItemGroup>
 
   <ItemGroup>