Misc Changes (#7264)
* Add o1 model support

* Replace Usage of tuples with Range in EncodedToken and Remove TorchSharp Range/Index implementation

* Rename SentencePieceBpeTokenizer to SentencePieceTokenizer to allow adding more models to it in the future.

* Make Tokenizer.Decode return a non-nullable string

* Make BPE tokenizer support added tokens

* Add the net9 package source to the NuGet.config file

* Rename TiktokenPreTokenizer to RegexPreTokenizer
tarekgh authored Oct 11, 2024
1 parent e794342 commit 823fc17
Showing 51 changed files with 433 additions and 565 deletions.
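Of the bullets above, the renames and the Decode change are the ones existing callers will notice first. A minimal migration sketch follows (my own illustration, not code from this commit; the Tokenizer.Decode call is taken from the hunks below, everything else is a placeholder):

using System;
using System.Collections.Generic;
using Microsoft.ML.Tokenizers;

static class MigrationSketch
{
    // Renames in this commit (old -> new), as seen in the file diffs below:
    //   SentencePieceBpeTokenizer -> SentencePieceTokenizer
    //   TiktokenPreTokenizer      -> RegexPreTokenizer
    //
    // Tokenizer.Decode now returns a non-nullable string, so the defensive
    // null handling used before the change can be dropped.
    public static string DecodeAll(Tokenizer tokenizer, IEnumerable<int> ids)
    {
        // Before: tokenizer.Decode(ids) ?? throw new InvalidOperationException("Failed to decode token ids");
        return tokenizer.Decode(ids);
    }

    // The o1 bullet presumably extends the Tiktoken model map; if so, something like
    //   TiktokenTokenizer.CreateForModel("o1-mini")
    // should now resolve, though the exact model names are not visible in the hunks shown here.
}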
4 changes: 4 additions & 0 deletions NuGet.config
@@ -15,6 +15,7 @@
<add key="mlnet-assets" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/machinelearning-assets/nuget/v3/index.json" />
<add key="dotnet-libraries-transport" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet-libraries-transport/nuget/v3/index.json" />
<add key="dotnet8" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet8/nuget/v3/index.json" />
<add key="dotnet9" value="https://pkgs.dev.azure.com/dnceng/public/_packaging/dotnet9/nuget/v3/index.json" />
</packageSources>
<packageSourceMapping>
<packageSource key="dotnet-public">
@@ -47,6 +48,9 @@
<packageSource key="dotnet8">
<package pattern="*" />
</packageSource>
<packageSource key="dotnet9">
<package pattern="*" />
</packageSource>
</packageSourceMapping>
<disabledPackageSources>
<clear />
(file name not shown)
@@ -8,6 +8,7 @@

<!-- Remove once we have resolved the TorchSharp issue. -->
<ResolveAssemblyWarnOrErrorOnTargetArchitectureMismatch>None</ResolveAssemblyWarnOrErrorOnTargetArchitectureMismatch>
+<SuppressTfmSupportBuildWarnings>true</SuppressTfmSupportBuildWarnings>
</PropertyGroup>

<ItemGroup>
1 change: 1 addition & 0 deletions eng/Versions.props
@@ -41,6 +41,7 @@
<GoogleProtobufVersion>3.27.1</GoogleProtobufVersion>
<LightGBMVersion>3.3.5</LightGBMVersion>
<MicrosoftBclHashCodeVersion>1.1.1</MicrosoftBclHashCodeVersion>
+<MicrosoftBclMemoryVersion>9.0.0-rc.1.24431.7</MicrosoftBclMemoryVersion>
<MicrosoftCodeAnalysisAnalyzersVersion>3.3.4</MicrosoftCodeAnalysisAnalyzersVersion>
<MicrosoftCodeAnalysisCSharpVersion>4.9.2</MicrosoftCodeAnalysisCSharpVersion>
<MicrosoftDotNetInteractiveVersion>1.0.0-beta.24375.2</MicrosoftDotNetInteractiveVersion>
(file name not shown)
@@ -4,9 +4,10 @@
<TargetFramework>net6.0</TargetFramework>
<IsPackable>false</IsPackable>
<NoWarn>$(NoWarn)</NoWarn>

<!-- Remove once we have resolved the TorchSharp issue. -->
<ResolveAssemblyWarnOrErrorOnTargetArchitectureMismatch>None</ResolveAssemblyWarnOrErrorOnTargetArchitectureMismatch>
+<SuppressTfmSupportBuildWarnings>true</SuppressTfmSupportBuildWarnings>
</PropertyGroup>

<ItemGroup>
4 changes: 4 additions & 0 deletions src/Microsoft.ML.GenAI.Core/Microsoft.ML.GenAI.Core.csproj
@@ -7,6 +7,10 @@
<LangVersion>preview</LangVersion>
</PropertyGroup>

+<PropertyGroup Condition="'$(TargetFramework)' == 'net6.0'">
+<SuppressTfmSupportBuildWarnings>true</SuppressTfmSupportBuildWarnings>
+</PropertyGroup>

<ItemGroup>
<PackageReference Include="AutoGen.Core" Version="$(AutoGenVersion)" />
<PackageReference Include="Microsoft.SemanticKernel.Abstractions" Version="$(SemanticKernelVersion)" />
6 changes: 3 additions & 3 deletions src/Microsoft.ML.GenAI.Core/Pipeline/CausalLMPipeline.cs
@@ -255,7 +255,7 @@ public virtual IEnumerable<string> GenerateStreaming(

return tokens
// Skip the first _ token automatically added by tokenizer
-.Where(t => t.Offset != (0, 0))
+.Where(t => !t.Offset.Equals(new Range(0, 0)))
.Select(t => t.Id)
.ToArray();
}));
@@ -268,13 +268,13 @@ public virtual IEnumerable<string> GenerateStreaming(
var tokenIds = token[0].to_type(ScalarType.Int32).data<int>().ToArray();
var duplicateTokenString = this.Tokenizer switch
{
-SentencePieceBpeTokenizer bpeTokenizer => bpeTokenizer.Decode(tokenIds.Concat(tokenIds), considerSpecialTokens: true) ?? throw new InvalidOperationException("Failed to decode token ids"),
+SentencePieceTokenizer bpeTokenizer => bpeTokenizer.Decode(tokenIds.Concat(tokenIds), considerSpecialTokens: true) ?? throw new InvalidOperationException("Failed to decode token ids"),
_ => this.Tokenizer.Decode(tokenIds.Concat(tokenIds)) ?? throw new InvalidOperationException("Failed to decode token ids"),
};

var tokenString = this.Tokenizer switch
{
-SentencePieceBpeTokenizer bpeTokenizer => bpeTokenizer.Decode(tokenIds, considerSpecialTokens: true) ?? throw new InvalidOperationException("Failed to decode token ids"),
+SentencePieceTokenizer bpeTokenizer => bpeTokenizer.Decode(tokenIds, considerSpecialTokens: true) ?? throw new InvalidOperationException("Failed to decode token ids"),
_ => this.Tokenizer.Decode(tokenIds) ?? throw new InvalidOperationException("Failed to decode token ids"),
};

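Two details in this hunk are easy to miss: System.Range does not define == or != operators, which is why the old tuple test t.Offset != (0, 0) becomes !t.Offset.Equals(new Range(0, 0)), and the renamed SentencePieceTokenizer keeps the same Decode(ids, considerSpecialTokens) overload. Below is a standalone sketch of the filter, under the assumption that a zero-width offset at position 0 only ever marks the token the tokenizer injects by itself (class and method names are mine):

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Tokenizers;

static class TokenFiltering
{
    // Keeps the ids of tokens that map back to real input text, dropping the
    // zero-width token the tokenizer inserts at the start of the sequence.
    public static int[] ToTokenIds(IReadOnlyList<EncodedToken> tokens) =>
        tokens.Where(t => !t.Offset.Equals(new Range(0, 0)))
              .Select(t => t.Id)
              .ToArray();
}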
2 changes: 1 addition & 1 deletion src/Microsoft.ML.GenAI.LLaMA/LlamaTokenizerHelper.cs
@@ -49,7 +49,7 @@ public static TiktokenTokenizer FromPretrained(
string modelFile = "tokenizer.model")
{
var modelFilePath = Path.Join(modelWeightFolder, modelFile);
-var preTokenizer = new TiktokenPreTokenizer(new Regex(_re), _specialTokens);
+var preTokenizer = new RegexPreTokenizer(new Regex(_re), _specialTokens);
return TiktokenTokenizer.Create(File.OpenRead(modelFilePath), preTokenizer, normalizer: null, specialTokens: _specialTokens);
}
}
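A hypothetical call site for the helper above (the model folder is a placeholder, the namespace is assumed from the project name, and the default "tokenizer.model" file name comes from the signature in the hunk):

using System.Collections.Generic;
using Microsoft.ML.GenAI.LLaMA;
using Microsoft.ML.Tokenizers;

// Loads tokenizer.model from the given folder and pre-tokenizes with the new RegexPreTokenizer.
TiktokenTokenizer tokenizer = LlamaTokenizerHelper.FromPretrained("/models/Meta-Llama-3.1-8B-Instruct");

IReadOnlyList<int> ids = tokenizer.EncodeToIds("Hello, Llama!");
string roundTrip = tokenizer.Decode(ids);   // non-nullable string after this commit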
4 changes: 4 additions & 0 deletions src/Microsoft.ML.GenAI.LLaMA/Microsoft.ML.GenAI.LLaMA.csproj
@@ -7,6 +7,10 @@
<IsPackable>true</IsPackable>
</PropertyGroup>

+<PropertyGroup Condition="'$(TargetFramework)' == 'net6.0'">
+<SuppressTfmSupportBuildWarnings>true</SuppressTfmSupportBuildWarnings>
+</PropertyGroup>

<ItemGroup>
<PackageReference Include="TorchSharp.PyBridge" Version="$(TorchSharpPyBridgeVersion)" />
<PackageReference Include="TorchSharp" Version="$(TorchSharpVersion)" />
(file name not shown)
@@ -7,6 +7,10 @@
<IsPackable>true</IsPackable>
</PropertyGroup>

+<PropertyGroup Condition="'$(TargetFramework)' == 'net6.0'">
+<SuppressTfmSupportBuildWarnings>true</SuppressTfmSupportBuildWarnings>
+</PropertyGroup>

<ItemGroup>
<PackageReference Include="TorchSharp.PyBridge" Version="$(TorchSharpPyBridgeVersion)" />
<PackageReference Include="TorchSharp" Version="$(TorchSharpVersion)" />
6 changes: 5 additions & 1 deletion src/Microsoft.ML.GenAI.Phi/Microsoft.ML.GenAI.Phi.csproj
@@ -7,6 +7,10 @@
<IsPackable>true</IsPackable>
</PropertyGroup>

+<PropertyGroup Condition="'$(TargetFramework)' == 'net6.0'">
+<SuppressTfmSupportBuildWarnings>true</SuppressTfmSupportBuildWarnings>
+</PropertyGroup>

<ItemGroup>
<PackageReference Include="TorchSharp.PyBridge" Version="$(TorchSharpPyBridgeVersion)" />
<PackageReference Include="TorchSharp" Version="$(TorchSharpVersion)" />
@@ -23,5 +27,5 @@
<ItemGroup>
<EmbeddedResource Include="Resource\Config\*.json" />
</ItemGroup>

</Project>
6 changes: 4 additions & 2 deletions src/Microsoft.ML.Tokenizers/EncodedToken.cs
@@ -2,6 +2,8 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

+using System;

namespace Microsoft.ML.Tokenizers
{
/// <summary>
@@ -23,15 +25,15 @@ public readonly struct EncodedToken
/// <summary>
/// Gets the offset mapping to the original string.
/// </summary>
-public (int Index, int Length) Offset { get; }
+public Range Offset { get; }

/// <summary>
/// Construct a new Token object using the token value, Id, and the offset mapping to the original string.
/// </summary>
/// <param name="id">The Id value associated to the token.</param>
/// <param name="value">The token string value.</param>
/// <param name="offset">The offset mapping to the original string.</param>
-public EncodedToken(int id, string value, (int, int) offset)
+public EncodedToken(int id, string value, Range offset)
{
Id = id;
Offset = offset;
1 change: 1 addition & 0 deletions src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
@@ -23,6 +23,7 @@

<ItemGroup Condition="'$(TargetFramework)' == 'netstandard2.0'">
<PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" />
<PackageReference Include="Microsoft.Bcl.Memory" Version="$(MicrosoftBclMemoryVersion)" />
</ItemGroup>

</Project>
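Reading the last two diffs together: EncodedToken.Offset is now a System.Range (presumably mapping the old (Index, Length) to Index..Index+Length, so End is exclusive), and the Microsoft.Bcl.Memory reference looks like what supplies Index and Range on netstandard2.0, where they are not part of the base library. A small consumption sketch with made-up values:

using System;
using Microsoft.ML.Tokenizers;

string text = "Hello world";

// 0..5 covers "Hello"; the id is illustrative only.
var token = new EncodedToken(id: 42, value: "Hello", offset: new Range(0, 5));

int start = token.Offset.Start.Value;                             // was token.Offset.Index
int length = token.Offset.End.Value - token.Offset.Start.Value;   // was token.Offset.Length
string surface = text[token.Offset];                              // the range indexer recovers the original slice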
(remaining changed files not shown)
