Skip to content

Commit

Permalink
Add support for GPT-4o
Browse files Browse the repository at this point in the history
  • Loading branch information
yethee committed May 14, 2024
1 parent f512721 commit ab36387
Show file tree
Hide file tree
Showing 4 changed files with 200,032 additions and 11 deletions.
7 changes: 7 additions & 0 deletions src/EncoderProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,21 @@ final class EncoderProvider implements ResetInterface
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken',
'pat' => '/(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/u',
],
'o200k_base' => [
'vocab' => 'https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken',
'pat' => '/[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n\/]*|\s*[\r\n]+|\s+(?!\S)|\s+/u',
],
];
/**
 * Encoding name keyed by model-name prefix (each key ends with a dash).
 *
 * NOTE(review): the code that consults this table is outside this view;
 * presumably a versioned model name such as `gpt-4o-2024-05-13` is compared
 * against these keys with a starts-with test — confirm against the lookup.
 * `gpt-4o-` and `gpt-4-` are not prefixes of one another, so under such a
 * test neither entry can shadow the other regardless of order.
 */
private const MODEL_PREFIX_TO_ENCODING = [
'gpt-4o-' => 'o200k_base',
'gpt-4-' => 'cl100k_base',
'gpt-3.5-turbo-' => 'cl100k_base',
];
private const MODEL_TO_ENCODING = [
'gpt-4o' => 'o200k_base',
'gpt-4' => 'cl100k_base',
'gpt-3.5-turbo' => 'cl100k_base',
'gpt-3.5' => 'cl100k_base',
'text-davinci-003' => 'p50k_base',
'text-davinci-002' => 'p50k_base',
'text-davinci-001' => 'r50k_base',
Expand Down
2 changes: 2 additions & 0 deletions tests/EncoderProviderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,7 @@ public static function getEncoderForModelProvider(): iterable
yield 'text-davinci-003' => ['text-davinci-003', 'p50k_base'];
yield 'text-davinci-edit-001' => ['text-davinci-edit-001', 'p50k_edit'];
yield 'gpt-3.5-turbo-0301' => ['gpt-3.5-turbo-0301', 'cl100k_base'];
yield 'gpt-4-32k' => ['gpt-4-32k', 'cl100k_base'];
yield 'gpt-4o-2024-05-13' => ['gpt-4o-2024-05-13', 'o200k_base'];
}
}
36 changes: 25 additions & 11 deletions tests/EncoderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,26 @@

final class EncoderTest extends TestCase
{
/**
 * Asserts that the encoder obtained for `$encoding` turns `$text` into exactly `$tokens`.
 *
 * Datasets come from provideDataForFlatTokenization(), so the encoding is chosen
 * per dataset rather than being fixed inside the test.
 *
 * @param non-empty-string $encoding encoding name handed to self::getEncoder()
 * @param list<int>        $tokens   expected token ids, in order
 */
#[DataProvider('provideDataForFlatTokenization')]
public function testEncode(string $text, string $encoding, array $tokens): void
{
    $encoder = self::getEncoder($encoding);

    self::assertSame($tokens, $encoder->encode($text));
}

/**
 * Asserts that the encoder obtained for `$encoding` turns `$tokens` back into exactly `$text`.
 *
 * Uses the same datasets as testEncode(), read in the opposite direction.
 *
 * @param non-empty-string $encoding encoding name handed to self::getEncoder()
 * @param list<int>        $tokens   token ids to decode, in order
 */
#[DataProvider('provideDataForFlatTokenization')]
public function testDecode(string $text, string $encoding, array $tokens): void
{
    $encoder = self::getEncoder($encoding);

    self::assertSame($text, $encoder->decode($tokens));
}
Expand All @@ -41,19 +47,27 @@ public function testEncodeInChunks(Encoder $encoder, string $text, int $maxToken
}

/**
 * Datasets for testEncode() and testDecode().
 *
 * Every dataset is a triple: the plain text, the encoding name, and the token
 * ids that text maps to under that encoding. The dataset label carries the
 * encoding name in brackets so the same text can appear once per encoding
 * without its key colliding.
 *
 * @return iterable<array{string, string, list<int>}>
 *
 * @psalm-suppress PossiblyUnusedMethod
 */
public static function provideDataForFlatTokenization(): iterable
{
    yield '[cl100k_base] hello world' => ['hello world', 'cl100k_base', [15339, 1917]];

    yield '[cl100k_base] привет мир' => ['привет мир', 'cl100k_base', [8164, 2233, 28089, 8341, 11562, 78746]];

    yield '[cl100k_base] emoji' => ['🌶', 'cl100k_base', [9468, 234, 114]];

    yield '[cl100k_base] new line character' => [".\n", 'cl100k_base', [627]];

    yield '[o200k_base] hello world' => ['hello world', 'o200k_base', [24912, 2375]];

    yield '[o200k_base] привет мир' => ['привет мир', 'o200k_base', [9501, 131903, 37934]];

    yield '[o200k_base] emoji' => ['🌶', 'o200k_base', [64364, 114]];

    yield '[o200k_base] new line character' => [".\n", 'o200k_base', [558]];
}

/**
Expand Down
Loading

0 comments on commit ab36387

Please sign in to comment.