Commit

chunker: switched to tiktoken implementation
os-rss committed Dec 7, 2023
1 parent 0f725da commit 6d62006
Showing 2 changed files with 18 additions and 13 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -25,7 +25,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"

[compat]
Aqua = "0.6"
BytePairEncoding = "0.3"
BytePairEncoding = "0.5"
DebugDataWriter = "0.1"
DocOpt = "0.5"
ElasticsearchClient = "0.2"
@@ -36,7 +36,7 @@ Mocking = "0.7"
Mustache = "1"
OpenAI = "0.8"
OpenAPI = "0.1"
TextEncodeBase = "0.6"
TextEncodeBase = "0.8"
TimeZones = "1"
URIs = "1"

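The compat bumps line up with the switch in src/services/chunks.jl below: load_tiktoken and tiktoken2bbpe ship with the newer BytePairEncoding releases. A quick sanity check after resolving (a sketch, not part of the commit):

using Pkg

# Confirm the resolver picked up the bumped compat bounds.
Pkg.status("BytePairEncoding")  # expect a 0.5.x release
Pkg.status("TextEncodeBase")    # expect a 0.8.x release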
27 changes: 16 additions & 11 deletions src/services/chunks.jl
@@ -3,28 +3,33 @@ using .GptPluginServer: Document, DocumentChunk, DocumentChunkMetadata
using UUIDs

using BytePairEncoding: gpt2_codemap, GPT2Tokenization, Merge, BPE, BPETokenization
+using BytePairEncoding: tiktoken2bbpe, load_tiktoken
using TextEncodeBase: TextEncodeBase, FlatTokenizer, CodeNormalizer, Sentence, getvalue, CodeUnMap
using HuggingFaceApi

# Global variables
-tokenizer = let
-    url = HuggingFaceURL("gpt2", "merges.txt")
-    file = HuggingFaceApi.cached_download(url)
-    bpe = BPE(file)
-    FlatTokenizer(CodeNormalizer(BPETokenization(GPT2Tokenization(), bpe), gpt2_codemap()))
-
-    # tiktoken.get_encoding(
-    #     "cl100k_base"
-    # ) # The encoding scheme to use for tokenization
-end
+# const tokenizer = let
+#     url = HuggingFaceURL("gpt2", "merges.txt")
+#     file = HuggingFaceApi.cached_download(url)
+#     bpe = BPE(file)
+#     FlatTokenizer(CodeNormalizer(BPETokenization(GPT2Tokenization(), bpe), gpt2_codemap()))
+# end
+# const unmap = CodeUnMap(tokenizer.tokenization.codemap)
+
+const codemap = gpt2_codemap()
+const tokenizer = tiktoken2bbpe(load_tiktoken("cl100k_base"), codemap)
+const unmap = CodeUnMap(codemap)

encode(text::AbstractString) = tokenizer(Sentence(text))

function decode(tokens::Vector{TextEncodeBase.TokenStage})::String
-    unmap = CodeUnMap(tokenizer.tokenization.codemap)
    map(unmap ∘ getvalue, tokens) |> join
end

+function decode(tokens::Vector{<:AbstractString})::String
+    map(unmap, tokens) |> join
+end

# Constants
const CHUNK_SIZE = 200 # The target size of each text chunk in tokens
const MIN_CHUNK_SIZE_CHARS = 350 # The minimum size of each text chunk in characters
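For orientation, a minimal round-trip sketch of the new API, assuming this file's encode, decode, and unmap are in scope; the sample string and asserts are illustrative, not part of the commit:

using TextEncodeBase: getvalue

tokens = encode("Hello, chunker!")  # token stages from the cl100k_base BBPE tokenizer

# Token-stage method: extract each token's value, undo the GPT-2 byte-level
# code map, and join the pieces back into the original string.
@assert decode(tokens) == "Hello, chunker!"

# String method: same result when handed the raw token values directly.
@assert decode(map(getvalue, tokens)) == "Hello, chunker!"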
