diff --git a/Cargo.lock b/Cargo.lock
index f5602dfc5..1e9082fe6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -754,6 +754,7 @@ dependencies = [
  "tracing",
  "tracing-subscriber",
  "unicode-normalization",
+ "unicode-segmentation",
  "uuid",
  "validator",
  "workspace-template",
diff --git a/services/appflowy-collaborate/Cargo.toml b/services/appflowy-collaborate/Cargo.toml
index 61488ed70..96232c813 100644
--- a/services/appflowy-collaborate/Cargo.toml
+++ b/services/appflowy-collaborate/Cargo.toml
@@ -88,6 +88,8 @@ itertools = "0.12.0"
 validator = "0.16.1"
 rayon.workspace = true
 tiktoken-rs = "0.6.0"
+unicode-segmentation = "1.9.0"
+
 
 [dev-dependencies]
 rand = "0.8.5"
diff --git a/services/appflowy-collaborate/src/indexer/document_indexer.rs b/services/appflowy-collaborate/src/indexer/document_indexer.rs
index 2b7e2367f..d722598f6 100644
--- a/services/appflowy-collaborate/src/indexer/document_indexer.rs
+++ b/services/appflowy-collaborate/src/indexer/document_indexer.rs
@@ -15,6 +15,8 @@ use collab_document::error::DocumentError;
 use collab_entity::CollabType;
 use database_entity::dto::{AFCollabEmbeddingParams, AFCollabEmbeddings, EmbeddingContentType};
 
+use crate::config::get_env_var;
+use crate::indexer::open_ai::{split_text_by_max_content_len, split_text_by_max_tokens};
 use tiktoken_rs::CoreBPE;
 use tracing::trace;
 use uuid::Uuid;
@@ -54,12 +56,11 @@ impl Indexer for DocumentIndexer {
     match result {
       Ok(document_data) => {
         let content = document_data.to_plain_text();
-        let max_tokens = self.embedding_model.default_dimensions() as usize;
         create_embedding(
           object_id,
           content,
           CollabType::Document,
-          max_tokens,
+          &self.embedding_model,
           self.tokenizer.clone(),
         )
         .await
@@ -129,47 +130,35 @@ impl Indexer for DocumentIndexer {
   }
 }
 
-/// ## Execution Time Comparison Results
-///
-/// The following results were observed when running `execution_time_comparison_tests`:
-///
-/// | Content Size (chars) | Direct Time (ms) | spawn_blocking Time (ms) |
-/// |-----------------------|------------------|--------------------------|
-/// | 500                   | 1                | 1                        |
-/// | 1000                  | 2                | 2                        |
-/// | 2000                  | 5                | 5                        |
-/// | 5000                  | 11               | 11                       |
-/// | 20000                 | 49               | 48                       |
-///
-/// ## Guidelines for Using `spawn_blocking`
-///
-/// - **Short Tasks (< 1 ms)**:
-///   Use direct execution on the async runtime. The minimal execution time has negligible impact.
-///
-/// - **Moderate Tasks (1–10 ms)**:
-///   - For infrequent or low-concurrency tasks, direct execution is acceptable.
-///   - For frequent or high-concurrency tasks, consider using `spawn_blocking` to avoid delays.
-///
-/// - **Long Tasks (> 10 ms)**:
-///   Always offload to a blocking thread with `spawn_blocking` to maintain runtime efficiency and responsiveness.
-///
-/// Related blog:
-/// https://tokio.rs/blog/2020-04-preemption
-/// https://ryhl.io/blog/async-what-is-blocking/
 async fn create_embedding(
   object_id: String,
   content: String,
   collab_type: CollabType,
-  max_tokens: usize,
+  embedding_model: &EmbeddingModel,
   tokenizer: Arc<CoreBPE>,
 ) -> Result<Vec<AFCollabEmbeddingParams>, AppError> {
-  let split_contents = if content.len() < 500 {
-    split_text_by_max_tokens(content, max_tokens, tokenizer.as_ref())?
+  let use_tiktoken = get_env_var("APPFLOWY_AI_CONTENT_SPLITTER_TIKTOKEN", "false")
+    .parse::<bool>()
+    .unwrap_or(false);
+
+  let split_contents = if use_tiktoken {
+    let max_tokens = embedding_model.default_dimensions() as usize;
+    if content.len() < 500 {
+      split_text_by_max_tokens(content, max_tokens, tokenizer.as_ref())?
+    } else {
+      tokio::task::spawn_blocking(move || {
+        split_text_by_max_tokens(content, max_tokens, tokenizer.as_ref())
+      })
+      .await??
+    }
   } else {
-    tokio::task::spawn_blocking(move || {
-      split_text_by_max_tokens(content, max_tokens, tokenizer.as_ref())
-    })
-    .await??
+    debug_assert!(matches!(
+      embedding_model,
+      EmbeddingModel::TextEmbedding3Small
+    ));
+    // We assume that every token is ~4 bytes. We're going to split document content into fragments
+    // of ~2000 tokens each.
+    split_text_by_max_content_len(content, 8000)?
   };
 
   Ok(
@@ -186,264 +175,3 @@ async fn create_embedding(
     .collect(),
   )
 }
-
-fn split_text_by_max_tokens(
-  content: String,
-  max_tokens: usize,
-  tokenizer: &CoreBPE,
-) -> Result<Vec<String>, AppError> {
-  if content.is_empty() {
-    return Ok(vec![]);
-  }
-
-  let token_ids = tokenizer.encode_ordinary(&content);
-  let total_tokens = token_ids.len();
-  if total_tokens <= max_tokens {
-    return Ok(vec![content]);
-  }
-
-  let mut chunks = Vec::new();
-  let mut start_idx = 0;
-  while start_idx < total_tokens {
-    let mut end_idx = (start_idx + max_tokens).min(total_tokens);
-    let mut decoded = false;
-    // Try to decode the chunk, adjust end_idx if decoding fails
-    while !decoded {
-      let token_chunk = &token_ids[start_idx..end_idx];
-      // Attempt to decode the current chunk
-      match tokenizer.decode(token_chunk.to_vec()) {
-        Ok(chunk_text) => {
-          chunks.push(chunk_text);
-          start_idx = end_idx;
-          decoded = true;
-        },
-        Err(_) => {
-          // If we can extend the chunk, do so
-          if end_idx < total_tokens {
-            end_idx += 1;
-          } else if start_idx + 1 < total_tokens {
-            // Skip the problematic token at start_idx
-            start_idx += 1;
-            end_idx = (start_idx + max_tokens).min(total_tokens);
-          } else {
-            // Cannot decode any further, break to avoid infinite loop
-            start_idx = total_tokens;
-            break;
-          }
-        },
-      }
-    }
-  }
-
-  Ok(chunks)
-}
-
-#[cfg(test)]
-mod tests {
-  use crate::indexer::document_indexer::split_text_by_max_tokens;
-
-  use tiktoken_rs::cl100k_base;
-
-  #[test]
-  fn test_split_at_non_utf8() {
-    let max_tokens = 10; // Small number for testing
-
-    // Content with multibyte characters (emojis)
-    let content = "Hello πŸ˜ƒ World 🌍! This is a test πŸš€.".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    // Ensure that we didn't split in the middle of a multibyte character
-    for content in params {
-      assert!(content.is_char_boundary(0));
-      assert!(content.is_char_boundary(content.len()));
-    }
-  }
-  #[test]
-  fn test_exact_boundary_split() {
-    let max_tokens = 5; // Set to 5 tokens for testing
-    let content = "The quick brown fox jumps over the lazy dog".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let total_tokens = tokenizer.encode_ordinary(&content).len();
-    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
-    assert_eq!(params.len(), expected_fragments);
-  }
-
-  #[test]
-  fn test_content_shorter_than_max_len() {
-    let max_tokens = 100;
-    let content = "Short content".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    assert_eq!(params.len(), 1);
-    assert_eq!(params[0], content);
-  }
-
-  #[test]
-  fn test_empty_content() {
-    let max_tokens = 10;
-    let content = "".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    assert_eq!(params.len(), 0);
-  }
-
-  #[test]
-  fn test_content_with_only_multibyte_characters() {
-    let max_tokens = 1; // Set to 1 token for testing
-    let content = "πŸ˜€πŸ˜ƒπŸ˜„πŸ˜πŸ˜†".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let emojis: Vec<String> = content.chars().map(|c| c.to_string()).collect();
-    for (param, emoji) in params.iter().zip(emojis.iter()) {
-      assert_eq!(param, emoji);
-    }
-  }
-
-  #[test]
-  fn test_split_with_combining_characters() {
-    let max_tokens = 1; // Set to 1 token for testing
-    let content = "a\u{0301}e\u{0301}i\u{0301}o\u{0301}u\u{0301}".to_string(); // "áéíóú"
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let total_tokens = tokenizer.encode_ordinary(&content).len();
-    assert_eq!(params.len(), total_tokens);
-
-    let reconstructed_content = params.join("");
-    assert_eq!(reconstructed_content, content);
-  }
-
-  #[test]
-  fn test_large_content() {
-    let max_tokens = 1000;
-    let content = "a".repeat(5000); // 5000 characters
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let total_tokens = tokenizer.encode_ordinary(&content).len();
-    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
-    assert_eq!(params.len(), expected_fragments);
-  }
-
-  #[test]
-  fn test_non_ascii_characters() {
-    let max_tokens = 2;
-    let content = "Ñéíóú".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let total_tokens = tokenizer.encode_ordinary(&content).len();
-    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
-    assert_eq!(params.len(), expected_fragments);
-
-    let reconstructed_content: String = params.concat();
-    assert_eq!(reconstructed_content, content);
-  }
-
-  #[test]
-  fn test_content_with_leading_and_trailing_whitespace() {
-    let max_tokens = 3;
-    let content = "  abcde  ".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let total_tokens = tokenizer.encode_ordinary(&content).len();
-    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
-    assert_eq!(params.len(), expected_fragments);
-
-    let reconstructed_content: String = params.concat();
-    assert_eq!(reconstructed_content, content);
-  }
-
-  #[test]
-  fn test_content_with_multiple_zero_width_joiners() {
-    let max_tokens = 1;
-    let content = "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§πŸ‘¨β€πŸ‘¨β€πŸ‘¦β€πŸ‘¦".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let reconstructed_content: String = params.concat();
-    assert_eq!(reconstructed_content, content);
-  }
-
-  #[test]
-  fn test_content_with_long_combining_sequences() {
-    let max_tokens = 1;
-    let content = "a\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}".to_string();
-    let tokenizer = cl100k_base().unwrap();
-    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
-
-    let reconstructed_content: String = params.concat();
-    assert_eq!(reconstructed_content, content);
-  }
-}
-
-// #[cfg(test)]
-// mod execution_time_comparison_tests {
-//   use crate::indexer::document_indexer::split_text_by_max_tokens;
-//   use rand::distributions::Alphanumeric;
-//   use rand::{thread_rng, Rng};
-//   use std::sync::Arc;
-//   use std::time::Instant;
-//   use tiktoken_rs::{cl100k_base, CoreBPE};
-//
-//   #[tokio::test]
-//   async fn test_execution_time_comparison() {
-//     let tokenizer = Arc::new(cl100k_base().unwrap());
-//     let max_tokens = 100;
-//
-//     let sizes = vec![500, 1000, 2000, 5000, 20000]; // Content sizes to test
-//     for size in sizes {
-//       let content = generate_random_string(size);
-//
-//       // Measure direct execution time
-//       let direct_time = measure_direct_execution(content.clone(), max_tokens, &tokenizer);
-//
-//       // Measure spawn_blocking execution time
-//       let spawn_blocking_time =
-//         measure_spawn_blocking_execution(content, max_tokens, Arc::clone(&tokenizer)).await;
-//
-//       println!(
-//         "Content Size: {} | Direct Time: {}ms | spawn_blocking Time: {}ms",
-//         size, direct_time, spawn_blocking_time
-//       );
-//     }
-//   }
-//
-//   // Measure direct execution time
-//   fn measure_direct_execution(content: String, max_tokens: usize, tokenizer: &CoreBPE) -> u128 {
-//     let start = Instant::now();
-//     split_text_by_max_tokens(content, max_tokens, tokenizer).unwrap();
-//     start.elapsed().as_millis()
-//   }
-//
-//   // Measure `spawn_blocking` execution time
-//   async fn measure_spawn_blocking_execution(
-//     content: String,
-//     max_tokens: usize,
-//     tokenizer: Arc<CoreBPE>,
-//   ) -> u128 {
-//     let start = Instant::now();
-//     tokio::task::spawn_blocking(move || {
-//       split_text_by_max_tokens(content, max_tokens, tokenizer.as_ref()).unwrap()
-//     })
-//     .await
-//     .unwrap();
-//     start.elapsed().as_millis()
-//   }
-//
-//   pub fn generate_random_string(len: usize) -> String {
-//     let rng = thread_rng();
-//     rng
-//       .sample_iter(&Alphanumeric)
-//       .take(len)
-//       .map(char::from)
-//       .collect()
-//   }
-// }
diff --git a/services/appflowy-collaborate/src/indexer/mod.rs b/services/appflowy-collaborate/src/indexer/mod.rs
index c9fe41a08..e354f9e01 100644
--- a/services/appflowy-collaborate/src/indexer/mod.rs
+++ b/services/appflowy-collaborate/src/indexer/mod.rs
@@ -1,6 +1,8 @@
 mod document_indexer;
 mod ext;
+mod open_ai;
 mod provider;
+
 pub use document_indexer::DocumentIndexer;
 pub use ext::DocumentDataExt;
 pub use provider::*;
diff --git a/services/appflowy-collaborate/src/indexer/open_ai.rs b/services/appflowy-collaborate/src/indexer/open_ai.rs
new file mode 100644
index 000000000..db5fb26a3
--- /dev/null
+++ b/services/appflowy-collaborate/src/indexer/open_ai.rs
@@ -0,0 +1,361 @@
+use app_error::AppError;
+use tiktoken_rs::CoreBPE;
+use unicode_segmentation::UnicodeSegmentation;
+
+/// ## Execution Time Comparison Results
+///
+/// The following results were observed when running `execution_time_comparison_tests`:
+///
+/// | Content Size (chars) | Direct Time (ms) | spawn_blocking Time (ms) |
+/// |-----------------------|------------------|--------------------------|
+/// | 500                   | 1                | 1                        |
+/// | 1000                  | 2                | 2                        |
+/// | 2000                  | 5                | 5                        |
+/// | 5000                  | 11               | 11                       |
+/// | 20000                 | 49               | 48                       |
+///
+/// ## Guidelines for Using `spawn_blocking`
+///
+/// - **Short Tasks (< 1 ms)**:
+///   Use direct execution on the async runtime. The minimal execution time has negligible impact.
+///
+/// - **Moderate Tasks (1–10 ms)**:
+///   - For infrequent or low-concurrency tasks, direct execution is acceptable.
+///   - For frequent or high-concurrency tasks, consider using `spawn_blocking` to avoid delays.
+///
+/// - **Long Tasks (> 10 ms)**:
+///   Always offload to a blocking thread with `spawn_blocking` to maintain runtime efficiency and responsiveness.
+///
+/// Related blog:
+/// https://tokio.rs/blog/2020-04-preemption
+/// https://ryhl.io/blog/async-what-is-blocking/
+#[inline]
+pub fn split_text_by_max_tokens(
+  content: String,
+  max_tokens: usize,
+  tokenizer: &CoreBPE,
+) -> Result<Vec<String>, AppError> {
+  if content.is_empty() {
+    return Ok(vec![]);
+  }
+
+  let token_ids = tokenizer.encode_ordinary(&content);
+  let total_tokens = token_ids.len();
+  if total_tokens <= max_tokens {
+    return Ok(vec![content]);
+  }
+
+  let mut chunks = Vec::new();
+  let mut start_idx = 0;
+  while start_idx < total_tokens {
+    let mut end_idx = (start_idx + max_tokens).min(total_tokens);
+    let mut decoded = false;
+    // Try to decode the chunk, adjust end_idx if decoding fails
+    while !decoded {
+      let token_chunk = &token_ids[start_idx..end_idx];
+      // Attempt to decode the current chunk
+      match tokenizer.decode(token_chunk.to_vec()) {
+        Ok(chunk_text) => {
+          chunks.push(chunk_text);
+          start_idx = end_idx;
+          decoded = true;
+        },
+        Err(_) => {
+          // If we can extend the chunk, do so
+          if end_idx < total_tokens {
+            end_idx += 1;
+          } else if start_idx + 1 < total_tokens {
+            // Skip the problematic token at start_idx
+            start_idx += 1;
+            end_idx = (start_idx + max_tokens).min(total_tokens);
+          } else {
+            // Cannot decode any further, break to avoid infinite loop
+            start_idx = total_tokens;
+            break;
+          }
+        },
+      }
+    }
+  }
+
+  Ok(chunks)
+}
+
+#[inline]
+pub fn split_text_by_max_content_len(
+  content: String,
+  max_content_len: usize,
+) -> Result<Vec<String>, AppError> {
+  if content.is_empty() {
+    return Ok(vec![]);
+  }
+
+  if content.len() <= max_content_len {
+    return Ok(vec![content]);
+  }
+
+  // Content is longer than max_content_len; need to split
+  let mut result = Vec::with_capacity(1 + content.len() / max_content_len);
+  let mut fragment = String::with_capacity(max_content_len);
+  let mut current_len = 0;
+
+  for grapheme in content.graphemes(true) {
+    let grapheme_len = grapheme.len();
+    if current_len + grapheme_len > max_content_len {
+      if !fragment.is_empty() {
+        result.push(std::mem::take(&mut fragment));
+      }
+      current_len = 0;
+
+      if grapheme_len > max_content_len {
+        // Push the grapheme as a fragment on its own
+        result.push(grapheme.to_string());
+        continue;
+      }
+    }
+    fragment.push_str(grapheme);
+    current_len += grapheme_len;
+  }
+
+  // Add the last fragment if it's not empty
+  if !fragment.is_empty() {
+    result.push(fragment);
+  }
+  Ok(result)
+}
+
+#[cfg(test)]
+mod tests {
+
+  use crate::indexer::open_ai::{split_text_by_max_content_len, split_text_by_max_tokens};
+  use tiktoken_rs::cl100k_base;
+
+  #[test]
+  fn test_split_at_non_utf8() {
+    let max_tokens = 10; // Small number for testing
+
+    // Content with multibyte characters (emojis)
+    let content = "Hello πŸ˜ƒ World 🌍! This is a test πŸš€.".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+    for content in params {
+      assert!(content.is_char_boundary(0));
+      assert!(content.is_char_boundary(content.len()));
+    }
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    for content in params {
+      assert!(content.is_char_boundary(0));
+      assert!(content.is_char_boundary(content.len()));
+    }
+  }
+  #[test]
+  fn test_exact_boundary_split() {
+    let max_tokens = 5; // Set to 5 tokens for testing
+    let content = "The quick brown fox jumps over the lazy dog".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+
+    let total_tokens = tokenizer.encode_ordinary(&content).len();
+    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
+    assert_eq!(params.len(), expected_fragments);
+  }
+
+  #[test]
+  fn test_content_shorter_than_max_len() {
+    let max_tokens = 100;
+    let content = "Short content".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+
+    assert_eq!(params.len(), 1);
+    assert_eq!(params[0], content);
+  }
+
+  #[test]
+  fn test_empty_content() {
+    let max_tokens = 10;
+    let content = "".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+    assert_eq!(params.len(), 0);
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    assert_eq!(params.len(), 0);
+  }
+
+  #[test]
+  fn test_content_with_only_multibyte_characters() {
+    let max_tokens = 1; // Set to 1 token for testing
+    let content = "πŸ˜€πŸ˜ƒπŸ˜„πŸ˜πŸ˜†".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+
+    let emojis: Vec<String> = content.chars().map(|c| c.to_string()).collect();
+    for (param, emoji) in params.iter().zip(emojis.iter()) {
+      assert_eq!(param, emoji);
+    }
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    for (param, emoji) in params.iter().zip(emojis.iter()) {
+      assert_eq!(param, emoji);
+    }
+  }
+
+  #[test]
+  fn test_split_with_combining_characters() {
+    let max_tokens = 1; // Set to 1 token for testing
+    let content = "a\u{0301}e\u{0301}i\u{0301}o\u{0301}u\u{0301}".to_string(); // "áéíóú"
+
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+    let total_tokens = tokenizer.encode_ordinary(&content).len();
+    assert_eq!(params.len(), total_tokens);
+    let reconstructed_content = params.join("");
+    assert_eq!(reconstructed_content, content);
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+  }
+
+  #[test]
+  fn test_large_content() {
+    let max_tokens = 1000;
+    let content = "a".repeat(5000); // 5000 characters
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+
+    let total_tokens = tokenizer.encode_ordinary(&content).len();
+    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
+    assert_eq!(params.len(), expected_fragments);
+  }
+
+  #[test]
+  fn test_non_ascii_characters() {
+    let max_tokens = 2;
+    let content = "Ñéíóú".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+
+    let total_tokens = tokenizer.encode_ordinary(&content).len();
+    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
+    assert_eq!(params.len(), expected_fragments);
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+  }
+
+  #[test]
+  fn test_content_with_leading_and_trailing_whitespace() {
+    let max_tokens = 3;
+    let content = "  abcde  ".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+
+    let total_tokens = tokenizer.encode_ordinary(&content).len();
+    let expected_fragments = (total_tokens + max_tokens - 1) / max_tokens;
+    assert_eq!(params.len(), expected_fragments);
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+  }
+
+  #[test]
+  fn test_content_with_multiple_zero_width_joiners() {
+    let max_tokens = 1;
+    let content = "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§πŸ‘¨β€πŸ‘¨β€πŸ‘¦β€πŸ‘¦".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+  }
+
+  #[test]
+  fn test_content_with_long_combining_sequences() {
+    let max_tokens = 1;
+    let content = "a\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}".to_string();
+    let tokenizer = cl100k_base().unwrap();
+    let params = split_text_by_max_tokens(content.clone(), max_tokens, &tokenizer).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+
+    let params = split_text_by_max_content_len(content.clone(), max_tokens).unwrap();
+    let reconstructed_content: String = params.concat();
+    assert_eq!(reconstructed_content, content);
+  }
+}
+
+// #[cfg(test)]
+// mod execution_time_comparison_tests {
+//   use crate::indexer::open_ai::split_text_by_max_tokens;
+//   use rand::distributions::Alphanumeric;
+//   use rand::{thread_rng, Rng};
+//   use std::sync::Arc;
+//   use std::time::Instant;
+//   use tiktoken_rs::{cl100k_base, CoreBPE};
+//
+//   #[tokio::test]
+//   async fn test_execution_time_comparison() {
+//     let tokenizer = Arc::new(cl100k_base().unwrap());
+//     let max_tokens = 100;
+//
+//     let sizes = vec![500, 1000, 2000, 5000, 20000]; // Content sizes to test
+//     for size in sizes {
+//       let content = generate_random_string(size);
+//
+//       // Measure direct execution time
+//       let direct_time = measure_direct_execution(content.clone(), max_tokens, &tokenizer);
+//
+//       // Measure spawn_blocking execution time
+//       let spawn_blocking_time =
+//         measure_spawn_blocking_execution(content, max_tokens, Arc::clone(&tokenizer)).await;
+//
+//       println!(
+//         "Content Size: {} | Direct Time: {}ms | spawn_blocking Time: {}ms",
+//         size, direct_time, spawn_blocking_time
+//       );
+//     }
+//   }
+//
+//   // Measure direct execution time
+//   fn measure_direct_execution(content: String, max_tokens: usize, tokenizer: &CoreBPE) -> u128 {
+//     let start = Instant::now();
+//     split_text_by_max_tokens(content, max_tokens, tokenizer).unwrap();
+//     start.elapsed().as_millis()
+//   }
+//
+//   // Measure `spawn_blocking` execution time
+//   async fn measure_spawn_blocking_execution(
+//     content: String,
+//     max_tokens: usize,
+//     tokenizer: Arc<CoreBPE>,
+//   ) -> u128 {
+//     let start = Instant::now();
+//     tokio::task::spawn_blocking(move || {
+//       split_text_by_max_tokens(content, max_tokens, tokenizer.as_ref()).unwrap()
+//     })
+//     .await
+//     .unwrap();
+//     start.elapsed().as_millis()
+//   }
+//
+//   pub fn generate_random_string(len: usize) -> String {
+//     let rng = thread_rng();
+//     rng
+//       .sample_iter(&Alphanumeric)
+//       .take(len)
+//       .map(char::from)
+//       .collect()
+//   }
+// }
diff --git a/services/appflowy-worker/src/import_worker/worker.rs b/services/appflowy-worker/src/import_worker/worker.rs
index 6d07ebe2e..6be6767d1 100644
--- a/services/appflowy-worker/src/import_worker/worker.rs
+++ b/services/appflowy-worker/src/import_worker/worker.rs
@@ -245,7 +245,6 @@ async fn process_upcoming_tasks(
       }
     }
   }
-  info!("[Import] stop reading tasks from stream");
 }
 #[derive(Clone)]
 struct TaskContext {
diff --git a/tests/search/document_search.rs b/tests/search/document_search.rs
index 84267d9f4..e696c4369 100644
--- a/tests/search/document_search.rs
+++ b/tests/search/document_search.rs
@@ -114,7 +114,7 @@ The Five Dysfunctions of a Team by Patrick Lencioni The Five Dysfunctions of a T
 
   let params = CalculateSimilarityParams {
     workspace_id,
-    input: answer,
+    input: answer.clone(),
     expected: r#"
 Kathryn Petersen is the newly appointed CEO of DecisionTech, a struggling Silicon Valley startup.
 She steps into a role facing a dysfunctional executive team characterized by poor communication,
@@ -131,7 +131,7 @@ The Five Dysfunctions of a Team by Patrick Lencioni The Five Dysfunctions of a T
     .unwrap()
     .score;
 
-  assert!(score > 0.9, "score: {}", score);
+  assert!(score > 0.9, "score: {}, input: {}", score, answer);
 }
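Note on the splitting behavior (a standalone sketch, not part of the patch): the snippet below illustrates the two strategies gated by APPFLOWY_AI_CONTENT_SPLITTER_TIKTOKEN, assuming the tiktoken-rs and unicode-segmentation crates pinned in services/appflowy-collaborate/Cargo.toml. The helper name split_by_content_len and the 16-byte budget are illustrative only; create_embedding calls split_text_by_max_content_len with 8000 bytes, i.e. roughly 2000 tokens under the ~4 bytes/token assumption stated in the code comment.

use tiktoken_rs::cl100k_base;
use unicode_segmentation::UnicodeSegmentation;

// Simplified mirror of split_text_by_max_content_len: accumulate grapheme
// clusters until the byte budget is hit, so a fragment never ends inside a
// cluster and concatenating the fragments reproduces the input exactly.
fn split_by_content_len(content: &str, max_content_len: usize) -> Vec<String> {
  let mut result = Vec::new();
  let mut fragment = String::new();
  for grapheme in content.graphemes(true) {
    if fragment.len() + grapheme.len() > max_content_len && !fragment.is_empty() {
      result.push(std::mem::take(&mut fragment));
    }
    fragment.push_str(grapheme);
  }
  if !fragment.is_empty() {
    result.push(fragment);
  }
  result
}

fn main() {
  let content = "Hello πŸ˜ƒ World 🌍! This is a test πŸš€.";

  // Byte-budget path (env var unset or "false"); 16 bytes forces splits here,
  // where create_embedding uses 8000.
  let fragments = split_by_content_len(content, 16);
  assert_eq!(fragments.concat(), content); // lossless round-trip
  for fragment in &fragments {
    assert!(fragment.is_char_boundary(fragment.len())); // no mid-char cuts
  }

  // Token-budget path (env var "true"): the same tokenizer the indexer holds
  // drives split_text_by_max_tokens; here we only count tokens to show the
  // ~4 bytes/token heuristic against the actual byte length.
  let bpe = cl100k_base().unwrap();
  let total_tokens = bpe.encode_ordinary(content).len();
  println!(
    "{} byte-fragments, {} tokens, {} bytes",
    fragments.len(),
    total_tokens,
    content.len()
  );
}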