diff --git a/Cargo.lock b/Cargo.lock
index b9bd93825..2009d9658 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -752,6 +752,7 @@ dependencies = [
  "tokio-util",
  "tracing",
  "tracing-subscriber",
+ "unicode-segmentation",
  "uuid",
  "validator",
  "workspace-template",
diff --git a/services/appflowy-collaborate/Cargo.toml b/services/appflowy-collaborate/Cargo.toml
index 2e3151a8f..070547f36 100644
--- a/services/appflowy-collaborate/Cargo.toml
+++ b/services/appflowy-collaborate/Cargo.toml
@@ -87,6 +87,7 @@ lazy_static = "1.4.0"
 itertools = "0.12.0"
 validator = "0.16.1"
 rayon.workspace = true
+unicode-segmentation = "1.9.0"
 
 [dev-dependencies]
 rand = "0.8.5"
diff --git a/services/appflowy-collaborate/src/indexer/document_indexer.rs b/services/appflowy-collaborate/src/indexer/document_indexer.rs
index 7cd42abb3..6dd25c3a1 100644
--- a/services/appflowy-collaborate/src/indexer/document_indexer.rs
+++ b/services/appflowy-collaborate/src/indexer/document_indexer.rs
@@ -13,6 +13,7 @@ use collab_document::document::DocumentBody;
 use collab_document::error::DocumentError;
 use collab_entity::CollabType;
 use database_entity::dto::{AFCollabEmbeddingParams, AFCollabEmbeddings, EmbeddingContentType};
+use unicode_segmentation::UnicodeSegmentation;
 use uuid::Uuid;
 
 use crate::indexer::{DocumentDataExt, Indexer};
@@ -45,42 +46,12 @@ impl Indexer for DocumentIndexer {
     match result {
       Ok(document_data) => {
         let content = document_data.to_plain_text();
-        let mut result = Vec::with_capacity(1 + content.len() / Self::DOC_CONTENT_SPLIT);
-
-        let mut slice = content.as_str();
-        while slice.len() > Self::DOC_CONTENT_SPLIT {
-          // we should split document into multiple fragments
-          let (left, right) = slice.split_at(Self::DOC_CONTENT_SPLIT);
-          let param = AFCollabEmbeddingParams {
-            fragment_id: Uuid::new_v4().to_string(),
-            object_id: object_id.clone(),
-            collab_type: CollabType::Document,
-            content_type: EmbeddingContentType::PlainText,
-            content: left.to_string(),
-            embedding: None,
-          };
-          result.push(param);
-          slice = right;
-        }
-
-        let content = if slice.len() == content.len() {
-          content // we didn't slice the content
-        } else {
-          slice.to_string()
-        };
-        if !content.is_empty() {
-          let param = AFCollabEmbeddingParams {
-            fragment_id: object_id.clone(),
-            object_id: object_id.clone(),
-            collab_type: CollabType::Document,
-            content_type: EmbeddingContentType::PlainText,
-            content,
-            embedding: None,
-          };
-          result.push(param);
-        }
-
-        Ok(result)
+        create_embedding_params(
+          object_id,
+          content,
+          CollabType::Document,
+          Self::DOC_CONTENT_SPLIT,
+        )
       },
       Err(err) => {
         if matches!(err, DocumentError::NoRequiredData) {
@@ -141,3 +112,335 @@ impl Indexer for DocumentIndexer {
     }))
   }
 }
+#[inline]
+fn create_embedding_params(
+  object_id: String,
+  content: String,
+  collab_type: CollabType,
+  max_content_len: usize,
+) -> Result<Vec<AFCollabEmbeddingParams>, AppError> {
+  if content.is_empty() {
+    return Ok(vec![]);
+  }
+
+  // Helper function to create AFCollabEmbeddingParams
+  fn create_param(
+    fragment_id: String,
+    object_id: &str,
+    collab_type: &CollabType,
+    content: String,
+  ) -> AFCollabEmbeddingParams {
+    AFCollabEmbeddingParams {
+      fragment_id,
+      object_id: object_id.to_string(),
+      collab_type: collab_type.clone(),
+      content_type: EmbeddingContentType::PlainText,
+      content,
+      embedding: None,
+    }
+  }
+
+  if content.len() <= max_content_len {
+    // Content is short enough; return as a single fragment
+    let param = create_param(object_id.clone(), &object_id, &collab_type, content);
+    return Ok(vec![param]);
+  }
+
+  // Content is longer than max_content_len; need to split
+  let mut result = Vec::with_capacity(1 + content.len() / max_content_len);
+  let mut fragment = String::with_capacity(max_content_len);
+  let mut current_len = 0;
+
+  for grapheme in content.graphemes(true) {
+    let grapheme_len = grapheme.len();
+    if current_len + grapheme_len > max_content_len {
+      if !fragment.is_empty() {
+        // Move the fragment to avoid cloning
+        result.push(create_param(
+          Uuid::new_v4().to_string(),
+          &object_id,
+          &collab_type,
+          std::mem::take(&mut fragment),
+        ));
+      }
+      current_len = 0;
+
+      // Check if the grapheme itself is longer than max_content_len
+      if grapheme_len > max_content_len {
+        // Push the grapheme as a fragment on its own
+        result.push(create_param(
+          Uuid::new_v4().to_string(),
+          &object_id,
+          &collab_type,
+          grapheme.to_string(),
+        ));
+        continue;
+      }
+    }
+    fragment.push_str(grapheme);
+    current_len += grapheme_len;
+  }
+
+  // Add the last fragment if it's not empty
+  if !fragment.is_empty() {
+    result.push(create_param(
+      object_id.clone(),
+      &object_id,
+      &collab_type,
+      fragment,
+    ));
+  }
+
+  Ok(result)
+}
+#[cfg(test)]
+mod tests {
+  use crate::indexer::document_indexer::create_embedding_params;
+  use collab_entity::CollabType;
+
+  #[test]
+  fn test_split_at_non_utf8() {
+    let object_id = "test_object".to_string();
+    let collab_type = CollabType::Document;
+    let max_content_len = 10; // Small number for testing
+
+    // Content with multibyte characters (emojis)
+    let content = "Hello πŸ˜ƒ World 🌍! This is a test πŸš€.".to_string();
+
+    let params = create_embedding_params(
+      object_id.clone(),
+      content.clone(),
+      collab_type.clone(),
+      max_content_len,
+    )
+    .unwrap();
+
+    // Ensure that we didn't split in the middle of a multibyte character
+    for param in params {
+      assert!(param.content.is_char_boundary(0));
+      assert!(param.content.is_char_boundary(param.content.len()));
+    }
+  }
+
+  #[test]
+  fn test_exact_boundary_split() {
+    let object_id = "test_object".to_string();
+    let collab_type = CollabType::Document;
+    let max_content_len = 5; // Set to 5 for testing
+
+    // Content length is exactly a multiple of max_content_len
+    let content = "abcdefghij".to_string(); // 10 characters
+
+    let params = create_embedding_params(
+      object_id.clone(),
+      content.clone(),
+      collab_type.clone(),
+      max_content_len,
+    )
+    .unwrap();
+
+    assert_eq!(params.len(), 2);
+    assert_eq!(params[0].content, "abcde");
+    assert_eq!(params[1].content, "fghij");
+  }
+
+  #[test]
+  fn test_content_shorter_than_max_len() {
+    let object_id = "test_object".to_string();
+    let collab_type = CollabType::Document;
+    let max_content_len = 100;
+
+    let content = "Short content".to_string();
+
+    let params = create_embedding_params(
+      object_id.clone(),
+      content.clone(),
+      collab_type.clone(),
+      max_content_len,
+    )
+    .unwrap();
+
+    assert_eq!(params.len(), 1);
+    assert_eq!(params[0].content, content);
+  }
+
+  #[test]
+  fn test_empty_content() {
+    let object_id = "test_object".to_string();
+    let collab_type = CollabType::Document;
+    let max_content_len = 10;
+
+    let content = "".to_string();
+
+    let params = create_embedding_params(
+      object_id.clone(),
+      content.clone(),
+      collab_type.clone(),
+      max_content_len,
+    )
+    .unwrap();
+
+    assert_eq!(params.len(), 0);
+  }
+
+  #[test]
+  fn test_content_with_only_multibyte_characters() {
+    let object_id = "test_object".to_string();
+    let collab_type = CollabType::Document;
+    let max_content_len = 4; // Small number for testing
+
+    // Each emoji is 4 bytes in UTF-8
+    let content = "πŸ˜€πŸ˜ƒπŸ˜„πŸ˜πŸ˜†".to_string();
"πŸ˜€πŸ˜ƒπŸ˜„πŸ˜πŸ˜†".to_string(); + + let params = create_embedding_params( + object_id.clone(), + content.clone(), + collab_type.clone(), + max_content_len, + ) + .unwrap(); + + assert_eq!(params.len(), 5); + let expected_contents = vec!["πŸ˜€", "πŸ˜ƒ", "πŸ˜„", "😁", "πŸ˜†"]; + for (param, expected) in params.iter().zip(expected_contents.iter()) { + assert_eq!(param.content, *expected); + } + } + + #[test] + fn test_split_with_combining_characters() { + let object_id = "test_object".to_string(); + let collab_type = CollabType::Document; + let max_content_len = 5; // Small number for testing + + // String with combining characters (e.g., letters with accents) + let content = "a\u{0301}e\u{0301}i\u{0301}o\u{0301}u\u{0301}".to_string(); // "áéíóú" + + let params = create_embedding_params( + object_id.clone(), + content.clone(), + collab_type.clone(), + max_content_len, + ) + .unwrap(); + + assert_eq!(params.len(), 5); + let expected_contents = vec!["á", "é", "í", "ó", "ú"]; + for (param, expected) in params.iter().zip(expected_contents.iter()) { + assert_eq!(param.content, *expected); + } + } + + #[test] + fn test_large_content() { + let object_id = "test_object".to_string(); + let collab_type = CollabType::Document; + let max_content_len = 1000; + + // Generate a large content string + let content = "a".repeat(5000); // 5000 characters + + let params = create_embedding_params( + object_id.clone(), + content.clone(), + collab_type.clone(), + max_content_len, + ) + .unwrap(); + + assert_eq!(params.len(), 5); // 5000 / 1000 = 5 + for param in params { + assert_eq!(param.content.len(), 1000); + } + } + #[test] + fn test_non_ascii_characters() { + let object_id = "test_object".to_string(); + let collab_type = CollabType::Document; + let max_content_len = 5; + + // Non-ASCII characters: "Ñéíóú" + let content = "Ñéíóú".to_string(); + + let params = create_embedding_params( + object_id.clone(), + content.clone(), + collab_type.clone(), + max_content_len, + ) + .unwrap(); + + // Content should be split into two fragments + assert_eq!(params.len(), 3); + assert_eq!(params[0].content, "Ñé"); + assert_eq!(params[1].content, "Γ­Γ³"); + assert_eq!(params[2].content, "ΓΊ"); + } + + #[test] + fn test_content_with_leading_and_trailing_whitespace() { + let object_id = "test_object".to_string(); + let collab_type = CollabType::Document; + let max_content_len = 5; + + let content = " abcde ".to_string(); + + let params = create_embedding_params( + object_id.clone(), + content.clone(), + collab_type.clone(), + max_content_len, + ) + .unwrap(); + + // Content should include leading and trailing whitespace + assert_eq!(params.len(), 2); + assert_eq!(params[0].content, " abc"); + assert_eq!(params[1].content, "de "); + } + + #[test] + fn test_content_with_multiple_zero_width_joiners() { + let object_id = "test_object".to_string(); + let collab_type = CollabType::Document; + let max_content_len = 10; + + // Complex emoji sequence with multiple zero-width joiners + let content = "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§πŸ‘¨β€πŸ‘¨β€πŸ‘¦β€πŸ‘¦".to_string(); + + let params = create_embedding_params( + object_id.clone(), + content.clone(), + collab_type.clone(), + max_content_len, + ) + .unwrap(); + + // Each complex emoji should be treated as a single grapheme + assert_eq!(params.len(), 2); + assert_eq!(params[0].content, "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§"); + assert_eq!(params[1].content, "πŸ‘¨β€πŸ‘¨β€πŸ‘¦β€πŸ‘¦"); + } + + #[test] + fn test_content_with_long_combining_sequences() { + let object_id = "test_object".to_string(); + let 
+    let collab_type = CollabType::Document;
+    let max_content_len = 5;
+
+    // Character with multiple combining marks
+    let content = "a\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}".to_string(); // a with multiple accents
+
+    let params = create_embedding_params(
+      object_id.clone(),
+      content.clone(),
+      collab_type.clone(),
+      max_content_len,
+    )
+    .unwrap();
+
+    // The entire combining sequence should be in one fragment
+    assert_eq!(params.len(), 1);
+    assert_eq!(params[0].content, content);
+  }
+}
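
For reference, below is a minimal standalone sketch of the grapheme-based chunking that create_embedding_params applies above, with the AppFlowy-specific types stripped out. The function name split_on_graphemes, the main driver, and the sample string are illustrative only and are not part of this patch; it assumes unicode-segmentation 1.x on the consumer side.

// Sketch of byte-limited chunking that never cuts through a grapheme cluster.
use unicode_segmentation::UnicodeSegmentation;

/// Split `content` into fragments of at most `max_len` bytes without splitting
/// a grapheme cluster. Clusters larger than `max_len` become fragments of their own.
fn split_on_graphemes(content: &str, max_len: usize) -> Vec<String> {
  let mut fragments = Vec::new();
  let mut current = String::with_capacity(max_len);

  for grapheme in content.graphemes(true) {
    if current.len() + grapheme.len() > max_len {
      if !current.is_empty() {
        // Hand off the accumulated fragment without cloning it.
        fragments.push(std::mem::take(&mut current));
      }
      if grapheme.len() > max_len {
        // A single cluster (e.g. a ZWJ emoji sequence) can exceed the limit;
        // emit it intact rather than cutting it mid-sequence.
        fragments.push(grapheme.to_string());
        continue;
      }
    }
    current.push_str(grapheme);
  }
  if !current.is_empty() {
    fragments.push(current);
  }
  fragments
}

fn main() {
  // "πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§" is 25 bytes but a single grapheme cluster, so it stays intact.
  for fragment in split_on_graphemes("Hello πŸ‘©β€πŸ‘©β€πŸ‘§β€πŸ‘§ world", 10) {
    println!("{} ({} bytes)", fragment, fragment.len());
  }
}

This mirrors the design choice in the new indexer code: fragment boundaries always fall on grapheme boundaries (and therefore on char boundaries), which is what the tests above assert, while the previous str::split_at approach could panic or produce invalid slices on multibyte input.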