From 90d8615c68faf46d62810e91f366ed204805af1b Mon Sep 17 00:00:00 2001 From: Jasper Huang Date: Mon, 28 Oct 2024 12:13:54 -0700 Subject: [PATCH] Merge pull request #51590 from Expensify/revert-51332-reapply-suffix-search-tree Revert "Revert "Revert "Search suffix tree implementation""" (cherry picked from commit c2ef29100c35a94c351fee0ef1a65a777b4b9075) (CP triggered by AndrewGable) --- src/CONST.ts | 3 - .../Search/SearchRouter/SearchRouter.tsx | 62 +---- src/libs/FastSearch.ts | 140 ------------ src/libs/OptionsListUtils.ts | 38 +--- src/libs/SuffixUkkonenTree/index.ts | 211 ------------------ src/libs/SuffixUkkonenTree/utils.ts | 115 ---------- tests/unit/FastSearchTest.ts | 118 ---------- tests/unit/SuffixUkkonenTreeTest.ts | 63 ------ 8 files changed, 14 insertions(+), 736 deletions(-) delete mode 100644 src/libs/FastSearch.ts delete mode 100644 src/libs/SuffixUkkonenTree/index.ts delete mode 100644 src/libs/SuffixUkkonenTree/utils.ts delete mode 100644 tests/unit/FastSearchTest.ts delete mode 100644 tests/unit/SuffixUkkonenTreeTest.ts diff --git a/src/CONST.ts b/src/CONST.ts index a3df980ff5dd..bb64b0e55ae7 100755 --- a/src/CONST.ts +++ b/src/CONST.ts @@ -1146,9 +1146,6 @@ const CONST = { SEARCH_OPTION_LIST_DEBOUNCE_TIME: 300, RESIZE_DEBOUNCE_TIME: 100, UNREAD_UPDATE_DEBOUNCE_TIME: 300, - SEARCH_CONVERT_SEARCH_VALUES: 'search_convert_search_values', - SEARCH_MAKE_TREE: 'search_make_tree', - SEARCH_BUILD_TREE: 'search_build_tree', SEARCH_FILTER_OPTIONS: 'search_filter_options', USE_DEBOUNCED_STATE_DELAY: 300, }, diff --git a/src/components/Search/SearchRouter/SearchRouter.tsx b/src/components/Search/SearchRouter/SearchRouter.tsx index cc9e9c6ca024..d2cb25c5a5f9 100644 --- a/src/components/Search/SearchRouter/SearchRouter.tsx +++ b/src/components/Search/SearchRouter/SearchRouter.tsx @@ -12,7 +12,6 @@ import useKeyboardShortcut from '@hooks/useKeyboardShortcut'; import useLocalize from '@hooks/useLocalize'; import useResponsiveLayout from '@hooks/useResponsiveLayout'; import useThemeStyles from '@hooks/useThemeStyles'; -import FastSearch from '@libs/FastSearch'; import Log from '@libs/Log'; import * as OptionsListUtils from '@libs/OptionsListUtils'; import {getAllTaxRates} from '@libs/PolicyUtils'; @@ -64,49 +63,6 @@ function SearchRouter({onRouterClose}: SearchRouterProps) { return OptionsListUtils.getSearchOptions(options, '', betas ?? []); }, [areOptionsInitialized, betas, options]); - /** - * Builds a suffix tree and returns a function to search in it. - */ - const findInSearchTree = useMemo(() => { - const fastSearch = FastSearch.createFastSearch([ - { - data: searchOptions.personalDetails, - toSearchableString: (option) => { - const displayName = option.participantsList?.[0]?.displayName ?? ''; - return [option.login ?? '', option.login !== displayName ? displayName : ''].join(); - }, - }, - { - data: searchOptions.recentReports, - toSearchableString: (option) => { - const searchStringForTree = [option.text ?? '', option.login ?? '']; - - if (option.isThread) { - if (option.alternateText) { - searchStringForTree.push(option.alternateText); - } - } else if (!!option.isChatRoom || !!option.isPolicyExpenseChat) { - if (option.subtitle) { - searchStringForTree.push(option.subtitle); - } - } - - return searchStringForTree.join(); - }, - }, - ]); - function search(searchInput: string) { - const [personalDetails, recentReports] = fastSearch.search(searchInput); - - return { - personalDetails, - recentReports, - }; - } - - return search; - }, [searchOptions.personalDetails, searchOptions.recentReports]); - const filteredOptions = useMemo(() => { if (debouncedInputValue.trim() === '') { return { @@ -117,25 +73,15 @@ function SearchRouter({onRouterClose}: SearchRouterProps) { } Timing.start(CONST.TIMING.SEARCH_FILTER_OPTIONS); - const newOptions = findInSearchTree(debouncedInputValue); + const newOptions = OptionsListUtils.filterOptions(searchOptions, debouncedInputValue, {sortByReportTypeInSearch: true, preferChatroomsOverThreads: true}); Timing.end(CONST.TIMING.SEARCH_FILTER_OPTIONS); - const recentReports = newOptions.recentReports.concat(newOptions.personalDetails); - - const userToInvite = OptionsListUtils.pickUserToInvite({ - canInviteUser: true, + return { recentReports: newOptions.recentReports, personalDetails: newOptions.personalDetails, - searchValue: debouncedInputValue, - optionsToExclude: [{login: CONST.EMAIL.NOTIFICATIONS}], - }); - - return { - recentReports, - personalDetails: [], - userToInvite, + userToInvite: newOptions.userToInvite, }; - }, [debouncedInputValue, findInSearchTree]); + }, [debouncedInputValue, searchOptions]); const recentReports: OptionData[] = useMemo(() => { if (debouncedInputValue === '') { diff --git a/src/libs/FastSearch.ts b/src/libs/FastSearch.ts deleted file mode 100644 index 59d28dedd449..000000000000 --- a/src/libs/FastSearch.ts +++ /dev/null @@ -1,140 +0,0 @@ -/* eslint-disable rulesdir/prefer-at */ -import CONST from '@src/CONST'; -import Timing from './actions/Timing'; -import SuffixUkkonenTree from './SuffixUkkonenTree'; - -type SearchableData = { - /** - * The data that should be searchable - */ - data: T[]; - /** - * A function that generates a string from a data entry. The string's value is used for searching. - * If you have multiple fields that should be searchable, simply concat them to the string and return it. - */ - toSearchableString: (data: T) => string; -}; - -// There are certain characters appear very often in our search data (email addresses), which we don't need to search for. -const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '?', '_', '~', '!', ' ']); - -/** - * Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for substrings in a list of strings. - * You can provide multiple datasets. The search results will be returned for each dataset. - * - * Note: Creating a FastSearch instance with a lot of data is computationally expensive. You should create an instance once and reuse it. - * Searches will be very fast though, even with a lot of data. - */ -function createFastSearch(dataSets: Array>) { - Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES); - const maxNumericListSize = 400_000; - // The user might provide multiple data sets, but internally, the search values will be stored in this one list: - let concatenatedNumericList = new Uint8Array(maxNumericListSize); - // Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data: - const occurrenceToIndex = new Uint32Array(maxNumericListSize * 4); - // As we are working with ArrayBuffers, we need to keep track of the current offset: - const offset = {value: 1}; - // We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet: - const listOffsets: number[] = []; - - for (const {data, toSearchableString} of dataSets) { - // Performance critical: the array parameters are passed by reference, so we don't have to create new arrays every time: - dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString}); - listOffsets.push(offset.value); - } - concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE; - listOffsets[listOffsets.length - 1] = offset.value; - Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES); - - // The list might be larger than necessary, so we clamp it to the actual size: - concatenatedNumericList = concatenatedNumericList.slice(0, offset.value); - - // Create & build the suffix tree: - Timing.start(CONST.TIMING.SEARCH_MAKE_TREE); - const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList); - Timing.end(CONST.TIMING.SEARCH_MAKE_TREE); - - Timing.start(CONST.TIMING.SEARCH_BUILD_TREE); - tree.build(); - Timing.end(CONST.TIMING.SEARCH_BUILD_TREE); - - /** - * Searches for the given input and returns results for each dataset. - */ - function search(searchInput: string): T[][] { - const cleanedSearchString = cleanString(searchInput); - const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, { - charSetToSkip, - // stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size - // (otherwise the search could fail as we include in our search empty array values): - clamp: true, - }); - const result = tree.findSubstring(Array.from(numeric)); - - const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set()); - // eslint-disable-next-line @typescript-eslint/prefer-for-of - for (let i = 0; i < result.length; i++) { - const occurrenceIndex = result[i]; - const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex]; - const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset); - - if (dataSetIndex === -1) { - throw new Error(`[FastSearch] The occurrence index ${occurrenceIndex} is not in any dataset`); - } - const item = dataSets[dataSetIndex].data[itemIndexInDataSet]; - if (!item) { - throw new Error(`[FastSearch] The item with index ${itemIndexInDataSet} in dataset ${dataSetIndex} is not defined`); - } - resultsByDataSet[dataSetIndex].add(item); - } - - return resultsByDataSet.map((set) => Array.from(set)); - } - - return { - search, - }; -} - -/** - * The suffix tree can only store string like values, and internally stores those as numbers. - * This function converts the user data (which are most likely objects) to a numeric representation. - * Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data. - */ -function dataToNumericRepresentation(concatenatedNumericList: Uint8Array, occurrenceToIndex: Uint32Array, offset: {value: number}, {data, toSearchableString}: SearchableData): void { - data.forEach((option, index) => { - const searchStringForTree = toSearchableString(option); - const cleanedSearchStringForTree = cleanString(searchStringForTree); - - if (cleanedSearchStringForTree.length === 0) { - return; - } - - SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, { - charSetToSkip, - out: { - outArray: concatenatedNumericList, - offset, - outOccurrenceToIndex: occurrenceToIndex, - index, - }, - }); - // eslint-disable-next-line no-param-reassign - occurrenceToIndex[offset.value] = index; - // eslint-disable-next-line no-param-reassign - concatenatedNumericList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE; - }); -} - -/** - * Everything in the tree is treated as lowercase. - */ -function cleanString(input: string) { - return input.toLowerCase(); -} - -const FastSearch = { - createFastSearch, -}; - -export default FastSearch; diff --git a/src/libs/OptionsListUtils.ts b/src/libs/OptionsListUtils.ts index f414d2328ef6..497a2d33cf56 100644 --- a/src/libs/OptionsListUtils.ts +++ b/src/libs/OptionsListUtils.ts @@ -2477,31 +2477,6 @@ function getPersonalDetailSearchTerms(item: Partial) { function getCurrentUserSearchTerms(item: ReportUtils.OptionData) { return [item.text ?? '', item.login ?? '', item.login?.replace(CONST.EMAIL_SEARCH_REGEX, '') ?? '']; } - -type PickUserToInviteParams = { - canInviteUser: boolean; - recentReports: ReportUtils.OptionData[]; - personalDetails: ReportUtils.OptionData[]; - searchValue: string; - config?: FilterOptionsConfig; - optionsToExclude: Option[]; -}; - -const pickUserToInvite = ({canInviteUser, recentReports, personalDetails, searchValue, config, optionsToExclude}: PickUserToInviteParams) => { - let userToInvite = null; - if (canInviteUser) { - if (recentReports.length === 0 && personalDetails.length === 0) { - userToInvite = getUserToInviteOption({ - searchValue, - selectedOptions: config?.selectedOptions, - optionsToExclude, - }); - } - } - - return userToInvite; -}; - /** * Filters options based on the search input value */ @@ -2589,7 +2564,16 @@ function filterOptions(options: Options, searchInputValue: string, config?: Filt recentReports = orderOptions(recentReports, searchValue); } - const userToInvite = pickUserToInvite({canInviteUser, recentReports, personalDetails, searchValue, config, optionsToExclude}); + let userToInvite = null; + if (canInviteUser) { + if (recentReports.length === 0 && personalDetails.length === 0) { + userToInvite = getUserToInviteOption({ + searchValue, + selectedOptions: config?.selectedOptions, + optionsToExclude, + }); + } + } if (maxRecentReportsToShow > 0 && recentReports.length > maxRecentReportsToShow) { recentReports.splice(maxRecentReportsToShow); @@ -2659,7 +2643,6 @@ export { formatMemberForList, formatSectionsFromSearchTerm, getShareLogOptions, - orderOptions, filterOptions, createOptionList, createOptionFromReport, @@ -2674,7 +2657,6 @@ export { shouldUseBoldText, getAttendeeOptions, getAlternateText, - pickUserToInvite, hasReportErrors, }; diff --git a/src/libs/SuffixUkkonenTree/index.ts b/src/libs/SuffixUkkonenTree/index.ts deleted file mode 100644 index bcefd1008493..000000000000 --- a/src/libs/SuffixUkkonenTree/index.ts +++ /dev/null @@ -1,211 +0,0 @@ -/* eslint-disable rulesdir/prefer-at */ -// .at() has a performance overhead we explicitly want to avoid here - -/* eslint-disable no-continue */ -import {ALPHABET_SIZE, DELIMITER_CHAR_CODE, END_CHAR_CODE, SPECIAL_CHAR_CODE, stringToNumeric} from './utils'; - -/** - * This implements a suffix tree using Ukkonen's algorithm. - * A good visualization to learn about the algorithm can be found here: https://brenden.github.io/ukkonen-animation/ - * A good video explaining Ukkonen's algorithm can be found here: https://www.youtube.com/watch?v=ALEV0Hc5dDk - * Note: This implementation is optimized for performance, not necessarily for readability. - * - * You probably don't want to use this directly, but rather use @libs/FastSearch.ts as a easy to use wrapper around this. - */ - -/** - * Creates a new tree instance that can be used to build a suffix tree and search in it. - * The input is a numeric representation of the search string, which can be created using {@link stringToNumeric}. - * Separate search values must be separated by the {@link DELIMITER_CHAR_CODE}. The search string must end with the {@link END_CHAR_CODE}. - * - * The tree will be built using the Ukkonen's algorithm: https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf - */ -function makeTree(numericSearchValues: Uint8Array) { - // Every leaf represents a suffix. There can't be more than n suffixes. - // Every internal node has to have at least 2 children. So the total size of ukkonen tree is not bigger than 2n - 1. - // + 1 is because an extra character at the beginning to offset the 1-based indexing. - const maxNodes = 2 * numericSearchValues.length + 1; - /* - This array represents all internal nodes in the suffix tree. - When building this tree, we'll be given a character in the string, and we need to be able to lookup in constant time - if there's any edge connected to a node starting with that character. For example, given a tree like this: - - root - / | \ - a b c - - and the next character in our string is 'd', we need to be able do check if any of the edges from the root node - start with the letter 'd', without looping through all the edges. - - To accomplish this, each node gets an array matching the alphabet size. - So you can imagine if our alphabet was just [a,b,c,d], then each node would get an array like [0,0,0,0]. - If we add an edge starting with 'a', then the root node would be [1,0,0,0] - So given an arbitrary letter such as 'd', then we can take the position of that letter in its alphabet (position 3 in our example) - and check whether that index in the array is 0 or 1. If it's a 1, then there's an edge starting with the letter 'd'. - - Note that for efficiency, all nodes are stored in a single flat array. That's how we end up with (maxNodes * alphabet_size). - In the example of a 4-character alphabet, we'd have an array like this: - - root root.left root.right last possible node - / \ / \ / \ / \ - [0,0,0,0, 0,0,0,0, 0,0,0,0, ................. 0,0,0,0] - */ - const transitionNodes = new Uint32Array(maxNodes * ALPHABET_SIZE); - - // Storing the range of the original string that each node represents: - const rangeStart = new Uint32Array(maxNodes); - const rangeEnd = new Uint32Array(maxNodes); - - const parent = new Uint32Array(maxNodes); - const suffixLink = new Uint32Array(maxNodes); - - let currentNode = 1; - let currentPosition = 1; - let nodeCounter = 3; - let currentIndex = 1; - - function initializeTree() { - rangeEnd.fill(numericSearchValues.length); - rangeEnd[1] = 0; - rangeEnd[2] = 0; - suffixLink[1] = 2; - for (let i = 0; i < ALPHABET_SIZE; ++i) { - transitionNodes[ALPHABET_SIZE * 2 + i] = 1; - } - } - - function processCharacter(char: number) { - // eslint-disable-next-line no-constant-condition - while (true) { - if (rangeEnd[currentNode] < currentPosition) { - if (transitionNodes[currentNode * ALPHABET_SIZE + char] === 0) { - createNewLeaf(char); - continue; - } - currentNode = transitionNodes[currentNode * ALPHABET_SIZE + char]; - currentPosition = rangeStart[currentNode]; - } - if (currentPosition === 0 || char === numericSearchValues[currentPosition]) { - currentPosition++; - } else { - splitEdge(char); - continue; - } - break; - } - } - - function createNewLeaf(c: number) { - transitionNodes[currentNode * ALPHABET_SIZE + c] = nodeCounter; - rangeStart[nodeCounter] = currentIndex; - parent[nodeCounter++] = currentNode; - currentNode = suffixLink[currentNode]; - - currentPosition = rangeEnd[currentNode] + 1; - } - - function splitEdge(c: number) { - rangeStart[nodeCounter] = rangeStart[currentNode]; - rangeEnd[nodeCounter] = currentPosition - 1; - parent[nodeCounter] = parent[currentNode]; - - transitionNodes[nodeCounter * ALPHABET_SIZE + numericSearchValues[currentPosition]] = currentNode; - transitionNodes[nodeCounter * ALPHABET_SIZE + c] = nodeCounter + 1; - rangeStart[nodeCounter + 1] = currentIndex; - parent[nodeCounter + 1] = nodeCounter; - rangeStart[currentNode] = currentPosition; - parent[currentNode] = nodeCounter; - - transitionNodes[parent[nodeCounter] * ALPHABET_SIZE + numericSearchValues[rangeStart[nodeCounter]]] = nodeCounter; - nodeCounter += 2; - handleDescent(nodeCounter); - } - - function handleDescent(latestNodeIndex: number) { - currentNode = suffixLink[parent[latestNodeIndex - 2]]; - currentPosition = rangeStart[latestNodeIndex - 2]; - while (currentPosition <= rangeEnd[latestNodeIndex - 2]) { - currentNode = transitionNodes[currentNode * ALPHABET_SIZE + numericSearchValues[currentPosition]]; - currentPosition += rangeEnd[currentNode] - rangeStart[currentNode] + 1; - } - if (currentPosition === rangeEnd[latestNodeIndex - 2] + 1) { - suffixLink[latestNodeIndex - 2] = currentNode; - } else { - suffixLink[latestNodeIndex - 2] = latestNodeIndex; - } - currentPosition = rangeEnd[currentNode] - (currentPosition - rangeEnd[latestNodeIndex - 2]) + 2; - } - - function build() { - initializeTree(); - for (currentIndex = 1; currentIndex < numericSearchValues.length; ++currentIndex) { - const c = numericSearchValues[currentIndex]; - processCharacter(c); - } - } - - /** - * Returns all occurrences of the given (sub)string in the input string. - * - * You can think of the tree that we create as a big string that looks like this: - * - * "banana$pancake$apple|" - * The example delimiter character '$' is used to separate the different strings. - * The end character '|' is used to indicate the end of our search string. - * - * This function will return the index(es) of found occurrences within this big string. - * So, when searching for "an", it would return [1, 3, 8]. - */ - function findSubstring(searchValue: number[]) { - const occurrences: number[] = []; - - function dfs(node: number, depth: number) { - const leftRange = rangeStart[node]; - const rightRange = rangeEnd[node]; - const rangeLen = node === 1 ? 0 : rightRange - leftRange + 1; - - for (let i = 0; i < rangeLen && depth + i < searchValue.length && leftRange + i < numericSearchValues.length; i++) { - if (searchValue[depth + i] !== numericSearchValues[leftRange + i]) { - return; - } - } - - let isLeaf = true; - for (let i = 0; i < ALPHABET_SIZE; ++i) { - const tNode = transitionNodes[node * ALPHABET_SIZE + i]; - - // Search speed optimization: don't go through the edge if it's different than the next char: - const correctChar = depth + rangeLen >= searchValue.length || i === searchValue[depth + rangeLen]; - - if (tNode !== 0 && tNode !== 1 && correctChar) { - isLeaf = false; - dfs(tNode, depth + rangeLen); - } - } - - if (isLeaf && depth + rangeLen >= searchValue.length) { - occurrences.push(numericSearchValues.length - (depth + rangeLen) + 1); - } - } - - dfs(1, 0); - return occurrences; - } - - return { - build, - findSubstring, - }; -} - -const SuffixUkkonenTree = { - makeTree, - - // Re-exported from utils: - DELIMITER_CHAR_CODE, - SPECIAL_CHAR_CODE, - END_CHAR_CODE, - stringToNumeric, -}; - -export default SuffixUkkonenTree; diff --git a/src/libs/SuffixUkkonenTree/utils.ts b/src/libs/SuffixUkkonenTree/utils.ts deleted file mode 100644 index 96ee35b15796..000000000000 --- a/src/libs/SuffixUkkonenTree/utils.ts +++ /dev/null @@ -1,115 +0,0 @@ -/* eslint-disable rulesdir/prefer-at */ // .at() has a performance overhead we explicitly want to avoid here -/* eslint-disable no-continue */ - -const CHAR_CODE_A = 'a'.charCodeAt(0); -const ALPHABET = 'abcdefghijklmnopqrstuvwxyz'; -const LETTER_ALPHABET_SIZE = ALPHABET.length; -const ALPHABET_SIZE = LETTER_ALPHABET_SIZE + 3; // +3: special char, delimiter char, end char -const SPECIAL_CHAR_CODE = ALPHABET_SIZE - 3; -const DELIMITER_CHAR_CODE = ALPHABET_SIZE - 2; -const END_CHAR_CODE = ALPHABET_SIZE - 1; - -// Store the results for a char code in a lookup table to avoid recalculating the same values (performance optimization) -const base26LookupTable = new Array(); - -/** - * Converts a number to a base26 representation. - */ -function convertToBase26(num: number): number[] { - if (base26LookupTable[num]) { - return base26LookupTable[num]; - } - if (num < 0) { - throw new Error('convertToBase26: Input must be a non-negative integer'); - } - - const result: number[] = []; - - do { - // eslint-disable-next-line no-param-reassign - num--; - result.unshift(num % 26); - // eslint-disable-next-line no-bitwise, no-param-reassign - num >>= 5; // Equivalent to Math.floor(num / 26), but faster - } while (num > 0); - - base26LookupTable[num] = result; - return result; -} - -/** - * Converts a string to an array of numbers representing the characters of the string. - * Every number in the array is in the range [0, ALPHABET_SIZE-1] (0-28). - * - * The numbers are offset by the character code of 'a' (97). - * - This is so that the numbers from a-z are in the range 0-28. - * - 26 is for encoding special characters. Character numbers that are not within the range of a-z will be encoded as "specialCharacter + base26(charCode)" - * - 27 is for the delimiter character - * - 28 is for the end character - * - * Note: The string should be converted to lowercase first (otherwise uppercase letters get base26'ed taking more space than necessary). - */ -function stringToNumeric( - // The string we want to convert to a numeric representation - input: string, - options?: { - // A set of characters that should be skipped and not included in the numeric representation - charSetToSkip?: Set; - // When out is provided, the function will write the result to the provided arrays instead of creating new ones (performance) - out?: { - outArray: Uint8Array; - // As outArray is a ArrayBuffer we need to keep track of the current offset - offset: {value: number}; - // A map of to map the found occurrences to the correct data set - // As the search string can be very long for high traffic accounts (500k+), this has to be big enough, thus its a Uint32Array - outOccurrenceToIndex?: Uint32Array; - // The index that will be used in the outOccurrenceToIndex array (this is the index of your original data position) - index?: number; - }; - // By default false. By default the outArray may be larger than necessary. If clamp is set to true the outArray will be clamped to the actual size. - clamp?: boolean; - }, -): { - numeric: Uint8Array; - occurrenceToIndex: Uint32Array; - offset: {value: number}; -} { - // The out array might be longer than our input string length, because we encode special characters as multiple numbers using the base26 encoding. - // * 6 is because the upper limit of encoding any char in UTF-8 to base26 is at max 6 numbers. - const outArray = options?.out?.outArray ?? new Uint8Array(input.length * 6); - const offset = options?.out?.offset ?? {value: 0}; - const occurrenceToIndex = options?.out?.outOccurrenceToIndex ?? new Uint32Array(input.length * 16 * 4); - const index = options?.out?.index ?? 0; - - for (let i = 0; i < input.length; i++) { - const char = input[i]; - - if (options?.charSetToSkip?.has(char)) { - continue; - } - - if (char >= 'a' && char <= 'z') { - // char is an alphabet character - occurrenceToIndex[offset.value] = index; - outArray[offset.value++] = char.charCodeAt(0) - CHAR_CODE_A; - } else { - const charCode = input.charCodeAt(i); - occurrenceToIndex[offset.value] = index; - outArray[offset.value++] = SPECIAL_CHAR_CODE; - const asBase26Numeric = convertToBase26(charCode); - // eslint-disable-next-line @typescript-eslint/prefer-for-of - for (let j = 0; j < asBase26Numeric.length; j++) { - occurrenceToIndex[offset.value] = index; - outArray[offset.value++] = asBase26Numeric[j]; - } - } - } - - return { - numeric: options?.clamp ? outArray.slice(0, offset.value) : outArray, - occurrenceToIndex, - offset, - }; -} - -export {stringToNumeric, ALPHABET, ALPHABET_SIZE, SPECIAL_CHAR_CODE, DELIMITER_CHAR_CODE, END_CHAR_CODE}; diff --git a/tests/unit/FastSearchTest.ts b/tests/unit/FastSearchTest.ts deleted file mode 100644 index 029e05e15b1f..000000000000 --- a/tests/unit/FastSearchTest.ts +++ /dev/null @@ -1,118 +0,0 @@ -import FastSearch from '../../src/libs/FastSearch'; - -describe('FastSearch', () => { - it('should insert, and find the word', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['banana'], - toSearchableString: (data) => data, - }, - ]); - expect(search('an')).toEqual([['banana']]); - }); - - it('should work with multiple words', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['banana', 'test'], - toSearchableString: (data) => data, - }, - ]); - - expect(search('es')).toEqual([['test']]); - }); - - it('should work when providing two data sets', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['erica', 'banana'], - toSearchableString: (data) => data, - }, - { - data: ['banana', 'test'], - toSearchableString: (data) => data, - }, - ]); - - expect(search('es')).toEqual([[], ['test']]); - }); - - it('should work with numbers', () => { - const {search} = FastSearch.createFastSearch([ - { - data: [1, 2, 3, 4, 5], - toSearchableString: (data) => String(data), - }, - ]); - - expect(search('2')).toEqual([[2]]); - }); - - it('should work with unicodes', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['banana', 'ñèşťǒř', 'test'], - toSearchableString: (data) => data, - }, - ]); - - expect(search('èşť')).toEqual([['ñèşťǒř']]); - }); - - it('should work with words containing "reserved special characters"', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['ba|nana', 'te{st', 'he}llo'], - toSearchableString: (data) => data, - }, - ]); - - expect(search('st')).toEqual([['te{st']]); - expect(search('llo')).toEqual([['he}llo']]); - expect(search('nana')).toEqual([['ba|nana']]); - }); - - it('should be case insensitive', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['banana', 'TeSt', 'TEST', 'X'], - toSearchableString: (data) => data, - }, - ]); - - expect(search('test')).toEqual([['TeSt', 'TEST']]); - }); - - it('should work with large random data sets', () => { - const data = Array.from({length: 1000}, () => { - return Array.from({length: Math.floor(Math.random() * 22 + 9)}, () => { - const alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789@-_.'; - return alphabet.charAt(Math.floor(Math.random() * alphabet.length)); - }).join(''); - }); - - const {search} = FastSearch.createFastSearch([ - { - data, - toSearchableString: (x) => x, - }, - ]); - - data.forEach((word) => { - expect(search(word)).toEqual([expect.arrayContaining([word])]); - }); - }); - - it('should find email addresses without dots', () => { - const {search} = FastSearch.createFastSearch([ - { - data: ['test.user@example.com', 'unrelated'], - toSearchableString: (data) => data, - }, - ]); - - expect(search('testuser')).toEqual([['test.user@example.com']]); - expect(search('test.user')).toEqual([['test.user@example.com']]); - expect(search('examplecom')).toEqual([['test.user@example.com']]); - }); -}); diff --git a/tests/unit/SuffixUkkonenTreeTest.ts b/tests/unit/SuffixUkkonenTreeTest.ts deleted file mode 100644 index c0c556c16e14..000000000000 --- a/tests/unit/SuffixUkkonenTreeTest.ts +++ /dev/null @@ -1,63 +0,0 @@ -import SuffixUkkonenTree from '@libs/SuffixUkkonenTree/index'; - -describe('SuffixUkkonenTree', () => { - // The suffix tree doesn't take strings, but expects an array buffer, where strings have been separated by a delimiter. - function helperStringsToNumericForTree(strings: string[]) { - const numericLists = strings.map((s) => SuffixUkkonenTree.stringToNumeric(s, {clamp: true})); - const numericList = numericLists.reduce( - (acc, {numeric}) => { - acc.push(...numeric, SuffixUkkonenTree.DELIMITER_CHAR_CODE); - return acc; - }, - // The value we pass to makeTree needs to be offset by one - [0], - ); - numericList.push(SuffixUkkonenTree.END_CHAR_CODE); - return Uint8Array.from(numericList); - } - - it('should insert, build, and find all occurrences', () => { - const strings = ['banana', 'pancake']; - const numericIntArray = helperStringsToNumericForTree(strings); - - const tree = SuffixUkkonenTree.makeTree(numericIntArray); - tree.build(); - const searchValue = SuffixUkkonenTree.stringToNumeric('an', {clamp: true}).numeric; - expect(tree.findSubstring(Array.from(searchValue))).toEqual(expect.arrayContaining([2, 4, 9])); - }); - - it('should find by first character', () => { - const strings = ['pancake', 'banana']; - const numericIntArray = helperStringsToNumericForTree(strings); - const tree = SuffixUkkonenTree.makeTree(numericIntArray); - tree.build(); - const searchValue = SuffixUkkonenTree.stringToNumeric('p', {clamp: true}).numeric; - expect(tree.findSubstring(Array.from(searchValue))).toEqual(expect.arrayContaining([1])); - }); - - it('should handle identical words', () => { - const strings = ['banana', 'banana', 'x']; - const numericIntArray = helperStringsToNumericForTree(strings); - const tree = SuffixUkkonenTree.makeTree(numericIntArray); - tree.build(); - const searchValue = SuffixUkkonenTree.stringToNumeric('an', {clamp: true}).numeric; - expect(tree.findSubstring(Array.from(searchValue))).toEqual(expect.arrayContaining([2, 4, 9, 11])); - }); - - it('should convert string to numeric with a list of chars to skip', () => { - const {numeric} = SuffixUkkonenTree.stringToNumeric('abcabc', { - charSetToSkip: new Set(['b']), - clamp: true, - }); - expect(Array.from(numeric)).toEqual([0, 2, 0, 2]); - }); - - it('should convert string outside of a-z to numeric with clamping', () => { - const {numeric} = SuffixUkkonenTree.stringToNumeric('2', { - clamp: true, - }); - - // "2" in ASCII is 50, so base26(50) = [0, 23] - expect(Array.from(numeric)).toEqual([SuffixUkkonenTree.SPECIAL_CHAR_CODE, 0, 23]); - }); -});