-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #51954 from margelo/perf/search-tree
Search suffix tree implementation
- Loading branch information
Showing
11 changed files
with
1,005 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
import {useMemo} from 'react'; | ||
import FastSearch from '@libs/FastSearch'; | ||
import * as OptionsListUtils from '@libs/OptionsListUtils'; | ||
|
||
// Union of the two option shapes the search function of useFastSearchFromOptions can return. | ||
type AllOrSelectiveOptions = OptionsListUtils.ReportAndPersonalDetailOptions | OptionsListUtils.Options; | ||
|
||
// Configuration object accepted as the second argument of useFastSearchFromOptions. | ||
type Options = { | ||
// When true (and the passed options contain `currentUserOption`), the result also carries `userToInvite` and `currentUserOption`. | ||
includeUserToInvite: boolean; | ||
}; | ||
|
||
// Result returned by the search function when the input contains no non-empty search word. | ||
// NOTE(review): this is a single shared object, so callers should treat it as read-only. | ||
const emptyResult = { | ||
personalDetails: [], | ||
recentReports: [], | ||
}; | ||
|
||
// You can either use this to search within report and personal details options
function useFastSearchFromOptions(
    options: OptionsListUtils.ReportAndPersonalDetailOptions,
    config?: {includeUserToInvite: false},
): (searchInput: string) => OptionsListUtils.ReportAndPersonalDetailOptions;
// Or you can use this to include the user invite option. This will require passing all options
function useFastSearchFromOptions(options: OptionsListUtils.Options, config?: {includeUserToInvite: true}): (searchInput: string) => OptionsListUtils.Options;

/**
 * Hook for making options from OptionsListUtils searchable with FastSearch.
 * Builds a suffix tree and returns a function to search in it.
 * The tree is rebuilt only when `options` or `includeUserToInvite` change.
 *
 * @param options - The options to search in (`personalDetails` and `recentReports` are indexed).
 * @param config - Pass `{includeUserToInvite: true}` to additionally get `userToInvite` and `currentUserOption` in the result.
 *                 This only takes effect when `options` contains `currentUserOption`.
 * @returns A function that takes the search input and returns the matching options.
 *          It returns an empty result when the input contains no non-empty word.
 *
 * @example
 * ```
 * const options = OptionsListUtils.getSearchOptions(...);
 * const filterOptions = useFastSearchFromOptions(options);
 * const {personalDetails, recentReports} = filterOptions('jo d');
 * ```
 */
function useFastSearchFromOptions(
    options: OptionsListUtils.ReportAndPersonalDetailOptions | OptionsListUtils.Options,
    {includeUserToInvite}: Options = {includeUserToInvite: false},
): (searchInput: string) => AllOrSelectiveOptions {
    const findInSearchTree = useMemo(() => {
        const fastSearch = FastSearch.createFastSearch([
            {
                data: options.personalDetails,
                toSearchableString: (option) => {
                    const displayName = option.participantsList?.[0]?.displayName ?? '';
                    // Only add the display name when it differs from the login, to keep the tree small
                    return [option.login ?? '', option.login !== displayName ? displayName : ''].join();
                },
                uniqueId: (option) => option.login,
            },
            {
                data: options.recentReports,
                toSearchableString: (option) => {
                    const searchStringForTree = [option.text ?? '', option.login ?? ''];

                    if (option.isThread) {
                        if (option.alternateText) {
                            searchStringForTree.push(option.alternateText);
                        }
                    } else if (!!option.isChatRoom || !!option.isPolicyExpenseChat) {
                        if (option.subtitle) {
                            searchStringForTree.push(option.subtitle);
                        }
                    }

                    return searchStringForTree.join();
                },
            },
        ]);

        function search(searchInput: string): AllOrSelectiveOptions {
            const searchWords = searchInput.split(' ');
            // Pick the longest word by its length. A plain `.sort()` orders lexicographically,
            // which would e.g. select "zo" instead of "alexander" for the input "zo alexander".
            const longestSearchWord = searchWords.reduce((longest, word) => (word.length > longest.length ? word : longest), '');
            if (!longestSearchWord) {
                return emptyResult;
            }

            // The user might have separated words with spaces to do a search such as: "jo d" -> "john doe"
            // With the suffix search tree you can only search for one word at a time. It's most efficient to search for the longest word,
            // (as this will limit the results the most) and then afterwards run a quick filter on the results to see if the other words are present.
            let [personalDetails, recentReports] = fastSearch.search(longestSearchWord);

            if (searchWords.length > 1) {
                personalDetails = personalDetails.filter((pd) => OptionsListUtils.isSearchStringMatch(searchInput, pd.text));
                recentReports = recentReports.filter((rr) => OptionsListUtils.isSearchStringMatch(searchInput, rr.text));
            }

            if (includeUserToInvite && 'currentUserOption' in options) {
                const userToInvite = OptionsListUtils.filterUserToInvite(
                    {
                        ...options,
                        personalDetails,
                        recentReports,
                    },
                    searchInput,
                );
                return {
                    personalDetails,
                    recentReports,
                    userToInvite,
                    currentUserOption: options.currentUserOption,
                };
            }

            return {
                personalDetails,
                recentReports,
            };
        }

        return search;
    }, [includeUserToInvite, options]);

    return findInSearchTree;
}
|
||
export default useFastSearchFromOptions; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
/* eslint-disable rulesdir/prefer-at */ | ||
import CONST from '@src/CONST'; | ||
import Timing from './actions/Timing'; | ||
import SuffixUkkonenTree from './SuffixUkkonenTree'; | ||
|
||
// Describes one data set passed to createFastSearch: the items, how to turn an item into searchable text, | ||
// and (optionally) how to identify an item so that duplicates can be removed from the search results. | ||
type SearchableData<T> = { | ||
/** | ||
* The data that should be searchable | ||
*/ | ||
data: T[]; | ||
/** | ||
* A function that generates a string from a data entry. The string's value is used for searching. | ||
* If you have multiple fields that should be searchable, simply concat them to the string and return it. | ||
*/ | ||
toSearchableString: (data: T) => string; | ||
|
||
/** | ||
* Gives the possibility to identify data by a unique attribute. Assume you have two search results with the same text; they might both be valid | ||
* and represent different data. In this case, you can provide a function that returns a unique identifier for the data. | ||
* If multiple items with the same identifier are found, only the first one will be returned. | ||
* This fixes: https://github.com/Expensify/App/issues/53579 | ||
*/ | ||
uniqueId?: (data: T) => string | undefined; | ||
}; | ||
|
||
// Certain characters appear very often in our search data (e.g. in email addresses) and don't need to be searchable, so they are skipped. | ||
const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '?', '_', '~', '!', ' ', ',', '(', ')']); | ||
|
||
/**
 * Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for substrings in a list of strings.
 * You can provide multiple datasets. The search results will be returned for each dataset.
 *
 * Note: Creating a FastSearch instance with a lot of data is computationally expensive. You should create an instance once and reuse it.
 * Searches will be very fast though, even with a lot of data.
 *
 * @param dataSets - The data sets to index. The order is preserved in the search results.
 * @returns An object with a `search` function. `search` returns one result list per data set, in the order of `dataSets`.
 */
function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
    Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
    // NOTE(review): the buffers have a fixed capacity; what happens when the data exceeds it depends on stringToNumeric - confirm.
    const maxNumericListSize = 400_000;
    // The user might provide multiple data sets, but internally, the search values will be stored in this one list:
    let concatenatedNumericList = new Uint8Array(maxNumericListSize);
    // Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data:
    const occurrenceToIndex = new Uint32Array(maxNumericListSize * 4);
    // As we are working with ArrayBuffers, we need to keep track of the current offset:
    const offset = {value: 1};
    // We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet:
    const listOffsets: number[] = [];

    for (const {data, toSearchableString} of dataSets) {
        // Performance critical: the array parameters are passed by reference, so we don't have to create new arrays every time:
        dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString});
        listOffsets.push(offset.value);
    }
    concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE;
    // The end character is counted towards the last data set. Without any data set there is nothing to extend
    // (an unguarded write would create a "-1" property on the array).
    if (listOffsets.length > 0) {
        listOffsets[listOffsets.length - 1] = offset.value;
    }
    Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);

    // The list might be larger than necessary, so we clamp it to the actual size:
    concatenatedNumericList = concatenatedNumericList.slice(0, offset.value);

    // Create & build the suffix tree:
    Timing.start(CONST.TIMING.SEARCH_MAKE_TREE);
    const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList);
    Timing.end(CONST.TIMING.SEARCH_MAKE_TREE);

    Timing.start(CONST.TIMING.SEARCH_BUILD_TREE);
    tree.build();
    Timing.end(CONST.TIMING.SEARCH_BUILD_TREE);

    /**
     * Searches for the given input and returns results for each dataset.
     *
     * @throws If an occurrence cannot be mapped back to a data set or to an item of that data set.
     */
    function search(searchInput: string): T[][] {
        const cleanedSearchString = cleanString(searchInput);
        const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, {
            charSetToSkip,
            // stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size
            // (otherwise the search could fail as we include in our search empty array values):
            clamp: true,
        });
        const result = tree.findSubstring(Array.from(numeric));

        const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set<T>());
        // Unique ids already emitted, per data set. A Set is used instead of a plain object so that ids such as
        // "constructor" or "__proto__" are not mistaken for already-seen entries via Object.prototype.
        const seenUniqueIdsByDataSet = Array.from({length: dataSets.length}, () => new Set<string>());
        // eslint-disable-next-line @typescript-eslint/prefer-for-of
        for (let i = 0; i < result.length; i++) {
            const occurrenceIndex = result[i];
            const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex];
            const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset);

            if (dataSetIndex === -1) {
                throw new Error(`[FastSearch] The occurrence index ${occurrenceIndex} is not in any dataset`);
            }
            const item = dataSets[dataSetIndex].data[itemIndexInDataSet];
            if (!item) {
                throw new Error(`[FastSearch] The item with index ${itemIndexInDataSet} in dataset ${dataSetIndex} is not defined`);
            }

            // Skip items whose unique id was already returned for this data set (only the first one is kept).
            // Items without a (non-empty) unique id are never de-duplicated by id.
            const uniqueId = dataSets[dataSetIndex].uniqueId?.(item);
            if (uniqueId) {
                const seenUniqueIds = seenUniqueIdsByDataSet[dataSetIndex];
                if (seenUniqueIds.has(uniqueId)) {
                    // eslint-disable-next-line no-continue
                    continue;
                }
                seenUniqueIds.add(uniqueId);
            }

            resultsByDataSet[dataSetIndex].add(item);
        }

        return resultsByDataSet.map((set) => Array.from(set));
    }

    return {
        search,
    };
}
|
||
/**
 * Appends the searchable text of every entry of a data set to the shared numeric list used by the suffix tree.
 * Each entry is converted to a string, lowercased, and handed to SuffixUkkonenTree.stringToNumeric together with the
 * shared output buffers. After each entry a delimiter is written, and the entry's index is recorded for the delimiter
 * position so that found occurrences can be mapped back to the original data.
 * Entries whose searchable string is empty are skipped entirely.
 *
 * The buffers and `offset` are mutated in place.
 */
function dataToNumericRepresentation<T>(concatenatedNumericList: Uint8Array, occurrenceToIndex: Uint32Array, offset: {value: number}, {data, toSearchableString}: SearchableData<T>): void {
    data.forEach((entry, entryIndex) => {
        const normalizedText = cleanString(toSearchableString(entry));

        if (normalizedText.length > 0) {
            SuffixUkkonenTree.stringToNumeric(normalizedText, {
                charSetToSkip,
                out: {
                    outArray: concatenatedNumericList,
                    offset,
                    outOccurrenceToIndex: occurrenceToIndex,
                    index: entryIndex,
                },
            });

            // Terminate the entry with a delimiter and advance the shared offset past it
            const delimiterPosition = offset.value;
            // eslint-disable-next-line no-param-reassign
            offset.value = delimiterPosition + 1;
            // eslint-disable-next-line no-param-reassign
            occurrenceToIndex[delimiterPosition] = entryIndex;
            // eslint-disable-next-line no-param-reassign
            concatenatedNumericList[delimiterPosition] = SuffixUkkonenTree.DELIMITER_CHAR_CODE;
        }
    });
}
|
||
/**
 * Normalizes a string before it is indexed or searched: the result is the lowercased input,
 * so that everything in the tree is treated as lowercase.
 */
function cleanString(input: string) {
    const lowercased = input.toLowerCase();
    return lowercased;
}
|
||
// Object exported as this module's default; exposes createFastSearch as FastSearch.createFastSearch. | ||
const FastSearch = { | ||
createFastSearch, | ||
}; | ||
|
||
export default FastSearch; |
Oops, something went wrong.