Skip to content

Commit

Permalink
Merge pull request #51954 from margelo/perf/search-tree
Browse files Browse the repository at this point in the history
Search suffix tree implementation
  • Loading branch information
marcaaron authored Dec 23, 2024
2 parents 8c83bc6 + ed3f3d3 commit bda52b2
Show file tree
Hide file tree
Showing 11 changed files with 1,005 additions and 46 deletions.
3 changes: 3 additions & 0 deletions src/CONST.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1336,6 +1336,9 @@ const CONST = {
SEARCH_OPTION_LIST_DEBOUNCE_TIME: 300,
RESIZE_DEBOUNCE_TIME: 100,
UNREAD_UPDATE_DEBOUNCE_TIME: 300,
SEARCH_CONVERT_SEARCH_VALUES: 'search_convert_search_values',
SEARCH_MAKE_TREE: 'search_make_tree',
SEARCH_BUILD_TREE: 'search_build_tree',
SEARCH_FILTER_OPTIONS: 'search_filter_options',
USE_DEBOUNCED_STATE_DELAY: 300,
LIST_SCROLLING_DEBOUNCE_TIME: 200,
Expand Down
18 changes: 14 additions & 4 deletions src/components/Search/SearchRouter/SearchRouterList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import type {SearchQueryItem, SearchQueryListItemProps} from '@components/Select
import type {SectionListDataType, SelectionListHandle, UserListItemProps} from '@components/SelectionList/types';
import UserListItem from '@components/SelectionList/UserListItem';
import useActiveWorkspace from '@hooks/useActiveWorkspace';
import useFastSearchFromOptions from '@hooks/useFastSearchFromOptions';
import useLocalize from '@hooks/useLocalize';
import usePolicy from '@hooks/usePolicy';
import useResponsiveLayout from '@hooks/useResponsiveLayout';
Expand Down Expand Up @@ -179,7 +180,7 @@ function SearchRouterList(
if (currentUser) {
autocompleteOptions.push({
name: currentUser.displayName ?? Str.removeSMSDomain(currentUser.login ?? ''),
accountID: currentUser.accountID?.toString() ?? '-1',
accountID: currentUser.accountID?.toString(),
});
}

Expand Down Expand Up @@ -382,21 +383,30 @@ function SearchRouterList(
};
});

/**
* Builds a suffix tree and returns a function to search in it.
*/
const filterOptions = useFastSearchFromOptions(searchOptions, {includeUserToInvite: true});

const recentReportsOptions = useMemo(() => {
if (autocompleteQueryValue.trim() === '') {
return searchOptions.recentReports.slice(0, 20);
}

Timing.start(CONST.TIMING.SEARCH_FILTER_OPTIONS);
const filteredOptions = OptionsListUtils.filterAndOrderOptions(searchOptions, autocompleteQueryValue, {sortByReportTypeInSearch: true, preferChatroomsOverThreads: true});
const filteredOptions = filterOptions(autocompleteQueryValue);
const orderedOptions = OptionsListUtils.combineOrderingOfReportsAndPersonalDetails(filteredOptions, autocompleteQueryValue, {
sortByReportTypeInSearch: true,
preferChatroomsOverThreads: true,
});
Timing.end(CONST.TIMING.SEARCH_FILTER_OPTIONS);

const reportOptions: OptionData[] = [...filteredOptions.recentReports, ...filteredOptions.personalDetails];
const reportOptions: OptionData[] = [...orderedOptions.recentReports, ...orderedOptions.personalDetails];
if (filteredOptions.userToInvite) {
reportOptions.push(filteredOptions.userToInvite);
}
return reportOptions.slice(0, 20);
}, [autocompleteQueryValue, searchOptions]);
}, [autocompleteQueryValue, filterOptions, searchOptions]);

useEffect(() => {
ReportUserActions.searchInServer(autocompleteQueryValue.trim());
Expand Down
113 changes: 113 additions & 0 deletions src/hooks/useFastSearchFromOptions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import {useMemo} from 'react';
import FastSearch from '@libs/FastSearch';
import * as OptionsListUtils from '@libs/OptionsListUtils';

type AllOrSelectiveOptions = OptionsListUtils.ReportAndPersonalDetailOptions | OptionsListUtils.Options;

type Options = {
includeUserToInvite: boolean;
};

const emptyResult = {
personalDetails: [],
recentReports: [],
};

// You can either use this to search within report and personal details options
function useFastSearchFromOptions(
options: OptionsListUtils.ReportAndPersonalDetailOptions,
config?: {includeUserToInvite: false},
): (searchInput: string) => OptionsListUtils.ReportAndPersonalDetailOptions;
// Or you can use this to include the user invite option. This will require passing all options
function useFastSearchFromOptions(options: OptionsListUtils.Options, config?: {includeUserToInvite: true}): (searchInput: string) => OptionsListUtils.Options;

/**
* Hook for making options from OptionsListUtils searchable with FastSearch.
* Builds a suffix tree and returns a function to search in it.
*
* @example
* ```
* const options = OptionsListUtils.getSearchOptions(...);
* const filterOptions = useFastSearchFromOptions(options);
*/
function useFastSearchFromOptions(
options: OptionsListUtils.ReportAndPersonalDetailOptions | OptionsListUtils.Options,
{includeUserToInvite}: Options = {includeUserToInvite: false},
): (searchInput: string) => AllOrSelectiveOptions {
const findInSearchTree = useMemo(() => {
const fastSearch = FastSearch.createFastSearch([
{
data: options.personalDetails,
toSearchableString: (option) => {
const displayName = option.participantsList?.[0]?.displayName ?? '';
return [option.login ?? '', option.login !== displayName ? displayName : ''].join();
},
uniqueId: (option) => option.login,
},
{
data: options.recentReports,
toSearchableString: (option) => {
const searchStringForTree = [option.text ?? '', option.login ?? ''];

if (option.isThread) {
if (option.alternateText) {
searchStringForTree.push(option.alternateText);
}
} else if (!!option.isChatRoom || !!option.isPolicyExpenseChat) {
if (option.subtitle) {
searchStringForTree.push(option.subtitle);
}
}

return searchStringForTree.join();
},
},
]);

function search(searchInput: string): AllOrSelectiveOptions {
const searchWords = searchInput.split(' ').sort(); // asc sorted
const longestSearchWord = searchWords.at(searchWords.length - 1); // longest word is the last element
if (!longestSearchWord) {
return emptyResult;
}

// The user might separated words with spaces to do a search such as: "jo d" -> "john doe"
// With the suffix search tree you can only search for one word at a time. Its most efficient to search for the longest word,
// (as this will limit the results the most) and then afterwards run a quick filter on the results to see if the other words are present.
let [personalDetails, recentReports] = fastSearch.search(longestSearchWord);

if (searchWords.length > 1) {
personalDetails = personalDetails.filter((pd) => OptionsListUtils.isSearchStringMatch(searchInput, pd.text));
recentReports = recentReports.filter((rr) => OptionsListUtils.isSearchStringMatch(searchInput, rr.text));
}

if (includeUserToInvite && 'currentUserOption' in options) {
const userToInvite = OptionsListUtils.filterUserToInvite(
{
...options,
personalDetails,
recentReports,
},
searchInput,
);
return {
personalDetails,
recentReports,
userToInvite,
currentUserOption: options.currentUserOption,
};
}

return {
personalDetails,
recentReports,
};
}

return search;
}, [includeUserToInvite, options]);

return findInSearchTree;
}

export default useFastSearchFromOptions;
167 changes: 167 additions & 0 deletions src/libs/FastSearch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/* eslint-disable rulesdir/prefer-at */
import CONST from '@src/CONST';
import Timing from './actions/Timing';
import SuffixUkkonenTree from './SuffixUkkonenTree';

type SearchableData<T> = {
/**
* The data that should be searchable
*/
data: T[];
/**
* A function that generates a string from a data entry. The string's value is used for searching.
* If you have multiple fields that should be searchable, simply concat them to the string and return it.
*/
toSearchableString: (data: T) => string;

/**
* Gives the possibility to identify data by a unique attribute. Assume you have two search results with the same text they might be valid
* and represent different data. In this case, you can provide a function that returns a unique identifier for the data.
* If multiple items with the same identifier are found, only the first one will be returned.
* This fixes: https://github.com/Expensify/App/issues/53579
*/
uniqueId?: (data: T) => string | undefined;
};

// There are certain characters appear very often in our search data (email addresses), which we don't need to search for.
const charSetToSkip = new Set(['@', '.', '#', '$', '%', '&', '*', '+', '-', '/', ':', ';', '<', '=', '>', '?', '_', '~', '!', ' ', ',', '(', ')']);

/**
* Creates a new "FastSearch" instance. "FastSearch" uses a suffix tree to search for substrings in a list of strings.
* You can provide multiple datasets. The search results will be returned for each dataset.
*
* Note: Creating a FastSearch instance with a lot of data is computationally expensive. You should create an instance once and reuse it.
* Searches will be very fast though, even with a lot of data.
*/
function createFastSearch<T>(dataSets: Array<SearchableData<T>>) {
Timing.start(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);
const maxNumericListSize = 400_000;
// The user might provide multiple data sets, but internally, the search values will be stored in this one list:
let concatenatedNumericList = new Uint8Array(maxNumericListSize);
// Here we store the index of the data item in the original data list, so we can map the found occurrences back to the original data:
const occurrenceToIndex = new Uint32Array(maxNumericListSize * 4);
// As we are working with ArrayBuffers, we need to keep track of the current offset:
const offset = {value: 1};
// We store the last offset for a dataSet, so we can map the found occurrences to the correct dataSet:
const listOffsets: number[] = [];

for (const {data, toSearchableString} of dataSets) {
// Performance critical: the array parameters are passed by reference, so we don't have to create new arrays every time:
dataToNumericRepresentation(concatenatedNumericList, occurrenceToIndex, offset, {data, toSearchableString});
listOffsets.push(offset.value);
}
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.END_CHAR_CODE;
listOffsets[listOffsets.length - 1] = offset.value;
Timing.end(CONST.TIMING.SEARCH_CONVERT_SEARCH_VALUES);

// The list might be larger than necessary, so we clamp it to the actual size:
concatenatedNumericList = concatenatedNumericList.slice(0, offset.value);

// Create & build the suffix tree:
Timing.start(CONST.TIMING.SEARCH_MAKE_TREE);
const tree = SuffixUkkonenTree.makeTree(concatenatedNumericList);
Timing.end(CONST.TIMING.SEARCH_MAKE_TREE);

Timing.start(CONST.TIMING.SEARCH_BUILD_TREE);
tree.build();
Timing.end(CONST.TIMING.SEARCH_BUILD_TREE);

/**
* Searches for the given input and returns results for each dataset.
*/
function search(searchInput: string): T[][] {
const cleanedSearchString = cleanString(searchInput);
const {numeric} = SuffixUkkonenTree.stringToNumeric(cleanedSearchString, {
charSetToSkip,
// stringToNumeric might return a list that is larger than necessary, so we clamp it to the actual size
// (otherwise the search could fail as we include in our search empty array values):
clamp: true,
});
const result = tree.findSubstring(Array.from(numeric));

const resultsByDataSet = Array.from({length: dataSets.length}, () => new Set<T>());
const uniqueMap: Record<number, Record<string, T>> = {};
// eslint-disable-next-line @typescript-eslint/prefer-for-of
for (let i = 0; i < result.length; i++) {
const occurrenceIndex = result[i];
const itemIndexInDataSet = occurrenceToIndex[occurrenceIndex];
const dataSetIndex = listOffsets.findIndex((listOffset) => occurrenceIndex < listOffset);

if (dataSetIndex === -1) {
throw new Error(`[FastSearch] The occurrence index ${occurrenceIndex} is not in any dataset`);
}
const item = dataSets[dataSetIndex].data[itemIndexInDataSet];
if (!item) {
throw new Error(`[FastSearch] The item with index ${itemIndexInDataSet} in dataset ${dataSetIndex} is not defined`);
}

// Check for uniqueness eventually
const getUniqueId = dataSets[dataSetIndex].uniqueId;
if (getUniqueId) {
const uniqueId = getUniqueId(item);
if (uniqueId) {
const hasId = uniqueMap[dataSetIndex]?.[uniqueId];
if (hasId) {
// eslint-disable-next-line no-continue
continue;
}
if (!uniqueMap[dataSetIndex]) {
uniqueMap[dataSetIndex] = {};
}
uniqueMap[dataSetIndex][uniqueId] = item;
}
}

resultsByDataSet[dataSetIndex].add(item);
}

return resultsByDataSet.map((set) => Array.from(set));
}

return {
search,
};
}

/**
* The suffix tree can only store string like values, and internally stores those as numbers.
* This function converts the user data (which are most likely objects) to a numeric representation.
* Additionally a list of the original data and their index position in the numeric list is created, which is used to map the found occurrences back to the original data.
*/
function dataToNumericRepresentation<T>(concatenatedNumericList: Uint8Array, occurrenceToIndex: Uint32Array, offset: {value: number}, {data, toSearchableString}: SearchableData<T>): void {
data.forEach((option, index) => {
const searchStringForTree = toSearchableString(option);
const cleanedSearchStringForTree = cleanString(searchStringForTree);

if (cleanedSearchStringForTree.length === 0) {
return;
}

SuffixUkkonenTree.stringToNumeric(cleanedSearchStringForTree, {
charSetToSkip,
out: {
outArray: concatenatedNumericList,
offset,
outOccurrenceToIndex: occurrenceToIndex,
index,
},
});
// eslint-disable-next-line no-param-reassign
occurrenceToIndex[offset.value] = index;
// eslint-disable-next-line no-param-reassign
concatenatedNumericList[offset.value++] = SuffixUkkonenTree.DELIMITER_CHAR_CODE;
});
}

/**
* Everything in the tree is treated as lowercase.
*/
function cleanString(input: string) {
return input.toLowerCase();
}

const FastSearch = {
createFastSearch,
};

export default FastSearch;
Loading

0 comments on commit bda52b2

Please sign in to comment.