From 1a1782a1db3790a1d6d806cf49edcf4e3e5041fb Mon Sep 17 00:00:00 2001 From: Pascal Chevrel Date: Sun, 15 Nov 2015 19:00:48 +0100 Subject: [PATCH] Issues #320: move all our regex logic into a unit tested class --- app/classes/Transvision/Search.php | 171 ++++++++++++++++++++++++++ app/inc/search_options.php | 19 ++- app/models/3locales_search.php | 9 +- app/models/api/repository_search.php | 27 ++-- app/models/api/suggestions.php | 32 +++-- app/models/api/translation_memory.php | 39 +++--- app/models/mainsearch_entities.php | 2 +- app/models/mainsearch_glossary.php | 15 ++- app/models/mainsearch_strings.php | 14 +-- tests/units/Transvision/Search.php | 139 +++++++++++++++++++++ 10 files changed, 388 insertions(+), 79 deletions(-) create mode 100644 app/classes/Transvision/Search.php create mode 100644 tests/units/Transvision/Search.php diff --git a/app/classes/Transvision/Search.php b/app/classes/Transvision/Search.php new file mode 100644 index 00000000..32878d3f --- /dev/null +++ b/app/classes/Transvision/Search.php @@ -0,0 +1,171 @@ +setSearchTerms('Bookmark this page') + * ->setRegexWholeWords(true) + * ->setRegexCase('sensitive') + * ->setRegexPerfectMatch(false); + */ +class Search +{ + /** + * The trimmed string searched, we keep that one as the canonical reference + * @var string + */ + public $search_terms; + + /** + * The generated regex string updated dynamically via updateRegex() + * @var string + */ + public $regex; + + /** + * Case sensibility of the regex + * @var string + */ + public $regex_case; + + /** + * Consider the space separated string as a single word for search + * @var string + */ + public $regex_whole_words; + + /** + * Only return strings that match the search perfectly (case excluded) + * @var boolean + */ + public $regex_perfect_match; + + /** + * The search terms for the regex, those differ from $search_terms as + * they can be changed dynamically via setRegexSearchTerms() + * @var string + */ + public $regex_search_terms; + + /** + 
* We set the default values for a search
+     */
+    public function __construct()
+    {
+        $this->search_terms = '';
+        $this->regex = '';
+        $this->regex_case = 'i';
+        $this->regex_whole_words = '';
+        $this->regex_perfect_match = false;
+        $this->regex_search_terms = '';
+    }
+
+    /**
+     * Store the searched string in $search_terms and in $regex_search_terms
+     *
+     * @param string $string The searched string, stored after being trimmed
+     * @return $this
+     */
+    public function setSearchTerms($string)
+    {
+        $this->search_terms = trim($string);
+        $this->regex_search_terms = $this->search_terms;
+        $this->updateRegex();
+
+        return $this;
+    }
+
+    /**
+     * Allows setting a new searched term for the regex.
+     * This is mostly useful when you have a multi-words search and need to
+     * loop through all the words to return results.
+     *
+     * @param string $string The string we want to update the regex for
+     * @return $this
+     */
+    public function setRegexSearchTerms($string)
+    {
+        $this->regex_search_terms = $string;
+        $this->updateRegex();
+
+        return $this;
+    }
+
+    /**
+     * Set the regex case sensitivity.
+     *
+     * @param boolean $flag Truthy (e.g. 'sensitive') drops the 'i' modifier, falsy sets it
+     * @return $this
+     */
+    public function setRegexCase($flag)
+    {
+        $this->regex_case = (bool) $flag ? '' : 'i';
+        $this->updateRegex();
+
+        return $this;
+    }
+
+    /**
+     * Set the regex to only return perfect matches for the searched string.
+     * We cast the value to a boolean because we usually get it from a GET.
+     *
+     * @param boolean $flag Set to True for a perfect match
+     * @return $this
+     */
+    public function setRegexPerfectMatch($flag)
+    {
+        $this->regex_perfect_match = (bool) $flag;
+        $this->updateRegex();
+
+        return $this;
+    }
+
+    /**
+     * Set the regex so as that a multi-word search is taken as a single word.
+     * We cast the value to a boolean because we usually get it from a GET. 
+     *
+     * @param boolean $flag A string evaluated to True will add \b to the regex
+     * @return $this
+     */
+    public function setRegexWholeWords($flag)
+    {
+        $this->regex_whole_words = (bool) $flag ? '\b' : '';
+        $this->updateRegex();
+
+        return $this;
+    }
+
+    /**
+     * Rebuild the $regex value from the current options every time
+     * a setter to the regex is triggered.
+     *
+     * @return $this
+     */
+    private function updateRegex()
+    {
+        // Terms are always escaped for the '~' delimiter; a perfect match is also anchored
+        if ($this->regex_perfect_match) {
+            $search = '^' . preg_quote($this->regex_search_terms, '~') . '$';
+        } else {
+            $search = preg_quote($this->regex_search_terms, '~');
+        }
+
+        $this->regex =
+            '~'
+            . $this->regex_whole_words
+            . $search
+            . $this->regex_whole_words
+            . '~'
+            . $this->regex_case
+            . 'u';
+
+        return $this;
+    }
+
+}
diff --git a/app/inc/search_options.php b/app/inc/search_options.php
index 84f7f119..73e45897 100644
--- a/app/inc/search_options.php
+++ b/app/inc/search_options.php
@@ -40,18 +40,13 @@
 // Locales list for the select boxes
 $loc_list = Project::getRepositoryLocales($check['repo']);
 
-// Search for perfectMatch
-if ($check['perfect_match']) {
-    $my_search = trim('^' . $my_search . '$');
-} else {
-    $my_search = preg_quote($my_search, '/');
-}
-
-// Regex options
-$case_sensitive = $check['case_sensitive'] ? '' : 'i';
-$whole_word = $check['whole_word'] ? '\b' : '';
-$delimiter = '~';
-$main_regex = $delimiter . $whole_word . $my_search . $whole_word . $delimiter . 
$case_sensitive; +// Define our regex +$search = (new Search) + ->setSearchTerms(Utils::cleanString($_GET['recherche'])) + ->setRegexWholeWords($check['whole_word']) + ->setRegexCase($check['case_sensitive']) + ->setRegexPerfectMatch($check['perfect_match']) +; // build the repository switcher $repo_list = Utils::getHtmlSelectOptions($repos_nice_names, $check['repo'], true); diff --git a/app/models/3locales_search.php b/app/models/3locales_search.php index 3168aab1..6cbfcfc2 100644 --- a/app/models/3locales_search.php +++ b/app/models/3locales_search.php @@ -3,14 +3,13 @@ $tmx_target2 = Utils::getRepoStrings($locale2, $check['repo']); -if ($check['perfect_match']) { - $locale3_strings = preg_grep($regex, $tmx_target2); +if ($search->regex_perfect_match) { + $locale3_strings = preg_grep($search->regex, $tmx_target2); } else { $locale3_strings = $tmx_target2; foreach (Utils::uniqueWords($initial_search) as $word) { - $regex = $delimiter . $whole_word . preg_quote($word, $delimiter) . - $whole_word . $delimiter . $case_sensitive . 
'u'; - $locale3_strings = preg_grep($regex, $locale3_strings); + $search->setRegexSearchTerms($word); + $locale3_strings = preg_grep($search->regex, $locale3_strings); } } diff --git a/app/models/api/repository_search.php b/app/models/api/repository_search.php index d070d462..2ca5911e 100644 --- a/app/models/api/repository_search.php +++ b/app/models/api/repository_search.php @@ -21,33 +21,34 @@ $source_strings_merged = []; $target_strings_merged = []; +// Define our regex +$search = (new Search) + ->setSearchTerms(Utils::cleanString($initial_search)) + ->setRegexWholeWords($get_option('whole_word')) + ->setRegexCase($get_option('case_sensitive')) + ->setRegexPerfectMatch($get_option('perfect_match')) +; + // We loop through all repositories searched and merge results foreach ($repositories as $repository) { $source_strings = Utils::getRepoStrings($request->parameters[4], $repository); - // Regex options - $whole_word = $get_option('whole_word') ? '\b' : ''; - $case_sensitive = $get_option('case_sensitive') ? '' : 'i'; - - if ($get_option('perfect_match')) { - $regex = '~' . $whole_word . trim('^' . preg_quote($initial_search, '~') . '$') . - $whole_word . '~' . $case_sensitive . 'u'; + if ($search->regex_perfect_match) { if ($request->parameters[2] == 'entities') { - $entities = ShowResults::searchEntities($source_strings, $regex); + $entities = ShowResults::searchEntities($source_strings, $search->regex); $source_strings = array_intersect_key($source_strings, array_flip($entities)); } else { - $source_strings = preg_grep($regex, $source_strings); + $source_strings = preg_grep($search->regex, $source_strings); $entities = array_keys($source_strings); } } else { foreach (Utils::uniqueWords($initial_search) as $word) { - $regex = '~' . $whole_word . preg_quote($word, '~') . - $whole_word . '~' . $case_sensitive . 
'u'; + $search->setRegexSearchTerms($word); if ($request->parameters[2] == 'entities') { - $entities = ShowResults::searchEntities($source_strings, $regex); + $entities = ShowResults::searchEntities($source_strings, $search->regex); $source_strings = array_intersect_key($source_strings, array_flip($entities)); } else { - $source_strings = preg_grep($regex, $source_strings); + $source_strings = preg_grep($search->regex, $source_strings); $entities = array_keys($source_strings); } } diff --git a/app/models/api/suggestions.php b/app/models/api/suggestions.php index f09242fb..25885dd5 100644 --- a/app/models/api/suggestions.php +++ b/app/models/api/suggestions.php @@ -1,6 +1,15 @@ extra_parameters[$option])) { + return $request->extra_parameters[$option]; + } + + return false; +}; + $repositories = ($request->parameters[2] == 'global') ? Project::getRepositories() : [$request->parameters[2]]; @@ -12,28 +21,27 @@ $initial_search = Utils::cleanString($request->parameters[5]); $terms = Utils::uniqueWords($initial_search); -// Regex options (not currenty used) -$delimiter = '~'; -$whole_word = isset($check['whole_word']) ? '\b' : ''; -$case_sensitive = isset($check['case_sensitive']) ? '' : 'i'; -$regex = $delimiter . $whole_word . $initial_search . $whole_word . - $delimiter . $case_sensitive . 'u'; +// Define our regex +$search = (new Search) + ->setSearchTerms(Utils::cleanString($initial_search)) + ->setRegexWholeWords($get_option('whole_word')) + ->setRegexCase($get_option('case_sensitive')) + ->setRegexPerfectMatch($get_option('perfect_match')) +; // Loop through all repositories searching in both source and target languages foreach ($repositories as $repository) { $source_strings = Utils::getRepoStrings($request->parameters[3], $repository); foreach ($terms as $word) { - $regex = $delimiter . $whole_word . preg_quote($word, $delimiter) . - $whole_word . $delimiter . $case_sensitive . 
'u'; - $source_strings = preg_grep($regex, $source_strings); + $search->setRegexSearchTerms($word); + $source_strings = preg_grep($search->regex, $source_strings); } $source_strings_merged = array_merge($source_strings, $source_strings_merged); $target_strings = Utils::getRepoStrings($request->parameters[4], $repository); foreach ($terms as $word) { - $regex = $delimiter . $whole_word . preg_quote($word, $delimiter) . - $whole_word . $delimiter . $case_sensitive . 'u'; - $target_strings = preg_grep($regex, $target_strings); + $search->setRegexSearchTerms($word); + $target_strings = preg_grep($search->regex, $target_strings); } $target_strings_merged = array_merge($target_strings, $target_strings_merged); } diff --git a/app/models/api/translation_memory.php b/app/models/api/translation_memory.php index dcc178c5..0c0cc216 100644 --- a/app/models/api/translation_memory.php +++ b/app/models/api/translation_memory.php @@ -1,6 +1,16 @@ extra_parameters[$option]) + && (int) $request->extra_parameters[$option] != 0) { + $value = (int) $request->extra_parameters[$option]; + } + + return $value; +}; $repositories = ($request->parameters[2] == 'global') ? Project::getRepositories() : [$request->parameters[2]]; @@ -12,12 +22,13 @@ $initial_search = Utils::cleanString($request->parameters[5]); $terms = Utils::uniqueWords($initial_search); -// Regex options (not currenty used) -$delimiter = '~'; -$whole_word = isset($check['whole_word']) ? '\b' : ''; -$case_sensitive = isset($check['case_sensitive']) ? '' : 'i'; -$regex = $delimiter . $whole_word . $initial_search . $whole_word . - $delimiter . $case_sensitive . 
'u'; +// Define our regex +$search = (new Search) + ->setSearchTerms(Utils::cleanString($initial_search)) + ->setRegexWholeWords($get_option('whole_word')) + ->setRegexCase($get_option('case_sensitive')) + ->setRegexPerfectMatch($get_option('perfect_match')) +; // We loop through all repositories and merge the results foreach ($repositories as $repository) { @@ -25,26 +36,14 @@ $target_strings = Utils::getRepoStrings($request->parameters[4], $repository); foreach ($terms as $word) { - $regex = $delimiter . $whole_word . preg_quote($word, $delimiter) . - $whole_word . $delimiter . $case_sensitive . 'u'; - $source_strings = preg_grep($regex, $source_strings); + $search->setRegexSearchTerms($word); + $source_strings = preg_grep($search->regex, $source_strings); } $source_strings_merged = array_merge($source_strings, $source_strings_merged); $target_strings_merged = array_merge($target_strings, $target_strings_merged); } -// Closure to get extra parameters set -$get_option = function ($option) use ($request) { - $value = 0; - if (isset($request->extra_parameters[$option]) - && (int) $request->extra_parameters[$option] != 0) { - $value = (int) $request->extra_parameters[$option]; - } - - return $value; -}; - return $json = ShowResults::getTranslationMemoryResults( array_keys($source_strings_merged), [$source_strings_merged, $target_strings_merged], diff --git a/app/models/mainsearch_entities.php b/app/models/mainsearch_entities.php index 3b347d6c..70e15633 100644 --- a/app/models/mainsearch_entities.php +++ b/app/models/mainsearch_entities.php @@ -12,7 +12,7 @@ $extra_column_header = ''; } -$entities = ShowResults::searchEntities($tmx_source, $main_regex); +$entities = ShowResults::searchEntities($tmx_source, $search->regex); // Display a search hint for the closest string we have if we have no search results if (count($entities) == 0) { diff --git a/app/models/mainsearch_glossary.php b/app/models/mainsearch_glossary.php index 410f9670..a514afcd 100644 --- 
a/app/models/mainsearch_glossary.php +++ b/app/models/mainsearch_glossary.php @@ -5,12 +5,11 @@ $tmx_source = Utils::getRepoStrings($source_locale, $check['repo']); $tmx_target = Utils::getRepoStrings($locale, $check['repo']); $locale1_strings = $tmx_source; -$search = Utils::uniqueWords($initial_search); +$search_terms = Utils::uniqueWords($initial_search); -foreach ($search as $word) { - $regex = $delimiter . $whole_word . preg_quote($word, $delimiter) . - $whole_word . $delimiter . $case_sensitive . 'u'; - $locale1_strings = preg_grep($regex, $locale1_strings); +foreach ($search_terms as $word) { + $search->setRegexSearchTerms($word); + $locale1_strings = preg_grep($search->regex, $locale1_strings); } // Limit results to 200 @@ -19,16 +18,16 @@ $perfect = $imperfect = []; // We want to test compound words as well, /ex: 'switch to' -$compound_search = (count($search) > 1) ? true : false; +$compound_search = (count($search_terms) > 1) ? true : false; -foreach ($search as $word) { +foreach ($search_terms as $word) { // If the word is one or two letters, we skip it if (mb_strlen($word) < 3) { continue; } // Perfect matches are hits for a single word or a compound word - if ($compound_search || count($search) == 1) { + if ($compound_search || count($search_terms) == 1) { $alternate1 = ucfirst($word); $alternate2 = ucwords($word); $alternate3 = strtolower($word); diff --git a/app/models/mainsearch_strings.php b/app/models/mainsearch_strings.php index d007e60e..26a5d4bd 100644 --- a/app/models/mainsearch_strings.php +++ b/app/models/mainsearch_strings.php @@ -1,22 +1,20 @@ regex_perfect_match) { + $locale1_strings = preg_grep($search->regex, $tmx_source); + $locale2_strings = preg_grep($search->regex, $tmx_target); } else { $locale1_strings = $tmx_source; $locale2_strings = $tmx_target; foreach (Utils::uniqueWords($initial_search) as $word) { - $regex = $delimiter . $whole_word . preg_quote($word, $delimiter) . - $whole_word . $delimiter . $case_sensitive . 
'u'; - $locale1_strings = preg_grep($regex, $locale1_strings); - $locale2_strings = preg_grep($regex, $locale2_strings); + $locale1_strings = preg_grep($search->regex, $locale1_strings); + $locale2_strings = preg_grep($search->regex, $locale2_strings); } } if ($check['search_type'] == 'strings_entities') { - $entities = ShowResults::searchEntities($tmx_source, $main_regex); + $entities = ShowResults::searchEntities($tmx_source, $search->regex); foreach ($entities as $entity) { $locale1_strings[$entity] = $tmx_source[$entity]; } diff --git a/tests/units/Transvision/Search.php b/tests/units/Transvision/Search.php new file mode 100644 index 00000000..936bdeea --- /dev/null +++ b/tests/units/Transvision/Search.php @@ -0,0 +1,139 @@ +string($obj->search_terms) + ->isEqualTo(''); + $this + ->string($obj->regex) + ->isEqualTo(''); + $this + ->string($obj->regex_case) + ->isEqualTo('i'); + $this + ->string($obj->regex_whole_words) + ->isEqualTo(''); + $this + ->boolean($obj->regex_perfect_match) + ->isEqualTo(false); + $this + ->string($obj->regex_search_terms) + ->isEqualTo(''); + } + + public function testSetSearchTerms() + { + $obj = new _Search(); + $obj->setSearchTerms(' foobar '); + $this + ->string($obj->search_terms) + ->isEqualTo('foobar'); + $this + ->string($obj->regex_search_terms) + ->isEqualTo('foobar'); + } + + public function testSetRegexSearchTerms() + { + $obj = new _Search(); + $obj->setRegexSearchTerms('A new hope'); + $this + ->string($obj->regex_search_terms) + ->isEqualTo('A new hope') + ->string($obj->regex) + ->isEqualTo('~A new hope~iu'); + } + + public function testSetRegexCase() + { + $obj = new _Search(); + + // Test strings (as passed from GET) + $obj->setRegexCase('sensitive'); + $this + ->string($obj->regex) + ->isEqualTo('~~u'); + + $obj->setRegexCase(''); + $this + ->string($obj->regex) + ->isEqualTo('~~iu'); + + // Test boolean values + $obj->setRegexCase(true); + $this + ->string($obj->regex) + ->isEqualTo('~~u'); + + 
$obj->setRegexCase(false);
+        $this
+            ->string($obj->regex)
+                ->isEqualTo('~~iu');
+    }
+
+    public function testSetRegexPerfectMatch()
+    {
+        $obj = new _Search();
+        $obj->setRegexPerfectMatch('perfect_match');
+        $this
+            ->boolean($obj->regex_perfect_match)
+                ->isEqualTo(true)
+            ->string($obj->regex)
+                ->isEqualTo('~^$~iu');
+
+        $obj->setRegexPerfectMatch(false);
+        $this
+            ->boolean($obj->regex_perfect_match)
+                ->isEqualTo(false)
+            ->string($obj->regex)
+                ->isEqualTo('~~iu');
+    }
+
+    public function testSetRegexWholeWords()
+    {
+        $obj = new _Search();
+        $obj->setRegexWholeWords('whole_word');
+        $this
+            ->string($obj->regex_whole_words)
+                ->isEqualTo('\b')
+            ->string($obj->regex)
+                ->isEqualTo('~\b\b~iu');
+
+        $obj->setRegexWholeWords(false);
+        $this
+            ->string($obj->regex_whole_words)
+                ->isEqualTo('')
+            ->string($obj->regex)
+                ->isEqualTo('~~iu');
+    }
+
+    public function testMultipleRegexChanges()
+    {
+        $obj = new _Search();
+        $obj
+            ->setSearchTerms('A new hope')
+            ->setRegexWholeWords('whole_word')
+            ->setRegexPerfectMatch(false)
+            ->setRegexCase('sensitive');
+        $this->string($obj->regex)
+            ->isEqualTo('~\bA new hope\b~u');
+
+        $obj->setSearchTerms('Return of the jedi')
+            ->setRegexWholeWords('')
+            ->setRegexPerfectMatch(true)
+            ->setRegexCase('');
+        $this
+            ->string($obj->regex)
+                ->isEqualTo('~^Return of the jedi$~iu');
+    }
+}