Skip to content

Commit

Permalink
[4.0] Using language specific tokeniser and stemmer for com_finder (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
Hackwar authored and wilsonge committed Jun 20, 2018
1 parent 79f0d5d commit ff3ed42
Show file tree
Hide file tree
Showing 54 changed files with 6,648 additions and 1,115 deletions.
2 changes: 1 addition & 1 deletion .drone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pipeline:
image: joomlaprojects/docker-phpcs
commands:
- echo $(date)
- /root/.composer/vendor/bin/phpcs --report=full --extensions=php -p --standard=build/phpcs/Joomla .
- /root/.composer/vendor/bin/phpcs --report=full --extensions=php -p --encoding=utf-8 --standard=build/phpcs/Joomla .
- echo $(date)

initdb:
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,11 @@ Desktop.ini
/libraries/vendor/simplepie/simplepie/build
/libraries/vendor/simplepie/simplepie/idn/ReadMe.txt
/libraries/vendor/simplepie/simplepie/composer.json
/libraries/vendor/wamania/php-stemmer/.gitignore
/libraries/vendor/wamania/php-stemmer/README.md
/libraries/vendor/wamania/php-stemmer/composer.json
/libraries/vendor/wamania/php-stemmer/phpunit.xml.dist
/libraries/vendor/wamania/php-stemmer/test
/libraries/vendor/zendframework/zend-diactoros/.coveralls.yml
/libraries/vendor/zendframework/zend-diactoros/CHANGELOG.md
/libraries/vendor/zendframework/zend-diactoros/composer.json
Expand Down
23 changes: 0 additions & 23 deletions administrator/components/com_finder/config.xml
Original file line number Diff line number Diff line change
Expand Up @@ -269,29 +269,6 @@
default="0.3"
/>

<field
name="stem"
type="radio"
label="COM_FINDER_CONFIG_STEMMER_ENABLE_LABEL"
class="switcher"
default="1"
>
<option value="0">JNO</option>
<option value="1">JYES</option>
</field>

<field
name="stemmer"
type="list"
label="COM_FINDER_CONFIG_STEMMER_LABEL"
default="snowball"
showon="stem:1"
>
<option value="porter_en">COM_FINDER_CONFIG_STEMMER_PORTER_EN</option>
<option value="fr">COM_FINDER_CONFIG_STEMMER_FR</option>
<option value="snowball">COM_FINDER_CONFIG_STEMMER_SNOWBALL</option>
</field>

<field
name="enable_logging"
type="radio"
Expand Down
142 changes: 15 additions & 127 deletions administrator/components/com_finder/helpers/indexer/helper.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
use Joomla\Registry\Registry;
use Joomla\String\StringHelper;

JLoader::register('FinderIndexerLanguage', __DIR__ . '/language.php');
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');

/**
Expand All @@ -25,23 +25,6 @@
*/
class FinderIndexerHelper
{
/**
* The token stemmer object. The stemmer is set by whatever class
* wishes to use it but it must be an instance of FinderIndexerStemmer.
*
* @var FinderIndexerStemmer
* @since 2.5
*/
public static $stemmer;

/**
* A state flag recording that the stemmer has already been verified as an instance of FinderIndexerStemmer, so the check is not repeated on every call
*
* @var boolean
* @since 3.7.0
*/
protected static $stemmerOK;

/**
* Method to parse input into plain text.
*
Expand Down Expand Up @@ -73,82 +56,18 @@ public static function parse($input, $format = 'html')
public static function tokenize($input, $lang, $phrase = false)
{
static $cache;
$store = StringHelper::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;
$store = md5($input . '::' . $lang . '::' . $phrase);

// Check if the string has been tokenized already.
if ($store && isset($cache[$store]))
if (isset($cache[$store]))
{
return $cache[$store];
}

$language = FinderIndexerLanguage::getInstance($lang);
$tokens = array();
$quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

// Get the simple language key.
$lang = static::getPrimaryLanguage($lang);

/*
* Parsing the string input into terms is a multi-step process.
*
* Regexes:
* 1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
* 2. Remove plus, dash, period, and comma characters located before letter characters.
* 3. Remove plus, dash, period, and comma characters located after other characters.
* 4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
* 5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
* 6. Remove orphaned quote characters.
* 7. Replace the assorted single quotation marks with the ASCII standard single quotation.
* 8. Replace runs of multiple space characters with a single space.
*/
$input = StringHelper::strtolower($input);
$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
$input = preg_replace('#\s+#mui', ' ', $input);
$input = trim($input);

// Explode the normalized string to get the terms.
$terms = explode(' ', $input);

/*
* If we have Unicode support and are dealing with Chinese text, Chinese
* has to be handled specially because there are not necessarily any spaces
* between the "words". So, we have to test if the words belong to the Chinese
* character set and if so, explode them into single glyphs or "words".
*/
if ($lang === 'zh')
{
// Iterate through the terms and test if they contain Chinese.
for ($i = 0, $n = count($terms); $i < $n; $i++)
{
$charMatches = array();
$charCount = preg_match_all('#[\p{Han}]#mui', $terms[$i], $charMatches);

// Split apart any groups of Chinese characters.
for ($j = 0; $j < $charCount; $j++)
{
$tSplit = StringHelper::str_ireplace($charMatches[0][$j], '', $terms[$i], false);

if (!empty($tSplit))
{
$terms[$i] = $tSplit;
}
else
{
unset($terms[$i]);
}

$terms[] = $charMatches[0][$j];
}
}

// Reset array keys.
$terms = array_values($terms);
}
$terms = $language->tokenise($input);
$terms = array_filter($terms);

/*
* If we have to handle the input as a phrase, that means we don't
Expand All @@ -158,14 +77,14 @@ public static function tokenize($input, $lang, $phrase = false)
if ($phrase === true && count($terms) > 1)
{
// Create tokens from the phrase.
$tokens[] = new FinderIndexerToken($terms, $lang);
$tokens[] = new FinderIndexerToken($terms, $language->language, $language->spacer);
}
else
{
// Create tokens from the terms.
for ($i = 0, $n = count($terms); $i < $n; $i++)
{
$tokens[] = new FinderIndexerToken($terms[$i], $lang);
$tokens[] = new FinderIndexerToken($terms[$i], $language->language);
}

// Create two and three word phrase tokens from the individual words.
Expand All @@ -179,7 +98,7 @@ public static function tokenize($input, $lang, $phrase = false)
if ($i2 < $n && isset($tokens[$i2]))
{
// Tokenize the two word phrase.
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $lang, $lang === 'zh' ? '' : ' ');
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term), $language->language, $language->spacer);
$token->derived = true;

// Add the token to the stack.
Expand All @@ -190,7 +109,7 @@ public static function tokenize($input, $lang, $phrase = false)
if ($i3 < $n && isset($tokens[$i3]))
{
// Tokenize the three word phrase.
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $lang, $lang === 'zh' ? '' : ' ');
$token = new FinderIndexerToken(array($tokens[$i]->term, $tokens[$i2]->term, $tokens[$i3]->term), $language->language, $language->spacer);
$token->derived = true;

// Add the token to the stack.
Expand All @@ -199,22 +118,13 @@ public static function tokenize($input, $lang, $phrase = false)
}
}

if ($store)
{
$cache[$store] = count($tokens) > 1 ? $tokens : array_shift($tokens);
$cache[$store] = $tokens;

return $cache[$store];
}
else
{
return count($tokens) > 1 ? $tokens : array_shift($tokens);
}
return $cache[$store];
}

/**
* Method to get the base word of a token. This method uses the public
* {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
* the original token is returned.
* Method to get the base word of a token.
*
* @param string $token The token to stem.
* @param string $lang The language of the token.
Expand All @@ -225,31 +135,9 @@ public static function tokenize($input, $lang, $phrase = false)
*/
public static function stem($token, $lang)
{
// Trim apostrophes at either end of the token.
$token = trim($token, '\'');

// Trim everything after any apostrophe in the token.
if ($res = explode('\'', $token))
{
$token = $res[0];
}

if (static::$stemmerOK === true)
{
return static::$stemmer->stem($token, $lang);
}
else
{
// Stem the token if we have a valid stemmer to use.
if (static::$stemmer instanceof FinderIndexerStemmer)
{
static::$stemmerOK = true;

return static::$stemmer->stem($token, $lang);
}
}
$language = FinderIndexerLanguage::getInstance($lang);

return $token;
return $language->stem($token);
}

/**
Expand Down
13 changes: 6 additions & 7 deletions administrator/components/com_finder/helpers/indexer/indexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
use Joomla\String\StringHelper;

JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php');
JLoader::register('FinderIndexerLanguage', __DIR__ . '/language.php');
JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php');
JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');

Expand Down Expand Up @@ -213,12 +213,6 @@ public static function getState()
static::$profiler = JProfiler::getInstance('FinderIndexer');
}

// Setup the stemmer.
if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en'))
{
FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
}

// Set the state.
static::$state = $data;

Expand Down Expand Up @@ -471,6 +465,11 @@ private function tokenizeToDbShort($input, $context, $lang, $format, $count)
// Tokenize the input.
$tokens = FinderIndexerHelper::tokenize($input, $lang);

if (count($tokens) == 0)
{
return $count;
}

// Add the tokens to the database.
$count += $this->addTokensToDb($tokens, $context);

Expand Down
Loading

0 comments on commit ff3ed42

Please sign in to comment.