Skip to content

Commit

Permalink
Merge pull request #8202 from Sesquipedalian/spoofdetector
Browse files Browse the repository at this point in the history
Implements SMF\Unicode\SpoofDetector
  • Loading branch information
Sesquipedalian authored May 14, 2024
2 parents 573bd59 + 41472ef commit 654f791
Show file tree
Hide file tree
Showing 18 changed files with 10,837 additions and 80 deletions.
2 changes: 2 additions & 0 deletions Languages/en_US/Admin.php
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@
$txt['censor_test_save'] = 'Test';
$txt['censor_case'] = 'Ignore case when censoring';
$txt['censor_whole_words'] = 'Check only whole words';
$txt['spoofdetector_censor'] = 'Detect character spoofing when censoring';
$txt['spoofdetector_censor_desc'] = 'Prevents attempts to bypass the word censor by using visually similar Unicode characters.<br>If enabling this causes some words to be censored incorrectly, click the help icon for solutions.';

$txt['admin_confirm_password'] = '(confirm)';
$txt['admin_incorrect_password'] = 'Incorrect Password';
Expand Down
2 changes: 2 additions & 0 deletions Languages/en_US/Help.php
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,8 @@
$helptxt['disable_wysiwyg'] = 'This setting disallows all users from using the WYSIWYG (&quot;What You See Is What You Get&quot;) editor on the post page.';
$helptxt['lastActive'] = 'Set the number of minutes to show people are active in X number of minutes on the board index. Default is 15 minutes.';

$helptxt['spoofdetector_censor'] = 'Enabling this setting will prevent users from trying to fool the word censor by using visually similar Unicode characters (a.k.a. <a href="https://unicode.org/reports/tr36/#visual_spoofing" class="bbc_link" target="_blank" rel="noopener noreferrer">character spoofing</a>). For example, if "quack" (Latin letter a) is censored, enabling this setting will also censor "quack" (Cyrillic letter a).<br><br>Detecting character spoofing in the word censor may occasionally cause legitimate words to be censored incorrectly. For example, if the word "bum" is censored, the word "burn" might also be caught by the word censor, because "m" and "rn" are considered to be confusable in the Unicode Consortium’s <a href="https://unicode.org/Public/security/latest/confusables.txt" class="bbc_link" target="_blank" rel="noopener noreferrer">official list of confusable characters</a>.<br><br>To prevent a word from being censored incorrectly, just add another entry in the word censor to replace the word with itself <span style="display:inline-block">(e.g.: "burn" => "burn").</span>';

$helptxt['customoptions'] = 'This defines the options that a user may choose from a drop down list. There are a few key points to note on this page:
<ul class="normallist">
<li><strong>Default setting:</strong> Whichever check box has the &quot;radio button&quot; next to it selected will be the default selection for the user when they enter their profile.</li>
Expand Down
1 change: 1 addition & 0 deletions Sources/Actions/Admin/Posts.php
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ public function censor(): void
'allow_no_censored' => empty($_POST['allow_no_censored']) ? '0' : '1',
'censorWholeWord' => empty($_POST['censorWholeWord']) ? '0' : '1',
'censorIgnoreCase' => empty($_POST['censorIgnoreCase']) ? '0' : '1',
'spoofdetector_censor' => empty($_POST['spoofdetector_censor']) ? '0' : '1',
];

IntegrationHook::call('integrate_save_censors', [&$updates]);
Expand Down
2 changes: 2 additions & 0 deletions Sources/Actions/Register2.php
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,8 @@ public static function registerMember(array &$reg_options, bool $return_errors =
// Call an optional function to validate the users' input.
IntegrationHook::call('integrate_register', [&$reg_options, &$theme_vars, &$known_ints, &$known_floats]);

$reg_options['register_vars']['spoofdetector_name'] = Utils::htmlspecialchars(Unicode\SpoofDetector::getSkeletonString(html_entity_decode($reg_options['register_vars']['real_name'] ?? $reg_options['register_vars']['member_name'], ENT_QUOTES)));

$column_names = [];
$values = [];

Expand Down
34 changes: 22 additions & 12 deletions Sources/Lang.php
Original file line number Diff line number Diff line change
Expand Up @@ -598,34 +598,44 @@ public static function censorText(string &$text, bool $force = false): string

IntegrationHook::call('integrate_word_censor', [&$text]);

// Let SpoofDetector help us detect attempts to bypass the word censor.
Unicode\SpoofDetector::enhanceWordCensor($text);

// If they haven't yet been loaded, load them.
if ($censor_vulgar == null) {
$censor_vulgar = explode("\n", Config::$modSettings['censor_vulgar']);
$censor_proper = explode("\n", Config::$modSettings['censor_proper']);

$charset = empty(Config::$modSettings['global_character_set']) ? self::$txt['lang_character_set'] : Config::$modSettings['global_character_set'];

// Quote them for use in regular expressions.
if (!empty(Config::$modSettings['censorWholeWord'])) {
$charset = empty(Config::$modSettings['global_character_set']) ? self::$txt['lang_character_set'] : Config::$modSettings['global_character_set'];
for ($i = 0, $n = count($censor_vulgar); $i < $n; $i++) {
// If a word is replaced with itself, just leave it as it is.
// Why would the admin replace a word with itself, you ask?
// If the spoof detector incorrectly censors an allowed word
// because it happens to be visually confusable with a banned
// word, the admin can create an entry to replace the allowed
// word with itself in order to override the spoof detector.
if ($censor_vulgar[$i] === $censor_proper[$i]) {
$censor_proper[$i] = '$0';
}

for ($i = 0, $n = count($censor_vulgar); $i < $n; $i++) {
$censor_vulgar[$i] = str_replace(['\\\\\\*', '\\*', '&', '\''], ['[*]', '[^\\s]*?', '&amp;', '&#039;'], preg_quote($censor_vulgar[$i], '/'));
$censor_vulgar[$i] = str_replace(['\\\\\\*', '\\*', '&', '\''], ['[*]', '[^\\s]*?', '&amp;', '&#039;'], preg_quote($censor_vulgar[$i], '/'));

if (!empty(Config::$modSettings['censorWholeWord'])) {
// Use the faster \b if we can, or something more complex if we can't
$boundary_before = preg_match('/^\w/', $censor_vulgar[$i]) ? '\b' : ($charset === 'UTF-8' ? '(?<![\p{L}\p{M}\p{N}_])' : '(?<!\w)');
$boundary_after = preg_match('/\w$/', $censor_vulgar[$i]) ? '\b' : ($charset === 'UTF-8' ? '(?![\p{L}\p{M}\p{N}_])' : '(?!\w)');

$censor_vulgar[$i] = '/' . $boundary_before . $censor_vulgar[$i] . $boundary_after . '/' . (empty(Config::$modSettings['censorIgnoreCase']) ? '' : 'i') . ($charset === 'UTF-8' ? 'u' : '');
} else {
$boundary_before = $boundary_after = '';
}

$censor_vulgar[$i] = '/' . $boundary_before . $censor_vulgar[$i] . $boundary_after . '/' . (empty(Config::$modSettings['censorIgnoreCase']) ? '' : 'i') . ($charset === 'UTF-8' ? 'u' : '');
}
}

// Censoring isn't so very complicated :P.
if (empty(Config::$modSettings['censorWholeWord'])) {
$func = !empty(Config::$modSettings['censorIgnoreCase']) ? 'str_ireplace' : 'str_replace';
$text = $func($censor_vulgar, $censor_proper, $text);
} else {
$text = preg_replace($censor_vulgar, $censor_proper, $text);
}
$text = preg_replace($censor_vulgar, $censor_proper, $text);

return $text;
}
Expand Down
4 changes: 4 additions & 0 deletions Sources/Profile.php
Original file line number Diff line number Diff line change
Expand Up @@ -2250,6 +2250,10 @@ protected function prepareToSaveStandardFields(): void
}
}
}

if (!empty($this->new_data['real_name'])) {
$this->new_data['spoofdetector_name'] = Utils::htmlspecialchars(Unicode\SpoofDetector::getSkeletonString(html_entity_decode($this->new_data['real_name'], ENT_QUOTES)));
}
}

/**
Expand Down
143 changes: 143 additions & 0 deletions Sources/Tasks/UpdateSpoofDetectorNames.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
<?php

/**
* Simple Machines Forum (SMF)
*
* @package SMF
* @author Simple Machines https://www.simplemachines.org
* @copyright 2024 Simple Machines and individual contributors
* @license https://www.simplemachines.org/about/smf/license.php BSD
*
* @version 3.0 Alpha 1
*/

declare(strict_types=1);

namespace SMF\Tasks;

use SMF\Config;
use SMF\Db\DatabaseApi as Db;
use SMF\Sapi;
use SMF\Unicode\SpoofDetector;
use SMF\User;
use SMF\Utils;

/**
 * Updates the values of the spoofdetector_name column in the members table.
 *
 * Members are processed in batches ordered by id_member. The ID of the last
 * member handled is kept in $this->_details['last_member_id'], and the task
 * re-queues itself until that ID reaches Config::$modSettings['latestMember'].
 */
class UpdateSpoofDetectorNames extends BackgroundTask
{
	/**
	 * This executes the task.
	 *
	 * Recomputes the skeleton string of each member's real_name in the current
	 * batch and saves it via User::updateMemberData() whenever it differs from
	 * the stored spoofdetector_name value.
	 *
	 * @return bool Always returns true
	 */
	public function execute(): bool
	{
		Sapi::setTimeLimit(MAX_CLAIM_THRESHOLD);

		// Start from the beginning unless we were given a valid resume point.
		if (empty($this->_details['last_member_id']) || !is_int($this->_details['last_member_id'])) {
			$this->_details['last_member_id'] = 0;
		}

		// Just in case the column is missing for some reason...
		// Only checked on the first batch. The column and its indexes must be
		// created on the members table, since that is the table queried below.
		if (
			$this->_details['last_member_id'] === 0
			&& !in_array('spoofdetector_name', Db::$db->list_columns('{db_prefix}members'), true)
		) {
			Db::$db->add_column(
				'{db_prefix}members',
				[
					'name' => 'spoofdetector_name',
					'type' => 'varchar',
					'size' => 255,
					'null' => false,
					'default' => '',
				],
				[],
				'ignore',
			);
			Db::$db->add_index(
				'{db_prefix}members',
				[
					'name' => 'idx_spoofdetector_name',
					'columns' => ['spoofdetector_name'],
				],
				[],
				'ignore',
			);
			Db::$db->add_index(
				'{db_prefix}members',
				[
					'name' => 'idx_spoofdetector_name_id',
					'columns' => ['spoofdetector_name', 'id_member'],
				],
				[],
				'ignore',
			);
		}

		$updates = [];

		// Fetch the next batch of members after the resume point.
		// Note that MAX_CLAIM_THRESHOLD is reused here as the batch size.
		$request = Db::$db->query(
			'',
			'SELECT id_member, real_name, spoofdetector_name
			FROM {db_prefix}members
			WHERE id_member > {int:id_member}
			ORDER BY id_member
			LIMIT {int:limit}',
			[
				'id_member' => $this->_details['last_member_id'],
				'limit' => MAX_CLAIM_THRESHOLD,
			],
		);

		while ($row = Db::$db->fetch_assoc($request)) {
			// Cast to int so that the resume point is still an integer after
			// being JSON encoded in respawn() and decoded for the next run.
			// Otherwise the is_int() check above would reset it to 0.
			$this->_details['last_member_id'] = (int) $row['id_member'];

			$skeleton = Utils::htmlspecialchars(SpoofDetector::getSkeletonString(html_entity_decode($row['real_name'], ENT_QUOTES)));

			// Don't bother updating if there's been no change.
			if ($row['spoofdetector_name'] === $skeleton) {
				continue;
			}

			$updates[(int) $row['id_member']] = ['spoofdetector_name' => $skeleton];
		}
		Db::$db->free_result($request);

		foreach ($updates as $id_member => $data) {
			User::updateMemberData($id_member, $data);
		}

		// More members left to process? Queue up another run.
		if ($this->_details['last_member_id'] < (int) (Config::$modSettings['latestMember'] ?? 0)) {
			$this->respawn();
		}

		return true;
	}

	/**
	 * Adds a new instance of this task to the task list.
	 *
	 * The current $this->_details (including last_member_id) are stored as
	 * JSON in the new task's task_data, and claimed_time is set to 0.
	 */
	private function respawn(): void
	{
		Db::$db->insert(
			'insert',
			'{db_prefix}background_tasks',
			[
				'task_class' => 'string-255',
				'task_data' => 'string',
				'claimed_time' => 'int',
			],
			[
				get_class($this),
				json_encode($this->_details),
				0,
			],
			[],
		);
	}
}

?>
Loading

0 comments on commit 654f791

Please sign in to comment.