Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix errors with sessions & logging related to invalid UTF-8 #60

Merged
merged 2 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
### Unreleased

* Ensure StackdriverApplicationLogger can always log even with invalid UTF8 characters anywhere in payload
* Fix session & logging bugs when incoming user-agent contains invalid UTF8 or escape characters

### v2.1.0 (2024-08-02)

* Support PHP 8.3
Expand Down
8 changes: 6 additions & 2 deletions src/Logging/StackdriverApplicationLogger.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use Ingenerator\PHPUtils\Monitoring\MetricId;
use Ingenerator\PHPUtils\Monitoring\MetricsAgent;
use Ingenerator\PHPUtils\Object\InitialisableSingletonTrait;
use Ingenerator\PHPUtils\StringEncoding\StringSanitiser;
use Psr\Log\AbstractLogger;
use Psr\Log\LogLevel;
use RuntimeException;
Expand Down Expand Up @@ -383,7 +384,7 @@ protected function writeLog(array $log_entry): void
try {
$success = file_put_contents(
$this->log_destination,
json_encode($log_entry)."\n",
json_encode($log_entry, JSON_INVALID_UTF8_SUBSTITUTE)."\n",
FILE_APPEND
);

Expand Down Expand Up @@ -428,7 +429,10 @@ public function logRequest(?array $server, ?float $request_start_time = NULL): v
// requestMethod, requestUrl, remoteIp are expected to come from metadata provider as they are
// shared with application log entry context
'status' => $http_code,
'userAgent' => $this->truncate($server['HTTP_USER_AGENT'] ?? NULL, 500),
'userAgent' => $this->truncate(
StringSanitiser::ensurePrintableUtf8($server['HTTP_USER_AGENT'] ?? ''),
500
),
'latency' => $this->calculateRequestLatency($request_start_time),
],
]
Expand Down
4 changes: 3 additions & 1 deletion src/Session/MysqlSession.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@


use Ingenerator\PHPUtils\DateTime\DateString;
use Ingenerator\PHPUtils\StringEncoding\StringSanitiser;
use PDO;
use SessionHandlerInterface;

Expand Down Expand Up @@ -168,7 +169,7 @@ public function create_sid(): string
'hash' => $this->calculateHash(),
'data' => '',
'now' => \date('Y-m-d H:i:s'),
'user_agent' => mb_substr($this->client_user_agent, 0, 255),
'user_agent' => StringSanitiser::ensurePrintableUtf8($this->client_user_agent, 255),
'ip' => $this->client_ip,
]
);
Expand Down Expand Up @@ -307,6 +308,7 @@ public function updateTimestamp($session_id, $session_data): bool
*/
protected function calculateHash(): string
{
// Note, intentionally using the *raw* user-agent (as reported by the client) rather than the sanitised one.
$hash = $this->client_user_agent.$this->hash_salt;

return \sha1($hash);
Expand Down
43 changes: 43 additions & 0 deletions src/StringEncoding/StringSanitiser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?php

namespace Ingenerator\PHPUtils\StringEncoding;

class StringSanitiser
{

/**
* Sanitises user input to remove invalid UTF8 character sequences and non-printing ASCII (except \n\r\t)
*
* Invalid characters are replaced with `?`. Use when you want to silently ignore errors in the input, for example
* when logging things like user-agents or other non-critical data, rather than causing encoding or database insert
* errors.
*
* @see https://stackoverflow.com/a/57871683/1062943 for original solution by clarkk, updated to allow more whitespace
*
* @param string $input
* @param int|null $max_length
*
* @return string
*/
public static function ensurePrintableUtf8(string $input, ?int $max_length = null): string
{
$previous_substitute = mb_substitute_character();
mb_substitute_character(0xfffd);
try {
$cleaned = preg_replace(
'/[^[:print:]\n\t\r]/u',
'�',
mb_convert_encoding($input, 'UTF-8', 'UTF-8')
);

if ($max_length !== null) {
return mb_substr($cleaned, 0, $max_length);
}

return $cleaned;
} finally {
mb_substitute_character($previous_substitute);
}
}

}
32 changes: 31 additions & 1 deletion test/unit/Logging/StackdriverApplicationLoggerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,32 @@ public function test_it_logs_to_specified_location_as_json_line_with_expected_me
$this->assertSame($messages, $actual, 'Expect correct log messages');
}

public function test_it_handles_invalid_utf8_when_logging()
{
$subject = $this->newSubject();
$line = __LINE__ + 1;
$subject->info("This got truncated somewhere\xD0 then appended to", ['and' => "this was junk\xa3\xa9"]);
$entries = $this->assertLoggedJSONLines();
$this->assertSame(
[
[
'severity' => 'INFO',
'message' => "This got truncated somewhere� then appended to",
'@ingenType' => 'app',
'logging.googleapis.com/sourceLocation' => [
'file' => __FILE__,
'line' => $line,
'function' => __CLASS__.'->'.__FUNCTION__,
],
'custom_context' => [
'and' => 'this was junk��',
],
],
],
$entries
);
}

public function test_it_appends_logs_to_existing_content_if_file_specified()
{
$previous = vfsStream::newFile('existing-log.log')
Expand Down Expand Up @@ -568,8 +594,12 @@ public function test_its_request_logger_logs_http_response_code()
$this->assertSame(402, $entry['httpRequest']['status']);
}

#[TestWith([[], null])]
#[TestWith([[], ''])]
#[TestWith([['HTTP_USER_AGENT' => 'chrome 10'], 'chrome 10'])]
#[TestWith([
['HTTP_USER_AGENT' => "Mozilla/5.0 (compatible; Baiduspider/2.0; \xa3\xa9 and more stuff"],
'Mozilla/5.0 (compatible; Baiduspider/2.0; �� and more stuff',
])]
public function test_its_request_logger_logs_user_agent_from_global_array($server, $expect)
{
http_response_code(402);
Expand Down
122 changes: 122 additions & 0 deletions test/unit/StringEncoding/StringSanitiserTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
<?php

namespace test\unit\Ingenerator\PHPUtils\StringEncoding;

use Ingenerator\PHPUtils\StringEncoding\StringSanitiser;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;

class StringSanitiserTest extends TestCase
{

public static function provider_printable_utf8(): array
{
return [
'valid ascii' => [
'I am valid text',
],
'valid ascii with newlines and tabs' => [
"I am valid\nmultiline\ttext with whitespace",
],
'valid ascii with windows newlines' => [
"I am valid\r\nmultiline fróm Windows",
],
'valid UTF8' => [
"Hello from Denmark with æøå",
],
'ascii with BEL control characters' => [
"Ring the \x07",
"Ring the �",
],
'ascii with leading NUL control characters' => [
"\x00nully null",
"�nully null",
],
'ascii with trailing NUL control characters' => [
"nully null\x00",
"nully null�",
],
'ascii with internal NUL control characters' => [
"nully \x00 null",
"nully � null",
],
'utf8 truncated at first escape' => [
"Some partial\xD0",
"Some partial�",
],
'utf8 with invalid trailing bytes' => [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html\xa3\xa9",
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html��",
],
'utf8 with invalid included bytes' => [
"Mozilla/5.0 (compatible; Baiduspider/2.0; \xa3\xa9 and more stuff",
"Mozilla/5.0 (compatible; Baiduspider/2.0; �� and more stuff",
],
'utf8 with invalid leading bytes' => [
"\xa3\xa9Mozilla/5.0 (compatible; Baiduspider/2.0; and more stuff",
"��Mozilla/5.0 (compatible; Baiduspider/2.0; and more stuff",
],
];
}

#[DataProvider('provider_printable_utf8')]
public function test_it_sanitises_to_printable_valid_utf8_string(string $input, ?string $expect = null)
{
$this->assertSame(
$expect ?? $input,
StringSanitiser::ensurePrintableUtf8($input)
);
}

public static function provider_printable_utf8_max_length(): array
{
return [
'with invalid input, still truncates after cleaning' => [
"Mozilla/5.0 \x08 and more \xa3\xa9 nonsense",
[
23 => "Mozilla/5.0 � and more ",
24 => "Mozilla/5.0 � and more �",
25 => "Mozilla/5.0 � and more ��",
255 => "Mozilla/5.0 � and more �� nonsense",
],
],
'with valid input, truncates as multibyte' => [
"Hello from Denmark with æøå and stuff",
[
24 => 'Hello from Denmark with ',
25 => 'Hello from Denmark with æ',
26 => 'Hello from Denmark with æø',
27 => 'Hello from Denmark with æøå',
28 => 'Hello from Denmark with æøå ',
255 => 'Hello from Denmark with æøå and stuff',
],
],
];
}

#[DataProvider('provider_printable_utf8_max_length')]
public function test_it_can_constrain_printable_utf8_to_a_max_length(string $input, array $expected)
{
$actual = [];
foreach (array_keys($expected) as $max_length) {
$actual[$max_length] = StringSanitiser::ensurePrintableUtf8($input, max_length: $max_length);
}
$this->assertSame($expected, $actual);
}

public function test_it_does_not_affect_global_substitute_character_state()
{
$old_encoding = mb_substitute_character();
mb_substitute_character(0x3013);
$input = "whatever \xa3 stuff";
$encoded_with_default = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
$this->assertSame('whatever 〓 stuff', $encoded_with_default);
try {
$this->assertSame('whatever � stuff', StringSanitiser::ensurePrintableUtf8($input));
$this->assertSame($encoded_with_default, mb_convert_encoding($input, 'UTF-8', 'UTF-8'));
} finally {
mb_substitute_character($old_encoding);
}
}

}
Loading