Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15395032
D16440.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
5 KB
Referenced Files
None
Subscribers
None
D16440.id.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -506,6 +506,7 @@
'phutil_units' => 'utils/utils.php',
'phutil_utf8_console_strlen' => 'utils/utf8.php',
'phutil_utf8_convert' => 'utils/utf8.php',
+ 'phutil_utf8_encode_codepoint' => 'utils/utf8.php',
'phutil_utf8_hard_wrap' => 'utils/utf8.php',
'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
'phutil_utf8_is_combining_character' => 'utils/utf8.php',
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -42,6 +42,68 @@
$this->assertEqual($expect, $result);
}
+ public function testOverlongFormFiltering() {
+   // Each byte of an invalid sequence is replaced on its own with U+FFFD,
+   // so an N-byte overlong form is expected to become N replacement
+   // characters.
+   $bad = "\xEF\xBF\xBD";
+
+   $map = array(
+     'quack' => 'quack',
+
+     // This is U+1000, a valid character.
+     "\xE1\x80\x80" => "\xE1\x80\x80",
+
+     // This is a 2-byte encoding of U+0000.
+     "\xC0\x80" => "{$bad}{$bad}",
+
+     // This is a 3-byte encoding of U+0020.
+     "\xE0\x80\xA0" => "{$bad}{$bad}{$bad}",
+
+     // This is a 3-byte encoding of U+00C3, preceded by valid ASCII which
+     // must survive untouched.
+     "A \xE0\x83\x83" => "A {$bad}{$bad}{$bad}",
+
+     // This is a 4-byte encoding of U+0020.
+     "\xF0\x80\x80\xA0" => "{$bad}{$bad}{$bad}{$bad}",
+   );
+
+   foreach ($map as $input => $expect) {
+     $actual = phutil_utf8ize($input);
+     $this->assertEqual(
+       $expect,
+       $actual,
+       pht('Overlong form canonicalization of: %s', $input));
+   }
+ }
+
+ public function testSurrogateFiltering() {
+   // Three-byte sequences that decode into the UTF-16 surrogate range
+   // (U+D800 through U+DFFF) must be replaced; their immediate neighbours
+   // on either side of the range must be left alone.
+   $bad = "\xEF\xBF\xBD";
+
+   $map = array(
+     // This is U+DA58, in the middle of the surrogate range.
+     "A \xED\xA9\x98" => "A {$bad}{$bad}{$bad}",
+
+     // This is U+D7FF, the last codepoint below the surrogate range.
+     "\xED\x9F\xBF" => "\xED\x9F\xBF",
+
+     // This is U+D800, the first surrogate.
+     "\xED\xA0\x80" => "{$bad}{$bad}{$bad}",
+
+     // This is U+DFFF, the last surrogate.
+     "\xED\xBF\xBF" => "{$bad}{$bad}{$bad}",
+
+     // This is U+E000, the first codepoint above the surrogate range.
+     "\xEE\x80\x80" => "\xEE\x80\x80",
+   );
+
+   foreach ($map as $input => $expect) {
+     $actual = phutil_utf8ize($input);
+     $this->assertEqual(
+       $expect,
+       $actual,
+       pht('Surrogate filtering: %s', $input));
+   }
+ }
+
+
+ public function testUTF8CodepointEncoding() {
+   // Map of codepoint => expected UTF8 bytes. Besides a sample from each
+   // encoded length, this covers the first and last codepoint of every
+   // length class so that an off-by-one in the range checks is caught.
+   $map = array(
+     // One byte.
+     0x20 => ' ',
+     0x7E => '~',
+     0x7F => "\x7F",
+
+     // Two bytes.
+     0x80 => "\xC2\x80",
+     0xE9 => "\xC3\xA9",
+     0x7FF => "\xDF\xBF",
+
+     // Three bytes.
+     0x800 => "\xE0\xA0\x80",
+     0x2603 => "\xE2\x98\x83",
+     0xFFFF => "\xEF\xBF\xBF",
+
+     // Four bytes.
+     0x10000 => "\xF0\x90\x80\x80",
+     0x1F417 => "\xF0\x9F\x90\x97",
+     0x10FFFF => "\xF4\x8F\xBF\xBF",
+   );
+
+   foreach ($map as $input => $expect) {
+     $actual = phutil_utf8_encode_codepoint($input);
+     $this->assertEqual(
+       $expect,
+       $actual,
+       pht('UTF8 codepoint encoding of "%s".', $input));
+   }
+ }
+
public function testUTF8len() {
$strings = array(
'' => 0,
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -6,6 +6,8 @@
* When invalid byte subsequences are encountered, they will be replaced with
* U+FFFD, the Unicode replacement character.
*
+ * This function treats overlong encodings and surrogate codepoints as invalid.
+ *
* @param string String to convert to valid UTF-8.
* @return string String with invalid UTF-8 byte subsequences replaced with
* U+FFFD.
@@ -21,27 +23,48 @@
// TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
// ever shows up in profiles?
- // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
- // this function.
-
$result = array();
$regex =
"/([\x01-\x7F]".
- "|[\xC2-\xDF][\x80-\xBF]".
- "|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]".
- "|[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])".
+ "|[\xC2-\xDF][\x80-\xBF]".
+ "|[\xE0][\xA0-\xBF][\x80-\xBF]".
+ "|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]".
+ "|[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]".
+ "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
+ "|[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF])".
"|(.)/";
+ $replacement = "\xEF\xBF\xBD";
+
$offset = 0;
$matches = null;
while (preg_match($regex, $string, $matches, 0, $offset)) {
if (!isset($matches[2])) {
- $result[] = $matches[1];
+ $match = $matches[1];
+
+ if ($match[0] == "\xED") {
+ // If this is a 3-byte character that may be part of one of the
+ // surrogate ranges, check if it's actually in those ranges. Reject
+ // it as invalid if it is. These sequences are used in UTF16 and
+ // functions like json_encode() refuse to encode them.
+
+ $codepoint = ((ord($match[0]) & 0x0F) << 12)
+ + ((ord($match[1]) & 0x3F) << 6)
+ + ((ord($match[2]) & 0x3F));
+ if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
+ $result[] = str_repeat($replacement, strlen($match));
+ $offset += strlen($matches[0]);
+ continue;
+ }
+ }
+
+ $result[] = $match;
} else {
// Unicode replacement character, U+FFFD.
- $result[] = "\xEF\xBF\xBD";
+ $result[] = $replacement;
}
+
$offset += strlen($matches[0]);
}
@@ -415,6 +438,38 @@
/**
+ * Convert a Unicode codepoint into a UTF8-encoded string.
+ *
+ * Codepoints below zero or above U+10FFFF have no UTF8 representation and
+ * raise an exception rather than producing a garbage byte.
+ *
+ * NOTE: Codepoints in the surrogate range (U+D800 through U+DFFF) are not
+ * rejected here; they are encoded as ordinary three-byte sequences.
+ *
+ * @param int Unicode codepoint.
+ * @return string UTF8 encoding.
+ */
+function phutil_utf8_encode_codepoint($codepoint) {
+  // Validate both ends of the range up front. Without the lower bound, a
+  // negative value would satisfy the one-byte check below and be handed to
+  // chr(), which silently yields an unrelated byte.
+  if ($codepoint < 0 || $codepoint >= 0x110000) {
+    throw new Exception(
+      pht(
+        'Encoding UTF8 codepoint "%s" is not supported.',
+        $codepoint));
+  }
+
+  if ($codepoint < 0x80) {
+    // One byte: 0xxxxxxx.
+    $r = chr($codepoint);
+  } else if ($codepoint < 0x800) {
+    // Two bytes: 110xxxxx 10xxxxxx.
+    $r = chr(0xC0 | (($codepoint >> 6) & 0x1F)).
+         chr(0x80 | (($codepoint) & 0x3F));
+  } else if ($codepoint < 0x10000) {
+    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+    $r = chr(0xE0 | (($codepoint >> 12) & 0x0F)).
+         chr(0x80 | (($codepoint >> 6) & 0x3F)).
+         chr(0x80 | (($codepoint) & 0x3F));
+  } else {
+    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    $r = chr(0xF0 | (($codepoint >> 18) & 0x07)).
+         chr(0x80 | (($codepoint >> 12) & 0x3F)).
+         chr(0x80 | (($codepoint >> 6) & 0x3F)).
+         chr(0x80 | (($codepoint) & 0x3F));
+  }
+
+  return $r;
+}
+
+
+/**
* Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
*
* @param string An HTML string with tags and entities.
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mar 17 2025, 2:46 AM (4 w, 5 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7706423
Default Alt Text
D16440.id.diff (5 KB)
Attached To
Mode
D16440: Sanitize UTF8 more aggressively to satisfy json_encode()
Attached
Detach File
Event Timeline
Log In to Comment