Page MenuHomePhabricator

D16440.id.diff
No OneTemporary

D16440.id.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -506,6 +506,7 @@
'phutil_units' => 'utils/utils.php',
'phutil_utf8_console_strlen' => 'utils/utf8.php',
'phutil_utf8_convert' => 'utils/utf8.php',
+ 'phutil_utf8_encode_codepoint' => 'utils/utf8.php',
'phutil_utf8_hard_wrap' => 'utils/utf8.php',
'phutil_utf8_hard_wrap_html' => 'utils/utf8.php',
'phutil_utf8_is_combining_character' => 'utils/utf8.php',
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -42,6 +42,68 @@
$this->assertEqual($expect, $result);
}
+  /**
+   * Check that phutil_utf8ize() leaves valid text alone and emits one U+FFFD
+   * for each byte of an overlong or out-of-range sequence.
+   */
+  public function testOverlongFormFiltering() {
+    // U+FFFD, the Unicode replacement character.
+    $bad = "\xEF\xBF\xBD";
+
+    $map = array(
+      'quack' => 'quack',
+
+      // This is U+1000, a valid character.
+      "\xE1\x80\x80" => "\xE1\x80\x80",
+
+      // This is a 2-byte encoding of U+0000.
+      "\xC0\x80" => "{$bad}{$bad}",
+
+      // This is a 3-byte encoding of U+0020.
+      "\xE0\x80\xA0" => "{$bad}{$bad}{$bad}",
+
+      // This is a 3-byte encoding of U+00C3.
+      "A \xE0\x83\x83" => "A {$bad}{$bad}{$bad}",
+
+      // This is a 4-byte encoding of U+0020.
+      "\xF0\x80\x80\xA0" => "{$bad}{$bad}{$bad}{$bad}",
+
+      // This is not overlong, but would encode U+110000, which is past the
+      // last Unicode codepoint.
+      "\xF4\x90\x80\x80" => "{$bad}{$bad}{$bad}{$bad}",
+    );
+
+    foreach ($map as $input => $expect) {
+      $actual = phutil_utf8ize($input);
+      $this->assertEqual(
+        $expect,
+        $actual,
+        pht('Overlong form canonicalization of: %s', $input));
+    }
+  }
+
+  /**
+   * Check that phutil_utf8ize() replaces encoded UTF-16 surrogates (U+D800
+   * through U+DFFF) and keeps the codepoints on either side of that range.
+   */
+  public function testSurrogateFiltering() {
+    // U+FFFD, the Unicode replacement character.
+    $bad = "\xEF\xBF\xBD";
+
+    $map = array(
+      // This is U+DA58, inside the surrogate range.
+      "A \xED\xA9\x98" => "A {$bad}{$bad}{$bad}",
+
+      // These are U+D800 and U+DFFF, the first and last surrogates.
+      "\xED\xA0\x80" => "{$bad}{$bad}{$bad}",
+      "\xED\xBF\xBF" => "{$bad}{$bad}{$bad}",
+
+      // These are U+D7FF and U+E000, the valid neighbors of the range.
+      "\xED\x9F\xBF" => "\xED\x9F\xBF",
+      "\xEE\x80\x80" => "\xEE\x80\x80",
+    );
+
+    foreach ($map as $input => $expect) {
+      $actual = phutil_utf8ize($input);
+      $this->assertEqual(
+        $expect,
+        $actual,
+        pht('Surrogate filtering: %s', $input));
+    }
+  }
+
+
+  /**
+   * Check phutil_utf8_encode_codepoint() against known encodings, including
+   * the first and last codepoint of each encoded length, and check that a
+   * codepoint above U+10FFFF raises an exception.
+   */
+  public function testUTF8CodepointEncoding() {
+    $map = array(
+      0x20 => ' ',
+      0x7E => '~',
+      0xE9 => "\xC3\xA9",
+      0x2603 => "\xE2\x98\x83",
+      0x1F417 => "\xF0\x9F\x90\x97",
+
+      // Boundaries between the 1-, 2-, 3- and 4-byte forms.
+      0x7F => "\x7F",
+      0x80 => "\xC2\x80",
+      0x7FF => "\xDF\xBF",
+      0x800 => "\xE0\xA0\x80",
+      0xFFFF => "\xEF\xBF\xBF",
+      0x10000 => "\xF0\x90\x80\x80",
+      0x10FFFF => "\xF4\x8F\xBF\xBF",
+    );
+
+    foreach ($map as $input => $expect) {
+      $actual = phutil_utf8_encode_codepoint($input);
+      $this->assertEqual(
+        $expect,
+        $actual,
+        pht('UTF8 codepoint encoding of "%s".', $input));
+    }
+
+    // The first value past the Unicode range must be rejected, not encoded.
+    $caught = null;
+    try {
+      phutil_utf8_encode_codepoint(0x110000);
+    } catch (Exception $ex) {
+      $caught = $ex;
+    }
+
+    $this->assertEqual(
+      true,
+      ($caught instanceof Exception),
+      pht('UTF8 codepoint encoding above U+10FFFF throws.'));
+  }
+
public function testUTF8len() {
$strings = array(
'' => 0,
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -6,6 +6,8 @@
* When invalid byte subsequences are encountered, they will be replaced with
* U+FFFD, the Unicode replacement character.
*
+ * This function treats overlong encodings and encoded UTF-16 surrogates
+ * (U+D800 through U+DFFF) as invalid.
+ *
* @param string String to convert to valid UTF-8.
* @return string String with invalid UTF-8 byte subsequences replaced with
* U+FFFD.
@@ -21,27 +23,48 @@
// TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
// ever shows up in profiles?
- // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
- // this function.
-
$result = array();
$regex =
"/([\x01-\x7F]".
- "|[\xC2-\xDF][\x80-\xBF]".
- "|[\xE0-\xEF][\x80-\xBF][\x80-\xBF]".
- "|[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF])".
+ "|[\xC2-\xDF][\x80-\xBF]".
+ "|[\xE0][\xA0-\xBF][\x80-\xBF]".
+ "|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]".
+ "|[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]".
+ "|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]".
+ "|[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF])".
"|(.)/";
+ $replacement = "\xEF\xBF\xBD";
+
$offset = 0;
$matches = null;
while (preg_match($regex, $string, $matches, 0, $offset)) {
if (!isset($matches[2])) {
- $result[] = $matches[1];
+ $match = $matches[1];
+
+ if ($match[0] == "\xED") {
+ // If this is a 3-byte character that may be part of one of the
+ // surrogate ranges, check if it's actually in those ranges. Reject
+ // it as invalid if it is. These sequences are used in UTF16 and
+ // functions like json_encode() refuse to encode them.
+
+ $codepoint = ((ord($match[0]) & 0x0F) << 12)
+ + ((ord($match[1]) & 0x3F) << 6)
+ + ((ord($match[2]) & 0x3F));
+ if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
+ $result[] = str_repeat($replacement, strlen($match));
+ $offset += strlen($matches[0]);
+ continue;
+ }
+ }
+
+ $result[] = $match;
} else {
// Unicode replacement character, U+FFFD.
- $result[] = "\xEF\xBF\xBD";
+ $result[] = $replacement;
}
+
$offset += strlen($matches[0]);
}
@@ -415,6 +438,38 @@
/**
+ * Convert a Unicode codepoint into a UTF8-encoded string.
+ *
+ * The shortest form is always used: one byte below U+0080, two bytes below
+ * U+0800, three bytes below U+10000, and four bytes through U+10FFFF.
+ * Surrogate codepoints (U+D800 through U+DFFF) are not special-cased; they
+ * are encoded like any other three-byte codepoint.
+ *
+ * @param int Unicode codepoint, from 0 through 0x10FFFF.
+ * @return string UTF8 encoding.
+ * @throws Exception If the codepoint is negative or above 0x10FFFF.
+ */
+function phutil_utf8_encode_codepoint($codepoint) {
+  // Reject negative values up front. They would otherwise pass the
+  // single-byte check below and reach chr(), which wraps out-of-range
+  // values into a stray byte instead of failing.
+  if ($codepoint < 0 || $codepoint >= 0x110000) {
+    throw new Exception(
+      pht(
+        'Encoding UTF8 codepoint "%s" is not supported.',
+        $codepoint));
+  }
+
+  if ($codepoint < 0x80) {
+    // One byte: 0xxxxxxx.
+    $r = chr($codepoint);
+  } else if ($codepoint < 0x800) {
+    // Two bytes: 110xxxxx 10xxxxxx.
+    $r = chr(0xC0 | (($codepoint >> 6) & 0x1F)).
+         chr(0x80 | (($codepoint) & 0x3F));
+  } else if ($codepoint < 0x10000) {
+    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+    $r = chr(0xE0 | (($codepoint >> 12) & 0x0F)).
+         chr(0x80 | (($codepoint >> 6) & 0x3F)).
+         chr(0x80 | (($codepoint) & 0x3F));
+  } else {
+    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    $r = chr(0xF0 | (($codepoint >> 18) & 0x07)).
+         chr(0x80 | (($codepoint >> 12) & 0x3F)).
+         chr(0x80 | (($codepoint >> 6) & 0x3F)).
+         chr(0x80 | (($codepoint) & 0x3F));
+  }
+
+  return $r;
+}
+
+
+/**
* Hard-wrap a block of UTF-8 text with embedded HTML tags and entities.
*
* @param string An HTML string with tags and entities.

File Metadata

Mime Type
text/plain
Expires
Mar 17 2025, 2:46 AM (4 w, 5 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7706423
Default Alt Text
D16440.id.diff (5 KB)

Event Timeline