diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php --- a/src/utils/__tests__/PhutilUTF8TestCase.php +++ b/src/utils/__tests__/PhutilUTF8TestCase.php @@ -61,6 +61,13 @@ ); foreach ($map as $input => $expect) { + if ($input !== $expect) { + $this->assertEqual( + false, + phutil_is_utf8_slowly($input), + pht('Slowly reject overlong form of: %s', $input)); + } + $actual = phutil_utf8ize($input); $this->assertEqual( $expect, @@ -77,6 +84,13 @@ ); foreach ($map as $input => $expect) { + if ($input !== $expect) { + $this->assertEqual( + false, + phutil_is_utf8_slowly($input), + pht('Slowly reject surrogate: %s', $input)); + } + $actual = phutil_utf8ize($input); $this->assertEqual( $expect, diff --git a/src/utils/utf8.php b/src/utils/utf8.php --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -149,6 +149,34 @@ continue; } return false; + } else if ($chr == 0xED) { + // See T11525. Some sequences in this block are surrogate codepoints + // that are reserved for use in UTF16. We should reject them. + $codepoint = ($chr & 0x0F) << 12; + ++$ii; + if ($ii >= $len) { + return false; + } + $chr = ord($string[$ii]); + $codepoint += ($chr & 0x3F) << 6; + if ($chr >= 0x80 && $chr <= 0xBF) { + ++$ii; + if ($ii >= $len) { + return false; + } + $chr = ord($string[$ii]); + $codepoint += ($chr & 0x3F); + + if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) { + // Reject these surrogate codepoints. + return false; + } + + if ($chr >= 0x80 && $chr <= 0xBF) { + continue; + } + } + return false; } else if ($chr > 0xE0 && $chr <= 0xEF) { ++$ii; if ($ii >= $len) {