Page MenuHomePhabricator

D9472.diff
No OneTemporary

D9472.diff

diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -520,6 +520,9 @@
// This isn't valid.
"\xEF\xBF\xC0" => array(false, false, 'Invalid, byte range.'),
+ // This is an invalid nonminimal representation.
+ "\xF0\x81\x80\x80" => array(false, false, 'Nonminimal 4-byte characer.'),
+
// This is the first character above BMP, U+10000.
"\xF0\x90\x80\x80" => array(true, false, 'U+10000'),
"\xF0\x9D\x84\x9E" => array(true, false, 'gclef'),
@@ -538,11 +541,19 @@
foreach ($tests as $input => $test) {
list($expect_utf8, $expect_bmp, $test_name) = $test;
+ // Depending on what's installed on the system, this may use an
+ // extension.
$this->assertEqual(
$expect_utf8,
phutil_is_utf8($input),
pht('is_utf(%s)', $test_name));
+ // Also test this against the pure PHP implementation, explicitly.
+ $this->assertEqual(
+ $expect_utf8,
+ phutil_is_utf8_slowly($input),
+ pht('is_utf_slowly(%s)', $test_name));
+
$this->assertEqual(
$expect_bmp,
phutil_is_utf8_with_only_bmp_characters($input),
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -22,6 +22,9 @@
// TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
// ever shows up in profiles?
+ // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
+ // this function.
+
$result = array();
$regex =
@@ -58,13 +61,57 @@
* @return bool True if the string is valid UTF-8 with only BMP characters.
*/
function phutil_is_utf8_with_only_bmp_characters($string) {
+ return phutil_is_utf8_slowly($string, $only_bmp = true);
+}
+
+
+/**
+ * Determine if a string is valid UTF-8.
+ *
+ * @param string Some string which may or may not be valid UTF-8.
+ * @return bool True if the string is valid UTF-8.
+ * @group utf8
+ */
+function phutil_is_utf8($string) {
+ if (function_exists('mb_check_encoding')) {
+ // If mbstring is available, this is significantly faster than using PHP.
+ return mb_check_encoding($string, 'UTF-8');
+ }
+
+ return phutil_is_utf8_slowly($string);
+}
- // NOTE: By default, PCRE segfaults on patterns like the one we would need
- // to use here at very small input sizes, at least on some systems (like
- // OS X). This is apparently because the internal implementation is recursive
- // and it blows the stack. See <https://bugs.php.net/bug.php?id=45735> for
- // some discussion. Since the input limit is extremely low (less than 50KB on
- // my system), do this check very very slowly in PHP instead.
+
+/**
+ * Determine if a string is valid UTF-8, slowly.
+ *
+ * This works on any system, but has very poor performance.
+ *
+ * You should call @{function:phutil_is_utf8} instead of this function, as
+ * that function can use more performant mechanisms if they are available on
+ * the system.
+ *
+ * @param string Some string which may or may not be valid UTF-8.
+ * @param bool True to require all characters be part of the basic
+ * multilingual plane (no more than 3 bytes long).
+ * @return bool True if the string is valid UTF-8.
+ */
+function phutil_is_utf8_slowly($string, $only_bmp = false) {
+ // First, check the common case of normal ASCII strings. We're fine if
+ // the string contains no bytes larger than 127.
+ if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
+ return true;
+ }
+
+ // NOTE: In the past, we used a large regular expression in the form of
+ // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
+ // like this at relatively small input sizes, at least on some systems
+ // (observed on OSX and Windows). This is apparently because the internal
+ // implementation is recursive and it blows the stack.
+
+ // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
+ // input limit is extremely low (less than 50KB on my system), do this check
+ // very very slowly in PHP instead. See also T5316.
$len = strlen($string);
for ($ii = 0; $ii < $len; $ii++) {
@@ -120,6 +167,58 @@
}
}
return false;
+ } else if (!$only_bmp) {
+ if ($chr > 0xF0 && $chr <= 0xF4) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ continue;
+ }
+ }
+ }
+ } else if ($chr == 0xF0) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+
+ // NOTE: As above, this range starts at 0x90, not 0x80. The values
+ // 0x80-0x8F are not minimal representations.
+
+ if ($chr >= 0x90 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ continue;
+ }
+ }
+ }
+ }
}
return false;
@@ -130,34 +229,6 @@
/**
- * Determine if a string is valid UTF-8.
- *
- * @param string Some string which may or may not be valid UTF-8.
- * @return bool True if the string is valid UTF-8.
- * @group utf8
- */
-function phutil_is_utf8($string) {
- if (function_exists('mb_check_encoding')) {
- // If mbstring is available, this is significantly faster than using PHP
- // regexps.
- return mb_check_encoding($string, 'UTF-8');
- }
-
- // NOTE: This incorrectly accepts characters like \xE0\x80\x80, but should
- // not. The MB version works correctly.
-
- $regex =
- "/^(".
- "[\x01-\x7F]+".
- "|([\xC2-\xDF][\x80-\xBF])".
- "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])".
- "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))*\$/";
-
- return (bool)preg_match($regex, $string);
-}
-
-
-/**
* Find the character length of a UTF-8 string.
*
* @param string A valid utf-8 string.

File Metadata

Mime Type
text/plain
Expires
Wed, Jan 22, 9:04 AM (7 h, 11 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7027103
Default Alt Text
D9472.diff (6 KB)

Event Timeline