Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14744458
D9472.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Referenced Files
None
Subscribers
None
D9472.diff
View Options
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -520,6 +520,9 @@
// This isn't valid.
"\xEF\xBF\xC0" => array(false, false, 'Invalid, byte range.'),
+ // This is an invalid nonminimal representation.
+ "\xF0\x81\x80\x80" => array(false, false, 'Nonminimal 4-byte characer.'),
+
// This is the first character above BMP, U+10000.
"\xF0\x90\x80\x80" => array(true, false, 'U+10000'),
"\xF0\x9D\x84\x9E" => array(true, false, 'gclef'),
@@ -538,11 +541,19 @@
foreach ($tests as $input => $test) {
list($expect_utf8, $expect_bmp, $test_name) = $test;
+ // Depending on what's installed on the system, this may use an
+ // extension.
$this->assertEqual(
$expect_utf8,
phutil_is_utf8($input),
pht('is_utf(%s)', $test_name));
+ // Also test this against the pure PHP implementation, explicitly.
+ $this->assertEqual(
+ $expect_utf8,
+ phutil_is_utf8_slowly($input),
+ pht('is_utf_slowly(%s)', $test_name));
+
$this->assertEqual(
$expect_bmp,
phutil_is_utf8_with_only_bmp_characters($input),
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -22,6 +22,9 @@
// TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
// ever shows up in profiles?
+ // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
+ // this function.
+
$result = array();
$regex =
@@ -58,13 +61,57 @@
* @return bool True if the string is valid UTF-8 with only BMP characters.
*/
function phutil_is_utf8_with_only_bmp_characters($string) {
+ return phutil_is_utf8_slowly($string, $only_bmp = true);
+}
+
+
+/**
+ * Determine if a string is valid UTF-8.
+ *
+ * @param string Some string which may or may not be valid UTF-8.
+ * @return bool True if the string is valid UTF-8.
+ * @group utf8
+ */
+function phutil_is_utf8($string) {
+ if (function_exists('mb_check_encoding')) {
+ // If mbstring is available, this is significantly faster than using PHP.
+ return mb_check_encoding($string, 'UTF-8');
+ }
+
+ return phutil_is_utf8_slowly($string);
+}
- // NOTE: By default, PCRE segfaults on patterns like the one we would need
- // to use here at very small input sizes, at least on some systems (like
- // OS X). This is apparently because the internal implementation is recursive
- // and it blows the stack. See <https://bugs.php.net/bug.php?id=45735> for
- // some discussion. Since the input limit is extremely low (less than 50KB on
- // my system), do this check very very slowly in PHP instead.
+
+/**
+ * Determine if a string is valid UTF-8, slowly.
+ *
+ * This works on any system, but has very poor performance.
+ *
+ * You should call @{function:phutil_is_utf8} instead of this function, as
+ * that function can use more performant mechanisms if they are available on
+ * the system.
+ *
+ * @param string Some string which may or may not be valid UTF-8.
+ * @param bool True to require all characters be part of the basic
+ * multilingual plane (no more than 3 bytes long).
+ * @return bool True if the string is valid UTF-8.
+ */
+function phutil_is_utf8_slowly($string, $only_bmp = false) {
+ // First, check the common case of normal ASCII strings. We're fine if
+ // the string contains no bytes larger than 127.
+ if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
+ return true;
+ }
+
+ // NOTE: In the past, we used a large regular expression in the form of
+ // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
+ // like this at relatively small input sizes, at least on some systems
+ // (observed on OSX and Windows). This is apparently because the internal
+ // implementation is recursive and it blows the stack.
+
+ // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
+ // input limit is extremely low (less than 50KB on my system), do this check
+ // very very slowly in PHP instead. See also T5316.
$len = strlen($string);
for ($ii = 0; $ii < $len; $ii++) {
@@ -120,6 +167,58 @@
}
}
return false;
+ } else if (!$only_bmp) {
+ if ($chr > 0xF0 && $chr <= 0xF4) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ continue;
+ }
+ }
+ }
+ } else if ($chr == 0xF0) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+
+ // NOTE: As above, this range starts at 0x90, not 0x80. The values
+ // 0x80-0x8F are not minimal representations.
+
+ if ($chr >= 0x90 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ ++$ii;
+ if ($ii >= $len) {
+ return false;
+ }
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ continue;
+ }
+ }
+ }
+ }
}
return false;
@@ -130,34 +229,6 @@
/**
- * Determine if a string is valid UTF-8.
- *
- * @param string Some string which may or may not be valid UTF-8.
- * @return bool True if the string is valid UTF-8.
- * @group utf8
- */
-function phutil_is_utf8($string) {
- if (function_exists('mb_check_encoding')) {
- // If mbstring is available, this is significantly faster than using PHP
- // regexps.
- return mb_check_encoding($string, 'UTF-8');
- }
-
- // NOTE: This incorrectly accepts characters like \xE0\x80\x80, but should
- // not. The MB version works correctly.
-
- $regex =
- "/^(".
- "[\x01-\x7F]+".
- "|([\xC2-\xDF][\x80-\xBF])".
- "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])".
- "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))*\$/";
-
- return (bool)preg_match($regex, $string);
-}
-
-
-/**
* Find the character length of a UTF-8 string.
*
* @param string A valid utf-8 string.
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Jan 22, 9:04 AM (7 h, 11 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7027103
Default Alt Text
D9472.diff (6 KB)
Attached To
Mode
D9472: Fall back to slow UTF8 algorithms that don't crash
Attached
Detach File
Event Timeline
Log In to Comment