Page MenuHomePhabricator

D8312.diff
No OneTemporary

D8312.diff

Index: src/utils/__tests__/PhutilUTF8TestCase.php
===================================================================
--- src/utils/__tests__/PhutilUTF8TestCase.php
+++ src/utils/__tests__/PhutilUTF8TestCase.php
@@ -430,6 +430,13 @@
}
+ public function testUTF8BMPSegfaults() {
+   // Nothing to assert: failure means the process segfaults, so returning at
+   // all is a pass. See the function implementation for details.
+   $bmp_max = str_repeat("\xEF\xBF\xBF", 1024 * 32);
+   phutil_is_utf8_with_only_bmp_characters($bmp_max);
+ }
+
public function testUTF8BMP() {
$tests = array(
"" => array(true, true, "empty string"),
Index: src/utils/utf8.php
===================================================================
--- src/utils/utf8.php
+++ src/utils/utf8.php
@@ -58,13 +58,40 @@
* @return bool True if the string is valid UTF-8 with only BMP characters.
*/
function phutil_is_utf8_with_only_bmp_characters($string) {
- $regex =
- "/^(".
- "[\x01-\x7F]+".
- "|([\xC2-\xDF][\x80-\xBF])".
- "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF]))*\$/";
- return (bool)preg_match($regex, $string);
+ // NOTE: By default, PCRE segfaults on patterns like the one we would need
+ // to use here at very small input sizes, at least on some systems (like
+ // OS X). This is apparently because the internal implementation is recursive
+ // and it blows the stack. See <https://bugs.php.net/bug.php?id=45735> for
+ // some discussion. Since the input limit is extremely low (less than 50KB on
+ // my system), do this check very very slowly in PHP instead.
+
+ $len = strlen($string);
+ for ($ii = 0; $ii < $len; $ii++) {
+   $chr = ord($string[$ii]);
+   if ($chr >= 0x01 && $chr <= 0x7F) {
+     continue; // Single-byte character; NUL (0x00) is not accepted.
+   } else if ($chr >= 0xC2 && $chr <= 0xDF) {
+     $tail = 1; // Lead byte of a two-byte sequence.
+   } else if ($chr >= 0xE0 && $chr <= 0xEF) {
+     $tail = 2; // Lead byte of a three-byte sequence.
+   } else {
+     return false; // No accepted sequence can start with this byte.
+   }
+   // A sequence cut off by the end of the input is invalid. Check before
+   // indexing so we never read an uninitialized string offset.
+   if ($ii + $tail >= $len) {
+     return false;
+   }
+   while ($tail--) {
+     $chr = ord($string[++$ii]);
+     if ($chr < 0x80 || $chr > 0xBF) {
+       return false;
+     }
+   }
+ }
+
+ return true;
}

File Metadata

Mime Type
text/plain
Expires
Thu, Mar 13, 3:25 AM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7602265
Default Alt Text
D8312.diff (2 KB)

Event Timeline