Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15368656
D8312.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Referenced Files
None
Subscribers
None
D8312.diff
View Options
Index: src/utils/__tests__/PhutilUTF8TestCase.php
===================================================================
--- src/utils/__tests__/PhutilUTF8TestCase.php
+++ src/utils/__tests__/PhutilUTF8TestCase.php
@@ -430,6 +430,13 @@
}
+ public function testUTF8BMPSegfaults() {
+ // This test case fails by segfaulting, or passes by not segfaulting. The
+ // input is 96KB of 3-byte characters; see the implementation for details.
+ $input = str_repeat("\xEF\xBF\xBF", 1024 * 32);
+ phutil_is_utf8_with_only_bmp_characters($input);
+ }
+
public function testUTF8BMP() {
$tests = array(
"" => array(true, true, "empty string"),
Index: src/utils/utf8.php
===================================================================
--- src/utils/utf8.php
+++ src/utils/utf8.php
@@ -58,13 +58,40 @@
* @return bool True if the string is valid UTF-8 with only BMP characters.
*/
function phutil_is_utf8_with_only_bmp_characters($string) {
- $regex =
- "/^(".
- "[\x01-\x7F]+".
- "|([\xC2-\xDF][\x80-\xBF])".
- "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF]))*\$/";
- return (bool)preg_match($regex, $string);
+ // NOTE: By default, PCRE segfaults on patterns like the one we would need
+ // to use here at very small input sizes, at least on some systems (like
+ // OS X), because its recursive implementation blows the stack. See
+ // <https://bugs.php.net/bug.php?id=45735>. The input limit is extremely low
+ // (under 50KB on my system), so do this check slowly in PHP instead. A
+ // sequence truncated by the end of the string is rejected without reading it.
+
+ $len = strlen($string);
+ for ($ii = 0; $ii < $len; $ii++) {
+ $chr = ord($string[$ii]);
+ if ($chr >= 0x01 && $chr <= 0x7F) {
+ continue;
+ } else if ($chr >= 0xC2 && $chr <= 0xDF) {
+ $chr = (++$ii < $len) ? ord($string[$ii]) : 0;
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ continue;
+ }
+ return false;
+ } else if ($chr >= 0xE0 && $chr <= 0xEF) {
+ $chr = (++$ii < $len) ? ord($string[$ii]) : 0;
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ $chr = (++$ii < $len) ? ord($string[$ii]) : 0;
+ if ($chr >= 0x80 && $chr <= 0xBF) {
+ continue;
+ }
+ }
+ return false;
+ }
+
+ return false;
+ }
+
+ return true;
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Mar 13, 3:25 AM (1 w, 4 d ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7602265
Default Alt Text
D8312.diff (2 KB)
Attached To
Mode
D8312: Implement phutil_is_utf8_with_only_bmp_characters() without segfaulting
Attached
Detach File
Event Timeline
Log In to Comment