D9472.diff
No OneTemporary
Actions

Size

6 KB

Referenced Files

None

Subscribers

None

D9472.diff
View Options

	diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
	--- a/src/utils/__tests__/PhutilUTF8TestCase.php
	+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
	@@ -520,6 +520,9 @@
	// This isn't valid.
	"\xEF\xBF\xC0" => array(false, false, 'Invalid, byte range.'),

	+ // This is an invalid nonminimal representation.
	+ "\xF0\x81\x80\x80" => array(false, false, 'Nonminimal 4-byte characer.'),
	+
	// This is the first character above BMP, U+10000.
	"\xF0\x90\x80\x80" => array(true, false, 'U+10000'),
	"\xF0\x9D\x84\x9E" => array(true, false, 'gclef'),
	@@ -538,11 +541,19 @@
	foreach ($tests as $input => $test) {
	list($expect_utf8, $expect_bmp, $test_name) = $test;

	+ // Depending on what's installed on the system, this may use an
	+ // extension.
	$this->assertEqual(
	$expect_utf8,
	phutil_is_utf8($input),
	pht('is_utf(%s)', $test_name));

	+ // Also test this against the pure PHP implementation, explicitly.
	+ $this->assertEqual(
	+ $expect_utf8,
	+ phutil_is_utf8_slowly($input),
	+ pht('is_utf_slowly(%s)', $test_name));
	+
	$this->assertEqual(
	$expect_bmp,
	phutil_is_utf8_with_only_bmp_characters($input),
	diff --git a/src/utils/utf8.php b/src/utils/utf8.php
	--- a/src/utils/utf8.php
	+++ b/src/utils/utf8.php
	@@ -22,6 +22,9 @@
	// TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
	// ever shows up in profiles?

	+ // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
	+ // this function.
	+
	$result = array();

	$regex =
	@@ -58,13 +61,57 @@
	* @return bool True if the string is valid UTF-8 with only BMP characters.
	*/
	function phutil_is_utf8_with_only_bmp_characters($string) {
	+ return phutil_is_utf8_slowly($string, $only_bmp = true);
	+}
	+
	+
	+/**
	+ * Determine if a string is valid UTF-8.
	+ *
	+ * @param string Some string which may or may not be valid UTF-8.
	+ * @return bool True if the string is valid UTF-8.
	+ * @group utf8
	+ */
	+function phutil_is_utf8($string) {
	+ if (function_exists('mb_check_encoding')) {
	+ // If mbstring is available, this is significantly faster than using PHP.
	+ return mb_check_encoding($string, 'UTF-8');
	+ }
	+
	+ return phutil_is_utf8_slowly($string);
	+}

	- // NOTE: By default, PCRE segfaults on patterns like the one we would need
	- // to use here at very small input sizes, at least on some systems (like
	- // OS X). This is apparently because the internal implementation is recursive
	- // and it blows the stack. See <https://bugs.php.net/bug.php?id=45735> for
	- // some discussion. Since the input limit is extremely low (less than 50KB on
	- // my system), do this check very very slowly in PHP instead.
	+
	+/**
	+ * Determine if a string is valid UTF-8, slowly.
	+ *
	+ * This works on any system, but has very poor performance.
	+ *
	+ * You should call @{function:phutil_is_utf8} instead of this function, as
	+ * that function can use more performant mechanisms if they are available on
	+ * the system.
	+ *
	+ * @param string Some string which may or may not be valid UTF-8.
	+ * @param bool True to require all characters be part of the basic
	+ * multilingual plane (no more than 3 bytes long).
	+ * @return bool True if the string is valid UTF-8.
	+ */
	+function phutil_is_utf8_slowly($string, $only_bmp = false) {
	+ // First, check the common case of normal ASCII strings. We're fine if
	+ // the string contains no bytes larger than 127.
	+ if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
	+ return true;
	+ }
	+
	+ // NOTE: In the past, we used a large regular expression in the form of
	+ // '(x\|y\|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
	+ // like this at relatively small input sizes, at least on some systems
	+ // (observed on OSX and Windows). This is apparently because the internal
	+ // implementation is recursive and it blows the stack.
	+
	+ // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
	+ // input limit is extremely low (less than 50KB on my system), do this check
	+ // very very slowly in PHP instead. See also T5316.

	$len = strlen($string);
	for ($ii = 0; $ii < $len; $ii++) {
	@@ -120,6 +167,58 @@
	}
	}
	return false;
	+ } else if (!$only_bmp) {
	+ if ($chr > 0xF0 && $chr <= 0xF4) {
	+ ++$ii;
	+ if ($ii >= $len) {
	+ return false;
	+ }
	+ $chr = ord($string[$ii]);
	+ if ($chr >= 0x80 && $chr <= 0xBF) {
	+ ++$ii;
	+ if ($ii >= $len) {
	+ return false;
	+ }
	+ $chr = ord($string[$ii]);
	+ if ($chr >= 0x80 && $chr <= 0xBF) {
	+ ++$ii;
	+ if ($ii >= $len) {
	+ return false;
	+ }
	+ $chr = ord($string[$ii]);
	+ if ($chr >= 0x80 && $chr <= 0xBF) {
	+ continue;
	+ }
	+ }
	+ }
	+ } else if ($chr == 0xF0) {
	+ ++$ii;
	+ if ($ii >= $len) {
	+ return false;
	+ }
	+ $chr = ord($string[$ii]);
	+
	+ // NOTE: As above, this range starts at 0x90, not 0x80. The values
	+ // 0x80-0x8F are not minimal representations.
	+
	+ if ($chr >= 0x90 && $chr <= 0xBF) {
	+ ++$ii;
	+ if ($ii >= $len) {
	+ return false;
	+ }
	+ $chr = ord($string[$ii]);
	+ if ($chr >= 0x80 && $chr <= 0xBF) {
	+ ++$ii;
	+ if ($ii >= $len) {
	+ return false;
	+ }
	+ $chr = ord($string[$ii]);
	+ if ($chr >= 0x80 && $chr <= 0xBF) {
	+ continue;
	+ }
	+ }
	+ }
	+ }
	}

	return false;
	@@ -130,34 +229,6 @@


	/**
	- * Determine if a string is valid UTF-8.
	- *
	- * @param string Some string which may or may not be valid UTF-8.
	- * @return bool True if the string is valid UTF-8.
	- * @group utf8
	- */
	-function phutil_is_utf8($string) {
	- if (function_exists('mb_check_encoding')) {
	- // If mbstring is available, this is significantly faster than using PHP
	- // regexps.
	- return mb_check_encoding($string, 'UTF-8');
	- }
	-
	- // NOTE: This incorrectly accepts characters like \xE0\x80\x80, but should
	- // not. The MB version works correctly.
	-
	- $regex =
	- "/^(".
	- "[\x01-\x7F]+".
	- "\|([\xC2-\xDF][\x80-\xBF])".
	- "\|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])".
	- "\|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))*\$/";
	-
	- return (bool)preg_match($regex, $string);
	-}
	-
	-
	-/**
	* Find the character length of a UTF-8 string.
	*
	* @param string A valid utf-8 string.

File Metadata

Mime Type: text/plain
Expires: Wed, Jan 22, 9:04 AM (7 h, 11 m)
Storage Engine: blob
Storage Format: Encrypted (AES-256-CBC)
Storage Handle: 7027103
Default Alt Text: D9472.diff (6 KB)

D9472.diffNo OneTemporaryActions

D9472.diffView Options

File Metadata

Event Timeline

D9472.diff
No OneTemporary
Actions

D9472.diff
View Options