D14339.diff
No OneTemporary
Actions

Size

6 KB

Referenced Files

None

Subscribers

None

D14339.diff
View Options

	diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
	--- a/src/__phutil_library_map__.php
	+++ b/src/__phutil_library_map__.php
	@@ -472,6 +472,7 @@
	'phutil_utf8ize' => 'utils/utf8.php',
	'phutil_utf8v' => 'utils/utf8.php',
	'phutil_utf8v_codepoints' => 'utils/utf8.php',
	+ 'phutil_utf8v_combine_characters' => 'utils/utf8.php',
	'phutil_utf8v_combined' => 'utils/utf8.php',
	'phutil_validate_json' => 'utils/utils.php',
	'phutil_var_export' => 'utils/utils.php',
	diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php
	--- a/src/utils/PhutilUTF8StringTruncator.php
	+++ b/src/utils/PhutilUTF8StringTruncator.php
	@@ -92,17 +92,41 @@
	return $string;
	}

	- // If we need the vector of codepoints, build it.
	- $string_pv = null;
	+ // We're going to vectorize the string so we can deal with it in terms
	+ // of unicode characters. If the string is huge (like 10MB) and we are
	+ // only extracting a tiny piece of it (like the first 1024 bytes), we
	+ // want to avoid vectorizing the entire string in cases where there is
	+ // no possibility that we'll need all of it. Try to compute a "hard limit":
	+ // an upper bound on the number of bytes we can ever need to process.
	+
	+ $hard_limits = array();
	+ if ($this->maximumBytes) {
	+ $hard_limits[] = $this->maximumBytes;
	+ }
	+
	if ($this->maximumCodepoints) {
	- $string_pv = phutil_utf8v($string);
	- $point_len = count($string_pv);
	+ // No UTF8 character is longer than 6 bytes, so we can impose a ceiling
	+ // if we have a codepoint limit.
	+ $hard_limits[] = ($this->maximumCodepoints * 6);
	}

	+ if ($hard_limits) {
	+ // Add a few more bytes onto the end so that we have a little more of
	+ // the string than we actually need and can get the right terminator
	+ // behavior.
	+ $hard_limit = max($hard_limits) + 32;
	+ } else {
	+ $hard_limit = null;
	+ }
	+
	+ // Build a vector of characters first.
	+ $string_pv = phutil_utf8v($string, $hard_limit);
	+ $point_len = count($string_pv);
	+
	// We always need the combined vector, even if we're only doing byte or
	// codepoint truncation, because we don't want to truncate to half of a
	// combining character.
	- $string_gv = phutil_utf8v_combined($string);
	+ $string_gv = phutil_utf8v_combine_characters($string_pv);
	$glyph_len = count($string_gv);

	// Now, check if we're still over the limits. For example, a string may
	diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
	--- a/src/utils/__tests__/PhutilUTF8TestCase.php
	+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
	@@ -237,6 +237,25 @@
	}
	}

	+ public function testUTF8LargeTruncation() {
	+ // This is testing that our performance is reasonable when truncating a
	+ // large input into a small output. Runtime should be on the order of the
	+ // output size, not the input size.
	+
	+ $whale = "\xF0\x9F\x90\xB3";
	+ $input = str_repeat($whale, 1024 * 1024);
	+
	+ $result = id(new PhutilUTF8StringTruncator())
	+ ->setMaximumBytes(16)
	+ ->setTerminator('!')
	+ ->truncateString($input);
	+
	+ $this->assertEqual(
	+ str_repeat($whale, 3).'!',
	+ $result,
	+ pht('Large truncation.'));
	+ }
	+
	public function testUTF8Wrap() {
	$inputs = array(
	array(
	diff --git a/src/utils/utf8.php b/src/utils/utf8.php
	--- a/src/utils/utf8.php
	+++ b/src/utils/utf8.php
	@@ -306,17 +306,24 @@
	* also split.
	*
	* @param string A valid utf-8 string.
	+ * @param int|null Stop processing after examining this many bytes.
	* @return list A list of characters in the string.
	*/
	-function phutil_utf8v($string) {
	+function phutil_utf8v($string, $byte_limit = null) {
	$res = array();
	$len = strlen($string);
	+
	$ii = 0;
	while ($ii < $len) {
	$byte = $string[$ii];
	if ($byte <= "\x7F") {
	$res[] = $byte;
	$ii += 1;
	+
	+ if ($byte_limit && ($ii >= $byte_limit)) {
	+ break;
	+ }
	+
	continue;
	} else if ($byte < "\xC0") {
	throw new Exception(
	@@ -348,7 +355,12 @@
	}
	$res[] = substr($string, $ii, $seq_len);
	$ii += $seq_len;
	+
	+ if ($byte_limit && ($ii >= $byte_limit)) {
	+ break;
	+ }
	}
	+
	return $res;
	}

	@@ -709,6 +721,7 @@
	return false;
	}

	+
	/**
	* Split a UTF-8 string into an array of characters. Combining characters
	* are not split.
	@@ -718,30 +731,53 @@
	*/
	function phutil_utf8v_combined($string) {
	$components = phutil_utf8v($string);
	- $array_length = count($components);
	+ return phutil_utf8v_combine_characters($components);
	+}

	- // If the first character in the string is a combining character,
	- // prepend a space to the string.
	- if (
	- $array_length > 0 &&
	- phutil_utf8_is_combining_character($components[0])) {
	- $string = ' '.$string;
	- $components = phutil_utf8v($string);
	- $array_length++;
	+
	+/**
	+ * Merge combining characters in a UTF-8 string.
	+ *
	+ * This is a low-level function which can allow other operations to do less
	+ * work. If you have a string, call @{function:phutil_utf8v_combined} instead.
	+ *
	+ * @param list List of UTF-8 characters.
	+ * @return list List of UTF-8 strings with combining characters merged.
	+ */
	+function phutil_utf8v_combine_characters(array $characters) {
	+ if (!$characters) {
	+ return array();
	}

	- for ($index = 1; $index < $array_length; $index++) {
	- if (phutil_utf8_is_combining_character($components[$index])) {
	- $components[$index - 1] =
	- $components[$index - 1].$components[$index];
	+ // If the first character in the string is a combining character,
	+ // start with a space.
	+ if (phutil_utf8_is_combining_character($characters[0])) {
	+ $buf = ' ';
	+ } else {
	+ $buf = null;
	+ }

	- unset($components[$index]);
	- $components = array_values($components);
	+ $parts = array();
	+ foreach ($characters as $character) {
	+ if (!isset($character[1])) {
	+ // This is an optimization: there are no one-byte combining characters,
	+ // so we can just pass these through unmodified.
	+ $is_combining = false;
	+ } else {
	+ $is_combining = phutil_utf8_is_combining_character($character);
	+ }

	- $index--;
	- $array_length = count($components);
	+ if ($is_combining) {
	+ $buf .= $character;
	+ } else {
	+ if ($buf !== null) {
	+ $parts[] = $buf;
	+ }
	+ $buf = $character;
	}
	}

	- return $components;
	+ $parts[] = $buf;
	+
	+ return $parts;
	}

File Metadata

Mime Type: text/plain
Expires: Wed, Mar 26, 4:44 AM (1 w, 18 h ago)
Storage Engine: blob
Storage Format: Encrypted (AES-256-CBC)
Storage Handle: 7706524
Default Alt Text: D14339.diff (6 KB)

D14339.diffNo OneTemporaryActions

D14339.diffView Options

File Metadata

Event Timeline

D14339.diff
No OneTemporary
Actions

D14339.diff
View Options