Page MenuHomePhabricator

D14339.diff
No OneTemporary

D14339.diff

diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -472,6 +472,7 @@
'phutil_utf8ize' => 'utils/utf8.php',
'phutil_utf8v' => 'utils/utf8.php',
'phutil_utf8v_codepoints' => 'utils/utf8.php',
+ 'phutil_utf8v_combine_characters' => 'utils/utf8.php',
'phutil_utf8v_combined' => 'utils/utf8.php',
'phutil_validate_json' => 'utils/utils.php',
'phutil_var_export' => 'utils/utils.php',
diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php
--- a/src/utils/PhutilUTF8StringTruncator.php
+++ b/src/utils/PhutilUTF8StringTruncator.php
@@ -92,17 +92,41 @@
return $string;
}
- // If we need the vector of codepoints, build it.
- $string_pv = null;
+ // We're going to vectorize the string so we can deal with it in terms
+ // of unicode characters. If the string is huge (like 10MB) and we are
+ // only extracting a tiny piece of it (like the first 1024 bytes), we
+ // want to avoid vectorizing the entire string in cases where there is
+ // no possibility that we'll need all of it. Try to compute a "hard limit":
+ // an upper bound on the number of bytes we can ever need to process.
+
+ $hard_limits = array();
+ if ($this->maximumBytes) {
+ $hard_limits[] = $this->maximumBytes;
+ }
+
if ($this->maximumCodepoints) {
- $string_pv = phutil_utf8v($string);
- $point_len = count($string_pv);
+ // No UTF8 character is longer than 6 bytes, so we can impose a ceiling
+ // if we have a codepoint limit.
+ $hard_limits[] = ($this->maximumCodepoints * 6);
}
+ if ($hard_limits) {
+ // Add a few more bytes onto the end so that we have a little more of
+ // the string than we actually need and can get the right terminator
+ // behavior.
+ $hard_limit = max($hard_limits) + 32;
+ } else {
+ $hard_limit = null;
+ }
+
+ // Build a vector of characters first.
+ $string_pv = phutil_utf8v($string, $hard_limit);
+ $point_len = count($string_pv);
+
// We always need the combined vector, even if we're only doing byte or
// codepoint truncation, because we don't want to truncate to half of a
// combining character.
- $string_gv = phutil_utf8v_combined($string);
+ $string_gv = phutil_utf8v_combine_characters($string_pv);
$glyph_len = count($string_gv);
// Now, check if we're still over the limits. For example, a string may
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -237,6 +237,25 @@
}
}
+ public function testUTF8LargeTruncation() {
+ // This is testing that our performance is reasonable when truncating a
+ // large input into a small output. Runtime should be on the order of the
+ // output size, not the input size.
+
+ $whale = "\xF0\x9F\x90\xB3";
+ $input = str_repeat($whale, 1024 * 1024);
+
+ $result = id(new PhutilUTF8StringTruncator())
+ ->setMaximumBytes(16)
+ ->setTerminator('!')
+ ->truncateString($input);
+
+ $this->assertEqual(
+ str_repeat($whale, 3).'!',
+ $result,
+ pht('Large truncation.'));
+ }
+
public function testUTF8Wrap() {
$inputs = array(
array(
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -306,17 +306,24 @@
* also split.
*
* @param string A valid utf-8 string.
+ * @param int|null Stop processing after examining this many bytes.
* @return list A list of characters in the string.
*/
-function phutil_utf8v($string) {
+function phutil_utf8v($string, $byte_limit = null) {
$res = array();
$len = strlen($string);
+
$ii = 0;
while ($ii < $len) {
$byte = $string[$ii];
if ($byte <= "\x7F") {
$res[] = $byte;
$ii += 1;
+
+ if ($byte_limit && ($ii >= $byte_limit)) {
+ break;
+ }
+
continue;
} else if ($byte < "\xC0") {
throw new Exception(
@@ -348,7 +355,12 @@
}
$res[] = substr($string, $ii, $seq_len);
$ii += $seq_len;
+
+ if ($byte_limit && ($ii >= $byte_limit)) {
+ break;
+ }
}
+
return $res;
}
@@ -709,6 +721,7 @@
return false;
}
+
/**
* Split a UTF-8 string into an array of characters. Combining characters
* are not split.
@@ -718,30 +731,53 @@
*/
function phutil_utf8v_combined($string) {
$components = phutil_utf8v($string);
- $array_length = count($components);
+ return phutil_utf8v_combine_characters($components);
+}
- // If the first character in the string is a combining character,
- // prepend a space to the string.
- if (
- $array_length > 0 &&
- phutil_utf8_is_combining_character($components[0])) {
- $string = ' '.$string;
- $components = phutil_utf8v($string);
- $array_length++;
+
+/**
+ * Merge combining characters in a UTF-8 string.
+ *
+ * This is a low-level method which can allow other operations to do less work.
+ * If you have a string, call @{function:phutil_utf8v_combined} instead.
+ *
+ * @param list List of UTF-8 characters.
+ * @return list List of UTF-8 strings with combining characters merged.
+ */
+function phutil_utf8v_combine_characters(array $characters) {
+ if (!$characters) {
+ return array();
}
- for ($index = 1; $index < $array_length; $index++) {
- if (phutil_utf8_is_combining_character($components[$index])) {
- $components[$index - 1] =
- $components[$index - 1].$components[$index];
+ // If the first character in the string is a combining character,
+ // start with a space.
+ if (phutil_utf8_is_combining_character($characters[0])) {
+ $buf = ' ';
+ } else {
+ $buf = null;
+ }
- unset($components[$index]);
- $components = array_values($components);
+ $parts = array();
+ foreach ($characters as $character) {
+ if (!isset($character[1])) {
+ // This is an optimization: there are no one-byte combining characters,
+ // so we can just pass these through unmodified.
+ $is_combining = false;
+ } else {
+ $is_combining = phutil_utf8_is_combining_character($character);
+ }
- $index--;
- $array_length = count($components);
+ if ($is_combining) {
+ $buf .= $character;
+ } else {
+ if ($buf !== null) {
+ $parts[] = $buf;
+ }
+ $buf = $character;
}
}
- return $components;
+ $parts[] = $buf;
+
+ return $parts;
}

File Metadata

Mime Type
text/plain
Expires
Wed, Mar 26, 4:44 AM (1 w, 18 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7706524
Default Alt Text
D14339.diff (6 KB)

Event Timeline