Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F15434631
D14339.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
6 KB
Referenced Files
None
Subscribers
None
D14339.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -472,6 +472,7 @@
'phutil_utf8ize' => 'utils/utf8.php',
'phutil_utf8v' => 'utils/utf8.php',
'phutil_utf8v_codepoints' => 'utils/utf8.php',
+ 'phutil_utf8v_combine_characters' => 'utils/utf8.php',
'phutil_utf8v_combined' => 'utils/utf8.php',
'phutil_validate_json' => 'utils/utils.php',
'phutil_var_export' => 'utils/utils.php',
diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php
--- a/src/utils/PhutilUTF8StringTruncator.php
+++ b/src/utils/PhutilUTF8StringTruncator.php
@@ -92,17 +92,41 @@
return $string;
}
- // If we need the vector of codepoints, build it.
- $string_pv = null;
+ // We're going to vectorize the string so we can deal with it in terms
+ // of unicode characters. If the string is huge (like 10MB) and we are
+ // only extracting a tiny piece of it (like the first 1024 bytes), we
+ // want to avoid vectorizing the entire string in cases where there is
+ // no possibility that we'll need all of it. Try to compute a "hard limit":
+ // an upper bound on the number of bytes we can ever need to process.
+
+ $hard_limits = array();
+ if ($this->maximumBytes) {
+ $hard_limits[] = $this->maximumBytes;
+ }
+
if ($this->maximumCodepoints) {
- $string_pv = phutil_utf8v($string);
- $point_len = count($string_pv);
+ // No UTF8 character is longer than 6 bytes, so we can impose a ceiling
+ // if we have a codepoint limit.
+ $hard_limits[] = ($this->maximumCodepoints * 6);
}
+ if ($hard_limits) {
+ // Add a few more bytes onto the end so that we have a little more of
+ // the string than we actually need and can get the right terminator
+ // behavior.
+ $hard_limit = max($hard_limits) + 32;
+ } else {
+ $hard_limit = null;
+ }
+
+ // Build a vector of characters first.
+ $string_pv = phutil_utf8v($string, $hard_limit);
+ $point_len = count($string_pv);
+
// We always need the combined vector, even if we're only doing byte or
// codepoint truncation, because we don't want to truncate to half of a
// combining character.
- $string_gv = phutil_utf8v_combined($string);
+ $string_gv = phutil_utf8v_combine_characters($string_pv);
$glyph_len = count($string_gv);
// Now, check if we're still over the limits. For example, a string may
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -237,6 +237,25 @@
}
}
+ public function testUTF8LargeTruncation() {
+ // This is testing that our performance is reasonable when truncating a
+ // large input into a small output. Runtime should be on the order of the
+ // output size, not the input size.
+
+ $whale = "\xF0\x9F\x90\xB3";
+ $input = str_repeat($whale, 1024 * 1024);
+
+ $result = id(new PhutilUTF8StringTruncator())
+ ->setMaximumBytes(16)
+ ->setTerminator('!')
+ ->truncateString($input);
+
+ $this->assertEqual(
+ str_repeat($whale, 3).'!',
+ $result,
+ pht('Large truncation.'));
+ }
+
public function testUTF8Wrap() {
$inputs = array(
array(
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -306,17 +306,24 @@
* also split.
*
* @param string A valid utf-8 string.
+ * @param int|null Stop processing after examining this many bytes.
* @return list A list of characters in the string.
*/
-function phutil_utf8v($string) {
+function phutil_utf8v($string, $byte_limit = null) {
$res = array();
$len = strlen($string);
+
$ii = 0;
while ($ii < $len) {
$byte = $string[$ii];
if ($byte <= "\x7F") {
$res[] = $byte;
$ii += 1;
+
+ if ($byte_limit && ($ii >= $byte_limit)) {
+ break;
+ }
+
continue;
} else if ($byte < "\xC0") {
throw new Exception(
@@ -348,7 +355,12 @@
}
$res[] = substr($string, $ii, $seq_len);
$ii += $seq_len;
+
+ if ($byte_limit && ($ii >= $byte_limit)) {
+ break;
+ }
}
+
return $res;
}
@@ -709,6 +721,7 @@
return false;
}
+
/**
* Split a UTF-8 string into an array of characters. Combining characters
* are not split.
@@ -718,30 +731,53 @@
*/
function phutil_utf8v_combined($string) {
$components = phutil_utf8v($string);
- $array_length = count($components);
+ return phutil_utf8v_combine_characters($components);
+}
- // If the first character in the string is a combining character,
- // prepend a space to the string.
- if (
- $array_length > 0 &&
- phutil_utf8_is_combining_character($components[0])) {
- $string = ' '.$string;
- $components = phutil_utf8v($string);
- $array_length++;
+
+/**
+ * Merge combining characters in a UTF-8 string.
+ *
+ * This is a low-level method which can allow other operations to do less work.
+ * If you have a string, call @{function:phutil_utf8v_combined} instead.
+ *
+ * @param list List of UTF-8 characters.
+ * @return list List of UTF-8 strings with combining characters merged.
+ */
+function phutil_utf8v_combine_characters(array $characters) {
+ if (!$characters) {
+ return array();
}
- for ($index = 1; $index < $array_length; $index++) {
- if (phutil_utf8_is_combining_character($components[$index])) {
- $components[$index - 1] =
- $components[$index - 1].$components[$index];
+ // If the first character in the string is a combining character,
+ // start with a space.
+ if (phutil_utf8_is_combining_character($characters[0])) {
+ $buf = ' ';
+ } else {
+ $buf = null;
+ }
- unset($components[$index]);
- $components = array_values($components);
+ $parts = array();
+ foreach ($characters as $character) {
+ if (!isset($character[1])) {
+ // This is an optimization: there are no one-byte combining characters,
+ // so we can just pass these through unmodified.
+ $is_combining = false;
+ } else {
+ $is_combining = phutil_utf8_is_combining_character($character);
+ }
- $index--;
- $array_length = count($components);
+ if ($is_combining) {
+ $buf .= $character;
+ } else {
+ if ($buf !== null) {
+ $parts[] = $buf;
+ }
+ $buf = $character;
}
}
- return $components;
+ $parts[] = $buf;
+
+ return $parts;
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Wed, Mar 26, 4:44 AM (1 w, 18 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7706524
Default Alt Text
D14339.diff (6 KB)
Attached To
Mode
D14339: Improve UTF8StringTruncator behavior for huge inputs
Attached
Detach File
Event Timeline
Log In to Comment