Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14597248
D9368.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
16 KB
Referenced Files
None
Subscribers
None
D9368.diff
View Options
diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
--- a/src/__phutil_library_map__.php
+++ b/src/__phutil_library_map__.php
@@ -312,6 +312,7 @@
'PhutilTypeSpecTestCase' => 'parser/__tests__/PhutilTypeSpecTestCase.php',
'PhutilURI' => 'parser/PhutilURI.php',
'PhutilURITestCase' => 'parser/__tests__/PhutilURITestCase.php',
+ 'PhutilUTF8StringTruncator' => 'utils/PhutilUTF8StringTruncator.php',
'PhutilUTF8TestCase' => 'utils/__tests__/PhutilUTF8TestCase.php',
'PhutilUtilsTestCase' => 'utils/__tests__/PhutilUtilsTestCase.php',
'PhutilWordPressFuture' => 'future/wordpress/PhutilWordPressFuture.php',
@@ -678,6 +679,7 @@
'PhutilTypeMissingParametersException' => 'Exception',
'PhutilTypeSpecTestCase' => 'PhutilTestCase',
'PhutilURITestCase' => 'PhutilTestCase',
+ 'PhutilUTF8StringTruncator' => 'Phobject',
'PhutilUTF8TestCase' => 'PhutilTestCase',
'PhutilUtilsTestCase' => 'PhutilTestCase',
'PhutilWordPressFuture' => 'FutureProxy',
diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php
new file mode 100644
--- /dev/null
+++ b/src/utils/PhutilUTF8StringTruncator.php
@@ -0,0 +1,266 @@
+<?php
+
+/**
+ * Truncate a UTF-8 string to some maximum number of bytes, codepoints, or
+ * glyphs.
+ *
+ * This class takes some UTF-8 string as input, and emits a valid UTF-8 string
+ * which is no longer than given byte, codepoint, or glyph limits.
+ *
+ * $short = id(new PhutilUTF8StringTruncator())
+ * ->setMaximumGlyphs(80)
+ * ->truncateString($long);
+ *
+ * Byte limits restrict the number of bytes the result may contain. They are
+ * appropriate when you care about how much storage a string requires.
+ *
+ * Codepoint limits restrict the number of codepoints the result may contain.
+ * Since codepoints may have up to 4 bytes, the resulting strings may require
+ * more than this many bytes. This kind of limit is appropriate when you
+ * are using UTF-8 storage, like MySQL.
+ *
+ * Glyph limits restrict the display size of the string. Because a single glyph
+ * may have an arbitrary number of combining characters, this does not impose
+ * a storage size limit on the string: a string with only one glyph may require
+ * an arbitrarily large number of bytes.
+ *
+ * You can set more than one limit; the smallest limit will be used.
+ *
+ * NOTE: This class makes a best effort to apply some reasonable rules but
+ * will not work well for the full range of unicode languages.
+ *
+ * @group utf8
+ */
+final class PhutilUTF8StringTruncator extends Phobject {
+
+ private $maximumBytes;
+ private $maximumCodepoints;
+ private $maximumGlyphs;
+ private $minimumLimit;
+
+ private $terminator = "\xE2\x80\xA6";
+ private $terminatorBytes = 3;
+ private $terminatorCodepoints = 1;
+ private $terminatorGlyphs = 1;
+
+ public function setMaximumBytes($maximum_bytes) {
+ $this->maximumBytes = $maximum_bytes;
+ $this->didUpdateMaxima();
+ return $this;
+ }
+
+ public function setMaximumCodepoints($maximum_codepoints) {
+ $this->maximumCodepoints = $maximum_codepoints;
+ $this->didUpdateMaxima();
+ return $this;
+ }
+
+ public function setMaximumGlyphs($maximum_glyphs) {
+ $this->maximumGlyphs = $maximum_glyphs;
+ $this->didUpdateMaxima();
+ return $this;
+ }
+
+ private function didUpdateMaxima() {
+ $this->minimumLimit = INF;
+
+ if ($this->maximumBytes) {
+ $this->minimumLimit = min($this->minimumLimit, $this->maximumBytes);
+ }
+
+ if ($this->maximumCodepoints) {
+ $this->minimumLimit = min($this->minimumLimit, $this->maximumCodepoints);
+ }
+
+ if ($this->maximumGlyphs) {
+ $this->minimumLimit = min($this->minimumLimit, $this->maximumGlyphs);
+ }
+ }
+
+ public function setTerminator($terminator) {
+ $this->terminator = $terminator;
+ $this->terminatorBytes = strlen($terminator);
+ $this->terminatorCodepoints = count(phutil_utf8v($terminator));
+ $this->terminatorGlyphs = count(phutil_utf8v_combined($terminator));
+ return $this;
+ }
+
+ public function truncateString($string) {
+ // First, check if the string has fewer bytes than the most restrictive
+ // limit. Codepoints and glyphs always take up at least one byte, so we can
+ // just return the string unmodified if we're under all of the limits.
+ $byte_len = strlen($string);
+ if ($byte_len <= $this->minimumLimit) {
+ return $string;
+ }
+
+ // If we need the vector of codepoints, build it.
+ $string_pv = null;
+ if ($this->maximumCodepoints) {
+ $string_pv = phutil_utf8v($string);
+ $point_len = count($string_pv);
+ }
+
+ // We always need the combined vector, even if we're only doing byte or
+ // codepoint truncation, because we don't want to truncate to half of a
+ // combining character.
+ $string_gv = phutil_utf8v_combined($string);
+ $glyph_len = count($string_gv);
+
+ // Now, check if we're still over the limits. For example, a string may
+ // be over the raw byte limit but under the glyph limit if it contains
+ // several multibyte characters.
+
+ $too_long = false;
+ if ($this->maximumBytes && ($byte_len > $this->maximumBytes)) {
+ $too_long = true;
+ }
+ if ($this->maximumCodepoints && ($point_len > $this->maximumCodepoints)) {
+ $too_long = true;
+ }
+ if ($this->maximumGlyphs && ($glyph_len > $this->maximumGlyphs)) {
+ $too_long = true;
+ }
+
+ if (!$too_long) {
+ return $string;
+ }
+
+ // This string is legitimately longer than at least one of the limits, so
+ // we need to truncate it. Find the minimum cutoff point: this is the last
+ // glyph we can possibly return while satisfying the limits and having space
+ // for the terminator.
+
+ $cutoff = $glyph_len;
+ if ($this->maximumBytes) {
+ if ($byte_len <= $this->maximumBytes) {
+ $cutoff = $glyph_len;
+ } else {
+ $bytes = $this->terminatorBytes;
+ for ($ii = 0; $ii < $glyph_len; $ii++) {
+ $bytes += strlen($string_gv[$ii]);
+ if ($bytes > $this->maximumBytes) {
+ $cutoff = $ii;
+ break;
+ }
+ }
+ }
+ }
+
+ if ($this->maximumCodepoints) {
+ if ($point_len <= $this->maximumCodepoints) {
+ $cutoff = min($cutoff, $glyph_len);
+ } else {
+ $points = 0;
+ for ($ii = 0; $ii < $glyph_len; $ii++) {
+ $glyph_bytes = strlen($string_gv[$ii]);
+ while ($points < $point_len) {
+ $glyph_bytes -= strlen($string_pv[$points]);
+ $points++;
+ if ($glyph_bytes <= 0) {
+ break;
+ }
+ }
+ $points_total = $points + $this->terminatorCodepoints;
+ if ($points_total > $this->maximumCodepoints) {
+ $cutoff = min($cutoff, $ii);
+ break;
+ }
+ }
+ }
+ }
+
+ if ($this->maximumGlyphs) {
+ if ($glyph_len <= $this->maximumGlyphs) {
+ $cutoff = min($cutoff, $glyph_len);
+ } else {
+ $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs);
+ }
+ }
+
+ // If we don't have enough characters for anything, just return the
+ // terminator.
+ if ($cutoff <= 0) {
+ return $this->terminator;
+ }
+
+ // Otherwise, we're going to try to cut the string off somewhere reasonable
+ // rather than somewhere arbitrary.
+
+ // NOTE: This is not complete, and there are many other word boundary
+ // characters and reasonable places to break words in the UTF-8 character
+ // space. For now, this gives us reasonable behavior for Latin languages. We
+ // don't necessarily have access to PCRE+Unicode so there isn't a great way
+ // for us to look up character attributes.
+
+ // If we encounter these, prefer to break on them instead of cutting the
+ // string off in the middle of a word.
+ static $break_characters = array(
+ ' ' => true,
+ "\n" => true,
+ ';' => true,
+ ':' => true,
+ '[' => true,
+ '(' => true,
+ ',' => true,
+ '-' => true,
+ );
+
+ // If we encounter these, shorten to this character exactly without
+ // appending the terminal.
+ static $stop_characters = array(
+ '.' => true,
+ '!' => true,
+ '?' => true,
+ );
+
+ // Search backward in the string, looking for reasonable places to break it.
+ $word_boundary = null;
+ $stop_boundary = null;
+
+ // If we do a word break with a terminal, we have to look beyond at least
+ // the number of characters in the terminal. If the terminal is longer than
+ // the required length, we'll skip this whole block and return it on its
+ // own.
+
+ // Only search backward for a while. At some point we don't get a better
+ // result by looking through the whole string, and if this is "MMM..." or
+ // a non-latin language without word break characters we're just wasting
+ // time.
+
+ $search = max(0, $cutoff - 256);
+ for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) {
+ $c = $string_gv[$ii];
+
+ if (isset($break_characters[$c])) {
+ $word_boundary = $ii;
+ } else if (isset($stop_characters[$c])) {
+ $stop_boundary = $ii + 1;
+ break;
+ } else {
+ if ($word_boundary !== null) {
+ break;
+ }
+ }
+ }
+
+ if ($stop_boundary !== null) {
+ // We found a character like ".". Cut the string there, without appending
+ // the terminal.
+ $string_part = array_slice($string_gv, 0, $stop_boundary);
+ return implode('', $string_part);
+ }
+
+ // If we didn't find any boundary characters or we found ONLY boundary
+ // characters, just break at the maximum character length.
+ if ($word_boundary === null || $word_boundary === 0) {
+ $word_boundary = $cutoff;
+ }
+
+ $string_part = array_slice($string_gv, 0, $word_boundary);
+ $string_part = implode('', $string_part);
+
+ return $string_part.$this->terminator;
+ }
+
+}
diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -135,7 +135,11 @@
array('111111', 5, '2222', '12222'),
array('D1rp. Derp derp.', 7, '...', 'D1rp.'),
- array('D2rp. Derp derp.', 5, '...', 'D2rp.'),
+
+ // "D2rp." is a better shortening of this, but it's dramatically more
+ // complicated to implement with the newer byte/glyph/character
+ // shortening code.
+ array('D2rp. Derp derp.', 5, '...', 'D2...'),
array('D3rp. Derp derp.', 4, '...', 'D...'),
array('D4rp. Derp derp.', 14, '...', 'D4rp. Derp...'),
array('D5rpderp, derp derp', 16, '...', 'D5rpderp...'),
@@ -160,12 +164,66 @@
foreach ($inputs as $input) {
list($string, $length, $terminal, $expect) = $input;
- $result = phutil_utf8_shorten($string, $length, $terminal);
+ $result = id(new PhutilUTF8StringTruncator())
+ ->setMaximumGlyphs($length)
+ ->setTerminator($terminal)
+ ->truncateString($string);
$this->assertEqual($expect, $result, 'Shortening of '.$string);
}
}
+ public function testUTF8StringTruncator() {
+ $cases = array(
+ array(
+ "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0",
+ 6, "o\xCD\xA0!",
+ 6, "o\xCD\xA0o\xCD\xA0!",
+ 6, "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0",
+ ),
+ array(
+ "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
+ 6, '!',
+ 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
+ 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
+ ),
+ array(
+ "X\xCD\xA0\xCD\xA0\xCD\xA0YZ",
+ 6, '!',
+ 5, "X\xCD\xA0\xCD\xA0\xCD\xA0!",
+ 2, "X\xCD\xA0\xCD\xA0\xCD\xA0!",
+ ),
+ array(
+ "\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83",
+ 4, "\xE2\x98\x83!",
+ 3, "\xE2\x98\x83\xE2\x98\x83!",
+ 3, "\xE2\x98\x83\xE2\x98\x83!",
+ ),
+ );
+
+ foreach ($cases as $case) {
+ list($input, $b_len, $b_out, $p_len, $p_out, $g_len, $g_out) = $case;
+
+ $result = id(new PhutilUTF8StringTruncator())
+ ->setMaximumBytes($b_len)
+ ->setTerminator('!')
+ ->truncateString($input);
+ $this->assertEqual($b_out, $result, 'byte-short of '.$input);
+
+ $result = id(new PhutilUTF8StringTruncator())
+ ->setMaximumCodepoints($p_len)
+ ->setTerminator('!')
+ ->truncateString($input);
+ $this->assertEqual($p_out, $result, 'codepoint-short of '.$input);
+
+ $result = id(new PhutilUTF8StringTruncator())
+ ->setMaximumGlyphs($g_len)
+ ->setTerminator('!')
+ ->truncateString($input);
+ $this->assertEqual($g_out, $result, 'glyph-short of '.$input);
+ }
+ }
+
public function testUTF8Wrap() {
$inputs = array(
array(
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -335,11 +335,9 @@
/**
- * Shorten a string to provide a summary, respecting UTF-8 characters. This
- * function attempts to truncate strings at word boundaries.
+ * Shorten a string to provide a summary, respecting UTF-8 characters.
*
- * NOTE: This function makes a best effort to apply some reasonable rules but
- * will not work well for the full range of unicode languages.
+ * This function is deprecated; use @{class:PhutilUTF8StringTruncator} instead.
*
* @param string UTF-8 string to shorten.
* @param int Maximum length of the result.
@@ -350,89 +348,10 @@
* @group utf8
*/
function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") {
- // If the string has fewer bytes than the minimum length, we can return
- // it unmodified without doing any heavy lifting.
- if (strlen($string) <= $length) {
- return $string;
- }
-
- $string_v = phutil_utf8v_combined($string);
- $string_len = count($string_v);
-
- if ($string_len <= $length) {
- // If the string is already shorter than the requested length, simply return
- // it unmodified.
- return $string;
- }
-
- // NOTE: This is not complete, and there are many other word boundary
- // characters and reasonable places to break words in the UTF-8 character
- // space. For now, this gives us reasonable behavior for latin langauges. We
- // don't necessarily have access to PCRE+Unicode so there isn't a great way
- // for us to look up character attributes.
-
- // If we encounter these, prefer to break on them instead of cutting the
- // string off in the middle of a word.
- static $break_characters = array(
- ' ' => true,
- "\n" => true,
- ';' => true,
- ':' => true,
- '[' => true,
- '(' => true,
- ',' => true,
- '-' => true,
- );
-
- // If we encounter these, shorten to this character exactly without appending
- // the terminal.
- static $stop_characters = array(
- '.' => true,
- '!' => true,
- '?' => true,
- );
-
- // Search backward in the string, looking for reasonable places to break it.
- $word_boundary = null;
- $stop_boundary = null;
-
- $terminal_len = phutil_utf8_strlen($terminal);
-
- // If we do a word break with a terminal, we have to look beyond at least the
- // number of characters in the terminal. If the terminal is longer than the
- // required length, we'll skip this whole block and return it on its own
- $terminal_area = $length - min($length, $terminal_len);
- for ($ii = $length; $ii >= 0; $ii--) {
- $c = $string_v[$ii];
-
- if (isset($break_characters[$c]) && ($ii <= $terminal_area)) {
- $word_boundary = $ii;
- } else if (isset($stop_characters[$c]) && ($ii < $length)) {
- $stop_boundary = $ii + 1;
- break;
- } else {
- if ($word_boundary !== null) {
- break;
- }
- }
- }
-
- if ($stop_boundary !== null) {
- // We found a character like ".". Cut the string there, without appending
- // the terminal.
- $string_part = array_slice($string_v, 0, $stop_boundary);
- return implode('', $string_part);
- }
-
- // If we didn't find any boundary characters or we found ONLY boundary
- // characters, just break at the maximum character length.
- if ($word_boundary === null || $word_boundary === 0) {
- $word_boundary = $terminal_area;
- }
-
- $string_part = array_slice($string_v, 0, $word_boundary);
- $string_part = implode('', $string_part);
- return $string_part.$terminal;
+ return id(new PhutilUTF8StringTruncator())
+ ->setMaximumGlyphs($length)
+ ->setTerminator($terminal)
+ ->truncateString($string);
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Jan 9, 8:24 PM (8 h, 57 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6980193
Default Alt Text
D9368.diff (16 KB)
Attached To
Mode
D9368: Provide more flexible string truncation in libphutil
Attached
Detach File
Event Timeline
Log In to Comment