diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -312,6 +312,7 @@ 'PhutilTypeSpecTestCase' => 'parser/__tests__/PhutilTypeSpecTestCase.php', 'PhutilURI' => 'parser/PhutilURI.php', 'PhutilURITestCase' => 'parser/__tests__/PhutilURITestCase.php', + 'PhutilUTF8StringTruncator' => 'utils/PhutilUTF8StringTruncator.php', 'PhutilUTF8TestCase' => 'utils/__tests__/PhutilUTF8TestCase.php', 'PhutilUtilsTestCase' => 'utils/__tests__/PhutilUtilsTestCase.php', 'PhutilWordPressFuture' => 'future/wordpress/PhutilWordPressFuture.php', @@ -678,6 +679,7 @@ 'PhutilTypeMissingParametersException' => 'Exception', 'PhutilTypeSpecTestCase' => 'PhutilTestCase', 'PhutilURITestCase' => 'PhutilTestCase', + 'PhutilUTF8StringTruncator' => 'Phobject', 'PhutilUTF8TestCase' => 'PhutilTestCase', 'PhutilUtilsTestCase' => 'PhutilTestCase', 'PhutilWordPressFuture' => 'FutureProxy', diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php new file mode 100644 --- /dev/null +++ b/src/utils/PhutilUTF8StringTruncator.php @@ -0,0 +1,266 @@ +setMaximumGlyphs(80) + * ->truncateString($long); + * + * Byte limits restrict the number of bytes the result may contain. They are + * appropriate when you care about how much storage a string requires. + * + * Codepoint limits restrict the number of codepoints the result may contain. + * Since codepoints may have up to 4 bytes, the resulting strings may require + * more than this many bytes. This kind of limit is appropriate when you + * are using UTF-8 storage, like MySQL. + * + * Glyph limits restrict the display size of the string. Because a single glyph + * may have an arbitrary number of combining characters, this does not impose + * a storage size limit on the string: a string with only one glyph may require + * an arbitrarily large number of bytes. 
+ * + * You can set more than one limit; the smallest limit will be used. + * + * NOTE: This function makes a best effort to apply some reasonable rules but + * will not work well for the full range of unicode languages. + * + * @group utf8 + */ +final class PhutilUTF8StringTruncator extends Phobject { + + private $maximumBytes; + private $maximumCodepoints; + private $maximumGlyphs; + private $minimumLimit; + + private $terminator = "\xE2\x80\xA6"; + private $terminatorBytes = 3; + private $terminatorCodepoints = 1; + private $terminatorGlyphs = 1; + + public function setMaximumBytes($maximum_bytes) { + $this->maximumBytes = $maximum_bytes; + $this->didUpdateMaxima(); + return $this; + } + + public function setMaximumCodepoints($maximum_codepoints) { + $this->maximumCodepoints = $maximum_codepoints; + $this->didUpdateMaxima(); + return $this; + } + + public function setMaximumGlyphs($maximum_glyphs) { + $this->maximumGlyphs = $maximum_glyphs; + $this->didUpdateMaxima(); + return $this; + } + + private function didUpdateMaxima() { + $this->minimumLimit = INF; + + if ($this->maximumBytes) { + $this->minimumLimit = min($this->minimumLimit, $this->maximumBytes); + } + + if ($this->maximumCodepoints) { + $this->minimumLimit = min($this->minimumLimit, $this->maximumCodepoints); + } + + if ($this->maximumGlyphs) { + $this->minimumLimit = min($this->minimumLimit, $this->maximumGlyphs); + } + } + + public function setTerminator($terminator) { + $this->terminator = $terminator; + $this->terminatorBytes = strlen($terminator); + $this->terminatorCodepoints = count(phutil_utf8v($terminator)); + $this->terminatorGlyphs = count(phutil_utf8v_combined($terminator)); + return $this; + } + + public function truncateString($string) { + // First, check if the string has fewer bytes than the most restrictive + // limit. Codepoints and glyphs always take up at least one byte, so we can + // just return the string unmodified if we're under all of the limits. 
+ $byte_len = strlen($string); + if ($byte_len <= $this->minimumLimit) { + return $string; + } + + // If we need the vector of codepoints, build it. + $string_pv = null; + if ($this->maximumCodepoints) { + $string_pv = phutil_utf8v($string); + $point_len = count($string_pv); + } + + // We always need the combined vector, even if we're only doing byte or + // codepoint truncation, because we don't want to truncate to half of a + // combining character. + $string_gv = phutil_utf8v_combined($string); + $glyph_len = count($string_gv); + + // Now, check if we're still over the limits. For example, a string may + // be over the raw byte limit but under the glyph limit if it contains + // several multibyte characters. + + $too_long = false; + if ($this->maximumBytes && ($byte_len > $this->maximumBytes)) { + $too_long = true; + } + if ($this->maximumCodepoints && ($point_len > $this->maximumCodepoints)) { + $too_long = true; + } + if ($this->maximumGlyphs && ($glyph_len > $this->maximumGlyphs)) { + $too_long = true; + } + + if (!$too_long) { + return $string; + } + + // This string is legitimately longer than at least one of the limits, so + // we need to truncate it. Find the minimum cutoff point: this is the last + // glyph we can possibly return while satisfying the limits and having space + // for the terminator. 
+ + $cutoff = $glyph_len; + if ($this->maximumBytes) { + if ($byte_len <= $this->maximumBytes) { + $cutoff = $glyph_len; + } else { + $bytes = $this->terminatorBytes; + for ($ii = 0; $ii < $glyph_len; $ii++) { + $bytes += strlen($string_gv[$ii]); + if ($bytes > $this->maximumBytes) { + $cutoff = $ii; + break; + } + } + } + } + + if ($this->maximumCodepoints) { + if ($point_len <= $this->maximumCodepoints) { + $cutoff = min($cutoff, $glyph_len); + } else { + $points = 0; + for ($ii = 0; $ii < $glyph_len; $ii++) { + $glyph_bytes = strlen($string_gv[$ii]); + while ($points < $point_len) { + $glyph_bytes -= strlen($string_pv[$points]); + $points++; + if ($glyph_bytes <= 0) { + break; + } + } + $points_total = $points + $this->terminatorCodepoints; + if ($points_total > $this->maximumCodepoints) { + $cutoff = min($cutoff, $ii); + break; + } + } + } + } + + if ($this->maximumGlyphs) { + if ($glyph_len <= $this->maximumGlyphs) { + $cutoff = min($cutoff, $glyph_len); + } else { + $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs); + } + } + + // If we don't have enough characters for anything, just return the + // terminator. + if ($cutoff <= 0) { + return $this->terminator; + } + + // Otherwise, we're going to try to cut the string off somewhere reasonable + // rather than somewhere arbitrary. + + // NOTE: This is not complete, and there are many other word boundary + // characters and reasonable places to break words in the UTF-8 character + // space. For now, this gives us reasonable behavior for Latin languages. We + // don't necessarily have access to PCRE+Unicode so there isn't a great way + // for us to look up character attributes. + + // If we encounter these, prefer to break on them instead of cutting the + // string off in the middle of a word. 
+ static $break_characters = array( + ' ' => true, + "\n" => true, + ';' => true, + ':' => true, + '[' => true, + '(' => true, + ',' => true, + '-' => true, + ); + + // If we encounter these, shorten to this character exactly without + // appending the terminator. + static $stop_characters = array( + '.' => true, + '!' => true, + '?' => true, + ); + + // Search backward in the string, looking for reasonable places to break it. + $word_boundary = null; + $stop_boundary = null; + + // $cutoff already leaves room for the terminator, so any word break we + // find at or before it can have the terminator appended without exceeding + // a limit. If there was no room for even one glyph, we already returned + // the bare terminator above. + + // Only search backward for a while. At some point we don't get a better + // result by looking through the whole string, and if this is "MMM..." or + // a non-latin language without word break characters we're just wasting + // time. + + $search = max(0, $cutoff - 256); + for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) { + $c = $string_gv[$ii]; + + if (isset($break_characters[$c])) { + $word_boundary = $ii; + } else if (isset($stop_characters[$c])) { + $stop_boundary = $ii + 1; + break; + } else { + if ($word_boundary !== null) { + break; + } + } + } + + if ($stop_boundary !== null) { + // We found a character like ".". Cut the string there, without appending + // the terminator. + $string_part = array_slice($string_gv, 0, $stop_boundary); + return implode('', $string_part); + } + + // If we didn't find any boundary characters or we found ONLY boundary + // characters, just break at the maximum character length. 
+ if ($word_boundary === null || $word_boundary === 0) { + $word_boundary = $cutoff; + } + + $string_part = array_slice($string_gv, 0, $word_boundary); + $string_part = implode('', $string_part); + + return $string_part.$this->terminator; + } + +} diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php --- a/src/utils/__tests__/PhutilUTF8TestCase.php +++ b/src/utils/__tests__/PhutilUTF8TestCase.php @@ -135,7 +135,11 @@ array('111111', 5, '2222', '12222'), array('D1rp. Derp derp.', 7, '...', 'D1rp.'), - array('D2rp. Derp derp.', 5, '...', 'D2rp.'), + + // "D2rp." is a better shortening of this, but it's dramatically more + // complicated to implement with the newer byte/glyph/character + // shortening code. + array('D2rp. Derp derp.', 5, '...', 'D2...'), array('D3rp. Derp derp.', 4, '...', 'D...'), array('D4rp. Derp derp.', 14, '...', 'D4rp. Derp...'), array('D5rpderp, derp derp', 16, '...', 'D5rpderp...'), @@ -160,12 +164,66 @@ foreach ($inputs as $input) { list($string, $length, $terminal, $expect) = $input; - $result = phutil_utf8_shorten($string, $length, $terminal); + $result = id(new PhutilUTF8StringTruncator()) + ->setMaximumGlyphs($length) + ->setTerminator($terminal) + ->truncateString($string); $this->assertEqual($expect, $result, 'Shortening of '.$string); } } + public function testUTF8StringTruncator() { + $cases = array( + array( + "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0", + 6, "o\xCD\xA0!", + 6, "o\xCD\xA0o\xCD\xA0!", + 6, "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0", + ), + array( + "X\xCD\xA0\xCD\xA0\xCD\xA0Y", + 6, '!', + 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y", + 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y", + ), + array( + "X\xCD\xA0\xCD\xA0\xCD\xA0YZ", + 6, '!', + 5, "X\xCD\xA0\xCD\xA0\xCD\xA0!", + 2, "X\xCD\xA0\xCD\xA0\xCD\xA0!", + ), + array( + "\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83", + 4, "\xE2\x98\x83!", + 3, "\xE2\x98\x83\xE2\x98\x83!", + 3, "\xE2\x98\x83\xE2\x98\x83!", + ), + ); + + foreach 
($cases as $case) { + list($input, $b_len, $b_out, $p_len, $p_out, $g_len, $g_out) = $case; + + $result = id(new PhutilUTF8StringTruncator()) + ->setMaximumBytes($b_len) + ->setTerminator('!') + ->truncateString($input); + $this->assertEqual($b_out, $result, 'byte-short of '.$input); + + $result = id(new PhutilUTF8StringTruncator()) + ->setMaximumCodepoints($p_len) + ->setTerminator('!') + ->truncateString($input); + $this->assertEqual($p_out, $result, 'codepoint-short of '.$input); + + $result = id(new PhutilUTF8StringTruncator()) + ->setMaximumGlyphs($g_len) + ->setTerminator('!') + ->truncateString($input); + $this->assertEqual($g_out, $result, 'glyph-short of '.$input); + } + } + public function testUTF8Wrap() { $inputs = array( array( diff --git a/src/utils/utf8.php b/src/utils/utf8.php --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -335,11 +335,9 @@ /** - * Shorten a string to provide a summary, respecting UTF-8 characters. This - * function attempts to truncate strings at word boundaries. + * Shorten a string to provide a summary, respecting UTF-8 characters. * - * NOTE: This function makes a best effort to apply some reasonable rules but - * will not work well for the full range of unicode languages. + * This function is deprecated; use @{class:PhutilUTF8StringTruncator} instead. * * @param string UTF-8 string to shorten. * @param int Maximum length of the result. @@ -350,89 +348,10 @@ * @group utf8 */ function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") { - // If the string has fewer bytes than the minimum length, we can return - // it unmodified without doing any heavy lifting. - if (strlen($string) <= $length) { - return $string; - } - - $string_v = phutil_utf8v_combined($string); - $string_len = count($string_v); - - if ($string_len <= $length) { - // If the string is already shorter than the requested length, simply return - // it unmodified. 
- return $string; - } - - // NOTE: This is not complete, and there are many other word boundary - // characters and reasonable places to break words in the UTF-8 character - // space. For now, this gives us reasonable behavior for latin langauges. We - // don't necessarily have access to PCRE+Unicode so there isn't a great way - // for us to look up character attributes. - - // If we encounter these, prefer to break on them instead of cutting the - // string off in the middle of a word. - static $break_characters = array( - ' ' => true, - "\n" => true, - ';' => true, - ':' => true, - '[' => true, - '(' => true, - ',' => true, - '-' => true, - ); - - // If we encounter these, shorten to this character exactly without appending - // the terminal. - static $stop_characters = array( - '.' => true, - '!' => true, - '?' => true, - ); - - // Search backward in the string, looking for reasonable places to break it. - $word_boundary = null; - $stop_boundary = null; - - $terminal_len = phutil_utf8_strlen($terminal); - - // If we do a word break with a terminal, we have to look beyond at least the - // number of characters in the terminal. If the terminal is longer than the - // required length, we'll skip this whole block and return it on its own - $terminal_area = $length - min($length, $terminal_len); - for ($ii = $length; $ii >= 0; $ii--) { - $c = $string_v[$ii]; - - if (isset($break_characters[$c]) && ($ii <= $terminal_area)) { - $word_boundary = $ii; - } else if (isset($stop_characters[$c]) && ($ii < $length)) { - $stop_boundary = $ii + 1; - break; - } else { - if ($word_boundary !== null) { - break; - } - } - } - - if ($stop_boundary !== null) { - // We found a character like ".". Cut the string there, without appending - // the terminal. 
- $string_part = array_slice($string_v, 0, $stop_boundary); - return implode('', $string_part); - } - - // If we didn't find any boundary characters or we found ONLY boundary - // characters, just break at the maximum character length. - if ($word_boundary === null || $word_boundary === 0) { - $word_boundary = $terminal_area; - } - - $string_part = array_slice($string_v, 0, $word_boundary); - $string_part = implode('', $string_part); - return $string_part.$terminal; + return id(new PhutilUTF8StringTruncator()) + ->setMaximumGlyphs($length) + ->setTerminator($terminal) + ->truncateString($string); }