D9368.diff
No OneTemporary
Actions

Size

16 KB

Referenced Files

None

Subscribers

None

D9368.diff
View Options

	diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php
	--- a/src/__phutil_library_map__.php
	+++ b/src/__phutil_library_map__.php
	@@ -312,6 +312,7 @@
	'PhutilTypeSpecTestCase' => 'parser/__tests__/PhutilTypeSpecTestCase.php',
	'PhutilURI' => 'parser/PhutilURI.php',
	'PhutilURITestCase' => 'parser/__tests__/PhutilURITestCase.php',
	+ 'PhutilUTF8StringTruncator' => 'utils/PhutilUTF8StringTruncator.php',
	'PhutilUTF8TestCase' => 'utils/__tests__/PhutilUTF8TestCase.php',
	'PhutilUtilsTestCase' => 'utils/__tests__/PhutilUtilsTestCase.php',
	'PhutilWordPressFuture' => 'future/wordpress/PhutilWordPressFuture.php',
	@@ -678,6 +679,7 @@
	'PhutilTypeMissingParametersException' => 'Exception',
	'PhutilTypeSpecTestCase' => 'PhutilTestCase',
	'PhutilURITestCase' => 'PhutilTestCase',
	+ 'PhutilUTF8StringTruncator' => 'Phobject',
	'PhutilUTF8TestCase' => 'PhutilTestCase',
	'PhutilUtilsTestCase' => 'PhutilTestCase',
	'PhutilWordPressFuture' => 'FutureProxy',
	diff --git a/src/utils/PhutilUTF8StringTruncator.php b/src/utils/PhutilUTF8StringTruncator.php
	new file mode 100644
	--- /dev/null
	+++ b/src/utils/PhutilUTF8StringTruncator.php
	@@ -0,0 +1,266 @@
	+<?php
	+
	+/**
	+ * Truncate a UTF-8 string to some maximum number of bytes, codepoints, or
	+ * glyphs.
	+ *
	+ * This class takes some UTF-8 string as input, and emits a valid UTF-8 string
	+ * which is no longer than given byte, codepoint, or glyph limits.
	+ *
	+ * $short = id(new PhutilUTF8StringTruncator())
	+ * ->setMaximumGlyphs(80)
	+ * ->truncateString($long);
	+ *
	+ * Byte limits restrict the number of bytes the result may contain. They are
	+ * appropriate when you care about how much storage a string requires.
	+ *
	+ * Codepoint limits restrict the number of codepoints the result may contain.
	+ * Since codepoints may have up to 4 bytes, the resulting strings may require
	+ * more than this many bytes. This kind of limit is appropriate when you
	+ * are using UTF-8 storage, like MySQL.
	+ *
	+ * Glyph limits restrict the display size of the string. Because a single glyph
	+ * may have an arbitrary number of combining characters, this does not impose
	+ * a storage size limit on the string: a string with only one glyph may require
	+ * an arbitrarily large number of bytes.
	+ *
	+ * You can set more than one limit; the smallest limit will be used.
	+ *
	+ * NOTE: This class makes a best effort to apply some reasonable rules but
	+ * will not work well for the full range of unicode languages.
	+ *
	+ * @group utf8
	+ */
	+final class PhutilUTF8StringTruncator extends Phobject {
	+
	+ private $maximumBytes;
	+ private $maximumCodepoints;
	+ private $maximumGlyphs;
	+ private $minimumLimit;
	+
	+ private $terminator = "\xE2\x80\xA6";
	+ private $terminatorBytes = 3;
	+ private $terminatorCodepoints = 1;
	+ private $terminatorGlyphs = 1;
	+
	+ public function setMaximumBytes($maximum_bytes) {
	+ $this->maximumBytes = $maximum_bytes;
	+ $this->didUpdateMaxima();
	+ return $this;
	+ }
	+
	+ public function setMaximumCodepoints($maximum_codepoints) {
	+ $this->maximumCodepoints = $maximum_codepoints;
	+ $this->didUpdateMaxima();
	+ return $this;
	+ }
	+
	+ public function setMaximumGlyphs($maximum_glyphs) {
	+ $this->maximumGlyphs = $maximum_glyphs;
	+ $this->didUpdateMaxima();
	+ return $this;
	+ }
	+
	+ private function didUpdateMaxima() {
	+ $this->minimumLimit = INF;
	+
	+ if ($this->maximumBytes) {
	+ $this->minimumLimit = min($this->minimumLimit, $this->maximumBytes);
	+ }
	+
	+ if ($this->maximumCodepoints) {
	+ $this->minimumLimit = min($this->minimumLimit, $this->maximumCodepoints);
	+ }
	+
	+ if ($this->maximumGlyphs) {
	+ $this->minimumLimit = min($this->minimumLimit, $this->maximumGlyphs);
	+ }
	+ }
	+
	+ public function setTerminator($terminator) {
	+ $this->terminator = $terminator;
	+ $this->terminatorBytes = strlen($terminator);
	+ $this->terminatorCodepoints = count(phutil_utf8v($terminator));
	+ $this->terminatorGlyphs = count(phutil_utf8v_combined($terminator));
	+ return $this;
	+ }
	+
	+ public function truncateString($string) {
	+ // First, check if the string has fewer bytes than the most restrictive
	+ // limit. Codepoints and glyphs always take up at least one byte, so we can
	+ // just return the string unmodified if we're under all of the limits.
	+ $byte_len = strlen($string);
	+ if ($byte_len <= $this->minimumLimit) {
	+ return $string;
	+ }
	+
	+ // If we need the vector of codepoints, build it.
	+ $string_pv = null;
	+ if ($this->maximumCodepoints) {
	+ $string_pv = phutil_utf8v($string);
	+ $point_len = count($string_pv);
	+ }
	+
	+ // We always need the combined vector, even if we're only doing byte or
	+ // codepoint truncation, because we don't want to truncate to half of a
	+ // combining character.
	+ $string_gv = phutil_utf8v_combined($string);
	+ $glyph_len = count($string_gv);
	+
	+ // Now, check if we're still over the limits. For example, a string may
	+ // be over the raw byte limit but under the glyph limit if it contains
	+ // several multibyte characters.
	+
	+ $too_long = false;
	+ if ($this->maximumBytes && ($byte_len > $this->maximumBytes)) {
	+ $too_long = true;
	+ }
	+ if ($this->maximumCodepoints && ($point_len > $this->maximumCodepoints)) {
	+ $too_long = true;
	+ }
	+ if ($this->maximumGlyphs && ($glyph_len > $this->maximumGlyphs)) {
	+ $too_long = true;
	+ }
	+
	+ if (!$too_long) {
	+ return $string;
	+ }
	+
	+ // This string is legitimately longer than at least one of the limits, so
	+ // we need to truncate it. Find the minimum cutoff point: this is the last
	+ // glyph we can possibly return while satisfying the limits and having space
	+ // for the terminator.
	+
	+ $cutoff = $glyph_len;
	+ if ($this->maximumBytes) {
	+ if ($byte_len <= $this->maximumBytes) {
	+ $cutoff = $glyph_len;
	+ } else {
	+ $bytes = $this->terminatorBytes;
	+ for ($ii = 0; $ii < $glyph_len; $ii++) {
	+ $bytes += strlen($string_gv[$ii]);
	+ if ($bytes > $this->maximumBytes) {
	+ $cutoff = $ii;
	+ break;
	+ }
	+ }
	+ }
	+ }
	+
	+ if ($this->maximumCodepoints) {
	+ if ($point_len <= $this->maximumCodepoints) {
	+ $cutoff = min($cutoff, $glyph_len);
	+ } else {
	+ $points = 0;
	+ for ($ii = 0; $ii < $glyph_len; $ii++) {
	+ $glyph_bytes = strlen($string_gv[$ii]);
	+ while ($points < $point_len) {
	+ $glyph_bytes -= strlen($string_pv[$points]);
	+ $points++;
	+ if ($glyph_bytes <= 0) {
	+ break;
	+ }
	+ }
	+ $points_total = $points + $this->terminatorCodepoints;
	+ if ($points_total > $this->maximumCodepoints) {
	+ $cutoff = min($cutoff, $ii);
	+ break;
	+ }
	+ }
	+ }
	+ }
	+
	+ if ($this->maximumGlyphs) {
	+ if ($glyph_len <= $this->maximumGlyphs) {
	+ $cutoff = min($cutoff, $glyph_len);
	+ } else {
	+ $cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs);
	+ }
	+ }
	+
	+ // If we don't have enough characters for anything, just return the
	+ // terminator.
	+ if ($cutoff <= 0) {
	+ return $this->terminator;
	+ }
	+
	+ // Otherwise, we're going to try to cut the string off somewhere reasonable
	+ // rather than somewhere arbitrary.
	+
	+ // NOTE: This is not complete, and there are many other word boundary
	+ // characters and reasonable places to break words in the UTF-8 character
	+ // space. For now, this gives us reasonable behavior for latin languages. We
	+ // don't necessarily have access to PCRE+Unicode so there isn't a great way
	+ // for us to look up character attributes.
	+
	+ // If we encounter these, prefer to break on them instead of cutting the
	+ // string off in the middle of a word.
	+ static $break_characters = array(
	+ ' ' => true,
	+ "\n" => true,
	+ ';' => true,
	+ ':' => true,
	+ '[' => true,
	+ '(' => true,
	+ ',' => true,
	+ '-' => true,
	+ );
	+
	+ // If we encounter these, shorten to this character exactly without
	+ // appending the terminal.
	+ static $stop_characters = array(
	+ '.' => true,
	+ '!' => true,
	+ '?' => true,
	+ );
	+
	+ // Search backward in the string, looking for reasonable places to break it.
	+ $word_boundary = null;
	+ $stop_boundary = null;
	+
	+ // If we do a word break with a terminal, we have to look beyond at least
	+ // the number of characters in the terminal. If the terminal is longer than
	+ // the required length, we'll skip this whole block and return it on its
	+ // own.
	+
	+ // Only search backward for a while. At some point we don't get a better
	+ // result by looking through the whole string, and if this is "MMM..." or
	+ // a non-latin language without word break characters we're just wasting
	+ // time.
	+
	+ $search = max(0, $cutoff - 256);
	+ for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) {
	+ $c = $string_gv[$ii];
	+
	+ if (isset($break_characters[$c])) {
	+ $word_boundary = $ii;
	+ } else if (isset($stop_characters[$c])) {
	+ $stop_boundary = $ii + 1;
	+ break;
	+ } else {
	+ if ($word_boundary !== null) {
	+ break;
	+ }
	+ }
	+ }
	+
	+ if ($stop_boundary !== null) {
	+ // We found a character like ".". Cut the string there, without appending
	+ // the terminal.
	+ $string_part = array_slice($string_gv, 0, $stop_boundary);
	+ return implode('', $string_part);
	+ }
	+
	+ // If we didn't find any boundary characters or we found ONLY boundary
	+ // characters, just break at the maximum character length.
	+ if ($word_boundary === null || $word_boundary === 0) {
	+ $word_boundary = $cutoff;
	+ }
	+
	+ $string_part = array_slice($string_gv, 0, $word_boundary);
	+ $string_part = implode('', $string_part);
	+
	+ return $string_part.$this->terminator;
	+ }
	+
	+}
	diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
	--- a/src/utils/__tests__/PhutilUTF8TestCase.php
	+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
	@@ -135,7 +135,11 @@
	array('111111', 5, '2222', '12222'),

	array('D1rp. Derp derp.', 7, '...', 'D1rp.'),
	- array('D2rp. Derp derp.', 5, '...', 'D2rp.'),
	+
	+ // "D2rp." is a better shortening of this, but it's dramatically more
	+ // complicated to implement with the newer byte/glyph/character
	+ // shortening code.
	+ array('D2rp. Derp derp.', 5, '...', 'D2...'),
	array('D3rp. Derp derp.', 4, '...', 'D...'),
	array('D4rp. Derp derp.', 14, '...', 'D4rp. Derp...'),
	array('D5rpderp, derp derp', 16, '...', 'D5rpderp...'),
	@@ -160,12 +164,66 @@

	foreach ($inputs as $input) {
	list($string, $length, $terminal, $expect) = $input;
	- $result = phutil_utf8_shorten($string, $length, $terminal);
	+ $result = id(new PhutilUTF8StringTruncator())
	+ ->setMaximumGlyphs($length)
	+ ->setTerminator($terminal)
	+ ->truncateString($string);
	$this->assertEqual($expect, $result, 'Shortening of '.$string);
	}

	}

	+ public function testUTF8StringTruncator() {
	+ $cases = array(
	+ array(
	+ "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0",
	+ 6, "o\xCD\xA0!",
	+ 6, "o\xCD\xA0o\xCD\xA0!",
	+ 6, "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0",
	+ ),
	+ array(
	+ "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
	+ 6, '!',
	+ 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
	+ 6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
	+ ),
	+ array(
	+ "X\xCD\xA0\xCD\xA0\xCD\xA0YZ",
	+ 6, '!',
	+ 5, "X\xCD\xA0\xCD\xA0\xCD\xA0!",
	+ 2, "X\xCD\xA0\xCD\xA0\xCD\xA0!",
	+ ),
	+ array(
	+ "\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83",
	+ 4, "\xE2\x98\x83!",
	+ 3, "\xE2\x98\x83\xE2\x98\x83!",
	+ 3, "\xE2\x98\x83\xE2\x98\x83!",
	+ ),
	+ );
	+
	+ foreach ($cases as $case) {
	+ list($input, $b_len, $b_out, $p_len, $p_out, $g_len, $g_out) = $case;
	+
	+ $result = id(new PhutilUTF8StringTruncator())
	+ ->setMaximumBytes($b_len)
	+ ->setTerminator('!')
	+ ->truncateString($input);
	+ $this->assertEqual($b_out, $result, 'byte-short of '.$input);
	+
	+ $result = id(new PhutilUTF8StringTruncator())
	+ ->setMaximumCodepoints($p_len)
	+ ->setTerminator('!')
	+ ->truncateString($input);
	+ $this->assertEqual($p_out, $result, 'codepoint-short of '.$input);
	+
	+ $result = id(new PhutilUTF8StringTruncator())
	+ ->setMaximumGlyphs($g_len)
	+ ->setTerminator('!')
	+ ->truncateString($input);
	+ $this->assertEqual($g_out, $result, 'glyph-short of '.$input);
	+ }
	+ }
	+
	public function testUTF8Wrap() {
	$inputs = array(
	array(
	diff --git a/src/utils/utf8.php b/src/utils/utf8.php
	--- a/src/utils/utf8.php
	+++ b/src/utils/utf8.php
	@@ -335,11 +335,9 @@


	/**
	- * Shorten a string to provide a summary, respecting UTF-8 characters. This
	- * function attempts to truncate strings at word boundaries.
	+ * Shorten a string to provide a summary, respecting UTF-8 characters.
	*
	- * NOTE: This function makes a best effort to apply some reasonable rules but
	- * will not work well for the full range of unicode languages.
	+ * This function is deprecated; use @{class:PhutilUTF8StringTruncator} instead.
	*
	* @param string UTF-8 string to shorten.
	* @param int Maximum length of the result.
	@@ -350,89 +348,10 @@
	* @group utf8
	*/
	function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") {
	- // If the string has fewer bytes than the minimum length, we can return
	- // it unmodified without doing any heavy lifting.
	- if (strlen($string) <= $length) {
	- return $string;
	- }
	-
	- $string_v = phutil_utf8v_combined($string);
	- $string_len = count($string_v);
	-
	- if ($string_len <= $length) {
	- // If the string is already shorter than the requested length, simply return
	- // it unmodified.
	- return $string;
	- }
	-
	- // NOTE: This is not complete, and there are many other word boundary
	- // characters and reasonable places to break words in the UTF-8 character
	- // space. For now, this gives us reasonable behavior for latin langauges. We
	- // don't necessarily have access to PCRE+Unicode so there isn't a great way
	- // for us to look up character attributes.
	-
	- // If we encounter these, prefer to break on them instead of cutting the
	- // string off in the middle of a word.
	- static $break_characters = array(
	- ' ' => true,
	- "\n" => true,
	- ';' => true,
	- ':' => true,
	- '[' => true,
	- '(' => true,
	- ',' => true,
	- '-' => true,
	- );
	-
	- // If we encounter these, shorten to this character exactly without appending
	- // the terminal.
	- static $stop_characters = array(
	- '.' => true,
	- '!' => true,
	- '?' => true,
	- );
	-
	- // Search backward in the string, looking for reasonable places to break it.
	- $word_boundary = null;
	- $stop_boundary = null;
	-
	- $terminal_len = phutil_utf8_strlen($terminal);
	-
	- // If we do a word break with a terminal, we have to look beyond at least the
	- // number of characters in the terminal. If the terminal is longer than the
	- // required length, we'll skip this whole block and return it on its own
	- $terminal_area = $length - min($length, $terminal_len);
	- for ($ii = $length; $ii >= 0; $ii--) {
	- $c = $string_v[$ii];
	-
	- if (isset($break_characters[$c]) && ($ii <= $terminal_area)) {
	- $word_boundary = $ii;
	- } else if (isset($stop_characters[$c]) && ($ii < $length)) {
	- $stop_boundary = $ii + 1;
	- break;
	- } else {
	- if ($word_boundary !== null) {
	- break;
	- }
	- }
	- }
	-
	- if ($stop_boundary !== null) {
	- // We found a character like ".". Cut the string there, without appending
	- // the terminal.
	- $string_part = array_slice($string_v, 0, $stop_boundary);
	- return implode('', $string_part);
	- }
	-
	- // If we didn't find any boundary characters or we found ONLY boundary
	- // characters, just break at the maximum character length.
	- if ($word_boundary === null || $word_boundary === 0) {
	- $word_boundary = $terminal_area;
	- }
	-
	- $string_part = array_slice($string_v, 0, $word_boundary);
	- $string_part = implode('', $string_part);
	- return $string_part.$terminal;
	+ return id(new PhutilUTF8StringTruncator())
	+ ->setMaximumGlyphs($length)
	+ ->setTerminator($terminal)
	+ ->truncateString($string);
	}

File Metadata

Mime Type: text/plain
Expires: Thu, Jan 9, 8:24 PM (8 h, 57 m)
Storage Engine: blob
Storage Format: Encrypted (AES-256-CBC)
Storage Handle: 6980193
Default Alt Text: D9368.diff (16 KB)

D9368.diffNo OneTemporaryActions

D9368.diffView Options

File Metadata

Event Timeline

D9368.diff
No OneTemporary
Actions

D9368.diff
View Options