diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php index fadfc83..8e8b900 100644 --- a/src/utils/PhutilEditDistanceMatrix.php +++ b/src/utils/PhutilEditDistanceMatrix.php @@ -1,491 +1,523 @@ setSequences(str_split('ran'), str_split('rat')); * * $cost = $matrix->getEditDistance(); * * Edit distance computation is slow and requires a large amount of memory; * both are roughly O(N * M) in the length of the strings. * * You can customize the cost of insertions, deletions and replacements. By * default, each has a cost of 1. * * $matrix->setReplaceCost(1); * * By default, transpositions are not evaluated (i.e., the algorithm is * Levenshtein). You can cause transpositions to be evaluated by setting a * transpose cost (which will change the algorithm to Damerau-Levenshtein): * * $matrix->setTransposeCost(1); * * You can also provide a cost to alter the type of operation being applied. * Many strings have several equivalently expensive edit sequences, but one * some sequences are more readable to humans than others. Providing a small * cost to alter operation type tends to smooth out the sequence and produce * long runs of a single operation, which are generally more readable. For * example, these strings: * * (x) * ((x)) * * ...have edit strings "issis" and "isssi", which describe edit operations with * the same cost, but the latter string is more comprehensible to human viewers. * * If you set an alter cost, you must call @{method:setComputeString} to enable * type computation. The alter cost should generally be smaller than `c / N`, * where `c` is the smallest operational cost and `N` is the length of the * longest string. For example, if you are using the default costs (insert = 1, * delete = 1, replace = 1) and computing edit distances for strings of fewer * than 1,000 characters, you might set the alter cost to 0.001. */ final class PhutilEditDistanceMatrix extends Phobject { private $insertCost = 1; private $deleteCost = 1; private $replaceCost = 1; private $transposeCost = null; private $alterCost = 0; private $maximumLength; private $computeString; + private $applySmoothing; private $x; private $y; private $prefix; private $suffix; private $distanceMatrix = null; private $typeMatrix = null; public function setMaximumLength($maximum_length) { $this->maximumLength = $maximum_length; return $this; } public function getMaximumLength() { return coalesce($this->maximumLength, $this->getInfinity()); } public function setComputeString($compute_string) { $this->computeString = $compute_string; return $this; } public function getComputeString() { return $this->computeString; } public function setTransposeCost($transpose_cost) { $this->transposeCost = $transpose_cost; return $this; } public function getTransposeCost() { return $this->transposeCost; } public function setReplaceCost($replace_cost) { $this->replaceCost = $replace_cost; return $this; } public function getReplaceCost() { return $this->replaceCost; } public function setDeleteCost($delete_cost) { $this->deleteCost = $delete_cost; return $this; } public function getDeleteCost() { return $this->deleteCost; } public function setInsertCost($insert_cost) { $this->insertCost = $insert_cost; return $this; } public function getInsertCost() { return $this->insertCost; } public function setAlterCost($alter_cost) { $this->alterCost = $alter_cost; return $this; } public function getAlterCost() { return $this->alterCost; } + public function setApplySmoothing($apply_smoothing) { + $this->applySmoothing = $apply_smoothing; + return $this; + } + + public function getApplySmoothing() { + return $this->applySmoothing; + } + public function setSequences(array $x, array $y) { // NOTE: We strip common prefixes and suffixes from the inputs because // the runtime of the edit distance algorithm is large and it is common // to diff similar strings. $xl = count($x); $yl = count($y); $l = min($xl, $yl); $prefix_l = 0; $suffix_l = 0; for ($ii = 0; $ii < $l; $ii++) { if ($x[$ii] !== $y[$ii]) { break; } $prefix_l++; } for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) { if ($x[$xl - $ii] !== $y[$yl - $ii]) { break; } $suffix_l++; } $this->prefix = array_slice($x, 0, $prefix_l); $this->suffix = array_slice($x, $xl - $suffix_l); $this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l)); $this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l)); $this->distanceMatrix = null; return $this; } private function requireSequences() { if ($this->x === null) { throw new PhutilInvalidStateException('setSequences'); } } public function getEditDistance() { $this->requireSequences(); $x = $this->x; $y = $this->y; if (!$x) { return $this->insertCost * count($y); } if (!$y) { return $this->deleteCost * count($x); } $max = $this->getMaximumLength(); if (count($x) > $max || count($y) > $max) { return ($this->insertCost * count($y)) + ($this->deleteCost * count($x)); } if ($x === $y) { return 0; } $matrix = $this->getDistanceMatrix(); return $matrix[count($x)][count($y)]; } /** * Return a string representing the edits between the sequences. The string * has these characters: * * - s (same): Same character in both strings. * - i (insert): Character inserted. * - d (delete): Character deleted. * - x (replace): Character replaced. * - t (transpose): Character transposed. */ public function getEditString() { $this->requireSequences(); $x = $this->x; $y = $this->y; if (!$x && !$y) { return $this->padEditString(''); } if (!$x) { return $this->padEditString(str_repeat('i', count($y))); } if (!$y) { return $this->padEditString(str_repeat('d', count($x))); } if ($x === $y) { return $this->padEditString(str_repeat('s', count($x))); } $max = $this->getMaximumLength(); if (count($x) > $max || count($y) > $max) { return $this->padEditString( str_repeat('d', count($x)). str_repeat('i', count($y))); } $matrix = $this->getTypeMatrix(); $xx = count($x); $yy = count($y); $transposes = array(); $str = ''; while (true) { $type = $matrix[$xx][$yy]; if (is_array($type)) { $chr = 't'; $transposes[$type[0]][$type[1]] = true; $type = $type[2]; } else { $chr = $type; } if (isset($transposes[$xx][$yy])) { $chr = 't'; } if ($type == 's' || $type == 'x') { $xx -= 1; $yy -= 1; } else if ($type == 'i') { $yy -= 1; } else if ($type == 'd') { $xx -= 1; } else { throw new Exception(pht("Unknown type '%s' in type matrix.", $type)); } $str .= $chr; if ($xx <= 0 && $yy <= 0) { break; } } - return $this->padEditString(strrev($str)); + $str = strrev($str); + + if ($this->getApplySmoothing()) { + $str = $this->applySmoothing($str); + } + + return $this->padEditString($str); } private function padEditString($str) { if ($this->prefix) { $str = str_repeat('s', count($this->prefix)).$str; } if ($this->suffix) { $str = $str.str_repeat('s', count($this->suffix)); } return $str; } private function getTypeMatrix() { if (!$this->computeString) { throw new PhutilInvalidStateException('setComputeString'); } if ($this->typeMatrix === null) { $this->computeMatrix($this->x, $this->y); } return $this->typeMatrix; } private function getDistanceMatrix() { if ($this->distanceMatrix === null) { $this->computeMatrix($this->x, $this->y); } return $this->distanceMatrix; } private function computeMatrix(array $x, array $y) { $xl = count($x); $yl = count($y); $m = array(); $infinity = $this->getInfinity(); $use_damerau = ($this->transposeCost !== null); if ($use_damerau) { // Initialize the default cost of a transpose. $m[-1][-1] = $infinity; // Initialize the character position dictionary for Damerau. $d = array(); foreach ($x as $c) { $d[$c] = -1; } foreach ($y as $c) { $d[$c] = -1; } // Initialize the transpose costs for Damerau. for ($xx = 0; $xx <= $xl; $xx++) { $m[$xx][-1] = $infinity; } for ($yy = 0; $yy <= $yl; $yy++) { $m[-1][$yy] = $infinity; } } // Initialize the top row of the matrix. for ($xx = 0; $xx <= $xl; $xx++) { $m[$xx][0] = $xx * $this->deleteCost; } // Initialize the left column of the matrix. $cost = 0; for ($yy = 0; $yy <= $yl; $yy++) { $m[0][$yy] = $yy * $this->insertCost; } $use_types = ($this->computeString); if ($use_types) { $t = array(); for ($xx = 0; $xx <= $xl; $xx++) { $t[$xx][0] = 'd'; } for ($yy = 0; $yy <= $yl; $yy++) { $t[0][$yy] = 'i'; } $t[0][0] = 's'; } $alt_cost = $this->getAlterCost(); if ($alt_cost && !$use_types) { throw new Exception( pht( 'If you provide an alter cost with %s, you must enable '. 'type computation with %s.', 'setAlterCost()', 'setComputeStrings()')); } // Build the edit distance matrix. for ($xx = 1; $xx <= $xl; $xx++) { $last_dy = -1; for ($yy = 1; $yy <= $yl; $yy++) { if ($use_damerau) { $dx = $d[$y[$yy - 1]]; $dy = $last_dy; } if ($x[$xx - 1] === $y[$yy - 1]) { $rep_cost = $m[$xx - 1][$yy - 1] + 0; $rep_type = 's'; } else { $rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost; $rep_type = 'x'; } $del_cost = $m[$xx - 1][$yy ] + $this->deleteCost; $ins_cost = $m[$xx ][$yy - 1] + $this->insertCost; if ($alt_cost) { $del_char = $t[$xx - 1][$yy ]; $ins_char = $t[$xx ][$yy - 1]; $rep_char = $t[$xx - 1][$yy - 1]; if ($del_char !== 'd') { $del_cost += $alt_cost; } if ($ins_char !== 'i') { $ins_cost += $alt_cost; } if ($rep_char !== $rep_type) { $rep_cost += $alt_cost; } } if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) { $cost = $rep_cost; $type = $rep_type; if ($rep_type === 's') { $last_dy = $yy - 1; } } else if ($ins_cost <= $del_cost) { $cost = $ins_cost; $type = 'i'; } else { $cost = $del_cost; $type = 'd'; } if ($use_damerau) { $trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1); $trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost); if ($trn_cost < $cost) { $cost = $trn_cost; $type = array($dx + 1, $dy + 1, $type); } } $m[$xx][$yy] = $cost; if ($use_types) { $t[$xx][$yy] = $type; } } if ($use_damerau) { $d[$x[$xx - 1]] = ($xx - 1); } } $this->distanceMatrix = $m; if ($use_types) { $this->typeMatrix = $t; } } private function getInfinity() { return INF; } private function printMatrix(array $m) { $x = $this->x; $y = $this->y; $p = '% 8s '; printf($p, ' '); foreach (head($m) as $yk => $yv) { printf($p, idx($y, $yk - 1, '-')); } echo "\n"; foreach ($m as $xk => $xv) { printf($p, idx($x, $xk - 1, '-')); foreach ($xv as $yk => $yv) { printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); } echo "\n"; } } private function printTypeMatrix(array $t) { $x = $this->x; $y = $this->y; $p = '% 8s '; printf($p, ' '); foreach (head($t) as $yk => $yv) { printf($p, idx($y, $yk - 1, '-')); } echo "\n"; foreach ($t as $xk => $xv) { printf($p, idx($x, $xk - 1, '-')); foreach ($xv as $yk => $yv) { printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv)); } echo "\n"; } } + private function applySmoothing($str) { + $result = $str; + + // Smooth the string out, by replacing short runs of similar characters + // with 'x' operations. This makes the result more readable to humans, + // since there are fewer choppy runs of short added and removed substrings. + do { + $original = $result; + $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result); + $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result); + $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result); + } while ($result != $original); + + return $result; + } + } diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php index ee9e168..6b9f8cd 100644 --- a/src/utils/PhutilProseDifferenceEngine.php +++ b/src/utils/PhutilProseDifferenceEngine.php @@ -1,98 +1,104 @@ buildDiff($diff, $u, $v, 1); $diff->reorderParts(); return $diff; } private function buildDiff(PhutilProseDiff $diff, $u, $v, $level) { if ($level == 4) { $diff->addPart('-', $u); $diff->addPart('+', $v); return; } $u_parts = $this->splitCorpus($u, $level); $v_parts = $this->splitCorpus($v, $level); $matrix = id(new PhutilEditDistanceMatrix()) ->setSequences($u_parts, $v_parts) ->setComputeString(true); + // For word-level and character-level changes, smooth the output string + // to reduce the choppiness of the diff. + if ($level > 1) { + $matrix->setApplySmoothing(true); + } + $u_pos = 0; $v_pos = 0; $edits = $matrix->getEditString(); $edits_length = strlen($edits); for ($ii = 0; $ii < $edits_length; $ii++) { $c = $edits[$ii]; if ($c == 's') { $diff->addPart('=', $u_parts[$u_pos]); $u_pos++; $v_pos++; } else if ($c == 'd') { $diff->addPart('-', $u_parts[$u_pos]); $u_pos++; } else if ($c == 'i') { $diff->addPart('+', $v_parts[$v_pos]); $v_pos++; } else if ($c == 'x') { $this->buildDiff($diff, $u_parts[$u_pos], $v_parts[$v_pos], $level + 1); $u_pos++; $v_pos++; } else { throw new Exception( pht( 'Unexpected character ("%s") in edit string.', $c)); } } } private function splitCorpus($corpus, $level) { switch ($level) { case 1: // Level 1: Split into sentences. $expr = '/([\n,!;?\.]+)/'; break; case 2: // Level 2: Split into words. $expr = '/(\s+)/'; break; case 3: // Level 3: Split into characters. return phutil_utf8v_combined($corpus); } $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE); return $this->stitchPieces($pieces); } private function stitchPieces(array $pieces) { $results = array(); $count = count($pieces); for ($ii = 0; $ii < $count; $ii += 2) { $result = $pieces[$ii]; if ($ii + 1 < $count) { $result .= $pieces[$ii + 1]; } $results[] = $result; } // If the input ended with a delimiter, we can get an empty final piece. // Just discard it. if (last($results) == '') { array_pop($results); } return $results; } } diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php index 2a3f00d..8028b52 100644 --- a/src/utils/__tests__/PhutilProseDiffTestCase.php +++ b/src/utils/__tests__/PhutilProseDiffTestCase.php @@ -1,48 +1,77 @@ assertProseParts( '', '', array(), pht('Empty')); $this->assertProseParts( "xxx\nyyy", "xxx\nzzz\nyyy", array( "= xxx\n", "+ zzz\n", '= yyy', ), pht('Add Paragraph')); $this->assertProseParts( "xxx\nzzz\nyyy", "xxx\nyyy", array( "= xxx\n", "- zzz\n", '= yyy', ), pht('Remove Paragraph')); + + + // Without smoothing, the alogorithm identifies that "shark" and "cat" + // both contain the letter "a" and tries to express this as a very + // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". + // This is technically correct, but it is much easier for human viewers to + // parse if we smooth this into a single removal and a single addition. + + $this->assertProseParts( + 'They say the shark has nine lives.', + 'They say the cat has nine lives.', + array( + '= They say the ', + '- shark', + '+ cat', + '= has nine lives.', + ), + pht('"Shark/cat" word edit smoothenss.')); + + $this->assertProseParts( + 'Rising quickly, she says', + 'Rising quickly, she remarks:', + array( + '= Rising quickly, she ', + '- says', + '+ remarks:', + ), + pht('"Says/remarks" word edit smoothenss.')); + } private function assertProseParts($old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getParts(); $actual = array(); foreach ($parts as $part) { $actual[] = $part['type'].' '.$part['text']; } $this->assertEqual($expect_parts, $actual, $label); } }