diff --git a/src/utils/PhutilEditDistanceMatrix.php b/src/utils/PhutilEditDistanceMatrix.php --- a/src/utils/PhutilEditDistanceMatrix.php +++ b/src/utils/PhutilEditDistanceMatrix.php @@ -54,6 +54,7 @@ private $alterCost = 0; private $maximumLength; private $computeString; + private $applySmoothing; private $x; private $y; @@ -126,6 +127,15 @@ return $this->alterCost; } + public function setApplySmoothing($apply_smoothing) { + $this->applySmoothing = $apply_smoothing; + return $this; + } + + public function getApplySmoothing() { + return $this->applySmoothing; + } + public function setSequences(array $x, array $y) { // NOTE: We strip common prefixes and suffixes from the inputs because @@ -276,7 +286,13 @@ } } - return $this->padEditString(strrev($str)); + $str = strrev($str); + + if ($this->getApplySmoothing()) { + $str = $this->applySmoothing($str); + } + + return $this->padEditString($str); } private function padEditString($str) { @@ -488,4 +504,20 @@ } } + private function applySmoothing($str) { + $result = $str; + + // Smooth the string by rewriting runs of one to three 's' operations that + // sit between 'x', 'd' or 'i' operations as 'x' operations, until stable. + // This is easier for humans to read: fewer choppy short adds and removes. + do { + $original = $result; + $result = preg_replace('/([xdi])(s{3})([xdi])/', '$1xxx$3', $result); + $result = preg_replace('/([xdi])(s{2})([xdi])/', '$1xx$3', $result); + $result = preg_replace('/([xdi])(s{1})([xdi])/', '$1x$3', $result); + } while ($result != $original); + + return $result; + } + } diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php --- a/src/utils/PhutilProseDifferenceEngine.php +++ b/src/utils/PhutilProseDifferenceEngine.php @@ -25,6 +25,12 @@ ->setSequences($u_parts, $v_parts) ->setComputeString(true); + // For word-level and character-level changes, smooth the output string + // to reduce the choppiness of the diff. 
+ if ($level > 1) { + $matrix->setApplySmoothing(true); + } + $u_pos = 0; $v_pos = 0; diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php --- a/src/utils/__tests__/PhutilProseDiffTestCase.php +++ b/src/utils/__tests__/PhutilProseDiffTestCase.php @@ -28,6 +28,35 @@ '= yyy', ), pht('Remove Paragraph')); + + + // Without smoothing, the algorithm identifies that "shark" and "cat" + // both contain the letter "a" and tries to express this as a very + // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". + // This is technically correct, but it is much easier for human viewers to + // parse if we smooth this into a single removal and a single addition. + + $this->assertProseParts( + 'They say the shark has nine lives.', + 'They say the cat has nine lives.', + array( + '= They say the ', + '- shark', + '+ cat', + '= has nine lives.', + ), + pht('"Shark/cat" word edit smoothenss.')); + + $this->assertProseParts( + 'Rising quickly, she says', + 'Rising quickly, she remarks:', + array( + '= Rising quickly, she ', + '- says', + '+ remarks:', + ), + pht('"Says/remarks" word edit smoothenss.')); + } private function assertProseParts($old, $new, array $expect_parts, $label) {