diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php index f309045..6000f1a 100644 --- a/src/utils/PhutilProseDifferenceEngine.php +++ b/src/utils/PhutilProseDifferenceEngine.php @@ -1,126 +1,199 @@ buildDiff($diff, $u, $v, 1); - $diff->reorderParts(); - - return $diff; + return $this->buildDiff($u, $v, 1); } - private function buildDiff(PhutilProseDiff $diff, $u, $v, $level) { - if ($level == 4) { - $diff->addPart('-', $u); - $diff->addPart('+', $v); - return; - } - + private function buildDiff($u, $v, $level) { $u_parts = $this->splitCorpus($u, $level); $v_parts = $this->splitCorpus($v, $level); $matrix = id(new PhutilEditDistanceMatrix()) ->setMaximumLength(128) ->setSequences($u_parts, $v_parts) ->setComputeString(true); // For word-level and character-level changes, smooth the output string // to reduce the choppiness of the diff. if ($level > 1) { $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL); } $u_pos = 0; $v_pos = 0; $edits = $matrix->getEditString(); $edits_length = strlen($edits); + $diff = new PhutilProseDiff(); for ($ii = 0; $ii < $edits_length; $ii++) { $c = $edits[$ii]; if ($c == 's') { $diff->addPart('=', $u_parts[$u_pos]); $u_pos++; $v_pos++; } else if ($c == 'd') { $diff->addPart('-', $u_parts[$u_pos]); $u_pos++; } else if ($c == 'i') { $diff->addPart('+', $v_parts[$v_pos]); $v_pos++; } else if ($c == 'x') { - $this->buildDiff($diff, $u_parts[$u_pos], $v_parts[$v_pos], $level + 1); + $diff->addPart('-', $u_parts[$u_pos]); + $diff->addPart('+', $v_parts[$v_pos]); $u_pos++; $v_pos++; } else { throw new Exception( pht( 'Unexpected character ("%s") in edit string.', $c)); } } + + $diff->reorderParts(); + + // If we just built a character-level diff, we're all done and do not + // need to go any deeper. + if ($level == 3) { + return $diff; + } + + $blocks = array(); + $block = null; + foreach ($diff->getParts() as $part) { + $type = $part['type']; + $text = $part['text']; + switch ($type) { + case '=': + if ($block) { + $blocks[] = $block; + $block = null; + } + $blocks[] = array( + 'type' => $type, + 'text' => $text, + ); + break; + case '-': + if (!$block) { + $block = array( + 'type' => '!', + 'old' => '', + 'new' => '', + ); + } + $block['old'] .= $text; + break; + case '+': + if (!$block) { + $block = array( + 'type' => '!', + 'old' => '', + 'new' => '', + ); + } + $block['new'] .= $text; + break; + } + } + + if ($block) { + $blocks[] = $block; + } + + $result = new PhutilProseDiff(); + foreach ($blocks as $block) { + $type = $block['type']; + if ($type == '=') { + $result->addPart('=', $block['text']); + } else { + $old = $block['old']; + $new = $block['new']; + if (!strlen($old) && !strlen($new)) { + // Nothing to do. + } else if (!strlen($old)) { + $result->addPart('+', $new); + } else if (!strlen($new)) { + $result->addPart('-', $old); + } else { + $subdiff = $this->buildDiff( + $old, + $new, + $level + 1); + + foreach ($subdiff->getParts() as $part) { + $result->addPart($part['type'], $part['text']); + } + } + } + } + + $result->reorderParts(); + + return $result; } private function splitCorpus($corpus, $level) { switch ($level) { case 1: // Level 1: Split into sentences. $expr = '/([\n,!;?\.]+)/'; break; case 2: // Level 2: Split into words. $expr = '/(\s+)/'; break; case 3: // Level 3: Split into characters. return phutil_utf8v_combined($corpus); } $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE); return $this->stitchPieces($pieces, $level); } private function stitchPieces(array $pieces, $level) { $results = array(); $count = count($pieces); for ($ii = 0; $ii < $count; $ii += 2) { $result = $pieces[$ii]; if ($ii + 1 < $count) { $result .= $pieces[$ii + 1]; } if ($level == 1) { // Split pieces into separate text and whitespace sections: make one // piece out of all the whitespace at the beginning, one piece out of // all the actual text in the middle, and one piece out of all the // whitespace at the end. $matches = null; preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches); if (strlen($matches[1])) { $results[] = $matches[1]; } if (strlen($matches[2])) { $results[] = $matches[2]; } if (strlen($matches[3])) { $results[] = $matches[3]; } } else { $results[] = $result; } } // If the input ended with a delimiter, we can get an empty final piece. // Just discard it. if (last($results) == '') { array_pop($results); } return $results; } } diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php index 64c978c..8d40484 100644 --- a/src/utils/__tests__/PhutilProseDiffTestCase.php +++ b/src/utils/__tests__/PhutilProseDiffTestCase.php @@ -1,182 +1,204 @@ assertProseParts( '', '', array(), pht('Empty')); $this->assertProseParts( "xxx\nyyy", "xxx\nzzz\nyyy", array( "= xxx\n", "+ zzz\n", '= yyy', ), pht('Add Paragraph')); $this->assertProseParts( "xxx\nzzz\nyyy", "xxx\nyyy", array( "= xxx\n", "- zzz\n", '= yyy', ), pht('Remove Paragraph')); // Without smoothing, the alogorithm identifies that "shark" and "cat" // both contain the letter "a" and tries to express this as a very // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". // This is technically correct, but it is much easier for human viewers to // parse if we smooth this into a single removal and a single addition. $this->assertProseParts( 'They say the shark has nine lives.', 'They say the cat has nine lives.', array( '= They say the ', '- shark', '+ cat', '= has nine lives.', ), pht('"Shark/cat" word edit smoothenss.')); $this->assertProseParts( 'Rising quickly, she says', 'Rising quickly, she remarks:', array( '= Rising quickly, she ', '- says', '+ remarks:', ), pht('"Says/remarks" word edit smoothenss.')); $this->assertProseParts( 'See screenshots', 'Viewed video files', array( '- See screenshots', '+ Viewed video files', ), pht('Complete paragraph rewrite.')); $this->assertProseParts( 'xaaax', 'xbbbx', array( '- xaaax', '+ xbbbx', ), pht('Whole word rewrite with common prefix and suffix.')); $this->assertProseParts( ' aaa ', ' bbb ', array( '= ', '- aaa', '+ bbb', '= ', ), pht('Whole word rewrite with whitespace prefix and suffix.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "a\nb\nc\nd\nX\nf\ng\nh\n", array( '.', "= d\n", '- e', '+ X', "= \nf", '.', ), pht('Summary diff with middle change.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "X\nb\nc\nd\ne\nf\ng\nh\n", array( '- a', '+ X', "= \nb", '.', ), pht('Summary diff with head change.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "a\nb\nc\nd\ne\nf\ng\nX\n", array( '.', "= g\n", '- h', '+ X', "= \n", ), pht('Summary diff with last change.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc", array( '= aaa aaa aaa aaa, bbb bbb bbb bbb.', "+ \n\n- ccc ccc ccc", ), pht('Diff with new trailing content.')); + + $this->assertProseParts( + 'aaa aaa aaa aaa, bbb bbb bbb bbb.', + 'aaa aaa aaa aaa bbb bbb bbb bbb.', + array( + '= aaa aaa aaa aaa', + '- ,', + '= bbb bbb bbb bbb.', + ), + pht('Diff with a removed comma.')); + + $this->assertProseParts( + 'aaa aaa aaa aaa, bbb bbb bbb bbb.', + "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!", + array( + '= aaa aaa aaa aaa', + '- ,', + '= bbb bbb bbb bbb.', + "+ \n\n- ccc ccc ccc!", + ), + pht('Diff with a removed comma and new trailing content.')); + } private function assertProseParts($old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getParts(); $this->assertParts($expect_parts, $parts, $label); } private function assertSummaryProseParts( $old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getSummaryParts(); $this->assertParts($expect_parts, $parts, $label); } private function assertParts( array $expect, array $actual_parts, $label) { $actual = array(); foreach ($actual_parts as $actual_part) { $type = $actual_part['type']; $text = $actual_part['text']; switch ($type) { case '.': $actual[] = $type; break; default: $actual[] = "{$type} {$text}"; break; } } $this->assertEqual($expect, $actual, $label); } }