diff --git a/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php index a5494aa05f..b8bee73961 100644 --- a/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php +++ b/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php @@ -1,275 +1,294 @@ buildDiff($u, $v, 0); } private function buildDiff($u, $v, $level) { $u_parts = $this->splitCorpus($u, $level); $v_parts = $this->splitCorpus($v, $level); if ($level === 0) { $diff = $this->newHashDiff($u_parts, $v_parts); $too_large = false; } else { list($diff, $too_large) = $this->newEditDistanceMatrixDiff( $u_parts, $v_parts, $level); } $diff->reorderParts(); // If we just built a character-level diff, we're all done and do not // need to go any deeper. if ($level == 3) { return $diff; } $blocks = array(); $block = null; foreach ($diff->getParts() as $part) { $type = $part['type']; $text = $part['text']; switch ($type) { case '=': if ($block) { $blocks[] = $block; $block = null; } $blocks[] = array( 'type' => $type, 'text' => $text, ); break; case '-': if (!$block) { $block = array( 'type' => '!', 'old' => '', 'new' => '', ); } $block['old'] .= $text; break; case '+': if (!$block) { $block = array( 'type' => '!', 'old' => '', 'new' => '', ); } $block['new'] .= $text; break; } } if ($block) { $blocks[] = $block; } $result = new PhutilProseDiff(); foreach ($blocks as $block) { $type = $block['type']; if ($type == '=') { $result->addPart('=', $block['text']); } else { $old = $block['old']; $new = $block['new']; if (!strlen($old) && !strlen($new)) { // Nothing to do. } else if (!strlen($old)) { $result->addPart('+', $new); } else if (!strlen($new)) { $result->addPart('-', $old); } else { if ($too_large) { // If this text was too big to diff, don't try to subdivide it. $result->addPart('-', $old); $result->addPart('+', $new); } else { $subdiff = $this->buildDiff( $old, $new, $level + 1); foreach ($subdiff->getParts() as $part) { $result->addPart($part['type'], $part['text']); } } } } } $result->reorderParts(); return $result; } private function splitCorpus($corpus, $level) { switch ($level) { case 0: // Level 0: Split into paragraphs. $expr = '/([\n]+)/'; break; case 1: // Level 1: Split into sentences. $expr = '/([\n,!;?\.]+)/'; break; case 2: // Level 2: Split into words. $expr = '/(\s+)/'; break; case 3: // Level 3: Split into characters. return phutil_utf8v_combined($corpus); } $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE); return $this->stitchPieces($pieces, $level); } private function stitchPieces(array $pieces, $level) { $results = array(); $count = count($pieces); for ($ii = 0; $ii < $count; $ii += 2) { $result = $pieces[$ii]; if ($ii + 1 < $count) { $result .= $pieces[$ii + 1]; } if ($level < 2) { - // Split pieces into separate text and whitespace sections: make one - // piece out of all the whitespace at the beginning, one piece out of - // all the actual text in the middle, and one piece out of all the - // whitespace at the end. - - $matches = null; - preg_match('/^(\s*)(.*?)(\s*)\z/s', $result, $matches); - - if (strlen($matches[1])) { - $results[] = $matches[1]; - } - if (strlen($matches[2])) { - $results[] = $matches[2]; - } - if (strlen($matches[3])) { - $results[] = $matches[3]; + $trimmed_pieces = $this->trimApart($result); + foreach ($trimmed_pieces as $trimmed_piece) { + $results[] = $trimmed_piece; } } else { $results[] = $result; } } // If the input ended with a delimiter, we can get an empty final piece. // Just discard it. if (last($results) == '') { array_pop($results); } return $results; } private function newEditDistanceMatrixDiff( array $u_parts, array $v_parts, $level) { $matrix = id(new PhutilEditDistanceMatrix()) ->setMaximumLength(128) ->setSequences($u_parts, $v_parts) ->setComputeString(true); // For word-level and character-level changes, smooth the output string // to reduce the choppiness of the diff. if ($level > 1) { $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL); } $u_pos = 0; $v_pos = 0; $edits = $matrix->getEditString(); $edits_length = strlen($edits); $diff = new PhutilProseDiff(); for ($ii = 0; $ii < $edits_length; $ii++) { $c = $edits[$ii]; if ($c == 's') { $diff->addPart('=', $u_parts[$u_pos]); $u_pos++; $v_pos++; } else if ($c == 'd') { $diff->addPart('-', $u_parts[$u_pos]); $u_pos++; } else if ($c == 'i') { $diff->addPart('+', $v_parts[$v_pos]); $v_pos++; } else if ($c == 'x') { $diff->addPart('-', $u_parts[$u_pos]); $diff->addPart('+', $v_parts[$v_pos]); $u_pos++; $v_pos++; } else { throw new Exception( pht( 'Unexpected character ("%s") in edit string.', $c)); } } return array($diff, $matrix->didReachMaximumLength()); } private function newHashDiff(array $u_parts, array $v_parts) { $u_ref = new PhabricatorDocumentRef(); $v_ref = new PhabricatorDocumentRef(); $u_blocks = $this->newDocumentEngineBlocks($u_parts); $v_blocks = $this->newDocumentEngineBlocks($v_parts); $rows = id(new PhabricatorDocumentEngineBlocks()) ->addBlockList($u_ref, $u_blocks) ->addBlockList($v_ref, $v_blocks) ->newTwoUpLayout(); $diff = new PhutilProseDiff(); foreach ($rows as $row) { list($u_block, $v_block) = $row; if ($u_block && $v_block) { if ($u_block->getDifferenceType() === '-') { $diff->addPart('-', $u_block->getContent()); $diff->addPart('+', $v_block->getContent()); } else { $diff->addPart('=', $u_block->getContent()); } } else if ($u_block) { $diff->addPart('-', $u_block->getContent()); } else { $diff->addPart('+', $v_block->getContent()); } } return $diff; } private function newDocumentEngineBlocks(array $parts) { $blocks = array(); foreach ($parts as $part) { $hash = PhabricatorHash::digestForIndex($part); $blocks[] = id(new PhabricatorDocumentEngineBlock()) ->setContent($part) ->setDifferenceHash($hash); } return $blocks; } + public static function trimApart($input) { + // Split pieces into separate text and whitespace sections: make one + // piece out of all the whitespace at the beginning, one piece out of + // all the actual text in the middle, and one piece out of all the + // whitespace at the end. + + $parts = array(); + + $length = strlen($input); + + $corpus = ltrim($input); + $l_length = strlen($corpus); + if ($l_length !== $length) { + $parts[] = substr($input, 0, $length - $l_length); + } + + $corpus = rtrim($corpus); + $lr_length = strlen($corpus); + + if ($lr_length) { + $parts[] = $corpus; + } + + if ($lr_length !== $l_length) { + // NOTE: This will be a negative value; we're slicing from the end of + // the input string. + $parts[] = substr($input, $lr_length - $l_length); + } + + return $parts; + } + } diff --git a/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php index 823fef650a..d2e11cac5b 100644 --- a/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php +++ b/src/infrastructure/diff/prose/__tests__/PhutilProseDiffTestCase.php @@ -1,254 +1,287 @@ array(), + 'a' => array('a'), + ' a ' => array( + ' ', + 'a', + ' ', + ), + ' a' => array( + ' ', + 'a', + ), + 'a ' => array( + 'a', + ' ', + ), + ' a b ' => array( + ' ', + 'a b', + ' ', + ), + ); + + foreach ($map as $input => $expect) { + $actual = PhutilProseDifferenceEngine::trimApart($input); + $this->assertEqual( + $expect, + $actual, + pht('Trim Apart: %s', $input)); + } + } + public function testProseDiffsDistance() { $this->assertProseParts( '', '', array(), pht('Empty')); $this->assertProseParts( "xxx\nyyy", "xxx\nzzz\nyyy", array( "= xxx\n", "+ zzz\n", '= yyy', ), pht('Add Paragraph')); $this->assertProseParts( "xxx\nzzz\nyyy", "xxx\nyyy", array( "= xxx\n", "- zzz\n", '= yyy', ), pht('Remove Paragraph')); $this->assertProseParts( 'xxx', "xxxyyy\n.zzz", array( '= xxx', "+ yyy\n.zzz", ), pht('Amend paragraph, and add paragraph starting with punctuation')); // Without smoothing, the alogorithm identifies that "shark" and "cat" // both contain the letter "a" and tries to express this as a very // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". // This is technically correct, but it is much easier for human viewers to // parse if we smooth this into a single removal and a single addition. $this->assertProseParts( 'They say the shark has nine lives.', 'They say the cat has nine lives.', array( '= They say the ', '- shark', '+ cat', '= has nine lives.', ), pht('"Shark/cat" word edit smoothenss.')); $this->assertProseParts( 'Rising quickly, she says', 'Rising quickly, she remarks:', array( '= Rising quickly, she ', '- says', '+ remarks:', ), pht('"Says/remarks" word edit smoothenss.')); $this->assertProseParts( 'See screenshots', 'Viewed video files', array( '- See screenshots', '+ Viewed video files', ), pht('Complete paragraph rewrite.')); $this->assertProseParts( 'xaaax', 'xbbbx', array( '- xaaax', '+ xbbbx', ), pht('Whole word rewrite with common prefix and suffix.')); $this->assertProseParts( ' aaa ', ' bbb ', array( '= ', '- aaa', '+ bbb', '= ', ), pht('Whole word rewrite with whitespace prefix and suffix.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "a\nb\nc\nd\nX\nf\ng\nh\n", array( '.', "= d\n", '- e', '+ X', "= \nf", '.', ), pht('Summary diff with middle change.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "X\nb\nc\nd\ne\nf\ng\nh\n", array( '- a', '+ X', "= \nb", '.', ), pht('Summary diff with head change.')); $this->assertSummaryProseParts( "a\nb\nc\nd\ne\nf\ng\nh\n", "a\nb\nc\nd\ne\nf\ng\nX\n", array( '.', "= g\n", '- h', '+ X', "= \n", ), pht('Summary diff with last change.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc", array( '= aaa aaa aaa aaa, bbb bbb bbb bbb.', "+ \n\n- ccc ccc ccc", ), pht('Diff with new trailing content.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', 'aaa aaa aaa aaa bbb bbb bbb bbb.', array( '= aaa aaa aaa aaa', '- ,', '= bbb bbb bbb bbb.', ), pht('Diff with a removed comma.')); $this->assertProseParts( 'aaa aaa aaa aaa, bbb bbb bbb bbb.', "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!", array( '= aaa aaa aaa aaa', '- ,', '= bbb bbb bbb bbb.', "+ \n\n- ccc ccc ccc!", ), pht('Diff with a removed comma and new trailing content.')); $this->assertProseParts( '[ ] Walnuts', '[X] Walnuts', array( '= [', '- ', '+ X', '= ] Walnuts', ), pht('Diff adding a tickmark to a checkbox list.')); $this->assertProseParts( '[[ ./week49 ]]', '[[ ./week50 ]]', array( '= [[ ./week', '- 49', '+ 50', '= ]]', ), pht('Diff changing a remarkup wiki link target.')); // Create a large corpus with many sentences and paragraphs. $large_paragraph = 'xyz. '; $large_paragraph = str_repeat($large_paragraph, 50); $large_paragraph = rtrim($large_paragraph); $large_corpus = $large_paragraph."\n\n"; $large_corpus = str_repeat($large_corpus, 50); $large_corpus = rtrim($large_corpus); $this->assertProseParts( $large_corpus, "aaa\n\n".$large_corpus."\n\nzzz", array( "+ aaa\n\n", '= '.$large_corpus, "+ \n\nzzz", ), pht('Adding initial and final lines to a large corpus.')); } private function assertProseParts($old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getParts(); $this->assertParts($expect_parts, $parts, $label); } private function assertSummaryProseParts( $old, $new, array $expect_parts, $label) { $engine = new PhutilProseDifferenceEngine(); $diff = $engine->getDiff($old, $new); $parts = $diff->getSummaryParts(); $this->assertParts($expect_parts, $parts, $label); } private function assertParts( array $expect, array $actual_parts, $label) { $actual = array(); foreach ($actual_parts as $actual_part) { $type = $actual_part['type']; $text = $actual_part['text']; switch ($type) { case '.': $actual[] = $type; break; default: $actual[] = "{$type} {$text}"; break; } } $this->assertEqual($expect, $actual, $label); } }