diff --git a/scripts/utils/prosediff.php b/scripts/utils/prosediff.php deleted file mode 100755 --- a/scripts/utils/prosediff.php +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env php -setTagline(pht('show prose differences')); -$args->setSynopsis(<<parseStandardArguments(); -$args->parse( - array( - array( - 'name' => 'files', - 'wildcard' => true, - ), - )); - -$files = $args->getArg('files'); -if (count($files) !== 2) { - $args->printHelpAndExit(); -} -$old_file = head($files); -$new_file = last($files); - -$old_data = Filesystem::readFile($old_file); -$new_data = Filesystem::readFile($new_file); - -$engine = new PhutilProseDifferenceEngine(); - -$prose_diff = $engine->getDiff($old_data, $new_data); - -foreach ($prose_diff->getParts() as $part) { - switch ($part['type']) { - case '-': - echo tsprintf('%B', $part['text']); - break; - case '+': - echo tsprintf('%B', $part['text']); - break; - case '=': - echo tsprintf('%B', $part['text']); - break; - } -} diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -297,9 +297,6 @@ 'PhutilProcessRef' => 'filesystem/PhutilProcessRef.php', 'PhutilProcessRefTestCase' => 'filesystem/__tests__/PhutilProcessRefTestCase.php', 'PhutilProgressSink' => 'progress/PhutilProgressSink.php', - 'PhutilProseDiff' => 'utils/PhutilProseDiff.php', - 'PhutilProseDiffTestCase' => 'utils/__tests__/PhutilProseDiffTestCase.php', - 'PhutilProseDifferenceEngine' => 'utils/PhutilProseDifferenceEngine.php', 'PhutilProtocolChannel' => 'channel/PhutilProtocolChannel.php', 'PhutilProxyException' => 'error/PhutilProxyException.php', 'PhutilProxyIterator' => 'utils/PhutilProxyIterator.php', @@ -841,9 +838,6 @@ 'PhutilProcessRef' => 'Phobject', 'PhutilProcessRefTestCase' => 'PhutilTestCase', 'PhutilProgressSink' => 'Phobject', - 'PhutilProseDiff' => 'Phobject', - 'PhutilProseDiffTestCase' => 'PhutilTestCase', - 'PhutilProseDifferenceEngine' => 'Phobject', 'PhutilProtocolChannel' => 'PhutilChannelChannel', 'PhutilProxyException' => 'Exception', 'PhutilProxyIterator' => array( diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php deleted file mode 100644 --- a/src/utils/PhutilProseDiff.php +++ /dev/null @@ -1,292 +0,0 @@ -parts[] = array( - 'type' => $type, - 'text' => $text, - ); - return $this; - } - - public function getParts() { - return $this->parts; - } - - /** - * Get diff parts, but replace large blocks of unchanged text with "." - * parts representing missing context. - */ - public function getSummaryParts() { - $parts = $this->getParts(); - - $head_key = head_key($parts); - $last_key = last_key($parts); - - $results = array(); - foreach ($parts as $key => $part) { - $is_head = ($key == $head_key); - $is_last = ($key == $last_key); - - switch ($part['type']) { - case '=': - $pieces = $this->splitTextForSummary($part['text']); - - if ($is_head || $is_last) { - $need = 2; - } else { - $need = 3; - } - - // We don't have enough pieces to omit anything, so just continue. - if (count($pieces) < $need) { - $results[] = $part; - break; - } - - if (!$is_head) { - $results[] = array( - 'type' => '=', - 'text' => head($pieces), - ); - } - - $results[] = array( - 'type' => '.', - 'text' => null, - ); - - if (!$is_last) { - $results[] = array( - 'type' => '=', - 'text' => last($pieces), - ); - } - break; - default: - $results[] = $part; - break; - } - } - - return $results; - } - - - public function reorderParts() { - // Reorder sequences of removed and added sections to put all the "-" - // parts together first, then all the "+" parts together. This produces - // a more human-readable result than intermingling them. - - $o_run = array(); - $n_run = array(); - $result = array(); - foreach ($this->parts as $part) { - $type = $part['type']; - switch ($type) { - case '-': - $o_run[] = $part; - break; - case '+': - $n_run[] = $part; - break; - default: - if ($o_run || $n_run) { - foreach ($this->combineRuns($o_run, $n_run) as $merged_part) { - $result[] = $merged_part; - } - $o_run = array(); - $n_run = array(); - } - $result[] = $part; - break; - } - } - - if ($o_run || $n_run) { - foreach ($this->combineRuns($o_run, $n_run) as $part) { - $result[] = $part; - } - } - - // Now, combine consecuitive runs of the same type of change (like a - // series of "-" parts) into a single run. - $combined = array(); - - $last = null; - $last_text = null; - foreach ($result as $part) { - $type = $part['type']; - - if ($last !== $type) { - if ($last !== null) { - $combined[] = array( - 'type' => $last, - 'text' => $last_text, - ); - } - $last_text = null; - $last = $type; - } - - $last_text .= $part['text']; - } - - if ($last_text !== null) { - $combined[] = array( - 'type' => $last, - 'text' => $last_text, - ); - } - - $this->parts = $combined; - - return $this; - } - - private function combineRuns($o_run, $n_run) { - $o_merge = $this->mergeParts($o_run); - $n_merge = $this->mergeParts($n_run); - - // When removed and added blocks share a prefix or suffix, we sometimes - // want to count it as unchanged (for example, if it is whitespace) but - // sometimes want to count it as changed (for example, if it is a word - // suffix like "ing"). Find common prefixes and suffixes of these layout - // characters and emit them as "=" (unchanged) blocks. - - $layout_characters = array( - ' ' => true, - "\n" => true, - '.' => true, - '!' => true, - ',' => true, - '?' => true, - ']' => true, - '[' => true, - '(' => true, - ')' => true, - '<' => true, - '>' => true, - ); - - $o_text = $o_merge['text']; - $n_text = $n_merge['text']; - $o_len = strlen($o_text); - $n_len = strlen($n_text); - $min_len = min($o_len, $n_len); - - $prefix_len = 0; - for ($pos = 0; $pos < $min_len; $pos++) { - $o = $o_text[$pos]; - $n = $n_text[$pos]; - if ($o !== $n) { - break; - } - if (empty($layout_characters[$o])) { - break; - } - $prefix_len++; - } - - $suffix_len = 0; - for ($pos = 0; $pos < ($min_len - $prefix_len); $pos++) { - $o = $o_text[$o_len - ($pos + 1)]; - $n = $n_text[$n_len - ($pos + 1)]; - if ($o !== $n) { - break; - } - if (empty($layout_characters[$o])) { - break; - } - $suffix_len++; - } - - $results = array(); - - if ($prefix_len) { - $results[] = array( - 'type' => '=', - 'text' => substr($o_text, 0, $prefix_len), - ); - } - - if ($prefix_len < $o_len) { - $results[] = array( - 'type' => '-', - 'text' => substr( - $o_text, - $prefix_len, - $o_len - $prefix_len - $suffix_len), - ); - } - - if ($prefix_len < $n_len) { - $results[] = array( - 'type' => '+', - 'text' => substr( - $n_text, - $prefix_len, - $n_len - $prefix_len - $suffix_len), - ); - } - - if ($suffix_len) { - $results[] = array( - 'type' => '=', - 'text' => substr($o_text, -$suffix_len), - ); - } - - return $results; - } - - private function mergeParts(array $parts) { - $text = ''; - $type = null; - foreach ($parts as $part) { - $part_type = $part['type']; - if ($type === null) { - $type = $part_type; - } - if ($type !== $part_type) { - throw new Exception(pht('Can not merge parts of dissimilar types!')); - } - $text .= $part['text']; - } - - return array( - 'type' => $type, - 'text' => $text, - ); - } - - private function splitTextForSummary($text) { - $matches = null; - - $ok = preg_match('/^(\n*[^\n]+)\n/', $text, $matches); - if (!$ok) { - return array($text); - } - - $head = $matches[1]; - $text = substr($text, strlen($head)); - - $ok = preg_match('/\n([^\n]+\n*)\z/', $text, $matches); - if (!$ok) { - return array($text); - } - - $last = $matches[1]; - $text = substr($text, 0, -strlen($last)); - - if (!strlen(trim($text))) { - return array($head, $last); - } else { - return array($head, $text, $last); - } - } - -} diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php deleted file mode 100644 --- a/src/utils/PhutilProseDifferenceEngine.php +++ /dev/null @@ -1,209 +0,0 @@ -buildDiff($u, $v, 0); - } - - private function buildDiff($u, $v, $level) { - $u_parts = $this->splitCorpus($u, $level); - $v_parts = $this->splitCorpus($v, $level); - - $matrix = id(new PhutilEditDistanceMatrix()) - ->setMaximumLength(128) - ->setSequences($u_parts, $v_parts) - ->setComputeString(true); - - // For word-level and character-level changes, smooth the output string - // to reduce the choppiness of the diff. - if ($level > 1) { - $matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL); - } - - $u_pos = 0; - $v_pos = 0; - - $edits = $matrix->getEditString(); - $edits_length = strlen($edits); - - $diff = new PhutilProseDiff(); - for ($ii = 0; $ii < $edits_length; $ii++) { - $c = $edits[$ii]; - if ($c == 's') { - $diff->addPart('=', $u_parts[$u_pos]); - $u_pos++; - $v_pos++; - } else if ($c == 'd') { - $diff->addPart('-', $u_parts[$u_pos]); - $u_pos++; - } else if ($c == 'i') { - $diff->addPart('+', $v_parts[$v_pos]); - $v_pos++; - } else if ($c == 'x') { - $diff->addPart('-', $u_parts[$u_pos]); - $diff->addPart('+', $v_parts[$v_pos]); - $u_pos++; - $v_pos++; - } else { - throw new Exception( - pht( - 'Unexpected character ("%s") in edit string.', - $c)); - } - } - - $diff->reorderParts(); - - // If we just built a character-level diff, we're all done and do not - // need to go any deeper. - if ($level == 3) { - return $diff; - } - - $blocks = array(); - $block = null; - foreach ($diff->getParts() as $part) { - $type = $part['type']; - $text = $part['text']; - switch ($type) { - case '=': - if ($block) { - $blocks[] = $block; - $block = null; - } - $blocks[] = array( - 'type' => $type, - 'text' => $text, - ); - break; - case '-': - if (!$block) { - $block = array( - 'type' => '!', - 'old' => '', - 'new' => '', - ); - } - $block['old'] .= $text; - break; - case '+': - if (!$block) { - $block = array( - 'type' => '!', - 'old' => '', - 'new' => '', - ); - } - $block['new'] .= $text; - break; - } - } - - if ($block) { - $blocks[] = $block; - } - - $result = new PhutilProseDiff(); - foreach ($blocks as $block) { - $type = $block['type']; - if ($type == '=') { - $result->addPart('=', $block['text']); - } else { - $old = $block['old']; - $new = $block['new']; - if (!strlen($old) && !strlen($new)) { - // Nothing to do. - } else if (!strlen($old)) { - $result->addPart('+', $new); - } else if (!strlen($new)) { - $result->addPart('-', $old); - } else { - if ($matrix->didReachMaximumLength()) { - // If this text was too big to diff, don't try to subdivide it. - $result->addPart('-', $old); - $result->addPart('+', $new); - } else { - $subdiff = $this->buildDiff( - $old, - $new, - $level + 1); - - foreach ($subdiff->getParts() as $part) { - $result->addPart($part['type'], $part['text']); - } - } - } - } - } - - $result->reorderParts(); - - return $result; - } - - private function splitCorpus($corpus, $level) { - switch ($level) { - case 0: - // Level 0: Split into paragraphs. - $expr = '/([\n]+)/'; - break; - case 1: - // Level 1: Split into sentences. - $expr = '/([\n,!;?\.]+)/'; - break; - case 2: - // Level 2: Split into words. - $expr = '/(\s+)/'; - break; - case 3: - // Level 3: Split into characters. - return phutil_utf8v_combined($corpus); - } - - $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE); - return $this->stitchPieces($pieces, $level); - } - - private function stitchPieces(array $pieces, $level) { - $results = array(); - $count = count($pieces); - for ($ii = 0; $ii < $count; $ii += 2) { - $result = $pieces[$ii]; - if ($ii + 1 < $count) { - $result .= $pieces[$ii + 1]; - } - - if ($level < 2) { - // Split pieces into separate text and whitespace sections: make one - // piece out of all the whitespace at the beginning, one piece out of - // all the actual text in the middle, and one piece out of all the - // whitespace at the end. - - $matches = null; - preg_match('/^(\s*)(.*?)(\s*)\z/', $result, $matches); - - if (strlen($matches[1])) { - $results[] = $matches[1]; - } - if (strlen($matches[2])) { - $results[] = $matches[2]; - } - if (strlen($matches[3])) { - $results[] = $matches[3]; - } - } else { - $results[] = $result; - } - } - - // If the input ended with a delimiter, we can get an empty final piece. - // Just discard it. - if (last($results) == '') { - array_pop($results); - } - - return $results; - } - -} diff --git a/src/utils/__tests__/PhutilProseDiffTestCase.php b/src/utils/__tests__/PhutilProseDiffTestCase.php deleted file mode 100644 --- a/src/utils/__tests__/PhutilProseDiffTestCase.php +++ /dev/null @@ -1,245 +0,0 @@ -assertProseParts( - '', - '', - array(), - pht('Empty')); - - $this->assertProseParts( - "xxx\nyyy", - "xxx\nzzz\nyyy", - array( - "= xxx\n", - "+ zzz\n", - '= yyy', - ), - pht('Add Paragraph')); - - $this->assertProseParts( - "xxx\nzzz\nyyy", - "xxx\nyyy", - array( - "= xxx\n", - "- zzz\n", - '= yyy', - ), - pht('Remove Paragraph')); - - - // Without smoothing, the alogorithm identifies that "shark" and "cat" - // both contain the letter "a" and tries to express this as a very - // fine-grained edit which replaces "sh" with "c" and then "rk" with "t". - // This is technically correct, but it is much easier for human viewers to - // parse if we smooth this into a single removal and a single addition. - - $this->assertProseParts( - 'They say the shark has nine lives.', - 'They say the cat has nine lives.', - array( - '= They say the ', - '- shark', - '+ cat', - '= has nine lives.', - ), - pht('"Shark/cat" word edit smoothenss.')); - - $this->assertProseParts( - 'Rising quickly, she says', - 'Rising quickly, she remarks:', - array( - '= Rising quickly, she ', - '- says', - '+ remarks:', - ), - pht('"Says/remarks" word edit smoothenss.')); - - $this->assertProseParts( - 'See screenshots', - 'Viewed video files', - array( - '- See screenshots', - '+ Viewed video files', - ), - pht('Complete paragraph rewrite.')); - - $this->assertProseParts( - 'xaaax', - 'xbbbx', - array( - '- xaaax', - '+ xbbbx', - ), - pht('Whole word rewrite with common prefix and suffix.')); - - $this->assertProseParts( - ' aaa ', - ' bbb ', - array( - '= ', - '- aaa', - '+ bbb', - '= ', - ), - pht('Whole word rewrite with whitespace prefix and suffix.')); - - $this->assertSummaryProseParts( - "a\nb\nc\nd\ne\nf\ng\nh\n", - "a\nb\nc\nd\nX\nf\ng\nh\n", - array( - '.', - "= d\n", - '- e', - '+ X', - "= \nf", - '.', - ), - pht('Summary diff with middle change.')); - - $this->assertSummaryProseParts( - "a\nb\nc\nd\ne\nf\ng\nh\n", - "X\nb\nc\nd\ne\nf\ng\nh\n", - array( - '- a', - '+ X', - "= \nb", - '.', - ), - pht('Summary diff with head change.')); - - $this->assertSummaryProseParts( - "a\nb\nc\nd\ne\nf\ng\nh\n", - "a\nb\nc\nd\ne\nf\ng\nX\n", - array( - '.', - "= g\n", - '- h', - '+ X', - "= \n", - ), - pht('Summary diff with last change.')); - - $this->assertProseParts( - 'aaa aaa aaa aaa, bbb bbb bbb bbb.', - "aaa aaa aaa aaa, bbb bbb bbb bbb.\n\n- ccc ccc ccc", - array( - '= aaa aaa aaa aaa, bbb bbb bbb bbb.', - "+ \n\n- ccc ccc ccc", - ), - pht('Diff with new trailing content.')); - - $this->assertProseParts( - 'aaa aaa aaa aaa, bbb bbb bbb bbb.', - 'aaa aaa aaa aaa bbb bbb bbb bbb.', - array( - '= aaa aaa aaa aaa', - '- ,', - '= bbb bbb bbb bbb.', - ), - pht('Diff with a removed comma.')); - - $this->assertProseParts( - 'aaa aaa aaa aaa, bbb bbb bbb bbb.', - "aaa aaa aaa aaa bbb bbb bbb bbb.\n\n- ccc ccc ccc!", - array( - '= aaa aaa aaa aaa', - '- ,', - '= bbb bbb bbb bbb.', - "+ \n\n- ccc ccc ccc!", - ), - pht('Diff with a removed comma and new trailing content.')); - - $this->assertProseParts( - '[ ] Walnuts', - '[X] Walnuts', - array( - '= [', - '- ', - '+ X', - '= ] Walnuts', - ), - pht('Diff adding a tickmark to a checkbox list.')); - - $this->assertProseParts( - '[[ ./week49 ]]', - '[[ ./week50 ]]', - array( - '= [[ ./week', - '- 49', - '+ 50', - '= ]]', - ), - pht('Diff changing a remarkup wiki link target.')); - - // Create a large corpus with many sentences and paragraphs. - $large_paragraph = 'xyz. '; - $large_paragraph = str_repeat($large_paragraph, 50); - $large_paragraph = rtrim($large_paragraph); - - $large_corpus = $large_paragraph."\n\n"; - $large_corpus = str_repeat($large_corpus, 50); - $large_corpus = rtrim($large_corpus); - - $this->assertProseParts( - $large_corpus, - "aaa\n\n".$large_corpus."\n\nzzz", - array( - "+ aaa\n\n", - '= '.$large_corpus, - "+ \n\nzzz", - ), - pht('Adding initial and final lines to a large corpus.')); - - } - - private function assertProseParts($old, $new, array $expect_parts, $label) { - $engine = new PhutilProseDifferenceEngine(); - $diff = $engine->getDiff($old, $new); - - $parts = $diff->getParts(); - - $this->assertParts($expect_parts, $parts, $label); - } - - private function assertSummaryProseParts( - $old, - $new, - array $expect_parts, - $label) { - - $engine = new PhutilProseDifferenceEngine(); - $diff = $engine->getDiff($old, $new); - - $parts = $diff->getSummaryParts(); - - $this->assertParts($expect_parts, $parts, $label); - } - - private function assertParts( - array $expect, - array $actual_parts, - $label) { - - $actual = array(); - foreach ($actual_parts as $actual_part) { - $type = $actual_part['type']; - $text = $actual_part['text']; - - switch ($type) { - case '.': - $actual[] = $type; - break; - default: - $actual[] = "{$type} {$text}"; - break; - } - } - - $this->assertEqual($expect, $actual, $label); - } - - -}