diff --git a/scripts/utils/prosediff.php b/scripts/utils/prosediff.php new file mode 100755 --- /dev/null +++ b/scripts/utils/prosediff.php @@ -0,0 +1,50 @@ +#!/usr/bin/env php +setTagline(pht('show prose differences')); +$args->setSynopsis(<<parseStandardArguments(); +$args->parse( + array( + array( + 'name' => 'files', + 'wildcard' => true, + ), + )); + +$files = $args->getArg('files'); +if (count($files) !== 2) { + $args->printHelpAndExit(); +} +$old_file = head($files); +$new_file = last($files); + +$old_data = Filesystem::readFile($old_file); +$new_data = Filesystem::readFile($new_file); + +$engine = new PhutilProseDifferenceEngine(); + +$prose_diff = $engine->getDiff($old_data, $new_data); + +foreach ($prose_diff->getParts() as $part) { + switch ($part['type']) { + case '-': + echo tsprintf('%B', $part['text']); + break; + case '+': + echo tsprintf('%B', $part['text']); + break; + case '=': + echo tsprintf('%B', $part['text']); + break; + } +} diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -298,6 +298,8 @@ 'PhutilPortuguesePortugalLocale' => 'internationalization/locales/PhutilPortuguesePortugalLocale.php', 'PhutilPregsprintfTestCase' => 'xsprintf/__tests__/PhutilPregsprintfTestCase.php', 'PhutilProcessGroupDaemon' => 'daemon/torture/PhutilProcessGroupDaemon.php', + 'PhutilProseDiff' => 'utils/PhutilProseDiff.php', + 'PhutilProseDifferenceEngine' => 'utils/PhutilProseDifferenceEngine.php', 'PhutilProtocolChannel' => 'channel/PhutilProtocolChannel.php', 'PhutilProxyException' => 'error/PhutilProxyException.php', 'PhutilProxyIterator' => 'utils/PhutilProxyIterator.php', @@ -859,6 +861,8 @@ 'PhutilPortuguesePortugalLocale' => 'PhutilLocale', 'PhutilPregsprintfTestCase' => 'PhutilTestCase', 'PhutilProcessGroupDaemon' => 'PhutilTortureTestDaemon', + 'PhutilProseDiff' => 'Phobject', + 'PhutilProseDifferenceEngine' => 'Phobject', 'PhutilProtocolChannel' => 
'PhutilChannelChannel',
     'PhutilProxyException' => 'Exception',
     'PhutilProxyIterator' => array(
diff --git a/src/utils/PhutilProseDiff.php b/src/utils/PhutilProseDiff.php
new file mode 100644
--- /dev/null
+++ b/src/utils/PhutilProseDiff.php
@@ -0,0 +1,125 @@
+<?php
+
+/**
+ * Ordered list of the parts which make up a prose diff.
+ *
+ * Each part is a dictionary with a "type" key and a "text" key. The type
+ * "-" marks removed text and "+" marks added text; any other type (the
+ * difference engine uses "=") is treated as unchanged text.
+ */
+final class PhutilProseDiff extends Phobject {
+
+  private $parts = array();
+
+  /**
+   * Append a part to the end of the diff.
+   *
+   * @param string Part type, like "-", "+" or "=".
+   * @param string Text of the part.
+   * @return this
+   */
+  public function addPart($type, $text) {
+    $this->parts[] = array(
+      'type' => $type,
+      'text' => $text,
+    );
+    return $this;
+  }
+
+  /**
+   * Get the parts of this diff, in order.
+   *
+   * @return list<dict<string, string>> Parts, each with "type" and "text".
+   */
+  public function getParts() {
+    return $this->parts;
+  }
+
+  /**
+   * Rewrite the part list into a more readable form.
+   *
+   * Within each run of changes, all removed parts are moved ahead of all
+   * added parts. Adjacent parts of the same type are then merged into a
+   * single part by concatenating their text.
+   *
+   * @return this
+   */
+  public function reorderParts() {
+    // Reorder sequences of removed and added sections to put all the "-"
+    // parts together first, then all the "+" parts together. This produces
+    // a more human-readable result than intermingling them.
+    $o_run = array();
+    $n_run = array();
+    $result = array();
+    foreach ($this->parts as $part) {
+      $type = $part['type'];
+      switch ($type) {
+        case '-':
+          $o_run[] = $part;
+          break;
+        case '+':
+          $n_run[] = $part;
+          break;
+        default:
+          // An unchanged part ends the current run: flush it first.
+          foreach ($o_run as $o) {
+            $result[] = $o;
+          }
+          foreach ($n_run as $n) {
+            $result[] = $n;
+          }
+          $result[] = $part;
+          $o_run = array();
+          $n_run = array();
+          break;
+      }
+    }
+
+    // Flush whatever run is still pending at the end of the list.
+    foreach ($o_run as $o) {
+      $result[] = $o;
+    }
+    foreach ($n_run as $n) {
+      $result[] = $n;
+    }
+
+    // Now, combine consecutive runs of the same type of change (like a
+    // series of "-" parts) into a single run.
+    $combined = array();
+
+    $last = null;
+    $last_text = null;
+    foreach ($result as $part) {
+      $type = $part['type'];
+
+      if ($last !== $type) {
+        // Emit the run which just ended. Before the first part there is
+        // no run yet, so there is nothing to emit; without this check a
+        // bogus part with a null type and null text leads the result.
+        if ($last !== null) {
+          $combined[] = array(
+            'type' => $last,
+            'text' => $last_text,
+          );
+        }
+        $last_text = null;
+        $last = $type;
+      }
+
+      $last_text .= $part['text'];
+    }
+
+    // Emit the final run, if there were any parts at all.
+    if ($last_text !== null) {
+      $combined[] = array(
+        'type' => $last,
+        'text' => $last_text,
+      );
+    }
+
+    $this->parts = $combined;
+
+    return $this;
+  }
+
+}
diff --git a/src/utils/PhutilProseDifferenceEngine.php b/src/utils/PhutilProseDifferenceEngine.php
new file mode 100644
--- /dev/null
+++ b/src/utils/PhutilProseDifferenceEngine.php
@@ -0,0 +1,141 @@
+<?php
+
+/**
+ * Builds a @{class:PhutilProseDiff} between two blocks of text by comparing
+ * them at progressively finer granularity.
+ */
+final class PhutilProseDifferenceEngine extends Phobject {
+
+  /**
+   * Compute the prose diff which turns one text into another.
+   *
+   * @param string Old text.
+   * @param string New text.
+   * @return PhutilProseDiff Diff, with its parts reordered and combined.
+   */
+  public function getDiff($u, $v) {
+    $diff = new PhutilProseDiff();
+
+    $this->buildDiff($diff, $u, $v, 1);
+    $diff->reorderParts();
+
+    return $diff;
+  }
+
+  /**
+   * Append the differences between two texts to a diff.
+   *
+   * Both texts are split into pieces at the given level and the piece lists
+   * are compared. A piece which was replaced by another piece is compared
+   * again at the next level. Beyond the last level, the old text is added
+   * as a removal and the new text as an addition.
+   *
+   * @param PhutilProseDiff Diff to append parts to.
+   * @param string Old text.
+   * @param string New text.
+   * @param int Level to split at, see @{method:splitCorpus}.
+   * @return void
+   */
+  private function buildDiff(PhutilProseDiff $diff, $u, $v, $level) {
+    if ($level === 4) {
+      $diff->addPart('-', $u);
+      $diff->addPart('+', $v);
+      return;
+    }
+
+    $u_parts = $this->splitCorpus($u, $level);
+    $v_parts = $this->splitCorpus($v, $level);
+
+    $matrix = id(new PhutilEditDistanceMatrix())
+      ->setSequences($u_parts, $v_parts)
+      ->setComputeString(true);
+
+    $u_pos = 0;
+    $v_pos = 0;
+
+    $edits = $matrix->getEditString();
+    $edits_length = strlen($edits);
+
+    // Walk the edit string, keeping a cursor into each list of pieces.
+    for ($ii = 0; $ii < $edits_length; $ii++) {
+      $c = $edits[$ii];
+      if ($c === 's') {
+        $diff->addPart('=', $u_parts[$u_pos]);
+        $u_pos++;
+        $v_pos++;
+      } else if ($c === 'd') {
+        $diff->addPart('-', $u_parts[$u_pos]);
+        $u_pos++;
+      } else if ($c === 'i') {
+        $diff->addPart('+', $v_parts[$v_pos]);
+        $v_pos++;
+      } else if ($c === 'x') {
+        // The pieces differ: look for smaller differences inside them.
+        $this->buildDiff($diff, $u_parts[$u_pos], $v_parts[$v_pos], $level + 1);
+        $u_pos++;
+        $v_pos++;
+      } else {
+        throw new Exception(
+          pht(
+            'Unexpected character ("%s") in edit string.',
+            $c));
+      }
+    }
+  }
+
+  /**
+   * Split a text into pieces at a given level of granularity.
+   *
+   * @param string Text to split.
+   * @param int Level: 1 for sentences, 2 for words, 3 for characters.
+   * @return list<string> Pieces of the text, in order.
+   */
+  private function splitCorpus($corpus, $level) {
+    switch ($level) {
+      case 1:
+        // Level 1: Split into sentences.
+        $expr = '/([\n,!;?\.]+)/';
+        break;
+      case 2:
+        // Level 2: Split into words.
+        $expr = '/(\s+)/';
+        break;
+      case 3:
+        // Level 3: Split into characters.
+        return phutil_utf8v_combined($corpus);
+      default:
+        // Without this, "$expr" would be undefined below.
+        throw new Exception(
+          pht(
+            'Unexpected prose diff level ("%s").',
+            $level));
+    }
+
+    // Capture the delimiters too, so that no text is lost by splitting.
+    $pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);
+    return $this->stitchPieces($pieces);
+  }
+
+  /**
+   * Join each piece of text with the delimiter which follows it.
+   *
+   * @param list<string> Alternating text and delimiter pieces, beginning
+   *   with a text piece.
+   * @return list<string> Text pieces, each ending with its delimiter. The
+   *   last piece has no delimiter if the list has an odd length.
+   */
+  private function stitchPieces(array $pieces) {
+    $results = array();
+    $count = count($pieces);
+    for ($ii = 0; $ii < $count; $ii += 2) {
+      $result = $pieces[$ii];
+      if ($ii + 1 < $count) {
+        $result .= $pieces[$ii + 1];
+      }
+      $results[] = $result;
+    }
+
+    return $results;
+  }
+
+}