diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -662,6 +662,7 @@ 'DifferentialSubscribersCommitMessageField' => 'applications/differential/field/DifferentialSubscribersCommitMessageField.php', 'DifferentialSummaryCommitMessageField' => 'applications/differential/field/DifferentialSummaryCommitMessageField.php', 'DifferentialSummaryField' => 'applications/differential/customfield/DifferentialSummaryField.php', + 'DifferentialTabReplacementTestCase' => 'applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php', 'DifferentialTagsCommitMessageField' => 'applications/differential/field/DifferentialTagsCommitMessageField.php', 'DifferentialTasksCommitMessageField' => 'applications/differential/field/DifferentialTasksCommitMessageField.php', 'DifferentialTestPlanCommitMessageField' => 'applications/differential/field/DifferentialTestPlanCommitMessageField.php', @@ -6332,6 +6333,7 @@ 'DifferentialSubscribersCommitMessageField' => 'DifferentialCommitMessageField', 'DifferentialSummaryCommitMessageField' => 'DifferentialCommitMessageField', 'DifferentialSummaryField' => 'DifferentialCoreCustomField', + 'DifferentialTabReplacementTestCase' => 'PhabricatorTestCase', 'DifferentialTagsCommitMessageField' => 'DifferentialCommitMessageField', 'DifferentialTasksCommitMessageField' => 'DifferentialCommitMessageField', 'DifferentialTestPlanCommitMessageField' => 'DifferentialCommitMessageField', diff --git a/src/applications/differential/parser/DifferentialChangesetParser.php b/src/applications/differential/parser/DifferentialChangesetParser.php --- a/src/applications/differential/parser/DifferentialChangesetParser.php +++ b/src/applications/differential/parser/DifferentialChangesetParser.php @@ -1456,9 +1456,10 @@ $line = phutil_string_cast($line); - if (strpos($line, "\t") !== false) { - $line = $this->replaceTabsWithSpaces($line); - } + // TODO: This should be flexible, eventually. + $tab_width = 2; + + $line = self::replaceTabsWithSpaces($line, $tab_width); $line = str_replace($search, $replace, $line); if ($is_html) { @@ -1543,13 +1544,9 @@ return $rules; } - private function replaceTabsWithSpaces($line) { - // TODO: This should be flexible, eventually. - $tab_width = 2; - - static $tags; - if ($tags === null) { - $tags = array(); + public static function replaceTabsWithSpaces($line, $tab_width) { + static $tags = array(); + if (empty($tags[$tab_width])) { for ($ii = 1; $ii <= $tab_width; $ii++) { $tag = phutil_tag( 'span', @@ -1562,42 +1559,120 @@ } } - // If the line is particularly long, don't try to vectorize it. Use a - // faster approximation of the correct tabstop expansion instead. This - // usually still arrives at the right result. - if (strlen($line) > 256) { - return str_replace("\t", $tags[$tab_width], $line); + // Expand all prefix tabs until we encounter any non-tab character. This + // is cheap and often immediately produces the correct result with no + // further work (and, particularly, no need to handle any unicode cases). + + $len = strlen($line); + + $head = 0; + for ($head = 0; $head < $len; $head++) { + $char = $line[$head]; + if ($char !== "\t") { + break; + } + } + + if ($head) { + if (empty($tags[$tab_width * $head])) { + $tags[$tab_width * $head] = str_repeat($tags[$tab_width], $head); + } + $prefix = $tags[$tab_width * $head]; + $line = substr($line, $head); + } else { + $prefix = ''; + } + + // If we have no remaining tabs elsewhere in the string after taking care + // of all the prefix tabs, we're done. + if (strpos($line, "\t") === false) { + return $prefix.$line; + } + + $len = strlen($line); + + // If the line is particularly long, don't try to do anything special with + // it. Use a faster approximation of the correct tabstop expansion instead. + // This usually still arrives at the right result. + if ($len > 256) { + return $prefix.str_replace("\t", $tags[$tab_width], $line); } - $line = phutil_utf8v_combined($line); $in_tag = false; $pos = 0; - foreach ($line as $key => $char) { - if ($char === '<') { - $in_tag = true; - continue; - } - if ($char === '>') { - $in_tag = false; - continue; + // See PHI1210. If the line only has single-byte characters, we don't need + // to vectorize it and can avoid an expensive UTF8 call. + + $fast_path = preg_match('/^[\x01-\x7F]*\z/', $line); + if ($fast_path) { + $replace = array(); + for ($ii = 0; $ii < $len; $ii++) { + $char = $line[$ii]; + if ($char === '>') { + $in_tag = false; + continue; + } + + if ($in_tag) { + continue; + } + + if ($char === '<') { + $in_tag = true; + continue; + } + + if ($char === "\t") { + $count = $tab_width - ($pos % $tab_width); + $pos += $count; + $replace[$ii] = $tags[$count]; + continue; + } + + $pos++; } - if ($in_tag) { - continue; + if ($replace) { + // Apply replacements starting at the end of the string so they + // don't mess up the offsets for following replacements. + $replace = array_reverse($replace, true); + + foreach ($replace as $replace_pos => $replacement) { + $line = substr_replace($line, $replacement, $replace_pos, 1); + } } + } else { + $line = phutil_utf8v_combined($line); + foreach ($line as $key => $char) { + if ($char === '>') { + $in_tag = false; + continue; + } - if ($char === "\t") { - $count = $tab_width - ($pos % $tab_width); - $pos += $count; - $line[$key] = $tags[$count]; - continue; + if ($in_tag) { + continue; + } + + if ($char === '<') { + $in_tag = true; + continue; + } + + if ($char === "\t") { + $count = $tab_width - ($pos % $tab_width); + $pos += $count; + $line[$key] = $tags[$count]; + continue; + } + + $pos++; } - $pos++; + $line = implode('', $line); } - return implode('', $line); + return $prefix.$line; } } diff --git a/src/applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php b/src/applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php new file mode 100644 --- /dev/null +++ b/src/applications/differential/parser/__tests__/DifferentialTabReplacementTestCase.php @@ -0,0 +1,56 @@ + "; + $tab2 = " "; + + $cat = "\xF0\x9F\x90\xB1"; + + $cases = array( + '' => '', + 'x' => 'x', + + // Tabs inside HTML tags should not be replaced. + "<\t>x" => "<\t>x", + + // Normal tabs should be replaced. These are all aligned to the tab + // width, so they'll be replaced inline. + "\tx" => "{$tab2}x", + " \tx" => " {$tab2}x", + "\t x" => "{$tab2} x", + "aa\tx" => "aa{$tab2}x", + "aa \tx" => "aa {$tab2}x", + "aa\t x" => "aa{$tab2} x", + + // This tab is not tabstop-aligned, so it is replaced with fewer + // spaces to bring us to the next tabstop. + " \tx" => " {$tab1}x", + + // Text inside HTML tags should not count when aligning tabs with + // tabstops. + " \tx" => " {$tab1}x", + " \tx" => " {$tab1}x", + + // The code has to take a slow path when inputs contain unicode, but + // should produce the right results and align tabs to tabstops while + // respecting UTF8 display character widths, not byte widths. + "{$cat}\tx" => "{$cat}{$tab1}x", + "{$cat}{$cat}\tx" => "{$cat}{$cat}{$tab2}x", + ); + + foreach ($cases as $input => $expect) { + $actual = DifferentialChangesetParser::replaceTabsWithSpaces( + $input, + 2); + + $this->assertEqual( + $expect, + $actual, + pht('Tabs to Spaces: %s', $input)); + } + } + +}