Changeset View
Changeset View
Standalone View
Standalone View
scripts/utils/utf8.php
- This file was added.
| Property | Old Value | New Value |
|---|---|---|
| File Mode | null | 100755 |
| #!/usr/bin/env php | |||||
| <?php | |||||
| require_once dirname(dirname(__FILE__)).'/__init_script__.php'; | |||||
| $args = new PhutilArgumentParser($argv); | |||||
| $args->setTagline(pht('utf8 charset test script')); | |||||
| $args->setSynopsis(<<<EOHELP | |||||
| **utf8.php** [-C n] __file__ ... | |||||
| Show regions in files which are not valid UTF-8. With "-C n", | |||||
| show __n__ lines of context instead of the default of 3. Use | |||||
| "-" to read stdin. | |||||
| **utf8.php** --test __file__ ... | |||||
| Test for files which are not valid UTF-8. For example, this | |||||
| will find all ".php" files under the working directory which | |||||
| aren't valid UTF-8: | |||||
| find . -type f -name '*.php' | xargs -n256 ./utf8.php -t | |||||
| If the script exits with no output, all input files were | |||||
| valid UTF-8. | |||||
| EOHELP | |||||
| ); | |||||
| $args->parseStandardArguments(); | |||||
| $args->parse(array( | |||||
| array( | |||||
| 'name' => 'context', | |||||
| 'short' => 'C', | |||||
| 'param' => 'lines', | |||||
| 'default' => 3, | |||||
| 'help' => pht( | |||||
| 'Show __lines__ lines of context instead of the default 3.'), | |||||
| 'conflicts' => array( | |||||
| 'test' => pht('with %s, context is not shown.', '--test'), | |||||
| ), | |||||
| ), | |||||
| array( | |||||
| 'name' => 'test', | |||||
| 'short' => 't', | |||||
| 'help' => pht('Print file names containing invalid UTF-8 to stdout.'), | |||||
| ), | |||||
| array( | |||||
| 'name' => 'files', | |||||
| 'wildcard' => true, | |||||
| ), | |||||
| )); | |||||
| $is_test = $args->getArg('test'); | |||||
| $context = $args->getArg('context'); | |||||
| $files = $args->getArg('files'); | |||||
| if (empty($files)) { | |||||
| $args->printHelpAndExit(); | |||||
| } | |||||
| if ($is_test) { | |||||
| $err = test($files); | |||||
| } else { | |||||
| $err = show($files, $context); | |||||
| } | |||||
| exit($err); | |||||
| function read($file) { | |||||
| if ($file === '-') { | |||||
| return file_get_contents('php://stdin'); | |||||
| } else { | |||||
| return Filesystem::readFile($file); | |||||
| } | |||||
| } | |||||
| function name($file) { | |||||
| if ($file === '-') { | |||||
| return 'stdin'; | |||||
| } else { | |||||
| return $file; | |||||
| } | |||||
| } | |||||
| function test(array $files) { | |||||
| foreach ($files as $file) { | |||||
| $data = read($file); | |||||
| if (!phutil_is_utf8($data)) { | |||||
| echo name($file)."\n"; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| function show(array $files, $context) { | |||||
| foreach ($files as $file) { | |||||
| $data = read($file); | |||||
| $ok = phutil_is_utf8($data); | |||||
| if ($ok) { | |||||
| echo pht('OKAY'); | |||||
| } else { | |||||
| echo pht('FAIL'); | |||||
| } | |||||
| echo ' '.name($file)."\n"; | |||||
| if (!$ok) { | |||||
| $lines = explode("\n", $data); | |||||
| $len = count($lines); | |||||
| $map = array(); | |||||
| $bad = array(); | |||||
| foreach ($lines as $n => $line) { | |||||
| if (phutil_is_utf8($line)) { | |||||
| continue; | |||||
| } | |||||
| $bad[$n] = true; | |||||
| for ($jj = max(0, $n - $context); | |||||
| $jj < min($len, $n + 1 + $context); | |||||
| $jj++) { | |||||
| $map[$jj] = true; | |||||
| } | |||||
| } | |||||
| $width = strlen(max(array_keys($map))); | |||||
| // Set $last such that we print a newline on the first iteration through | |||||
| // the loop. | |||||
| $last = -2; | |||||
| foreach ($map as $idx => $ignored) { | |||||
| if ($idx !== $last + 1) { | |||||
| echo "\n"; | |||||
| } | |||||
| $last = $idx; | |||||
| $line = $lines[$idx]; | |||||
| if (!empty($bad[$idx])) { | |||||
| $line = show_problems($line); | |||||
| } | |||||
| printf(" % {$width}d %s\n", $idx + 1, $line); | |||||
| } | |||||
| echo "\n"; | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| function show_problems($line) { | |||||
| $regex = | |||||
| "/^(". | |||||
| "[\x01-\x7F]+". | |||||
| "|([\xC2-\xDF][\x80-\xBF])". | |||||
| "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])". | |||||
| "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))/"; | |||||
| $out = ''; | |||||
| while (strlen($line)) { | |||||
| $match = null; | |||||
| if (preg_match($regex, $line, $match)) { | |||||
| $out .= $match[1]; | |||||
| $line = substr($line, strlen($match[1])); | |||||
| } else { | |||||
| $chr = sprintf('<0x%0X>', ord($line[0])); | |||||
| $chr = phutil_console_format('##%s##', $chr); | |||||
| $out .= $chr; | |||||
| $line = substr($line, 1); | |||||
| } | |||||
| } | |||||
| return $out; | |||||
| } | |||||