diff --git a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryCommitChangeParserWorker.php b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryCommitChangeParserWorker.php index 9ff7f89234..35d88d2eca 100644 --- a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryCommitChangeParserWorker.php +++ b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryCommitChangeParserWorker.php @@ -1,77 +1,102 @@ getCommitIdentifier(); + $callsign = $repository->getCallsign(); + $full_name = 'r'.$callsign.$identifier; + + $this->log("Parsing %s...\n", $full_name); + if ($this->isBadCommit($full_name)) { + $this->log("This commit is marked bad!"); + $result = null; + } else { + $result = $this->parseCommitChanges($repository, $commit); + } + + $this->finishParse(); + + return $result; + } + public static function lookupOrCreatePaths(array $paths) { $repository = new PhabricatorRepository(); $conn_w = $repository->establishConnection('w'); $result_map = self::lookupPaths($paths); $missing_paths = array_fill_keys($paths, true); $missing_paths = array_diff_key($missing_paths, $result_map); $missing_paths = array_keys($missing_paths); if ($missing_paths) { foreach (array_chunk($missing_paths, 128) as $path_chunk) { $sql = array(); foreach ($path_chunk as $path) { $sql[] = qsprintf($conn_w, '(%s, %s)', $path, md5($path)); } queryfx( $conn_w, 'INSERT IGNORE INTO %T (path, pathHash) VALUES %Q', PhabricatorRepository::TABLE_PATH, implode(', ', $sql)); } $result_map += self::lookupPaths($missing_paths); } return $result_map; } private static function lookupPaths(array $paths) { $repository = new PhabricatorRepository(); $conn_w = $repository->establishConnection('w'); $result_map = array(); foreach (array_chunk($paths, 128) as $path_chunk) { $chunk_map = queryfx_all( $conn_w, 'SELECT path, id FROM %T WHERE pathHash IN (%Ls)', PhabricatorRepository::TABLE_PATH, array_map('md5', $path_chunk)); 
foreach ($chunk_map as $row) { $result_map[$row['path']] = $row['id']; } } return $result_map; } protected function finishParse() { $commit = $this->commit; $commit->writeImportStatusFlag( PhabricatorRepositoryCommit::IMPORTED_CHANGE); id(new PhabricatorSearchIndexer()) ->indexDocumentByPHID($commit->getPHID()); PhabricatorOwnersPackagePathValidator::updateOwnersPackagePaths($commit); if ($this->shouldQueueFollowupTasks()) { PhabricatorWorker::scheduleTask( 'PhabricatorRepositoryCommitOwnersWorker', array( 'commitID' => $commit->getID(), )); } } } diff --git a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryGitCommitChangeParserWorker.php b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryGitCommitChangeParserWorker.php index acc047464a..da2cdbf18f 100644 --- a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryGitCommitChangeParserWorker.php +++ b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryGitCommitChangeParserWorker.php @@ -1,279 +1,270 @@ getCallsign().$commit->getCommitIdentifier(); - echo "Parsing {$full_name}...\n"; - if ($this->isBadCommit($full_name)) { - echo "This commit is marked bad!\n"; - return; - } - // Check if the commit has parents. We're testing to see whether it is the // first commit in history (in which case we must use "git log") or some // other commit (in which case we can use "git diff"). We'd rather use // "git diff" because it has the right behavior for merge commits, but // it requires the commit to have a parent that we can diff against. The // first commit doesn't, so "commit^" is not a valid ref. list($parents) = $repository->execxLocalCommand( 'log -n1 --format=%s %s', '%P', $commit->getCommitIdentifier()); $use_log = !strlen(trim($parents)); if ($use_log) { // This is the first commit so we need to use "log". We know it's not a // merge commit because it couldn't be merging anything, so this is safe. 
// NOTE: "--pretty=format: " is to disable diff output, we only want the // part we get from "--raw". list($raw) = $repository->execxLocalCommand( 'log -n1 -M -C -B --find-copies-harder --raw -t '. '--pretty=format: --abbrev=40 %s', $commit->getCommitIdentifier()); } else { // Otherwise, we can use "diff", which will give us output for merges. // We diff against the first parent, as this is generally the expectation // and results in sensible behavior. list($raw) = $repository->execxLocalCommand( 'diff -n1 -M -C -B --find-copies-harder --raw -t '. '--abbrev=40 %s^1 %s', $commit->getCommitIdentifier(), $commit->getCommitIdentifier()); } $changes = array(); $move_away = array(); $copy_away = array(); $lines = explode("\n", $raw); foreach ($lines as $line) { if (!strlen(trim($line))) { continue; } list($old_mode, $new_mode, $old_hash, $new_hash, $more_stuff) = preg_split('/ +/', $line, 5); // We may only have two pieces here. list($action, $src_path, $dst_path) = array_merge( explode("\t", $more_stuff), array(null)); // Normalize the paths for consistency with the SVN workflow. $src_path = '/'.$src_path; if ($dst_path) { $dst_path = '/'.$dst_path; } $old_mode = intval($old_mode, 8); $new_mode = intval($new_mode, 8); switch ($new_mode & 0160000) { case 0160000: $file_type = DifferentialChangeType::FILE_SUBMODULE; break; case 0120000: $file_type = DifferentialChangeType::FILE_SYMLINK; break; case 0040000: $file_type = DifferentialChangeType::FILE_DIRECTORY; break; default: $file_type = DifferentialChangeType::FILE_NORMAL; break; } // TODO: We can detect binary changes as git does, through a combination // of running 'git check-attr' for stuff like 'binary', 'merge' or 'diff', // and by falling back to inspecting the first 8,000 characters of the // buffer for null bytes (this is seriously git's algorithm, see // buffer_is_binary() in xdiff-interface.c). 
$change_type = null; $change_path = $src_path; $change_target = null; $is_direct = true; switch ($action[0]) { case 'A': $change_type = DifferentialChangeType::TYPE_ADD; break; case 'D': $change_type = DifferentialChangeType::TYPE_DELETE; break; case 'C': $change_type = DifferentialChangeType::TYPE_COPY_HERE; $change_path = $dst_path; $change_target = $src_path; $copy_away[$change_target][] = $change_path; break; case 'R': $change_type = DifferentialChangeType::TYPE_MOVE_HERE; $change_path = $dst_path; $change_target = $src_path; $move_away[$change_target][] = $change_path; break; case 'T': // Type of the file changed, fall through and treat it as a // modification. Not 100% sure this is the right thing to do but it // seems reasonable. case 'M': if ($file_type == DifferentialChangeType::FILE_DIRECTORY) { $change_type = DifferentialChangeType::TYPE_CHILD; $is_direct = false; } else { $change_type = DifferentialChangeType::TYPE_CHANGE; } break; // NOTE: "U" (unmerged) and "X" (unknown) statuses are also possible // in theory but shouldn't appear here. default: throw new Exception("Failed to parse line '{$line}'."); } $changes[$change_path] = array( 'repositoryID' => $repository->getID(), 'commitID' => $commit->getID(), 'path' => $change_path, 'changeType' => $change_type, 'fileType' => $file_type, 'isDirect' => $is_direct, 'commitSequence' => $commit->getEpoch(), 'targetPath' => $change_target, 'targetCommitID' => $change_target ? $commit->getID() : null, ); } // Add a change to '/' since git doesn't mention it. 
$changes['/'] = array( 'repositoryID' => $repository->getID(), 'commitID' => $commit->getID(), 'path' => '/', 'changeType' => DifferentialChangeType::TYPE_CHILD, 'fileType' => DifferentialChangeType::FILE_DIRECTORY, 'isDirect' => false, 'commitSequence' => $commit->getEpoch(), 'targetPath' => null, 'targetCommitID' => null, ); foreach ($copy_away as $change_path => $destinations) { if (isset($move_away[$change_path])) { $change_type = DifferentialChangeType::TYPE_MULTICOPY; $is_direct = true; unset($move_away[$change_path]); } else { $change_type = DifferentialChangeType::TYPE_COPY_AWAY; // This change is direct if we picked up a modification above (i.e., // the original copy source was also edited). Otherwise the original // wasn't touched, so leave it as an indirect change. $is_direct = isset($changes[$change_path]); } $reference = $changes[reset($destinations)]; $changes[$change_path] = array( 'repositoryID' => $repository->getID(), 'commitID' => $commit->getID(), 'path' => $change_path, 'changeType' => $change_type, 'fileType' => $reference['fileType'], 'isDirect' => $is_direct, 'commitSequence' => $commit->getEpoch(), 'targetPath' => null, 'targetCommitID' => null, ); } foreach ($move_away as $change_path => $destinations) { $reference = $changes[reset($destinations)]; $changes[$change_path] = array( 'repositoryID' => $repository->getID(), 'commitID' => $commit->getID(), 'path' => $change_path, 'changeType' => DifferentialChangeType::TYPE_MOVE_AWAY, 'fileType' => $reference['fileType'], 'isDirect' => true, 'commitSequence' => $commit->getEpoch(), 'targetPath' => null, 'targetCommitID' => null, ); } $paths = array(); foreach ($changes as $change) { $paths[$change['path']] = true; if ($change['targetPath']) { $paths[$change['targetPath']] = true; } } $path_map = $this->lookupOrCreatePaths(array_keys($paths)); foreach ($changes as $key => $change) { $changes[$key]['pathID'] = $path_map[$change['path']]; if ($change['targetPath']) { $changes[$key]['targetPathID'] 
= $path_map[$change['targetPath']]; } else { $changes[$key]['targetPathID'] = null; } } $conn_w = $repository->establishConnection('w'); $changes_sql = array(); foreach ($changes as $change) { $values = array( (int)$change['repositoryID'], (int)$change['pathID'], (int)$change['commitID'], $change['targetPathID'] ? (int)$change['targetPathID'] : 'null', $change['targetCommitID'] ? (int)$change['targetCommitID'] : 'null', (int)$change['changeType'], (int)$change['fileType'], (int)$change['isDirect'], (int)$change['commitSequence'], ); $changes_sql[] = '('.implode(', ', $values).')'; } queryfx( $conn_w, 'DELETE FROM %T WHERE commitID = %d', PhabricatorRepository::TABLE_PATHCHANGE, $commit->getID()); foreach (array_chunk($changes_sql, 256) as $sql_chunk) { queryfx( $conn_w, 'INSERT INTO %T (repositoryID, pathID, commitID, targetPathID, targetCommitID, changeType, fileType, isDirect, commitSequence) VALUES %Q', PhabricatorRepository::TABLE_PATHCHANGE, implode(', ', $sql_chunk)); } - - $this->finishParse(); } } diff --git a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php index d0444c3547..14360b6f7f 100644 --- a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php +++ b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositoryMercurialCommitChangeParserWorker.php @@ -1,339 +1,330 @@ getCallsign().$commit->getCommitIdentifier(); - echo "Parsing {$full_name}...\n"; - if ($this->isBadCommit($full_name)) { - echo "This commit is marked bad!\n"; - return; - } - list($stdout) = $repository->execxLocalCommand( 'status -C --change %s', $commit->getCommitIdentifier()); $status = ArcanistMercurialParser::parseMercurialStatusDetails($stdout); $common_attributes = array( 'repositoryID' => $repository->getID(), 'commitID' => 
$commit->getID(), 'commitSequence' => $commit->getEpoch(), ); $changes = array(); // Like Git, Mercurial doesn't track directories directly. We need to infer // directory creation and removal by observing file creation and removal // and testing if the directories in question are previously empty (thus, // created) or subsequently empty (thus, removed). $maybe_new_directories = array(); $maybe_del_directories = array(); $all_directories = array(); // Parse the basic information from "hg status", which shows files that // were directly affected by the change. foreach ($status as $path => $path_info) { $path = '/'.$path; $flags = $path_info['flags']; $change_target = $path_info['from'] ? '/'.$path_info['from'] : null; $changes[$path] = array( 'path' => $path, 'isDirect' => true, 'targetPath' => $change_target, 'targetCommitID' => $change_target ? $commit->getID() : null, // We're going to fill these in shortly. 'changeType' => null, 'fileType' => null, 'flags' => $flags, ) + $common_attributes; if ($flags & ArcanistRepositoryAPI::FLAG_ADDED) { $maybe_new_directories[] = dirname($path); } else if ($flags & ArcanistRepositoryAPI::FLAG_DELETED) { $maybe_del_directories[] = dirname($path); } $all_directories[] = dirname($path); } // Add change information for each source path which doesn't appear in the // status. These files were copied, but were not modified. We also know they // must exist. foreach ($changes as $path => $change) { $from = $change['targetPath']; if ($from && empty($changes[$from])) { $changes[$from] = array( 'path' => $from, 'isDirect' => false, 'targetPath' => null, 'targetCommitID' => null, 'changeType' => DifferentialChangeType::TYPE_COPY_AWAY, 'fileType' => null, 'flags' => 0, ) + $common_attributes; } } $away = array(); foreach ($changes as $path => $change) { $target_path = $change['targetPath']; if ($target_path) { $away[$target_path][] = $path; } } // Now that we have all the direct changes, figure out change types. 
foreach ($changes as $path => $change) { $flags = $change['flags']; $from = $change['targetPath']; if ($from) { $target = $changes[$from]; } else { $target = null; } if ($flags & ArcanistRepositoryAPI::FLAG_ADDED) { if ($target) { if ($target['flags'] & ArcanistRepositoryAPI::FLAG_DELETED) { $change_type = DifferentialChangeType::TYPE_MOVE_HERE; } else { $change_type = DifferentialChangeType::TYPE_COPY_HERE; } } else { $change_type = DifferentialChangeType::TYPE_ADD; } } else if ($flags & ArcanistRepositoryAPI::FLAG_DELETED) { if (isset($away[$path])) { if (count($away[$path]) > 1) { $change_type = DifferentialChangeType::TYPE_MULTICOPY; } else { $change_type = DifferentialChangeType::TYPE_MOVE_AWAY; } } else { $change_type = DifferentialChangeType::TYPE_DELETE; } } else { if (isset($away[$path])) { $change_type = DifferentialChangeType::TYPE_COPY_AWAY; } else { $change_type = DifferentialChangeType::TYPE_CHANGE; } } $changes[$path]['changeType'] = $change_type; } // Go through all the affected directories and identify any which were // actually added or deleted. $dir_status = array(); foreach ($maybe_del_directories as $dir) { $exists = false; foreach (DiffusionPathIDQuery::expandPathToRoot($dir) as $path) { if (isset($dir_status[$path])) { break; } // If we know some child exists, we know this path exists. If we don't // know that a child exists, test if this directory still exists. if (!$exists) { $exists = $this->mercurialPathExists( $repository, $path, $commit->getCommitIdentifier()); } if ($exists) { $dir_status[$path] = DifferentialChangeType::TYPE_CHILD; } else { $dir_status[$path] = DifferentialChangeType::TYPE_DELETE; } } } list($stdout) = $repository->execxLocalCommand( 'parents --rev %s --style default', $commit->getCommitIdentifier()); $parents = ArcanistMercurialParser::parseMercurialLog($stdout); $parent = reset($parents); if ($parent) { // TODO: We should expand this to a full 40-character hash using "hg id". 
$parent = $parent['rev']; } foreach ($maybe_new_directories as $dir) { $exists = false; foreach (DiffusionPathIDQuery::expandPathToRoot($dir) as $path) { if (isset($dir_status[$path])) { break; } if (!$exists) { if ($parent) { $exists = $this->mercurialPathExists($repository, $path, $parent); } else { $exists = false; } } if ($exists) { $dir_status[$path] = DifferentialChangeType::TYPE_CHILD; } else { $dir_status[$path] = DifferentialChangeType::TYPE_ADD; } } } foreach ($all_directories as $dir) { foreach (DiffusionPathIDQuery::expandPathToRoot($dir) as $path) { if (isset($dir_status[$path])) { break; } $dir_status[$path] = DifferentialChangeType::TYPE_CHILD; } } // Merge all the directory statuses into the path statuses. foreach ($dir_status as $path => $status) { if (isset($changes[$path])) { // TODO: The UI probably doesn't handle any of these cases with // terrible elegance, but they are exceedingly rare. $existing_type = $changes[$path]['changeType']; if ($existing_type == DifferentialChangeType::TYPE_DELETE) { // This change removes a file, replaces it with a directory, and then // adds children of that directory. Mark it as a "change" instead, // and make the type a directory. $changes[$path]['fileType'] = DifferentialChangeType::FILE_DIRECTORY; $changes[$path]['changeType'] = DifferentialChangeType::TYPE_CHANGE; } else if ($existing_type == DifferentialChangeType::TYPE_MOVE_AWAY || $existing_type == DifferentialChangeType::TYPE_MULTICOPY) { // This change moves or copies a file, replaces it with a directory, // and then adds children to that directory. Mark it as "copy away" // instead of whatever it was, and make the type a directory. $changes[$path]['fileType'] = DifferentialChangeType::FILE_DIRECTORY; $changes[$path]['changeType'] = DifferentialChangeType::TYPE_COPY_AWAY; } else if ($existing_type == DifferentialChangeType::TYPE_ADD) { // This change removes a directory and replaces it with a file. Mark // it as "change" instead of "add". 
$changes[$path]['changeType'] = DifferentialChangeType::TYPE_CHANGE; } continue; } $changes[$path] = array( 'path' => $path, 'isDirect' => ($status == DifferentialChangeType::TYPE_CHILD) ? false : true, 'fileType' => DifferentialChangeType::FILE_DIRECTORY, 'changeType' => $status, 'targetPath' => null, 'targetCommitID' => null, ) + $common_attributes; } // TODO: use "hg diff --git" to figure out which files are symlinks. foreach ($changes as $path => $change) { if (empty($change['fileType'])) { $changes[$path]['fileType'] = DifferentialChangeType::FILE_NORMAL; } } $all_paths = array(); foreach ($changes as $path => $change) { $all_paths[$path] = true; if ($change['targetPath']) { $all_paths[$change['targetPath']] = true; } } $path_map = $this->lookupOrCreatePaths(array_keys($all_paths)); foreach ($changes as $key => $change) { $changes[$key]['pathID'] = $path_map[$change['path']]; if ($change['targetPath']) { $changes[$key]['targetPathID'] = $path_map[$change['targetPath']]; } else { $changes[$key]['targetPathID'] = null; } } $conn_w = $repository->establishConnection('w'); $changes_sql = array(); foreach ($changes as $change) { $values = array( (int)$change['repositoryID'], (int)$change['pathID'], (int)$change['commitID'], $change['targetPathID'] ? (int)$change['targetPathID'] : 'null', $change['targetCommitID'] ? 
(int)$change['targetCommitID'] : 'null', (int)$change['changeType'], (int)$change['fileType'], (int)$change['isDirect'], (int)$change['commitSequence'], ); $changes_sql[] = '('.implode(', ', $values).')'; } queryfx( $conn_w, 'DELETE FROM %T WHERE commitID = %d', PhabricatorRepository::TABLE_PATHCHANGE, $commit->getID()); foreach (array_chunk($changes_sql, 256) as $sql_chunk) { queryfx( $conn_w, 'INSERT INTO %T (repositoryID, pathID, commitID, targetPathID, targetCommitID, changeType, fileType, isDirect, commitSequence) VALUES %Q', PhabricatorRepository::TABLE_PATHCHANGE, implode(', ', $sql_chunk)); } - - $this->finishParse(); } private function mercurialPathExists( PhabricatorRepository $repository, $path, $rev) { if ($path == '/') { return true; } // NOTE: For directories, this grabs the entire directory contents, but // we don't have any more surgical approach available to us in Mercurial. // We can't use "log" because it doesn't have enough information for us // to figure out when a directory is deleted by a change. 
list($err) = $repository->execLocalCommand( 'cat --rev %s -- %s > /dev/null', $rev, $path); if ($err) { return false; } else { return true; } } } diff --git a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php index 32bd2c7ea6..386458c7b0 100644 --- a/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php +++ b/src/applications/repository/worker/commitchangeparser/PhabricatorRepositorySvnCommitChangeParserWorker.php @@ -1,790 +1,779 @@ getDetail('remote-uri'); $svn_commit = $commit->getCommitIdentifier(); - $callsign = $repository->getCallsign(); - $full_name = 'r'.$callsign.$svn_commit; - echo "Parsing {$full_name}...\n"; - - if ($this->isBadCommit($full_name)) { - echo "This commit is marked bad!\n"; - return; - } - // Pull the top-level path changes out of "svn log". This is pretty // straightforward; just parse the XML log. $log = $this->getSVNLogXMLObject($uri, $svn_commit, $verbose = true); $entry = $log->logentry[0]; if (!$entry->paths) { // TODO: Explicitly mark this commit as broken elsewhere? This isn't // supposed to happen but we have some cases like rE27 and rG935 in the // Facebook repositories where things got all clowned up. 
return; } $raw_paths = array(); foreach ($entry->paths->path as $path) { $name = trim((string)$path); $raw_paths[$name] = array( 'rawPath' => $name, 'rawTargetPath' => (string)$path['copyfrom-path'], 'rawChangeType' => (string)$path['action'], 'rawTargetCommit' => (string)$path['copyfrom-rev'], ); } $copied_or_moved_map = array(); $deleted_paths = array(); $add_paths = array(); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawTargetPath']) { $copied_or_moved_map[$raw_info['rawTargetPath']][] = $raw_info; } switch ($raw_info['rawChangeType']) { case 'D': $deleted_paths[$path] = $raw_info; break; case 'A': case 'R': $add_paths[$path] = $raw_info; break; } } // If a path was deleted, we need to look in the repository history to // figure out where the former valid location for it is so we can figure out // if it was a directory or not, among other things. $lookup_here = array(); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawChangeType'] != 'D') { continue; } // If a change copies a directory and then deletes something from it, // we need to look at the old location for information about the path, not // the new location. This workflow is pretty ridiculous -- so much so that // Trac gets it wrong. See Facebook rO6 for an example, if you happen to // work at Facebook. $parents = $this->expandAllParentPaths($path, $include_self = true); foreach ($parents as $parent) { if (isset($add_paths[$parent])) { $relative_path = substr($path, strlen($parent)); $lookup_here[$path] = array( 'rawPath' => $add_paths[$parent]['rawTargetPath'].$relative_path, 'rawCommit' => $add_paths[$parent]['rawTargetCommit'], ); continue 2; } } // Otherwise we can just look at the previous revision. 
$lookup_here[$path] = array( 'rawPath' => $path, 'rawCommit' => $svn_commit - 1, ); } $lookup = array(); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawChangeType'] == 'D') { $lookup[$path] = $lookup_here[$path]; } else { // For everything that wasn't deleted, we can just look it up directly. $lookup[$path] = array( 'rawPath' => $path, 'rawCommit' => $svn_commit, ); } } $effects = array(); $path_file_types = $this->lookupPathFileTypes($repository, $lookup); foreach ($raw_paths as $path => $raw_info) { if ($raw_info['rawChangeType'] == 'D' && $path_file_types[$path] == DifferentialChangeType::FILE_DIRECTORY) { // Bad. Child paths aren't enumerated in "svn log" so we need // to go fishing. $list = $this->lookupRecursiveFileList( $repository, $lookup[$path]); foreach ($list as $deleted_path => $path_file_type) { $deleted_path = rtrim($path.'/'.$deleted_path, '/'); if (!empty($raw_paths[$deleted_path])) { // We somehow learned about this deletion explicitly? // TODO: Unclear how this is possible. 
continue; } $effect_type = DifferentialChangeType::TYPE_DELETE; $effect_target_path = null; if (isset($copied_or_moved_map[$deleted_path])) { $effect_target_path = $path; if (count($copied_or_moved_map[$deleted_path]) > 1) { $effect_type = DifferentialChangeType::TYPE_MULTICOPY; } else { $effect_type = DifferentialChangeType::TYPE_MOVE_AWAY; } } $effects[$deleted_path] = array( 'rawPath' => $deleted_path, 'rawTargetPath' => $effect_target_path, 'rawTargetCommit' => null, 'rawDirect' => true, 'changeType' => $effect_type, 'fileType' => $path_file_type, ); $deleted_paths[$deleted_path] = $effects[$deleted_path]; } } } $resolved_types = array(); $supplemental = array(); foreach ($raw_paths as $path => $raw_info) { if (isset($resolved_types[$path])) { $type = $resolved_types[$path]; } else { switch ($raw_info['rawChangeType']) { case 'D': if (isset($copied_or_moved_map[$path])) { if (count($copied_or_moved_map[$path]) > 1) { $type = DifferentialChangeType::TYPE_MULTICOPY; } else { $type = DifferentialChangeType::TYPE_MOVE_AWAY; } } else { $type = DifferentialChangeType::TYPE_DELETE; } break; case 'A': $copy_from = $raw_info['rawTargetPath']; $copy_rev = $raw_info['rawTargetCommit']; if (!strlen($copy_from)) { $type = DifferentialChangeType::TYPE_ADD; } else { if (isset($deleted_paths[$copy_from])) { $type = DifferentialChangeType::TYPE_MOVE_HERE; $other_type = DifferentialChangeType::TYPE_MOVE_AWAY; } else { $type = DifferentialChangeType::TYPE_COPY_HERE; $other_type = DifferentialChangeType::TYPE_COPY_AWAY; } $source_file_type = $this->lookupPathFileType( $repository, $copy_from, array( 'rawPath' => $copy_from, 'rawCommit' => $copy_rev, )); if ($source_file_type == DifferentialChangeType::FILE_DELETED) { throw new Exception( "Something is wrong; source of a copy must exist."); } if ($source_file_type != DifferentialChangeType::FILE_DIRECTORY) { if (isset($raw_paths[$copy_from]) || isset($effects[$copy_from])) { break; } $effects[$copy_from] = array( 'rawPath' => 
$copy_from, 'rawTargetPath' => null, 'rawTargetCommit' => null, 'rawDirect' => false, 'changeType' => $other_type, 'fileType' => $source_file_type, ); } else { // ULTRADISASTER. We've added a directory which was copied // or moved from somewhere else. This is the most complex and // ridiculous case. $list = $this->lookupRecursiveFileList( $repository, array( 'rawPath' => $copy_from, 'rawCommit' => $copy_rev, )); foreach ($list as $from_path => $from_file_type) { $full_from = rtrim($copy_from.'/'.$from_path, '/'); $full_to = rtrim($path.'/'.$from_path, '/'); if (empty($raw_paths[$full_to])) { $effects[$full_to] = array( 'rawPath' => $full_to, 'rawTargetPath' => $full_from, 'rawTargetCommit' => $copy_rev, 'rawDirect' => true, 'changeType' => $type, 'fileType' => $from_file_type, ); } else { // This means we picked the file up explicitly elsewhere. // If the file was modified, SVN will drop the copy // information. We need to restore it. $supplemental[$full_to]['rawTargetPath'] = $full_from; $supplemental[$full_to]['rawTargetCommit'] = $copy_rev; if ($raw_paths[$full_to]['rawChangeType'] == 'M') { $resolved_types[$full_to] = $type; } } if (empty($raw_paths[$full_from]) && empty($effects[$full_from])) { if ($other_type == DifferentialChangeType::TYPE_COPY_AWAY) { // Add an indirect effect for the copied file, if we // don't already have an entry for it (e.g., a separate // change). $effects[$full_from] = array( 'rawPath' => $full_from, 'rawTargetPath' => null, 'rawTargetCommit' => null, 'rawDirect' => false, 'changeType' => $other_type, 'fileType' => $from_file_type, ); } } } } } break; // This is "replaced", caused by "svn rm"-ing a file, putting another // in its place, and then "svn add"-ing it. We do not distinguish // between this and "M". 
case 'R': case 'M': if (isset($copied_or_moved_map[$path])) { $type = DifferentialChangeType::TYPE_COPY_AWAY; } else { $type = DifferentialChangeType::TYPE_CHANGE; } break; } } $resolved_types[$path] = $type; } foreach ($raw_paths as $path => $raw_info) { $raw_paths[$path]['changeType'] = $resolved_types[$path]; if (isset($supplemental[$path])) { foreach ($supplemental[$path] as $key => $value) { $raw_paths[$path][$key] = $value; } } } foreach ($raw_paths as $path => $raw_info) { $effects[$path] = array( 'rawPath' => $path, 'rawTargetPath' => $raw_info['rawTargetPath'], 'rawTargetCommit' => $raw_info['rawTargetCommit'], 'rawDirect' => true, 'changeType' => $raw_info['changeType'], 'fileType' => $path_file_types[$path], ); } $parents = array(); foreach ($effects as $path => $effect) { foreach ($this->expandAllParentPaths($path) as $parent_path) { $parents[$parent_path] = true; } } $parents = array_keys($parents); foreach ($parents as $parent) { if (isset($effects[$parent])) { continue; } $effects[$parent] = array( 'rawPath' => $parent, 'rawTargetPath' => null, 'rawTargetCommit' => null, 'rawDirect' => false, 'changeType' => DifferentialChangeType::TYPE_CHILD, 'fileType' => DifferentialChangeType::FILE_DIRECTORY, ); } $lookup_paths = array(); foreach ($effects as $effect) { $lookup_paths[$effect['rawPath']] = true; if ($effect['rawTargetPath']) { $lookup_paths[$effect['rawTargetPath']] = true; } } $lookup_paths = array_keys($lookup_paths); $lookup_commits = array(); foreach ($effects as $effect) { if ($effect['rawTargetCommit']) { $lookup_commits[$effect['rawTargetCommit']] = true; } } $lookup_commits = array_keys($lookup_commits); $path_map = $this->lookupOrCreatePaths($lookup_paths); $commit_map = $this->lookupSvnCommits($repository, $lookup_commits); $this->writeChanges($repository, $commit, $effects, $path_map, $commit_map); $this->writeBrowse($repository, $commit, $effects, $path_map); - - $this->finishParse(); } private function writeChanges( 
PhabricatorRepository $repository, PhabricatorRepositoryCommit $commit, array $effects, array $path_map, array $commit_map) { $conn_w = $repository->establishConnection('w'); $sql = array(); foreach ($effects as $effect) { $sql[] = qsprintf( $conn_w, '(%d, %d, %d, %nd, %nd, %d, %d, %d, %d)', $repository->getID(), $path_map[$effect['rawPath']], $commit->getID(), $effect['rawTargetPath'] ? $path_map[$effect['rawTargetPath']] : null, $effect['rawTargetCommit'] ? $commit_map[$effect['rawTargetCommit']] : null, $effect['changeType'], $effect['fileType'], $effect['rawDirect'] ? 1 : 0, $commit->getCommitIdentifier()); } queryfx( $conn_w, 'DELETE FROM %T WHERE commitID = %d', PhabricatorRepository::TABLE_PATHCHANGE, $commit->getID()); foreach (array_chunk($sql, 512) as $sql_chunk) { queryfx( $conn_w, 'INSERT INTO %T (repositoryID, pathID, commitID, targetPathID, targetCommitID, changeType, fileType, isDirect, commitSequence) VALUES %Q', PhabricatorRepository::TABLE_PATHCHANGE, implode(', ', $sql_chunk)); } } private function writeBrowse( PhabricatorRepository $repository, PhabricatorRepositoryCommit $commit, array $effects, array $path_map) { $conn_w = $repository->establishConnection('w'); $sql = array(); foreach ($effects as $effect) { $type = $effect['changeType']; if (!$effect['rawDirect']) { if ($type == DifferentialChangeType::TYPE_COPY_AWAY) { // Don't write COPY_AWAY to the filesystem table if it isn't a direct // event. continue; } if ($type == DifferentialChangeType::TYPE_CHILD) { // Don't write CHILD to the filesystem table. Although doing these // writes has the nice property of letting you see when a directory's // contents were last changed, it explodes the table tremendously // and makes Diffusion far slower. continue; } } if ($effect['rawPath'] == '/') { // Don't write any events on '/' to the filesystem table; in // particular, it doesn't have a meaningful parentID. 
continue;
      }

      // A path "existed" after this commit unless the effect is one of the
      // delete-style change types.
      $existed = !DifferentialChangeType::isDeleteChangeType($type);

      $sql[] = qsprintf(
        $conn_w,
        '(%d, %d, %d, %d, %d, %d)',
        $repository->getID(),
        $path_map[$this->getParentPath($effect['rawPath'])],
        $commit->getCommitIdentifier(),
        $path_map[$effect['rawPath']],
        $existed ? 1 : 0,
        $effect['fileType']);
    }

    // Clear any rows previously written for this repository/revision before
    // inserting the fresh set, so re-running the parse does not duplicate rows.
    queryfx(
      $conn_w,
      'DELETE FROM %T WHERE repositoryID = %d AND svnCommit = %d',
      PhabricatorRepository::TABLE_FILESYSTEM,
      $repository->getID(),
      $commit->getCommitIdentifier());

    foreach (array_chunk($sql, 512) as $sql_chunk) {
      queryfx(
        $conn_w,
        'INSERT INTO %T (repositoryID, parentID, svnCommit, pathID, existed, fileType) VALUES %Q',
        PhabricatorRepository::TABLE_FILESYSTEM,
        implode(', ', $sql_chunk));
    }
  }

  /**
   * Resolve SVN commit identifiers to commit row IDs.
   *
   * Identifiers which have no commit row are handled in one of two ways:
   * if the repository has an 'svn-subpath' detail, a stub commit (and stub
   * commit data flagged 'foreign-svn-stub') is saved for each of them;
   * otherwise an exception naming the missing identifiers is thrown.
   *
   * @param PhabricatorRepository $repository Repository the commits belong to.
   * @param array $commits List of commit identifiers to resolve.
   * @return array Map from commit identifier to commit ID. Empty when
   *               $commits is empty.
   * @throws Exception When identifiers are missing and the repository has no
   *                   'svn-subpath' detail.
   */
  private function lookupSvnCommits(
    PhabricatorRepository $repository,
    array $commits) {

    if (!$commits) {
      return array();
    }

    $commit_table = new PhabricatorRepositoryCommit();
    $commit_data = queryfx_all(
      $commit_table->establishConnection('w'),
      'SELECT id, commitIdentifier FROM %T WHERE repositoryID = %d AND commitIdentifier in (%Ls)',
      $commit_table->getTableName(),
      $repository->getID(),
      $commits);

    $commit_map = ipull($commit_data, 'id', 'commitIdentifier');

    // Collect the identifiers the query did not return a row for.
    $need = array();
    foreach ($commits as $commit) {
      if (empty($commit_map[$commit])) {
        $need[] = $commit;
      }
    }

    // If we are parsing a Subversion repository and have been configured to
    // import only some subdirectory of it, we may find commits which reference
    // other foreign commits outside of the directory (for instance, because of
    // a move or copy). Rather than trying to execute full parses on them, just
    // create stub commits and identify the stubs as foreign commits.
    if ($need) {
      $subpath = $repository->getDetail('svn-subpath');
      if (!$subpath) {
        // Interpolate the joined string, not the $need array itself: an array
        // inside a double-quoted string renders as the literal text "Array".
        $missing = implode(', ', $need);
        throw new Exception(
          "Missing commits ({$missing}) in a SVN repository which is not ".
          "configured for subdirectory-only parsing!");
      }

      foreach ($need as $foreign_commit) {
        $commit = new PhabricatorRepositoryCommit();
        $commit->setRepositoryID($repository->getID());
        $commit->setCommitIdentifier($foreign_commit);
        $commit->setEpoch(0);
        $commit->save();

        $data = new PhabricatorRepositoryCommitData();
        $data->setCommitID($commit->getID());
        $data->setAuthorName('');
        $data->setCommitMessage('');
        $data->setCommitDetails(
          array(
            'foreign-svn-stub' => true,
            // Denormalize this to make it easier to debug cases where someone
            // did half a parse and then changed the subdirectory or something
            // like that.
            'svn-subpath' => $subpath,
          ));
        $data->save();

        $commit_map[$foreign_commit] = $commit->getID();
      }
    }

    return $commit_map;
  }

  /**
   * Look up the file type of a single path.
   *
   * Thin wrapper over lookupPathFileTypes() for the one-path case.
   *
   * @param PhabricatorRepository $repository Repository to query.
   * @param string $path Key under which the result is looked up.
   * @param array $path_info Lookup spec for the path, passed through
   *                         unchanged to lookupPathFileTypes().
   * @return mixed The entry lookupPathFileTypes() produced for $path.
   */
  private function lookupPathFileType(
    PhabricatorRepository $repository,
    $path,
    array $path_info) {

    $result = $this->lookupPathFileTypes(
      $repository,
      array(
        $path => $path_info,
      ));

    return $result[$path];
  }

  private function lookupPathFileTypes(
    PhabricatorRepository $repository,
    array $paths) {

    $result_map = array();

    $repository_uri = $repository->getDetail('remote-uri');

    if (isset($paths['/'])) {
      $result_map['/'] = DifferentialChangeType::FILE_DIRECTORY;
      unset($paths['/']);
    }

    $parents = array();
    $path_mapping = array();
    foreach ($paths as $path => $lookup) {
      $parent = dirname($lookup['rawPath']);
      $parent = ltrim($parent, '/');
      $parent = $this->encodeSVNPath($parent);
      $parent = $repository_uri.$parent.'@'.$lookup['rawCommit'];
      $parent = escapeshellarg($parent);
      $parents[$parent] = true;
      $path_mapping[$parent][] = dirname($path);
    }

    // Reverse this list so we can pop $path_mapping, as that's more efficient
    // than shifting it. We need to associate these maps positionally because
    // a change can copy the same source path from multiple revisions via
    // "svn cp path@1 a; svn cp path@2 b;" and the XML output gives us no way
    // to distinguish which revision we're looking at except based on its
    // position in the document.
$all_paths = array_reverse(array_keys($parents)); foreach (array_chunk($all_paths, 64) as $path_chunk) { list($raw_xml) = $repository->execxRemoteCommand( '--xml ls %C', implode(' ', $path_chunk)); $xml = new SimpleXMLElement($raw_xml); foreach ($xml->list as $list) { $list_path = (string)$list['path']; // SVN is a big mess. See Facebook rG8 (a revision which adds files // with spaces in their names) for an example. $list_path = rawurldecode($list_path); if ($list_path == $repository_uri) { $base = '/'; } else { $base = substr($list_path, strlen($repository_uri)); } $mapping = array_pop($path_mapping); foreach ($list->entry as $entry) { $val = $this->getFileTypeFromSVNKind($entry['kind']); foreach ($mapping as $base_path) { // rtrim() causes us to handle top-level directories correctly. $key = rtrim($base_path, '/').'/'.$entry->name; $result_map[$key] = $val; } } } } foreach ($paths as $path => $lookup) { if (empty($result_map[$path])) { $result_map[$path] = DifferentialChangeType::FILE_DELETED; } } return $result_map; } private function encodeSVNPath($path) { $path = rawurlencode($path); $path = str_replace('%2F', '/', $path); return $path; } private function getFileTypeFromSVNKind($kind) { $kind = (string)$kind; switch ($kind) { case 'dir': return DifferentialChangeType::FILE_DIRECTORY; case 'file': return DifferentialChangeType::FILE_NORMAL; default: throw new Exception("Unknown SVN file kind '{$kind}'."); } } private function lookupRecursiveFileList( PhabricatorRepository $repository, array $info) { $path = $info['rawPath']; $rev = $info['rawCommit']; $path = $this->encodeSVNPath($path); $hashkey = md5($repository->getDetail('remote-uri').$path.'@'.$rev); // This method is quite horrible. The underlying challenge is that some // commits in the Facebook repository are enormous, taking multiple hours // to 'ls -R' out of the repository and producing XML files >1GB in size. // If we try to SimpleXML them, the object exhausts available memory on a // 64G machine. 
Instead, cache the XML output and then parse it line by line // to limit space requirements. $cache_loc = sys_get_temp_dir().'/diffusion.'.$hashkey.'.svnls'; if (!Filesystem::pathExists($cache_loc)) { $tmp = new TempFile(); $repository->execxRemoteCommand( '--xml ls -R %s%s@%d > %s', $repository->getDetail('remote-uri'), $path, $rev, $tmp); execx( 'mv %s %s', $tmp, $cache_loc); } $map = $this->parseRecursiveListFileData($cache_loc); Filesystem::remove($cache_loc); return $map; } private function parseRecursiveListFileData($file_path) { $map = array(); $mode = 'xml'; $done = false; $entry = null; foreach (new LinesOfALargeFile($file_path) as $lno => $line) { switch ($mode) { case 'entry': if ($line == '') { $entry = implode('', $entry); $pattern = '@^\s+kind="(file|dir)">'. '(.*?)'. '((.*?))?@'; $matches = null; if (!preg_match($pattern, $entry, $matches)) { throw new Exception("Unable to parse entry!"); } $map[html_entity_decode($matches[2])] = $this->getFileTypeFromSVNKind($matches[1]); $mode = 'entry-or-end'; } else { $entry[] = $line; } break; case 'entry-or-end': if ($line == '') { $done = true; break 2; } else if ($line == ' or = 1) { array_pop($parts); $parents[] = '/'.implode('/', $parts); } return $parents; } }