diff --git a/resources/sql/autopatches/20171002.cngram.01.maniphest.sql b/resources/sql/autopatches/20171002.cngram.01.maniphest.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.01.maniphest.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.02.event.sql b/resources/sql/autopatches/20171002.cngram.02.event.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.02.event.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_calendar.calendar_event_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.03.revision.sql b/resources/sql/autopatches/20171002.cngram.03.revision.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.03.revision.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_differential.differential_revision_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.04.fund.sql b/resources/sql/autopatches/20171002.cngram.04.fund.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.04.fund.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_fund.fund_initiative_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.05.owners.sql b/resources/sql/autopatches/20171002.cngram.05.owners.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.05.owners.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_owners.owners_package_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.06.passphrase.sql b/resources/sql/autopatches/20171002.cngram.06.passphrase.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.06.passphrase.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_passphrase.passphrase_credential_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.07.blog.sql b/resources/sql/autopatches/20171002.cngram.07.blog.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.07.blog.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phame.phame_blog_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.08.post.sql b/resources/sql/autopatches/20171002.cngram.08.post.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.08.post.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phame.phame_post_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.09.pholio.sql b/resources/sql/autopatches/20171002.cngram.09.pholio.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.09.pholio.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_pholio.pholio_mock_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.10.phriction.sql b/resources/sql/autopatches/20171002.cngram.10.phriction.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.10.phriction.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phriction.phriction_document_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.11.project.sql b/resources/sql/autopatches/20171002.cngram.11.project.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.11.project.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_project.project_project_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.12.user.sql b/resources/sql/autopatches/20171002.cngram.12.user.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.12.user.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_user.user_user_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.13.repository.sql b/resources/sql/autopatches/20171002.cngram.13.repository.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.13.repository.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_repository.repository_repository_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.14.commit.sql b/resources/sql/autopatches/20171002.cngram.14.commit.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.14.commit.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_repository.repository_commit_fngrams_common (
+  id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+  needsCollection BOOL NOT NULL,
+  UNIQUE KEY `key_ngram` (ngram),
+  KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/src/applications/config/schema/PhabricatorConfigSchemaSpec.php b/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
--- a/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
+++ b/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
@@ -73,6 +73,12 @@
       $engine->getNgramsTableName(),
       $engine->getNgramsSchemaColumns(),
       $engine->getNgramsSchemaKeys());
+
+    $this->buildRawSchema(
+      $engine->getApplicationName(),
+      $engine->getCommonNgramsTableName(),
+      $engine->getCommonNgramsSchemaColumns(),
+      $engine->getCommonNgramsSchemaKeys());
   }

   protected function buildRawSchema(
diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
--- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
+++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
@@ -165,21 +165,50 @@
           $ferret_field['normalCorpus']);
       }

-      $sql = array();
-      foreach ($ngrams as $ngram) {
-        $sql[] = qsprintf(
+      // Skip ngrams listed in the common ngrams table: they occur in too
+      // many documents to be useful for constraining a search query.
+      if ($ngrams) {
+        $common = queryfx_all(
           $conn,
-          '(%d, %s)',
-          $document_id,
-          $ngram);
+          'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
+          $engine->getCommonNgramsTableName(),
+          $ngrams);
+        $common = ipull($common, 'ngram', 'ngram');
+
+        foreach ($ngrams as $key => $ngram) {
+          if (isset($common[$ngram])) {
+            unset($ngrams[$key]);
+            continue;
+          }
+
+          // NOTE: MySQL discards trailing whitespace in CHAR(X) columns, so
+          // an ngram which ends in spaces is stored in trimmed form. Test
+          // the trimmed ngram against the common ngrams, too.
+          $trim_ngram = rtrim($ngram, ' ');
+          if (isset($common[$trim_ngram])) {
+            unset($ngrams[$key]);
+            continue;
+          }
+        }
       }

-      foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
-        queryfx(
-          $conn,
-          'INSERT INTO %T (documentID, ngram) VALUES %Q',
-          $engine->getNgramsTableName(),
-          $chunk);
+      if ($ngrams) {
+        $sql = array();
+        foreach ($ngrams as $ngram) {
+          $sql[] = qsprintf(
+            $conn,
+            '(%d, %s)',
+            $document_id,
+            $ngram);
+        }
+
+        foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
+          queryfx(
+            $conn,
+            'INSERT INTO %T (documentID, ngram) VALUES %Q',
+            $engine->getNgramsTableName(),
+            $chunk);
+        }
       }
     } catch (Exception $ex) {
       $object->killTransaction();
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -295,4 +295,50 @@
     );
   }

+  /**
+   * Get the name of the table which stores this engine's common ngrams.
+   *
+   * @return string Table name, built from the application and scope names.
+   */
+  public function getCommonNgramsTableName() {
+    $application = $this->getApplicationName();
+    $scope = $this->getScopeName();
+
+    return "{$application}_{$scope}_fngrams_common";
+  }
+
+  /**
+   * Get the column specification for the common ngrams table.
+   *
+   * @return map<string, string> Map from column name to schema type.
+   */
+  public function getCommonNgramsSchemaColumns() {
+    return array(
+      'id' => 'auto',
+      'ngram' => 'char3',
+      'needsCollection' => 'bool',
+    );
+  }
+
+  /**
+   * Get the key specification for the common ngrams table.
+   *
+   * @return map<string, map<string, wild>> Map from key name to key spec.
+   */
+  public function getCommonNgramsSchemaKeys() {
+    return array(
+      'PRIMARY' => array(
+        'columns' => array('id'),
+        'unique' => true,
+      ),
+      'key_ngram' => array(
+        'columns' => array('ngram'),
+        'unique' => true,
+      ),
+      'key_collect' => array(
+        'columns' => array('needsCollection'),
+      ),
+    );
+  }
+
 }
diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
--- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
+++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
@@ -1700,6 +1700,34 @@
       }
     }

+    // Remove common ngrams, like "the", which occur too frequently in
+    // documents to be useful in constraining the query. The best ngrams
+    // are obscure sequences which occur in very few documents.
+
+    if ($flat) {
+      $common_ngrams = queryfx_all(
+        $conn,
+        'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
+        $engine->getCommonNgramsTableName(),
+        ipull($flat, 'ngram'));
+      $common_ngrams = ipull($common_ngrams, 'ngram', 'ngram');
+
+      foreach ($flat as $key => $spec) {
+        $ngram = $spec['ngram'];
+        if (isset($common_ngrams[$ngram])) {
+          unset($flat[$key]);
+          continue;
+        }
+
+        // NOTE: MySQL discards trailing whitespace in CHAR(X) columns.
+        $trim_ngram = rtrim($ngram, ' ');
+        if (isset($common_ngrams[$trim_ngram])) {
+          unset($flat[$key]);
+          continue;
+        }
+      }
+    }
+
     // MySQL only allows us to join a maximum of 61 tables per query. Each
     // ngram is going to cost us a join toward that limit, so if the user
     // specified a very long query string, just pick 16 of the ngrams