Page MenuHomePhabricator

D18672.id44830.diff
No OneTemporary

D18672.id44830.diff

diff --git a/resources/sql/autopatches/20171002.cngram.01.maniphest.sql b/resources/sql/autopatches/20171002.cngram.01.maniphest.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.01.maniphest.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.02.event.sql b/resources/sql/autopatches/20171002.cngram.02.event.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.02.event.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_calendar.calendar_event_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.03.revision.sql b/resources/sql/autopatches/20171002.cngram.03.revision.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.03.revision.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_differential.differential_revision_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.04.fund.sql b/resources/sql/autopatches/20171002.cngram.04.fund.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.04.fund.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_fund.fund_initiative_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.05.owners.sql b/resources/sql/autopatches/20171002.cngram.05.owners.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.05.owners.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_owners.owners_package_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.06.passphrase.sql b/resources/sql/autopatches/20171002.cngram.06.passphrase.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.06.passphrase.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_passphrase.passphrase_credential_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.07.blog.sql b/resources/sql/autopatches/20171002.cngram.07.blog.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.07.blog.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phame.phame_blog_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.08.post.sql b/resources/sql/autopatches/20171002.cngram.08.post.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.08.post.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phame.phame_post_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.09.pholio.sql b/resources/sql/autopatches/20171002.cngram.09.pholio.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.09.pholio.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_pholio.pholio_mock_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.10.phriction.sql b/resources/sql/autopatches/20171002.cngram.10.phriction.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.10.phriction.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phriction.phriction_document_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.11.project.sql b/resources/sql/autopatches/20171002.cngram.11.project.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.11.project.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_project.project_project_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.12.user.sql b/resources/sql/autopatches/20171002.cngram.12.user.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.12.user.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_user.user_user_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.13.repository.sql b/resources/sql/autopatches/20171002.cngram.13.repository.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.13.repository.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_repository.repository_repository_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.14.commit.sql b/resources/sql/autopatches/20171002.cngram.14.commit.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.14.commit.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_repository.repository_commit_fngrams_common (
+ id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT},
+ needsCollection BOOL NOT NULL,
+ UNIQUE KEY `key_ngram` (ngram),
+ KEY `key_collect` (needsCollection)
+) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT};
diff --git a/src/applications/config/schema/PhabricatorConfigSchemaSpec.php b/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
--- a/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
+++ b/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
@@ -73,6 +73,12 @@
$engine->getNgramsTableName(),
$engine->getNgramsSchemaColumns(),
$engine->getNgramsSchemaKeys());
+
+ $this->buildRawSchema(
+ $engine->getApplicationName(),
+ $engine->getCommonNgramsTableName(),
+ $engine->getCommonNgramsSchemaColumns(),
+ $engine->getCommonNgramsSchemaKeys());
}
protected function buildRawSchema(
diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
--- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
+++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
@@ -165,21 +165,46 @@
$ferret_field['normalCorpus']);
}
- $sql = array();
- foreach ($ngrams as $ngram) {
- $sql[] = qsprintf(
+ if ($ngrams) {
+ $common = queryfx_all(
$conn,
- '(%d, %s)',
- $document_id,
- $ngram);
+ 'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
+ $engine->getCommonNgramsTableName(),
+ $ngrams);
+ $common = ipull($common, 'ngram', 'ngram');
+
+ foreach ($ngrams as $key => $ngram) {
+ if (isset($common[$ngram])) {
+ unset($ngrams[$key]);
+ continue;
+ }
+
+ // NOTE: MySQL discards trailing whitespace in CHAR(X) columns, so
+ // also match the right-trimmed ngram against the common table rows.
+ $trim_ngram = rtrim($ngram, ' ');
+ if (isset($common[$trim_ngram])) {
+ unset($ngrams[$key]);
+ }
+ }
}
- foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
- queryfx(
- $conn,
- 'INSERT INTO %T (documentID, ngram) VALUES %Q',
- $engine->getNgramsTableName(),
- $chunk);
+ if ($ngrams) {
+ $sql = array();
+ foreach ($ngrams as $ngram) {
+ $sql[] = qsprintf(
+ $conn,
+ '(%d, %s)',
+ $document_id,
+ $ngram);
+ }
+
+ foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
+ queryfx(
+ $conn,
+ 'INSERT INTO %T (documentID, ngram) VALUES %Q',
+ $engine->getNgramsTableName(),
+ $chunk);
+ }
}
} catch (Exception $ex) {
$object->killTransaction();
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -295,4 +295,35 @@
);
}
+ public function getCommonNgramsTableName() {
+ $application = $this->getApplicationName();
+ $scope = $this->getScopeName();
+
+ return "{$application}_{$scope}_fngrams_common";
+ }
+
+ public function getCommonNgramsSchemaColumns() {
+ return array(
+ 'id' => 'auto',
+ 'ngram' => 'char3',
+ 'needsCollection' => 'bool',
+ );
+ }
+
+ public function getCommonNgramsSchemaKeys() {
+ return array(
+ 'PRIMARY' => array(
+ 'columns' => array('id'),
+ 'unique' => true,
+ ),
+ 'key_ngram' => array(
+ 'columns' => array('ngram'),
+ 'unique' => true,
+ ),
+ 'key_collect' => array(
+ 'columns' => array('needsCollection'),
+ ),
+ );
+ }
+
}
diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
--- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
+++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
@@ -1700,6 +1700,34 @@
}
}
+ // Remove common ngrams, like "the", which occur too frequently in
+ // documents to be useful in constraining the query. The best ngrams
+ // are obscure sequences which occur in very few documents.
+
+ if ($flat) {
+ $common_ngrams = queryfx_all(
+ $conn,
+ 'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
+ $engine->getCommonNgramsTableName(),
+ ipull($flat, 'ngram'));
+ $common_ngrams = ipull($common_ngrams, 'ngram', 'ngram');
+
+ foreach ($flat as $key => $spec) {
+ $ngram = $spec['ngram'];
+ if (isset($common_ngrams[$ngram])) {
+ unset($flat[$key]);
+ continue;
+ }
+
+ // NOTE: MySQL discards trailing whitespace in CHAR(X) columns.
+ $trim_ngram = rtrim($ngram, ' ');
+ if (isset($common_ngrams[$trim_ngram])) {
+ unset($flat[$key]);
+ continue;
+ }
+ }
+ }
+
// MySQL only allows us to join a maximum of 61 tables per query. Each
// ngram is going to cost us a join toward that limit, so if the user
// specified a very long query string, just pick 16 of the ngrams

File Metadata

Mime Type
text/plain
Expires
Sat, Nov 9, 6:28 AM (3 d, 18 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6732327
Default Alt Text
D18672.id44830.diff (12 KB)

Event Timeline