Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14027303
D18672.id44830.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
12 KB
Referenced Files
None
Subscribers
None
D18672.id44830.diff
View Options
diff --git a/resources/sql/autopatches/20171002.cngram.01.maniphest.sql b/resources/sql/autopatches/20171002.cngram.01.maniphest.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.01.maniphest.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.02.event.sql b/resources/sql/autopatches/20171002.cngram.02.event.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.02.event.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_calendar.calendar_event_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.03.revision.sql b/resources/sql/autopatches/20171002.cngram.03.revision.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.03.revision.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_differential.differential_revision_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.04.fund.sql b/resources/sql/autopatches/20171002.cngram.04.fund.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.04.fund.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_fund.fund_initiative_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.05.owners.sql b/resources/sql/autopatches/20171002.cngram.05.owners.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.05.owners.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_owners.owners_package_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.06.passphrase.sql b/resources/sql/autopatches/20171002.cngram.06.passphrase.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.06.passphrase.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_passphrase.passphrase_credential_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.07.blog.sql b/resources/sql/autopatches/20171002.cngram.07.blog.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.07.blog.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phame.phame_blog_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.08.post.sql b/resources/sql/autopatches/20171002.cngram.08.post.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.08.post.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phame.phame_post_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.09.pholio.sql b/resources/sql/autopatches/20171002.cngram.09.pholio.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.09.pholio.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_pholio.pholio_mock_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.10.phriction.sql b/resources/sql/autopatches/20171002.cngram.10.phriction.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.10.phriction.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_phriction.phriction_document_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.11.project.sql b/resources/sql/autopatches/20171002.cngram.11.project.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.11.project.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_project.project_project_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.12.user.sql b/resources/sql/autopatches/20171002.cngram.12.user.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.12.user.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_user.user_user_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.13.repository.sql b/resources/sql/autopatches/20171002.cngram.13.repository.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.13.repository.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_repository.repository_repository_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/resources/sql/autopatches/20171002.cngram.14.commit.sql b/resources/sql/autopatches/20171002.cngram.14.commit.sql
new file mode 100644
--- /dev/null
+++ b/resources/sql/autopatches/20171002.cngram.14.commit.sql
@@ -0,0 +1,7 @@
+CREATE TABLE {$NAMESPACE}_repository.repository_commit_fngrams_common (
+  `id` INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  `ngram` CHAR(3) COLLATE {$COLLATE_TEXT} NOT NULL,
+  `needsCollection` TINYINT(1) NOT NULL,
+  UNIQUE INDEX `key_ngram` (`ngram`),
+  INDEX `key_collect` (`needsCollection`)
+) ENGINE = InnoDB COLLATE = {$COLLATE_TEXT};
diff --git a/src/applications/config/schema/PhabricatorConfigSchemaSpec.php b/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
--- a/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
+++ b/src/applications/config/schema/PhabricatorConfigSchemaSpec.php
@@ -73,6 +73,12 @@
$engine->getNgramsTableName(),
$engine->getNgramsSchemaColumns(),
$engine->getNgramsSchemaKeys());
+
+ $this->buildRawSchema(
+ $engine->getApplicationName(),
+ $engine->getCommonNgramsTableName(),
+ $engine->getCommonNgramsSchemaColumns(),
+ $engine->getCommonNgramsSchemaKeys());
}
protected function buildRawSchema(
diff --git a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
--- a/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
+++ b/src/applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php
@@ -165,21 +165,46 @@
$ferret_field['normalCorpus']);
}
- $sql = array();
- foreach ($ngrams as $ngram) {
- $sql[] = qsprintf(
+ if ($ngrams) {
+ $common = queryfx_all(
$conn,
- '(%d, %s)',
- $document_id,
- $ngram);
+ 'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
+ $engine->getCommonNgramsTableName(),
+ $ngrams);
+ $common = ipull($common, 'ngram', 'ngram');
+
+ foreach ($ngrams as $key => $ngram) {
+   if (isset($common[$ngram])) {
+     unset($ngrams[$key]);
+     continue;
+   }
+   // NOTE: MySQL discards trailing whitespace in CHAR(X) columns, so rows
+   // read from the common table are right-trimmed; check that form too.
+   $trim_ngram = rtrim($ngram, ' ');
+   if (isset($common[$trim_ngram])) {
+     unset($ngrams[$key]);
+     continue;
+   }
+ }
}
- foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
- queryfx(
- $conn,
- 'INSERT INTO %T (documentID, ngram) VALUES %Q',
- $engine->getNgramsTableName(),
- $chunk);
+ if ($ngrams) {
+ $sql = array();
+ foreach ($ngrams as $ngram) {
+ $sql[] = qsprintf(
+ $conn,
+ '(%d, %s)',
+ $document_id,
+ $ngram);
+ }
+
+ foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
+ queryfx(
+ $conn,
+ 'INSERT INTO %T (documentID, ngram) VALUES %Q',
+ $engine->getNgramsTableName(),
+ $chunk);
+ }
}
} catch (Exception $ex) {
$object->killTransaction();
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
--- a/src/applications/search/ferret/PhabricatorFerretEngine.php
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -295,4 +295,35 @@
);
}
+ public function getCommonNgramsTableName() {
+   // Table name has the form "<application>_<scope>_fngrams_common".
+   $app_name = $this->getApplicationName();
+   $scope_name = $this->getScopeName();
+   return $app_name.'_'.$scope_name.'_fngrams_common';
+ }
+
+ public function getCommonNgramsSchemaColumns() {
+   // Maps each column name of the common ngrams table to its type name.
+   $columns = array('id' => 'auto');
+   $columns['ngram'] = 'char3';
+   $columns['needsCollection'] = 'bool';
+   return $columns;
+ }
+
+ public function getCommonNgramsSchemaKeys() {
+   // Key definitions for the common ngrams table, indexed by key name.
+   $keys = array();
+   $keys['PRIMARY'] = array(
+     'columns' => array('id'),
+     'unique' => true,
+   );
+   $keys['key_ngram'] = array(
+     'columns' => array('ngram'),
+     'unique' => true,
+   );
+   $keys['key_collect'] = array('columns' => array('needsCollection'));
+
+   return $keys;
+ }
+
}
diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
--- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
+++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
@@ -1700,6 +1700,34 @@
}
}
+ // Remove common ngrams, like "the", which occur too frequently in
+ // documents to be useful in constraining the query. The best ngrams
+ // are obscure sequences which occur in very few documents.
+
+ if ($flat) {
+ $common_ngrams = queryfx_all(
+ $conn,
+ 'SELECT ngram FROM %T WHERE ngram IN (%Ls)',
+ $engine->getCommonNgramsTableName(),
+ ipull($flat, 'ngram'));
+ $common_ngrams = ipull($common_ngrams, 'ngram', 'ngram');
+
+ foreach ($flat as $key => $spec) {
+ $ngram = $spec['ngram'];
+ if (isset($common_ngrams[$ngram])) {
+ unset($flat[$key]);
+ continue;
+ }
+
+ // NOTE: MySQL discards trailing whitespace in CHAR(X) columns.
+ $trim_ngram = rtrim($ngram, ' ');
+ if (isset($common_ngrams[$trim_ngram])) {
+ unset($flat[$key]);
+ continue;
+ }
+ }
+ }
+
// MySQL only allows us to join a maximum of 61 tables per query. Each
// ngram is going to cost us a join toward that limit, so if the user
// specified a very long query string, just pick 16 of the ngrams
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sat, Nov 9, 6:28 AM (3 d, 18 h ago)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
6732327
Default Alt Text
D18672.id44830.diff (12 KB)
Attached To
Mode
D18672: Allow the Ferret engine to remove "common" ngrams from the index
Attached
Detach File
Event Timeline
Log In to Comment