diff --git a/resources/sql/autopatches/20170828.ferret.01.taskdoc.sql b/resources/sql/autopatches/20170828.ferret.01.taskdoc.sql new file mode 100644 --- /dev/null +++ b/resources/sql/autopatches/20170828.ferret.01.taskdoc.sql @@ -0,0 +1,9 @@ +CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fdocument ( + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + objectPHID VARBINARY(64) NOT NULL, + isClosed BOOL NOT NULL, + authorPHID VARBINARY(64), + ownerPHID VARBINARY(64), + epochCreated INT UNSIGNED NOT NULL, + epochModified INT UNSIGNED NOT NULL +) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT}; diff --git a/resources/sql/autopatches/20170828.ferret.02.taskfield.sql b/resources/sql/autopatches/20170828.ferret.02.taskfield.sql new file mode 100644 --- /dev/null +++ b/resources/sql/autopatches/20170828.ferret.02.taskfield.sql @@ -0,0 +1,7 @@ +CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_ffield ( + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + documentID INT UNSIGNED NOT NULL, + fieldKey VARCHAR(4) NOT NULL COLLATE {$COLLATE_TEXT}, + rawCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT}, + normalCorpus LONGTEXT NOT NULL COLLATE {$COLLATE_SORT} +) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT}; diff --git a/resources/sql/autopatches/20170828.ferret.03.taskngrams.sql b/resources/sql/autopatches/20170828.ferret.03.taskngrams.sql new file mode 100644 --- /dev/null +++ b/resources/sql/autopatches/20170828.ferret.03.taskngrams.sql @@ -0,0 +1,5 @@ +CREATE TABLE {$NAMESPACE}_maniphest.maniphest_task_fngrams ( + id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, + documentID INT UNSIGNED NOT NULL, + ngram CHAR(3) NOT NULL COLLATE {$COLLATE_TEXT} +) ENGINE=InnoDB, COLLATE {$COLLATE_TEXT}; diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -1533,6 +1533,10 @@ 'ManiphestTaskEditBulkJobType' => 'applications/maniphest/bulk/ManiphestTaskEditBulkJobType.php', 
'ManiphestTaskEditController' => 'applications/maniphest/controller/ManiphestTaskEditController.php', 'ManiphestTaskEditEngineLock' => 'applications/maniphest/editor/ManiphestTaskEditEngineLock.php', + 'ManiphestTaskFerretDocument' => 'applications/maniphest/storage/ManiphestTaskFerretDocument.php', + 'ManiphestTaskFerretEngine' => 'applications/maniphest/search/ManiphestTaskFerretEngine.php', + 'ManiphestTaskFerretField' => 'applications/maniphest/storage/ManiphestTaskFerretField.php', + 'ManiphestTaskFerretNgrams' => 'applications/maniphest/storage/ManiphestTaskFerretNgrams.php', 'ManiphestTaskFulltextEngine' => 'applications/maniphest/search/ManiphestTaskFulltextEngine.php', 'ManiphestTaskGraph' => 'infrastructure/graph/ManiphestTaskGraph.php', 'ManiphestTaskHasCommitEdgeType' => 'applications/maniphest/edge/ManiphestTaskHasCommitEdgeType.php', @@ -2828,6 +2832,12 @@ 'PhabricatorFeedStoryNotification' => 'applications/notification/storage/PhabricatorFeedStoryNotification.php', 'PhabricatorFeedStoryPublisher' => 'applications/feed/PhabricatorFeedStoryPublisher.php', 'PhabricatorFeedStoryReference' => 'applications/feed/storage/PhabricatorFeedStoryReference.php', + 'PhabricatorFerretDocument' => 'applications/search/ferret/PhabricatorFerretDocument.php', + 'PhabricatorFerretEngine' => 'applications/search/ferret/PhabricatorFerretEngine.php', + 'PhabricatorFerretField' => 'applications/search/ferret/PhabricatorFerretField.php', + 'PhabricatorFerretFulltextEngineExtension' => 'applications/search/engineextension/PhabricatorFerretFulltextEngineExtension.php', + 'PhabricatorFerretInterface' => 'applications/search/ferret/PhabricatorFerretInterface.php', + 'PhabricatorFerretNgrams' => 'applications/search/ferret/PhabricatorFerretNgrams.php', 'PhabricatorFile' => 'applications/files/storage/PhabricatorFile.php', 'PhabricatorFileAES256StorageFormat' => 'applications/files/format/PhabricatorFileAES256StorageFormat.php', 'PhabricatorFileBundleLoader' => 
'applications/files/query/PhabricatorFileBundleLoader.php', @@ -3195,6 +3205,7 @@ 'PhabricatorNamedQueryQuery' => 'applications/search/query/PhabricatorNamedQueryQuery.php', 'PhabricatorNavigationRemarkupRule' => 'infrastructure/markup/rule/PhabricatorNavigationRemarkupRule.php', 'PhabricatorNeverTriggerClock' => 'infrastructure/daemon/workers/clock/PhabricatorNeverTriggerClock.php', + 'PhabricatorNgramEngine' => 'applications/search/ngrams/PhabricatorNgramEngine.php', 'PhabricatorNgramsIndexEngineExtension' => 'applications/search/engineextension/PhabricatorNgramsIndexEngineExtension.php', 'PhabricatorNgramsInterface' => 'applications/search/interface/PhabricatorNgramsInterface.php', 'PhabricatorNotificationBuilder' => 'applications/notification/builder/PhabricatorNotificationBuilder.php', @@ -6659,6 +6670,7 @@ 'PhabricatorSpacesInterface', 'PhabricatorConduitResultInterface', 'PhabricatorFulltextInterface', + 'PhabricatorFerretInterface', 'DoorkeeperBridgedObjectInterface', 'PhabricatorEditEngineSubtypeInterface', 'PhabricatorEditEngineLockableInterface', @@ -6682,6 +6694,10 @@ 'ManiphestTaskEditBulkJobType' => 'PhabricatorWorkerBulkJobType', 'ManiphestTaskEditController' => 'ManiphestController', 'ManiphestTaskEditEngineLock' => 'PhabricatorEditEngineLock', + 'ManiphestTaskFerretDocument' => 'PhabricatorFerretDocument', + 'ManiphestTaskFerretEngine' => 'PhabricatorFerretEngine', + 'ManiphestTaskFerretField' => 'PhabricatorFerretField', + 'ManiphestTaskFerretNgrams' => 'PhabricatorFerretNgrams', 'ManiphestTaskFulltextEngine' => 'PhabricatorFulltextEngine', 'ManiphestTaskGraph' => 'PhabricatorObjectGraph', 'ManiphestTaskHasCommitEdgeType' => 'PhabricatorEdgeType', @@ -8147,6 +8163,11 @@ 'PhabricatorFeedStoryNotification' => 'PhabricatorFeedDAO', 'PhabricatorFeedStoryPublisher' => 'Phobject', 'PhabricatorFeedStoryReference' => 'PhabricatorFeedDAO', + 'PhabricatorFerretDocument' => 'PhabricatorSearchDAO', + 'PhabricatorFerretEngine' => 'Phobject', + 
'PhabricatorFerretField' => 'PhabricatorSearchDAO', + 'PhabricatorFerretFulltextEngineExtension' => 'PhabricatorFulltextEngineExtension', + 'PhabricatorFerretNgrams' => 'PhabricatorSearchDAO', 'PhabricatorFile' => array( 'PhabricatorFileDAO', 'PhabricatorApplicationTransactionInterface', @@ -8565,6 +8586,7 @@ 'PhabricatorNamedQueryQuery' => 'PhabricatorCursorPagedPolicyAwareQuery', 'PhabricatorNavigationRemarkupRule' => 'PhutilRemarkupRule', 'PhabricatorNeverTriggerClock' => 'PhabricatorTriggerClock', + 'PhabricatorNgramEngine' => 'Phobject', 'PhabricatorNgramsIndexEngineExtension' => 'PhabricatorIndexEngineExtension', 'PhabricatorNgramsInterface' => 'PhabricatorIndexableInterface', 'PhabricatorNotificationBuilder' => 'Phobject', diff --git a/src/applications/maniphest/query/ManiphestTaskSearchEngine.php b/src/applications/maniphest/query/ManiphestTaskSearchEngine.php --- a/src/applications/maniphest/query/ManiphestTaskSearchEngine.php +++ b/src/applications/maniphest/query/ManiphestTaskSearchEngine.php @@ -49,6 +49,8 @@ $subtype_map = id(new ManiphestTask())->newEditEngineSubtypeMap(); $hide_subtypes = (count($subtype_map) == 1); + $hide_ferret = !PhabricatorEnv::getEnvConfig('phabricator.show-prototypes'); + return array( id(new PhabricatorOwnersSearchField()) ->setLabel(pht('Assigned To')) @@ -89,6 +91,10 @@ id(new PhabricatorSearchTextField()) ->setLabel(pht('Contains Words')) ->setKey('fulltext'), + id(new PhabricatorSearchTextField()) + ->setLabel(pht('Matches (Prototype)')) + ->setKey('ferret') + ->setIsHidden($hide_ferret), id(new PhabricatorSearchThreeStateField()) ->setLabel(pht('Open Parents')) ->setKey('hasParents') @@ -145,6 +151,7 @@ 'priorities', 'subtypes', 'fulltext', + 'ferret', 'hasParents', 'hasSubtasks', 'parentIDs', @@ -224,6 +231,12 @@ $query->withFullTextSearch($map['fulltext']); } + if (strlen($map['ferret'])) { + $query->withFerretConstraint( + id(new ManiphestTask())->newFerretEngine(), + $map['ferret']); + } + if ($map['parentIDs']) { 
$query->withParentTaskIDs($map['parentIDs']);
     }
diff --git a/src/applications/maniphest/search/ManiphestTaskFerretEngine.php b/src/applications/maniphest/search/ManiphestTaskFerretEngine.php
new file mode 100644
--- /dev/null
+++ b/src/applications/maniphest/search/ManiphestTaskFerretEngine.php
@@ -0,0 +1,18 @@
+getPHID();
+    $engine = $object->newFerretEngine();
+
+    // NOTE(review): the closed flag and both epochs are written as zero
+    // placeholders here; presumably real values are populated by a later
+    // change -- confirm.
+    $ferret_document = $engine->newDocumentObject()
+      ->setObjectPHID($phid)
+      ->setIsClosed(0)
+      ->setEpochCreated(0)
+      ->setEpochModified(0);
+
+    $stemmer = new PhutilSearchStemmer();
+
+    // Build one field row per non-empty corpus, and collect the raw text of
+    // every field so ngrams can be extracted for the whole document at once.
+    $ferret_fields = array();
+    $ngrams_source = array();
+    foreach ($document->getFieldData() as $field) {
+      list($key, $raw_corpus) = $field;
+
+      if (!strlen($raw_corpus)) {
+        continue;
+      }
+
+      $normal_corpus = $stemmer->stemCorpus($raw_corpus);
+
+      $ferret_fields[] = $engine->newFieldObject()
+        ->setFieldKey($key)
+        ->setRawCorpus($raw_corpus)
+        ->setNormalCorpus($normal_corpus);
+
+      $ngrams_source[] = $raw_corpus;
+    }
+    $ngrams_source = implode(' ', $ngrams_source);
+
+    $ngrams = id(new PhabricatorNgramEngine())
+      ->getNgramsFromString($ngrams_source, 'index');
+
+    $ferret_document->openTransaction();
+
+    // Replace the old index rows with the new ones atomically. If any write
+    // fails, roll the transaction back instead of leaving it open with a
+    // partially rebuilt index, then let the caller see the failure.
+    try {
+      $this->deleteOldDocument($engine, $object, $document);
+
+      $ferret_document->save();
+
+      $document_id = $ferret_document->getID();
+      foreach ($ferret_fields as $ferret_field) {
+        $ferret_field
+          ->setDocumentID($document_id)
+          ->save();
+      }
+
+      $ferret_ngrams = $engine->newNgramsObject();
+      $conn = $ferret_ngrams->establishConnection('w');
+
+      $sql = array();
+      foreach ($ngrams as $ngram) {
+        $sql[] = qsprintf(
+          $conn,
+          '(%d, %s)',
+          $document_id,
+          $ngram);
+      }
+
+      // Insert the ngram rows in chunks rather than as one statement.
+      foreach (PhabricatorLiskDAO::chunkSQL($sql) as $chunk) {
+        queryfx(
+          $conn,
+          'INSERT INTO %T (documentID, ngram) VALUES %Q',
+          $ferret_ngrams->getTableName(),
+          $chunk);
+      }
+    } catch (Exception $ex) {
+      $ferret_document->killTransaction();
+      throw $ex;
+    }
+
+    $ferret_document->saveTransaction();
+  }
+
+
+  /**
+   * Remove the existing index rows for a document, if there are any.
+   *
+   * Looks up the stored document row by the PHID of the abstract document
+   * and deletes it along with the field and ngram rows which reference its
+   * ID. Does nothing when no document row exists.
+   *
+   * @param PhabricatorFerretEngine Engine which provides the index tables.
+   * @param wild Indexed object. Not used by this method.
+   * @param PhabricatorSearchAbstractDocument Document being reindexed.
+   * @return void
+   */
+  private function deleteOldDocument(
+    PhabricatorFerretEngine $engine,
+    $object,
+    PhabricatorSearchAbstractDocument $document) {
+
+    $old_document = $engine->newDocumentObject()->loadOneWhere(
+      'objectPHID = %s',
+      $document->getPHID());
+    if (!$old_document) {
+      return;
+    }
+
+    $conn = $old_document->establishConnection('w');
+    $old_id = $old_document->getID();
+
+    queryfx(
+      $conn,
+      'DELETE FROM %T WHERE id = %d',
+      $engine->newDocumentObject()->getTableName(),
+      $old_id);
+
+    queryfx(
+      $conn,
+      'DELETE FROM %T WHERE documentID = %d',
+      $engine->newFieldObject()->getTableName(),
+      $old_id);
+
+    queryfx(
+      $conn,
+      'DELETE FROM %T WHERE documentID = %d',
+      $engine->newNgramsObject()->getTableName(),
+      $old_id);
+  }
+
+}
diff --git a/src/applications/search/ferret/PhabricatorFerretDocument.php b/src/applications/search/ferret/PhabricatorFerretDocument.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/ferret/PhabricatorFerretDocument.php
@@ -0,0 +1,40 @@
+ false,
+      self::CONFIG_COLUMN_SCHEMA => array(
+        'isClosed' => 'bool',
+        'authorPHID' => 'phid?',
+        'ownerPHID' => 'phid?',
+        'epochCreated' => 'epoch',
+        'epochModified' => 'epoch',
+      ),
+      self::CONFIG_KEY_SCHEMA => array(
+        'key_object' => array(
+          'columns' => array('objectPHID'),
+          'unique' => true,
+        ),
+      ),
+    ) + parent::getConfiguration();
+  }
+
+  public function getTableName() {
+    $application = $this->getApplicationName();
+    $key = $this->getIndexKey();
+    return "{$application}_{$key}_fdocument";
+  }
+
+}
diff --git a/src/applications/search/ferret/PhabricatorFerretEngine.php b/src/applications/search/ferret/PhabricatorFerretEngine.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/ferret/PhabricatorFerretEngine.php
@@ -0,0 +1,9 @@
+ false,
+      self::CONFIG_COLUMN_SCHEMA => array(
+        'documentID' => 'uint32',
+        'fieldKey' => 'text4',
+        'rawCorpus' => 'sort',
+        'normalCorpus' => 'sort',
+      ),
+      self::CONFIG_KEY_SCHEMA => array(
+        'key_document' => array(
+          'columns' => array('documentID', 'fieldKey'),
+        ),
+      ),
+    ) + parent::getConfiguration();
+  }
+
+  public function getTableName() {
+    $application =
$this->getApplicationName();
+    $key = $this->getIndexKey();
+    return "{$application}_{$key}_ffield";
+  }
+
+}
diff --git a/src/applications/search/ferret/PhabricatorFerretInterface.php b/src/applications/search/ferret/PhabricatorFerretInterface.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/ferret/PhabricatorFerretInterface.php
@@ -0,0 +1,7 @@
+ false,
+      self::CONFIG_COLUMN_SCHEMA => array(
+        'documentID' => 'uint32',
+        'ngram' => 'char3',
+      ),
+      self::CONFIG_KEY_SCHEMA => array(
+        'key_ngram' => array(
+          'columns' => array('ngram', 'documentID'),
+        ),
+        'key_object' => array(
+          'columns' => array('documentID'),
+        ),
+      ),
+    ) + parent::getConfiguration();
+  }
+
+  public function getTableName() {
+    $application = $this->getApplicationName();
+    $key = $this->getIndexKey();
+    return "{$application}_{$key}_fngrams";
+  }
+
+}
diff --git a/src/applications/search/ngrams/PhabricatorNgramEngine.php b/src/applications/search/ngrams/PhabricatorNgramEngine.php
new file mode 100644
--- /dev/null
+++ b/src/applications/search/ngrams/PhabricatorNgramEngine.php
@@ -0,0 +1,45 @@
+tokenizeString($value);
+
+    $ngrams = array();
+    foreach ($tokens as $token) {
+      $token = phutil_utf8_strtolower($token);
+
+      switch ($mode) {
+        case 'query':
+          break;
+        case 'index':
+          $token = ' '.$token.' ';
+          break;
+        case 'prefix':
+          $token = ' '.$token;
+          break;
+      }
+
+      // Slice by UTF-8 character, not by byte, so a multibyte character is
+      // never split across ngrams. This agrees with the character-based
+      // length test which the query builder applies to search terms.
+      $token_v = phutil_utf8v($token);
+      $len = (count($token_v) - 2);
+      for ($ii = 0; $ii < $len; $ii++) {
+        $ngram = implode('', array_slice($token_v, $ii, 3));
+        $ngrams[$ngram] = $ngram;
+      }
+    }
+
+    ksort($ngrams);
+
+    return array_keys($ngrams);
+  }
+
+}
diff --git a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
--- a/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
+++ b/src/infrastructure/query/policy/PhabricatorCursorPagedPolicyAwareQuery.php
@@ -27,6 +27,8 @@
   private $spacePHIDs;
   private $spaceIsArchived;
   private $ngrams = array();
+  private $ferretEngine;
+  private $ferretConstraints;
 
   protected function getPageCursors(array $page) {
     return array(
@@ -270,6 +272,7 @@
     $joins[] = $this->buildEdgeLogicJoinClause($conn);
     $joins[] = $this->buildApplicationSearchJoinClause($conn);
     $joins[] = $this->buildNgramsJoinClause($conn);
+    $joins[] = $this->buildFerretJoinClause($conn);
 
     return $joins;
   }
@@ -292,6 +295,7 @@
     $where[] = $this->buildEdgeLogicWhereClause($conn);
     $where[] = $this->buildSpacesWhereClause($conn);
     $where[] = $this->buildNgramsWhereClause($conn);
+    $where[] = $this->buildFerretWhereClause($conn);
 
     return $where;
   }
@@ -1373,6 +1377,184 @@
   }
 
 
+/* -( Ferret )------------------------------------------------------------- */
+
+
+  /**
+   * Constrain results to objects matching a Ferret search query.
+   *
+   * The raw query is split on whitespace into terms. A query which contains
+   * no terms (empty or whitespace-only) leaves this query unconstrained.
+   *
+   * @param PhabricatorFerretEngine Engine which provides the index tables.
+   * @param string Raw query text.
+   * @return this
+   * @throws Exception If a Ferret constraint has already been applied.
+   */
+  public function withFerretConstraint(
+    PhabricatorFerretEngine $engine,
+    $raw_query) {
+
+    if ($this->ferretEngine) {
+      throw new Exception(
+        pht(
+          'Query may not have multiple fulltext constraints.'));
+    }
+
+    if (!strlen($raw_query)) {
+      return $this;
+    }
+
+    // Leading or trailing whitespace makes preg_split() emit empty strings.
+    // An empty term would join against every indexed ngram and add a
+    // "LIKE '%%'" clause which matches any corpus, so discard them.
+    $terms = preg_split('/\s+/', $raw_query, -1, PREG_SPLIT_NO_EMPTY);
+    if (!$terms) {
+      return $this;
+    }
+
+    $this->ferretEngine = $engine;
+    $this->ferretConstraints = $terms;
+
+    return $this;
+  }
+
+  /**
+   * Build the JOINs which restrict results to indexed documents containing
+   * ngrams derived from the search terms.
+   *
+   * @param AphrontDatabaseConnection Connection used to escape the clauses.
+   * @return list<string> JOIN clauses, or an empty list without a constraint.
+   */
+  protected function buildFerretJoinClause(AphrontDatabaseConnection $conn) {
+    if (!$this->ferretEngine) {
+      return array();
+    }
+
+    $engine = $this->ferretEngine;
+    $ngram_engine = new PhabricatorNgramEngine();
+
+    $ngram_table = $engine->newNgramsObject();
+    $ngram_table_name = $ngram_table->getTableName();
+
+    $flat = array();
+    foreach ($this->ferretConstraints as $term) {
+      $value = $term;
+      $length = count(phutil_utf8v($term));
+
+      // Terms of three or more characters are matched on their interior
+      // ngrams. A two-character term is matched as the start of a word via
+      // a single space-prefixed ngram. A one-character term can only be
+      // matched as a prefix of the ngrams which start a word.
+      if ($length >= 3) {
+        $ngrams = $ngram_engine->getNgramsFromString($value, 'query');
+        $prefix = false;
+      } else if ($length == 2) {
+        $ngrams = $ngram_engine->getNgramsFromString($value, 'prefix');
+        $prefix = false;
+      } else {
+        // Indexed ngrams are lowercased, so lowercase the term to match them.
+        $ngrams = array(' '.phutil_utf8_strtolower($value));
+        $prefix = true;
+      }
+
+      foreach ($ngrams as $ngram) {
+        $flat[] = array(
+          'table' => $ngram_table_name,
+          'ngram' => $ngram,
+          'prefix' => $prefix,
+        );
+      }
+    }
+
+    // MySQL only allows us to join a maximum of 61 tables per query. Each
+    // ngram is going to cost us a join toward that limit, so if the user
+    // specified a very long query string, just pick 16 of the ngrams
+    // at random.
+    if (count($flat) > 16) {
+      shuffle($flat);
+      $flat = array_slice($flat, 0, 16);
+    }
+
+    $alias = $this->getPrimaryTableAlias();
+    if ($alias) {
+      $phid_column = qsprintf($conn, '%T.%T', $alias, 'phid');
+    } else {
+      $phid_column = qsprintf($conn, '%T', 'phid');
+    }
+
+    $document_table = $engine->newDocumentObject();
+    $field_table = $engine->newFieldObject();
+
+    $joins = array();
+    $joins[] = qsprintf(
+      $conn,
+      'JOIN %T ftdoc ON ftdoc.objectPHID = %Q',
+      $document_table->getTableName(),
+      $phid_column);
+
+    $idx = 1;
+    foreach ($flat as $spec) {
+      $table = $spec['table'];
+      $ngram = $spec['ngram'];
+      $prefix = $spec['prefix'];
+
+      $alias = 'ft'.$idx++;
+
+      if ($prefix) {
+        $joins[] = qsprintf(
+          $conn,
+          'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram LIKE %>',
+          $table,
+          $alias,
+          $alias,
+          $alias,
+          $ngram);
+      } else {
+        $joins[] = qsprintf(
+          $conn,
+          'JOIN %T %T ON %T.documentID = ftdoc.id AND %T.ngram = %s',
+          $table,
+          $alias,
+          $alias,
+          $alias,
+          $ngram);
+      }
+    }
+
+    $joins[] = qsprintf(
+      $conn,
+      'JOIN %T ftfield ON ftdoc.id = ftfield.documentID',
+      $field_table->getTableName());
+
+    return $joins;
+  }
+
+  /**
+   * Build one WHERE clause per search term, matching the term as a substring
+   * of either the raw or the normalized corpus of the joined field row.
+   *
+   * @param AphrontDatabaseConnection Connection used to escape the clauses.
+   * @return list<string> WHERE clauses, or an empty list without a constraint.
+   */
+  protected function buildFerretWhereClause(AphrontDatabaseConnection $conn) {
+    if (!$this->ferretEngine) {
+      return array();
+    }
+
+    $where = array();
+    foreach ($this->ferretConstraints as $constraint) {
+      $where[] = qsprintf(
+        $conn,
+        '(ftfield.rawCorpus LIKE %~ OR ftfield.normalCorpus LIKE %~)',
+        $constraint,
+        $constraint);
+    }
+
+    return $where;
+  }
+
+
 /* -( Ngrams )------------------------------------------------------------- */