Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F14832577
D17330.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
2 KB
Referenced Files
None
Subscribers
None
D17330.id.diff
View Options
diff --git a/src/search/PhutilSearchStemmer.php b/src/search/PhutilSearchStemmer.php
--- a/src/search/PhutilSearchStemmer.php
+++ b/src/search/PhutilSearchStemmer.php
@@ -9,10 +9,12 @@
}
public function stemCorpus($corpus) {
- $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF]+/', $corpus);
+ $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus);
$words = array();
foreach ($tokens as $key => $token) {
+ $token = trim($token, '._');
+
if (strlen($token) < 3) {
continue;
}
@@ -37,6 +39,13 @@
* @phutil-external-symbol class Porter
*/
private function applyStemmer($normalized_token) {
+ // If the token has internal punctuation, handle it literally. This
+ // deals with things like domain names, Conduit API methods, and other
+ // sorts of informal tokens.
+ if (preg_match('/[._]/', $normalized_token)) {
+ return $normalized_token;
+ }
+
static $loaded;
if ($loaded === null) {
@@ -45,6 +54,7 @@
$loaded = true;
}
+
$stem = Porter::stem($normalized_token);
// If the stem is too short, it won't be a candidate for indexing. These
diff --git a/src/search/__tests__/PhutilSearchStemmerTestCase.php b/src/search/__tests__/PhutilSearchStemmerTestCase.php
--- a/src/search/__tests__/PhutilSearchStemmerTestCase.php
+++ b/src/search/__tests__/PhutilSearchStemmerTestCase.php
@@ -35,6 +35,14 @@
// be stemmed.
'dns' => 'dns',
'nis' => 'nis',
+
+ // Tokens with internal "." or "_" should be left untouched; these are
+ // usually things like domain names, API calls, informal tags, etc. The
+ // plain words are a baseline showing ordinary tokens are still stemmed.
+ 'apples' => 'appl',
+ 'bananas' => 'banana',
+ 'apples_bananas' => 'apples_bananas',
+ 'apples_bananas.apples_bananas' => 'apples_bananas.apples_bananas',
);
$stemmer = new PhutilSearchStemmer();
@@ -57,6 +65,10 @@
'in a 1-meter radius around a target.' =>
'firebal seventh level spell which deal 2d16 point damag meter '.
'radiu around target',
+ 'apples-bananas' => 'appl banana',
+ 'apples_bananas' => 'apples_bananas',
+ 'apples.bananas' => 'apples.bananas',
+ 'oddly-proportioned' => 'oddli proport',
);
$stemmer = new PhutilSearchStemmer();
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Jan 31, 5:39 AM (11 h, 52 m)
Storage Engine
blob
Storage Format
Encrypted (AES-256-CBC)
Storage Handle
7077526
Default Alt Text
D17330.id.diff (2 KB)
Attached To
Mode
D17330: Don't stem "words" with a "." or "_" in them
Attached
Detach File
Event Timeline
Log In to Comment