Changeset View
Changeset View
Standalone View
Standalone View
src/search/PhutilSearchStemmer.php
<?php | <?php | ||||
final class PhutilSearchStemmer | final class PhutilSearchStemmer | ||||
extends Phobject { | extends Phobject { | ||||
public function stemToken($token) { | public function stemToken($token) { | ||||
$token = $this->normalizeToken($token); | $token = $this->normalizeToken($token); | ||||
return $this->applyStemmer($token); | return $this->applyStemmer($token); | ||||
} | } | ||||
public function stemCorpus($corpus) { | public function stemCorpus($corpus) { | ||||
$tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF]+/', $corpus); | $tokens = preg_split('/[^a-zA-Z0-9\x7F-\xFF._]+/', $corpus); | ||||
$words = array(); | $words = array(); | ||||
foreach ($tokens as $key => $token) { | foreach ($tokens as $key => $token) { | ||||
$token = trim($token, '._'); | |||||
if (strlen($token) < 3) { | if (strlen($token) < 3) { | ||||
continue; | continue; | ||||
} | } | ||||
$normal_word = $this->normalizeToken($token); | $normal_word = $this->normalizeToken($token); | ||||
$words[$normal_word] = $normal_word; | $words[$normal_word] = $normal_word; | ||||
} | } | ||||
$stems = array(); | $stems = array(); | ||||
foreach ($words as $normal_word) { | foreach ($words as $normal_word) { | ||||
$stems[] = $this->applyStemmer($normal_word); | $stems[] = $this->applyStemmer($normal_word); | ||||
} | } | ||||
return implode(' ', $stems); | return implode(' ', $stems); | ||||
} | } | ||||
private function normalizeToken($token) { | private function normalizeToken($token) { | ||||
return phutil_utf8_strtolower($token); | return phutil_utf8_strtolower($token); | ||||
} | } | ||||
/** | /** | ||||
* @phutil-external-symbol class Porter | * @phutil-external-symbol class Porter | ||||
*/ | */ | ||||
private function applyStemmer($normalized_token) { | private function applyStemmer($normalized_token) { | ||||
// If the token has internal punctuation, handle it literally. This | |||||
// deals with things like domain names, Conduit API methods, and other | |||||
// sorts of informal tokens. | |||||
if (preg_match('/[._]/', $normalized_token)) { | |||||
return $normalized_token; | |||||
} | |||||
static $loaded; | static $loaded; | ||||
if ($loaded === null) { | if ($loaded === null) { | ||||
$root = dirname(phutil_get_library_root('phutil')); | $root = dirname(phutil_get_library_root('phutil')); | ||||
require_once $root.'/externals/porter-stemmer/src/Porter.php'; | require_once $root.'/externals/porter-stemmer/src/Porter.php'; | ||||
$loaded = true; | $loaded = true; | ||||
} | } | ||||
$stem = Porter::stem($normalized_token); | $stem = Porter::stem($normalized_token); | ||||
// If the stem is too short, it won't be a candidate for indexing. These | // If the stem is too short, it won't be a candidate for indexing. These | ||||
// tokens are also likely to be acronyms (like "DNS") rather than real | // tokens are also likely to be acronyms (like "DNS") rather than real | ||||
// English words. | // English words. | ||||
if (strlen($stem) < 3) { | if (strlen($stem) < 3) { | ||||
return $normalized_token; | return $normalized_token; | ||||
} | } | ||||
return $stem; | return $stem; | ||||
} | } | ||||
} | } |