diff --git a/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php b/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php
--- a/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php
+++ b/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php
@@ -73,12 +73,8 @@
   }
 
   private function generateAnchor($level, $text) {
-    $anchor = strtolower($text);
-    $anchor = preg_replace('/[^a-z0-9]/', '-', $anchor);
-    $anchor = preg_replace('/--+/', '-', $anchor);
-    $anchor = trim($anchor, '-');
-    $anchor = substr($anchor, 0, 24);
-    $anchor = trim($anchor, '-');
+    $anchor = self::getAnchorNameFromHeaderText($text);
+
     $base = $anchor;
 
     $key = self::KEY_HEADER_TOC;
@@ -159,4 +155,43 @@
     return phutil_implode_html("\n", $toc);
   }
 
+  /**
+   * Build a URL fragment (anchor name) from the text of a header.
+   *
+   * The text is lowercased, each run of ASCII characters other than "a-z"
+   * and "0-9" is collapsed into a single "-", leading and trailing "-" are
+   * trimmed, the result is truncated to at most 32 glyphs, and a trailing
+   * minor word such as "-the" or "-of" is dropped.
+   *
+   * @param string Header text.
+   * @return string Anchor name derived from the header text.
+   */
+  public static function getAnchorNameFromHeaderText($text) {
+    $anchor = phutil_utf8_strtolower($text);
+
+    // Replace all latin characters which are not "a-z" or "0-9" with "-".
+    // Preserve other characters, since non-latin letters and emoji work
+    // fine in anchors. The first range must run through "/" (0x2F), the
+    // byte just before "0", so that "*+,-./" are replaced too.
+    $anchor = preg_replace('/[\x00-\x2F\x3A-\x60\x7B-\x7F]+/', '-', $anchor);
+    $anchor = trim($anchor, '-');
+
+    // Truncate the fragment to something reasonable.
+    $anchor = id(new PhutilUTF8StringTruncator())
+      ->setMaximumGlyphs(32)
+      ->setTerminator('')
+      ->truncateString($anchor);
+
+    // If the fragment is terminated by a word which "The U.S. Government
+    // Printing Office Style Manual" normally discourages capitalizing in
+    // titles, discard it. This is an arbitrary heuristic intended to avoid
+    // awkward hanging words in anchors.
+    $anchor = preg_replace(
+      '/-(a|an|the|at|by|for|in|of|on|per|to|up|and|as|but|if|or|nor)\z/',
+      '',
+      $anchor);
+
+    return $anchor;
+  }
+
 }