Changeset View
Changeset View
Standalone View
Standalone View
src/utils/utf8.php
Show First 20 Lines • Show All 322 Lines • ▼ Show 20 Lines | foreach ($display_glyphs as $display_glyph) { | ||||
} | } | ||||
} | } | ||||
return $len; | return $len; | ||||
} | } | ||||
/**
 * Determine whether a string contains at least one Chinese, Japanese, or
 * Korean ideograph.
 *
 * Most languages separate words with spaces, but text written in these
 * languages does not, so callers may need to handle it differently.
 *
 * Only the codepoint blocks listed in the table inside the function are
 * recognized; a codepoint outside all of them is not treated as CJK.
 *
 * @param string String to examine, in UTF8.
 * @return bool True if any codepoint of the string falls inside one of the
 *              listed CJK blocks, false otherwise (including for a string
 *              with no codepoints).
 */
function phutil_utf8_is_cjk($string) {
  // Inclusive (first, last) codepoint bounds of every block treated as CJK.
  static $blocks = array(
    // CJK Unified Ideographs
    array(0x4E00, 0x9FFF),
    // CJK Unified Ideographs Extension A
    array(0x3400, 0x4DBF),
    // CJK Unified Ideographs Extension B
    array(0x20000, 0x2A6DF),
    // CJK Unified Ideographs Extension C
    array(0x2A700, 0x2B73F),
    // CJK Unified Ideographs Extension D
    array(0x2B740, 0x2B81F),
    // CJK Unified Ideographs Extension E
    array(0x2B820, 0x2CEAF),
    // CJK Unified Ideographs Extension F
    array(0x2CEB0, 0x2EBEF),
    // CJK Compatibility Ideographs
    array(0xF900, 0xFAFF),
  );

  foreach (phutil_utf8v_codepoints($string) as $value) {
    foreach ($blocks as $block) {
      list($first, $last) = $block;

      // A single codepoint inside any block is enough to decide.
      if ($value >= $first && $value <= $last) {
        return true;
      }
    }
  }

  return false;
}
/** | |||||
* Split a UTF-8 string into an array of characters. Combining characters are | * Split a UTF-8 string into an array of characters. Combining characters are | ||||
* also split. | * also split. | ||||
* | * | ||||
* @param string A valid utf-8 string. | * @param string A valid utf-8 string. | ||||
* @param int|null Stop processing after examining this many bytes. | * @param int|null Stop processing after examining this many bytes. | ||||
* @return list A list of characters in the string. | * @return list A list of characters in the string. | ||||
*/ | */ | ||||
function phutil_utf8v($string, $byte_limit = null) { | function phutil_utf8v($string, $byte_limit = null) { | ||||
▲ Show 20 Lines • Show All 503 Lines • Show Last 20 Lines |
Is this a function that will need to change all the time? I have no idea how stable UTF8 is for CJK stuff.