diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php
--- a/src/utils/__tests__/PhutilUTF8TestCase.php
+++ b/src/utils/__tests__/PhutilUTF8TestCase.php
@@ -520,6 +520,9 @@
       // This isn't valid.
       "\xEF\xBF\xC0" => array(false, false, 'Invalid, byte range.'),
 
+      // This is an invalid nonminimal representation.
+      "\xF0\x81\x80\x80" => array(false, false, 'Nonminimal 4-byte characer.'),
+
       // This is the first character above BMP, U+10000.
       "\xF0\x90\x80\x80" => array(true, false, 'U+10000'),
       "\xF0\x9D\x84\x9E" => array(true, false, 'gclef'),
@@ -538,11 +541,19 @@
     foreach ($tests as $input => $test) {
       list($expect_utf8, $expect_bmp, $test_name) = $test;
 
+      // Depending on what's installed on the system, this may use an
+      // extension.
       $this->assertEqual(
         $expect_utf8,
         phutil_is_utf8($input),
         pht('is_utf(%s)', $test_name));
 
+      // Also test this against the pure PHP implementation, explicitly.
+      $this->assertEqual(
+        $expect_utf8,
+        phutil_is_utf8_slowly($input),
+        pht('is_utf_slowly(%s)', $test_name));
+
       $this->assertEqual(
         $expect_bmp,
         phutil_is_utf8_with_only_bmp_characters($input),
diff --git a/src/utils/utf8.php b/src/utils/utf8.php
--- a/src/utils/utf8.php
+++ b/src/utils/utf8.php
@@ -22,6 +22,9 @@
   // TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
   // ever shows up in profiles?
 
+  // NOTE: Overlong 3-byte and 4-byte representations incorrectly survive
+  // this function.
+
   $result = array();
 
   $regex =
@@ -58,13 +61,57 @@
  * @return bool   True if the string is valid UTF-8 with only BMP characters.
  */
 function phutil_is_utf8_with_only_bmp_characters($string) {
+  return phutil_is_utf8_slowly($string, $only_bmp = true);
+}
+
+
+/**
+ * Check whether a string is valid UTF-8, using mbstring when it is loaded.
+ *
+ * @param string  Some string which may or may not be valid UTF-8.
+ * @return bool   True if the string is valid UTF-8.
+ * @group utf8
+ */
+function phutil_is_utf8($string) {
+  // Without mbstring, fall back to the very slow pure PHP implementation.
+  if (!function_exists('mb_check_encoding')) {
+    return phutil_is_utf8_slowly($string);
+  }
+
+  return mb_check_encoding($string, 'UTF-8');
+}
 
-  // NOTE: By default, PCRE segfaults on patterns like the one we would need
-  // to use here at very small input sizes, at least on some systems (like
-  // OS X). This is apparently because the internal implementation is recursive
-  // and it blows the stack. See <https://bugs.php.net/bug.php?id=45735> for
-  // some discussion. Since the input limit is extremely low (less than 50KB on
-  // my system), do this check very very slowly in PHP instead.
+
+/**
+ * Determine if a string is valid UTF-8, slowly.
+ *
+ * This works on any system, but has very poor performance.
+ *
+ * You should call @{function:phutil_is_utf8} instead of this function, as
+ * that function can use more performant mechanisms if they are available on
+ * the system.
+ *
+ * @param string  Some string which may or may not be valid UTF-8.
+ * @param bool    True to require all characters be part of the basic
+ *                multilingual plane (no more than 3 bytes long).
+ * @return bool   True if the string is valid UTF-8.
+ */
+function phutil_is_utf8_slowly($string, $only_bmp = false) {
+  // First, check the common case of normal ASCII strings. We're fine if
+  // the string contains no bytes larger than 127.
+  if (preg_match('/^[\x01-\x7F]+\z/', $string)) {
+    return true;
+  }
+
+  // NOTE: In the past, we used a large regular expression in the form of
+  // '(x|y|z)+' to match UTF8 strings. However, PCRE can segfault on patterns
+  // like this at relatively small input sizes, at least on some systems
+  // (observed on OSX and Windows). This is apparently because the internal
+  // implementation is recursive and it blows the stack.
+
+  // See <https://bugs.php.net/bug.php?id=45735> for some discussion. Since the
+  // input limit is extremely low (less than 50KB on my system), do this check
+  // very very slowly in PHP instead. See also T5316.
 
   $len = strlen($string);
   for ($ii = 0; $ii < $len; $ii++) {
@@ -120,6 +167,58 @@
         }
       }
       return false;
+    } else if (!$only_bmp) {
+      if ($chr > 0xF0 && $chr <= 0xF4) {
+        ++$ii;
+        if ($ii >= $len) {
+          return false;
+        }
+        $chr = ord($string[$ii]);
+        if ($chr >= 0x80 && $chr <= 0xBF) {
+          ++$ii;
+          if ($ii >= $len) {
+            return false;
+          }
+          $chr = ord($string[$ii]);
+          if ($chr >= 0x80 && $chr <= 0xBF) {
+            ++$ii;
+            if ($ii >= $len) {
+              return false;
+            }
+            $chr = ord($string[$ii]);
+            if ($chr >= 0x80 && $chr <= 0xBF) {
+              continue;
+            }
+          }
+        }
+      } else if ($chr == 0xF0) {
+        ++$ii;
+        if ($ii >= $len) {
+          return false;
+        }
+        $chr = ord($string[$ii]);
+
+        // NOTE: As above, this range starts at 0x90, not 0x80. The values
+        // 0x80-0x8F are not minimal representations.
+
+        if ($chr >= 0x90 && $chr <= 0xBF) {
+          ++$ii;
+          if ($ii >= $len) {
+            return false;
+          }
+          $chr = ord($string[$ii]);
+          if ($chr >= 0x80 && $chr <= 0xBF) {
+            ++$ii;
+            if ($ii >= $len) {
+              return false;
+            }
+            $chr = ord($string[$ii]);
+            if ($chr >= 0x80 && $chr <= 0xBF) {
+              continue;
+            }
+          }
+        }
+      }
     }
 
     return false;
@@ -130,34 +229,6 @@
 
 
 /**
- * Determine if a string is valid UTF-8.
- *
- * @param string  Some string which may or may not be valid UTF-8.
- * @return bool    True if the string is valid UTF-8.
- * @group utf8
- */
-function phutil_is_utf8($string) {
-  if (function_exists('mb_check_encoding')) {
-    // If mbstring is available, this is significantly faster than using PHP
-    // regexps.
-    return mb_check_encoding($string, 'UTF-8');
-  }
-
-  // NOTE: This incorrectly accepts characters like \xE0\x80\x80, but should
-  // not. The MB version works correctly.
-
-  $regex =
-    "/^(".
-      "[\x01-\x7F]+".
-    "|([\xC2-\xDF][\x80-\xBF])".
-    "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])".
-    "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))*\$/";
-
-  return (bool)preg_match($regex, $string);
-}
-
-
-/**
  * Find the character length of a UTF-8 string.
  *
  * @param string A valid utf-8 string.