<?php

/**
* PCRE Regular expressions for UTF-8. Note this file is not actually used by
* the rest of the library but these regular expressions can be useful to have
* available.
*
* All patterns below are plain pattern bodies WITHOUT delimiters or modifiers;
* wrap them yourself before use, e.g. preg_match('/'.$UTF8_VALID.'/', $str).
* They describe UTF-8 at the byte level (\xNN escapes are resolved by PCRE,
* not by PHP, because the strings are single-quoted), so they must NOT be
* combined with the "u" modifier.
*
* @version $Id: patterns.php,v 1.1 2006/02/25 14:20:02 harryf Exp $
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
* @subpackage patterns
*/

/**
* PCRE Pattern to check a UTF-8 string is valid
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
*
* The pattern is anchored at both ends and repeats a single capture group
* zero or more times, so the empty string is accepted and group 1 (when
* set) holds only the last character matched.
* NOTE(review): the repeated group may hit PCRE backtrack/recursion limits
* on very long subjects - check preg_last_error() when preg_match() returns
* false.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
* @subpackage patterns
*/
$UTF8_VALID = '^('.
    '[\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.              # Non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.          # Excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   # Straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.          # Excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       # Planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.           # Planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       # Plane 16
    ')*$';

/**
* PCRE Pattern to match single UTF-8 characters
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
*
* The pattern is unanchored and every alternative sits in its own capture
* group (1 to 8, in the order listed), so the populated group tells which
* class of sequence was matched. Bytes that do not form a valid sequence
* are simply skipped by the regex engine.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
* @subpackage patterns
*/
$UTF8_MATCH =
    '([\x00-\x7F])'.                          # ASCII (including control chars)
    '|([\xC2-\xDF][\x80-\xBF])'.              # Non-overlong 2-byte
    '|(\xE0[\xA0-\xBF][\x80-\xBF])'.          # Excluding overlongs
    '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'.   # Straight 3-byte
    '|(\xED[\x80-\x9F][\x80-\xBF])'.          # Excluding surrogates
    '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'.       # Planes 1-3
    '|([\xF1-\xF3][\x80-\xBF]{3})'.           # Planes 4-15
    '|(\xF4[\x80-\x8F][\x80-\xBF]{2})';       # Plane 16

/**
* PCRE Pattern to locate bad bytes in a UTF-8 string
* Comes from W3 FAQ: Multilingual Forms
* Note: modified to include full ASCII range including control chars
*
* Group 1 wraps the whole alternation and is set on every match. The final
* alternative is tried only after all valid sequences have failed and is
* the only one with its own group: group 2 is set exactly when the match is
* a single invalid byte. A newline never reaches that last branch because
* the ASCII branch consumes it first.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @package utf8
* @subpackage patterns
*/
$UTF8_BAD =
    '([\x00-\x7F]'.                         # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.              # Non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.          # Excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   # Straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.          # Excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       # Planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.           # Planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       # Plane 16
    '|(.{1}))';                             # Invalid byte
---|