1 | <?php |
---|
2 | |
---|
/**
 * PCRE regular expressions for UTF-8. Note that this file is not actually
 * used by the rest of the library, but these regular expressions can be
 * useful to have available.
 * @version $Id: patterns.php,v 1.1 2006/02/25 14:20:02 harryf Exp $
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
---|
12 | |
---|
/**
 * PCRE pattern fragment that accepts only well-formed UTF-8.
 *
 * The fragment is anchored at both ends and repeats one group whose
 * alternatives each describe a single legal UTF-8 byte sequence, so a
 * subject matches only when it is made up entirely of such sequences (the
 * empty string included). Based on the W3 FAQ "Multilingual Forms", modified
 * so that the ASCII alternative spans the full 0x00-0x7F range, control
 * characters included.
 *
 * The string carries no delimiters or modifiers; presumably callers wrap it
 * themselves, e.g. '/' . $UTF8_VALID . '/' -- verify against actual usage.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
// Single-quoted literals: PHP leaves the \xNN escapes untouched, so it is
// PCRE that interprets them as byte values.
$UTF8_VALID = implode('', array(
    '^(',
    '[\x00-\x7F]',                         // ASCII, control characters included
    '|[\xC2-\xDF][\x80-\xBF]',             // 2-byte sequences, overlong leads left out
    '|\xE0[\xA0-\xBF][\x80-\xBF]',         // E0 lead: second byte limited to reject overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',  // ordinary 3-byte sequences
    '|\xED[\x80-\x9F][\x80-\xBF]',         // ED lead: second byte capped to skip surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}',      // F0 lead: planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}',          // F1-F3 leads: planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}',      // F4 lead: plane 16
    ')*$',
));
---|
31 | |
---|
/**
 * PCRE pattern fragment that matches one UTF-8 character at a time.
 *
 * Each alternative describes a single legal UTF-8 byte sequence and sits in
 * its own capture group, so the group that participates in a match shows
 * which form of sequence was found. The fragment is unanchored. Based on the
 * W3 FAQ "Multilingual Forms", modified so that the ASCII alternative spans
 * the full 0x00-0x7F range, control characters included.
 *
 * The string carries no delimiters or modifiers; presumably callers wrap it
 * themselves, e.g. '/' . $UTF8_MATCH . '/' -- verify against actual usage.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
// Single-quoted literals: PHP leaves the \xNN escapes untouched, so it is
// PCRE that interprets them as byte values.
$UTF8_MATCH = implode('', array(
    '([\x00-\x7F])',                         // group 1: ASCII, control characters included
    '|([\xC2-\xDF][\x80-\xBF])',             // group 2: 2-byte sequences, overlong leads left out
    '|(\xE0[\xA0-\xBF][\x80-\xBF])',         // group 3: E0 lead, overlongs rejected
    '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})',  // group 4: ordinary 3-byte sequences
    '|(\xED[\x80-\x9F][\x80-\xBF])',         // group 5: ED lead, surrogates skipped
    '|(\xF0[\x90-\xBF][\x80-\xBF]{2})',      // group 6: F0 lead, planes 1-3
    '|([\xF1-\xF3][\x80-\xBF]{3})',          // group 7: F1-F3 leads, planes 4-15
    '|(\xF4[\x80-\x8F][\x80-\xBF]{2})',      // group 8: F4 lead, plane 16
));
---|
49 | |
---|
/**
 * PCRE pattern fragment for locating bad bytes in a UTF-8 string.
 *
 * One outer group (group 1) lists every legal UTF-8 byte sequence and ends
 * with a catch-all alternative in its own nested group (group 2). Legal
 * sequences are consumed by the earlier alternatives; input that fits none
 * of them falls through to the catch-all, so a match in which group 2
 * participates marks an invalid byte. Based on the W3 FAQ "Multilingual
 * Forms", modified so that the ASCII alternative spans the full 0x00-0x7F
 * range, control characters included.
 *
 * The string carries no delimiters or modifiers; presumably callers wrap it
 * themselves, e.g. '/' . $UTF8_BAD . '/' -- verify against actual usage.
 * NOTE(review): the catch-all is `.`, which is one byte only when the pattern
 * runs without the `u` modifier -- confirm callers do not add it.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
// Single-quoted literals: PHP leaves the \xNN escapes untouched, so it is
// PCRE that interprets them as byte values.
$UTF8_BAD = implode('', array(
    '([\x00-\x7F]',                        // ASCII, control characters included
    '|[\xC2-\xDF][\x80-\xBF]',             // 2-byte sequences, overlong leads left out
    '|\xE0[\xA0-\xBF][\x80-\xBF]',         // E0 lead: second byte limited to reject overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}',  // ordinary 3-byte sequences
    '|\xED[\x80-\x9F][\x80-\xBF]',         // ED lead: second byte capped to skip surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}',      // F0 lead: planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}',          // F1-F3 leads: planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}',      // F4 lead: plane 16
    '|(.{1}))',                            // anything else: the invalid byte, captured as group 2
));
---|