source: trunk/web/punbb/include/utf8/utils/patterns.php @ 8

Last change on this file since 8 was 6, checked in by dj3c1t, 13 years ago

mise a jour du trunk

File size: 2.7 KB
RevLine 
<?php

/**
 * PCRE Regular expressions for UTF-8. Note this file is not actually used by
 * the rest of the library but these regular expressions can be useful to have
 * available.
 * @version $Id: patterns.php,v 1.1 2006/02/25 14:20:02 harryf Exp $
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */

/**
 * PCRE pattern fragment that matches only when the entire subject is
 * well-formed UTF-8 (the empty string included).
 *
 * Adapted from the W3C FAQ "Multilingual Forms"; the ASCII branch was widened
 * to the full 0x00-0x7F range so that control characters are accepted too.
 *
 * The value carries no delimiters and no modifiers, so it must be wrapped
 * before being handed to a preg_* function, for example
 * preg_match('/' . $UTF8_VALID . '/', $str). The alternatives are written as
 * raw byte ranges and are intended to be matched byte by byte, i.e. without
 * the "u" modifier.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_VALID = '^('
    . '[\x00-\x7F]'                         // ASCII (including control chars)
    . '|[\xC2-\xDF][\x80-\xBF]'             // Non-overlong 2-byte
    . '|\xE0[\xA0-\xBF][\x80-\xBF]'         // 3-byte, lead E0: excluding overlongs
    . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  // Straight 3-byte
    . '|\xED[\x80-\x9F][\x80-\xBF]'         // 3-byte, lead ED: excluding surrogates
    . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      // Planes 1-3
    . '|[\xF1-\xF3][\x80-\xBF]{3}'          // Planes 4-15
    . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      // Plane 16
    . ')*$';

/**
 * PCRE pattern fragment that matches one well-formed UTF-8 character at a time.
 *
 * Adapted from the W3C FAQ "Multilingual Forms"; the ASCII branch was widened
 * to the full 0x00-0x7F range so that control characters are accepted too.
 *
 * Each alternative sits in its own capture group, so the group that is set
 * tells which form the character took:
 *   1 = ASCII, 2 = 2-byte, 3 = 3-byte with lead E0, 4 = straight 3-byte,
 *   5 = 3-byte with lead ED, 6 = 4-byte with lead F0, 7 = 4-byte with lead
 *   F1-F3, 8 = 4-byte with lead F4.
 *
 * The value carries no delimiters and no modifiers, so it must be wrapped
 * before being handed to a preg_* function, for example
 * preg_match_all('/' . $UTF8_MATCH . '/', $str, $matches). The alternatives
 * are raw byte ranges intended to be matched byte by byte, i.e. without the
 * "u" modifier.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_MATCH = '([\x00-\x7F])'                 // ASCII (including control chars)
    . '|([\xC2-\xDF][\x80-\xBF])'             // Non-overlong 2-byte
    . '|(\xE0[\xA0-\xBF][\x80-\xBF])'         // 3-byte, lead E0: excluding overlongs
    . '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'  // Straight 3-byte
    . '|(\xED[\x80-\x9F][\x80-\xBF])'         // 3-byte, lead ED: excluding surrogates
    . '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'      // Planes 1-3
    . '|([\xF1-\xF3][\x80-\xBF]{3})'          // Planes 4-15
    . '|(\xF4[\x80-\x8F][\x80-\xBF]{2})';     // Plane 16

/**
 * PCRE pattern fragment for locating bad bytes in a UTF-8 string.
 *
 * Adapted from the W3C FAQ "Multilingual Forms"; the ASCII branch was widened
 * to the full 0x00-0x7F range so that control characters are accepted too.
 *
 * Group 1 wraps the whole alternation. Well-formed sequences are tried first
 * and consumed whole; only when none of them applies does the final branch
 * take a single unit of input into group 2. A match in which group 2 is set
 * therefore marks a byte that is not part of a well-formed sequence.
 *
 * The value carries no delimiters and no modifiers, so it must be wrapped
 * before being handed to a preg_* function, for example
 * preg_match_all('/' . $UTF8_BAD . '/', $str, $matches). The alternatives are
 * raw byte ranges intended to be matched byte by byte, i.e. without the "u"
 * modifier, which is also what makes ".{1}" consume exactly one byte.
 *
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @package utf8
 * @subpackage patterns
 */
$UTF8_BAD = '([\x00-\x7F]'                  // ASCII (including control chars)
    . '|[\xC2-\xDF][\x80-\xBF]'             // Non-overlong 2-byte
    . '|\xE0[\xA0-\xBF][\x80-\xBF]'         // 3-byte, lead E0: excluding overlongs
    . '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'  // Straight 3-byte
    . '|\xED[\x80-\x9F][\x80-\xBF]'         // 3-byte, lead ED: excluding surrogates
    . '|\xF0[\x90-\xBF][\x80-\xBF]{2}'      // Planes 1-3
    . '|[\xF1-\xF3][\x80-\xBF]{3}'          // Planes 4-15
    . '|\xF4[\x80-\x8F][\x80-\xBF]{2}'      // Plane 16
    . '|(.{1}))';                           // Invalid byte
Note: See TracBrowser for help on using the repository browser.