1 | <?php |
---|
2 | |
---|
3 | /** |
---|
4 | * Tools to help with ASCII in UTF-8 |
---|
5 | * @version $Id: ascii.php,v 1.5 2006/10/16 20:38:12 harryf Exp $ |
---|
6 | * @package utf8 |
---|
7 | * @subpackage ascii |
---|
8 | */ |
---|
9 | |
---|
/**
 * Reports whether every byte of a string lies in the 7bit ASCII range
 * (0x00-0x7F).
 *
 * Handy as a cheap pre-check: when a string is plain ASCII the native
 * PHP string functions can be used instead of their UTF-8 aware
 * counterparts, e.g.;
 *
 * <code>
 * if ( utf8_is_ascii($someString) ) {
 *     // Plain ASCII - the native PHP function is safe
 *     $someString = strtolower($someString);
 * } else {
 *     $someString = utf8_strtolower($someString);
 * }
 * </code>
 *
 * An empty string contains no non-ASCII byte and therefore yields TRUE.
 *
 * @param string $str string to inspect
 * @return boolean TRUE if no byte outside 0x00-0x7F was found
 * @package utf8
 * @subpackage ascii
 * @see utf8_is_ascii_ctrl
 */
function utf8_is_ascii($str)
{
    // A single hit on a byte above 0x7F is enough to disqualify the string
    $foundNonAscii = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);

    return !$foundNonAscii;
}
---|
36 | |
---|
/**
 * Reports whether a string consists solely of printable 7bit ASCII bytes
 * (0x20-0x7E) plus tab (0x09), line feed (0x0A) and carriage return (0x0D).
 * Any other device control code, DEL (0x7F) or byte above 0x7F makes the
 * check fail. The device control codes are listed in the second table
 * here: http://www.w3schools.com/tags/ref_ascii.asp
 *
 * Note that an empty string yields FALSE, unlike utf8_is_ascii().
 *
 * @param string $str string to inspect
 * @return boolean TRUE if it's all ASCII without device control codes,
 *                 FALSE otherwise or when the string is empty
 * @package utf8
 * @subpackage ascii
 * @see utf8_is_ascii
 */
function utf8_is_ascii_ctrl($str)
{
    // Nothing to inspect: an empty string is reported as FALSE
    if (strlen($str) < 1)
    {
        return false;
    }

    // Look for a byte outside the ASCII range or a device control code
    $foundDisallowed = (preg_match('/[^\x09\x0A\x0D\x20-\x7E]/', $str) === 1);

    return !$foundDisallowed;
}
---|
56 | |
---|
/**
 * Strip out all non-7bit ASCII bytes.
 * If you need to transmit a string to a system which you know can only
 * support 7bit ASCII, you could use this function.
 *
 * Every byte above 0x7F is removed; bytes 0x00-0x7F (including control
 * codes) are kept in their original order.
 *
 * @param string $str string to filter
 * @return string with non ASCII bytes removed; an empty string if the
 *                input could not be processed
 * @package utf8
 * @subpackage ascii
 * @see utf8_strip_non_ascii_ctrl
 */
function utf8_strip_non_ascii($str)
{
    // One pass over the input. No "u" modifier on purpose: the pattern
    // works on raw bytes, so invalid UTF-8 is filtered as well.
    $stripped = preg_replace('/[^\x00-\x7F]+/', '', $str);

    // preg_replace() yields NULL on a PCRE error; keep the string return type
    return is_string($stripped) ? $stripped : '';
}
---|
84 | |
---|
/**
 * Strip out device control codes in the ASCII range
 * which are not permitted in XML. Note that this leaves
 * multi-byte characters untouched - it only removes device
 * control codes.
 *
 * Removed bytes: 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F.
 * Tab (0x09), line feed (0x0A) and carriage return (0x0D) are kept.
 *
 * @see http://hsivonen.iki.fi/producing-xml/#controlchar
 * @param string $str string to filter
 * @return string control codes removed; an empty string if the input
 *                could not be processed
 * @package utf8
 * @subpackage ascii
 */
function utf8_strip_ascii_ctrl($str)
{
    // Byte-wise match (no "u" modifier): bytes >= 0x80 never match the
    // class, so multi-byte UTF-8 sequences pass through unchanged.
    $stripped = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/', '', $str);

    // preg_replace() yields NULL on a PCRE error; keep the string return type
    return is_string($stripped) ? $stripped : '';
}
---|
111 | |
---|
/**
 * Strip out all non 7bit ASCII bytes and ASCII device control codes.
 * For a list of ASCII device control codes see the 2nd table here:
 * http://www.w3schools.com/tags/ref_ascii.asp
 *
 * Only printable ASCII (0x20-0x7E), tab (0x09), line feed (0x0A) and
 * carriage return (0x0D) survive; every other byte is removed.
 *
 * @param string $str string to filter
 * @return string with non ASCII bytes and device control codes removed;
 *                an empty string if the input could not be processed
 * @package utf8
 * @subpackage ascii
 * @see utf8_strip_non_ascii
 */
function utf8_strip_non_ascii_ctrl($str)
{
    // Byte-wise match (no "u" modifier) so invalid UTF-8 is filtered too
    $stripped = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/', '', $str);

    // preg_replace() yields NULL on a PCRE error; keep the string return type
    return is_string($stripped) ? $stripped : '';
}
---|
139 | |
---|
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents".
 * The purpose of this function is to replace characters commonly found in Latin
 * alphabets with something more or less equivalent from the ASCII range. This can
 * be useful for converting a UTF-8 string to something ready for a filename, for
 * example. Following the use of this function, you would probably also pass the
 * string through utf8_strip_non_ascii to clean out any other non-ASCII chars.
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0).
 *
 * Some characters map to two ASCII letters (e.g. "ß" to "ss", "ö" to "oe",
 * "þ" to "th"), so the result may be longer than the input.
 *
 * NOTE: the lookup tables below contain literal UTF-8 characters; this file
 * must be saved as UTF-8 or the replacements silently stop matching.
 *
 * For a more complete implementation of transliteration, see the utf8_to_ascii package
 * available from the phputf8 project downloads:
 * http://prdownloads.sourceforge.net/phputf8
 *
 * @param string $str  UTF-8 string
 * @param int    $case (optional) -1 lowercase only, +1 uppercase only, 0 both cases (default)
 * @return string accented chars replaced with ascii equivalents
 * @author Andreas Gohr <andi@splitbrain.org>
 * @package utf8
 * @subpackage ascii
 */
function utf8_accents_to_ascii($str, $case=0)
{
    // Tables are built once, on first use, and cached for later calls
    static $UTF8_LOWER_ACCENTS = null;
    static $UTF8_UPPER_ACCENTS = null;

    // Zero or negative: replace the lowercase accented letters
    if ($case <= 0)
    {
        if (is_null($UTF8_LOWER_ACCENTS))
        {
            $UTF8_LOWER_ACCENTS = array(
                'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
                'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
                'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
                'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
                'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
                'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
                'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
                'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
                'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
                'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
                'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
                'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
                'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
                'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
                'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
            );
        }

        $str = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $str);
    }

    // Zero or positive: replace the uppercase accented letters
    if ($case >= 0)
    {
        if (is_null($UTF8_UPPER_ACCENTS))
        {
            $UTF8_UPPER_ACCENTS = array(
                'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
                'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
                'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
                'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
                'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
                'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
                'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
                'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
                'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
                'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
                'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
                'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
                'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
                'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
                'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
            );
        }

        $str = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $str);
    }

    return $str;
}
---|