Context Navigation

source: branches/rsr.v5.1.1/web/punbb/include/utf8/utils/bad.php @ 6

Last change on this file since 6 was 3, checked in by dj3c1t, 13 years ago
passage a Fluxbb 1.4.7
File size: 11.8 KB

Line
1	<?php
2
3	/**
4	* @version $Id: bad.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
5	* Tools for locating / replacing bad bytes in UTF-8 strings
6	* The Original Code is Mozilla Communicator client code.
7	* The Initial Developer of the Original Code is
8	* Netscape Communications Corporation.
9	* Portions created by the Initial Developer are Copyright (C) 1998
10	* the Initial Developer. All Rights Reserved.
11	* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
12	* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
13	* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
14	* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
15	* @see http://hsivonen.iki.fi/php-utf8/
16	* @package utf8
17	* @subpackage bad
18	* @see utf8_is_valid
19	*/
20
/**
* Locates the first bad byte in a UTF-8 string, returning its byte index
* in the string.
*
* The string is walked one well-formed UTF-8 sequence at a time using a PCRE
* pattern derived from the W3C FAQ "Multilingual Forms" (modified to accept
* the full ASCII range, including control chars). The first byte that does
* not start a well-formed sequence is reported.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to inspect
* @return mixed integer byte index of the first bad byte, or FALSE if none found
* @package utf8
* @subpackage bad
*/
function utf8_bad_find($str)
{
    // Group 1 wraps every alternative; group 2 only participates in the match
    // when the fall-through "any single byte" branch is taken, i.e. when the
    // byte at the current position does not start a well-formed sequence.
    $UTF8_BAD =
        '([\x00-\x7F]'.                        // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.             // Non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // Excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // Straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.         // Excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // Planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.          // Planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // Plane 16
        '|(.{1}))';                            // Invalid byte

    $str = (string) $str;
    $len = strlen($str);
    $pos = 0;

    // \G pins each attempt to the current offset, so the subject never has to
    // be re-copied with substr(). The "s" modifier lets "." consume any byte,
    // whatever newline convention PCRE was built with.
    while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/Ss', $str, $matches, 0, $pos))
    {
        if (isset($matches[2]))
            return $pos;

        $pos += strlen($matches[0]);
    }

    return false;
}
61
/**
* Locates all bad bytes in a UTF-8 string and returns a list of their
* byte indexes in the string.
*
* The string is walked one well-formed UTF-8 sequence at a time using a PCRE
* pattern derived from the W3C FAQ "Multilingual Forms" (modified to accept
* the full ASCII range, including control chars). Every byte that does not
* start a well-formed sequence is recorded individually.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to inspect
* @return mixed array of integer byte indexes, or FALSE if no bad byte found
* @package utf8
* @subpackage bad
*/
function utf8_bad_findall($str)
{
    // Group 2 only participates in the match when the fall-through
    // "any single byte" branch is taken, which marks a bad byte.
    $UTF8_BAD =
        '([\x00-\x7F]'.                        // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.             // Non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // Excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // Straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.         // Excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // Planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.          // Planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // Plane 16
        '|(.{1}))';                            // Invalid byte

    $str = (string) $str;
    $len = strlen($str);
    $pos = 0;
    $badList = array();

    // Scan in place with an offset instead of chopping the subject with
    // substr() on every iteration (which made the loop quadratic).
    while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/Ss', $str, $matches, 0, $pos))
    {
        if (isset($matches[2]))
            $badList[] = $pos;

        $pos += strlen($matches[0]);
    }

    if (count($badList) > 0)
        return $badList;

    return false;
}
105
/**
* Strips out any bad bytes from a UTF-8 string and returns the rest.
*
* The string is walked one well-formed UTF-8 sequence at a time using a PCRE
* pattern derived from the W3C FAQ "Multilingual Forms" (modified to accept
* the full ASCII range, including control chars). Well-formed sequences are
* copied to the result; bytes that do not start one are dropped.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to clean
* @return string the input with every bad byte removed
* @package utf8
* @subpackage bad
*/
function utf8_bad_strip($str)
{
    // Group 2 only participates in the match when the fall-through
    // "any single byte" branch is taken, which marks a bad byte.
    $UTF8_BAD =
        '([\x00-\x7F]'.                        // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.             // Non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // Excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // Straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.         // Excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // Planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.          // Planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // Plane 16
        '|(.{1}))';                            // Invalid byte

    $str = (string) $str;
    $len = strlen($str);
    $pos = 0;

    // Accumulate into a plain string rather than echoing into an output
    // buffer: no interaction with the caller's output buffering state.
    $result = '';

    while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/Ss', $str, $matches, 0, $pos))
    {
        if (!isset($matches[2]))
            $result .= $matches[0];

        $pos += strlen($matches[0]);
    }

    return $result;
}
145
/**
* Replaces bad bytes with an alternative character. An ASCII character is
* recommended as the replacement.
*
* The string is walked one well-formed UTF-8 sequence at a time using a PCRE
* pattern derived from the W3C FAQ "Multilingual Forms" (modified to accept
* the full ASCII range, including control chars). Well-formed sequences are
* copied to the result; each byte that does not start one is substituted by
* $replace (one replacement per bad byte).
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to search
* @param string $replace string to replace bad bytes with (defaults to '?') - use ASCII
* @return string the input with every bad byte replaced
* @package utf8
* @subpackage bad
*/
function utf8_bad_replace($str, $replace='?')
{
    // Group 2 only participates in the match when the fall-through
    // "any single byte" branch is taken, which marks a bad byte.
    $UTF8_BAD =
        '([\x00-\x7F]'.                        // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.             // Non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.         // Excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.  // Straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.         // Excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.      // Planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.          // Planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.      // Plane 16
        '|(.{1}))';                            // Invalid byte

    $str = (string) $str;
    $len = strlen($str);
    $pos = 0;

    // Accumulate into a plain string rather than echoing into an output
    // buffer: no interaction with the caller's output buffering state.
    $result = '';

    while ($pos < $len && preg_match('/\G'.$UTF8_BAD.'/Ss', $str, $matches, 0, $pos))
    {
        if (!isset($matches[2]))
            $result .= $matches[0];
        else
            $result .= $replace;

        $pos += strlen($matches[0]);
    }

    return $result;
}
189
/**
* Return code from utf8_bad_identify() when the lead octet of a five octet
* sequence (bit pattern 111110xx) is detected.
* Note: five octet sequences were permitted by the original UTF-8 definition
* (RFC 2279) but can only encode non-shortest forms or codepoints beyond
* U+10FFFF, so they are not well-formed UTF-8 under RFC 3629 / Unicode.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET', 1);

/**
* Return code from utf8_bad_identify() when the lead octet of a six octet
* sequence (bit pattern 1111110x) is detected.
* Note: as with five octet sequences, these were permitted by RFC 2279 but
* are not well-formed UTF-8 under RFC 3629 / Unicode.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET', 2);

/**
* Return code from utf8_bad_identify().
* Invalid octet for use as start of multi-byte UTF-8 sequence: the octet is
* neither US-ASCII nor a recognised lead octet (e.g. a stray continuation
* octet, 0xFE or 0xFF).
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID', 3);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.1, non-shortest form is illegal (the codepoint was encoded
* with more octets than it needs).
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT', 4);

/**
* Return code from utf8_bad_identify().
* From Unicode 3.2, surrogate characters (U+D800 - U+DFFF) are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE', 5);

/**
* Return code from utf8_bad_identify().
* Codepoints outside the Unicode range (above U+10FFFF) are illegal
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE', 6);

/**
* Return code from utf8_bad_identify().
* Incomplete multi-octet sequence: a continuation octet was expected but
* either a different octet or the end of the string was found.
* Note: this is kind of a "catch-all"
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE', 7);
255
/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code for the first problem found and reports its position through
* the by-reference $i argument.
*
* Position reported in $i:
* - UTF8_BAD_5OCTET, UTF8_BAD_6OCTET, UTF8_BAD_SEQID: index of the offending
*   lead octet
* - UTF8_BAD_NONSHORT, UTF8_BAD_SURROGATE, UTF8_BAD_UNIOUTRANGE: index of the
*   last octet of the offending sequence
* - UTF8_BAD_SEQINCOMPLETE: index of the last octet read before the sequence
*   broke off
* - no problem found: NULL
*
* @author <hsivonen@iki.fi>
* @param string $str UTF-8 encoded string
* @param int $i (by reference) receives the byte index described above
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
*/
function utf8_bad_identify($str, &$i)
{
    $mState = 0; // Cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4 = 0;  // Cached Unicode character
    $mBytes = 1; // Cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++)
    {
        // Square-bracket offset access: the legacy $str{$i} form was
        // deprecated in PHP 7.4 and is a parse error from PHP 8.0.
        $in = ord($str[$i]);

        if ($mState == 0)
        {
            // When mState is zero we expect either a US-ASCII character or a multi-octet sequence.
            if (0 == (0x80 & $in))
            {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            }
            else if (0xC0 == (0xE0 & $in))
            {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            }
            else if (0xE0 == (0xF0 & $in))
            {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            }
            else if (0xF0 == (0xF8 & $in))
            {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            }
            else if (0xF8 == (0xFC & $in))
            {
                /* First octet of 5 octet sequence.
                *
                * This is illegal because the encoded codepoint must be either
                * (a) not the shortest form or
                * (b) outside the Unicode range of 0-0x10FFFF.
                */
                return UTF8_BAD_5OCTET;
            }
            else if (0xFC == (0xFE & $in))
            {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;
            }
            else
            {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;
            }
        }
        else
        {
            // When mState is non-zero, we expect a continuation of the multi-octet sequence
            if (0x80 == (0xC0 & $in))
            {
                // Legal continuation: merge its 6 payload bits into the codepoint.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x3F) << $shift;

                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState)
                {
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)))
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal.
                    // Written as a range check (equivalent to masking with
                    // 0xFFFFF800) so no literal exceeds the 32-bit int range.
                    if ($mUcs4 >= 0xD800 && $mUcs4 <= 0xDFFF)
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    if ($mUcs4 > 0x10FFFF)
                        return UTF8_BAD_UNIOUTRANGE;

                    // Initialize UTF8 cache
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                }
            }
            else
            {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence: step back so $i points at
                // the last octet that belonged to the unfinished sequence.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    // Incomplete multi-octet sequence at the end of the string
    if ($mState != 0)
    {
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = null;
    return false;
}
389
/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is.
*
* An unrecognised code raises an E_USER_WARNING and yields FALSE.
*
* @param int $code return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
function utf8_bad_explain($code)
{
    // Loose (==) comparisons on purpose: same matching rules as a switch.
    if ($code == UTF8_BAD_5OCTET)
        return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';

    if ($code == UTF8_BAD_6OCTET)
        return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';

    if ($code == UTF8_BAD_SEQID)
        return 'Invalid octet for use as start of multi-byte UTF-8 sequence';

    if ($code == UTF8_BAD_NONSHORT)
        return 'From Unicode 3.1, non-shortest form is illegal';

    if ($code == UTF8_BAD_SURROGATE)
        return 'From Unicode 3.2, surrogate characters are illegal';

    if ($code == UTF8_BAD_UNIOUTRANGE)
        return 'Codepoints outside the Unicode range are illegal';

    if ($code == UTF8_BAD_SEQINCOMPLETE)
        return 'Incomplete multi-octet sequence';

    // None of the known codes matched.
    trigger_error('Unknown error code: '.$code, E_USER_WARNING);

    return false;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: