Context Navigation

source: branches/rsr.v5.1.dev/web/punbb/include/utf8/utils/bad.php @ 3

Last change on this file since 3 was 3, checked in by dj3c1t, 13 years ago
passage a Fluxbb 1.4.7
File size: 11.8 KB

Rev	Line
[3]	1	<?php
	2
	3	/**
	4	* @version $Id: bad.php,v 1.2 2006/02/26 13:20:44 harryf Exp $
	5	* Tools for locating / replacing bad bytes in UTF-8 strings
	6	* The Original Code is Mozilla Communicator client code.
	7	* The Initial Developer of the Original Code is
	8	* Netscape Communications Corporation.
	9	* Portions created by the Initial Developer are Copyright (C) 1998
	10	* the Initial Developer. All Rights Reserved.
	11	* Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
	12	* Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
	13	* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
	14	* @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
	15	* @see http://hsivonen.iki.fi/php-utf8/
	16	* @package utf8
	17	* @subpackage bad
	18	* @see utf8_is_valid
	19	*/
	20
	21	/**
	22	* Locates the first bad byte in a UTF-8 string returning it's
	23	* byte index in the string
	24	* PCRE Pattern to locate bad bytes in a UTF-8 string
	25	* Comes from W3 FAQ: Multilingual Forms
	26	* Note: modified to include full ASCII range including control chars
	27	* @see http://www.w3.org/International/questions/qa-forms-utf-8
	28	* @param string
	29	* @return mixed integer byte index or FALSE if no bad found
	30	* @package utf8
	31	* @subpackage bad
	32	*/
	33	function utf8_bad_find($str)
	34	{
	35	$UTF8_BAD =
	36	'([\x00-\x7F]'. # ASCII (including control chars)
	37	'\|[\xC2-\xDF][\x80-\xBF]'. # Non-overlong 2-byte
	38	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # Excluding overlongs
	39	'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # Straight 3-byte
	40	'\|\xED[\x80-\x9F][\x80-\xBF]'. # Excluding surrogates
	41	'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # Planes 1-3
	42	'\|[\xF1-\xF3][\x80-\xBF]{3}'. # Planes 4-15
	43	'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # Plane 16
	44	'\|(.{1}))'; # Invalid byte
	45	$pos = 0;
	46	$badList = array();
	47
	48	while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches))
	49	{
	50	$bytes = strlen($matches[0]);
	51
	52	if (isset($matches[2]))
	53	return $pos;
	54
	55	$pos += $bytes;
	56	$str = substr($str,$bytes);
	57	}
	58
	59	return false;
	60	}
	61
	62	/**
	63	* Locates all bad bytes in a UTF-8 string and returns a list of their
	64	* byte index in the string
	65	* PCRE Pattern to locate bad bytes in a UTF-8 string
	66	* Comes from W3 FAQ: Multilingual Forms
	67	* Note: modified to include full ASCII range including control chars
	68	* @see http://www.w3.org/International/questions/qa-forms-utf-8
	69	* @param string
	70	* @return mixed array of integers or FALSE if no bad found
	71	* @package utf8
	72	* @subpackage bad
	73	*/
	74	function utf8_bad_findall($str)
	75	{
	76	$UTF8_BAD =
	77	'([\x00-\x7F]'. # ASCII (including control chars)
	78	'\|[\xC2-\xDF][\x80-\xBF]'. # Non-overlong 2-byte
	79	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # Excluding overlongs
	80	'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # Straight 3-byte
	81	'\|\xED[\x80-\x9F][\x80-\xBF]'. # Excluding surrogates
	82	'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # Planes 1-3
	83	'\|[\xF1-\xF3][\x80-\xBF]{3}'. # Planes 4-15
	84	'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # Plane 16
	85	'\|(.{1}))'; # Invalid byte
	86	$pos = 0;
	87	$badList = array();
	88
	89	while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches))
	90	{
	91	$bytes = strlen($matches[0]);
	92
	93	if (isset($matches[2]))
	94	$badList[] = $pos;
	95
	96	$pos += $bytes;
	97	$str = substr($str,$bytes);
	98	}
	99
	100	if (count($badList) > 0)
	101	return $badList;
	102
	103	return false;
	104	}
	105
	106	/**
	107	* Strips out any bad bytes from a UTF-8 string and returns the rest
	108	* PCRE Pattern to locate bad bytes in a UTF-8 string
	109	* Comes from W3 FAQ: Multilingual Forms
	110	* Note: modified to include full ASCII range including control chars
	111	* @see http://www.w3.org/International/questions/qa-forms-utf-8
	112	* @param string
	113	* @return string
	114	* @package utf8
	115	* @subpackage bad
	116	*/
	117	function utf8_bad_strip($str)
	118	{
	119	$UTF8_BAD =
	120	'([\x00-\x7F]'. # ASCII (including control chars)
	121	'\|[\xC2-\xDF][\x80-\xBF]'. # Non-overlong 2-byte
	122	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # Excluding overlongs
	123	'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # Straight 3-byte
	124	'\|\xED[\x80-\x9F][\x80-\xBF]'. # Excluding surrogates
	125	'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # Planes 1-3
	126	'\|[\xF1-\xF3][\x80-\xBF]{3}'. # Planes 4-15
	127	'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # Plane 16
	128	'\|(.{1}))'; # Invalid byte
	129
	130	ob_start();
	131
	132	while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches))
	133	{
	134	if (!isset($matches[2]))
	135	echo $matches[0];
	136
	137	$str = substr($str, strlen($matches[0]));
	138	}
	139
	140	$result = ob_get_contents();
	141	ob_end_clean();
	142
	143	return $result;
	144	}
	145
	146	/**
	147	* Replace bad bytes with an alternative character - ASCII character
	148	* recommended is replacement char
	149	* PCRE Pattern to locate bad bytes in a UTF-8 string
	150	* Comes from W3 FAQ: Multilingual Forms
	151	* Note: modified to include full ASCII range including control chars
	152	* @see http://www.w3.org/International/questions/qa-forms-utf-8
	153	* @param string to search
	154	* @param string to replace bad bytes with (defaults to '?') - use ASCII
	155	* @return string
	156	* @package utf8
	157	* @subpackage bad
	158	*/
	159	function utf8_bad_replace($str, $replace='?')
	160	{
	161	$UTF8_BAD =
	162	'([\x00-\x7F]'. # ASCII (including control chars)
	163	'\|[\xC2-\xDF][\x80-\xBF]'. # Non-overlong 2-byte
	164	'\|\xE0[\xA0-\xBF][\x80-\xBF]'. # Excluding overlongs
	165	'\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # Straight 3-byte
	166	'\|\xED[\x80-\x9F][\x80-\xBF]'. # Excluding surrogates
	167	'\|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # Planes 1-3
	168	'\|[\xF1-\xF3][\x80-\xBF]{3}'. # Planes 4-15
	169	'\|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # Plane 16
	170	'\|(.{1}))'; # Invalid byte
	171
	172	ob_start();
	173
	174	while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches))
	175	{
	176	if (!isset($matches[2]))
	177	echo $matches[0];
	178	else
	179	echo $replace;
	180
	181	$str = substr($str, strlen($matches[0]));
	182	}
	183
	184	$result = ob_get_contents();
	185	ob_end_clean();
	186
	187	return $result;
	188	}
	189
	190	/**
	191	* Return code from utf8_bad_identify() when a five octet sequence is detected.
	192	* Note: 5 octets sequences are valid UTF-8 but are not supported by Unicode so
	193	* do not represent a useful character
	194	* @see utf8_bad_identify
	195	* @package utf8
	196	* @subpackage bad
	197	*/
	198	define('UTF8_BAD_5OCTET', 1);
	199
	200	/**
	201	* Return code from utf8_bad_identify() when a six octet sequence is detected.
	202	* Note: 6 octets sequences are valid UTF-8 but are not supported by Unicode so
	203	* do not represent a useful character
	204	* @see utf8_bad_identify
	205	* @package utf8
	206	* @subpackage bad
	207	*/
	208	define('UTF8_BAD_6OCTET', 2);
	209
	210	/**
	211	* Return code from utf8_bad_identify().
	212	* Invalid octet for use as start of multi-byte UTF-8 sequence
	213	* @see utf8_bad_identify
	214	* @package utf8
	215	* @subpackage bad
	216	*/
	217	define('UTF8_BAD_SEQID', 3);
	218
	219	/**
	220	* Return code from utf8_bad_identify().
	221	* From Unicode 3.1, non-shortest form is illegal
	222	* @see utf8_bad_identify
	223	* @package utf8
	224	* @subpackage bad
	225	*/
	226	define('UTF8_BAD_NONSHORT', 4);
	227
	228	/**
	229	* Return code from utf8_bad_identify().
	230	* From Unicode 3.2, surrogate characters are illegal
	231	* @see utf8_bad_identify
	232	* @package utf8
	233	* @subpackage bad
	234	*/
	235	define('UTF8_BAD_SURROGATE', 5);
	236
	237	/**
	238	* Return code from utf8_bad_identify().
	239	* Codepoints outside the Unicode range are illegal
	240	* @see utf8_bad_identify
	241	* @package utf8
	242	* @subpackage bad
	243	*/
	244	define('UTF8_BAD_UNIOUTRANGE', 6);
	245
	246	/**
	247	* Return code from utf8_bad_identify().
	248	* Incomplete multi-octet sequence
	249	* Note: this is kind of a "catch-all"
	250	* @see utf8_bad_identify
	251	* @package utf8
	252	* @subpackage bad
	253	*/
	254	define('UTF8_BAD_SEQINCOMPLETE', 7);
	255
	256	/**
	257	* Reports on the type of bad byte found in a UTF-8 string. Returns a
	258	* status code on the first bad byte found
	259	* @author <hsivonen@iki.fi>
	260	* @param string UTF-8 encoded string
	261	* @return mixed integer constant describing problem or FALSE if valid UTF-8
	262	* @see utf8_bad_explain
	263	* @see http://hsivonen.iki.fi/php-utf8/
	264	* @package utf8
	265	* @subpackage bad
	266	*/
	267	function utf8_bad_identify($str, &$i)
	268	{
	269	$mState = 0; // Cached expected number of octets after the current octet
	270	// until the beginning of the next UTF8 character sequence
	271	$mUcs4 = 0; // Cached Unicode character
	272	$mBytes = 1; // Cached expected number of octets in the current sequence
	273
	274	$len = strlen($str);
	275
	276	for($i=0; $i < $len; $i++)
	277	{
	278	$in = ord($str{$i});
	279
	280	if ( $mState == 0)
	281	{
	282	// When mState is zero we expect either a US-ASCII character or a multi-octet sequence.
	283	if (0 == (0x80 & ($in)))
	284	{
	285	// US-ASCII, pass straight through.
	286	$mBytes = 1;
	287	}
	288	else if (0xC0 == (0xE0 & ($in)))
	289	{
	290	// First octet of 2 octet sequence
	291	$mUcs4 = ($in);
	292	$mUcs4 = ($mUcs4 & 0x1F) << 6;
	293	$mState = 1;
	294	$mBytes = 2;
	295	}
	296	else if (0xE0 == (0xF0 & ($in)))
	297	{
	298	// First octet of 3 octet sequence
	299	$mUcs4 = ($in);
	300	$mUcs4 = ($mUcs4 & 0x0F) << 12;
	301	$mState = 2;
	302	$mBytes = 3;
	303	}
	304	else if (0xF0 == (0xF8 & ($in)))
	305	{
	306	// First octet of 4 octet sequence
	307	$mUcs4 = ($in);
	308	$mUcs4 = ($mUcs4 & 0x07) << 18;
	309	$mState = 3;
	310	$mBytes = 4;
	311	}
	312	else if (0xF8 == (0xFC & ($in)))
	313	{
	314	/* First octet of 5 octet sequence.
	315	*
	316	* This is illegal because the encoded codepoint must be either
	317	* (a) not the shortest form or
	318	* (b) outside the Unicode range of 0-0x10FFFF.
	319	*/
	320	return UTF8_BAD_5OCTET;
	321	}
	322	else if (0xFC == (0xFE & ($in)))
	323	{
	324	// First octet of 6 octet sequence, see comments for 5 octet sequence.
	325	return UTF8_BAD_6OCTET;
	326	}
	327	else
	328	{
	329	// Current octet is neither in the US-ASCII range nor a legal first
	330	// octet of a multi-octet sequence.
	331	return UTF8_BAD_SEQID;
	332	}
	333	}
	334	else
	335	{
	336	// When mState is non-zero, we expect a continuation of the multi-octet sequence
	337	if (0x80 == (0xC0 & ($in)))
	338	{
	339	// Legal continuation.
	340	$shift = ($mState - 1) * 6;
	341	$tmp = $in;
	342	$tmp = ($tmp & 0x0000003F) << $shift;
	343	$mUcs4 \|= $tmp;
	344
	345	/**
	346	* End of the multi-octet sequence. mUcs4 now contains the final
	347	* Unicode codepoint to be output
	348	*/
	349	if (0 == --$mState)
	350	{
	351	// From Unicode 3.1, non-shortest form is illegal
	352	if (((2 == $mBytes) && ($mUcs4 < 0x0080)) \|\|
	353	((3 == $mBytes) && ($mUcs4 < 0x0800)) \|\|
	354	((4 == $mBytes) && ($mUcs4 < 0x10000)) )
	355	return UTF8_BAD_NONSHORT;
	356	else if (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
	357	return UTF8_BAD_SURROGATE;
	358	else if ($mUcs4 > 0x10FFFF) // Codepoints outside the Unicode range are illegal
	359	return UTF8_BAD_UNIOUTRANGE;
	360
	361	// Initialize UTF8 cache
	362	$mState = 0;
	363	$mUcs4 = 0;
	364	$mBytes = 1;
	365	}
	366
	367	}
	368	else
	369	{
	370	// ((0xC0 & (*in) != 0x80) && (mState != 0))
	371	// Incomplete multi-octet sequence.
	372	$i--;
	373	return UTF8_BAD_SEQINCOMPLETE;
	374	}
	375	}
	376	}
	377
	378	// Incomplete multi-octet sequence
	379	if ($mState != 0)
	380	{
	381	$i--;
	382	return UTF8_BAD_SEQINCOMPLETE;
	383	}
	384
	385	// No bad octets found
	386	$i = null;
	387	return false;
	388	}
	389
	390	/**
	391	* Takes a return code from utf8_bad_identify() are returns a message
	392	* (in English) explaining what the problem is.
	393	* @param int return code from utf8_bad_identify
	394	* @return mixed string message or FALSE if return code unknown
	395	* @see utf8_bad_identify
	396	* @package utf8
	397	* @subpackage bad
	398	*/
	399	function utf8_bad_explain($code)
	400	{
	401	switch ($code)
	402	{
	403	case UTF8_BAD_5OCTET:
	404	return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
	405	break;
	406
	407	case UTF8_BAD_6OCTET:
	408	return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
	409	break;
	410
	411	case UTF8_BAD_SEQID:
	412	return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
	413	break;
	414
	415	case UTF8_BAD_NONSHORT:
	416	return 'From Unicode 3.1, non-shortest form is illegal';
	417	break;
	418
	419	case UTF8_BAD_SURROGATE:
	420	return 'From Unicode 3.2, surrogate characters are illegal';
	421	break;
	422
	423	case UTF8_BAD_UNIOUTRANGE:
	424	return 'Codepoints outside the Unicode range are illegal';
	425	break;
	426
	427	case UTF8_BAD_SEQINCOMPLETE:
	428	return 'Incomplete multi-octet sequence';
	429	break;
	430	}
	431
	432	trigger_error('Unknown error code: '.$code, E_USER_WARNING);
	433
	434	return false;
	435	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: