[6] | 1 | <?php |
---|
| 2 | |
---|
| 3 | /** |
---|
| 4 | * @version $Id: bad.php,v 1.2 2006/02/26 13:20:44 harryf Exp $ |
---|
| 5 | * Tools for locating / replacing bad bytes in UTF-8 strings |
---|
| 6 | * The Original Code is Mozilla Communicator client code. |
---|
| 7 | * The Initial Developer of the Original Code is |
---|
| 8 | * Netscape Communications Corporation. |
---|
| 9 | * Portions created by the Initial Developer are Copyright (C) 1998 |
---|
| 10 | * the Initial Developer. All Rights Reserved. |
---|
| 11 | * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi) |
---|
| 12 | * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com) |
---|
| 13 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp |
---|
| 14 | * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp |
---|
| 15 | * @see http://hsivonen.iki.fi/php-utf8/ |
---|
| 16 | * @package utf8 |
---|
| 17 | * @subpackage bad |
---|
| 18 | * @see utf8_is_valid |
---|
| 19 | */ |
---|
| 20 | |
---|
/**
 * Locates the first bad byte in a UTF-8 string, returning its
 * byte index in the string.
 *
 * The string is consumed one match at a time with a PCRE pattern whose
 * alternatives describe every well-formed UTF-8 sequence; the final
 * alternative (capture group 2) catches any single byte that does not start
 * a well-formed sequence.
 * Pattern comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string string to check
 * @return mixed integer byte index or FALSE if no bad found
 * @package utf8
 * @subpackage bad
 */
function utf8_bad_find($str)
{
    $UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // Non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // Excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // Straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // Excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // Planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // Planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // Plane 16
    '|(.{1}))';                              // Invalid byte

    $pattern = '/'.$UTF8_BAD.'/S';
    $pos = 0;

    // Scan in place via the offset argument instead of re-copying the tail
    // of the string with substr() on every iteration. Every byte is matched
    // by some alternative (newlines by the ASCII class, anything else at
    // worst by the trailing "."), so each match starts exactly at $pos.
    while (preg_match($pattern, $str, $matches, 0, $pos))
    {
        // Group 2 only participates when the "invalid byte" branch matched
        if (isset($matches[2]))
            return $pos;

        $pos += strlen($matches[0]);
    }

    return false;
}
---|
| 61 | |
---|
/**
 * Locates all bad bytes in a UTF-8 string and returns a list of their
 * byte indexes in the string.
 *
 * The string is consumed one match at a time with a PCRE pattern whose
 * alternatives describe every well-formed UTF-8 sequence; the final
 * alternative (capture group 2) catches any single byte that does not start
 * a well-formed sequence.
 * Pattern comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string string to check
 * @return mixed array of integers or FALSE if no bad found
 * @package utf8
 * @subpackage bad
 */
function utf8_bad_findall($str)
{
    $UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // Non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // Excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // Straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // Excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // Planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // Planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // Plane 16
    '|(.{1}))';                              // Invalid byte

    $pattern = '/'.$UTF8_BAD.'/S';
    $pos = 0;
    $badList = array();

    // Scan in place via the offset argument instead of re-copying the tail
    // of the string with substr() on every iteration. Every byte is matched
    // by some alternative, so each match starts exactly at $pos.
    while (preg_match($pattern, $str, $matches, 0, $pos))
    {
        // Group 2 only participates when the "invalid byte" branch matched
        if (isset($matches[2]))
            $badList[] = $pos;

        $pos += strlen($matches[0]);
    }

    if (count($badList) > 0)
        return $badList;

    return false;
}
---|
| 105 | |
---|
/**
 * Strips out any bad bytes from a UTF-8 string and returns the rest.
 *
 * The string is consumed one match at a time with a PCRE pattern whose
 * alternatives describe every well-formed UTF-8 sequence; the final
 * alternative (capture group 2) catches any single byte that does not start
 * a well-formed sequence. Well-formed sequences are copied to the result,
 * bad bytes are dropped.
 * Pattern comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string string to clean
 * @return string the input with all bad bytes removed
 * @package utf8
 * @subpackage bad
 */
function utf8_bad_strip($str)
{
    $UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // Non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // Excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // Straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // Excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // Planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // Planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // Plane 16
    '|(.{1}))';                              // Invalid byte

    $pattern = '/'.$UTF8_BAD.'/S';
    $pos = 0;

    // Accumulate into a plain string rather than routing the data through
    // the output buffer: no dependence on global output state, and safe to
    // call from inside an output-buffer callback.
    $result = '';

    // Scan in place via the offset argument instead of re-copying the tail
    // of the string with substr() on every iteration. Every byte is matched
    // by some alternative, so each match starts exactly at $pos.
    while (preg_match($pattern, $str, $matches, 0, $pos))
    {
        // Group 2 only participates when the "invalid byte" branch matched
        if (!isset($matches[2]))
            $result .= $matches[0];

        $pos += strlen($matches[0]);
    }

    return $result;
}
---|
| 145 | |
---|
/**
 * Replaces each bad byte in a UTF-8 string with an alternative string.
 * An ASCII character is recommended as the replacement.
 *
 * The string is consumed one match at a time with a PCRE pattern whose
 * alternatives describe every well-formed UTF-8 sequence; the final
 * alternative (capture group 2) catches any single byte that does not start
 * a well-formed sequence. Well-formed sequences are copied to the result,
 * each bad byte is substituted by $replace.
 * Pattern comes from W3 FAQ: Multilingual Forms
 * Note: modified to include full ASCII range including control chars
 * @see http://www.w3.org/International/questions/qa-forms-utf-8
 * @param string to search
 * @param string to replace bad bytes with (defaults to '?') - use ASCII
 * @return string the input with every bad byte replaced
 * @package utf8
 * @subpackage bad
 */
function utf8_bad_replace($str, $replace='?')
{
    $UTF8_BAD =
    '([\x00-\x7F]'.                          // ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               // Non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           // Excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    // Straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           // Excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        // Planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            // Planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        // Plane 16
    '|(.{1}))';                              // Invalid byte

    $pattern = '/'.$UTF8_BAD.'/S';
    $pos = 0;

    // Accumulate into a plain string rather than routing the data through
    // the output buffer: no dependence on global output state, and safe to
    // call from inside an output-buffer callback.
    $result = '';

    // Scan in place via the offset argument instead of re-copying the tail
    // of the string with substr() on every iteration. Every byte is matched
    // by some alternative, so each match starts exactly at $pos.
    while (preg_match($pattern, $str, $matches, 0, $pos))
    {
        // Group 2 only participates when the "invalid byte" branch matched
        if (!isset($matches[2]))
            $result .= $matches[0];
        else
            $result .= $replace;

        $pos += strlen($matches[0]);
    }

    return $result;
}
---|
| 189 | |
---|
/**
 * Return code from utf8_bad_identify(): the lead octet of a five octet
 * sequence was found.
 * Note: such a sequence can only encode a codepoint that is either not in
 * its shortest form or outside the Unicode range (0 - 0x10FFFF), so it never
 * represents a useful character.
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_5OCTET', 1);

/**
 * Return code from utf8_bad_identify(): the lead octet of a six octet
 * sequence was found.
 * Note: as with five octet sequences, such a sequence can only encode a
 * codepoint that is either not in its shortest form or outside the Unicode
 * range, so it never represents a useful character.
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_6OCTET', 2);

/**
 * Return code from utf8_bad_identify(): an octet was found that is neither
 * US-ASCII nor a legal first octet of a multi-octet UTF-8 sequence.
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_SEQID', 3);

/**
 * Return code from utf8_bad_identify(): a sequence encodes its codepoint in
 * more octets than necessary.
 * From Unicode 3.1, non-shortest form is illegal.
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_NONSHORT', 4);

/**
 * Return code from utf8_bad_identify(): a sequence encodes a surrogate
 * codepoint.
 * From Unicode 3.2, surrogate characters are illegal.
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_SURROGATE', 5);

/**
 * Return code from utf8_bad_identify(): a sequence encodes a codepoint above
 * 0x10FFFF.
 * Codepoints outside the Unicode range are illegal.
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_UNIOUTRANGE', 6);

/**
 * Return code from utf8_bad_identify(): a multi-octet sequence was cut
 * short, either by an octet that is not a continuation octet or by the end
 * of the string.
 * Note: this is kind of a "catch-all"
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
define('UTF8_BAD_SEQINCOMPLETE', 7);
---|
| 255 | |
---|
/**
 * Reports on the type of bad byte found in a UTF-8 string. Returns a
 * status code on the first bad byte found.
 *
 * The string is walked octet by octet with a small state machine that tracks
 * how many continuation octets are still expected and accumulates the
 * codepoint being decoded.
 *
 * On return $i holds a byte index related to the problem:
 *  - UTF8_BAD_5OCTET, UTF8_BAD_6OCTET, UTF8_BAD_SEQID: index of the
 *    offending octet;
 *  - UTF8_BAD_NONSHORT, UTF8_BAD_SURROGATE, UTF8_BAD_UNIOUTRANGE: index of
 *    the last octet of the offending sequence;
 *  - UTF8_BAD_SEQINCOMPLETE: index of the octet just before the point where
 *    the sequence broke off (the last octet of the string when the string
 *    ends mid-sequence);
 *  - NULL when the string is valid.
 *
 * @author <hsivonen@iki.fi>
 * @param string UTF-8 encoded string
 * @param int (by reference) receives the byte index described above; any
 *        value passed in is overwritten
 * @return mixed integer constant describing problem or FALSE if valid UTF-8
 * @see utf8_bad_explain
 * @see http://hsivonen.iki.fi/php-utf8/
 * @package utf8
 * @subpackage bad
 */
function utf8_bad_identify($str, &$i)
{
    $mState = 0;     // Cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // Cached Unicode character
    $mBytes = 1;     // Cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++)
    {
        // Square-bracket offset: the curly-brace form ($str{$i}) is a parse
        // error from PHP 8.0 onwards.
        $in = ord($str[$i]);

        if ($mState == 0)
        {
            // When mState is zero we expect either a US-ASCII character or a multi-octet sequence.
            if (0 == (0x80 & ($in)))
            {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            }
            else if (0xC0 == (0xE0 & ($in)))
            {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            }
            else if (0xE0 == (0xF0 & ($in)))
            {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            }
            else if (0xF0 == (0xF8 & ($in)))
            {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            }
            else if (0xF8 == (0xFC & ($in)))
            {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 */
                return UTF8_BAD_5OCTET;
            }
            else if (0xFC == (0xFE & ($in)))
            {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;
            }
            else
            {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;
            }
        }
        else
        {
            // When mState is non-zero, we expect a continuation of the multi-octet sequence
            if (0x80 == (0xC0 & ($in)))
            {
                // Legal continuation: merge its 6 payload bits into the codepoint
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                /**
                 * End of the multi-octet sequence. mUcs4 now contains the final
                 * Unicode codepoint to be output
                 */
                if (0 == --$mState)
                {
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)))
                        return UTF8_BAD_NONSHORT;
                    else if (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
                        return UTF8_BAD_SURROGATE;
                    else if ($mUcs4 > 0x10FFFF) // Codepoints outside the Unicode range are illegal
                        return UTF8_BAD_UNIOUTRANGE;

                    // Initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }
            }
            else
            {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence: step back to the octet
                // before the one that broke the sequence.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    // Incomplete multi-octet sequence: the string ended while continuation
    // octets were still expected. $i equals $len here, so step back to the
    // last octet of the string.
    if ($mState != 0)
    {
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = null;
    return false;
}
---|
| 389 | |
---|
/**
 * Takes a return code from utf8_bad_identify() and returns a message
 * (in English) explaining what the problem is.
 *
 * An unrecognised code raises an E_USER_WARNING via trigger_error() and
 * yields FALSE.
 * @param int return code from utf8_bad_identify
 * @return mixed string message or FALSE if return code unknown
 * @see utf8_bad_identify
 * @package utf8
 * @subpackage bad
 */
function utf8_bad_explain($code)
{
    switch ($code)
    {
        case UTF8_BAD_5OCTET:
            $message = 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
            break;

        case UTF8_BAD_6OCTET:
            $message = 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
            break;

        case UTF8_BAD_SEQID:
            $message = 'Invalid octet for use as start of multi-byte UTF-8 sequence';
            break;

        case UTF8_BAD_NONSHORT:
            $message = 'From Unicode 3.1, non-shortest form is illegal';
            break;

        case UTF8_BAD_SURROGATE:
            $message = 'From Unicode 3.2, surrogate characters are illegal';
            break;

        case UTF8_BAD_UNIOUTRANGE:
            $message = 'Codepoints outside the Unicode range are illegal';
            break;

        case UTF8_BAD_SEQINCOMPLETE:
            $message = 'Incomplete multi-octet sequence';
            break;

        default:
            // Not one of the utf8_bad_identify() codes: warn the caller
            trigger_error('Unknown error code: '.$code, E_USER_WARNING);
            $message = false;
    }

    return $message;
}
---|