[3] | 1 | <?php |
---|
| 2 | |
---|
| 3 | /** |
---|
| 4 | * Tools to help with ASCII in UTF-8 |
---|
| 5 | * @version $Id: ascii.php,v 1.5 2006/10/16 20:38:12 harryf Exp $ |
---|
| 6 | * @package utf8 |
---|
| 7 | * @subpackage ascii |
---|
| 8 | */ |
---|
| 9 | |
---|
/**
 * Reports whether every byte of a string lies in the 7bit ASCII range.
 * Handy as a cheap guard before choosing between a native PHP string
 * function and its slower UTF-8 aware counterpart, e.g.;
 *
 * <code>
 * if ( utf8_is_ascii($someString) ) {
 *     // Plain ASCII - the native PHP version is safe to use
 *     $someString = strtolower($someString);
 * } else {
 *     $someString = utf8_strtolower($someString);
 * }
 * </code>
 *
 * @param string $str string to inspect
 * @return boolean TRUE unless a byte outside \x00-\x7F was found
 * @package utf8
 * @subpackage ascii
 * @see utf8_is_ascii_ctrl
 */
function utf8_is_ascii($str)
{
    // One byte above 0x7F is enough to disqualify the whole string.
    $foundHighByte = (preg_match('/(?:[^\x00-\x7F])/', $str) === 1);

    return !$foundHighByte;
}
---|
| 36 | |
---|
/**
 * Reports whether a string is made up solely of 7bit ASCII bytes that are
 * not device control codes. Accepted bytes are tab (\x09), line feed (\x0A),
 * carriage return (\x0D) and the range \x20-\x7E. For a list of the device
 * control codes see the second table here:
 * http://www.w3schools.com/tags/ref_ascii.asp
 *
 * Note that an empty string yields FALSE.
 *
 * @param string $str string to inspect
 * @return boolean TRUE if non-empty and free of non-ASCII / control bytes
 * @package utf8
 * @subpackage ascii
 * @see utf8_is_ascii
 */
function utf8_is_ascii_ctrl($str)
{
    // Nothing to inspect: an empty string is reported as FALSE
    if (strlen($str) < 1)
    {
        return false;
    }

    // Look for the first byte that is not in the accepted set
    $foundRejectedByte = (preg_match('/[^\x09\x0A\x0D\x20-\x7E]/', $str) === 1);

    return !$foundRejectedByte;
}
---|
| 56 | |
---|
/**
 * Strip out all non-7bit ASCII bytes
 * If you need to transmit a string to a system which you know can only
 * support 7bit ASCII, you could use this function.
 *
 * Every byte outside \x00-\x7F is removed; all other bytes are kept in
 * their original order.
 *
 * @param string $str string to clean
 * @return string with non ASCII bytes removed
 * @package utf8
 * @subpackage ascii
 * @see utf8_strip_non_ascii_ctrl
 */
function utf8_strip_non_ascii($str)
{
    // A single byte-wise replace pass: no output buffering and no repeated
    // copying of the remaining string.
    $stripped = preg_replace('/[^\x00-\x7F]+/', '', $str);

    // preg_replace() yields NULL on a PCRE failure; keep the string contract.
    return is_string($stripped) ? $stripped : '';
}
---|
| 84 | |
---|
/**
 * Strip out device control codes in the ASCII range
 * which are not permitted in XML. Note that this leaves
 * multi-byte characters untouched - it only removes device
 * control codes
 *
 * The bytes removed are \x00-\x08, \x0B, \x0C, \x0E-\x1F and \x7F; tab,
 * line feed and carriage return are kept.
 *
 * @see http://hsivonen.iki.fi/producing-xml/#controlchar
 * @param string $str string to clean
 * @return string control codes removed
 * @package utf8
 * @subpackage ascii
 */
function utf8_strip_ascii_ctrl($str)
{
    // A single byte-wise replace pass: no output buffering and no repeated
    // copying of the remaining string.
    $stripped = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/', '', $str);

    // preg_replace() yields NULL on a PCRE failure; keep the string contract.
    return is_string($stripped) ? $stripped : '';
}
---|
| 111 | |
---|
/**
 * Strip out all non 7bit ASCII bytes and ASCII device control codes.
 * For a list of ASCII device control codes see the 2nd table here:
 * http://www.w3schools.com/tags/ref_ascii.asp
 *
 * Only tab (\x09), line feed (\x0A), carriage return (\x0D) and the bytes
 * \x20-\x7E are kept; every other byte is removed.
 *
 * @param string $str string to clean
 * @return string with non ASCII bytes and control codes removed
 * @package utf8
 * @subpackage ascii
 * @see utf8_strip_non_ascii
 */
function utf8_strip_non_ascii_ctrl($str)
{
    // A single byte-wise replace pass: no output buffering and no repeated
    // copying of the remaining string.
    $stripped = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/', '', $str);

    // preg_replace() yields NULL on a PCRE failure; keep the string contract.
    return is_string($stripped) ? $stripped : '';
}
---|
| 139 | |
---|
/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents".
 * The purpose of this function is to replace characters commonly found in Latin
 * alphabets with something more or less equivalent from the ASCII range. This can
 * be useful for converting a UTF-8 string to something ready for a filename, for
 * example. Following the use of this function, you would probably also pass the
 * string through utf8_strip_non_ascii to clean out any other non-ASCII chars.
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters. Default is to deaccent both cases ($case = 0)
 *
 * For a more complete implementation of transliteration, see the utf8_to_ascii package
 * available from the phputf8 project downloads:
 * http://prdownloads.sourceforge.net/phputf8
 *
 * This file must be saved as UTF-8: the keys of the lookup tables below are
 * literal UTF-8 characters.
 *
 * @param string $str UTF-8 string
 * @param int $case (optional) -1 lowercase only, +1 uppercase only, 0 both cases
 * @return string accented chars replaced with ascii equivalents
 * @author Andreas Gohr <andi@splitbrain.org>
 * @package utf8
 * @subpackage ascii
 */
function utf8_accents_to_ascii($str, $case=0)
{
    // Lookup tables are built on first use and then reused by later calls.
    static $UTF8_LOWER_ACCENTS = null;
    static $UTF8_UPPER_ACCENTS = null;

    // Any $case <= 0 deaccents lowercase letters (0 means both cases).
    if ($case <= 0)
    {
        if (is_null($UTF8_LOWER_ACCENTS))
        {
            $UTF8_LOWER_ACCENTS = array(
                'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
                'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
                'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
                'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
                'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
                'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
                'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
                'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
                'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
                'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
                'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
                'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
                'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
                'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
                'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
            );
        }

        $str = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $str);
    }

    // Any $case >= 0 deaccents uppercase letters (0 means both cases).
    if ($case >= 0)
    {
        if (is_null($UTF8_UPPER_ACCENTS))
        {
            $UTF8_UPPER_ACCENTS = array(
                'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
                'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
                'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
                'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
                'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
                'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
                'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
                'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
                'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
                'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
                'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
                'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
                'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
                'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
                'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
            );
        }

        $str = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $str);
    }

    return $str;
}
---|