[6] | 1 | <?php |
---|
| 2 | |
---|
| 3 | /** |
---|
| 4 | * Locate a byte index given a UTF-8 character index |
---|
| 5 | * @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $ |
---|
| 6 | * @package utf8 |
---|
| 7 | * @subpackage position |
---|
| 8 | */ |
---|
| 9 | |
---|
/**
 * Given a string and one or more character indexes, expressed in terms of
 * UTF-8 character positions, returns the byte index of each of those
 * characters. Can be useful when you want to use PHP's native (byte based)
 * string functions, but be warned: locating the byte can be expensive.
 *
 * Takes a variable number of parameters - the first must be the string to
 * search, followed by 1 to n UTF-8 character positions to obtain byte
 * indexes for. It is more efficient to look up multiple positions in one
 * call than to make repeated calls to this function.
 *
 * The positions are processed from lowest to highest, so when several are
 * given the returned array is ordered by ascending position, not by the
 * order in which the positions were passed in.
 *
 * Positions are cast to int. Positions less than or equal to zero map to
 * byte index 0; positions beyond the end of the string map to strlen($str).
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string string to locate index in
 * @param int (n times) UTF-8 character position
 * @return mixed int if exactly one position was given, otherwise an array
 *               of ints (empty when no position was given); FALSE if the
 *               first argument is not a string
 * @package utf8
 * @subpackage position
 */
function utf8_byte_position()
{
    $args = func_get_args();

    // Plain assignment: array_shift() returns a value, not a variable, so
    // binding it with =& raises "Only variables should be assigned by reference"
    $str = array_shift($args);

    if (!is_string($str))
        return false;

    $result = array();
    $prev = array(0, 0); // Trivial byte index, character offset pair

    // Use a short piece of str to estimate bytes per character.
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str (utf8_decode() yields one byte per character).
    // NOTE: utf8_decode() is deprecated as of PHP 8.2
    $c = strlen(utf8_decode(substr($str, 0, $i)));

    // Sanity check: force integer offsets. A fractional offset would leave a
    // fractional $error, and the while ($error--) walks below never reach zero
    $offsets = array();
    foreach ($args as $arg)
        $offsets[] = (int)$arg;

    // Deal with arguments from lowest to highest
    sort($offsets);

    foreach ($offsets as $offset)
    {
        // Zero is trivial; negative positions are clamped to the start of the string
        if ($offset <= 0)
        {
            $result[] = 0;
            continue;
        }

        // Already sitting on the requested character (e.g. the same position
        // asked for twice). Answer directly and leave $prev untouched: running
        // the estimator here would make $prev equal to the current point, and
        // the next position would then be mistaken for "past end of string"
        if ($offset == $c)
        {
            $result[] = $i;
            continue;
        }

        // Ensure no endless looping
        $safety_valve = 50;

        do
        {
            if (($c - $prev[1]) == 0)
            {
                // Hack: gone past end of string (the last estimate did not
                // advance the character offset, so there is no slope to use)
                $error = 0;
                $i = strlen($str);
                break;
            }

            // Linear estimate of the target byte index, using the bytes per
            // character observed between the previous and the current point
            $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / ($c - $prev[1]));
            $j = utf8_locate_next_chr($str, $j); // Correct to utf8 character boundary
            $prev = array($i, $c); // Save the index, offset for use next iteration

            if ($j > $i)
                $c += strlen(utf8_decode(substr($str, $i, $j - $i))); // Determine new character offset
            else
                $c -= strlen(utf8_decode(substr($str, $j, $i - $j))); // Ditto

            $error = abs($c - $offset);
            $i = $j; // Ready for next time around
        }
        while (($error > 7) && --$safety_valve); // From 7 it is faster to iterate over the string

        if ($error && $error <= 7)
        {
            if ($c < $offset)
            {
                // Move up, one character at a time
                while ($error--)
                    $i = utf8_locate_next_chr($str, ++$i);
            }
            else
            {
                // Move down, one character at a time
                while ($error--)
                    $i = utf8_locate_current_chr($str, --$i);
            }

            // Ready for next arg
            $c = $offset;
        }

        $result[] = $i;
    }

    if (count($result) == 1)
        return $result[0];

    return $result;
}
---|
| 110 | |
---|
/**
 * Given a string and any byte index, returns the byte index of the start
 * of the UTF-8 character that the supplied index falls within. If a
 * character begins exactly at the supplied byte index, that index is
 * returned unchanged. Otherwise the function steps backwards, looking for
 * the byte where the current UTF-8 character begins.
 *
 * An index at or below zero yields 0; an index at or beyond the end of the
 * string yields strlen($str).
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string UTF-8 string (assumed to be well formed)
 * @param int byte index in the string
 * @return int byte index of start of current UTF-8 character
 * @package utf8
 * @subpackage position
 */
function utf8_locate_current_chr( &$str, $idx )
{
    if ($idx <= 0)
    {
        return 0;
    }

    $length = strlen($str);
    if ($idx >= $length)
    {
        return $length;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, i.e. its top two bits are binary 10 (decimal 2). Walk back
    // over such bytes until a lead byte or the start of the string is hit.
    $pos = $idx;
    while ($pos != 0)
    {
        $top_bits = ord($str[$pos]) >> 6;
        if ($top_bits != 2)
        {
            break;
        }
        $pos--;
    }

    return $pos;
}
---|
| 142 | |
---|
/**
 * Given a string and any byte index, returns the byte index of the start
 * of the next UTF-8 character, relative to the supplied position. If a
 * character begins exactly at the supplied byte index, that index is
 * returned unchanged; otherwise the function steps forwards to the first
 * byte that is not a continuation byte.
 *
 * An index at or below zero yields 0; an index at or beyond the end of the
 * string yields strlen($str), as does stepping off the end of the string.
 *
 * @author Chris Smith<chris@jalakai.co.uk>
 * @param string UTF-8 string (assumed to be well formed)
 * @param int byte index in the string
 * @return int byte index of start of next UTF-8 character
 * @package utf8
 * @subpackage position
 */
function utf8_locate_next_chr(&$str, $idx)
{
    if ($idx <= 0)
    {
        return 0;
    }

    $length = strlen($str);
    if ($idx >= $length)
    {
        return $length;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, i.e. its top two bits are binary 10 (decimal 2). Skip forward
    // over such bytes until a lead byte or the end of the string is reached.
    for ($pos = $idx; $pos < $length; $pos++)
    {
        if ((ord($str[$pos]) >> 6) != 2)
        {
            break;
        }
    }

    return $pos;
}
---|