1 | <?php |
---|
2 | |
---|
3 | /** |
---|
4 | * Locate a byte index given a UTF-8 character index |
---|
5 | * @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $ |
---|
6 | * @package utf8 |
---|
7 | * @subpackage position |
---|
8 | */ |
---|
9 | |
---|
/**
* Given a string and one or more character indexes in the string, in
* terms of UTF-8 character positions, returns the byte index of each
* of those characters. Can be useful when you want to use PHP's native
* string functions, but be warned: locating the byte can be expensive.
*
* Takes a variable number of parameters - the first must be the search
* string, followed by 1 to n UTF-8 character positions to obtain byte
* indexes for. It is more efficient to search the string for multiple
* characters at once than to make repeated calls to this function.
*
* Positions are converted to integers and processed in ascending order,
* so when several positions are given the returned array is ordered by
* ascending position, not by argument order. Positions at or below zero
* yield 0; positions past the end of the string yield strlen($str).
*
* The string is assumed to be well formed UTF-8.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string string to locate index in
* @param int (n times) UTF-8 character position
* @return mixed int if only one position was given, array of ints if
*               more (empty array if none), FALSE if the first argument
*               is not a string
* @package utf8
* @subpackage position
*/
function utf8_byte_position()
{
    $args = func_get_args();

    // Plain assignment: binding a reference to a function result raises
    // "Only variables should be assigned by reference"
    $str = array_shift($args);

    if (!is_string($str))
    {
        return false;
    }

    $limit  = strlen($str);
    $result = array();
    $prev   = array(0, 0); // Trivial byte index, character offset pair

    // Use a short piece of $str to estimate bytes per character.
    // $i (& $j) -> byte indexes into $str, $c -> character offset into $str.
    // Invariant: $c is always the real number of characters before byte $i.
    $i = utf8_locate_next_chr($str, 300);
    $c = utf8_byte_position_count_chars($str, 0, $i);

    // Work on integers only (a fractional offset can never be matched exactly)
    // and deal with the positions from lowest to highest
    $args = array_map('intval', $args);
    sort($args);

    foreach ($args as $offset)
    {
        // Zero and negative positions all map onto the start of the string
        if ($offset <= 0)
        {
            $result[] = 0;
            continue;
        }

        // Ensure no endless looping
        $safety_valve = 50;

        do
        {
            // Two reference points with the same character offset give no
            // slope to interpolate with (e.g. empty string); fall through to
            // the character-by-character walk below
            if ($c == $prev[1])
            {
                break;
            }

            // Linear interpolation from the last two known (byte, character) pairs
            $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / ($c - $prev[1]));
            $j = utf8_locate_next_chr($str, $j); // Correct to utf8 character boundary

            // No movement: either already on target or clamped at the end of
            // the string. Repeating the estimate would give the same answer,
            // and $prev is deliberately left untouched so that it never
            // collapses onto the current position.
            if ($j == $i)
            {
                break;
            }

            $prev = array($i, $c); // Save the index, offset for use next iteration

            if ($j > $i)
            {
                $c += utf8_byte_position_count_chars($str, $i, $j - $i);
            }
            else
            {
                $c -= utf8_byte_position_count_chars($str, $j, $i - $j);
            }

            $error = abs($c - $offset);
            $i = $j; // Ready for next time around
        }
        while (($error > 7) && --$safety_valve); // From 7 it is faster to iterate over the string

        // Close any remaining gap one character at a time. $c only changes
        // when $i really moves, so it stays truthful when the walk is stopped
        // by either end of the string.
        while (($c < $offset) && ($i < $limit))
        {
            $i = utf8_locate_next_chr($str, $i + 1);
            $c++;
        }

        while (($c > $offset) && ($i > 0))
        {
            $i = utf8_locate_current_chr($str, $i - 1);
            $c--;
        }

        $result[] = $i;
    }

    if (count($result) == 1)
    {
        return $result[0];
    }

    return $result;
}

/**
* Counts the UTF-8 characters held in a byte range of a string.
* Every character of well formed UTF-8 contains exactly one byte that is
* not a 10xxxxxx continuation byte, so removing the continuation bytes
* leaves one byte per character. Used instead of strlen(utf8_decode()),
* as utf8_decode() is deprecated in recent PHP versions.
* @param string
* @param int byte index where the range starts (on a character boundary)
* @param int length of the range in bytes
* @return int number of characters in the range
* @access private
* @package utf8
* @subpackage position
*/
function utf8_byte_position_count_chars($str, $start, $length)
{
    $piece = (string)substr($str, $start, $length);

    return strlen((string)preg_replace('/[\x80-\xBF]+/', '', $piece));
}
---|
110 | |
---|
/**
* Finds the byte index at which the UTF-8 character covering a given
* byte index starts. When the supplied index already sits on the first
* byte of a character it is returned unchanged; otherwise the search
* steps backwards over continuation bytes until the first byte of the
* current character (or the start of the string) is reached.
* An index at or below zero yields 0, an index at or beyond the end of
* the string yields the length of the string in bytes.
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_current_chr( &$str, $idx )
{
    if ($idx <= 0)
    {
        return 0;
    }

    $end = strlen($str);

    if ($idx >= $end)
    {
        return $end;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, so masking with 0xC0 and comparing against 0x80 identifies
    // a continuation byte - assuming well formed UTF-8
    $pos = $idx;

    while ($pos)
    {
        if ((ord($str[$pos]) & 0xC0) != 0x80)
        {
            break; // Reached the first byte of the character
        }

        $pos--;
    }

    return $pos;
}
---|
142 | |
---|
/**
* Finds the byte index at which the next UTF-8 character starts, looking
* forwards from a given byte index. When the supplied index already sits
* on the first byte of a character it is returned unchanged; otherwise
* the search steps forwards over continuation bytes until the first byte
* of a character (or the end of the string) is reached.
* An index at or below zero yields 0, an index at or beyond the end of
* the string yields the length of the string in bytes.
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_next_chr(&$str, $idx)
{
    if ($idx <= 0)
    {
        return 0;
    }

    $end = strlen($str);

    if ($idx >= $end)
    {
        return $end;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks like
    // 10xxxxxx, so masking with 0xC0 and comparing against 0x80 identifies
    // a continuation byte - assuming well formed UTF-8
    $pos = $idx;

    while ($pos < $end)
    {
        if ((ord($str[$pos]) & 0xC0) != 0x80)
        {
            break; // Reached the first byte of a character
        }

        $pos++;
    }

    return $pos;
}
---|