Context Navigation

source: branches/rsr.v5.1.dev/web/punbb/include/utf8/utils/position.php @ 6

Last change on this file since 6 was 3, checked in by dj3c1t, 13 years ago
passage a Fluxbb 1.4.7
File size: 4.5 KB

Line
1	<?php
2
3	/**
4	* Locate a byte index given a UTF-8 character index
5	* @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $
6	* @package utf8
7	* @subpackage position
8	*/
9
/**
* Translates one or more UTF-8 character offsets into byte offsets.
*
* PHP's native string functions work on bytes, so this is useful when a
* character position has to be handed to one of them. Be warned: locating
* the byte can be expensive. It is cheaper to pass several character
* offsets in a single call than to call this function repeatedly, because
* the scan position is carried over from one offset to the next.
*
* Takes a variable number of parameters: the first must be the string to
* search, followed by 1 to n UTF-8 character offsets.
*
* The offsets are sorted ascending before they are processed, so when more
* than one offset is given the returned byte indexes are in ascending
* offset order, NOT in the order the arguments were passed.
*
* How it works: the first ~300 bytes are measured to get an initial
* (byte index, character offset) sample. For every requested offset the
* byte position is then estimated by linear interpolation between the two
* most recent samples, corrected to a character boundary, and re-measured.
* Once the estimate is within 7 characters of the target the remaining
* distance is walked one character at a time.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string string to locate index in
* @param int (n times) UTF-8 character offsets
* @return mixed int if exactly one offset was given, array of ints
*         otherwise (an empty array if no offsets were given); FALSE if
*         the first argument is not a string
* @package utf8
* @subpackage position
*/
function utf8_byte_position()
{
    $args = func_get_args();

    // array_shift() returns a value, not a variable, so it must not be bound
    // with =& ("Only variables should be assigned by reference").
    $str = array_shift($args);

    if (!is_string($str)) {
        return false;
    }

    $result = array();
    $prev = array(0, 0); // Previous sample: (byte index, character offset)

    // Use a short piece of the string to estimate bytes per character.
    // $i (and $j below) are byte indexes into $str, $c is the character
    // offset that corresponds to byte index $i.
    // NOTE(review): utf8_decode() is deprecated as of PHP 8.2; it is used
    // here only as a way to count characters in a byte range.
    $i = utf8_locate_next_chr($str, 300);
    $c = strlen(utf8_decode(substr($str, 0, $i)));

    // Deal with arguments from lowest to highest so the scan only has to
    // move forward between successive offsets.
    sort($args);

    foreach ($args as $offset) {
        // Sanity checks FIXME

        // Offset 0 is always byte 0; no searching needed.
        if ($offset == 0) {
            $result[] = 0;
            continue;
        }

        // Upper bound on interpolation rounds, to ensure no endless looping.
        $safety_valve = 50;

        do {
            if (($c - $prev[1]) == 0) {
                // Hack: the two samples cover zero characters, which would
                // divide by zero below. Treated as "gone past end of
                // string": answer with the byte length of the string.
                $error = 0;
                $i = strlen($str);
                break;
            }

            // Linear interpolation/extrapolation from the last two samples.
            $j = $i + (int)(($offset - $c) * ($i - $prev[0]) / ($c - $prev[1]));
            $j = utf8_locate_next_chr($str, $j); // Correct to UTF-8 character boundary
            $prev = array($i, $c); // Save the index, offset for use next iteration

            // Re-measure: count the characters between the old and the new
            // byte index and adjust the character offset accordingly.
            if ($j > $i) {
                $c += strlen(utf8_decode(substr($str, $i, $j - $i)));
            } else {
                $c -= strlen(utf8_decode(substr($str, $j, $i - $j)));
            }

            $error = abs($c - $offset);
            $i = $j; // Ready for next time around
        } while (($error > 7) && --$safety_valve); // From 7 it is faster to iterate over the string

        if ($error && $error <= 7) {
            if ($c < $offset) {
                // Move up one character per remaining unit of error
                while ($error--) {
                    $i = utf8_locate_next_chr($str, ++$i);
                }
            } else {
                // Move down one character per remaining unit of error
                while ($error--) {
                    $i = utf8_locate_current_chr($str, --$i);
                }
            }

            // Ready for next arg
            $c = $offset;
        }

        $result[] = $i;
    }

    // A single requested offset yields a bare int rather than a 1-element array.
    if (count($result) == 1) {
        return $result[0];
    }

    return $result;
}
110
/**
* Finds the byte index at which the UTF-8 character covering a given byte
* index starts.
*
* If the supplied byte index already sits on the first byte of a character
* it is returned as is. Otherwise the function steps backwards over
* continuation bytes until it reaches the byte where the current character
* begins. Indexes at or below zero give 0; indexes at or beyond the end of
* the string give the byte length of the string.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_current_chr( &$str, $idx )
{
    if ($idx <= 0) {
        return 0;
    }

    $byteCount = strlen($str);
    if ($idx >= $byteCount) {
        return $byteCount;
    }

    // Every byte after the first one of a multi-byte UTF-8 character looks
    // like 10xxxxxx, i.e. its two top bits are binary 10 (= 2). Walk back
    // while we are standing on such a byte - assuming well formed UTF-8.
    $pos = $idx;
    while ($pos) {
        if ((ord($str[$pos]) >> 6) !== 2) {
            break;
        }
        $pos--;
    }

    return $pos;
}
142
/**
* Finds the byte index at which the next UTF-8 character starts, looking
* forward from a given byte index.
*
* If a character begins exactly at the supplied byte index, that index is
* returned unchanged. Otherwise the function skips forward over
* continuation bytes until it reaches the first byte of the following
* character, or the end of the string. Indexes at or below zero give 0;
* indexes at or beyond the end of the string give the byte length of the
* string.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_next_chr(&$str, $idx)
{
    if ($idx <= 0) {
        return 0;
    }

    $byteCount = strlen($str);
    if ($idx >= $byteCount) {
        return $byteCount;
    }

    // Every byte after the first one of a multi-byte UTF-8 character looks
    // like 10xxxxxx, i.e. its two top bits are binary 10 (= 2). Skip ahead
    // while we are standing on such a byte - assuming well formed UTF-8.
    $pos = $idx;
    while ($pos < $byteCount) {
        if ((ord($str[$pos]) >> 6) !== 2) {
            break;
        }
        $pos++;
    }

    return $pos;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: