Context Navigation

source: branches/rsr.v5.1.dev/web/punbb/include/utf8/utils/position.php @ 3

Last change on this file since 3 was 3, checked in by dj3c1t, 13 years ago
passage a Fluxbb 1.4.7
File size: 4.5 KB

Rev	Line
[3]	1	<?php
	2
	3	/**
	4	* Locate a byte index given a UTF-8 character index
	5	* @version $Id: position.php,v 1.1 2006/10/01 00:01:31 harryf Exp $
	6	* @package utf8
	7	* @subpackage position
	8	*/
	9
	10	/**
	11	* Given a string and a character index in the string, in
	12	* terms of the UTF-8 character position, returns the byte
	13	* index of that character. Can be useful when you want to
	14	* PHP's native string functions but we warned, locating
	15	* the byte can be expensive
	16	* Takes variable number of parameters - first must be
	17	* the search string then 1 to n UTF-8 character positions
	18	* to obtain byte indexes for - it is more efficient to search
	19	* the string for multiple characters at once, than make
	20	* repeated calls to this function
	21	*
	22	* @author Chris Smith<chris@jalakai.co.uk>
	23	* @param string string to locate index in
	24	* @param int (n times)
	25	* @return mixed - int if only one input int, array if more
	26	* @return boolean TRUE if it's all ASCII
	27	* @package utf8
	28	* @subpackage position
	29	*/
	30	function utf8_byte_position()
	31	{
	32	$args = func_get_args();
	33	$str =& array_shift($args);
	34
	35	if (!is_string($str))
	36	return false;
	37
	38	$result = array();
	39	$prev = array(0, 0); // Trivial byte index, character offset pair
	40	$i = utf8_locate_next_chr($str, 300); // Use a short piece of str to estimate bytes per character. $i (& $j) -> byte indexes into $str
	41	$c = strlen(utf8_decode(substr($str, 0, $i))); // $c -> character offset into $str
	42
	43	// Deal with arguments from lowest to highest
	44	sort($args);
	45
	46	foreach ($args as $offset)
	47	{
	48	// Sanity checks FIXME
	49
	50	// 0 is an easy check
	51	if ($offset == 0)
	52	{
	53	$result[] = 0; continue;
	54	}
	55
	56	// Ensure no endless looping
	57	$safety_valve = 50;
	58
	59	do
	60	{
	61	if (($c - $prev[1]) == 0)
	62	{
	63	// Hack: gone past end of string
	64	$error = 0;
	65	$i = strlen($str);
	66	break;
	67	}
	68
	69	$j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));
	70	$j = utf8_locate_next_chr($str, $j); // Correct to utf8 character boundary
	71	$prev = array($i,$c); // Save the index, offset for use next iteration
	72
	73	if ($j > $i)
	74	$c += strlen(utf8_decode(substr($str, $i, $j-$i))); // Determine new character offset
	75	else
	76	$c -= strlen(utf8_decode(substr($str, $j, $i-$j))); // Ditto
	77
	78	$error = abs($c-$offset);
	79	$i = $j; // Ready for next time around
	80	}
	81	while (($error > 7) && --$safety_valve); // From 7 it is faster to iterate over the string
	82
	83	if ($error && $error <= 7)
	84	{
	85	if ($c < $offset)
	86	{
	87	// Move up
	88	while ($error--)
	89	$i = utf8_locate_next_chr($str, ++$i);
	90	}
	91	else
	92	{
	93	// Move down
	94	while ($error--)
	95	$i = utf8_locate_current_chr($str, --$i);
	96	}
	97
	98	// Ready for next arg
	99	$c = $offset;
	100	}
	101
	102	$result[] = $i;
	103	}
	104
	105	if (count($result) == 1)
	106	return $result[0];
	107
	108	return $result;
	109	}
	110
	111	/**
	112	* Given a string and any byte index, returns the byte index
	113	* of the start of the current UTF-8 character, relative to supplied
	114	* position. If the current character begins at the same place as the
	115	* supplied byte index, that byte index will be returned. Otherwise
	116	* this function will step backwards, looking for the index where
	117	* curent UTF-8 character begins
	118	* @author Chris Smith<chris@jalakai.co.uk>
	119	* @param string
	120	* @param int byte index in the string
	121	* @return int byte index of start of next UTF-8 character
	122	* @package utf8
	123	* @subpackage position
	124	*/
	125	function utf8_locate_current_chr( &$str, $idx )
	126	{
	127	if ($idx <= 0)
	128	return 0;
	129
	130	$limit = strlen($str);
	131	if ($idx >= $limit)
	132	return $limit;
	133
	134	// Binary value for any byte after the first in a multi-byte UTF-8 character
	135	// will be like 10xxxxxx so & 0xC0 can be used to detect this kind
	136	// of byte - assuming well formed UTF-8
	137	while ($idx && ((ord($str[$idx]) & 0xC0) == 0x80))
	138	$idx--;
	139
	140	return $idx;
	141	}
	142
	143	/**
	144	* Given a string and any byte index, returns the byte index
	145	* of the start of the next UTF-8 character, relative to supplied
	146	* position. If the next character begins at the same place as the
	147	* supplied byte index, that byte index will be returned.
	148	* @author Chris Smith<chris@jalakai.co.uk>
	149	* @param string
	150	* @param int byte index in the string
	151	* @return int byte index of start of next UTF-8 character
	152	* @package utf8
	153	* @subpackage position
	154	*/
	155	function utf8_locate_next_chr(&$str, $idx)
	156	{
	157	if ($idx <= 0)
	158	return 0;
	159
	160	$limit = strlen($str);
	161	if ($idx >= $limit)
	162	return $limit;
	163
	164	// Binary value for any byte after the first in a multi-byte UTF-8 character
	165	// will be like 10xxxxxx so & 0xC0 can be used to detect this kind
	166	// of byte - assuming well formed UTF-8
	167	while (($idx < $limit) && ((ord($str[$idx]) & 0xC0) == 0x80))
	168	$idx++;
	169
	170	return $idx;
	171	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: