Context Navigation

source: branches/rsr.v5.1.dev/web/punbb/include/utf8/utils/ascii.php @ 8

Last change on this file since 8 was 3, checked in by dj3c1t, 13 years ago
passage a Fluxbb 1.4.7
File size: 7.6 KB

Rev	Line
[3]	1	<?php
	2
	3	/**
	4	* Tools to help with ASCII in UTF-8
	5	* @version $Id: ascii.php,v 1.5 2006/10/16 20:38:12 harryf Exp $
	6	* @package utf8
	7	* @subpackage ascii
	8	*/
	9
	/**
	 * Reports whether every byte of a string lies in the 7-bit ASCII range
	 * (0x00-0x7F).
	 *
	 * Useful as a cheap pre-check: when a string is pure ASCII the native
	 * byte-oriented PHP functions are safe to use and the UTF-8 aware
	 * replacements can be skipped, e.g.
	 *
	 * <code>
	 * if (utf8_is_ascii($someString)) {
	 *     // Plain ASCII - the native function is fine
	 *     $someString = strtolower($someString);
	 * } else {
	 *     $someString = utf8_strtolower($someString);
	 * }
	 * </code>
	 *
	 * An empty string contains no offending byte and therefore yields TRUE.
	 *
	 * @param string $str string to inspect
	 * @return boolean TRUE when no byte above 0x7F is present
	 * @package utf8
	 * @subpackage ascii
	 * @see utf8_is_ascii_ctrl
	 */
	function utf8_is_ascii($str)
	{
		// Look for the first byte that falls outside 0x00-0x7F
		$high_byte_found = preg_match('/(?:[^\x00-\x7F])/', $str);

		if ($high_byte_found === 1)
			return false;

		return true;
	}
	36
	/**
	 * Reports whether a string consists solely of "printable" 7-bit ASCII:
	 * bytes 0x20-0x7E plus tab (0x09), line feed (0x0A) and carriage
	 * return (0x0D). Any other device control code, DEL (0x7F) or byte
	 * above 0x7F makes the test fail.
	 *
	 * Note that an empty string yields FALSE.
	 *
	 * @param string $str string to inspect
	 * @return boolean TRUE when the string is non-empty and contains only
	 *                 the permitted bytes
	 * @package utf8
	 * @subpackage ascii
	 * @see utf8_is_ascii
	 */
	function utf8_is_ascii_ctrl($str)
	{
		// Nothing to examine: empty input is reported as FALSE
		if (strlen($str) <= 0)
			return false;

		// Look for the first byte that is a control code or non-ASCII
		$offender_found = preg_match('/[^\x09\x0A\x0D\x20-\x7E]/', $str);

		return ($offender_found !== 1);
	}
	56
	/**
	 * Removes every byte outside the 7-bit ASCII range (0x00-0x7F).
	 *
	 * Use this before handing a string to a system that is known to
	 * support 7-bit ASCII only. Multi-byte UTF-8 sequences consist
	 * entirely of bytes >= 0x80, so whole characters are dropped.
	 *
	 * @param string $str input string
	 * @return string the input with all non-ASCII bytes removed
	 * @package utf8
	 * @subpackage ascii
	 * @see utf8_strip_non_ascii_ctrl
	 */
	function utf8_strip_non_ascii($str)
	{
		// A single byte-wise replace (no /u modifier) is enough. The former
		// match-and-echo loop relied on an alternation whose "|" was escaped
		// into a literal pipe, so it never emitted any output.
		// The cast turns a NULL (PCRE error) into an empty string so the
		// documented return type always holds.
		return (string) preg_replace('/[^\x00-\x7F]+/', '', $str);
	}
	84
	/**
	 * Removes the ASCII device control codes that are not permitted in XML:
	 * 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F. Tab, line feed and carriage
	 * return are kept, and multi-byte UTF-8 characters are left untouched
	 * because none of their bytes fall in the removed ranges.
	 *
	 * @see http://hsivonen.iki.fi/producing-xml/#controlchar
	 * @param string $str input string
	 * @return string the input with the control codes removed
	 * @package utf8
	 * @subpackage ascii
	 */
	function utf8_strip_ascii_ctrl($str)
	{
		// A single byte-wise replace (no /u modifier) is enough. The former
		// match-and-echo loop relied on an alternation whose "|" was escaped
		// into a literal pipe, so it never emitted any output.
		// The cast turns a NULL (PCRE error) into an empty string so the
		// documented return type always holds.
		return (string) preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/', '', $str);
	}
	111
	/**
	 * Removes every non-ASCII byte and every ASCII device control code,
	 * keeping only bytes 0x20-0x7E plus tab (0x09), line feed (0x0A) and
	 * carriage return (0x0D).
	 *
	 * For a list of ASCII device control codes see the 2nd table here:
	 * http://www.w3schools.com/tags/ref_ascii.asp
	 *
	 * @param string $str input string
	 * @return string the input reduced to printable ASCII and tab/LF/CR
	 * @package utf8
	 * @subpackage ascii
	 * @see utf8_strip_non_ascii
	 */
	function utf8_strip_non_ascii_ctrl($str)
	{
		// A single byte-wise replace (no /u modifier) is enough. The former
		// match-and-echo loop relied on an alternation whose "|" was escaped
		// into a literal pipe, so it never emitted any output.
		// The cast turns a NULL (PCRE error) into an empty string so the
		// documented return type always holds.
		return (string) preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/', '', $str);
	}
	139
	/**
	 * Replace accented UTF-8 characters by unaccented 7-bit ASCII "equivalents".
	 *
	 * The purpose of this function is to replace characters commonly found in
	 * Latin alphabets with something more or less equivalent from the ASCII
	 * range. This can be useful for converting a UTF-8 string to something
	 * ready for a filename, for example. Characters that are not listed in
	 * the tables are left as they are, so you would usually pass the result
	 * through utf8_strip_non_ascii afterwards to drop any remaining
	 * non-ASCII bytes.
	 *
	 * Some replacements are longer than one character (e.g. "ae", "ss", "Th").
	 *
	 * For a more complete implementation of transliteration, see the
	 * utf8_to_ascii package available from the phputf8 project downloads:
	 * http://prdownloads.sourceforge.net/phputf8
	 *
	 * @param string $str UTF-8 string
	 * @param int $case (optional) -1 lowercase letters only, +1 uppercase
	 *                  letters only, 0 (default) both cases
	 * @return string accented chars replaced with ASCII equivalents
	 * @author Andreas Gohr <andi@splitbrain.org>
	 * @package utf8
	 * @subpackage ascii
	 * @see utf8_strip_non_ascii
	 */
	function utf8_accents_to_ascii($str, $case=0)
	{
		// The tables are built once per request and cached in static locals
		static $UTF8_LOWER_ACCENTS = null;
		static $UTF8_UPPER_ACCENTS = null;

		// $case <= 0 covers "lowercase only" (-1) and "both" (0)
		if ($case <= 0)
		{
			if (is_null($UTF8_LOWER_ACCENTS))
			{
				// Keys must be genuine UTF-8 encoded characters; this file has
				// to be saved as UTF-8 (without BOM) for the lookup to work
				$UTF8_LOWER_ACCENTS = array(
					'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
					'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
					'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
					'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
					'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
					'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
					'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
					'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
					'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
					'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
					'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
					'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
					'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
					'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
					'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
				);
			}

			$str = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $str);
		}

		// $case >= 0 covers "uppercase only" (+1) and "both" (0)
		if ($case >= 0)
		{
			if (is_null($UTF8_UPPER_ACCENTS))
			{
				// Uppercase counterparts of the table above (there is no
				// uppercase entry for sharp s or the micro sign)
				$UTF8_UPPER_ACCENTS = array(
					'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
					'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
					'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
					'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
					'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
					'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
					'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
					'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
					'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
					'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
					'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
					'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
					'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
					'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
					'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
				);
			}

			$str = str_replace(array_keys($UTF8_UPPER_ACCENTS), array_values($UTF8_UPPER_ACCENTS), $str);
		}

		return $str;
	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: