UTF-8 String Walking/Functions
Posted: Fri Mar 19, 2010 5:18 am
I was recently re-touching this so that my UTF-8 functions didn't rely on the mbstring library, because I plan on deploying some projects on environments I have no control over. I still need to run some benchmarks to see how much better the mb functions are; I may wrap those when they exist instead of always using my own implementation. This includes several string manipulation functions which can work with multibyte character formats. Right now it's hard-coded for UTF-8, but if you were to change the _mv_checkUtf8Char function to detect another encoding instead, it should work... This is untested though.
To enable proper multibyte support you need to define MV_USE_UTF8 as true.
multibyteString.php
Now, I have a second part to this which provides a consistent interface for iterating over strings. Both files need to be included to work.
The idea behind this is that if you use these functions you can basically freely toggle full UTF8 support on or off. So you can get the rough performance range (plus some overhead) of single byte characters on local applications, but if you release something for an international market you can flip on UTF support (and take the performance hit that comes with that) without having to completely re-visit all of your foundations. This is actually a difficult problem to solve if you have not built your program from the ground up considering these problems.
stringWalker.php
Using mv_StringWalker guarantees a character-by-character iteration over a string instead of a byte-by-byte iteration (at least when MV_USE_UTF8 is true). It provides a consistent interface for iterating over and arbitrarily manipulating strings (you can see examples of it in use in the multibyteString.php file).
Comments, critique, suggestions welcome.
To enable proper multibyte support you need to define MV_USE_UTF8 as true.
multibyteString.php
Code: Select all
<?PHP
/*-------------------------------------------------*\
| Developer: MichaelHamilton.com | mike@m2tm.net |
|----------------Keep Header Intact-----------------|
\*-------------------------------------------------*/
if(!defined('_MV_CORE_MULTIBYTESTRING_PHP_')){
define('_MV_CORE_MULTIBYTESTRING_PHP_', true);
//functions which rely on the mb library existing or not:
//MV_USING_MULTIBYTE records whether the mbstring extension is loaded (probed via mb_get_info).
define('MV_USING_MULTIBYTE', function_exists('mb_get_info'));
if(MV_USING_MULTIBYTE){
    //mbstring is present: make UTF-8 its internal and HTTP output encoding
    mb_internal_encoding('UTF-8');
    mb_http_output('UTF-8');
}else{
    //Default to typical php implementations when no mbstring extension exists
    ini_set('default_charset', 'UTF-8');
}
//call this function in ob_start(mv_obStartCallback());
//Returns "mb_output_handler" when mbstring is loaded, NULL otherwise.
function mv_obStartCallback(){
    return MV_USING_MULTIBYTE ? "mb_output_handler" : NULL;
}
//User decided toggle to turn UTF on and take a performance hit or not.
if(MV_USE_UTF8 == true){
//Counts characters (not bytes): walks a mv_StringWalker to the end of $string
//and reports the character index it stopped at.
function mv_strlen($string){
    $walker = new mv_StringWalker($string);
    $walker->setToEnd();
    $characterCount = $walker->getCharIndex();
    return $characterCount;
}
//Finds the character index of the first occurrence of $needle in $haystack.
//  $haystack, $needle - strings to search in / for; an empty one yields false
//  $offset            - character index to start searching from (values <= 0 start at 0)
//Returns the character index of the match, or false when there is none.
//
//Walker iterator 0 marks the candidate start position and iterator 1 is the
//comparison cursor. After a failed comparison the search resumes one character
//after the candidate start, so overlapping prefixes ("ab" in "aab") are found.
function mv_strpos($haystack, $needle, $offset = 0){
    if($needle == '' || $haystack == ''){return false;}
    $haystackWalker = new mv_StringWalker($haystack);
    $needleWalker = new mv_StringWalker($needle);
    if($offset > 0){
        //an offset past the end leaves the walker at the end, so the loop below never runs
        $haystackWalker->locate($offset);
    }
    while($haystackWalker->lookNext() !== false){
        $position = $haystackWalker->getCharIndex();
        $haystackWalker->copyPosition(0, 1);
        $needleWalker->setToStart();
        $matched = true;
        while(($needleCharacter = $needleWalker->goNext()) !== false){
            //goNext(1) returns false at the end of the haystack, which never equals a needle character
            if($haystackWalker->goNext(1) !== $needleCharacter){
                $matched = false;
                break;
            }
        }
        if($matched){
            return $position;
        }
        $haystackWalker->goNext();
    }
    return false;
}
//Case-insensitive mv_strpos: both strings are lower-cased with mv_strtolower before searching.
function mv_stripos($haystack, $needle, $offset = 0){
    $loweredHaystack = mv_strtolower($haystack);
    $loweredNeedle = mv_strtolower($needle);
    return mv_strpos($loweredHaystack, $loweredNeedle, $offset);
}
//Normalises the arguments of mv_substr in place.
//  $start        - in/out: a negative start is rewritten relative to the end of the string
//  $length       - in/out: the '!' marker ("not given") becomes the full character length
//  $stringLength - out: character length of $string, shortened by a negative $length
//Returns true when mv_substr should go on and extract characters; otherwise it
//returns the value mv_substr must hand back directly ('' or false).
function _mv_substrSetupValues($string, &$start, &$length, &$stringLength /*pure output*/){
    $stringLength = mv_strlen($string);
    if($length === '!'){
        $length = $stringLength;
    }
    if($length == 0){
        return '';
    }
    if(abs($start) > $stringLength){
        return false;
    }
    if($start < 0){
        $start += $stringLength;
    }
    if($length < 0){
        //a negative length trims that many characters off the end
        $stringLength += $length;
        if($start > $stringLength){
            return '';
        }
    }
    return true;
}
//Character-based substr. $length defaults to the '!' marker, meaning "to the end".
//Returns the extracted text, or whatever _mv_substrSetupValues decided ('' or false).
function mv_substr($string, $start, $length = '!'){
    $setupStatus = _mv_substrSetupValues($string, $start, $length, $stringLength);
    if($setupStatus !== true){
        return $setupStatus;
    }
    $walker = new mv_StringWalker($string);
    $walker->locate($start);
    $collected = '';
    $taken = 0;
    while(($character = $walker->goNext()) !== false){
        $pastEnd = ($taken + $start) >= $stringLength;
        $quotaReached = $length > 0 && $taken >= $length;
        if($pastEnd || $quotaReached){ //exit early conditions
            break;
        }
        $collected .= $character;
        $taken++;
    }
    return $collected;
}
//True when $character is non-empty and _mv_checkUtf8Char accepts it; false otherwise.
function mv_ischar($character){
    if($character != ''){
        return _mv_checkUtf8Char($character);
    }
    return false;
}
//Function by javalc6@gmail.com - http://php.net/manual/en/function.mb-check-encoding.php
//modified for 1 character to exclude loop for performance (profiled and offers improvement)
//
//Checks whether $str starts with one complete UTF-8 sequence: a single byte
//below 0x80, or a lead byte followed by the number of continuation bytes
//(0x80-0xBF) that the lead byte announces. Only that first sequence is
//inspected; any bytes after it are ignored.
//Returns true for a complete sequence. Returns false for an empty string, a
//stray continuation byte (0x80-0xBF, including 0x80 itself), a lead byte above
//0xF7, or a sequence that is truncated or has a bad continuation byte.
function _mv_checkUtf8Char($str) {
    $len = strlen($str);
    if ($len === 0) return false;
    $c = ord($str[0]);
    if ($c >= 128) {
        if ($c <= 191) return false; //continuation byte cannot start a character
        elseif ($c <= 223) $bytes = 2;
        elseif ($c <= 239) $bytes = 3;
        elseif ($c <= 247) $bytes = 4;
        else return false;
        if ($bytes > $len) return false; //sequence is not complete yet
        for ($i = 1; $i < $bytes; $i++) {
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) return false;
        }
    }
    return true;
}
//Reverses $string character by character by prepending each character the walker yields.
function mv_strrev($string){
    $reversed = '';
    $walker = new mv_StringWalker($string);
    for($character = $walker->goNext(); $character !== false; $character = $walker->goNext()){
        $reversed = $character.$reversed;
    }
    return $reversed;
}
//Counts how many consecutive characters, read by calling the walker method named
//$directionFunction, are members of $characterArray. Counting stops at the first
//character that is not a member (that character has already been consumed) or
//when the walker returns false.
function _mv_trim_counter(&$stringWalker, $directionFunction, &$characterArray){
    $distance = 0;
    while(true){
        $char = $stringWalker->$directionFunction();
        if($char === false || !in_array($char, $characterArray)){
            break;
        }
        $distance++;
    }
    return $distance;
}
//Character-aware trim: strips every leading and trailing character of $string
//that appears in $characters.
function mv_trim($string, $characters = " \t\n\r\0\x0B"){
    $trimSet = mv_getCharacterArray($characters);
    $walker = new mv_StringWalker($string);
    $leading = _mv_trim_counter($walker, 'goNext', $trimSet);
    $walker->setToEnd();
    $total = $walker->getCharIndex();
    if($leading == $total){
        //every character was trimmable
        return '';
    }
    $trailing = _mv_trim_counter($walker, 'goPrev', $trimSet);
    return mv_substr($string, $leading, $total - $trailing - $leading);
}
//Returns the character at character index $i of $string, or false when $i is
//negative or the walker finds no character at that index.
//The previous unused locals and the extra mv_strlen() pass over the whole
//string were dropped; locate() alone determines the result.
function mv_getChar($string, $i){
    if($i < 0){return false;}
    $stringWalker = new mv_StringWalker($string);
    return $stringWalker->locate($i);
}
}else{
//Single-byte variant: the length is simply the byte count.
function mv_strlen($string){
    $byteCount = strlen($string);
    return $byteCount;
}
//Single-byte variant: forwards straight to strpos().
function mv_strpos($haystack, $needle, $offset = 0){
    $foundAt = strpos($haystack, $needle, $offset);
    return $foundAt;
}
//PHP 4/5 compatibility
//Case-insensitive search built from strpos() over lower-cased copies of both strings.
function mv_stripos($haystack, $needle, $offset = 0){
    $loweredHaystack = mv_strtolower($haystack);
    $loweredNeedle = mv_strtolower($needle);
    return strpos($loweredHaystack, $loweredNeedle, $offset);
}
//Single-byte variant of mv_substr. The '!' default marks "no length given",
//in which case substr() is called without a length argument.
function mv_substr($string, $start, $length = '!'){
    return ($length === '!')
        ? substr($string, $start)
        : substr($string, $start, $length);
}
//Single-byte variant: anything that does not loosely equal the empty string counts as a character.
function mv_ischar($character){
    $isEmpty = ($character == '');
    return !$isEmpty;
}
//Single-byte variant: byte-wise reversal via strrev().
function mv_strrev($string){
    $reversed = strrev($string);
    return $reversed;
}
//Single-byte variant: forwards to trim() with the same default character list.
function mv_trim($string, $characters = " \t\n\r\0\x0B"){
    $trimmed = trim($string, $characters);
    return $trimmed;
}
//Returns the byte at index $i of $string, or false when $i is negative or past
//the end of the string.
//The explicit bounds check avoids the out-of-range string offset read the old
//code performed, and makes negative indexes return false as the UTF-8 variant
//of this function does.
function mv_getChar($string, $i){
    if($i < 0 || $i >= strlen($string)){
        return false;
    }
    return $string[$i];
}
}
//Switches a MySQL connection to utf8.
//  $DBConnection - mysql link resource; when null (loosely) the default link of
//                  the mysql extension is used.
//"names" is not a system variable, so it cannot be assigned in the
//comma-separated SET list (that made the whole statement fail); SET NAMES is
//now sent as its own statement. mysql_set_charset() is still called when the
//function exists.
function mv_setDatabaseCharset($DBConnection = null){
    $statements = array(
        "SET character_set_results = 'utf8', character_set_client = 'utf8',
character_set_connection = 'utf8', character_set_database = 'utf8',
character_set_server = 'utf8'",
        "SET NAMES 'utf8'"
    );
    $canSetCharset = function_exists('mysql_set_charset');
    if($DBConnection == null){
        foreach($statements as $SQL){
            mysql_query($SQL);
        }
        if($canSetCharset){
            mysql_set_charset('utf8');
        }
    }else{
        foreach($statements as $SQL){
            mysql_query($SQL, $DBConnection);
        }
        if($canSetCharset){
            mysql_set_charset('utf8', $DBConnection);
        }
    }
}
//Function by leha_grobov - http://php.net/manual/en/function.strtolower.php
//
//Table-driven case conversion built on str_replace.
//  $string  - text to convert
//  $tolower - loosely true: upper -> lower; otherwise lower -> upper
//Returns the converted string; characters missing from the tables pass through untouched.
//
//The tables previously ended in 33 "?" placeholders that replaced "?" with "?"
//and therefore did nothing. NOTE(review): they were presumably Cyrillic letters
//lost to an encoding conversion (33 matches the Russian alphabet); the Russian
//alphabet is restored in their place. This file must be saved as UTF-8.
function _mv_convertcase($string, $tolower){
    static $uppercase = array(
        "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U",
        "V", "W", "X", "Y", "Z", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï",
        "Ð", "Ñ", "Ò", "Ó", "Ô", "Õ", "Ö", "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "А", "Б", "В", "Г", "Д", "Е", "Ё", "Ж",
        "З", "И", "Й", "К", "Л", "М", "Н", "О", "П", "Р", "С", "Т", "У", "Ф", "Х", "Ц", "Ч", "Ш", "Щ", "Ъ", "Ы",
        "Ь", "Э", "Ю", "Я"
    );
    static $lowercase = array(
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
        "v", "w", "x", "y", "z", "à", "á", "â", "ã", "ä", "å", "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï",
        "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "ø", "ù", "ú", "û", "ü", "ý", "а", "б", "в", "г", "д", "е", "ё", "ж",
        "з", "и", "й", "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы",
        "ь", "э", "ю", "я"
    );
    if($tolower == true){
        return str_replace($uppercase, $lowercase, $string);
    }else{
        return str_replace($lowercase, $uppercase, $string);
    }
}
//str_replace is used for case conversion because it is UTF-8 safe and gives better support, at a small performance cost.
//Lower-cases $string through the _mv_convertcase tables.
function mv_strtolower($string){
    $toLower = true;
    return _mv_convertcase($string, $toLower);
}
//Upper-cases $string through the _mv_convertcase tables.
function mv_strtoupper($string){
    $toLower = false;
    return _mv_convertcase($string, $toLower);
}
//PHP 4/5 compatibility
//Shared implementation behind mv_strstr and mv_stristr.
//  $haystack, $needle - strings to search in / for
//  $part              - true: return the text before the match; false: return from the match to the end
//  $insensitive       - true: compare lower-cased copies of both strings
//Returns the requested part of $haystack, or false when $needle is not found.
//The match position is computed on the (possibly lower-cased) copy but the
//result is always cut from the original $haystack.
//The stray undefined $encoding argument formerly passed to mv_strpos was removed.
function mv_strstr_common($haystack, $needle, $part = false, $insensitive = false){
    if($insensitive){
        $pos = mv_strpos(mv_strtolower($haystack), mv_strtolower($needle), 0);
    }else{
        $pos = mv_strpos($haystack, $needle, 0);
    }
    if($pos === false){
        return false;
    }
    if($part){
        return mv_substr($haystack, 0, $pos);
    }
    return mv_substr($haystack, $pos);
}
//Case-sensitive front end of mv_strstr_common.
function mv_strstr($haystack, $needle, $part = false){
    $insensitive = false;
    return mv_strstr_common($haystack, $needle, $part, $insensitive);
}
//Case-insensitive front end of mv_strstr_common.
function mv_stristr($haystack, $needle, $part = false){
    $insensitive = true;
    return mv_strstr_common($haystack, $needle, $part, $insensitive);
}
//This is a bit more useful than the strrpos functions (which are inconsistent in php4 and 5 and the offset behavior is weird)
//Searches backwards by looking for the reversed needle in the reversed haystack.
//A positive $offset becomes (length - $offset) in the reversed string; a zero or
//negative $offset becomes its absolute value.
//Returns the character index of the match in the original string, or false.
function mv_backstrpos($haystack, $needle, $offset = 0){
    $length = mv_strlen($haystack);
    if($offset > 0){
        $offset = $length - $offset;
    }else{
        $offset = abs($offset);
    }
    $pos = mv_strpos(mv_strrev($haystack), mv_strrev($needle), $offset);
    if($pos === false){
        return false;
    }
    return $length - $pos - mv_strlen($needle);
}
//Case-insensitive mv_backstrpos: both strings are lower-cased first.
function mv_backstripos($haystack, $needle, $offset = 0){
    $loweredHaystack = mv_strtolower($haystack);
    $loweredNeedle = mv_strtolower($needle);
    return mv_backstrpos($loweredHaystack, $loweredNeedle, $offset);
}
//htmlentities() with the character set pinned to UTF-8.
function mv_htmlentities($string, $quoteStyle = ENT_COMPAT){
    $encoded = htmlentities($string, $quoteStyle, 'UTF-8');
    return $encoded;
}
//True when $char is identical to one of: space, tab, newline, carriage return, NUL, vertical tab.
function mv_isWhitespace($char){
    return in_array($char, array(' ', "\t", "\n", "\r", "\0", "\x0B"), true);
}
//Returns the list of characters treated as whitespace: space, tab, newline, carriage return, NUL, vertical tab.
function mv_getWhitespaceArray(){
    return array(' ', "\t", "\n", "\r", "\0", "\x0B");
}
//This is fine for small strings, avoid using it too often for memory use reasons.
//"Hello" would return array('H', 'e', 'l', 'l', 'o') and is UTF-8 friendly
//Each element is whatever the walker reports as its last read after a successful goNext().
function mv_getCharacterArray($string){
    $characters = array();
    $walker = new mv_StringWalker($string);
    while(true){
        if($walker->goNext() === false){
            break;
        }
        $characters[] = $walker->getLastRead();
    }
    return $characters;
}
}
?>
The idea behind this is that if you use these functions you can basically freely toggle full UTF8 support on or off. So you can get the rough performance range (plus some overhead) of single byte characters on local applications, but if you release something for an international market you can flip on UTF support (and take the performance hit that comes with that) without having to completely re-visit all of your foundations. This is actually a difficult problem to solve if you have not built your program from the ground up considering these problems.
stringWalker.php
Code: Select all
<?PHP
/*-------------------------------------------------*\
| Developer: MichaelHamilton.com | mike@m2tm.net |
|----------------Keep Header Intact-----------------|
\*-------------------------------------------------*/
if(!defined('_MV_CORE_STRINGWALKER_PHP_')){
define('_MV_CORE_STRINGWALKER_PHP_', true);
//depending on if we have to handle UTF8 input or not, this can be optimized
if(MV_USE_UTF8 === true){
//Character-by-character cursor over a multibyte string (variant used when MV_USE_UTF8 is true).
//Several independent iterators, addressed by $iteratorId (0 by default), can
//walk the same string. Each iterator has a character index and a byte index;
//both describe the position of the NEXT character a forward read returns, so
//a backward read returns the character that ends just before that position.
//Character boundaries are decided by the global mv_ischar() function.
class mv_StringWalker{
    var $recent;    //character produced by the latest read attempt; false right after setString()
    var $charIndex; //iteratorId => position counted in characters
    var $byteIndex; //iteratorId => position counted in bytes
    var $string, $length; //the subject string and its length in bytes

    //PHP 8 no longer treats a method named like the class as the constructor,
    //so the real constructor is __construct(). It is declared first so PHP 5
    //does not report the old-style method below as a redefined constructor.
    function __construct($string = ''){
        $this->setString($string);
    }
    //PHP 4 style constructor, kept for backward compatibility
    function mv_StringWalker($string = ''){
        $this->setString($string);
    }
    //PUBLIC=------------------------------------------------------
    //Replaces the subject string and resets to a single iterator (id 0) at the start.
    function setString($string){
        $this->recent = false;
        $this->byteIndex = array(0);
        $this->charIndex = array(0);
        $this->string = $string;
        $this->length = strlen($string);
    }
    //Moves the iterator behind the last character; getCharIndex() then equals the character count.
    function setToEnd($iteratorId = 0){
        while($this->lookNext($iteratorId) !== false){
            $this->goNext($iteratorId);
        }
    }
    //Moves the iterator in front of the first character.
    function setToStart($iteratorId = 0){
        $this->charIndex[$iteratorId] = 0;
        $this->byteIndex[$iteratorId] = 0;
    }
    //Positions the iterator at character $index, walking from the start.
    //Returns the character at that position without consuming it, or false
    //when the position is at or beyond the end of the string.
    function locate($index, $iteratorId = 0){
        $this->charIndex[$iteratorId] = 0;
        $this->byteIndex[$iteratorId] = 0;
        while($this->charIndex[$iteratorId] < $index && $this->goNext($iteratorId) !== false){}
        return $this->lookNext($iteratorId);
    }
    function getCharIndex($iteratorId = 0){
        return $this->charIndex[$iteratorId];
    }
    function getByteIndex($iteratorId = 0){
        return $this->byteIndex[$iteratorId];
    }
    //Returns the next character without moving, or false at the end.
    function lookNext($iteratorId = 0){
        return $this->_go(0, 1, $this->length, $iteratorId);
    }
    //Returns the next character and moves past it, or false at the end.
    function goNext($iteratorId = 0){
        return $this->_go(1, 1, $this->length, $iteratorId);
    }
    //Returns the previous character without moving, or false at the start.
    function lookPrev($iteratorId = 0){
        return $this->_go(0, 0, 0, $iteratorId);
    }
    //Returns the previous character and moves in front of it, or false at the start.
    function goPrev($iteratorId = 0){
        return $this->_go(-1, 0, 0, $iteratorId);
    }
    //Returns what the latest read attempt (look or go, any iterator) produced.
    function getLastRead(){
        return $this->recent;
    }
    //Makes iterator $iteratorIdDestination point at the same position as $iteratorIdSource.
    function copyPosition($iteratorIdSource, $iteratorIdDestination){
        $this->byteIndex[$iteratorIdDestination] = $this->byteIndex[$iteratorIdSource];
        $this->charIndex[$iteratorIdDestination] = $this->charIndex[$iteratorIdSource];
    }
    //Inserts $injectString at the position of iterator $iteratorId.
    //With $revalidateIterators, iterators positioned after the insertion point
    //are shifted so they keep pointing at the same characters. The injecting
    //iterator itself stays put, in front of the inserted text.
    function inject($injectString, $iteratorId = 0, $revalidateIterators = true){
        $tmpLen = false;
        if($revalidateIterators){
            $tmpLen = $this->_offsetInvalidatedIteratorsAfterInject($injectString, $this->charIndex[$iteratorId]);
        }
        if($tmpLen === false){
            //no iterator needed shifting, so the injected byte length is still unknown
            $tmpLen = strlen($injectString);
        }
        $this->length += $tmpLen;
        $this->string = substr($this->string, 0, $this->byteIndex[$iteratorId]).
            $injectString.
            substr($this->string, $this->byteIndex[$iteratorId]);
    }
    //Appends $appendString to the end of the subject; iterator positions are unchanged.
    function append($appendString){
        $this->length+=strlen($appendString);
        $this->string.=$appendString;
    }
    //PRIVATE=-----------------------------------------------------
    //If any iterator sits after $maxCharIndex, measures $injectString, shifts
    //those iterators by its size and returns its byte length. Returns false
    //when no iterator needed shifting.
    function _offsetInvalidatedIteratorsAfterInject($injectString, $maxCharIndex){
        foreach($this->charIndex as $value){
            if($value > $maxCharIndex){
                $lengthCheck = new mv_StringWalker($injectString);
                $lengthCheck->setToEnd();
                $this->_offsetGreaterIterators($maxCharIndex, $lengthCheck->getCharIndex(), $lengthCheck->getByteIndex());
                return $lengthCheck->getByteIndex();
            }
        }
        return false;
    }
    //Shifts every iterator positioned after $maxCharIndex by the given character and byte counts.
    function _offsetGreaterIterators($maxCharIndex, $charCount, $byteCount){
        foreach($this->charIndex as $key => $value){
            if($value > $maxCharIndex){
                $this->charIndex[$key]+=$charCount;
                $this->byteIndex[$key]+=$byteCount;
            }
        }
    }
    //Reads one character and optionally moves the iterator.
    //  $directionVector - 1 move forward, -1 move backward, 0 only look
    //  $compareOption   - truthy: read forward; falsy: read backward
    //  $endValue        - byte boundary the read must not cross
    //Returns the character, or false when no byte could be read.
    function _go($directionVector, $compareOption, $endValue, $iteratorId){
        $tmpchr = $this->_simulateMove($byteDiff, $iteratorId, $compareOption, $endValue);
        if($byteDiff > 0){
            $this->charIndex[$iteratorId]+=(1*$directionVector);
            $this->byteIndex[$iteratorId]+=($byteDiff*$directionVector);
            return $tmpchr;
        }else{
            return false;
        }
    }
    //Collects bytes in the requested direction until mv_ischar() accepts them
    //as one character or the boundary is reached. $byteDiff receives the
    //number of bytes consumed. The result is also stored in $this->recent.
    function _simulateMove(&$byteDiff, $iteratorId, $compareOption, $endValue){
        $tmpchr = '';
        $byteDiff = 0;
        $tmpByteIndex = $this->byteIndex[$iteratorId];
        if($compareOption){
            for(;$tmpByteIndex < $endValue && !mv_ischar($tmpchr);$tmpByteIndex++){
                $tmpchr.=$this->string[$tmpByteIndex];
                $byteDiff++;
            }
        }else{
            //The previous character ends one byte BEFORE the iterator position.
            //Starting at the position itself read out of range at the end of
            //the string and returned the wrong character everywhere else.
            for($tmpByteIndex--;$tmpByteIndex >= $endValue && !mv_ischar($tmpchr);$tmpByteIndex--){
                $tmpchr=$this->string[$tmpByteIndex].$tmpchr;
                $byteDiff++;
            }
        }
        $this->recent = $tmpchr;
        return $tmpchr;
    }
}
}else{
//single byte character optimizations
//Single-byte variant of the string walker: one character is one byte, so the
//character index doubles as the byte index. Several independent iterators,
//addressed by $iteratorId (0 by default), can walk the same string. An
//iterator position is the index of the NEXT character a forward read returns;
//a backward read returns the character just before that position.
class mv_StringWalker{
    var $recent;    //character produced by the latest successful read; false right after setString()
    var $charIndex; //iteratorId => position
    var $byteIndex; //unused in this variant, declared for parity with the multibyte walker
    var $string, $length; //the subject string and its length in bytes

    //PHP 8 no longer treats a method named like the class as the constructor,
    //so the real constructor is __construct(). It is declared first so PHP 5
    //does not report the old-style method below as a redefined constructor.
    function __construct($string = ''){
        $this->setString($string);
    }
    //PHP 4 style constructor, kept for backward compatibility
    function mv_StringWalker($string = ''){
        $this->setString($string);
    }
    //PUBLIC=------------------------------------------------------
    //Replaces the subject string and resets to a single iterator (id 0) at the start.
    function setString($string){
        $this->recent = false;
        $this->charIndex = array(0);
        $this->string = $string;
        $this->length = strlen($string);
    }
    //Moves the iterator behind the last character, so getCharIndex() equals the
    //length, goNext() returns false and goPrev() returns the last character.
    //It used to stop ON the last character (length-1), which made length
    //measurements off by one and read offset -1 of an empty string.
    function setToEnd($iteratorId = 0){
        $this->charIndex[$iteratorId] = $this->length;
        $this->recent = ($this->length > 0) ? $this->string[$this->length-1] : false;
    }
    //Moves the iterator in front of the first character.
    function setToStart($iteratorId = 0){
        $this->charIndex[$iteratorId] = 0;
    }
    //Positions the iterator at $index (negative values are clamped to 0).
    //Returns the character there without consuming it. Returns false when
    //$index equals the length (the iterator is placed at the end), and when
    //$index is past the length (the iterator is reset to 0).
    function locate($index, $iteratorId = 0){
        if($index > $this->length){
            $this->charIndex[$iteratorId] = 0;
            return false;
        }
        if($index < 0){$index = 0;}
        $this->charIndex[$iteratorId] = $index;
        //index == length is a valid position (the end) but holds no character
        return ($index < $this->length) ? $this->string[$index] : false;
    }
    function getCharIndex($iteratorId = 0){
        return $this->charIndex[$iteratorId];
    }
    //provide a consistent interface
    function getByteIndex($iteratorId = 0){
        return $this->charIndex[$iteratorId];
    }
    //Returns the next character without moving, or false at the end.
    function lookNext($iteratorId = 0){
        if($this->charIndex[$iteratorId] < $this->length){
            $this->recent = $this->string[$this->charIndex[$iteratorId]];
            return $this->recent;
        }
        return false;
    }
    //Returns the next character and moves past it, or false at the end.
    function goNext($iteratorId = 0){
        if($this->charIndex[$iteratorId] < $this->length){
            $this->recent = $this->string[$this->charIndex[$iteratorId]];
            $this->charIndex[$iteratorId]++;
            return $this->recent;
        }
        return false;
    }
    //Returns the previous character without moving, or false at the start.
    //The old bounds test compared against the string length and so never succeeded.
    function lookPrev($iteratorId = 0){
        if($this->charIndex[$iteratorId] > 0){
            $this->recent = $this->string[$this->charIndex[$iteratorId]-1];
            return $this->recent;
        }
        return false;
    }
    //Returns the previous character and moves in front of it, or false at the start.
    function goPrev($iteratorId = 0){
        if($this->charIndex[$iteratorId] > 0){
            $this->charIndex[$iteratorId]--;
            $this->recent = $this->string[$this->charIndex[$iteratorId]];
            return $this->recent;
        }
        return false;
    }
    //Returns what the latest successful read (look or go, any iterator) produced.
    function getLastRead(){
        return $this->recent;
    }
    //Makes iterator $iteratorIdDestination point at the same position as $iteratorIdSource.
    function copyPosition($iteratorIdSource, $iteratorIdDestination){
        $this->charIndex[$iteratorIdDestination] = $this->charIndex[$iteratorIdSource];
    }
    //Inserts $injectString at the position of iterator $iteratorId.
    //With $revalidateIterators, iterators positioned after the insertion point
    //are shifted so they keep pointing at the same characters. The injecting
    //iterator itself stays put, in front of the inserted text.
    //The insertion point is taken from charIndex; this variant never fills
    //byteIndex, which the old code read here.
    function inject($injectString, $iteratorId = 0, $revalidateIterators = true){
        $position = $this->charIndex[$iteratorId];
        if($revalidateIterators){
            $this->_offsetInvalidatedIteratorsAfterInject($injectString, $position);
        }
        $this->length += strlen($injectString);
        $this->string = substr($this->string, 0, $position).
            $injectString.
            substr($this->string, $position);
    }
    //Appends $appendString to the end of the subject; iterator positions are unchanged.
    function append($appendString){
        $this->length+=strlen($appendString);
        $this->string.=$appendString;
    }
    //PRIVATE=-----------------------------------------------------
    //If any iterator sits after $maxCharIndex, shifts those iterators by the
    //length of $injectString and returns that length. Returns false when no
    //iterator needed shifting.
    function _offsetInvalidatedIteratorsAfterInject($injectString, $maxCharIndex){
        foreach($this->charIndex as $value){
            if($value > $maxCharIndex){
                $injectLength = strlen($injectString);
                $this->_offsetGreaterIterators($maxCharIndex, $injectLength);
                return $injectLength;
            }
        }
        return false;
    }
    //Shifts every iterator positioned after $maxCharIndex by $charCount.
    function _offsetGreaterIterators($maxCharIndex, $charCount){
        foreach($this->charIndex as $key => $value){
            if($value > $maxCharIndex){
                $this->charIndex[$key]+=$charCount;
            }
        }
    }
}
}
}
?>
Comments, critique, suggestions welcome.