Page 1 of 1

UTF-8 String Walking/Functions

Posted: Fri Mar 19, 2010 5:18 am
by M2tM
I was recently re-touching this so that my UTF-8 functions don't rely on the mbstring library, because I plan on deploying some projects to environments I have no control over. I still need to run some benchmarks to see how much faster the mb functions are; if they are available, I may wrap them in the implementation instead of always using my own. This includes several string manipulation functions that can work with multibyte character formats. Right now it's hard-coded for UTF-8, but if you were to change the _mv_checkUtf8Char function to detect another encoding instead, it should work... This is untested, though.

To enable proper multibyte support you need to define MV_USE_UTF8 as true.

multibyteString.php

Code: Select all

 
<?PHP
/*-------------------------------------------------*\
|   Developer: MichaelHamilton.com | mike@m2tm.net  |
|----------------Keep Header Intact-----------------|
\*-------------------------------------------------*/
if(!defined('_MV_CORE_MULTIBYTESTRING_PHP_')){
    define('_MV_CORE_MULTIBYTESTRING_PHP_', true);
 
    //functions which rely on the mb library existing or not:
    if(function_exists('mb_get_info')){
        define('MV_USING_MULTIBYTE', true);
        mb_internal_encoding('UTF-8');
        mb_http_output('UTF-8');
 
        //call this function in ob_start(mv_obStartCallback());
        function mv_obStartCallback(){
            return "mb_output_handler";
        }
    }else{ //Default to typical php implementations when no mbstring extension exists
        define('MV_USING_MULTIBYTE', false);
        ini_set('default_charset', 'UTF-8');
        
        //call this function in ob_start(mv_obStartCallback());
        function mv_obStartCallback(){
            return NULL;
        }
    }
 
    //User decided toggle to turn UTF on and take a performance hit or not.
    if(MV_USE_UTF8 == true){        
        function mv_strlen($string){
            $stringLength = new mv_StringWalker($string);
            $stringLength->setToEnd();
            return $stringLength->getCharIndex();
        }
        
        function mv_strpos($haystack, $needle, $offset = 0){
            if($needle == '' || $haystack == ''){return false;}
            $haystackWalker = new mv_StringWalker($haystack);
            $needleWalker = new mv_StringWalker($needle);
            $position = 0;
            while(($character = $haystackWalker->goNext()) !== false){
                if(($position >= $offset) && ($needleWalker->goNext() === $character)){
                    if($needleWalker->lookNext() === false){
                        return $position;
                    }
                }else{
                    $position = $haystackWalker->getCharIndex();
                    $needleWalker->setToStart();
                }
            }
            return false;
        }
        
        function mv_stripos($haystack, $needle, $offset = 0){
            return mv_strpos(mv_strtolower($haystack), mv_strtolower($needle), $offset);
        }
        
        function _mv_substrSetupValues($string, &$start, &$length, &$stringLength /*pure output*/){
            $stringLength = mv_strlen($string);
            if($length === '!'){$length = $stringLength;}
            if($length == 0){return '';}
            if(abs($start) > $stringLength){return false;}
            if($start < 0){
                $start = $stringLength + $start;
            }
            if($length < 0){
                $stringLength+=$length;
                if($start > $stringLength){return '';}
            }
            return true;
        }
        
        function mv_substr($string, $start, $length = '!'){
            if(($setupStatus = _mv_substrSetupValues($string, $start, $length, $stringLength)) === true){
                $stringWalker = new mv_StringWalker($string);
                $stringWalker->locate($start);
                $returnString = ''; $characterCount = 0;
                while(($character = $stringWalker->goNext()) !== false){
                    if((($characterCount+$start) >= $stringLength) || ($length > 0 && $characterCount >= $length)){ //exit early conditions
                        break;
                    }
                    $returnString.=$character;
                    $characterCount++;
                }
                return $returnString;
            }else{
                return $setupStatus;
            }
        }
        
        function mv_ischar($character){
            return ($character != '')?_mv_checkUtf8Char($character):false;
        }
        
        //Function by javalc6@gmail.com - http://php.net/manual/en/function.mb-check-encoding.php
        //modified for 1 character to exclude loop for performance (profiled and offers improvement)
        function _mv_checkUtf8Char($str) {
            $len = strlen($str);
            $i = 0;
            $c = ord($str[0]);
            if ($c > 128) {
                if (($c <= 191)) return false;
                elseif ($c <= 223) $bytes = 2;
                elseif ($c <= 239) $bytes = 3;
                elseif ($c <= 247) $bytes = 4;
                else return false;
                if (($bytes) > $len) return false;
                while ($bytes > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    if ($b < 128 || $b > 191) return false;
                    $bytes--;
                }
            }
            return true;
        }
        
        function mv_strrev($string){
            $returnString = '';
            $StringWalker = new mv_StringWalker($string);
            while(($readChar = $StringWalker->goNext()) !== false){
                $returnString = $readChar.$returnString;
            }
            return $returnString;
        }
        
        function _mv_trim_counter(&$stringWalker, $directionFunction, &$characterArray){
            $distance = 0;
            while(($char = $stringWalker->$directionFunction()) !== false && in_array($char, $characterArray)){
                $distance++;
            }
            return $distance;
        }
        function mv_trim($string, $characters = " \t\n\r\0\x0B"){
            $characterArray = mv_getCharacterArray($characters);
            $returnString = '';
            $stringWalker = new mv_StringWalker($string);
            
            $start = _mv_trim_counter($stringWalker, 'goNext', $characterArray);
            
            $stringWalker->setToEnd();
            $length = $stringWalker->getCharIndex();
            if($start == $length){return '';}
 
            $end = _mv_trim_counter($stringWalker, 'goPrev', $characterArray);
            
            return mv_substr($string, $start, $length-$end-$start);
        }
        
        function mv_getChar($string, $i){
            if($i < 0){return false;}
            $chr = false; $length = mv_strlen($string);
            $stringWalker = new mv_StringWalker($string);
            return $stringWalker->locate($i);
        }
    }else{
        function mv_strlen($string){
            return strlen($string);
        }
        
        function mv_strpos($haystack, $needle, $offset = 0){
            return strpos($haystack, $needle, $offset);
        }
        
        //PHP 4/5 compatability
        function mv_stripos($haystack, $needle, $offset = 0){
            return strpos(mv_strtolower($haystack), mv_strtolower($needle), $offset);
        }
        
        function mv_substr($string, $start, $length = '!'){
            if($length === '!'){
                return substr($string, $start);
            }else{
                return substr($string, $start, $length);
            }
        }
        
        function mv_ischar($character){
            return ($character != '');
        }
        
        function mv_strrev($string){
            return strrev($string);
        }
        
        function mv_trim($string, $characters = " \t\n\r\0\x0B"){
            return trim($string, $characters);
        }
        
        function mv_getChar($string, $i){
            return ($string[$i] == '')?false:$string[$i];
        }
    }
    
    function mv_setDatabaseCharset($DBConnection = null){
        $SQL = "SET character_set_results = 'utf8', character_set_client = 'utf8',
                character_set_connection = 'utf8', character_set_database = 'utf8',
                character_set_server = 'utf8', names = 'utf8'";
        if($DBConnection == null){
            mysql_query($SQL);
            if (function_exists('mysql_set_charset') !== false) {
                mysql_set_charset('utf8');
            }
        }else{
            mysql_query($SQL, $DBConnection);
            if (function_exists('mysql_set_charset') !== false) {
                mysql_set_charset('utf8', $DBConnection);
            }
        }
    }
    
    //Function by leha_grobov - http://php.net/manual/en/function.strtolower.php
    function _mv_convertcase($string, $tolower){
        static $uppercase = array(
            "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U",
            "V", "W", "X", "Y", "Z", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï",
            "Ð", "Ñ", "Ò", "Ó", "Ô", "Õ", "Ö", "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?"
        );
        static $lowercase = array(
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
            "v", "w", "x", "y", "z", "à", "á", "â", "ã", "ä", "å", "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï",
            "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "ø", "ù", "ú", "û", "ü", "ý", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?"
        );
 
        if($tolower == true){
            return str_replace($uppercase, $lowercase, $string);
        }else{
            return str_replace($lowercase, $uppercase, $string);
        }
    }
 
    //it's useful to use str_replace as it is UTF-8 safe and provides better support at a small performance hit.
    function mv_strtolower($string){
        return _mv_convertcase($string, true);
    }
    
    function mv_strtoupper($string){
        return _mv_convertcase($string, false);
    }
    
    //PHP 4/5 compatability
    function mv_strstr_common($haystack, $needle, $part = false, $insensitive = false){
        $pos = ($insensitive)?
                mv_strpos(mv_strtolower($haystack), mv_strtolower($needle), 0, $encoding):
                $pos = mv_strpos($haystack, $needle, 0);
        if($pos !== false){
            if($part){
                return mv_substr($haystack, 0, $pos);
            }else{
                return mv_substr($haystack, $pos);
            }
        }
        return false;
    }
    
    function mv_strstr($haystack, $needle, $part = false){
        return mv_strstr_common($haystack, $needle, $part, false);
    }
    
    function mv_stristr($haystack, $needle, $part = false){
        return mv_strstr_common($haystack, $needle, $part, true);
    }
    
    //This is a bit more useful than the strrpos functions (which are inconsistant in php4 and 5 and the offset behavior is weird)
    function mv_backstrpos($haystack, $needle, $offset = 0){
        $length = mv_strlen($haystack);
        $offset = ($offset > 0)?($length - $offset):abs($offset);
        $pos = mv_strpos(mv_strrev($haystack), mv_strrev($needle), $offset);
        return ($pos === false)?false:( $length - $pos - mv_strlen($needle) );
    }
 
    function mv_backstripos($haystack, $needle, $offset = 0){
        return mv_backstrpos(mv_strtolower($haystack), mv_strtolower($needle), $offset);
    }
    
    function mv_htmlentities($string, $quoteStyle = ENT_COMPAT){
        return htmlentities($string, $quoteStyle, 'UTF-8');
    }
 
    function mv_isWhitespace($char){
        return $char === ' ' || $char === "\t" || $char === "\n" || $char === "\r" || $char === "\0" || $char === "\x0B";
    }
 
    function mv_getWhitespaceArray(){
        static $whiteSpaceArray = array(' ', "\t", "\n", "\r", "\0", "\x0B");
        return $whiteSpaceArray;
    }
 
    //This is fine for small strings, avoid using it too often for memory use reasons.
    //"Hello" would return array('H', 'e', 'l', 'l', 'o') and is UTF-8 friendly
    function mv_getCharacterArray($string){
        $StringWalker = new mv_StringWalker($string);
        $charArray = array();
        while($StringWalker->goNext() !== false){
            $charArray[] = $StringWalker->getLastRead();
        }
        return $charArray;
    }
}
?>
 
Now, I have a second part to this which provides a consistent interface for iterating over strings. Both files need to be included to work.

The idea behind this is that if you use these functions you can basically freely toggle full UTF8 support on or off. So you can get the rough performance range (plus some overhead) of single byte characters on local applications, but if you release something for an international market you can flip on UTF support (and take the performance hit that comes with that) without having to completely re-visit all of your foundations. This is actually a difficult problem to solve if you have not built your program from the ground up considering these problems.

stringWalker.php

Code: Select all

 
<?PHP
/*-------------------------------------------------*\
|   Developer: MichaelHamilton.com | mike@m2tm.net  |
|----------------Keep Header Intact-----------------|
\*-------------------------------------------------*/
if(!defined('_MV_CORE_STRINGWALKER_PHP_')){
    define('_MV_CORE_STRINGWALKER_PHP_', true);
    
    //depending on if we have to handle UTF8 input or not, this can be optimized
    if(MV_USE_UTF8 === true){
        class mv_StringWalker{
            function mv_StringWalker($string = ''){
                $this->setString($string);
            }
        //PUBLIC=------------------------------------------------------
            function setString($string){
                $this->recent = false;
                $this->byteIndex = array(0);
                $this->charIndex = array(0);
                $this->string = $string;
                $this->length = strlen($string);
            }
 
            function setToEnd($iteratorId = 0){
                while($this->lookNext($iteratorId) !== false){
                    $this->goNext($iteratorId);
                }
            }
            
            function setToStart($iteratorId = 0){
                $this->charIndex[$iteratorId] = 0;
                $this->byteIndex[$iteratorId] = 0;
            }
 
            function locate($index, $iteratorId = 0){
                $this->charIndex[$iteratorId] = 0;
                $this->byteIndex[$iteratorId] = 0;
                while($this->charIndex[$iteratorId] < $index && $this->goNext($iteratorId) !== false){}
                return $this->lookNext($iteratorId);
            }
 
            function getCharIndex($iteratorId = 0){
                return $this->charIndex[$iteratorId];
            }
 
            function getByteIndex($iteratorId = 0){
                return $this->byteIndex[$iteratorId];
            }
 
            function lookNext($iteratorId = 0){
                return $this->_go(0, 1, $this->length, $iteratorId);
            }
            function goNext($iteratorId = 0){
                return $this->_go(1, 1, $this->length, $iteratorId);
            }
 
            function lookPrev($iteratorId = 0){
                return $this->_go(0, 0, 0, $iteratorId);
            }
            function goPrev($iteratorId = 0){
                return $this->_go(-1, 0, 0, $iteratorId);
            }
 
            function getLastRead(){
                return $this->recent;
            }
 
            function copyPosition($iteratorIdSource, $iteratorIdDestination){
                $this->byteIndex[$iteratorIdDestination] = $this->byteIndex[$iteratorIdSource];
                $this->charIndex[$iteratorIdDestination] = $this->charIndex[$iteratorIdSource];
            }
            
            function inject($injectString, $iteratorId = 0, $revalidateIterators = true){
                $tmpLen = false;
                if($revalidateIterators){
                    $tmpLen = $this->_offsetInvalidatedIteratorsAfterInject($injectString, $this->charIndex[$iteratorId]);
                    if($tmpLen !== false){
                        $this->length+=$tmpLen;
                    }
                }
                if($tmpLen === false){
                    $this->length+= ($injectLength = strlen($injectString));
                }
                $this->string = substr($this->string, 0, $this->byteIndex[$iteratorId]).
                                $injectString.
                                substr($this->string, $this->byteIndex[$iteratorId]);
            }
            
            function append($appendString){
                $this->length+=strlen($appendString);
                $this->string.=$appendString;
            }
 
        //PRIVATE=-----------------------------------------------------
            function _offsetInvalidatedIteratorsAfterInject($injectString, $maxCharIndex){
                foreach($this->charIndex as $value){
                    if($value > $maxCharIndex){
                        $lengthCheck = new mv_StringWalker($injectString);
                        $lengthCheck->setToEnd();
                        $this->_offsetGreaterIterators($maxCharIndex, $lengthCheck->getCharIndex(), $lengthCheck->getByteIndex());
                        return $lengthCheck->getByteIndex();
                        break;
                    }
                }
                return false;
            }
            
            function _offsetGreaterIterators($maxCharIndex, $charCount, $byteCount){
                foreach($this->charIndex as $key => $value){
                    if($value > $maxCharIndex){
                        $this->charIndex[$key]+=$charCount;
                        $this->byteIndex[$key]+=$byteCount;
                    }
                }
            }
            
            function _go($directionVector, $compareOption, $endValue, $iteratorId){
                $tmpchr = $this->_simulateMove($byteDiff, $iteratorId, $compareOption, $endValue);
                if($byteDiff > 0){
                    $this->charIndex[$iteratorId]+=(1*$directionVector);
                    $this->byteIndex[$iteratorId]+=($byteDiff*$directionVector);
                    return $tmpchr;
                }else{
                    return false;
                }
            }
 
            function _simulateMove(&$byteDiff, $iteratorId, $compareOption, $endValue){
                $tmpchr = '';
                $byteDiff = 0;
                $tmpByteIndex = $this->byteIndex[$iteratorId];
                if($compareOption){
                    for(;$tmpByteIndex < $endValue && !mv_ischar($tmpchr);$tmpByteIndex++){
                        $tmpchr.=$this->string[$tmpByteIndex];
                        $byteDiff++;
                    }
                }else{
                    for(;$tmpByteIndex >= $endValue && !mv_ischar($tmpchr);$tmpByteIndex--){
                        $tmpchr=$this->string[$tmpByteIndex].$tmpchr;
                        $byteDiff++;
                    }             
                }
                $this->recent = $tmpchr;
                return $tmpchr;
            }
 
            var $recent;
            var $charIndex;
            var $byteIndex;
            var $string, $length;
        }
    }else{
        //single byte character optimizations
        class mv_StringWalker{
            function mv_StringWalker($string = ''){
                $this->setString($string);
            }
        //PUBLIC=------------------------------------------------------
            function setString($string){
                $this->recent = false;
                $this->charIndex = array(0);
                $this->string = $string;
                $this->length = strlen($string);
            }
 
            function setToEnd($iteratorId = 0){
                $this->charIndex[$iteratorId] = $this->length-1;
                $this->recent = $this->string[$this->length-1];
            }
            
            function setToStart($iteratorId = 0){
                $this->charIndex[$iteratorId] = 0;
            }
 
            function locate($index, $iteratorId = 0){
                if($index > $this->length){
                    $this->charIndex[$iteratorId] = 0;
                    return false;
                }
                
                if($index < 0){$index = 0;}
                $this->charIndex[$iteratorId] = $index;
                return $this->string[$index];
            }
 
            function getCharIndex($iteratorId = 0){
                return $this->charIndex[$iteratorId];
            }
 
            //provide a consistant interface
            function getByteIndex($iteratorId = 0){
                return $this->charIndex[$iteratorId];
            }
 
            function lookNext($iteratorId = 0){
                if(($this->charIndex[$iteratorId]+1) <= $this->length){
                    $this->recent = $this->string[$this->charIndex[$iteratorId]];
                    return $this->recent;
                }else{
                    return false;
                }
            }
            function goNext($iteratorId = 0){
                if(($this->charIndex[$iteratorId]+1) <= $this->length){
                    $this->recent = $this->string[$this->charIndex[$iteratorId]];
                    $this->charIndex[$iteratorId]++;
                    return $this->recent;
                }else{
                    return false;
                }
            }
 
            function lookPrev($iteratorId = 0){
                if(($this->charIndex[$iteratorId]-1) >= $this->length){
                    $this->recent = $this->string[$this->charIndex[$iteratorId]];
                    return $this->recent;
                }else{
                    return false;
                }
            }
            function goPrev($iteratorId = 0){
                if(($this->charIndex[$iteratorId]-1) >= $this->length){
                    $this->recent = $this->string[$this->charIndex[$iteratorId]];
                    $this->charIndex[$iteratorId]--;
                    return $this->recent;
                }else{
                    return false;
                }
            }
 
            function getLastRead(){
                return $this->recent;
            }
 
            function copyPosition($iteratorIdSource, $iteratorIdDestination){
                $this->charIndex[$iteratorIdDestination] = $this->charIndex[$iteratorIdSource];
            }
            
            function inject($injectString, $iteratorId = 0, $revalidateIterators = true){
                $tmpLen = false;
                if($revalidateIterators){
                    $tmpLen = $this->_offsetInvalidatedIteratorsAfterInject($injectString, $this->charIndex[$iteratorId]);
                    if($tmpLen !== false){
                        $this->length+=$tmpLen;
                    }
                }
                if($tmpLen === false){
                    $this->length+= ($injectLength = strlen($injectString));
                }
                $this->string = substr($this->string, 0, $this->byteIndex[$iteratorId]).
                                $injectString.
                                substr($this->string, $this->byteIndex[$iteratorId]);
            }
            
            function append($appendString){
                $this->length+=strlen($appendString);
                $this->string.=$appendString;
            }
 
        //PRIVATE=-----------------------------------------------------
            function _offsetInvalidatedIteratorsAfterInject($injectString, $maxCharIndex){
                foreach($this->charIndex as $value){
                    if($value > $maxCharIndex){
                        $lengthCheck = new mv_StringWalker($injectString);
                        $lengthCheck->setToEnd();
                        $this->_offsetGreaterIterators($maxCharIndex, $lengthCheck->getCharIndex());
                        return $lengthCheck->getByteIndex();
                        break;
                    }
                }
                return false;
            }
            
            function _offsetGreaterIterators($maxCharIndex, $charCount){
                foreach($this->charIndex as $key => $value){
                    if($value > $maxCharIndex){
                        $this->charIndex[$key]+=$charCount;
                    }
                }
            }
 
            var $recent;
            var $charIndex;
            var $byteIndex;
            var $string, $length;
        }
    }
}
?>
 
using the mv_StringWalker guarantees a character by character iteration over a string instead of a byte by byte iteration (at least when MV_USE_UTF8 is true). It provides a consistent interface for iterating over and arbitrary manipulation of strings (you can see examples of it in use in the multibyteString.php file.)

Comments, critique, suggestions welcome.

Re: UTF-8 String Walking/Functions

Posted: Fri Mar 19, 2010 1:00 pm
by Christopher
A lot of code to comment on ... a couple of questions:

- The class needs the functions included, but only uses two functions. Could you implement that code in the class to remove the dependency? Or combine the files?

- Should this follow the Iterator interface instead of your lookNext(), lookPrev() naming?

- Is there a cleaner, single class implementation possible rather than the big if()'s ?

- Is there a way to auto-detect which library is needed? Likewise auto-detect if mb_string is available (function_exists()) ?

Re: UTF-8 String Walking/Functions

Posted: Fri Mar 19, 2010 1:57 pm
by Weirdan
This assumes you're working with NFC-normalized strings, right? Otherwise using something like http://us2.php.net/manual/en/ref.intl.grapheme.php seems more appropriate to me.

Re: UTF-8 String Walking/Functions

Posted: Fri Mar 19, 2010 3:24 pm
by M2tM
- The class needs the functions included, but only uses two functions. Could you implement that code in the class to remove the dependency? Or combine the files?
Great question!

I could do this, organizationally I prefer the two in different files as they tackle different things. In my own project I have something like this:
Project/Strings/package.php
Project/Strings/multibyteString.php
Project/Strings/stringWalker.php

Where package includes all the required files for "Strings". People are free to include things as they like and I certainly wouldn't stop anyone from merging the files. I prefer to stay away from duplicated code which is why I'm not really keen on implementing those functions again even though it would remove the dependency. Keep in mind the mv_ischar implementation changes based on a define and then you have to drag the functions that it requires as well so you would still have a logical dependency, but now you'd also have duplicated code. If you were to plan on using the stringWalker class elsewhere it would be trivial to remove the dependency but I always use the two together.
- Should this follow the Iterator interface instead of you lookNext(), lookPrev() naming?
If you mean the C++ definition of iterators, I don't think this would be a good idea. The primary reason is efficiency, as soon as you start piling too much abstraction onto a class which is meant to execute functions on every character in a string you really can get a lot of overhead in simply looping over a string. I have a profiler (nuSphere's PhpED) and have noticed a difference as soon as I start adding even extra functions in hot-spot code. I had a basic accessor function that was called about 30000 when running my test suite and it saved about 5% speed by eliminating it in favor of passing one more argument to several functions.

If I were to port this to another language I would probably tackle many of these problems differently.
- Is there a cleaner, single class implementation possible rather than the big if()'s ?
There is! In fact, if you really wanted to you could remove the if and only use the UTF8 version. The reason I include the if switch is so that you get optimizations for everything. On non-utf8 strings however the second version is more efficient. On my unit test suite (which tests many other classes as well, but uses this one heavily) I notice when using non-utf8 characters and the utf8 stringWalker I get 3.3 second execution. When using the optimized version 2.7 seconds.

The big if statement may not be pretty, but the other option is to implement the problem in terms of polymorphism which adds another layer of abstraction and when iterating over large strings it can cause a noticeable and unnecessary performance hit.
- Is there a way to auto-detect which library is needed? Likewise auto-detect if mb_string is available (function_exists()) ?
When mbstring is not available this will still process UTF8 strings. My old method would auto-detect if mbstring is available and then offer UTF8, but if you don't have mbstring it wouldn't work at all. This actually implements everything and has no reliance on mbstring. If mbstring is installed it makes use of its html output and sets the buffer callback appropriately for ob_start, but you do not need the library. I know it defines 'MV_USING_MULTIBYTE' I think I'll remove that definition or replace it with 'MV_MULTIBYTE_PLUGIN_LOADED' or something more descriptive.

To be clear, the entire purpose of this code is to have no dependence on mbstring at all and to provide an in-PHP implementation of multibyte string functions which is both PHP version independent (4-5) and plugin independent.
This assumes you're working with NFC-normalized strings, right? Otherwise using something like http://us2.php.net/manual/en/ref.intl.grapheme.php seems more appropriate to me.
grapheme would be great if you can guarantee your distribution will have PHP5 and that plugin. This is primarily meant to be for situations where you may be distributing a library and want maximum coverage. Otherwise you could just use the mbstring extension or grapheme and either wrap them as I have here so that you can toggle UTF8 on or off or just call them where you want them.

I'm pretty sure the strings have to be valid UTF8 strings, but if you could check it out and explain any problems with it that would be cool. I did some digging and found another project's source includes UTF8 cleaning functions, obviously I can't just lift those, but if normalization is a concern you could always implement something like that for checking:
http://www.cybercosmonaut.de/xref3/nav. ... index.html (includes/utf8/utf_normalizer.php)

Re: UTF-8 String Walking/Functions

Posted: Fri Mar 19, 2010 4:21 pm
by Weirdan
M2tM wrote:
- Should this follow the Iterator interface instead of you lookNext(), lookPrev() naming?
If you mean the C++ definition of iterators,
No, the SPL's definition of Iterator:

Code: Select all

 
weirdan@virtual-debian:/home/sam/bugfix$ php --rc iterator
Interface [ <internal:Core> interface Iterator extends Traversable ] {
 
  - Constants [0] {
  }
 
  - Static properties [0] {
  }
 
  - Static methods [0] {
  }
 
  - Properties [0] {
  }
 
  - Methods [5] {
    Method [ <internal:Core> abstract public method current ] {
    }
 
    Method [ <internal:Core> abstract public method next ] {
    }
 
    Method [ <internal:Core> abstract public method key ] {
    }
 
    Method [ <internal:Core> abstract public method valid ] {
    }
 
    Method [ <internal:Core> abstract public method rewind ] {
    }
  }
}
 
Then, in php5 it would be possible to do

Code: Select all

 
foreach (new mv_stringWalker($string) as $char) {
  //do something to $char
}
 
M2tM wrote: The big if statement may not be pretty, but the other option is to implement the problem in terms of polymorphism which adds another layer of abstraction and when iterating over large strings it can cause a noticeable and unnecessary performance hit.
Well, you could split two versions in their respective files (like mv_strings.ascii.php and mv_strings.utf.php) and pull them into the mv_strings.php (or whatever you call it) via include. This would give someone an option to include required version directly to avoid performance hit of conditional function definition (bytecode caches generally have troubles caching conditional definitions).
M2tM wrote: I'm pretty sure the strings have to be valid UTF8 strings, but if you could check it out and explain any problems with it that would be cool.
Having differently normalized strings does not make them invalid utf, even having differently normalized characters inside the same string does not. Normalization is important when you have to compare binary representations (or sort strings, for that matter), because visually equivalent strings could differ in their byte representations. Consider these two utf8 sequences:

Code: Select all

 
$nfc = "\xc3\xa1";
$nfd = "\x61\xcc\x81";
 
They both represent the same character (if you ask any human): á . However the latter is in decomposed form (LATIN_SMALL_LETTER_A + COMBINING_ACUTE_ACCENT) while former is in composed form (LATIN_SMALL_LETTER_A_WITH_ACUTE)

Re: UTF-8 String Walking/Functions

Posted: Fri Mar 19, 2010 5:35 pm
by M2tM
Oh, cool, I'll take a look at that iterator support... That said, the focus is to keep code both PHP4 and PHP5 compatible and if the SPL iterator is not available in PHP4 then it misses one of my constraints. I will still take a look at it though as it is clearly useful if someone doesn't want to adhere to the PHP4 constraint but wants to make use of this code (which I totally respect).

Your suggestion to split the two definitions into separate files is a good one. I'll do that for sure.

I see what you mean by normalization now, I was a little confused about the term earlier. I do not currently have a normalization function, but I'll look into adding something to do that. Normalization is potentially a performance heavy operation and so I probably won't build it into each function, but will instead provide the function for people to use alongside these functions. It would make sense even for basic ascii character representation to normalize the line endings.

That is an excellent suggestion and I thank you for bringing it up!

Re: UTF-8 String Walking/Functions

Posted: Tue Mar 23, 2010 6:05 pm
by M2tM
I've run the profiler and done some tweaking and brought more organization to the previous mess of redeclarations. I found that though there was a small performance hit from implementing basic inheritance with the string class, it was more than offset by simply unrolling the _go and _simulate functions.

Plus, this way you can decide to use byte characters for most things and then specify which are Utf8 strings if you need to do a mix of operations. Before it was basically one or the other.

I've done a similar thing to the multibyteString.php functions.

multibyteString.php

Code: Select all

 
<?PHP
/*-------------------------------------------------*\
|   Developer: MichaelHamilton.com | mike@m2tm.net  |
|----------------Keep Header Intact-----------------|
\*-------------------------------------------------*/
if(!defined('_MV_CORE_MULTIBYTESTRING_PHP_')){
    define('_MV_CORE_MULTIBYTESTRING_PHP_', true);
 
    //functions which rely on the mb library existing or not:
    if(function_exists('mb_get_info')){
        define('MV_USING_MULTIBYTE', true);
        mb_internal_encoding('UTF-8');
        mb_http_output('UTF-8');
 
        //call this function in ob_start(mv_obStartCallback());
        function mv_obStartCallback(){
            return "mb_output_handler";
        }
    }else{ //Default to typical php implementations when no mbstring extension exists
        define('MV_USING_MULTIBYTE', false);
        ini_set('default_charset', 'UTF-8');
        
        //call this function in ob_start(mv_obStartCallback());
        function mv_obStartCallback(){
            return NULL;
        }
    }
    
    //PHP 4/5 compatability
    function mv_strstr_common($haystack, $needle, $part, $insensitive, $utf8){
        $pos = ($insensitive)?
                mv_strpos(mv_strtolower($haystack), mv_strtolower($needle), 0, $encoding):
                $pos = mv_strpos($haystack, $needle, 0);
        if($pos !== false){
            if($part){
                if($utf8){
                    return mv_substrUtf8($haystack, 0, $pos);
                }else{
                    return mv_substrByteChar($haystack, 0, $pos);
                }
            }else{
                if($utf8){
                    return mv_substrUtf8($haystack, $pos);
                }else{
                    return mv_substrByteChar($haystack, $pos);
                }
            }
        }
        return false;
    }
    
    include realpath(dirname(__FILE__).DIRECTORY_SEPARATOR.'multibyteStringByteCharFunctions.php');
    include realpath(dirname(__FILE__).DIRECTORY_SEPARATOR.'multibyteStringUtf8Functions.php');
    
    //User decided toggle to turn UTF on and take a performance hit or not.
    if(MV_USE_UTF8 == true){
        function mv_strlen($string){
            return mv_strlenUtf8($string);
        }
        
        function mv_strpos($haystack, $needle, $offset = 0){
            return mv_strposUtf8($haystack, $needle, $offset);
        }
        
        //PHP 4/5 compatability
        function mv_stripos($haystack, $needle, $offset = 0){
            return mv_striposUtf8($haystack, $needle, $offset);
        }
        
        function mv_substr($string, $start, $length = '!'){
            return mv_substrUtf8($string, $start, $length);
        }
        
        function mv_ischar($character){
            return mv_ischarUtf8($character);
        }
        
        function mv_strrev($string){
            return mv_strrevUtf8($string);
        }
        
        function mv_trim($string, $characters = " \t\n\r\0\x0B"){
            return mv_trimUtf8($string, $characters);
        }
        
        function mv_getChar($string, $i){
            return mv_getCharUtf8($string, $i);
        }
        
        function mv_strstr($haystack, $needle, $part = false){
            return mv_strstr_common($haystack, $needle, $part, false, true);
        }
        
        function mv_stristr($haystack, $needle, $part = false){
            return mv_strstr_common($haystack, $needle, $part, true, true);
        }
        
        function mv_backstrpos($haystack, $needle, $offset = 0){
            return mv_backstrposUtf8($haystack, $needle, $offset);
        }
 
        function mv_backstripos($haystack, $needle, $offset = 0){
            return mv_backstriposUtf8($haystack, $needle, $offset);
        }
    }else{
        function mv_strlen($string){
            return strlen($string);
        }
        
        function mv_strpos($haystack, $needle, $offset = 0){
            return strpos($haystack, $needle, $offset);
        }
        
        //PHP 4/5 compatability
        function mv_stripos($haystack, $needle, $offset = 0){
            return strpos(mv_strtolower($haystack), mv_strtolower($needle), $offset);
        }
        
        function mv_substr($string, $start, $length = '!'){
            if($length === '!'){
                return substr($string, $start);
            }else{
                return substr($string, $start, $length);
            }
        }
        
        function mv_ischar($character){
            return ($character != '');
        }
        
        function mv_strrev($string){
            return strrev($string);
        }
        
        function mv_trim($string, $characters = " \t\n\r\0\x0B"){
            return trim($string, $characters);
        }
        
        function mv_getChar($string, $i){
            return ($string[$i] == '')?false:$string[$i];
        }
        
        function mv_strstr($haystack, $needle, $part = false){
            return mv_strstr_common($haystack, $needle, $part, false, true);
        }
        
        function mv_stristr($haystack, $needle, $part = false){
            return mv_strstr_common($haystack, $needle, $part, true, true);
        }
        
        function mv_backstrpos($haystack, $needle, $offset = 0){
            return mv_backstrposByteChar($haystack, $needle, $offset);
        }
 
        function mv_backstripos($haystack, $needle, $offset = 0){
            return mv_backstriposByteChar($haystack, $needle, $offset);
        }
    }
    
    function mv_setDatabaseCharset($DBConnection = null){
        $SQL = "SET character_set_results = 'utf8', character_set_client = 'utf8',
                character_set_connection = 'utf8', character_set_database = 'utf8',
                character_set_server = 'utf8', names = 'utf8'";
        if($DBConnection == null){
            mysql_query($SQL);
            if (function_exists('mysql_set_charset') !== false) {
                mysql_set_charset('utf8');
            }
        }else{
            mysql_query($SQL, $DBConnection);
            if (function_exists('mysql_set_charset') !== false) {
                mysql_set_charset('utf8', $DBConnection);
            }
        }
    }
    
    //Function by leha_grobov - http://php.net/manual/en/function.strtolower.php
    // Converts the case of $string with str_replace() over two parallel lookup
    // tables: entry N of $uppercase corresponds to entry N of $lowercase.
    //   $tolower - when it compares equal to true, upper-case entries are replaced
    //              by their lower-case partners; otherwise the reverse.
    // Characters that are not listed in the tables are left untouched.
    // NOTE(review): the "?" entries look like characters lost in an encoding
    // conversion of this listing - confirm against the original source. As
    // written they only replace "?" with "?".
    function _mv_convertcase($string, $tolower){
        static $uppercase = array(
            "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U",
            "V", "W", "X", "Y", "Z", "À", "Á", "Â", "Ã", "Ä", "Å", "Æ", "Ç", "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï",
            "Ð", "Ñ", "Ò", "Ó", "Ô", "Õ", "Ö", "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?"
        );
        static $lowercase = array(
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
            "v", "w", "x", "y", "z", "à", "á", "â", "ã", "ä", "å", "æ", "ç", "è", "é", "ê", "ë", "ì", "í", "î", "ï",
            "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "ø", "ù", "ú", "û", "ü", "ý", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?",
            "?", "?", "?", "?"
        );
 
        // Pick the replacement direction from $tolower.
        if($tolower == true){
            return str_replace($uppercase, $lowercase, $string);
        }else{
            return str_replace($lowercase, $uppercase, $string);
        }
    }
 
    //it's useful to use str_replace as it is UTF-8 safe and provides better support at a small performance hit.
    function mv_strtolower($string){
        return _mv_convertcase($string, true);
    }
    
    function mv_strtoupper($string){
        return _mv_convertcase($string, false);
    }
    
    function mv_htmlentities($string, $quoteStyle = ENT_COMPAT){
        return htmlentities($string, $quoteStyle, 'UTF-8');
    }
 
    function mv_isWhitespace($char){
        return $char === ' ' || $char === "\t" || $char === "\n" || $char === "\r" || $char === "\0" || $char === "\x0B";
    }
 
    function mv_getWhitespaceArray(){
        static $whiteSpaceArray = array(' ', "\t", "\n", "\r", "\0", "\x0B");
        return $whiteSpaceArray;
    }
 
    //This is fine for small strings, avoid using it too often for memory use reasons.
    //"Hello" would return array('H', 'e', 'l', 'l', 'o') and is UTF-8 friendly
    function mv_getCharacterArray($string, $utf8 = MV_USE_UTF8){
        if($utf8){
            $StringWalker = new mv_StringWalkerUtf8($string);
        }else{
            $StringWalker = new mv_StringWalkerByteChar($string);
        }
        $charArray = array();
        while($StringWalker->goNext() !== false){
            $charArray[] = $StringWalker->getLastRead();
        }
        return $charArray;
    }
}
?>
 
multibyteStringByteCharFunctions.php

Code: Select all

 
<?PHP
// Byte-per-character implementations of the mv_* string functions. Each one is
// a thin wrapper over the native PHP function (or over the shared helpers in
// multibyteString.php) so callers can pick byte semantics explicitly.
if(!defined('_MV_CORE_MULTIBYTESTRING_BYTECHARFUNCTIONS_PHP_')){
    define('_MV_CORE_MULTIBYTESTRING_BYTECHARFUNCTIONS_PHP_', true);

    // Length of $string in bytes.
    function mv_strlenByteChar($string){
        return strlen($string);
    }
    
    // Byte position of the first $needle at or after $offset, or false.
    function mv_strposByteChar($haystack, $needle, $offset = 0){
        return strpos($haystack, $needle, $offset);
    }
    
    //PHP 4/5 compatibility: case-insensitive search done on lower-cased copies.
    function mv_striposByteChar($haystack, $needle, $offset = 0){
        return strpos(mv_strtolower($haystack), mv_strtolower($needle), $offset);
    }
    
    // substr() wrapper; '!' marks "no length given".
    function mv_substrByteChar($string, $start, $length = '!'){
        if($length === '!'){
            return substr($string, $start);
        }else{
            return substr($string, $start, $length);
        }
    }
    
    // In byte mode anything that does not compare equal to '' counts as a character.
    function mv_ischarByteChar($character){
        return ($character != '');
    }
    
    function mv_strrevByteChar($string){
        return strrev($string);
    }
    
    function mv_trimByteChar($string, $characters = " \t\n\r\0\x0B"){
        return trim($string, $characters);
    }
    
    // Returns the byte at offset $i, or false when there is none.
    // isset() is used so an out-of-range offset is never actually read.
    function mv_getCharByteChar($string, $i){
        return isset($string[$i])?$string[$i]:false;
    }
    
    function mv_strstrByteChar($haystack, $needle, $part = false){
        return mv_strstr_common($haystack, $needle, $part, false, false);
    }
    
    function mv_stristrByteChar($haystack, $needle, $part = false){
        return mv_strstr_common($haystack, $needle, $part, true, false);
    }
    
    // Backwards search: both strings are reversed, searched forwards, and the
    // hit is converted back to a position in the original $haystack.
    // A positive $offset becomes ($length - $offset); otherwise abs($offset) is used.
    function mv_backstrposByteChar($haystack, $needle, $offset = 0){
        $length = mv_strlenByteChar($haystack);
        $offset = ($offset > 0)?($length - $offset):abs($offset);
        $pos = mv_strposByteChar(mv_strrevByteChar($haystack), mv_strrevByteChar($needle), $offset);
        return ($pos === false)?false:( $length - $pos - mv_strlenByteChar($needle) );
    }
 
    // Case-insensitive variant of mv_backstrposByteChar.
    function mv_backstriposByteChar($haystack, $needle, $offset = 0){
        return mv_backstrposByteChar(mv_strtolower($haystack), mv_strtolower($needle), $offset);
    }
}
?>
 
multibyteStringUtf8Functions.php

Code: Select all

 
<?PHP
if(!defined('_MV_CORE_MULTIBYTESTRING_UTF8FUNCTIONS_PHP_')){
    define('_MV_CORE_MULTIBYTESTRING_UTF8FUNCTIONS_PHP_', true);
    function mv_strlenUtf8($string){
        $stringLength = new mv_StringWalkerUtf8($string);
        $stringLength->setToEnd();
        return $stringLength->getCharIndex();
    }
    
    function mv_strposUtf8($haystack, $needle, $offset = 0){
        if($needle == '' || $haystack == ''){return false;}
        $haystackWalker = new mv_StringWalkerUtf8($haystack);
        $needleWalker = new mv_StringWalkerUtf8($needle);
        $position = 0;
        while(($character = $haystackWalker->goNext()) !== false){
            if(($position >= $offset) && ($needleWalker->goNext() === $character)){
                if($needleWalker->lookNext() === false){
                    return $position;
                }
            }else{
                $position = $haystackWalker->getCharIndex();
                $needleWalker->setToStart();
            }
        }
        return false;
    }
    
    function mv_striposUtf8($haystack, $needle, $offset = 0){
        return mv_strposUtf8(mv_strtolower($haystack), mv_strtolower($needle), $offset);
    }
    
    // Normalises the arguments of mv_substrUtf8() in place.
    //   $string       - string being sliced
    //   $start        - by reference; a negative start is rewritten as an offset from the beginning
    //   $length       - by reference; the marker '!' is replaced by the character length of $string
    //   $stringLength - by reference, pure output; character length of $string, reduced by
    //                   |$length| when $length is negative
    // Returns true when slicing should go ahead. Any other return value is the
    // final result: '' for a zero length or a start past the shortened end,
    // false when |$start| exceeds the string length.
    function _mv_substrSetupValuesUtf8($string, &$start, &$length, &$stringLength /*pure output*/){
        $stringLength = mv_strlenUtf8($string);
        if($length === '!'){$length = $stringLength;} // no length given: take everything
        if($length == 0){return '';}
        if(abs($start) > $stringLength){return false;}
        if($start < 0){
            // negative start counts back from the end
            $start = $stringLength + $start;
        }
        if($length < 0){
            // negative length: stop that many characters before the end
            $stringLength+=$length;
            if($start > $stringLength){return '';}
        }
        return true;
    }
    
    function mv_substrUtf8($string, $start, $length = '!'){
        if(($setupStatus = _mv_substrSetupValuesUtf8($string, $start, $length, $stringLength)) === true){
            $stringWalker = new mv_StringWalkerUtf8($string);
            $stringWalker->locate($start);
            $returnString = ''; $characterCount = 0;
            while(($character = $stringWalker->goNext()) !== false){
                if((($characterCount+$start) >= $stringLength) || ($length > 0 && $characterCount >= $length)){ //exit early conditions
                    break;
                }
                $returnString.=$character;
                $characterCount++;
            }
            return $returnString;
        }else{
            return $setupStatus;
        }
    }
    
    function mv_ischarUtf8($character){
        return ($character != '')?_mv_checkUtf8Char($character):false;
    }
    
    //Function by javalc6@gmail.com - http://php.net/manual/en/function.mb-check-encoding.php
    //modified for 1 character to exclude loop for performance (profiled and offers improvement)
    function _mv_checkUtf8Char($str) {
        $len = strlen($str);
        $i = 0;
        $c = ord($str[0]);
        if ($c > 128) {
            if (($c <= 191)) return false;
            elseif ($c <= 223) $bytes = 2;
            elseif ($c <= 239) $bytes = 3;
            elseif ($c <= 247) $bytes = 4;
            else return false;
            if (($bytes) > $len) return false;
            while ($bytes > 1) {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) return false;
                $bytes--;
            }
        }
        return true;
    }
    
    function mv_strrevUtf8($string){
        $returnString = '';
        $StringWalker = new mv_StringWalkerUtf8($string);
        while(($readChar = $StringWalker->goNext()) !== false){
            $returnString = $readChar.$returnString;
        }
        return $returnString;
    }
    
    function _mv_trim_counterUtf8(&$stringWalker, $directionFunction, &$characterArray){
        $distance = 0;
        while(($char = $stringWalker->$directionFunction()) !== false && in_array($char, $characterArray)){
            $distance++;
        }
        return $distance;
    }
    function mv_trimUtf8($string, $characters = " \t\n\r\0\x0B"){
        $characterArray = mv_getCharacterArray($characters);
        $returnString = '';
        $stringWalker = new mv_StringWalkerUtf8($string);
        
        $start = _mv_trim_counterUtf8($stringWalker, 'goNext', $characterArray);
        
        $stringWalker->setToEnd();
        $length = $stringWalker->getCharIndex();
        if($start == $length){return '';}
 
        $end = _mv_trim_counter($stringWalker, 'goPrev', $characterArray);
        
        return mv_substr($string, $start, $length-$end-$start);
    }
    
    function mv_getCharUtf8($string, $i){
        if($i < 0){return false;}
        $stringWalker = new mv_StringWalkerUtf8($string);
        return $stringWalker->locate($i);
    }
    
    function mv_strstrUtf8($haystack, $needle, $part = false){
        return mv_strstr_common($haystack, $needle, $part, false, false);
    }
    
    function mv_stristrUtf8($haystack, $needle, $part = false){
        return mv_strstr_common($haystack, $needle, $part, true, false);
    }
    
    function mv_backstrposUtf8($haystack, $needle, $offset = 0){
        $length = mv_strlenUtf8($haystack);
        $offset = ($offset > 0)?($length - $offset):abs($offset);
        $pos = mv_strposUtf8(mv_strrevUtf8($haystack), mv_strrevUtf8($needle), $offset);
        return ($pos === false)?false:( $length - $pos - mv_strlenUtf8($needle) );
    }
 
    function mv_backstriposUtf8($haystack, $needle, $offset = 0){
        return mv_backstrposUtf8(mv_strtolower($haystack), mv_strtolower($needle), $offset);
    }
}
?>
 
stringWalker.php

Code: Select all

 
<?PHP
/*-------------------------------------------------*\
|   Developer: MichaelHamilton.com | mike@m2tm.net  |
|----------------Keep Header Intact-----------------|
\*-------------------------------------------------*/
if(!defined('_MV_CORE_STRINGWALKER_PHP_')){
    define('_MV_CORE_STRINGWALKER_PHP_', true);
    
    
    class mv_StringWalkerBase {
        function mv_StringWalkerBase($string = ''){
            $this->setString($string);
        }
        
    //PUBLIC=------------------------------------------------------
        function setString($string){
            $this->recent = false;
            $this->byteIndex = array(0);
            $this->charIndex = array(0);
            $this->string = $string;
            $this->length = strlen($string);
        }
        
        function getCharIndex($iteratorId = 0){
            return $this->charIndex[$iteratorId];
        }
 
        function getByteIndex($iteratorId = 0){
            return $this->byteIndex[$iteratorId];
        }
        
        function setToStart($iteratorId = 0){
            $this->charIndex[$iteratorId] = 0;
            $this->byteIndex[$iteratorId] = 0;
        }
        
        function setToEnd($iteratorId = 0){
            //Overload
        }
 
        function locate($index, $iteratorId = 0){
            //Overload
        }
 
        function lookNext($iteratorId = 0){
            //Overload
        }
        function goNext($iteratorId = 0){
            //Overload
        }
 
        function lookPrev($iteratorId = 0){
            //Overload
        }
        function goPrev($iteratorId = 0){
            //Overload
        }
 
        function getLastRead(){
            return $this->recent;
        }
 
        function copyPosition($iteratorIdSource, $iteratorIdDestination){
            $this->byteIndex[$iteratorIdDestination] = $this->byteIndex[$iteratorIdSource];
            $this->charIndex[$iteratorIdDestination] = $this->charIndex[$iteratorIdSource];
        }
        
        function inject($injectString, $iteratorId = 0, $revalidateIterators = true){
            $tmpLen = false;
            if($revalidateIterators){
                $tmpLen = $this->_offsetInvalidatedIteratorsAfterInject($injectString, $this->charIndex[$iteratorId]);
                if($tmpLen !== false){
                    $this->length+=$tmpLen;
                }
            }
            if($tmpLen === false){
                $this->length+= ($injectLength = strlen($injectString));
            }
            $this->string = substr($this->string, 0, $this->byteIndex[$iteratorId]).
                            $injectString.
                            substr($this->string, $this->byteIndex[$iteratorId]);
        }
        
        function append($appendString){
            $this->length+=strlen($appendString);
            $this->string.=$appendString;
        }
 
    //PRIVATE=-----------------------------------------------------
        function _offsetInvalidatedIteratorsAfterInject($injectString, $maxCharIndex){
            foreach($this->charIndex as $value){
                if($value > $maxCharIndex){
                    $className = get_class($this);
                    $lengthCheck = new $className($injectString);
                    $lengthCheck->setToEnd();
                    $this->_offsetGreaterIterators($maxCharIndex, $lengthCheck->getCharIndex(), $lengthCheck->getByteIndex());
                    return $lengthCheck->getByteIndex();
                    break;
                }
            }
            return false;
        }
        
        function _offsetGreaterIterators($maxCharIndex, $charCount, $byteCount){
            foreach($this->charIndex as $key => $value){
                if($value > $maxCharIndex){
                    $this->charIndex[$key]+=$charCount;
                    $this->byteIndex[$key]+=$byteCount;
                }
            }
        }
 
        var $recent;
        var $charIndex;
        var $byteIndex;
        var $string, $length;
    }
    
    class mv_StringWalkerUtf8 extends mv_StringWalkerBase{
        function mv_StringWalkerUtf8($string = ''){
            parent::mv_StringWalkerBase($string);
        }
        
    //PUBLIC=----------------------------------------------------------------- 
        function setToEnd($iteratorId = 0){
            while($this->lookNext($iteratorId) !== false){
                $this->goNext($iteratorId);
            }
        }
        
        function locate($index, $iteratorId = 0){
            $this->charIndex[$iteratorId] = 0;
            $this->byteIndex[$iteratorId] = 0;
            while($this->charIndex[$iteratorId] < $index && $this->goNext($iteratorId) !== false){}
            return $this->lookNext($iteratorId);
        }
        
        function lookNext($iteratorId = 0){
            //return $this->_go(0, 1, $this->length, $iteratorId);
            $tmpchr = '';
            $byteDiff = 0;
            $tmpByteIndex = $this->byteIndex[$iteratorId];
            for(;$tmpByteIndex < $this->length && !mv_ischarUtf8($tmpchr);$tmpByteIndex++){
                $tmpchr.=$this->string[$tmpByteIndex];
                $byteDiff++;
            }
            $this->recent = $tmpchr;
            if($byteDiff > 0){
                return $tmpchr;
            }else{
                return false;
            }
        }
        function goNext($iteratorId = 0){
            $tmpchr = '';
            $byteDiff = 0;
            $tmpByteIndex = $this->byteIndex[$iteratorId];
            for(;$tmpByteIndex < $this->length && !mv_ischarUtf8($tmpchr);$tmpByteIndex++){
                $tmpchr.=$this->string[$tmpByteIndex];
                $byteDiff++;
            }
            $this->recent = $tmpchr;
            if($byteDiff > 0){
                $this->charIndex[$iteratorId]++;
                $this->byteIndex[$iteratorId]+=$byteDiff;
                return $tmpchr;
            }else{
                return false;
            }
        }
 
        function lookPrev($iteratorId = 0){
            $tmpchr = '';
            $byteDiff = 0;
            $tmpByteIndex = $this->byteIndex[$iteratorId];
            for(;$tmpByteIndex >= 0 && !mv_ischarUtf8($tmpchr);$tmpByteIndex--){
                $tmpchr.=$this->string[$tmpByteIndex];
                $byteDiff++;
            }
            $this->recent = $tmpchr;
            if($byteDiff > 0){
                return $tmpchr;
            }else{
                return false;
            }
        }
        function goPrev($iteratorId = 0){
            $tmpchr = '';
            $byteDiff = 0;
            $tmpByteIndex = $this->byteIndex[$iteratorId];
            for(;$tmpByteIndex >= 0 && !mv_ischarUtf8($tmpchr);$tmpByteIndex--){
                $tmpchr.=$this->string[$tmpByteIndex];
                $byteDiff++;
            }
            $this->recent = $tmpchr;
            if($byteDiff > 0){
                $this->charIndex[$iteratorId]--;
                $this->byteIndex[$iteratorId]-=$byteDiff;
                return $tmpchr;
            }else{
                return false;
            }
        }
    }
    
    //single byte character optimizations
    class mv_StringWalkerByteChar extends mv_StringWalkerBase{
        function mv_StringWalkerByteChar($string = ''){
            parent::mv_StringWalkerBase($string);
        }
        
    //PUBLIC=------------------------------------------------------
        function lookNext($iteratorId = 0){
            if(($this->charIndex[$iteratorId]+1) <= $this->length){
                $this->recent = $this->string[$this->charIndex[$iteratorId]];
                return $this->recent;
            }else{
                return false;
            }
        }
        function goNext($iteratorId = 0){
            if(($this->charIndex[$iteratorId]+1) <= $this->length){
                $this->recent = $this->string[$this->charIndex[$iteratorId]];
                $this->charIndex[$iteratorId]++;
                $this->byteIndex[$iteratorId]++;
                return $this->recent;
            }else{
                return false;
            }
        }
 
        function lookPrev($iteratorId = 0){
            if(($this->charIndex[$iteratorId]-1) >= $this->length){
                $this->recent = $this->string[$this->charIndex[$iteratorId]];
                return $this->recent;
            }else{
                return false;
            }
        }
        function goPrev($iteratorId = 0){
            if(($this->charIndex[$iteratorId]-1) >= $this->length){
                $this->recent = $this->string[$this->charIndex[$iteratorId]];
                $this->charIndex[$iteratorId]--;
                $this->byteIndex[$iteratorId]--;
                return $this->recent;
            }else{
                return false;
            }
        }
    }
    
    //setting the default "typedef" for mv_StringWalker
    if(MV_USE_UTF8 === true){
        class mv_StringWalker extends mv_StringWalkerUtf8{
            function mv_StringWalkerByteChar($string = ''){
                parent::mv_StringWalkerUtf8($string);
            }
        }
    }else{
        class mv_StringWalker extends mv_StringWalkerByteChar{
            function mv_StringWalkerByteChar($string = ''){
                parent::mv_StringWalkerUtf8($string);
            }
        }
    }
}
?>