Well, your problem is that you're treating each character as a single byte when it is actually a multibyte character.
So you may be outputting only half of a character.
This is a little overkill, but I include this class and many other helpful things in my
MutedTemplate code example I posted here on the forums a while ago... Nobody really looked at it in detail though:
multibyteString.php
Code: Select all
<?PHP
/*------------------------------------------------------------------*\
| Developer: Michael Hamilton | http://www.m2tm.net | maxmike@gmail.com |
\*------------------------------------------------------------------*/
if(!defined('_MV_CORE_MULTIBYTESTRING_PHP_')){
define('_MV_CORE_MULTIBYTESTRING_PHP_', true);
if(!function_exists('mb_trim')){
    //UTF-8 aware trim built on a PCRE character class.
    //  $string   the text to trim
    //  $charlist characters to strip, written as the inside of a regex character class;
    //            the default is the whitespace class
    //  $ltrim    strip from the start of the string when true
    //  $rtrim    strip from the end of the string when true
    //Returns the trimmed string (whatever preg_replace() yields for the built pattern).
    //Only defined when no mb_trim() exists yet.
    function mb_trim($string, $charlist='\\\\s', $ltrim=true, $rtrim=true){
        //With neither side selected there is nothing to strip; returning here also avoids
        //building a pattern from a variable that was never assigned.
        if(!$ltrim && !$rtrim){
            return $string;
        }
        //First escape the characters that are special inside a character class (^ - ] \),
        //then fold each run of four backslashes produced by that escaping back into one,
        //so that a doubled backslash in $charlist (as in the default) ends up as a single
        //regex escape.
        $char_class_inner = preg_replace(
            array( '/[\^\-\]\\\]/S', '/\\\{4}/S' ),
            array( '\\\\\\0', '\\' ),
            $charlist
        );
        $work_horse = '[' . $char_class_inner . ']+';
        $alternatives = array();
        if($ltrim){
            $alternatives[] = '^' . $work_horse;
        }
        if($rtrim){
            $alternatives[] = $work_horse . '$';
        }
        $pattern_middle = implode('|', $alternatives);
        return preg_replace("/$pattern_middle/usSD", '', $string);
    }
}
//Multibyte work-alike for strstr()/stristr(), re-implemented because of a difference
//(the $part parameter) between PHP 4 and PHP 5.3.
//  $haystack    string to search in
//  $needle      string to search for
//  $part        true: return the part of $haystack before the first match;
//               false: return the part from the first match to the end
//  $encoding    character encoding; mb_internal_encoding() is used when none is given
//  $insensitive true to compare case-insensitively (both strings are lower-cased first)
//Returns the selected part of $haystack, or false when $needle is not found.
function mb_strstr_common($haystack, $needle, $part = false, $encoding = NULL, $insensitive = false){
    if($encoding == NULL){
        $encoding = mb_internal_encoding();
    }
    if($insensitive){
        $pos = mb_strpos(mb_strtolower($haystack, $encoding), mb_strtolower($needle, $encoding), 0, $encoding);
    }else{
        $pos = mb_strpos($haystack, $needle, 0, $encoding);
    }
    if($pos === false){
        return false;
    }
    if($part){
        return mb_substr($haystack, 0, $pos, $encoding);
    }
    //mb_substr() takes the length as its third argument and the encoding as its fourth,
    //so an explicit "rest of the string" length has to be supplied here.
    return mb_substr($haystack, $pos, mb_strlen($haystack, $encoding) - $pos, $encoding);
}
//Byte-oriented counterpart of mb_strstr_common().
//Returns the part of $haystack before the first match when $part is true, the part from
//the first match onwards otherwise, or false when $needle does not occur.
//$insensitive selects stripos() instead of strpos() for the search.
function _mv_strstr_common($haystack, $needle, $part = false, $insensitive = false){
    if($insensitive){
        $foundAt = stripos($haystack, $needle, 0);
    }else{
        $foundAt = strpos($haystack, $needle, 0);
    }
    if($foundAt === false){
        return false;
    }
    return $part ? substr($haystack, 0, $foundAt) : substr($haystack, $foundAt);
}
//Asks the MySQL server to use utf8 for the connection character-set variables and, when
//mysql_set_charset() is available, also sets the client library's charset.
//  $DBConnection  link identifier to use; when null the calls are made without a link argument
//NOTE(review): relies on the legacy mysql_* functions - confirm the target PHP version still provides them.
function mv_setDatabaseCharset($DBConnection = null){
    $SQL = "SET character_set_results = 'utf8', character_set_client = 'utf8',
character_set_connection = 'utf8', character_set_database = 'utf8',
character_set_server = 'utf8', names = 'utf8'";
    $useDefaultLink = ($DBConnection == null);
    if($useDefaultLink){
        mysql_query($SQL);
    }else{
        mysql_query($SQL, $DBConnection);
    }
    if(function_exists('mysql_set_charset') === false){
        return;
    }
    if($useDefaultLink){
        mysql_set_charset('utf8');
    }else{
        mysql_set_charset('utf8', $DBConnection);
    }
}
//Multibyte UTF-8 functions:
//The existence of mb_get_info() is used as the test for the mbstring extension. When it
//exists, the mv_* wrappers defined in this branch delegate to the mb_* functions.
if(function_exists('mb_get_info')){
//Records which implementation was selected.
define('MV_USING_MULTIBYTE', true);
//Use UTF-8 both as mbstring's internal encoding and as its HTTP output encoding.
mb_internal_encoding('UTF-8');
mb_http_output('UTF-8');
//Returns the output-buffer callback name to hand to ob_start(), i.e.
//ob_start(mv_obStartCallback()); with mbstring this is "mb_output_handler".
function mv_obStartCallback(){
    $handlerName = "mb_output_handler";
    return $handlerName;
}
//Lower-cases $string with mbstring, treating it as UTF-8.
function mv_strtolower($string){
    $lowered = mb_strtolower($string, 'UTF-8');
    return $lowered;
}
//Upper-cases $string with mbstring, treating it as UTF-8.
function mv_strtoupper($string){
    $raised = mb_strtoupper($string, 'UTF-8');
    return $raised;
}
//Returns the number of UTF-8 characters (not bytes) in $string.
function mv_strlen($string){
    $characterCount = mb_strlen($string, 'UTF-8');
    return $characterCount;
}
//UTF-8 aware substr().
//  $string  source text
//  $start   character offset to start from
//  $length  number of characters to return; omit (or pass null) for "up to the end"
//Returns the selected characters. When a length is given and $start lies beyond the end
//of the string, false is returned.
function mv_substr($string, $start, $length = NULL){
    $totalLength = mb_strlen($string, 'UTF-8');
    //Compare against null strictly: an explicit length of 0 means "no characters" and
    //must not be mistaken for a missing length.
    if($length === NULL){
        $length = $totalLength;
    }elseif($start > $totalLength){
        return false;
    }
    return mb_substr($string, $start, $length, 'UTF-8');
}
//UTF-8 aware strpos(): character position of the first $needle at or after $offset,
//or false when it is absent. An empty $haystack yields false without calling mb_strpos().
function mv_strpos($haystack, $needle, $offset = 0){
    if($haystack != ''){
        return mb_strpos($haystack, $needle, $offset, 'UTF-8');
    }
    return false;
}
//Case-insensitive variant of mv_strpos(), implemented with mb_stripos() in UTF-8.
//An empty $haystack yields false without calling mb_stripos().
function mv_stripos($haystack, $needle, $offset = 0){
    if($haystack != ''){
        return mb_stripos($haystack, $needle, $offset, 'UTF-8');
    }
    return false;
}
//Reverses $string character by character (rather than byte by byte) by reading it
//through an mv_StringWalker and emitting the characters in the opposite order.
function mv_strrev($string){
    $walker = new mv_StringWalker($string);
    $collected = array();
    while(true){
        $current = $walker->goNext();
        if($current === false){
            break;
        }
        $collected[] = $current;
    }
    return implode('', array_reverse($collected));
}
//UTF-8 aware strstr().
//  $part  true: return the part of $haystack before the first $needle;
//         false: return the part starting at the first $needle
//Returns false when $needle is not found.
function mv_strstr($haystack, $needle, $part = false){
    //Forward $part so that the caller's choice is honoured.
    return mb_strstr_common($haystack, $needle, $part, 'UTF-8');
}
//UTF-8 aware, case-insensitive strstr().
//  $part  true: return the part of $haystack before the first $needle;
//         false: return the part starting at the first $needle
//Returns false when $needle is not found.
function mv_stristr($haystack, $needle, $part = false){
    //Forward $part so that the caller's choice is honoured.
    return mb_strstr_common($haystack, $needle, $part, 'UTF-8', true);
}
//Tells whether $character is a non-empty, validly encoded UTF-8 byte sequence.
//An empty value is never a character.
function mv_ischar($character){
    if($character == ''){
        return false;
    }
    return mb_check_encoding($character, 'UTF-8');
}
//UTF-8 aware trim(): strips every character listed in $characters from both ends of $string.
//Leading and trailing runs are counted with an mv_StringWalker and the middle part is
//then cut out with mv_substr().
function mv_trim($string, $characters = " \t\n\r\0\x0B"){
    $trimSet = mv_getCharacterArray($characters);
    $walker = new mv_StringWalker($string);
    $leading = _mv_trim_counter($walker, 'goNext', $trimSet);
    //Jump to the end; the character index there is the total character count.
    $walker->setToEnd();
    $total = $walker->getCharIndex();
    if($leading == $total){
        //The whole string consists of characters to strip.
        return '';
    }
    $trailing = _mv_trim_counter($walker, 'goPrev', $trimSet);
    return mv_substr($string, $leading, $total - $trailing - $leading);
}
//Counts how many consecutive characters, read by calling $directionFunction on
//$stringWalker, are members of $characterArray. Counting stops at the first character
//that is not a member, or when the walker method returns false.
function _mv_trim_counter(&$stringWalker, $directionFunction, $characterArray){
    $matched = 0;
    for(;;){
        $current = $stringWalker->$directionFunction();
        if($current === false || !in_array($current, $characterArray)){
            break;
        }
        $matched++;
    }
    return $matched;
}
//Returns the character at character index $i of $string, read through an mv_StringWalker.
//When the walker cannot be positioned at $i, an empty string is returned.
function mv_getChar($string, $i){
    $stringWalker = new mv_StringWalker($string);
    //Compare with false explicitly: locate() may hand back the last character it read,
    //and a "0" character must not be mistaken for a failure.
    if($stringWalker->locate($i) !== false){
        return $stringWalker->goNext();
    }
    return '';
}
//htmlentities() with the character set fixed to UTF-8; $quoteStyle is passed through unchanged.
function mv_htmlentities($string, $quoteStyle = ENT_COMPAT){
    $encoded = htmlentities($string, $quoteStyle, 'UTF-8');
    return $encoded;
}
}else{ //Default to the typical PHP implementations when no mbstring extension exists
//The wrappers in this branch call the plain string functions (strlen(), substr(), ...).
define('MV_USING_MULTIBYTE', false);
//Returns the output-buffer callback to hand to ob_start(), i.e.
//ob_start(mv_obStartCallback()); without mbstring there is no handler, so this is NULL.
function mv_obStartCallback(){
    $noHandler = NULL;
    return $noHandler;
}
//Fallback: lower-cases $string with strtolower().
function mv_strtolower($string){
    $lowered = strtolower($string);
    return $lowered;
}
//Fallback: upper-cases $string with strtoupper().
function mv_strtoupper($string){
    $raised = strtoupper($string);
    return $raised;
}
//Fallback: returns strlen($string), i.e. the length in bytes.
function mv_strlen($string){
    $byteCount = strlen($string);
    return $byteCount;
}
//Fallback substr() wrapper.
//  $string  source text
//  $start   byte offset to start from
//  $length  number of bytes to return; omit (or pass null) for "up to the end"
function mv_substr($string, $start, $length = NULL){
    //Compare against null strictly: an explicit length of 0 means "no bytes" and must
    //not be mistaken for a missing length.
    if($length === NULL){
        return substr($string, $start);
    }
    return substr($string, $start, $length);
}
//Fallback: position of the first $needle in $haystack at or after $offset, via strpos().
function mv_strpos($haystack, $needle, $offset = 0){
    $foundAt = strpos($haystack, $needle, $offset);
    return $foundAt;
}
//Fallback: case-insensitive position of $needle in $haystack at or after $offset, via stripos().
function mv_stripos($haystack, $needle, $offset = 0){
    $foundAt = stripos($haystack, $needle, $offset);
    return $foundAt;
}
//Fallback: reverses $string with strrev().
function mv_strrev($string){
    $reversed = strrev($string);
    return $reversed;
}
//Fallback strstr(): case-sensitive search delegated to _mv_strstr_common().
//$part selects the part before the match (true) or from the match onwards (false).
function mv_strstr($haystack, $needle, $part = false){
    $caseInsensitive = false;
    return _mv_strstr_common($haystack, $needle, $part, $caseInsensitive);
}
//Fallback stristr(): case-insensitive search delegated to _mv_strstr_common().
//$part selects the part before the match (true) or from the match onwards (false).
function mv_stristr($haystack, $needle, $part = false){
    $caseInsensitive = true;
    return _mv_strstr_common($haystack, $needle, $part, $caseInsensitive);
}
//Fallback: anything that does not compare equal to the empty string counts as a character.
function mv_ischar($character){
    return !($character == '');
}
//Fallback: strips the characters listed in $characters from both ends of $string with trim().
function mv_trim($string, $characters = " \t\n\r\0\x0B"){
    $trimmed = trim($string, $characters);
    return $trimmed;
}
//Fallback: returns the byte at offset $i of $string.
//An offset that does not exist in the string yields an empty string instead of an
//out-of-range string access.
function mv_getChar($string, $i){
    if(!isset($string[$i])){
        return '';
    }
    return $string[$i];
}
//Fallback: htmlentities() with PHP's default character set; $quoteStyle is passed through unchanged.
function mv_htmlentities($string, $quoteStyle = ENT_COMPAT){
    $encoded = htmlentities($string, $quoteStyle);
    return $encoded;
}
}
//This is a bit more useful than the strrpos functions (which are inconsistent between PHP 4 and 5, and whose offset behavior is weird).
//Searches backwards by looking for the reversed $needle in the reversed $haystack and
//converting the hit back into a position in the original string.
//  $offset  a positive value starts the reversed search at (length - $offset);
//           zero or a negative value starts it at abs($offset)
//Returns the position of the match in $haystack, or false when there is none.
function mv_backstrpos($haystack, $needle, $offset = 0){
    $haystackLength = mv_strlen($haystack);
    if($offset > 0){
        $reversedOffset = $haystackLength - $offset;
    }else{
        $reversedOffset = abs($offset);
    }
    $reversedPos = mv_strpos(mv_strrev($haystack), mv_strrev($needle), $reversedOffset);
    if($reversedPos === false){
        return false;
    }
    return $haystackLength - $reversedPos - mv_strlen($needle);
}
//Case-insensitive mv_backstrpos(): both strings are lower-cased before the search.
function mv_backstripos($haystack, $needle, $offset = 0){
    $foldedHaystack = mv_strtolower($haystack);
    $foldedNeedle = mv_strtolower($needle);
    return mv_backstrpos($foldedHaystack, $foldedNeedle, $offset);
}
//True when $char is exactly one of: space, tab, newline, carriage return, NUL or vertical tab.
function mv_isWhitespace($char){
    $whitespace = array(' ', "\t", "\n", "\r", "\0", "\x0B");
    return in_array($char, $whitespace, true);
}
//Returns the list of characters treated as whitespace:
//space, tab, newline, carriage return, NUL and vertical tab.
function mv_getWhitespaceArray(){
    return array(' ', "\t", "\n", "\r", "\0", "\x0B");
}
//Splits $string into an array of its characters using an mv_StringWalker, so
//"Hello" becomes array('H', 'e', 'l', 'l', 'o'); this is UTF-8 friendly.
//Fine for small strings, but avoid calling it too often because of the memory it uses.
function mv_getCharacterArray($string){
    $characters = array();
    $walker = new mv_StringWalker($string);
    for($current = $walker->goNext(); $current !== false; $current = $walker->goNext()){
        $characters[] = $current;
    }
    return $characters;
}
}
?>
stringWalker.php
Code: Select all
<?PHP
/*------------------------------------------------------------------*\
| Developer: Michael Hamilton | http://www.m2tm.net | maxmike@gmail.com |
\*------------------------------------------------------------------*/
if(!defined('_MV_CORE_STRINGWALKER_PHP_')){
define('_MV_CORE_STRINGWALKER_PHP_', true);
//Walks a string one character at a time, forwards or backwards, while tracking both the
//character index and the byte index of every iterator.
//Several independent iterators can exist on the same string; each is addressed by an
//$iteratorId. setString() only initialises iterator 0, so any other id has to be
//positioned first (copyPosition(), locate() or setToEnd()).
//Character boundaries are detected with mv_ischar() and character counts come from
//mv_strlen(); both functions are defined outside this class.
class mv_StringWalker{
    //Constructor: loads $string and resets iterator 0 to the start.
    function __construct($string = ''){
        $this->setString($string);
    }
    //Old-style constructor name. PHP 8 no longer treats a method named after the class
    //as the constructor, so __construct() above does the work; this method stays so that
    //existing explicit calls keep working.
    function mv_StringWalker($string = ''){
        $this->setString($string);
    }
    //PUBLIC=------------------------------------------------------
    //Replaces the walked string and resets the walker to a single iterator (id 0) at the start.
    function setString($string){
        $this->recent = false;
        $this->byteIndex = array(0);
        $this->charIndex = array(0);
        $this->string = $string;
        $this->length = strlen($string);
    }
    //Moves the iterator behind the last character of the string.
    function setToEnd($iteratorId = 0){
        $this->charIndex[$iteratorId] = mv_strlen($this->string);
        $this->byteIndex[$iteratorId] = $this->length;
    }
    //Positions the iterator at character index $index by walking forward from the start.
    //Returns true on success, or false when the string ends before $index is reached.
    function locate($index, $iteratorId = 0){
        $this->charIndex[$iteratorId] = 0;
        $this->byteIndex[$iteratorId] = 0;
        //Compare this iterator's own index; comparing the whole index array with a
        //number would never let the loop run.
        while($this->charIndex[$iteratorId] < $index){
            if($this->goNext($iteratorId) === false){
                return false;
            }
        }
        //A boolean is returned instead of the last character read, so that a "0"
        //character cannot look like a failure to the caller.
        return true;
    }
    //Character index of the iterator.
    function getCharIndex($iteratorId = 0){
        return $this->charIndex[$iteratorId];
    }
    //Byte index of the iterator.
    function getByteIndex($iteratorId = 0){
        return $this->byteIndex[$iteratorId];
    }
    //Returns the character after the iterator without moving it, or false at the end.
    function lookNext($iteratorId = 0){
        return $this->_go(0, 1, $this->length, $iteratorId);
    }
    //Returns the character after the iterator and moves past it, or false at the end.
    function goNext($iteratorId = 0){
        return $this->_go(1, 1, $this->length, $iteratorId);
    }
    //Returns the character before the iterator without moving it, or false at the start.
    function lookPrev($iteratorId = 0){
        return $this->_go(0, 0, 0, $iteratorId);
    }
    //Returns the character before the iterator and moves back over it, or false at the start.
    function goPrev($iteratorId = 0){
        return $this->_go(-1, 0, 0, $iteratorId);
    }
    //Returns what the most recent look*/go* call read ('' when that call read nothing,
    //false when nothing has been read since setString()).
    function getLastRead(){
        return $this->recent;
    }
    //Copies the position of one iterator to another (creating the destination if needed).
    function copyPosition($iteratorIdSource, $iteratorIdDestination){
        $this->byteIndex[$iteratorIdDestination] = $this->byteIndex[$iteratorIdSource];
        $this->charIndex[$iteratorIdDestination] = $this->charIndex[$iteratorIdSource];
    }
    //Inserts $injectString at the iterator's position. The iterator itself does not move.
    //When $revalidateIterators is true, iterators positioned after the insertion point
    //are shifted so that they still point at the same characters.
    function inject($injectString, $iteratorId = 0, $revalidateIterators = true){
        $insertAt = $this->byteIndex[$iteratorId];
        if($revalidateIterators){
            $this->_offsetInvalidatedIteratorsAfterInject($injectString, $this->charIndex[$iteratorId]);
        }
        $this->length += strlen($injectString);
        $this->string = substr($this->string, 0, $insertAt).
            $injectString.
            substr($this->string, $insertAt);
    }
    //Appends $appendString to the end of the walked string.
    function append($appendString){
        $this->length+=strlen($appendString);
        $this->string.=$appendString;
    }
    //PRIVATE=-----------------------------------------------------
    //If any iterator lies beyond character index $maxCharIndex, shifts all such iterators
    //by the size of $injectString and returns that size in bytes; otherwise returns false.
    function _offsetInvalidatedIteratorsAfterInject($injectString, $maxCharIndex){
        foreach($this->charIndex as $value){
            if($value > $maxCharIndex){
                $byteCount = strlen($injectString);
                $this->_offsetGreaterIterators($maxCharIndex, mv_strlen($injectString), $byteCount);
                return $byteCount;
            }
        }
        return false;
    }
    //Adds $charCount / $byteCount to every iterator whose character index is greater than $maxCharIndex.
    function _offsetGreaterIterators($maxCharIndex, $charCount, $byteCount){
        foreach($this->charIndex as $key => $value){
            if($value > $maxCharIndex){
                $this->charIndex[$key]+=$charCount;
                $this->byteIndex[$key]+=$byteCount;
            }
        }
    }
    //Reads one character and moves the iterator by $directionVector characters
    //(1 forward, -1 backward, 0 to only look). $compareOption selects the reading
    //direction (true forward, false backward) and $endValue the byte bound of the scan.
    //Returns the character read, or false when no byte could be read.
    function _go($directionVector, $compareOption, $endValue, $iteratorId){
        $tmpchr = $this->_simulateMove($byteDiff, $iteratorId, $compareOption, $endValue);
        if($byteDiff > 0){
            $this->charIndex[$iteratorId]+=(1*$directionVector);
            $this->byteIndex[$iteratorId]+=($byteDiff*$directionVector);
            return $tmpchr;
        }else{
            return false;
        }
    }
    //Collects bytes next to the iterator until mv_ischar() accepts them as a character
    //or the bound $endValue is reached. The number of bytes consumed is stored in
    //$byteDiff; the iterator itself is not moved.
    function _simulateMove(&$byteDiff, $iteratorId, $compareOption, $endValue){
        $tmpchr = '';
        $byteDiff = 0;
        $tmpByteIndex = $this->byteIndex[$iteratorId];
        if($compareOption){
            for(;$tmpByteIndex < $endValue && !mv_ischar($tmpchr);$tmpByteIndex++){
                $tmpchr.=$this->string[$tmpByteIndex];
                $byteDiff++;
            }
        }else{
            //The byte index marks the boundary in front of the next forward read, so the
            //previous character ends one byte earlier. Starting at the index itself would
            //read past the end of the string and count one byte too many.
            for($tmpByteIndex--;$tmpByteIndex >= $endValue && !mv_ischar($tmpchr);$tmpByteIndex--){
                $tmpchr=$this->string[$tmpByteIndex].$tmpchr;
                $byteDiff++;
            }
        }
        $this->recent = $tmpchr;
        return $tmpchr;
    }
    //Result of the most recent read.
    var $recent;
    //Character position of each iterator, keyed by iterator id.
    var $charIndex;
    //Byte position of each iterator, keyed by iterator id.
    var $byteIndex;
    //The walked string and its length in bytes.
    var $string, $length;
}
}
?>
Finally, if you use that, you can write something like:
Code: Select all
//Walk a four-letter Greek string and print each character with the iterator's indexes.
$string = new mv_StringWalker("ΑΒΓΔ");
//Compare with false explicitly so that a "0" character does not end the loop early.
while(($character = $string->goNext()) !== false){
    echo "ChrIndex(".$string->getCharIndex()."):ByteIndex(".$string->getByteIndex().") - ".$character."<br />";
}
This relies on the mbstring library. With this library you get output:
ChrIndex(1):ByteIndex(2) - Α
ChrIndex(2):ByteIndex(4) - Β
ChrIndex(3):ByteIndex(6) - Γ
ChrIndex(4):ByteIndex(8) - Δ
Without it, the output will be:
ChrIndex(1):ByteIndex(1) - Î
ChrIndex(2):ByteIndex(2) - ‘
ChrIndex(3):ByteIndex(3) - Î
ChrIndex(4):ByteIndex(4) - ’
ChrIndex(5):ByteIndex(5) - Î
ChrIndex(6):ByteIndex(6) - “
ChrIndex(7):ByteIndex(7) - Î
ChrIndex(8):ByteIndex(8) - ”
I should mention that the ChrIndex and ByteIndex values shown are both off by one, because they are read after the string iterator has advanced.