Page 1 of 1

Unicode comparison

Posted: Sun Jan 20, 2008 6:28 am
by novice4eva
Dear friends, I am having problems comparing Unicode values. I send the Unicode value as a GET parameter; in IE it shows up as "square brackets", while in Mozilla the address bar shows a percent-encoded value like "%E0%A4%9F". Internally I have stored the equivalent values in an array as numeric character references (for example "&#2332;", which displays as "ज"). I need to compare the value sent as the GET parameter with the ones I have stored in the array.

This is my code:

Code: Select all

 
/**
 * Prints a row of links, one per letter, plus a trailing "ALL" link.
 *
 * Each link points back at the current script with ?order_by=<letter>. The
 * link whose letter equals $order_by gets the CSS class "pagingSelected";
 * "ALL" is highlighted when $order_by is unset or blank.
 *
 * The letters are A-Z when returnLanguageDesc('ENGLISH','NEPALI') returns
 * 'ENGLISH'. Otherwise they are numeric character references for code points
 * 2325-2362 (Devanagari letters, with four code points skipped) followed by
 * two conjunct sequences.
 *
 * NOTE(review): $order_by is compared against the entity form ("&#2325;"),
 * not against raw UTF-8 bytes. A value taken straight from $_GET must be
 * converted to entities first (e.g. with utf8ToAscii()) — confirm the caller
 * does this.
 *
 * @param string|null $order_by currently selected letter, in the same form as
 *                              the generated letters
 * @return void output is echoed directly
 */
function alphabetical_order($order_by)
{
    if(returnLanguageDesc('ENGLISH','NEPALI')=='ENGLISH')
    {
        // The bounds must be quoted: bare A and Z are undefined constants,
        // which is a fatal error in PHP 8.
        $letters = range('A','Z');
    }
    else
    {
        $letters = array();
        for($i=2325;$i<=2362;$i++)
        {
            // Code points the original author chose to leave out of the index.
            if($i==2345 || $i==2353 || $i==2355 || $i==2356)
                continue;
            // Terminate the reference with ';' so it is well-formed and has
            // the same shape as the conjunct entries below.
            $letters[] = '&#'.$i.';';
        }
        $letters[] = '&#2340;&#2381;&#2352;';
        $letters[] = '&#2332;&#2381;&#2334;';
    }

    // PHP_SELF can contain user-controlled path info, so escape it before
    // placing it inside an HTML attribute.
    $self = htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8');

    foreach($letters as $value)
    {
        $sel=(isset($order_by) && $order_by==$value)?'pagingSelected':NULL;
        echo '<a href="'.$self.'?order_by='.$value.'"><span class="'.$sel.'">'.$value.'</span></a>'.'&nbsp;&nbsp;';
    }

    $sel=(!isset($order_by) || trim($order_by)=='')?'pagingSelected':NULL;
    echo '<a href="'.$self.'?order_by="><span class="'.$sel.'">ALL</span></a>'.'&nbsp;';
}
 
 
Thanks in advance..Cheers

Re: Unicode comparison

Posted: Sun Jan 20, 2008 12:31 pm
by Christopher
Take a look at the multi-byte string functions.

Re: Unicode comparison

Posted: Wed Jan 23, 2008 10:22 pm
by novice4eva
Thanks for pointing me in the right direction. I went through some of the functions, and the first thing I could think of was to convert both values to one common encoding (I changed them to UTF-8, then I tried ASCII too), but the equality comparison still didn't work :(

[SOLVED] Re: Unicode comparison

Posted: Tue Feb 05, 2008 5:48 am
by novice4eva
What I actually needed was a converter from ASCII (numeric character references) to UTF-8 and vice versa :dubious:
One of my friends found the missing piece on php.net (I think)...
These were the functions needed to do the conversion:

Code: Select all

 
/*
    Replace every numeric character reference in $str with its decimal
    code point. Text outside the references is left untouched.

    eg  &#65;   to  65
        &#x41;  to  65

    The string is first passed through fixCharacters(), so references in
    the 128-159 range are remapped (or dropped) before conversion.

    arguments: $str  string containing &#nnnn; / &#xhhhh; references
    returns:   string with each reference replaced by its decimal number
*/

function stringToCodePoints($str) {
     $str = fixCharacters($str);
     // Closures replace create_function(), which was removed in PHP 8.0.
     $str = preg_replace_callback('/&#([0-9]+);/', function ($s) {
         return $s[1];
     }, $str);
     $str = preg_replace_callback('/&#x([a-f0-9]+);/i', function ($s) {
         return hexdec($s[1]);
     }, $str);
     return $str;
}
 
 
/*
    Rewrite numeric references &#128; .. &#159; to the references listed
    in the table below. Entries mapped to null are removed from the string.

    arguments: $str  string that may contain such references
    returns:   string with the references substituted
*/
function fixCharacters($str) {
     static $translation = null;

     if ($translation === null) {
          // source code point => replacement code point (null = drop it)
          $remap = array(
               128 => 8364, 129 => null, 130 => 8218, 131 => 402,
               132 => 8222, 133 => 8230, 134 => 8224, 135 => 8225,
               136 => 710,  137 => 8240, 138 => 352,  139 => 8249,
               140 => 338,  141 => null, 142 => 381,  143 => null,
               144 => null, 145 => 8216, 146 => 8217, 147 => 8220,
               148 => 8221, 149 => 8226, 150 => 8211, 151 => 8212,
               152 => 732,  153 => 8482, 154 => 353,  155 => 8250,
               156 => 339,  157 => null, 158 => 382,  159 => 376
          );

          $translation = array();
          foreach ($remap as $from => $to) {
               $translation['&#' . $from . ';'] = ($to === null) ? '' : '&#' . $to . ';';
          }
     }

     // strtr() with an array substitutes in a single pass, so a replacement
     // is never re-scanned for further matches.
     return strtr($str, $translation);
}
 
/*
    Encode one code point number as a UTF-8 byte string.

    arguments: $num  code point number
    returns:   1 to 4 byte string, or '' when $num is 2097152 or larger
*/
function code2utf($num){
 if ($num < 128) {
  return chr($num);
 }

 // Pick how many continuation bytes follow the lead byte and the marker
 // value added to the lead byte.
 if ($num < 2048) {
  $trailing = 1;
  $marker = 192;
 } elseif ($num < 65536) {
  $trailing = 2;
  $marker = 224;
 } elseif ($num < 2097152) {
  $trailing = 3;
  $marker = 240;
 } else {
  return '';
 }

 // Lead byte carries the highest bits; every following byte carries six.
 $bytes = chr(($num >> (6 * $trailing)) + $marker);
 for ($k = $trailing - 1; $k >= 0; $k--) {
  $bytes .= chr((($num >> (6 * $k)) & 63) + 128);
 }
 return $bytes;
}
 
 
/*
    Convert a string containing numeric character references (&#nnnn; or
    &#xhhhh;) into UTF-8.

    Each reference is turned into its code point via stringToCodePoints()
    and encoded with dcr2utf8(). Every byte outside a reference is also
    passed through dcr2utf8(ord(...)), so bytes 0-127 are copied unchanged
    and bytes 128-255 are encoded as the code point of the same number.

    References are found with a regular expression instead of splitting on
    ';' and assuming a fixed 7-byte width, so references of any length and
    literal ';' characters in the text are handled.

    arguments: $string  text with numeric character references
    returns:   UTF-8 string, or NULL when $string is empty
*/
function asciiToUtf8($string)
{
        $output_str=NULL;

        // Split into alternating text runs and references; the capture
        // group keeps the references in the result.
        $pieces=preg_split(
            '/(&#[0-9]+;|&#x[0-9a-f]+;)/i',
            $string,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
        if($pieces===false)
        {
            // Regex engine failure: treat the whole input as plain text.
            $pieces=array($string);
        }

        foreach($pieces as $piece)
        {
            if(preg_match('/^&#(?:[0-9]+|x[0-9a-f]+);$/i',$piece))
            {
                $code_point=stringToCodePoints($piece);
                // stringToCodePoints() can return '' (reference dropped by
                // fixCharacters()) or a non-integer string for huge hex
                // values; skip those rather than encode a bogus character.
                if(preg_match('/^[0-9]+$/',$code_point))
                    $output_str.=dcr2utf8((int)$code_point);
                continue;
            }

            $piece_length=strlen($piece);
            for($i=0;$i<$piece_length;$i++)
                $output_str.=dcr2utf8(ord($piece[$i]));
        }

    return $output_str;

}
 
/*
    * Encode a single code point number as UTF-8.
    * @param $src code point number [INTEGER] eg 65
    * @return UTF-8 byte string [STRING] eg A; an empty string for 0xFEFF
    *         (the BOM is dropped); false for negative numbers, surrogates
    *         (0xD800-0xDFFF) and numbers above 0x10FFFF
    * @access public
*/
function dcr2utf8($src){
 if($src < 0){
  return false;
 }

 // one byte
 if($src <= 0x007f){
  return chr($src);
 }

 // two bytes
 if($src <= 0x07ff){
  return chr(0xc0 | ($src >> 6))
       . chr(0x80 | ($src & 0x003f));
 }

 // zap the BOM
 if($src == 0xFEFF){
  return '';
 }

 // surrogates are not valid on their own
 if($src >= 0xD800 && $src <= 0xDFFF){
  return false;
 }

 // three bytes
 if($src <= 0xffff){
  return chr(0xe0 | ($src >> 12))
       . chr(0x80 | (($src >> 6) & 0x003f))
       . chr(0x80 | ($src & 0x003f));
 }

 // four bytes
 if($src <= 0x10ffff){
  return chr(0xf0 | ($src >> 18))
       . chr(0x80 | (($src >> 12) & 0x3f))
       . chr(0x80 | (($src >> 6) & 0x3f))
       . chr(0x80 | ($src & 0x3f));
 }

 // out of range
 return false;
}
 
/**
* converts a utf-8 string to numeric character references, word by word
* the string is split on single spaces; each piece is converted with
* publicutf8ToUnicodeEntities() and the pieces are joined back with literal
* spaces, so spaces themselves are never turned into references
* @param $source string encoded using utf-8 [STRING]
* @return string of unicode entities separated by spaces [STRING]
*/
function utf8ToAscii($source){
    // Start from an empty string: appending to an undefined variable raises
    // a warning.
    $output_str='';
    $concat=NULL;
    foreach(explode(" ",$source) as $val){
        $output_str.=$concat.publicutf8ToUnicodeEntities($val);
        $concat=" ";
    }
    return $output_str;
}
/**
* converts a string of utf-8 encoded characters to a string of decimal
* numeric character references of the form &#nnn;
* the length of each sequence is taken from its first byte (>=240: 4 bytes,
* >=224: 3 bytes, >=192: 2 bytes, anything else: 1 byte); the number is
* left-padded with zeros to 3 digits for single-byte characters and to
* 4 digits otherwise
* no validation of the byte sequences is performed
* @param $source string encoded using utf-8 [STRING]
* @return string of unicode entities [STRING]
* @access public
*/
function publicutf8ToUnicodeEntities($source) {
    // value subtracted from the first byte, keyed by sequence length
    $leadOffset = array(1 => 0, 2 => 192, 3 => 224, 4 => 240);

    $total = strlen ($source);
    $result = '';
    $offset = 0;

    while ($offset < $total) {
        // decide how many bytes this character occupies
        $lead = ord (substr ($source, $offset, 1));
        if ($lead >= 240) {
            $width = 4;
        }
        else if ($lead >= 224) {
            $width = 3;
        }
        else if ($lead >= 192) {
            $width = 2;
        }
        else {
            $width = 1;
        }

        $sequence = substr ($source, $offset, $width);
        $offset += $width;

        // may be shorter than $width when the input ends mid-sequence
        $byteCount = strlen ($sequence);

        // byte k contributes its payload shifted by 6 bits for every byte
        // that follows it
        $codePoint = 0;
        for ($k = 0; $k < $byteCount; $k++) {
            $byte = ord (substr ($sequence, $k, 1));
            $payload = ($k == 0) ? intval ($byte - $leadOffset[$byteCount]) : intval ($byte - 128);
            $codePoint += ($payload << (6 * ($byteCount - 1 - $k)));
        }

        $digits = ($byteCount == 1) ? 3 : 4;
        $result .= "&#". str_pad($codePoint, $digits, "0", STR_PAD_LEFT) . ';';
    }

    return $result;
}
 
 
Finally I can go :drunk: