So one of my friends found the missing piece on php.net (I think)...
Code: Select all
/**
 * Replace numeric HTML character references with their decimal code point.
 *
 * The input is first passed through fixCharacters(). Decimal references such
 * as "&#65;" are then replaced by the digits they contain ("65"), and
 * hexadecimal references such as "&#x41;" by their decimal value ("65").
 * Text that is not a numeric character reference is left untouched.
 *
 * @param string $str Text containing numeric character references, e.g. "&#65;".
 * @return string The text with every numeric reference replaced by its decimal number.
 */
function stringToCodePoints($str) {
    $str = fixCharacters($str);

    // Closures are used instead of create_function(), which was deprecated in
    // PHP 7.2 and removed in PHP 8.0.
    $str = preg_replace_callback('/&#([0-9]+);/', function ($s) {
        return $s[1];
    }, $str);
    $str = preg_replace_callback('/&#x([a-f0-9]+);/i', function ($s) {
        return hexdec($s[1]);
    }, $str);

    return $str;
}
/**
 * Remap decimal character references in the 128-159 range to Unicode.
 *
 * References such as "&#146;" are commonly produced by software that treats
 * Windows-1252 byte values as code points. Each one is rewritten to the
 * reference of the character Windows-1252 assigns to that byte (e.g.
 * "&#146;" becomes "&#8217;"). The five byte values that Windows-1252 leaves
 * unassigned (129, 141, 143, 144, 157) are removed. All other text is
 * returned unchanged.
 *
 * @param string $str Text that may contain "&#128;" .. "&#159;" references.
 * @return string The text with those references remapped.
 */
function fixCharacters($str) {
    $r = array(
        '&#128;' => '&#8364;', // euro sign
        '&#129;' => '',        // unassigned
        '&#130;' => '&#8218;', // single low-9 quotation mark
        '&#131;' => '&#402;',  // latin small letter f with hook
        '&#132;' => '&#8222;', // double low-9 quotation mark
        '&#133;' => '&#8230;', // horizontal ellipsis
        '&#134;' => '&#8224;', // dagger
        '&#135;' => '&#8225;', // double dagger
        '&#136;' => '&#710;',  // modifier letter circumflex accent
        '&#137;' => '&#8240;', // per mille sign
        '&#138;' => '&#352;',  // latin capital letter S with caron
        '&#139;' => '&#8249;', // single left-pointing angle quotation mark
        '&#140;' => '&#338;',  // latin capital ligature OE
        '&#141;' => '',        // unassigned
        '&#142;' => '&#381;',  // latin capital letter Z with caron
        '&#143;' => '',        // unassigned
        '&#144;' => '',        // unassigned
        '&#145;' => '&#8216;', // left single quotation mark
        '&#146;' => '&#8217;', // right single quotation mark
        '&#147;' => '&#8220;', // left double quotation mark
        '&#148;' => '&#8221;', // right double quotation mark
        '&#149;' => '&#8226;', // bullet
        '&#150;' => '&#8211;', // en dash
        '&#151;' => '&#8212;', // em dash
        '&#152;' => '&#732;',  // small tilde
        '&#153;' => '&#8482;', // trade mark sign
        '&#154;' => '&#353;',  // latin small letter s with caron
        '&#155;' => '&#8250;', // single right-pointing angle quotation mark
        '&#156;' => '&#339;',  // latin small ligature oe
        '&#157;' => '',        // unassigned
        '&#158;' => '&#382;',  // latin small letter z with caron
        '&#159;' => '&#376;',  // latin capital letter Y with diaeresis
    );

    return strtr($str, $r);
}
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $num Code point to encode.
 * @return string The 1 to 4 byte UTF-8 encoding, or an empty string when $num
 *                is negative, a UTF-16 surrogate (0xD800-0xDFFF) or above
 *                0x10FFFF.
 */
function code2utf($num){
    // Reject values that have no valid UTF-8 encoding up front: chr() wraps
    // its argument modulo 256, so letting them through would emit stray bytes
    // instead of failing.
    if ($num < 0 || $num > 0x10FFFF || ($num >= 0xD800 && $num <= 0xDFFF)) {
        return '';
    }
    if ($num < 128) {
        return chr($num);
    }
    if ($num < 2048) {
        return chr(($num >> 6) + 192) . chr(($num & 63) + 128);
    }
    if ($num < 65536) {
        return chr(($num >> 12) + 224) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
    }
    // Remaining range is 0x10000-0x10FFFF: four bytes.
    return chr(($num >> 18) + 240) . chr((($num >> 12) & 63) + 128) . chr((($num >> 6) & 63) + 128) . chr(($num & 63) + 128);
}
/**
 * Convert single-byte text containing numeric character references to UTF-8.
 *
 * Every numeric reference, decimal ("&#233;") or hexadecimal ("&#x20AC;") and
 * of any length, is resolved through stringToCodePoints() and encoded with
 * dcr2utf8(). Every other byte of the input is treated as a code point in its
 * own right (0-255) and encoded with dcr2utf8() as well, so bytes of 0x80 and
 * above come out as two-byte sequences. A ';' that does not terminate a
 * numeric reference is ordinary text.
 *
 * References that do not resolve to a number, or that dcr2utf8() rejects,
 * contribute nothing to the output.
 *
 * @param string $string Text with numeric character references.
 * @return string UTF-8 encoded text; an empty string for empty input.
 */
function asciiToUtf8($string)
{
    $string = (string) $string;
    $output_str = '';

    // With PREG_SPLIT_DELIM_CAPTURE the result alternates: even indexes hold
    // plain text (possibly empty), odd indexes hold one matched reference.
    $pieces = preg_split('/(&#(?:[0-9]+|x[0-9a-f]+);)/i', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($pieces === false) {
        // PCRE failure: fall back to treating the whole input as plain text.
        $pieces = array($string);
    }

    foreach ($pieces as $index => $piece) {
        if ($index % 2 === 1) {
            $code_point = (string) stringToCodePoints($piece);
            // Only encode when the reference resolved to a plain decimal number.
            if (preg_match('/^[0-9]+\z/', $code_point) === 1) {
                $encoded = dcr2utf8((int) $code_point);
                if ($encoded !== false) {
                    $output_str .= $encoded;
                }
            }
            continue;
        }

        $byte_count = strlen($piece);
        for ($i = 0; $i < $byte_count; $i++) {
            $output_str .= dcr2utf8(ord($piece[$i]));
        }
    }

    return $output_str;
}
/**
 * Encode a single Unicode code point as UTF-8.
 *
 * @param int $src Code point to encode, e.g. 65.
 * @return string|false The UTF-8 bytes for the code point (e.g. "A"); an
 *                      empty string for 0xFEFF, which is deliberately
 *                      dropped; false for negative values, surrogates
 *                      (0xD800-0xDFFF) and values above 0x10FFFF.
 * @access public
 */
function dcr2utf8($src){
    if ($src < 0) {
        return false;
    }
    if ($src <= 0x007f) {
        // Single byte, no continuation bytes needed.
        return chr($src);
    }

    // Pick the lead-byte marker and the number of continuation bytes.
    if ($src <= 0x07ff) {
        $leadMarker = 0xc0;
        $trailing = 1;
    } elseif ($src == 0xFEFF) {
        // Byte-order mark: emit nothing.
        return '';
    } elseif ($src >= 0xD800 && $src <= 0xDFFF) {
        // Surrogates are not encodable.
        return false;
    } elseif ($src <= 0xffff) {
        $leadMarker = 0xe0;
        $trailing = 2;
    } elseif ($src <= 0x10ffff) {
        $leadMarker = 0xf0;
        $trailing = 3;
    } else {
        // Beyond the last code point.
        return false;
    }

    // Lead byte carries the top bits; each continuation byte carries six more.
    $bytes = chr($leadMarker | ($src >> (6 * $trailing)));
    for ($k = $trailing - 1; $k >= 0; $k--) {
        $bytes .= chr(0x80 | (($src >> (6 * $k)) & 0x3f));
    }
    return $bytes;
}
/**
 * Convert UTF-8 text to numeric character references, keeping spaces literal.
 *
 * The input is split on single spaces, each piece is converted with
 * publicutf8ToUnicodeEntities(), and the converted pieces are joined back
 * together with single spaces.
 *
 * @param string $source UTF-8 encoded text.
 * @return string The converted pieces separated by spaces.
 */
function utf8ToAscii($source){
    $converted = array();
    foreach (explode(" ", $source) as $word) {
        $converted[] = publicutf8ToUnicodeEntities($word);
    }
    // implode() always yields a string, so no accumulator is read before it
    // has been assigned.
    return implode(" ", $converted);
}
/**
 * Convert a UTF-8 encoded string into a string of decimal character references.
 *
 * Every character of the input, including plain ASCII, is emitted as
 * "&#nnn;": single-byte characters are zero-padded to three digits, multi-byte
 * characters to at least four.
 *
 * The input is not validated. The sequence length is chosen from the lead
 * byte alone, continuation bytes are not checked, and a sequence cut short by
 * the end of the string is decoded from the bytes that are present.
 *
 * @param $source string encoded using utf-8 [STRING]
 * @return string of unicode entities [STRING]
 * @access public
 */
function publicutf8ToUnicodeEntities($source) {
    // Amount subtracted from the lead byte, keyed by the number of bytes
    // actually read for the character.
    $leadOffset = array(1 => 0, 2 => 192, 3 => 224, 4 => 240);

    $encodedString = '';
    $total = strlen($source);
    $cursor = 0;

    while ($cursor < $total) {
        $leadByte = ord(substr($source, $cursor, 1));

        // Sequence width implied by the lead byte.
        if ($leadByte >= 240) {
            $width = 4;
        } elseif ($leadByte >= 224) {
            $width = 3;
        } elseif ($leadByte >= 192) {
            $width = 2;
        } else {
            $width = 1;
        }

        $sequence = substr($source, $cursor, $width);
        $cursor += $width;

        // Near the end of the input substr() can return fewer bytes than
        // requested; decoding is driven by what was actually obtained.
        $available = strlen($sequence);

        // Accumulate six bits per continuation byte, most significant first.
        $decimalCode = $leadByte - $leadOffset[$available];
        for ($offset = 1; $offset < $available; $offset++) {
            $continuation = ord(substr($sequence, $offset, 1)) - 128;
            $decimalCode = ($decimalCode << 6) + $continuation;
        }

        $minDigits = ($available == 1) ? 3 : 4;
        $encodedString .= "&#" . str_pad($decimalCode, $minDigits, "0", STR_PAD_LEFT) . ';';
    }

    return $encodedString;
}