You should also read up here:
* http://www.randomchaos.com/documents/?s ... nd_unicode
* http://en.wikipedia.org/wiki/UTF-8
Code: Select all
/**
 * Decodes a UTF-8 encoded byte string into a list of Unicode code points.
 *
 * Handles one-, two-, three- and four-byte sequences. Malformed input is
 * skipped rather than reported:
 *  - a stray continuation byte (10xxxxxx) or an invalid lead byte
 *    (0xF8-0xFF) outside a sequence is ignored;
 *  - a multi-byte sequence that is cut short by a non-continuation byte is
 *    discarded, and that byte is then decoded as the start of a new character
 *    so that valid data following the damage is not lost;
 *  - a sequence that is still incomplete at the end of the string is dropped.
 *
 * Overlong encodings and surrogate code points are NOT rejected; they are
 * decoded to whatever value their payload bits spell out.
 *
 * @param string $string Raw UTF-8 bytes.
 * @return array List of integer code points, in input order.
 */
function utf8_to_character_array($string) {
    $character_array = array();
    $code_point = 0; // payload bits gathered so far for the current character
    $remaining = 0;  // continuation bytes still expected for the current character
    $string_length = strlen($string);

    // each iteration represents a byte
    for ($i = 0; $i < $string_length; $i++) {
        $value = ord($string[$i]);

        if ($remaining > 0) {
            if (($value & 0xC0) == 0x80) { // 10xxxxxx: a continuation byte
                // append its six payload bits
                $code_point = ($code_point << 6) | ($value & 0x3F);
                $remaining--;
                if ($remaining == 0) {
                    $character_array[] = $code_point;
                }
                continue;
            }
            // The sequence was cut short. Drop it and fall through so this
            // byte is examined as the possible start of a new character.
            $remaining = 0;
        }

        if ($value < 0x80) { // 0xxxxxxx: plain ASCII
            $character_array[] = $value;
        } elseif ($value >= 0xC0 && $value < 0xE0) { // 110xxxxx: two bytes
            $code_point = $value & 0x1F;
            $remaining = 1;
        } elseif ($value >= 0xE0 && $value < 0xF0) { // 1110xxxx: three bytes
            $code_point = $value & 0x0F;
            $remaining = 2;
        } elseif ($value >= 0xF0 && $value < 0xF8) { // 11110xxx: four bytes
            $code_point = $value & 0x07;
            $remaining = 3;
        }
        // anything else is a stray continuation byte (0x80-0xBF) or an
        // invalid lead byte (0xF8-0xFF): ignore it
    }

    return $character_array;
}
// Edit: Meh, PEAR does this too... never thought of using the shift right...
Code: Select all
if ($value >> 5 == 6) {
$values[] = ($value - 192) << 6;
$search = 2;
} elseif ($value >> 4 == 14) {
$values[] = ($value - 224) << 12;
$search = 3;