Codepages? Am I missing something? From what I have read, UTF-8 does not have the concept of codepages, just Unicode character ranges. feyd wrote: either find or build a UTF-8 analyzer that looks at the code pages the stream switches to.
Well, here it is. feyd wrote: count the number of characters in the pooled languages, and use that count to base your switch between left-to-right and right-to-left.
Code: Select all
/**
 * Load named Unicode code-point ranges from a text file.
 *
 * Each data line is expected to look like "0000..007F; Basic Latin":
 * a hexadecimal start, "..", a hexadecimal end, ";" and then the range name.
 * Blank lines, "#" comment lines and any other line that does not match
 * that shape are skipped instead of producing a junk entry.
 *
 * @param string $filename Path of the ranges file.
 * @return array Map of range name => array('start' => int, 'end' => int).
 *               Empty when the file cannot be read.
 */
function get_ranges($filename = 'uc-ranges.txt') {
    $ranges = array();

    $lines = file($filename, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    if ($lines === false) {
        // file() has already raised a warning; just report "no ranges"
        // rather than iterating over a boolean.
        return $ranges;
    }

    foreach ($lines as $line) {
        // Only accept well-formed "START..END; Name" lines; this also
        // filters out comments and whitespace-only lines.
        if (!preg_match('/^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+);\s*(.+)$/', trim($line), $m)) {
            continue;
        }
        $ranges[$m[3]] = array(
            'start' => hexdec($m[1]),
            'end'   => hexdec($m[2]),
        );
    }

    return $ranges;
}
/**
 * Decode a UTF-8 byte string into a list of Unicode code points.
 *
 * Well-formed 1- to 4-byte sequences are decoded to their integer code
 * point. Structurally ill-formed input (a stray continuation byte, an
 * invalid lead byte, or a sequence cut short by a non-continuation byte or
 * by the end of the string) yields one U+FFFD replacement code point for
 * the bytes consumed so far, and decoding resumes at the next byte.
 * Overlong encodings, surrogates and values above U+10FFFF are NOT
 * rejected; only the byte structure is checked.
 *
 * A leading U+FEFF (the UTF-8 signature / byte-order mark) is dropped.
 *
 * @param string $string Raw UTF-8 bytes.
 * @return array List of integer code points, in input order.
 */
function parse_utf8($string) {
    // Payload-bit mask of the lead byte, keyed by total sequence length.
    static $masks = array(
        1 => 0x7F,
        2 => 0x1F,
        3 => 0x0F,
        4 => 0x07,
    );

    $ret = array();
    $len = strlen($string);

    for ($i = 0; $i < $len; ) {
        // Square-bracket offsets: the "{$i}" form is a parse error on PHP 8.
        $lead = ord($string[$i]);

        if ($lead < 0x80) {
            $seq_len = 1;
        } elseif (($lead & 0xE0) === 0xC0) {
            $seq_len = 2;
        } elseif (($lead & 0xF0) === 0xE0) {
            $seq_len = 3;
        } elseif (($lead & 0xF8) === 0xF0) {
            $seq_len = 4;
        } else {
            // Stray continuation byte (10xxxxxx) or invalid lead (11111xxx).
            $ret[] = 0xFFFD;
            $i++;
            continue;
        }

        $char = $lead & $masks[$seq_len];
        $consumed = 1;
        while ($consumed < $seq_len) {
            if ($i + $consumed >= $len) {
                break; // input ends in the middle of a sequence
            }
            $byte = ord($string[$i + $consumed]);
            if (($byte & 0xC0) !== 0x80) {
                break; // not a continuation byte; let the outer loop re-read it
            }
            $char = ($char << 6) | ($byte & 0x3F);
            $consumed++;
        }

        $ret[] = ($consumed === $seq_len) ? $char : 0xFFFD;
        $i += $consumed;
    }

    // Get rid of the signature octets; guard against an empty result.
    if ($ret && $ret[0] === 0xFEFF) {
        array_shift($ret);
    }

    return $ret;
}
/**
 * Count how many code points of a UTF-8 string fall into each named range.
 *
 * The string is decoded with parse_utf8(). Every code point is tested
 * against every range, so a code point lying in several overlapping ranges
 * is counted once for each of them. Ranges that match nothing do not
 * appear in the result.
 *
 * @param string $string Raw UTF-8 bytes.
 * @param array  $ranges Map of name => array('start' => int, 'end' => int),
 *                       both bounds inclusive.
 * @return array Map of range name => number of matching code points.
 */
function utf_counts_by_ranges($string, $ranges) {
    $ret = array();

    foreach (parse_utf8($string) as $char) {
        foreach ($ranges as $name => $boundaries) {
            if ($char < $boundaries['start'] || $char > $boundaries['end']) {
                continue;
            }
            // Initialise the counter explicitly instead of silencing the
            // undefined-index notice with "@".
            if (!isset($ret[$name])) {
                $ret[$name] = 0;
            }
            $ret[$name]++;
        }
    }

    return $ret;
}
// Demo driver: dump the per-range character counts of the file 'asd',
// using the ranges loaded from get_ranges()'s default file.
var_dump(
    utf_counts_by_ranges(
        file_get_contents('asd'),
        get_ranges()
    )
);