File format (mime type) identifying
Posted: Wed Dec 26, 2007 11:10 am
I was bored and figured I'd try hammering together a class which could be used to figure out what format/type a file was (mime type more specifically, idea from this post). The following was thrown together in an hour and is far from perfect as most of the time was spent browsing the internets (mostly http://filext.com/) for info on each file type.
PHP5+ only
Usage:
Output:
The code is pretty much self-explanatory: identify() reads the first few bytes of the file, converts them to a hexadecimal representation and then looks that up in the identifier table. If a match is found, the requested info is returned from the format table. There is some uncertainty regarding avi/wav, as those formats are just containers for other formats. I'm not really sure how to distinguish them without digging deeper into the file, something I'm trying to stay away from. I was also hoping to be able to determine whether a file is binary or not, but as far as I know that would require checking for a NUL character, which could be pretty time-consuming.
Like I wrote before, this is a pretty hackish attempt so suggestions for improvements are most welcome.
Update #1: Changed class method determine() to identify().
PHP5+ only
Code: Select all
/**
 * Identifies a file's format (MIME type) from the leading bytes of its content.
 *
 * identify() reads the first few bytes of a file, converts them to lowercase
 * hexadecimal and looks for the first entry of $identRefTable that is a prefix
 * of that string. The matching MIME type is then used to look up descriptive
 * information in $formatTable.
 *
 * Known limitations of the signature table (not addressed here):
 *  - Word and Excel documents share the same compound-document signature, so
 *    the two entries below cannot reliably tell them apart.
 *  - The one-byte signatures ('6d', '00', 'ff') are very weak and will match
 *    many unrelated files.
 */
class FileFormatIdentifier {
    // Hex signature shared by all RIFF containers ("RIFF").
    const RIFF_SIGNATURE = '52494646';
    // Hex of the RIFF form type "WAVE", found at byte offset 8 of a WAV file.
    const RIFF_FORM_WAVE = '57415645';
    // Number of leading bytes needed to see a RIFF form type (4 + 4 + 4).
    const RIFF_HEADER_BYTES = 12;

    // File format table: MIME type => display name and known extensions
    protected $formatTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image', 'ext' => array('jpg', 'jpeg', 'jpe')),
        'image/png' => array('name' => 'Portable (Public) Network Graphic', 'ext' => array('png')),
        'image/gif' => array('name' => 'Graphic Interchange Format', 'ext' => array('gif')),
        'image/tga' => array('name' => 'Truevision Targa Graphic', 'ext' => array('tga')),
        'image/tif' => array('name' => 'Tagged Image Format File', 'ext' => array('tif')),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic', 'ext' => array('bmp')),
        'image/photoshop' => array('name' => 'Photoshop Format', 'ext' => array('psd')),
        'application/msword' => array('name' => 'Word Document', 'ext' => array('doc')),
        'application/msexcel' => array('name' => 'Excel Worksheet', 'ext' => array('xls')),
        'video/avi' => array('name' => 'Audio Video Interleave File', 'ext' => array('avi')),
        'audio/wav' => array('name' => 'Waveform Audio', 'ext' => array('wav')),
        'audio/mid' => array('name' => 'MIDI-sequention Sound', 'ext' => array('mid', 'midi')),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream, Layer III', 'ext' => array('mp3')),
        'video/mpeg' => array('name' => 'MPEG 1 System Stream', 'ext' => array('mpg', 'mpeg')),
        'video/quicktime' => array('name' => 'QuickTime Video Clip', 'ext' => array('mov')),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive File', 'ext' => array('zip')),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive', 'ext' => array('rar', 'r01')),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed File', 'ext' => array('ace')),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed File', 'ext' => array('7z')),
        'font/ttf' => array('name' => 'TrueType Font', 'ext' => array('ttf')),
        'font/otf' => array('name' => 'Open Type Font Format', 'ext' => array('otf')) // No idea about mime type
    );

    // Identifier table: hex signature (lowercase) => MIME type.
    // Order matters: the first signature that prefixes the file header wins,
    // so longer/more specific signatures must come before shorter ones.
    // Every MIME type used here must exist as a key of $formatTable.
    protected $identRefTable = array(
        '89504e470d0a1a0a0000000d49484452' => 'image/png',
        '38425053000100000000000000' => 'image/photoshop',
        '4d54686400000006000100' => 'audio/mid',
        'd0cf11e0a1b11ae100' => 'application/msexcel',
        'd0cf11e0a1b11ae1' => 'application/msword',
        '526172211a0700' => 'application/x-rar-compressed',
        '2a2a4143452a2a' => 'application/x-ace-compressed',
        '377abcaf271c' => 'application/x-7z-compressed',
        '0001000000' => 'font/ttf',
        '4f54544f00' => 'font/otf',
        '504b0304' => 'application/x-zip-compressed',
        // RIFF container; reported as AVI unless the form type says WAVE (see matchHeader()).
        '52494646' => 'video/avi',
        '47494638' => 'image/gif',
        '49492a00' => 'image/tif',
        '4d4d002a' => 'image/tif',
        // "ID3" followed by a high nibble of 0 (odd number of hex digits on purpose).
        '4944330' => 'audio/mpeg',
        '000001' => 'video/mpeg',
        'ffd8ff' => 'image/jpeg',
        '424d' => 'image/bmp',
        '6d' => 'video/quicktime',
        '00' => 'image/tga',
        // MPEG audio frame sync; must map to a MIME type present in $formatTable.
        'ff' => 'audio/mpeg'
    );

    // Number of leading bytes identify() reads from a file.
    protected $maxLength;

    /**
     * Works out how many bytes must be read to test every signature.
     */
    public function __construct() {
        // Signatures are the array KEYS and are written in hex (two digits per byte).
        $maxHexLength = 0;
        foreach (array_keys($this->identRefTable) as $ident) {
            // All-digit keys such as '47494638' are stored by PHP as integers; cast them back.
            $len = strlen((string) $ident);
            if ($len > $maxHexLength) {
                $maxHexLength = $len;
            }
        }
        // Round up so a signature with an odd number of hex digits is fully covered,
        // and always read enough to see the RIFF form type.
        $this->maxLength = max((int) ceil($maxHexLength / 2), self::RIFF_HEADER_BYTES);
    }

    /**
     * Returns the format table entry for a MIME type.
     *
     * @param string $mime MIME type to look up
     * @return array|null array with keys 'name' and 'ext', or null if the type is unknown
     */
    public function infoFor($mime) {
        if (!isset($this->formatTable[$mime])) {
            return null;
        }
        return $this->formatTable[$mime];
    }

    /**
     * Identifies the format of a file from its leading bytes.
     *
     * @param string $target     path of the file to inspect
     * @param string $returnType 'mime' (default) for the MIME type, 'format' or 'name'
     *                           for the display name, 'ext' for the first extension,
     *                           'exts' for all extensions, '*' for the whole format
     *                           table entry; any other value behaves like 'mime'
     * @return string|array the requested info, or '' when no signature matches
     * @throws Exception if $target is not a regular file or cannot be read
     */
    public function identify($target, $returnType = 'mime') {
        // Make sure the target is a file we can work with
        if (!is_file($target)) {
            throw new Exception("'$target' is not a valid file.");
        }

        // Attempt to read the file header (the first $this->maxLength bytes) from the target file
        $header = file_get_contents($target, false, null, 0, $this->maxLength);
        if ($header === false) {
            throw new Exception("Failed to get contents of '$target'.");
        }

        // Convert the header to a (lowercase) hexadecimal representation to work with
        $mime = $this->matchHeader(bin2hex($header));
        if ($mime === null) {
            // No match found in table
            return '';
        }

        return $this->formatResult($mime, $returnType);
    }

    /**
     * Finds the MIME type of the first signature that prefixes the given hex header.
     *
     * @param string $hexHeader lowercase hex dump of the file's leading bytes
     * @return string|null matching MIME type, or null when nothing matches
     */
    protected function matchHeader($hexHeader) {
        foreach ($this->identRefTable as $ident => $mime) {
            // All-digit keys come back as integers; compare as strings only.
            $ident = (string) $ident;

            // Byte-wise prefix test. A loose == here would compare numeric-looking
            // hex strings as numbers (e.g. '0001e0' == '000001' is true).
            if (strncmp($hexHeader, $ident, strlen($ident)) !== 0) {
                continue;
            }

            // AVI and WAV are both RIFF containers; the form type at byte offset 8
            // (hex offset 16) tells them apart.
            if ($ident === self::RIFF_SIGNATURE
                && substr($hexHeader, 16, 8) === self::RIFF_FORM_WAVE) {
                return 'audio/wav';
            }

            return $mime;
        }

        return null;
    }

    /**
     * Turns a matched MIME type into the representation requested by the caller.
     *
     * @param string $mime       MIME type that was matched
     * @param string $returnType see identify()
     * @return string|array requested info; '' if the MIME type has no format table entry
     */
    protected function formatResult($mime, $returnType) {
        $infoTypes = array('format', 'name', 'ext', 'exts', '*');
        if (!in_array($returnType, $infoTypes, true)) {
            // 'mime' and anything unrecognised
            return $mime;
        }

        $info = $this->infoFor($mime);
        if ($info === null) {
            // Signature table references a MIME type the format table does not describe.
            return '';
        }

        switch ($returnType) {
            case 'ext':
                return $info['ext'][0];
            case 'exts':
                return $info['ext'];
            case '*':
                return $info;
            default: // 'format' or 'name'
                return $info['name'];
        }
    }
}
// Usage example: identify 'test.gif' in several ways and print the results.
$FFI = new FileFormatIdentifier;
// Determine mime type from file header
echo "Mime type: ". $FFI->identify('test.gif') ."<br />";
// Same as above but return name of the format
echo "Format name: ". $FFI->identify('test.gif', 'format') ."<br />";
echo "<pre>";
// ... return an associative array with format name & possible extensions
var_dump($FFI->identify('test.gif', '*'));
// Return an associative array (like above) with info for the specified mime type
// (variable names are case-sensitive: this must be $FFI, not $FFi)
var_dump($FFI->infoFor('image/jpeg'));
echo "</pre>";
Mime type: image/jpeg
Format name: JPEG/JIFF Image
array(2) {
["name"]=>
string(15) "JPEG/JIFF Image"
["ext"]=>
array(3) {
[0]=>
string(3) "jpg"
[1]=>
string(4) "jpeg"
[2]=>
string(3) "jpe"
}
}
array(2) {
["name"]=>
string(15) "JPEG/JIFF Image"
["ext"]=>
array(3) {
[0]=>
string(3) "jpg"
[1]=>
string(4) "jpeg"
[2]=>
string(3) "jpe"
}
}
Like I wrote before, this is a pretty hackish attempt, so suggestions for improvements are most welcome.
Update #1: Changed class method determine() to identify().