Page 1 of 1

File format (mime type) identifying

Posted: Wed Dec 26, 2007 11:10 am
by vigge89
I was bored and figured I'd try hammering together a class which could be used to figure out what format/type a file was (mime type more specifically, idea from this post). The following was thrown together in an hour and is far from perfect as most of the time was spent browsing the internets (mostly http://filext.com/) for info on each file type.
PHP5+ only

Code: Select all

/**
 * Identifies a file's format (mime type) from the magic bytes at the start
 * of the file.
 *
 * The first few bytes of the file are converted to a lowercase hexadecimal
 * string and compared against the prefixes in $identRefTable; the first
 * matching prefix decides the mime type.
 */
class FileFormatIdentifier {

    /**
     * File format table.
     * Format: $mime => array('name' => $formatName, 'ext' => array($extensions...))
     */
    protected $formatTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image', 'ext' => array('jpg', 'jpeg', 'jpe')),
        'image/png' => array('name' => 'Portable (Public) Network Graphic', 'ext' => array('png')),
        'image/gif' => array('name' => 'Graphic Interchange Format', 'ext' => array('gif')),
        'image/tga' => array('name' => 'Truevision Targa Graphic', 'ext' => array('tga')),
        'image/tif' => array('name' => 'Tagged Image Format File', 'ext' => array('tif')),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic', 'ext' => array('bmp')),
        'image/photoshop' => array('name' => 'Photoshop Format', 'ext' => array('psd')),
        'application/msword' => array('name' => 'Word Document', 'ext' => array('doc')),
        'application/msexcel' => array('name' => 'Excel Worksheet', 'ext' => array('xls')),
        'video/avi' => array('name' => 'Audio Video Interleave File', 'ext' => array('avi')),
        'audio/wav' => array('name' => 'Waveform Audio', 'ext' => array('wav')),
        'audio/mid' => array('name' => 'MIDI-sequention Sound', 'ext' => array('mid', 'midi')),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream, Layer III', 'ext' => array('mp3')),
        'video/mpeg' => array('name' => 'MPEG 1 System Stream', 'ext' => array('mpg', 'mpeg')),
        'video/quicktime' => array('name' => 'QuickTime Video Clip', 'ext' => array('mov')),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive File', 'ext' => array('zip')),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive', 'ext' => array('rar', 'r01')),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed File', 'ext' => array('ace')),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed File', 'ext' => array('7z')),
        'font/ttf' => array('name' => 'TrueType Font', 'ext' => array('ttf')),
        'font/otf' => array('name' => 'Open Type Font Format', 'ext' => array('otf')) // No idea about mime type
    );

    /**
     * Magic byte table.
     * Format: $hexPrefix => $mime
     *
     * Entries are tested in order and the first matching prefix wins, so the
     * longer (more specific) prefixes are listed before the shorter ones.
     * Every mime type used here must also exist in $formatTable.
     *
     * Note: PHP silently converts keys made only of decimal digits (e.g.
     * '52494646') to integers, so identify() casts each key back to a string.
     */
    protected $identRefTable = array(
        '89504e470d0a1a0a0000000d49484452' => 'image/png',
        '38425053000100000000000000' => 'image/photoshop',
        '4d54686400000006000100' => 'audio/mid',
        'd0cf11e0a1b11ae100' => 'application/msexcel',
        'd0cf11e0a1b11ae1' => 'application/msword',
        '526172211a0700' => 'application/x-rar-compressed',
        '2a2a4143452a2a' => 'application/x-ace-compressed',
        '377abcaf271c' => 'application/x-7z-compressed',
        '0001000000' => 'font/ttf',
        '4f54544f00' => 'font/otf',
        '504b0304' => 'application/x-zip-compressed',
        // "RIFF" is a container shared by AVI and WAV ('audio/wav'); the two
        // cannot be told apart from the first four bytes alone.
        '52494646' => 'video/avi',
        '47494638' => 'image/gif',
        '49492a00' => 'image/tif',
        '4d4d002a' => 'image/tif',
        '4944330' => 'audio/mpeg',
        '000001' => 'video/mpeg',
        'ffd8ff' => 'image/jpeg',
        '424d' => 'image/bmp',
        '6d' => 'video/quicktime',
        '00' => 'image/tga',
        'ff' => 'audio/mpeg'
    );

    /** Number of bytes to read from the start of a file (length of the longest signature). */
    protected $maxLength;

    /**
     * Determines how many bytes must be read from a file so that the longest
     * signature in $identRefTable can be compared.
     */
    public function __construct() {
        $maxLength = 0;
        foreach (array_keys($this->identRefTable) as $ident) {
            // Two hex digits describe one byte; round up for odd-length prefixes
            $len = (int) ceil(strlen((string) $ident) / 2);
            if ($len > $maxLength) $maxLength = $len;
        }
        $this->maxLength = $maxLength;
    }

    /**
     * Returns the format table entry for a mime type.
     *
     * @param string $mime Mime type to look up
     * @return array|null array('name' => ..., 'ext' => array(...)), or null if the mime type is unknown
     */
    public function infoFor($mime) {
        if (!isset($this->formatTable[$mime])) return null;
        return $this->formatTable[$mime];
    }

    /**
     * Identifies the format of a file from its leading bytes.
     *
     * @param string $target     Path of the file to inspect
     * @param string $returnType What to return: 'mime' (default), 'format' or 'name'
     *                           (format name), 'ext' (first extension), 'exts' (all
     *                           extensions) or '*' (the whole format table entry)
     * @return string|array The requested info, or '' when no signature matches
     * @throws Exception If $target is not a file or cannot be read
     */
    public function identify($target, $returnType = 'mime') {

        // Make sure the target is a file we can work with
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");

        // Attempt to read the file header (the first $this->maxLength bytes) from the target file
        $header = file_get_contents($target, false, null, 0, $this->maxLength);
        if ($header === false) throw new Exception("Failed to get contents of '$target'.");

        // Convert the header to a hexadecimal representation to work with
        $header = bin2hex($header);

        foreach ($this->identRefTable as $ident => $mime) {
            // All-digit keys come back as integers; compare as strings only.
            $ident = (string) $ident;

            // Byte-wise prefix comparison. A loose == would compare "numeric
            // looking" hex strings as numbers (e.g. "0001e0" == "000001").
            if (strncmp($header, $ident, strlen($ident)) !== 0) continue;

            // Return the requested info
            switch ($returnType) {
                case 'format':
                case 'name':
                    return $this->formatTable[$mime]['name'];
                case 'ext':
                    return $this->formatTable[$mime]['ext'][0];
                case 'exts':
                    return $this->formatTable[$mime]['ext'];
                case '*':
                    return $this->formatTable[$mime];
                case 'mime':
                default:
                    return $mime;
            }
        }

        // No match found in table
        return '';

    }

}
Usage:

Code: Select all

$FFI = new FileFormatIdentifier;
 
// Determine mime type from file header
echo "Mime type: ". $FFI->identify('test.gif') ."<br />";
// Same as above but return name of the format
echo "Format name: ". $FFI->identify('test.gif', 'format') ."<br />";
 
echo "<pre>";
// ... return an associative array with format name & possible extensions
var_dump($FFI->identify('test.gif', '*'));
// Return an associative array (like above) with info for the specified mime type
// (variable names are case-sensitive: the instance is $FFI, not $FFi)
var_dump($FFI->infoFor('image/jpeg'));
echo "</pre>";
Output:

Code: Select all

Mime type: image/jpeg
Format name: JPEG/JIFF Image
 
array(2) {
  ["name"]=>
  string(15) "JPEG/JIFF Image"
  ["ext"]=>
  array(3) {
    [0]=>
    string(3) "jpg"
    [1]=>
    string(4) "jpeg"
    [2]=>
    string(3) "jpe"
  }
}
array(2) {
  ["name"]=>
  string(15) "JPEG/JIFF Image"
  ["ext"]=>
  array(3) {
    [0]=>
    string(3) "jpg"
    [1]=>
    string(4) "jpeg"
    [2]=>
    string(3) "jpe"
  }
}
The code is pretty much self-explanatory, what identify() does is to read the first few bytes of the file, convert it to a hexadecimal representation and then look it up in the identifier table. If a match is found the requested info is returned from the format table. There is some uncertainty regarding avi/wav as those formats are just placeholders for others. I'm not really sure how to distinguish them without digging deeper into the file, something I'm trying to stay away from. I was also hoping of being able to determine whether a file is binary or not but as far as I know that would require checking for a nul character which could be pretty time consuming.

Like I wrote before, this is a pretty hackish attempt so suggestions for improvements are most welcome.

Update #1: Changed class method determine() to identify().

Posted: Wed Dec 26, 2007 2:32 pm
by Ambush Commander
This feels like duplicated work to me. There are databases out there of magic byte sequences => file types, so it would seem to me that a more efficient use of time would be parsing that format, and maybe compiling $formatTable.

Posted: Wed Dec 26, 2007 3:50 pm
by vigge89
I'm sure there are, but the problem was that I couldn't find anything already available for PHP as a plain script easily readable. If you've got any links I'd gladly take a look at them though.

Posted: Wed Dec 26, 2007 4:36 pm
by Kieran Huggins
I think this class is a welcome scratch. Mime detection in PHP has always been extremely poor and/or difficult to implement.

There's a DB of magic bytes here: http://magicdb.org - but it looks like a dead-ish project to me. The DB itself could be culled into your class at any rate!

Posted: Wed Dec 26, 2007 4:59 pm
by vigge89
I'm currently looking through the magic file provided in the apache package for some more common formats. The magicdb page was nice enough to provide some example files to test against, cheers for that ;)
An update is coming up soon, decided to allow both strings and hex representations in the ident table.

Posted: Wed Dec 26, 2007 5:26 pm
by vigge89
Updated:

Code: Select all

/**
 * Identifies a file's format (mime type) from the magic bytes at the start
 * of the file. Signatures may be given either as plain strings or as
 * hexadecimal representations.
 */
class FileFormatIdentifier {
    
    // File format table
    // Format: $mime => array(name => $formatName, ext => array($extensions...))
    protected $formatTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image', 'ext' => array('jpg', 'jpeg', 'jpe')),
        'image/png' => array('name' => 'Portable (Public) Network Graphic', 'ext' => array('png')),
        'video/mng' => array('name' => 'Multi-image Network Graphic Animation', 'ext' => array('mng')),
        'image/gif' => array('name' => 'Graphic Interchange Format', 'ext' => array('gif')),
        'image/tga' => array('name' => 'Truevision Targa Graphic', 'ext' => array('tga')),
        'image/tif' => array('name' => 'Tagged Image Format File', 'ext' => array('tif')),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic', 'ext' => array('bmp')),
        'image/photoshop' => array('name' => 'Photoshop Format Image', 'ext' => array('psd')),
        'application/msword' => array('name' => 'Word Document', 'ext' => array('doc')),
        'application/msexcel' => array('name' => 'Excel Worksheet', 'ext' => array('xls')),
        'video/avi' => array('name' => 'Audio Video Interleave File', 'ext' => array('avi')),
        'audio/wav' => array('name' => 'Waveform Audio', 'ext' => array('wav')),
        'audio/mid' => array('name' => 'MIDI-sequention Sound', 'ext' => array('mid', 'midi')),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream, Layer III', 'ext' => array('mp3')),
        'video/mpeg' => array('name' => 'MPEG 1 System Stream', 'ext' => array('mpg', 'mpeg')),
        'video/quicktime' => array('name' => 'QuickTime Video Clip', 'ext' => array('mov')),
        'application/x-shockwave-flash' => array('name' => 'Macromedia Flash Format File', 'ext' => array('swf')),
        'application/pdf' => array('name' => 'Acrobat Portable Document Format', 'ext' => array('pdf')),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive File', 'ext' => array('zip')),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive', 'ext' => array('rar', 'r01')),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed File', 'ext' => array('ace')),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed File', 'ext' => array('7z')),
        'application/x-bzip' => array('name' => 'Bzip 2 UNIX Compressed File', 'ext' => array('bz2', 'tbz2', 'tb2')),
        'application/x-gzip' => array('name' => 'Gzip Compressed Archive', 'ext' => array('gz')),
        'application/x-tar' => array('name' => 'Tape Archive File', 'ext' => array('tar')),
        'font/ttf' => array('name' => 'TrueType Font', 'ext' => array('ttf')),
        'font/otf' => array('name' => 'Open Type Font Format', 'ext' => array('otf')), // No idea about mime type
        'text/html' => array('name' => 'HyperText Markup Language', 'ext' => array('htm', 'html')),
        'text/xml' => array('name' => 'Extensible Markup Language File', 'ext' => array('xml'))
    );

    // Ident reference table
    // Format: array($representation, $ident, $mime)
    // 's' => string, 'h' => hexadecimal (lowercase, as produced by bin2hex())
    // Entries are tested in order and the first matching prefix wins.
    // Every mime type used here must also exist in $formatTable.
    protected $identRefTable = array(
        // Excel documents share this signature ('application/msexcel'), so
        // they are reported as Word documents as well.
        array('h', 'd0cf11e0a1b11ae100', 'application/msword'),
        array('h', '89504e470d0a1a0a00', 'image/png'),
        array('h', '8a4d4e470d0a1a0a00', 'video/mng'),
        array('h', '0001000000', 'font/ttf'),
        array('h', '4f54544f00', 'font/otf'),
        array('h', 'ffd8ff', 'image/jpeg'),
        array('h', '4944330', 'audio/mpeg'),
        array('h', '000001', 'video/mpeg'),
        array('s', '8BPS', 'image/photoshop'),
        array('s', 'MThd', 'audio/mid'),
        array('s', '**ACE**', 'application/x-ace-compressed'),
        array('s', 'Rar!', 'application/x-rar-compressed'),
        array('s', 'PK', 'application/x-zip-compressed'),
        array('s', 'BZh', 'application/x-bzip'),
        array('h', '1f8b08', 'application/x-gzip'),
        array('s', '7z', 'application/x-7z-compressed'),
        // Tar ('ustar', 'application/x-tar') is not listed: its signature
        // sits at byte position 257 and this table only matches at offset 0.
        // "RIFF" is shared by AVI and WAV ('audio/wav'); they cannot be told
        // apart from the first four bytes alone.
        array('s', 'RIFF', 'video/avi'),
        array('s', 'GIF8', 'image/gif'),
        // Big-endian TIFF: "MM", a NUL byte, then "*" (0x4d4d002a)
        array('s', "MM\x00*", 'image/tif'),
        array('s', 'II*', 'image/tif'),
        array('h', '424d', 'image/bmp'),
        array('s', '%PDF', 'application/pdf'),
        array('s', 'FWS', 'application/x-shockwave-flash'),
        array('h', '6d', 'video/quicktime'),
        array('h', '00', 'image/tga'),
        array('h', 'ff', 'audio/mpeg'),
        array('s', '<!DOCTYPE HTML', 'text/html'),
        array('s', '<!doctype html', 'text/html'),
        array('s', '<HTML', 'text/html'),
        array('s', '<html', 'text/html'),
        array('s', '<?xml', 'text/xml')
    );
 
    // Number of bytes to read from the start of a file
    protected $maxLength;
 
    /**
     * Determines how many bytes must be read from a file so that the longest
     * signature in $identRefTable can be compared.
     */
    public function __construct() {
        $maxLength = 0;
        foreach ($this->identRefTable as $ident) {
            $len = strlen($ident[1]);
            // Two hex digits describe one byte; round up for odd-length idents
            if ($ident[0] === 'h') $len = (int) ceil($len / 2);
            if ($len > $maxLength) $maxLength = $len;
        }
        $this->maxLength = $maxLength;
    }
 
    /**
     * Returns the format table entry for a mime type.
     *
     * @param string $mime Mime type to look up
     * @return array|null array('name' => ..., 'ext' => array(...)), or null if the mime type is unknown
     */
    public function infoFor($mime) {
        if (!isset($this->formatTable[$mime])) return null;
        return $this->formatTable[$mime];
    }
 
    /**
     * Identifies the format of a file from its leading bytes.
     *
     * @param string $target     Path of the file to inspect
     * @param string $returnType What to return: 'mime' (default), 'format' or 'name'
     *                           (format name), 'ext' (first extension), 'exts' (all
     *                           extensions) or '*' (the whole format table entry)
     * @return string|array The requested info, or '' when no signature matches
     * @throws Exception If $target is not a file or cannot be read
     */
    public function identify($target, $returnType = 'mime') {
        
        // Make sure the target is a file we can work with
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");
 
        // Attempt to read the file header (the first $this->maxLength bytes) from the target file
        $bin = file_get_contents($target, false, null, 0, $this->maxLength);
        if ($bin === false) throw new Exception("Failed to get contents of '$target'.");
 
        // Convert the header to a hexadecimal representation to work with
        $hex = bin2hex($bin);
        
        foreach ($this->identRefTable as $ident) {
            // Compare against the raw bytes or their hex representation
            $haystack = ($ident[0] === 's') ? $bin : $hex;

            // Byte-wise prefix comparison. A loose == would compare "numeric
            // looking" hex strings as numbers (e.g. "0001e0" == "000001").
            if (strncmp($haystack, $ident[1], strlen($ident[1])) !== 0) continue;

            // Return the requested info
            $mime = $ident[2];
            switch ($returnType) {
                case 'format':
                case 'name':
                    return $this->formatTable[$mime]['name'];
                case 'ext':
                    return $this->formatTable[$mime]['ext'][0];
                case 'exts':
                    return $this->formatTable[$mime]['ext'];
                case '*':
                    return $this->formatTable[$mime];
                case 'mime':
                default:
                    return $mime;
            }
        }
        
        // No match found in table
        return '';
 
    }
 
}
The Tar format was kinda problematic as the identifier is located at the 257th byte, so I scrapped it for now.

Posted: Wed Dec 26, 2007 8:16 pm
by s.dot
vigge89 wrote:The Tar format was kinda problematic as the identifier is located at the 257th byte, so I scrapped it for now.
fread() or fgetc() :)

Posted: Wed Dec 26, 2007 8:18 pm
by Ambush Commander
Agreed. While file_get_contents is the most efficient way to read the entire file to a string, the small number of leading bytes you'll need works better with fopen() and its kin.

Posted: Wed Dec 26, 2007 8:34 pm
by vigge89
I was unsure about which method to use and spent quite some time trying to decide but ended up going for the simpler solution thinking I could revamp it later. Now that you've said it I guess I'll switch to the old school way for the next update ;)

Posted: Fri Jan 04, 2008 8:21 pm
by vigge89
New version coming up...
Changes:
+ Turned the class static as there isn't much use for it to be instantiated (correct me if I'm wrong)
+ Byte offset functionality
+ Regex matching (needed to distinguish AVI from WAV) - rather hackish at the moment (any suggestions?)
+ Ability to look up format from extension
+ Storing extensions in a separate table
+ More formats added to table

I'm still somewhat uncertain regarding how to read the magic bytes from the files. As of now it will read from the start of the file to $maxLength, which is initially set by determineMaxLenght() to the length of the longest ident in $identRefTable. The data is then compared to each entry and if a match is found the mime type associated with it is returned. I'm sure there's a faster/better way to do it, but how?

Input and suggestions are more than welcome!

For the mods: Is it preferred to post larger blocks of code on an external site like pastebin.ca or should I keep it in the thread?

Edit: Found some useful links for anyone interested:
Common files signature table
Linux Magic Numbers (a lot more than what my apache2 magic file contained)

Code: Select all

<?php
 
/**
 * Static helper for guessing a file's format (mime type), either from the
 * magic bytes found in the file or from the file name extension.
 */
class FileFormatIdentifier {
    
    // File format information table
    // Format: $mime => array(name => $formatName)
    protected static $formatInfoTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image'),
        'image/png' => array('name' => 'Portable (Public) Network Graphic'),
        'video/mng' => array('name' => 'Multi-image Network Graphic Animation'),
        'image/gif' => array('name' => 'Graphic Interchange Format Image'),
        'image/tga' => array('name' => 'Truevision Targa Graphic'),
        'image/tif' => array('name' => 'Tagged Image Format File'),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic'),
        'image/svg+xml' => array('name' => 'Scalable Vector Graphic'),
        'image/photoshop' => array('name' => 'Photoshop Format Image'),
        'image/vnd.microsoft.icon' => array('name' => 'Windows Icon'),
        'application/ogg' => array('name' => 'Ogg Multimedia File'),
        'audio/wav' => array('name' => 'Waveform Audio File'),
        'audio/mid' => array('name' => 'MIDI-sequention Sound File'),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream (Layer III) File'),
        'video/mpeg' => array('name' => 'MPEG System Stream File'),
        'video/3gpp' => array('name' => '3GPP Multimedia File'),
        'video/quicktime' => array('name' => 'QuickTime Video Clip'),
        'video/avi' => array('name' => 'Audio Video Interleave File'),
        'application/x-shockwave-flash' => array('name' => 'Macromedia Flash File'),
        'application/pdf' => array('name' => 'Acrobat Portable Document File'),
        'application/winhlp' => array('name' => 'Windows Help File'),
        'application/msword' => array('name' => 'Word Document'),
        'application/msexcel' => array('name' => 'Excel Worksheet'),
        'application/mspowerpoint' => array('name' => 'PowerPoint Presentation'),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive'),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive'),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed Archive'),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed Archive'),
        'application/x-bzip' => array('name' => 'Bzip 2 UNIX Compressed Archive'),
        'application/x-gzip' => array('name' => 'Gzip Compressed Archive'),
        'application/x-tar' => array('name' => 'Tape Archive'),
        'application/java-archive' => array('name' => 'Java Archive'),
        'font/ttf' => array('name' => 'TrueType Font'),
        'font/otf' => array('name' => 'Open Type Font'),
        'text/plain' => array('name' => 'Text File'),
        'text/html' => array('name' => 'HyperText Markup Language File'),
        'application/xhtml+xml' => array('name' => 'Extensible HyperText Markup Language File'),
        'text/xml' => array('name' => 'Extensible Markup Language File'),
        'application/x-httpd-php' => array('name' => 'PHP Script'),
        'application/x-java-class' => array('name' => 'Java Bytecode'),
        'application/octet-stream' => array('name' => 'Executable File')
    );
    
    // Ident reference table
    // Format: array($byteOffset, $representation, $ident, $mime)
    // s => string, h => hexadecimal (lowercase, as produced by bin2hex()),
    // a trailing r (e.g. 'sr') => $ident is a regular expression pattern
    // Entries are tested in order and the first match wins.
    protected static $identRefTable = array(
        array(0, 'h', '504b0304140008000800', 'application/java-archive'),
        array(0, 'h', '89504e470d0a1a0a00', 'image/png'),
        array(0, 'h', '8a4d4e470d0a1a0a00', 'video/mng'),
        array(0, 'h', 'cafebabe', 'application/x-java-class'),
        array(0, 'h', '0001000000', 'font/ttf'),
        array(0, 'h', '4f54544f00', 'font/otf'),
        array(0, 'h', '4944330', 'audio/mpeg'),
        array(0, 'h', '000001b', 'video/mpeg'),
        array(0, 'h', '00000100', 'image/vnd.microsoft.icon'),
        array(0, 'h', '000000', 'video/3gpp'),
        array(0, 's', '8BPS', 'image/photoshop'),
        array(0, 's', 'MThd', 'audio/mid'),
        array(0, 's', 'OggS', 'application/ogg'),
        array(0, 's', '**ACE**', 'application/x-ace-compressed'),
        array(0, 's', 'Rar!', 'application/x-rar-compressed'),
        array(0, 's', 'PK', 'application/x-zip-compressed'),
        array(0, 's', 'BZh', 'application/x-bzip'),
        array(0, 'h', '1f8b08', 'application/x-gzip'),
        array(0, 's', '7z', 'application/x-7z-compressed'),
        array(257, 's', 'ustar', 'application/x-tar'),
        // RIFF container: four size bytes (any value) separate "RIFF" from the form type
        array(0, 'sr', 'RIFF....WAVE', 'audio/wav'),
        array(0, 'sr', 'RIFF....AVI', 'video/avi'),
        array(0, 's', 'GIF8', 'image/gif'),
        // Big-endian TIFF: "MM", a NUL byte, then "*" (0x4d4d002a)
        array(0, 's', "MM\x00*", 'image/tif'),
        array(0, 's', 'II*', 'image/tif'),
        array(0, 'h', 'ffd8', 'image/jpeg'),
        array(0, 'h', '424d', 'image/bmp'),
        array(0, 's', 'MZ', 'application/octet-stream'),
        array(0, 's', '?_', 'application/winhlp'),
        array(0, 's', '%PDF', 'application/pdf'),
        array(0, 's', 'FWS', 'application/x-shockwave-flash'),
        array(0, 'h', '6d', 'video/quicktime'),
        array(508, 'h', 'ffffffffeca5c100', 'application/msword'),
        array(508, 'h', 'fffffffffdffffff1f', 'application/msexcel'),
        array(508, 'h', 'fffffffffdffffffc3', 'application/mspowerpoint'),
        array(0, 'h', 'efbbbf', 'text/plain'), // UTF-8
        array(0, 'h', 'fffe', 'text/plain'), // UTF-16 LE
        array(0, 'h', 'feff', 'text/plain'), // UTF-16 BE
        array(0, 'h', 'fffe0000', 'text/plain'), // UTF-32 LE
        array(0, 'h', '0000feff', 'text/plain'),  // UTF-32 BE
        array(0, 'h', '00', 'image/tga'),
        array(0, 'h', 'ff', 'audio/mpeg'),
        array(0, 's', '<?php', 'application/x-httpd-php'),
        array(0, 's', '<!DOCTYPE HTML', 'text/html'),
        array(0, 's', '<!DOCTYPE html', 'text/html'),
        array(0, 's', '<!doctype html', 'text/html'),
        array(0, 's', '<HTML', 'text/html'),
        array(0, 's', '<html', 'text/html'),
        array(0, 's', '<?xml', 'text/xml')
    );
 
    // Extension reference table
    // Format: $extension => $mime
    protected static $extRefTable = array(
        'jpg' => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'jpe' => 'image/jpeg',
        'png' => 'image/png',
        'mng' => 'video/mng',
        'gif' => 'image/gif',
        'tga' => 'image/tga',
        'tif' => 'image/tif',
        'bmp' => 'image/bmp',
        'ico' => 'image/vnd.microsoft.icon',
        'psd' => 'image/photoshop',
        'avi' => 'video/avi',
        'wav' => 'audio/wav',
        'mid' => 'audio/mid',
        'midi' => 'audio/mid',
        'mp3' => 'audio/mpeg',
        'mpg' => 'video/mpeg',
        'mpeg' => 'video/mpeg',
        'ogg' => 'application/ogg',
        'ogm' => 'application/ogg',
        'ogv' => 'application/ogg',
        'oga' => 'application/ogg',
        '3gp' => 'video/3gpp',
        '3g2' => 'video/3gpp',
        'mov' => 'video/quicktime',
        'swf' => 'application/x-shockwave-flash',
        'zip' => 'application/x-zip-compressed',
        'rar' => 'application/x-rar-compressed',
        'r01' => 'application/x-rar-compressed',
        'ace' => 'application/x-ace-compressed',
        '7z' => 'application/x-7z-compressed',
        'jar' => 'application/java-archive',
        'bz2' => 'application/x-bzip',
        'tbz2' => 'application/x-bzip',
        'tb2' => 'application/x-bzip',
        'gz' => 'application/x-gzip',
        'tar' => 'application/x-tar',
        'exe' => 'application/octet-stream',
        'com' => 'application/octet-stream',
        'dll' => 'application/octet-stream',
        'pdf' => 'application/pdf',
        'doc' => 'application/msword',
        'xls' => 'application/msexcel',
        'ppt' => 'application/mspowerpoint',
        'ttf' => 'font/ttf',
        'otf' => 'font/otf',
        'htm' => 'text/html',
        'html' => 'text/html',
        'xhtml' => 'text/html',
        'xht' => 'text/html',
        'xml' => 'text/xml',
        'svg' => 'image/svg+xml',
        'php' => 'application/x-httpd-php',
        'class' => 'application/x-java-class',
        'txt' => 'text/plain',
        'log' => 'text/plain',
        'msg' => 'text/plain',
        'rtf' => 'text/plain',
        'nfo' => 'text/plain'
    );
    
    // Maximum number of bytes to read from the file starting at offset 0
    protected static $maxLength;
 
    /**
     * Computes how many bytes must be read from the start of a file so that
     * every entry in $identRefTable (offset + signature length) is covered,
     * and stores the result in self::$maxLength.
     */
    protected static function determineMaxLenght() {
        $maxLength = 0;
        foreach (self::$identRefTable as $ident) {
            $len = strlen($ident[2]);
            // Two hex digits describe one byte; round up for odd-length idents
            if ($ident[1] === 'h') $len = (int) ceil($len / 2);
            $len += $ident[0];
            if ($len > $maxLength) $maxLength = $len;
        }
        self::$maxLength = $maxLength;
    }
 
    /**
     * Attempts to figure out the file format by looking at the first bytes of the file.
     *
     * @param string $target Path of the file to inspect
     * @return string The detected mime type, or '' when no signature matches
     * @throws Exception If $target is not a file, cannot be opened or cannot be read
     */
    public static function fromHeader($target) {
 
        // Determine max length to read from file if it hasn't been done already
        if (!isset(self::$maxLength)) self::determineMaxLenght();
 
        // Make sure the target is a file we can work with before opening it
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");
        // Binary mode: the data must not be altered by newline translation
        $handle = @fopen($target, 'rb');
        if ($handle === false) throw new Exception("Could not open target file '$target'.");
        
        // Read data from target file; close the handle before any error is raised
        $bin = fread($handle, self::$maxLength);
        fclose($handle);
        if ($bin === false) throw new Exception("Unable to read data from '$target'.");
 
        $len = strlen($bin);
 
        // Convert the header to a hexadecimal representation to work with
        $hex = bin2hex($bin);
 
        // Compare data with each entry in the ident table
        foreach (self::$identRefTable as $ident) {
            list($offset, $type, $pattern, $mime) = $ident;

            // Skip current if the data read doesn't reach the ident's offset
            if ($offset >= $len) continue;

            // Compare by string or hexadecimal representation?
            if ($type[0] === 's') $cmp = substr($bin, $offset, strlen($pattern));
            else $cmp = substr($hex, $offset * 2, strlen($pattern));

            if (isset($type[1])) {
                // Regex ident. The s modifier lets "." match any byte,
                // including 0x0a, which may well occur in binary data.
                $matched = preg_match('~^' . $pattern . '~s', $cmp) === 1;
            } else {
                // Strict comparison: a loose == would compare "numeric
                // looking" hex strings as numbers ("0000e0" == "000000").
                $matched = ($cmp === $pattern);
            }

            // Return the mime type associated with the ident if we have a match
            if ($matched) return $mime;
        }
        
        // No match found in table
        return '';
 
    }
 
    /**
     * Tries to guess the file format from the target's file extension.
     *
     * @param string $target File name or path
     * @return string The mime type, or '' if the extension is missing or unknown
     */
    public static function fromExtension($target) {
        $ext = self::extractExtension($target);
        if (strlen($ext) < 1 || !isset(self::$extRefTable[$ext]))
            return '';
        return self::$extRefTable[$ext];
    }
 
    /**
     * Extracts the (lowercased) file extension from the target path.
     *
     * @param string $target File name or path
     * @return string The extension without the dot, or '' if there is none
     */
    public static function extractExtension($target) {
        if (!preg_match('~\.([A-Za-z0-9_]+)$~', $target, $targetParts))
            return '';
        return strtolower($targetParts[1]);
    }
 
    /**
     * Returns info (from the file format information table) for the specified mime type.
     *
     * @param string $mime Mime type to look up
     * @return array array('name' => ...), or an empty array if the mime type is unknown
     */
    public static function infoFor($mime) {
        if (!isset(self::$formatInfoTable[$mime])) return array();
        return self::$formatInfoTable[$mime];
    }
 
    /**
     * Returns the possible extensions for the specified mime type.
     *
     * @param string $mime Mime type to look up
     * @return array List of extensions (empty if none are registered)
     */
    public static function extensionsFor($mime) {
        return array_keys(self::$extRefTable, $mime, true);
    }
 
}

Re: File format (mime type) identifying

Posted: Wed Aug 08, 2012 5:50 pm
by cfallen
All I care about is making sure that a file is an archive type of file. But I wanted to not make so many iterations thru the loop of potential files, and only read the specific portion of data from the file where the "magic" is supposed to be, instead of reading from the beginning of every file to the longest offset of all possible filetypes. For this I used fseek().

Code: Select all

<?
/**
 * Detects whether a file is an archive, either from the "magic" bytes found
 * at known offsets in the file or from the file name extension.
 *
 * Instead of reading one large header, fromHeader() seeks to the offset of
 * each signature and reads only as many bytes as that signature needs.
 */
class DetectArchiveFile {
   
    // Ident reference table
    // Format: array($byteOffset, $representation, $ident, $mime)
    // s => string, h => hexadecimal, a trailing r (e.g. 'sr') => regular expression pattern
    // Entries are tested in order and the first match wins, so an entry whose
    // signature is already covered by an earlier one is never reported.
    public $identRefTable = array(
        array(0,  's',   'PK',                   'application/x-zip-compressed'),
        array(0,  'h',   '1f8b08',               'application/x-gzip'),
        array(0,  'h',   '78',                   'application/x-apple-diskimage'),
        array(0,  's',   'BZh',                  'application/x-bzip'),
        array(0,  's',   '7z',                   'application/x-7z-compressed'),
        array(0,  'h',   '526172211a0700',       'application/x-rar-compressed'),
        array(257,'s',   'ustar',                'application/x-tar'),
        array(0,  's',   'PK',                   'application/x-zip'),
        array(0,  'h',   '504b0304',             'application/zip'),
        array(29152,'h',   '57696e5a6970',       'application/zip'),
    );
 
    // Extension reference table
    // Format: $extension => $mime
    public $extRefTable = array(
        '7z'  => 'application/x-7z-compressed',
        'bz2' => 'application/x-bzip',
        'dmg' => 'application/x-apple-diskimage',
        'gz'  => 'application/x-gzip',
        'jar' => 'application/java-archive',
        'rar' => 'application/x-rar-compressed',
        'tar' => 'application/x-tar',
        'tbz' => 'application/x-bzip',
        'tgz' => 'application/x-tar',
        'zip' => 'application/x-zip-compressed',
    );
   
    public function __construct() { }
 
    /**
     * Attempts to figure out the file format by looking at the "magic" spots in the file.
     *
     * @param string $target Path of the file to inspect
     * @return string The detected mime type, or 'NOT FOUND' when no signature matches
     * @throws Exception If $target is not a file, cannot be opened or cannot be read
     */
    public function fromHeader($target) 
    {
        if (!is_file($target)) 
            throw new Exception("Invalid file: $target");

        // Binary mode: the data must not be altered by newline translation
        $fp = @fopen($target, 'rb');
        if ($fp === false) 
            throw new Exception("Could not open target file: $target");
       
        $retval = 'NOT FOUND';
        
        // Go through the magic list and see if data in this file matches
        foreach ($this->identRefTable as $ident) 
        {
            $isHex = ($ident[1] === 'h');
            // bin2hex() yields lowercase digits, so normalise hex idents;
            // otherwise an upper-case ident could never match.
            $pattern = $isHex ? strtolower($ident[2]) : $ident[2];

            // How many bytes should be read in? (two hex digits per byte)
            $len = strlen($pattern);
            if ($isHex) 
                $len = (int) ceil($len / 2);
            if ($len < 1) 
                continue; // nothing to compare against

            // Set file position. Usually 0, but 257 for tar and WinZip has a long one.
            if (fseek($fp, $ident[0]) === -1) 
                continue; // check the next magic pattern against this file

            // Read the magic from its offset in the file through its length
            $magic = fread($fp, $len);
            if ($magic === false) 
            {
                fclose($fp); // don't leak the handle on the error path
                throw new Exception("Unable to read data from: $target");
            }

            if ($isHex) 
                $magic = bin2hex($magic);

            if (isset($ident[1][1])) 
            {
                // Regex ident (second character in the representation, 'r').
                // The s modifier lets "." match any byte, including 0x0a.
                $matched = preg_match('/^'. $ident[2]. '/s', $magic) === 1;
            }
            else 
            {
                // Byte-wise prefix comparison (never a loose, numeric ==)
                $matched = strncmp($magic, $pattern, strlen($pattern)) === 0;
            }

            // Remember the mime type associated with the ident if we have a match
            if ($matched) 
            {
                $retval = $ident[3];
                break;
            }
        }

        fclose($fp);
       
        // 'NOT FOUND' if no entry in the table matched
        return $retval;
 
    }
 
    /**
     * Tries to guess the file format from the target's file extension.
     *
     * @param string $target File name or path
     * @return string The mime type, or '' if the extension is missing or unknown
     */
    public function fromExtension($target) {
        // pathinfo() yields '' for names without a dot, so a file that is
        // merely called "zip" is not mistaken for a zip archive.
        $ext = strtolower( pathinfo( $target, PATHINFO_EXTENSION ) );
        if (strlen($ext) < 1 || !isset($this->extRefTable[$ext]))
            return '';
        return $this->extRefTable[$ext];
    }
 
}

$FFI = new DetectArchiveFile;

// The file to inspect is taken from the first command-line argument
if (!isset($_SERVER['argv'][1])) {
    fwrite(STDERR, "Usage: php " . basename(__FILE__) . " <file>\n");
    exit(1);
}
$file = $_SERVER['argv'][1];


// Determine mime type from file header; fromHeader() throws when the file
// is missing or unreadable, so report that instead of dying with a trace
try {
    $format = $FFI->fromHeader($file);
} catch (Exception $e) {
    fwrite(STDERR, $e->getMessage() . "\n");
    exit(1);
}

echo "Mime type for $file from MAGIC bytes: $format\n";