File format (mime type) identifying

Coding Critique is the place to post source code for peer review by other members of DevNetwork. Any kind of code can be posted. Code posted does not have to be limited to PHP. All members are invited to contribute constructive criticism with the goal of improving the code. Posted code should include some background information about it and what areas you specifically would like help with.

Popular code excerpts may be moved to "Code Snippets" by the moderators.

Moderator: General Moderators

Post Reply
User avatar
vigge89
Forum Regular
Posts: 875
Joined: Wed Jul 30, 2003 3:29 am
Location: Sweden

File format (mime type) identifying

Post by vigge89 »

I was bored and figured I'd try hammering together a class which could be used to figure out what format/type a file was (mime type more specifically, idea from this post). The following was thrown together in an hour and is far from perfect as most of the time was spent browsing the internets (mostly http://filext.com/) for info on each file type.
PHP5+ only

Code: Select all

/**
 * Identifies a file's format (MIME type) from its leading "magic" bytes.
 *
 * The first bytes of the file are converted to a lowercase hexadecimal string
 * and compared against the prefixes listed in $identRefTable. The first prefix
 * that matches decides the MIME type.
 */
class FileFormatIdentifier {

    // File format table
    // Format: $mime => array(name => $formatName, ext => array($extensions...))
    protected $formatTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image', 'ext' => array('jpg', 'jpeg', 'jpe')),
        'image/png' => array('name' => 'Portable (Public) Network Graphic', 'ext' => array('png')),
        'image/gif' => array('name' => 'Graphic Interchange Format', 'ext' => array('gif')),
        'image/tga' => array('name' => 'Truevision Targa Graphic', 'ext' => array('tga')),
        'image/tif' => array('name' => 'Tagged Image Format File', 'ext' => array('tif')),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic', 'ext' => array('bmp')),
        'image/photoshop' => array('name' => 'Photoshop Format', 'ext' => array('psd')),
        'application/msword' => array('name' => 'Word Document', 'ext' => array('doc')),
        'application/msexcel' => array('name' => 'Excel Worksheet', 'ext' => array('xls')),
        'video/avi' => array('name' => 'Audio Video Interleave File', 'ext' => array('avi')),
        'audio/wav' => array('name' => 'Waveform Audio', 'ext' => array('wav')),
        'audio/mid' => array('name' => 'MIDI-sequention Sound', 'ext' => array('mid', 'midi')),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream, Layer III', 'ext' => array('mp3')),
        'video/mpeg' => array('name' => 'MPEG 1 System Stream', 'ext' => array('mpg', 'mpeg')),
        'video/quicktime' => array('name' => 'QuickTime Video Clip', 'ext' => array('mov')),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive File', 'ext' => array('zip')),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive', 'ext' => array('rar', 'r01')),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed File', 'ext' => array('ace')),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed File', 'ext' => array('7z')),
        'font/ttf' => array('name' => 'TrueType Font', 'ext' => array('ttf')),
        'font/otf' => array('name' => 'Open Type Font Format', 'ext' => array('otf')) // No idea about mime type
    );

    // Ident reference table
    // Format: $hexPrefix => $mime (lowercase hex, the form bin2hex() produces)
    // Entries are tried top to bottom and the first match wins, so longer
    // (more specific) prefixes must be listed before shorter ones.
    // Every mime type used here must also exist in $formatTable.
    protected $identRefTable = array(
        '89504e470d0a1a0a0000000d49484452' => 'image/png',
        '38425053000100000000000000' => 'image/photoshop',
        '4d54686400000006000100' => 'audio/mid',
        'd0cf11e0a1b11ae100' => 'application/msexcel',
        'd0cf11e0a1b11ae1' => 'application/msword',
        '526172211a0700' => 'application/x-rar-compressed',
        '2a2a4143452a2a' => 'application/x-ace-compressed',
        '377abcaf271c' => 'application/x-7z-compressed',
        '0001000000' => 'font/ttf',
        '4f54544f00' => 'font/otf',
        '504b0304' => 'application/x-zip-compressed',
        // '52494646' ("RIFF") is also the prefix of audio/wav files; a key can
        // only map to one mime type, so RIFF files are reported as video/avi.
        '52494646' => 'video/avi',
        '47494638' => 'image/gif',
        '49492a00' => 'image/tif',
        '4d4d002a' => 'image/tif',
        '4944330' => 'audio/mpeg',
        '000001' => 'video/mpeg',
        'ffd8ff' => 'image/jpeg',
        '424d' => 'image/bmp',
        '6d' => 'video/quicktime',
        '00' => 'image/tga',
        'ff' => 'audio/mpeg'
    );

    // Number of bytes to read from the start of a file
    protected $maxLength;

    /**
     * Works out how many bytes have to be read so that the longest signature
     * in $identRefTable can be compared.
     */
    public function __construct() {
        $maxLength = 0;
        foreach (array_keys($this->identRefTable) as $ident) {
            // PHP stores all-digit keys such as '47494638' as integers, hence
            // the cast. Two hex digits describe one byte; odd-length prefixes
            // (a trailing nibble) are rounded up.
            $len = (int) ceil(strlen((string) $ident) / 2);
            if ($len > $maxLength) $maxLength = $len;
        }
        $this->maxLength = $maxLength;
    }

    /**
     * Returns the $formatTable entry for a mime type.
     *
     * @param string $mime
     * @return array|null array('name' => ..., 'ext' => array(...)), or null if the mime type is unknown
     */
    public function infoFor($mime) {
        if (!isset($this->formatTable[$mime])) return null;
        return $this->formatTable[$mime];
    }

    /**
     * Identifies a file by its header.
     *
     * @param string $target     Path of the file to inspect
     * @param string $returnType 'mime' (default), 'name'/'format', 'ext', 'exts' or '*'
     * @return mixed The requested piece of information, or '' when no signature matches
     * @throws Exception If $target is not a regular file or cannot be read
     */
    public function identify($target, $returnType = 'mime') {

        // Make sure the target is a file we can work with
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");

        // Read the file header (the first $this->maxLength bytes)
        $header = file_get_contents($target, false, null, 0, $this->maxLength);
        if ($header === false) throw new Exception("Failed to get contents of '$target'.");

        // Work on the hexadecimal representation of the header
        $header = bin2hex($header);

        foreach ($this->identRefTable as $ident => $mime) {
            // Cast because all-digit keys come back as integers. strncmp()
            // compares as plain strings, which avoids the numeric-string
            // juggling of "==" (e.g. '0001e0' == '000001' is true).
            $ident = (string) $ident;
            if (strncmp($header, $ident, strlen($ident)) === 0) {
                return $this->describe($mime, $returnType);
            }
        }

        // No match found in table
        return '';

    }

    /**
     * Maps a matched mime type to the piece of information the caller asked for.
     *
     * @param string $mime       A mime type present in $formatTable
     * @param string $returnType See identify()
     * @return mixed
     */
    protected function describe($mime, $returnType) {
        switch ($returnType) {
            case 'format':
            case 'name':
                return $this->formatTable[$mime]['name'];
            case 'ext':
                return $this->formatTable[$mime]['ext'][0];
            case 'exts':
                return $this->formatTable[$mime]['ext'];
            case '*':
                return $this->formatTable[$mime];
            case 'mime':
            default:
                return $mime;
        }
    }

}
Usage:

Code: Select all

// Usage example for FileFormatIdentifier; expects a file named test.gif in the working directory
$FFI = new FileFormatIdentifier;

// Determine mime type from file header
echo "Mime type: ". $FFI->identify('test.gif') ."<br />";
// Same as above but return name of the format
echo "Format name: ". $FFI->identify('test.gif', 'format') ."<br />";

echo "<pre>";
// ... return an associative array with format name & possible extensions
var_dump($FFI->identify('test.gif', '*'));
// Return an associative array (like above) with info for the specified mime type
// (variable names are case-sensitive, so this must be $FFI, not $FFi)
var_dump($FFI->infoFor('image/jpeg'));
echo "</pre>";
Output:

Code: Select all

Mime type: image/jpeg
Format name: JPEG/JIFF Image
 
array(2) {
  ["name"]=>
  string(15) "JPEG/JIFF Image"
  ["ext"]=>
  array(3) {
    [0]=>
    string(3) "jpg"
    [1]=>
    string(4) "jpeg"
    [2]=>
    string(3) "jpe"
  }
}
array(2) {
  ["name"]=>
  string(15) "JPEG/JIFF Image"
  ["ext"]=>
  array(3) {
    [0]=>
    string(3) "jpg"
    [1]=>
    string(4) "jpeg"
    [2]=>
    string(3) "jpe"
  }
}
The code is pretty much self-explanatory: what identify() does is read the first few bytes of the file, convert them to a hexadecimal representation and then look that up in the identifier table. If a match is found, the requested info is returned from the format table. There is some uncertainty regarding avi/wav, as those formats are just containers for others. I'm not really sure how to distinguish them without digging deeper into the file, something I'm trying to stay away from. I was also hoping to be able to determine whether a file is binary or not, but as far as I know that would require checking for a NUL character, which could be pretty time-consuming.

Like I wrote before, this is a pretty hackish attempt so suggestions for improvements are most welcome.

Update #1: Changed class method determine() to identify().
User avatar
Ambush Commander
DevNet Master
Posts: 3698
Joined: Mon Oct 25, 2004 9:29 pm
Location: New Jersey, US

Post by Ambush Commander »

This feels like duplicated work to me. There are databases out there of magic byte sequences => file types, so it would seem to me that a more efficient use of time would be parsing that format, and maybe compiling $formatTable.
User avatar
vigge89
Forum Regular
Posts: 875
Joined: Wed Jul 30, 2003 3:29 am
Location: Sweden

Post by vigge89 »

I'm sure there are, but the problem was that I couldn't find anything already available for PHP as a plain script easily readable. If you've got any links I'd gladly take a look at them though.
User avatar
Kieran Huggins
DevNet Master
Posts: 3635
Joined: Wed Dec 06, 2006 4:14 pm
Location: Toronto, Canada
Contact:

Post by Kieran Huggins »

I think this class is a welcome scratch. Mime detection in PHP has always been extremely poor and/or difficult to implement.

There's a DB of magic bytes here: http://magicdb.org - but it looks like a dead-ish project to me. the DB itself could be culled into your class at any rate!
User avatar
vigge89
Forum Regular
Posts: 875
Joined: Wed Jul 30, 2003 3:29 am
Location: Sweden

Post by vigge89 »

I'm currently looking through the magic file provided in the apache package for some more common formats. The magicdb page was nice enough to provide some example files to test against, cheers for that ;)
An update is coming up soon, decided to allow both strings and hex representations in the ident table.
User avatar
vigge89
Forum Regular
Posts: 875
Joined: Wed Jul 30, 2003 3:29 am
Location: Sweden

Post by vigge89 »

Updated:

Code: Select all

/**
 * Identifies a file's format (MIME type) from its leading "magic" bytes.
 *
 * Signatures in $identRefTable are given either as plain strings or as
 * hexadecimal strings and are compared against the start of the file. The
 * first entry that matches decides the MIME type.
 */
class FileFormatIdentifier {

    // File format table
    // Format: $mime => array(name => $formatName, ext => array($extensions...))
    protected $formatTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image', 'ext' => array('jpg', 'jpeg', 'jpe')),
        'image/png' => array('name' => 'Portable (Public) Network Graphic', 'ext' => array('png')),
        'video/mng' => array('name' => 'Multi-image Network Graphic Animation', 'ext' => array('mng')),
        'image/gif' => array('name' => 'Graphic Interchange Format', 'ext' => array('gif')),
        'image/tga' => array('name' => 'Truevision Targa Graphic', 'ext' => array('tga')),
        'image/tif' => array('name' => 'Tagged Image Format File', 'ext' => array('tif')),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic', 'ext' => array('bmp')),
        'image/photoshop' => array('name' => 'Photoshop Format Image', 'ext' => array('psd')),
        'application/msword' => array('name' => 'Word Document', 'ext' => array('doc')),
        'application/msexcel' => array('name' => 'Excel Worksheet', 'ext' => array('xls')),
        'video/avi' => array('name' => 'Audio Video Interleave File', 'ext' => array('avi')),
        'audio/wav' => array('name' => 'Waveform Audio', 'ext' => array('wav')),
        'audio/mid' => array('name' => 'MIDI-sequention Sound', 'ext' => array('mid', 'midi')),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream, Layer III', 'ext' => array('mp3')),
        'video/mpeg' => array('name' => 'MPEG 1 System Stream', 'ext' => array('mpg', 'mpeg')),
        'video/quicktime' => array('name' => 'QuickTime Video Clip', 'ext' => array('mov')),
        'application/x-shockwave-flash' => array('name' => 'Macromedia Flash Format File', 'ext' => array('swf')),
        'application/pdf' => array('name' => 'Acrobat Portable Document Format', 'ext' => array('pdf')),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive File', 'ext' => array('zip')),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive', 'ext' => array('rar', 'r01')),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed File', 'ext' => array('ace')),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed File', 'ext' => array('7z')),
        'application/x-bzip' => array('name' => 'Bzip 2 UNIX Compressed File', 'ext' => array('bz2', 'tbz2', 'tb2')),
        'application/x-gzip' => array('name' => 'Gzip Compressed Archive', 'ext' => array('gz')),
        'application/x-tar' => array('name' => 'Tape Archive File', 'ext' => array('tar')),
        'font/ttf' => array('name' => 'TrueType Font', 'ext' => array('ttf')),
        'font/otf' => array('name' => 'Open Type Font Format', 'ext' => array('otf')), // No idea about mime type
        'text/html' => array('name' => 'HyperText Markup Language', 'ext' => array('htm', 'html')),
        'text/xml' => array('name' => 'Extensible Markup Language File', 'ext' => array('xml'))
    );

    // Ident reference table
    // Format: array($representation, $ident, $mime)
    // 's' => string (compared literally, no wildcards), 'h' => lowercase hexadecimal
    // Entries are tried top to bottom and the first match wins.
    // Every mime type used here must also exist in $formatTable.
    protected $identRefTable = array(
        array('h', 'd0cf11e0a1b11ae100', 'application/msword'),
        // The same signature is used by application/msexcel
        array('h', '89504e470d0a1a0a00', 'image/png'),
        array('h', '8a4d4e470d0a1a0a00', 'video/mng'),
        array('h', '0001000000', 'font/ttf'),
        array('h', '4f54544f00', 'font/otf'),
        array('h', 'ffd8ff', 'image/jpeg'),
        array('h', '4944330', 'audio/mpeg'),
        array('h', '000001', 'video/mpeg'),
        array('s', '8BPS', 'image/photoshop'),
        array('s', 'MThd', 'audio/mid'),
        array('s', '**ACE**', 'application/x-ace-compressed'),
        array('s', 'Rar!', 'application/x-rar-compressed'),
        array('s', 'PK', 'application/x-zip-compressed'),
        array('s', 'BZh', 'application/x-bzip'),
        array('h', '1f8b08', 'application/x-gzip'),
        array('s', '7z', 'application/x-7z-compressed'),
        // application/x-tar is not listed: its 'ustar' ident sits at byte
        // position 257 and this table only supports idents at offset 0.
        // 'RIFF' is also the prefix of audio/wav files
        array('s', 'RIFF', 'video/avi'),
        array('s', 'GIF8', 'image/gif'),
        // Big-endian TIFF is "MM", a NUL byte, then "*"; written as hex
        // because the NUL cannot be expressed as the literal string 'MM.*'
        array('h', '4d4d002a', 'image/tif'),
        array('s', 'II*', 'image/tif'),
        array('h', '424d', 'image/bmp'),
        array('s', '%PDF', 'application/pdf'),
        array('s', 'FWS', 'application/x-shockwave-flash'),
        array('h', '6d', 'video/quicktime'),
        array('h', '00', 'image/tga'),
        array('h', 'ff', 'audio/mpeg'),
        array('s', '<!DOCTYPE HTML', 'text/html'),
        array('s', '<!doctype html', 'text/html'),
        array('s', '<HTML', 'text/html'),
        array('s', '<html', 'text/html'),
        array('s', '<?xml', 'text/xml')
    );

    // Number of bytes to read from the start of a file
    protected $maxLength;

    /**
     * Works out how many bytes have to be read so that the longest ident in
     * $identRefTable can be compared.
     */
    public function __construct() {
        $maxLength = 0;
        foreach ($this->identRefTable as $ident) {
            $len = strlen($ident[1]);
            // Two hex digits describe one byte; odd-length idents are rounded up
            if ($ident[0] === 'h') $len = (int) ceil($len / 2);
            if ($len > $maxLength) $maxLength = $len;
        }
        $this->maxLength = $maxLength;
    }

    /**
     * Returns the $formatTable entry for a mime type.
     *
     * @param string $mime
     * @return array|null array('name' => ..., 'ext' => array(...)), or null if the mime type is unknown
     */
    public function infoFor($mime) {
        if (!isset($this->formatTable[$mime])) return null;
        return $this->formatTable[$mime];
    }

    /**
     * Identifies a file by its header.
     *
     * @param string $target     Path of the file to inspect
     * @param string $returnType 'mime' (default), 'name'/'format', 'ext', 'exts' or '*'
     * @return mixed The requested piece of information, or '' when no ident matches
     * @throws Exception If $target is not a regular file or cannot be read
     */
    public function identify($target, $returnType = 'mime') {

        // Make sure the target is a file we can work with
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");

        // Read the file header (the first $this->maxLength bytes)
        $bin = file_get_contents($target, false, null, 0, $this->maxLength);
        if ($bin === false) throw new Exception("Failed to get contents of '$target'.");

        // Hexadecimal representation for the 'h' entries
        $hex = bin2hex($bin);

        foreach ($this->identRefTable as $ident) {
            list($representation, $needle, $mime) = $ident;
            $haystack = ($representation === 's') ? $bin : $hex;
            // strncmp() compares as plain strings; "==" would treat hex
            // strings that look numeric (e.g. '0001e0') as numbers.
            if (strncmp($haystack, $needle, strlen($needle)) === 0) {
                return $this->describe($mime, $returnType);
            }
        }

        // No match found in table
        return '';

    }

    /**
     * Maps a matched mime type to the piece of information the caller asked for.
     *
     * @param string $mime       A mime type present in $formatTable
     * @param string $returnType See identify()
     * @return mixed
     */
    protected function describe($mime, $returnType) {
        switch ($returnType) {
            case 'format':
            case 'name':
                return $this->formatTable[$mime]['name'];
            case 'ext':
                return $this->formatTable[$mime]['ext'][0];
            case 'exts':
                return $this->formatTable[$mime]['ext'];
            case '*':
                return $this->formatTable[$mime];
            case 'mime':
            default:
                return $mime;
        }
    }

}
The Tar format was kinda problematic as the identifier is located at the 257th byte, so I scrapped it for now.
Last edited by vigge89 on Fri Jan 18, 2008 9:50 am, edited 1 time in total.
User avatar
s.dot
Tranquility In Moderation
Posts: 5001
Joined: Sun Feb 06, 2005 7:18 pm
Location: Indiana

Post by s.dot »

vigge89 wrote:The Tar format was kinda problematic as the identifier is located at the 257th byte, so I scrapped it for now.
fread() or fgetc() :)
Set Search Time - A google chrome extension. When you search only results from the past year (or set time period) are displayed. Helps tremendously when using new technologies to avoid outdated results.
User avatar
Ambush Commander
DevNet Master
Posts: 3698
Joined: Mon Oct 25, 2004 9:29 pm
Location: New Jersey, US

Post by Ambush Commander »

Agreed. While file_get_contents is the most efficient way to read the entire file to a string, the small number of leading bytes you'll need works better with fopen() and its kin.
User avatar
vigge89
Forum Regular
Posts: 875
Joined: Wed Jul 30, 2003 3:29 am
Location: Sweden

Post by vigge89 »

I was unsure about which method to use and spent quite some time trying to decide but ended up going for the simpler solution thinking I could revamp it later. Now that you've said it I guess I'll switch to the old school way for the next update ;)
User avatar
vigge89
Forum Regular
Posts: 875
Joined: Wed Jul 30, 2003 3:29 am
Location: Sweden

Post by vigge89 »

New version coming up...
Changes:
+ Turned the class static as there isn't much use for it to be instantiated (correct me if I'm wrong)
+ Byte offset functionality
+ Regex matching (needed to distinguish AVI from WAV) - rather hackish at the moment (any suggestions?)
+ Ability to look up format from extension
+ Storing extensions in a separate table
+ More formats added to table

I'm still somewhat uncertain regarding how to read the magic bytes from the files. As of now it will read from the start of the file to $maxLength, which is initially set by determineMaxLenght() to the length of the longest ident in $identRefTable. The data is then compared to each entry and if a match is found the mime type associated with it is returned. I'm sure there's a faster/better way to do it, but how?

Input and suggestions are more than welcome!

For the mods: Is it preferred to post larger blocks of code on an external site like pastebin.ca or should I keep it in the thread?

Edit: Found some useful links for anyone interested:
Common files signature table
Linux Magic Numbers (a lot more than what my apache2 magic file contained)

Code: Select all

<?php
 
/**
 * Static helper that guesses a file's MIME type either from "magic" bytes in
 * the file itself (fromHeader) or from the file name's extension
 * (fromExtension).
 */
class FileFormatIdentifier {

    // File format information table
    // Format: $mime => array(name => $formatName)
    protected static $formatInfoTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image'),
        'image/png' => array('name' => 'Portable (Public) Network Graphic'),
        'video/mng' => array('name' => 'Multi-image Network Graphic Animation'),
        'image/gif' => array('name' => 'Graphic Interchange Format Image'),
        'image/tga' => array('name' => 'Truevision Targa Graphic'),
        'image/tif' => array('name' => 'Tagged Image Format File'),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic'),
        'image/svg+xml' => array('name' => 'Scalable Vector Graphic'),
        'image/photoshop' => array('name' => 'Photoshop Format Image'),
        'image/vnd.microsoft.icon' => array('name' => 'Windows Icon'),
        'application/ogg' => array('name' => 'Ogg Multimedia File'),
        'audio/wav' => array('name' => 'Waveform Audio File'),
        'audio/mid' => array('name' => 'MIDI-sequention Sound File'),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream (Layer III) File'),
        'video/mpeg' => array('name' => 'MPEG System Stream File'),
        'video/3gpp' => array('name' => '3GPP Multimedia File'),
        'video/quicktime' => array('name' => 'QuickTime Video Clip'),
        'video/avi' => array('name' => 'Audio Video Interleave File'),
        'application/x-shockwave-flash' => array('name' => 'Macromedia Flash File'),
        'application/pdf' => array('name' => 'Acrobat Portable Document File'),
        'application/winhlp' => array('name' => 'Windows Help File'),
        'application/msword' => array('name' => 'Word Document'),
        'application/msexcel' => array('name' => 'Excel Worksheet'),
        'application/mspowerpoint' => array('name' => 'PowerPoint Presentation'),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive'),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive'),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed Archive'),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed Archive'),
        'application/x-bzip' => array('name' => 'Bzip 2 UNIX Compressed Archive'),
        'application/x-gzip' => array('name' => 'Gzip Compressed Archive'),
        'application/x-tar' => array('name' => 'Tape Archive'),
        'application/java-archive' => array('name' => 'Java Archive'),
        'font/ttf' => array('name' => 'TrueType Font'),
        'font/otf' => array('name' => 'Open Type Font'),
        'text/plain' => array('name' => 'Text File'),
        'text/html' => array('name' => 'HyperText Markup Language File'),
        'application/xhtml+xml' => array('name' => 'Extensible HyperText Markup Language File'),
        'text/xml' => array('name' => 'Extensible Markup Language File'),
        'application/x-httpd-php' => array('name' => 'PHP Script'),
        'application/x-java-class' => array('name' => 'Java Bytecode'),
        'application/octet-stream' => array('name' => 'Executable File')
    );

    // Ident reference table
    // Format: array($byteOffset, $representation, $ident, $mime)
    // s => string (compared literally), h => lowercase hexadecimal,
    // sr => string matched as a regular expression ('.' matches any byte)
    // Entries are tried top to bottom and the first match wins.
    protected static $identRefTable = array(
        array(0, 'h', '504b0304140008000800', 'application/java-archive'),
        array(0, 'h', '89504e470d0a1a0a00', 'image/png'),
        array(0, 'h', '8a4d4e470d0a1a0a00', 'video/mng'),
        array(0, 'h', 'cafebabe', 'application/x-java-class'),
        array(0, 'h', '0001000000', 'font/ttf'),
        array(0, 'h', '4f54544f00', 'font/otf'),
        array(0, 'h', '4944330', 'audio/mpeg'),
        array(0, 'h', '000001b', 'video/mpeg'),
        array(0, 'h', '00000100', 'image/vnd.microsoft.icon'),
        array(0, 'h', '000000', 'video/3gpp'),
        array(0, 's', '8BPS', 'image/photoshop'),
        array(0, 's', 'MThd', 'audio/mid'),
        array(0, 's', 'OggS', 'application/ogg'),
        array(0, 's', '**ACE**', 'application/x-ace-compressed'),
        array(0, 's', 'Rar!', 'application/x-rar-compressed'),
        array(0, 's', 'PK', 'application/x-zip-compressed'),
        array(0, 's', 'BZh', 'application/x-bzip'),
        array(0, 'h', '1f8b08', 'application/x-gzip'),
        array(0, 's', '7z', 'application/x-7z-compressed'),
        array(257, 's', 'ustar', 'application/x-tar'),
        array(0, 'sr', 'RIFF....WAVE', 'audio/wav'),
        array(0, 'sr', 'RIFF....AVI', 'video/avi'),
        array(0, 's', 'GIF8', 'image/gif'),
        // Big-endian TIFF is "MM", a NUL byte, then "*"; written as hex
        // because a plain 's' entry 'MM.*' would be compared literally
        array(0, 'h', '4d4d002a', 'image/tif'),
        array(0, 's', 'II*', 'image/tif'),
        array(0, 'h', 'ffd8', 'image/jpeg'),
        array(0, 'h', '424d', 'image/bmp'),
        array(0, 's', 'MZ', 'application/octet-stream'),
        array(0, 's', '?_', 'application/winhlp'),
        array(0, 's', '%PDF', 'application/pdf'),
        array(0, 's', 'FWS', 'application/x-shockwave-flash'),
        array(0, 'h', '6d', 'video/quicktime'),
        array(508, 'h', 'ffffffffeca5c100', 'application/msword'),
        array(508, 'h', 'fffffffffdffffff1f', 'application/msexcel'),
        array(508, 'h', 'fffffffffdffffffc3', 'application/mspowerpoint'),
        array(0, 'h', 'efbbbf', 'text/plain'), // UTF-8
        array(0, 'h', 'fffe', 'text/plain'), // UTF-16 LE
        array(0, 'h', 'feff', 'text/plain'), // UTF-16 BE
        array(0, 'h', 'fffe0000', 'text/plain'), // UTF-32 LE
        array(0, 'h', '0000feff', 'text/plain'),  // UTF-32 BE
        array(0, 'h', '00', 'image/tga'),
        array(0, 'h', 'ff', 'audio/mpeg'),
        array(0, 's', '<?php', 'application/x-httpd-php'),
        array(0, 's', '<!DOCTYPE HTML', 'text/html'),
        array(0, 's', '<!DOCTYPE html', 'text/html'),
        array(0, 's', '<!doctype html', 'text/html'),
        array(0, 's', '<HTML', 'text/html'),
        array(0, 's', '<html', 'text/html'),
        array(0, 's', '<?xml', 'text/xml')
    );

    // Extension reference table
    // Format: $extension => $mime
    protected static $extRefTable = array(
        'jpg' => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'jpe' => 'image/jpeg',
        'png' => 'image/png',
        'mng' => 'video/mng',
        'gif' => 'image/gif',
        'tga' => 'image/tga',
        'tif' => 'image/tif',
        'bmp' => 'image/bmp',
        'ico' => 'image/vnd.microsoft.icon',
        'psd' => 'image/photoshop',
        'avi' => 'video/avi',
        'wav' => 'audio/wav',
        'mid' => 'audio/mid',
        'midi' => 'audio/mid',
        'mp3' => 'audio/mpeg',
        'mpg' => 'video/mpeg',
        'mpeg' => 'video/mpeg',
        'ogg' => 'application/ogg',
        'ogm' => 'application/ogg',
        'ogv' => 'application/ogg',
        'oga' => 'application/ogg',
        '3gp' => 'video/3gpp',
        '3g2' => 'video/3gpp',
        'mov' => 'video/quicktime',
        'swf' => 'application/x-shockwave-flash',
        'zip' => 'application/x-zip-compressed',
        'rar' => 'application/x-rar-compressed',
        'r01' => 'application/x-rar-compressed',
        'ace' => 'application/x-ace-compressed',
        '7z' => 'application/x-7z-compressed',
        'jar' => 'application/java-archive',
        'bz2' => 'application/x-bzip',
        'tbz2' => 'application/x-bzip',
        'tb2' => 'application/x-bzip',
        'gz' => 'application/x-gzip',
        'tar' => 'application/x-tar',
        'exe' => 'application/octet-stream',
        'com' => 'application/octet-stream',
        'dll' => 'application/octet-stream',
        'pdf' => 'application/pdf',
        'doc' => 'application/msword',
        'xls' => 'application/msexcel',
        'ppt' => 'application/mspowerpoint',
        'ttf' => 'font/ttf',
        'otf' => 'font/otf',
        'htm' => 'text/html',
        'html' => 'text/html',
        'xhtml' => 'text/html',
        'xht' => 'text/html',
        'xml' => 'text/xml',
        'svg' => 'image/svg+xml',
        'php' => 'application/x-httpd-php',
        'class' => 'application/x-java-class',
        'txt' => 'text/plain',
        'log' => 'text/plain',
        'msg' => 'text/plain',
        'rtf' => 'text/plain',
        'nfo' => 'text/plain'
    );

    // Maximum number of bytes to read from the file starting at offset 0
    protected static $maxLength;

    /**
     * Determines how many bytes must be read from offset 0 so that every
     * entry in $identRefTable (offset + ident length) can be compared.
     */
    protected static function determineMaxLenght() {
        $maxLength = 0;
        foreach (self::$identRefTable as $ident) {
            $len = strlen($ident[2]);
            // Two hex digits describe one byte; odd-length idents are rounded up
            if ($ident[1] == 'h') $len = (int) ceil($len / 2);
            $len += $ident[0];
            if ($len > $maxLength) $maxLength = $len;
        }
        self::$maxLength = $maxLength;
    }

    /**
     * Attempts to figure out the file format by looking at the first bytes of the file.
     *
     * @param string $target Path of the file to inspect
     * @return string The mime type of the first matching ident, or '' if none matches
     * @throws Exception If $target is not a regular file or cannot be opened/read
     */
    public static function fromHeader($target) {

        // Determine max length to read from file if it hasn't been done already
        if (!isset(self::$maxLength)) self::determineMaxLenght();

        // Make sure the target is a file we can work with before opening it
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");
        // 'b' keeps the read binary-safe on platforms that translate line endings
        $handle = @fopen($target, 'rb');
        if ($handle === false) throw new Exception("Could not open target file '$target'.");

        // Read data from target file; close the handle before any throw so it never leaks
        $bin = fread($handle, self::$maxLength);
        fclose($handle);
        if ($bin === false) throw new Exception("Unable to read data from '$target'.");

        $len = strlen($bin);

        // Hexadecimal representation for the 'h' entries
        $hex = bin2hex($bin);

        // Compare data with each entry in the ident table
        foreach (self::$identRefTable as $ident) {
            list($offset, $representation, $needle, $mime) = $ident;

            // Skip current if the data read isn't long enough
            if ($offset > $len) continue;

            // Compare by string or hexadecimal representation?
            if ($representation[0] == 's') $cmp = (string) substr($bin, $offset, strlen($needle));
            else $cmp = (string) substr($hex, $offset * 2, strlen($needle));

            // Strict comparison: "==" would compare numeric-looking hex
            // strings as numbers ('000001e2' == '00000100' is true).
            if ($cmp === $needle) return $mime;

            // A second character in the representation marks a regex ident.
            // The "s" modifier lets "." match newline bytes too, which can
            // occur in binary fields such as the RIFF chunk size.
            if (isset($representation[1]) && preg_match('~^' . $needle . '~s', $cmp) === 1)
                return $mime;
        }

        // No match found in table
        return '';

    }

    /**
     * Tries to guess the file format by the target's file extension.
     *
     * @param string $target File name or path
     * @return string The mime type, or '' if there is no (known) extension
     */
    public static function fromExtension($target) {
        $ext = self::extractExtension($target);
        if (strlen($ext) < 1 || !isset(self::$extRefTable[$ext]))
            return '';
        return self::$extRefTable[$ext];
    }

    /**
     * Extracts the file extension from the target path.
     *
     * @param string $target File name or path
     * @return string Lower-cased extension without the dot, or '' when the
     *                path does not end in ".<letters/digits/underscores>"
     */
    public static function extractExtension($target) {
        if (!preg_match('~\.([A-Za-z0-9_]+)$~', $target, $targetParts)) return '';
        return strtolower($targetParts[1]);
    }

    /**
     * Returns info (from the file format information table) for the specified mime type.
     *
     * @param string $mime
     * @return array array('name' => ...), or an empty array if the mime type is unknown
     */
    public static function infoFor($mime) {
        if (!isset(self::$formatInfoTable[$mime])) return array();
        return self::$formatInfoTable[$mime];
    }

    /**
     * Returns possible extensions for the specified mime type.
     *
     * @param string $mime
     * @return array List of extensions (empty if none are registered)
     */
    public static function extensionsFor($mime) {
        // Strict search so that non-string input cannot loosely match mime strings
        return array_keys(self::$extRefTable, $mime, true);
    }

}
cfallen
Forum Newbie
Posts: 1
Joined: Wed Aug 08, 2012 5:43 pm

Re: File format (mime type) identifying

Post by cfallen »

All I care about is making sure that a file is an archive type of file. But I wanted to not make so many iterations thru the loop of potential files, and only read the specific portion of data from the file where the "magic" is supposed to be, instead of reading from the beginning of every file to the longest offset of all possible filetypes. For this I used fseek().

Code: Select all

<?
/**
 * Detects a handful of archive formats, either from "magic" bytes at known
 * offsets inside the file (fromHeader) or from the file extension
 * (fromExtension).
 */
class DetectArchiveFile {

    // Ident reference table
    // Format: array($byteOffset, $representation, $ident, $mime)
    // s => string, h => hexadecimal (either letter case), a second character
    // in $representation (e.g. 'sr') => regular expression pattern
    // Entries are tried top to bottom and the first match wins.
    public $identRefTable = array(
        array(0,  's',   'PK',                   'application/x-zip-compressed'),
        array(0,  'h',   '1f8b08',               'application/x-gzip'),
        array(0,  'h',   '78',                   'application/x-apple-diskimage'),
        array(0,  's',   'BZh',                  'application/x-bzip'),
        array(0,  's',   '7z',                   'application/x-7z-compressed'),
        array(0,  'h',   '526172211A0700',       'application/x-rar-compressed'),
        array(257,'s',   'ustar',                'application/x-tar'),
        array(0,  's',   'PK',                   'application/x-zip'),
        array(0,  'h',   '504B0304',             'application/zip'),
        array(29152,'h',   '57696E5A6970',       'application/zip'),
    );

    // Extension reference table
    // Format: $extension => $mime
    public $extRefTable = array(
        '7z'  => 'application/x-7z-compressed',
        'bz2' => 'application/x-bzip',
        'dmg' => 'application/x-apple-diskimage',
        'gz'  => 'application/x-gzip',
        'jar' => 'application/java-archive',
        'rar' => 'application/x-rar-compressed',
        'tar' => 'application/x-tar',
        'tbz' => 'application/x-bzip',
        'tgz' => 'application/x-tar',
        'zip' => 'application/x-zip-compressed',
    );

    public function __construct() { }

    /**
     * Attempts to figure out the file format by looking at the "magic" spots of the file.
     *
     * For every table entry the file position is moved to the entry's offset
     * and only as many bytes as the ident needs are read.
     *
     * @param string $target Path of the file to inspect
     * @return string The mime type of the first matching ident, or 'NOT FOUND'
     * @throws Exception If $target is not a regular file or cannot be opened/read
     */
    public function fromHeader($target)
    {
        if (!is_file($target))
            throw new Exception("Invalid file: $target");

        // 'b' keeps the read binary-safe on platforms that translate line endings
        $fp = @fopen($target, 'rb');
        if ($fp === false)
            throw new Exception("Could not open target file: $target");

        $retval = 'NOT FOUND';

        // Go through the magic list and see if data in this file matches
        foreach ($this->identRefTable as $ident)
        {
            list($offset, $representation, $needle, $mime) = $ident;
            $isHex = ($representation == 'h');

            // Set file position. Usually 0, but 257 for tar and WinZip has a long one.
            // If seeking fails, check the next pattern against this file.
            if (fseek($fp, $offset) === -1)
                continue;

            // How many bytes from the current position should be read in?
            $len = strlen($needle);
            if ($isHex)
            {
                // bin2hex() yields lowercase digits, so normalise the ident;
                // two hex digits describe one byte
                $needle = strtolower($needle);
                $len = (int) ceil($len / 2);
            }

            $magic = fread($fp, $len);
            if ($magic === false)
            {
                // Do not leak the handle when bailing out
                fclose($fp);
                throw new Exception("Unable to read data from: $target");
            }

            if ($isHex)
            {
                $magic = (string) substr(bin2hex($magic), 0, strlen($needle));
            }

            // Literal match (strict, to avoid numeric-string juggling), or a
            // regex match when a second representation character is present
            if ( $magic === $needle ||
                 (isset($representation[1]) && preg_match('/^'. $needle. '/s', $magic) === 1) )
            {
                $retval = $mime;
                break;
            }
        }

        fclose($fp);

        return $retval;

    }

    /**
     * Tries to guess the file format by the target's file extension.
     *
     * @param string $target File name or path
     * @return string The mime type, or '' if there is no (known) extension
     */
    public function fromExtension($target) {
        // pathinfo() returns '' for names without a dot, so a file that is
        // merely called "zip" is not mistaken for one with a .zip extension
        $ext = strtolower(pathinfo($target, PATHINFO_EXTENSION));
        if (strlen($ext) < 1 || !isset($this->extRefTable[$ext]))
            return '';
        return $this->extRefTable[$ext];
    }

}

// Command-line driver: prints the mime type detected for the file given as first argument
$FFI = new DetectArchiveFile;

if (!isset($_SERVER['argv'][1])) {
    echo "Usage: php " . basename(__FILE__) . " <file>\n";
    exit(1);
}

$file = $_SERVER['argv'][1];


// Determine mime type from file header

try {
    $format = $FFI->fromHeader($file);
} catch (Exception $e) {
    // fromHeader() throws for missing or unreadable files
    echo "Error: " . $e->getMessage() . "\n";
    exit(1);
}

echo "Mime type for $file from MAGIC bytes: $format\n";
Post Reply