PHP Developers Network

A community of PHP developers offering assistance, advice, discussion, and friendship.
 
Loading
It is currently Thu Dec 05, 2019 8:33 pm

All times are UTC - 5 hours




Post new topic Reply to topic  [ 11 posts ] 
Author Message
PostPosted: Wed Dec 26, 2007 12:10 pm 
Offline
Forum Regular
User avatar

Joined: Wed Jul 30, 2003 3:29 am
Posts: 875
Location: Sweden


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 3:32 pm 
Offline
DevNet Master
User avatar

Joined: Mon Oct 25, 2004 9:29 pm
Posts: 3698
Location: New Jersey, US
This feels like duplicated work to me. There are databases out there of magic byte sequences => file types, so it would seem to me that a more efficient use of time would be parsing that format, and maybe compiling $formatTable.


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 4:50 pm 
Offline
Forum Regular
User avatar

Joined: Wed Jul 30, 2003 3:29 am
Posts: 875
Location: Sweden
I'm sure there are, but the problem was that I couldn't find anything already available for PHP as a plain script easily readable. If you've got any links I'd gladly take a look at them though.


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 5:36 pm 
Offline
DevNet Master
User avatar

Joined: Wed Dec 06, 2006 5:14 pm
Posts: 3635
Location: Toronto, Canada
I think this class is a welcome scratch. Mime detection in PHP has always been extremely poor and/or difficult to implement.

There's a DB of magic bytes here: http://magicdb.org - but it looks like a dead-ish project to me. The DB itself could be culled into your class at any rate!


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 5:59 pm 
Offline
Forum Regular
User avatar

Joined: Wed Jul 30, 2003 3:29 am
Posts: 875
Location: Sweden
I'm currently looking through the magic file provided in the apache package for some more common formats. The magicdb page was nice enough to provide some example files to test against, cheers for that ;)
An update is coming up soon, decided to allow both strings and hex representations in the ident table.


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 6:26 pm 
Offline
Forum Regular
User avatar

Joined: Wed Jul 30, 2003 3:29 am
Posts: 875
Location: Sweden
Updated:
Syntax: [ Download ] [ Hide ]
/**
 * Identifies a file's format by comparing its leading bytes ("magic bytes")
 * against a table of known signatures.
 *
 * Usage:
 *   $identifier = new FileFormatIdentifier();
 *   $mime = $identifier->identify('/path/to/file');          // e.g. 'image/png'
 *   $name = $identifier->identify('/path/to/file', 'name');  // e.g. 'Portable (Public) Network Graphic'
 */
class FileFormatIdentifier {

    // File format table
    // Format: $mime => array(name => $formatName, ext => array($extensions...))
    protected $formatTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image', 'ext' => array('jpg', 'jpeg', 'jpe')),
        'image/png' => array('name' => 'Portable (Public) Network Graphic', 'ext' => array('png')),
        'video/mng' => array('name' => 'Multi-image Network Graphic Animation', 'ext' => array('mng')),
        'image/gif' => array('name' => 'Graphic Interchange Format', 'ext' => array('gif')),
        'image/tga' => array('name' => 'Truevision Targa Graphic', 'ext' => array('tga')),
        'image/tif' => array('name' => 'Tagged Image Format File', 'ext' => array('tif')),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic', 'ext' => array('bmp')),
        'image/photoshop' => array('name' => 'Photoshop Format Image', 'ext' => array('psd')),
        'application/msword' => array('name' => 'Word Document', 'ext' => array('doc')),
        'application/msexcel' => array('name' => 'Excel Worksheet', 'ext' => array('xls')),
        'video/avi' => array('name' => 'Audio Video Interleave File', 'ext' => array('avi')),
        'audio/wav' => array('name' => 'Waveform Audio', 'ext' => array('wav')),
        'audio/mid' => array('name' => 'MIDI-sequention Sound', 'ext' => array('mid', 'midi')),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream, Layer III', 'ext' => array('mp3')),
        'video/mpeg' => array('name' => 'MPEG 1 System Stream', 'ext' => array('mpg', 'mpeg')),
        'video/quicktime' => array('name' => 'QuickTime Video Clip', 'ext' => array('mov')),
        'application/x-shockwave-flash' => array('name' => 'Macromedia Flash Format File', 'ext' => array('swf')),
        'application/pdf' => array('name' => 'Acrobat Portable Document Format', 'ext' => array('pdf')),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive File', 'ext' => array('zip')),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive', 'ext' => array('rar', 'r01')),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed File', 'ext' => array('ace')),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed File', 'ext' => array('7z')),
        'application/x-bzip' => array('name' => 'Bzip 2 UNIX Compressed File', 'ext' => array('bz2', 'tbz2', 'tb2')),
        'application/x-gzip' => array('name' => 'Gzip Compressed Archive', 'ext' => array('gz')),
        'application/x-tar' => array('name' => 'Tape Archive File', 'ext' => array('tar')),
        'font/ttf' => array('name' => 'TrueType Font', 'ext' => array('ttf')),
        'font/otf' => array('name' => 'Open Type Font Format', 'ext' => array('otf')), // No idea about mime type
        'text/html' => array('name' => 'HyperText Markup Language', 'ext' => array('htm', 'html')),
        'text/xml' => array('name' => 'Extensible Markup Language File', 'ext' => array('xml'))
    );

    // Ident reference table
    // Format: array($representation, $ident, $mime)
    // 's' => string, 'h' => hexadecimal
    // Entries are tested top to bottom and the first match wins, so longer/more
    // specific signatures must stay above the short catch-all ones.
    // Every $mime used here must exist as a key in $formatTable.
    protected $identRefTable = array(
        array('h', 'd0cf11e0a1b11ae100', 'application/msword'),
        // Excel shares the same container signature as Word, so it cannot be told apart here:
        // array('h', 'd0cf11e0a1b11ae100', 'application/msexcel'),
        array('h', '89504e470d0a1a0a00', 'image/png'),
        array('h', '8a4d4e470d0a1a0a00', 'video/mng'),
        array('h', '0001000000', 'font/ttf'),
        array('h', '4f54544f00', 'font/otf'),
        array('h', 'ffd8ff', 'image/jpeg'),
        array('h', '4944330', 'audio/mpeg'), // odd length on purpose: matches on a half-byte (nibble)
        array('h', '000001', 'video/mpeg'),
        array('s', '8BPS', 'image/photoshop'),
        array('s', 'MThd', 'audio/mid'),
        array('s', '**ACE**', 'application/x-ace-compressed'),
        array('s', 'Rar!', 'application/x-rar-compressed'),
        array('s', 'PK', 'application/x-zip-compressed'),
        array('s', 'BZh', 'application/x-bzip'),
        array('h', '1f8b08', 'application/x-gzip'),
        array('s', '7z', 'application/x-7z-compressed'),
        // Tar is not detectable here: its 'ustar' marker sits at byte offset 257, not at the start.
        // WAV shares the 'RIFF' prefix with AVI, so only one of them can be listed:
        array('s', 'RIFF', 'video/avi'),
        array('s', 'GIF8', 'image/gif'),
        array('s', 'MM.*', 'image/tif'),
        array('s', 'II*', 'image/tif'),
        array('h', '424d', 'image/bmp'),
        array('s', '%PDF', 'application/pdf'),
        array('s', 'FWS', 'application/x-shockwave-flash'),
        array('h', '6d', 'video/quicktime'),
        array('h', '00', 'image/tga'),
        array('h', 'ff', 'audio/mpeg'),
        array('s', '<!DOCTYPE HTML', 'text/html'),
        array('s', '<!doctype html', 'text/html'),
        array('s', '<HTML', 'text/html'),
        array('s', '<html', 'text/html'),
        array('s', '<?xml', 'text/xml')
    );

    // Number of bytes that must be read from a file to test every ident
    protected $maxLength;

    /**
     * Computes the number of leading bytes needed to test the longest ident.
     */
    public function __construct() {
        $maxLength = 0;
        foreach ($this->identRefTable as $ident) {
            $len = strlen($ident[1]);
            // Two hex characters describe one byte; round up for nibble-level idents
            if ($ident[0] === 'h') $len = (int) ceil($len / 2);
            if ($len > $maxLength) $maxLength = $len;
        }
        $this->maxLength = $maxLength;
    }

    /**
     * Returns the format table entry for a mime type.
     *
     * @param string $mime
     * @return array|null array('name' => ..., 'ext' => array(...)), or null if the mime type is unknown
     */
    public function infoFor($mime) {
        if (!isset($this->formatTable[$mime])) return null;
        return $this->formatTable[$mime];
    }

    /**
     * Identifies the format of a file from its leading bytes.
     *
     * @param string $target     Path to the file to inspect
     * @param string $returnType What to return for the matched format:
     *                           'mime' (default, also used for unknown values) => mime type string,
     *                           'name' or 'format' => human readable format name,
     *                           'ext' => primary extension, 'exts' => all extensions,
     *                           '*' => the whole format table entry
     * @return string|array The requested info, or an empty string when nothing matches
     * @throws Exception If $target is not a regular file or cannot be read
     */
    public function identify($target, $returnType = 'mime') {

        // Make sure the target is a file we can work with
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");

        // Read only the file header (the first $this->maxLength bytes)
        $bin = file_get_contents($target, false, null, 0, $this->maxLength);
        if ($bin === false) throw new Exception("Failed to get contents of '$target'.");

        // Hexadecimal representation of the header, used by the 'h' idents
        $hex = bin2hex($bin);

        foreach ($this->identRefTable as $ident) {
            $cmp = ($ident[0] === 's') ? $bin : $hex;

            // Strict comparison is required: with ==, PHP compares numeric-looking
            // strings as numbers, so e.g. '0001e0' == '000001' would be true.
            if (substr($cmp, 0, strlen($ident[1])) !== $ident[1]) continue;

            $mime = $ident[2];
            switch ($returnType) {
                case 'format':
                case 'name':
                    return $this->formatTable[$mime]['name'];
                case 'ext':
                    return $this->formatTable[$mime]['ext'][0];
                case 'exts':
                    return $this->formatTable[$mime]['ext'];
                case '*':
                    return $this->formatTable[$mime];
                case 'mime':
                default:
                    return $mime;
            }
        }

        // No match found in table
        return '';

    }

}

The Tar format was kinda problematic as the identifier is located at the 257th byte, so I scrapped it for now.


Last edited by vigge89 on Fri Jan 18, 2008 10:50 am, edited 1 time in total.

Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 9:16 pm 
Offline
Tranquility In Moderation
User avatar

Joined: Sun Feb 06, 2005 8:18 pm
Posts: 5001
Location: Indiana

_________________
- A google chrome extension. When you search only results from the past year (or set time period) are displayed. Helps tremendously when using new technologies to avoid outdated results.


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 9:18 pm 
Offline
DevNet Master
User avatar

Joined: Mon Oct 25, 2004 9:29 pm
Posts: 3698
Location: New Jersey, US
Agreed. While file_get_contents is the most efficient way to read the entire file to a string, the small number of leading bytes you'll need works better with fopen() and its kin.


Top
 Profile  
 
 Post subject:
PostPosted: Wed Dec 26, 2007 9:34 pm 
Offline
Forum Regular
User avatar

Joined: Wed Jul 30, 2003 3:29 am
Posts: 875
Location: Sweden
I was unsure about which method to use and spent quite some time trying to decide but ended up going for the simpler solution thinking I could revamp it later. Now that you've said it I guess I'll switch to the old school way for the next update ;)


Top
 Profile  
 
 Post subject:
PostPosted: Fri Jan 04, 2008 9:21 pm 
Offline
Forum Regular
User avatar

Joined: Wed Jul 30, 2003 3:29 am
Posts: 875
Location: Sweden
New version coming up...
Changes:
+ Turned the class static as there isn't much use for it to be instantiated (correct me if I'm wrong)
+ Byte offset functionality
+ Regex matching (needed to distinguish AVI from WAV) - rather hackish at the moment (any suggestions?)
+ Ability to look up format from extension
+ Storing extensions in a separate table
+ More formats added to table

I'm still somewhat uncertain regarding how to read the magic bytes from the files. As of now it will read from the start of the file to $maxLength, which is initially set by determineMaxLenght() to the length of the longest ident in $identRefTable. The data is then compared to each entry and if a match is found the mime type associated with it is returned. I'm sure there's a faster/better way to do it, but how?

Input and suggestions are more than welcome!

For the mods: Is it preferred to post larger blocks of code on an external site like pastebin.ca or should I keep it in the thread?

Edit: Found some useful links for anyone interested:

(a lot more than what my apache2 magic file contained)

Syntax: [ Download ] [ Hide ]
<?php
 
/**
 * Static helper that guesses a file's mime type, either from signature bytes
 * ("magic bytes") in the file itself or from the extension of its path.
 *
 * Usage:
 *   $mime = FileFormatIdentifier::fromHeader('/path/to/file');   // e.g. 'image/png', '' if unknown
 *   $mime = FileFormatIdentifier::fromExtension('photo.jpeg');   // e.g. 'image/jpeg', '' if unknown
 *   $info = FileFormatIdentifier::infoFor($mime);                // array('name' => ...)
 */
class FileFormatIdentifier {

    // File format information table
    // Format: $mime => array(name => $formatName)
    protected static $formatInfoTable = array(
        'image/jpeg' => array('name' => 'JPEG/JIFF Image'),
        'image/png' => array('name' => 'Portable (Public) Network Graphic'),
        'video/mng' => array('name' => 'Multi-image Network Graphic Animation'),
        'image/gif' => array('name' => 'Graphic Interchange Format Image'),
        'image/tga' => array('name' => 'Truevision Targa Graphic'),
        'image/tif' => array('name' => 'Tagged Image Format File'),
        'image/bmp' => array('name' => 'Windows OS/2 Bitmap Graphic'),
        'image/svg+xml' => array('name' => 'Scalable Vector Graphic'),
        'image/photoshop' => array('name' => 'Photoshop Format Image'),
        'image/vnd.microsoft.icon' => array('name' => 'Windows Icon'),
        'application/ogg' => array('name' => 'Ogg Multimedia File'),
        'audio/wav' => array('name' => 'Waveform Audio File'),
        'audio/mid' => array('name' => 'MIDI-sequention Sound File'),
        'audio/mpeg' => array('name' => 'MPEG Audio Stream (Layer III) File'),
        'video/mpeg' => array('name' => 'MPEG System Stream File'),
        'video/3gpp' => array('name' => '3GPP Multimedia File'),
        'video/quicktime' => array('name' => 'QuickTime Video Clip'),
        'video/avi' => array('name' => 'Audio Video Interleave File'),
        'application/x-shockwave-flash' => array('name' => 'Macromedia Flash File'),
        'application/pdf' => array('name' => 'Acrobat Portable Document File'),
        'application/winhlp' => array('name' => 'Windows Help File'),
        'application/msword' => array('name' => 'Word Document'),
        'application/msexcel' => array('name' => 'Excel Worksheet'),
        'application/mspowerpoint' => array('name' => 'PowerPoint Presentation'),
        'application/x-zip-compressed' => array('name' => 'Compressed Archive'),
        'application/x-rar-compressed' => array('name' => 'WinRAR Compressed Archive'),
        'application/x-ace-compressed' => array('name' => 'WinAce Compressed Archive'),
        'application/x-7z-compressed' => array('name' => '7-Zip Compressed Archive'),
        'application/x-bzip' => array('name' => 'Bzip 2 UNIX Compressed Archive'),
        'application/x-gzip' => array('name' => 'Gzip Compressed Archive'),
        'application/x-tar' => array('name' => 'Tape Archive'),
        'application/java-archive' => array('name' => 'Java Archive'),
        'font/ttf' => array('name' => 'TrueType Font'),
        'font/otf' => array('name' => 'Open Type Font'),
        'text/plain' => array('name' => 'Text File'),
        'text/html' => array('name' => 'HyperText Markup Language File'),
        'application/xhtml+xml' => array('name' => 'Extensible HyperText Markup Language File'),
        'text/xml' => array('name' => 'Extensible Markup Language File'),
        'application/x-httpd-php' => array('name' => 'PHP Script'),
        'application/x-java-class' => array('name' => 'Java Bytecode'),
        'application/octet-stream' => array('name' => 'Executable File')
    );

    // Ident reference table
    // Format: array($byteOffset, $representation, $ident, $mime)
    // s => string, h => hexadecimal, r => regular expression pattern
    // Entries are tested top to bottom and the first match wins, so longer/more
    // specific idents must stay above the short catch-all ones.
    // Odd-length hex idents are intentional: they match on a half-byte (nibble).
    protected static $identRefTable = array(
        array(0, 'h', '504b0304140008000800', 'application/java-archive'),
        array(0, 'h', '89504e470d0a1a0a00', 'image/png'),
        array(0, 'h', '8a4d4e470d0a1a0a00', 'video/mng'),
        array(0, 'h', 'cafebabe', 'application/x-java-class'),
        array(0, 'h', '0001000000', 'font/ttf'),
        array(0, 'h', '4f54544f00', 'font/otf'),
        array(0, 'h', '4944330', 'audio/mpeg'),
        array(0, 'h', '000001b', 'video/mpeg'),
        array(0, 'h', '00000100', 'image/vnd.microsoft.icon'),
        array(0, 'h', '000000', 'video/3gpp'),
        array(0, 's', '8BPS', 'image/photoshop'),
        array(0, 's', 'MThd', 'audio/mid'),
        array(0, 's', 'OggS', 'application/ogg'),
        array(0, 's', '**ACE**', 'application/x-ace-compressed'),
        array(0, 's', 'Rar!', 'application/x-rar-compressed'),
        array(0, 's', 'PK', 'application/x-zip-compressed'),
        array(0, 's', 'BZh', 'application/x-bzip'),
        array(0, 'h', '1f8b08', 'application/x-gzip'),
        array(0, 's', '7z', 'application/x-7z-compressed'),
        array(257, 's', 'ustar', 'application/x-tar'),
        array(0, 'sr', 'RIFF....WAVE', 'audio/wav'),
        array(0, 'sr', 'RIFF....AVI', 'video/avi'),
        array(0, 's', 'GIF8', 'image/gif'),
        array(0, 's', 'MM.*', 'image/tif'),
        array(0, 's', 'II*', 'image/tif'),
        array(0, 'h', 'ffd8', 'image/jpeg'),
        array(0, 'h', '424d', 'image/bmp'),
        array(0, 's', 'MZ', 'application/octet-stream'),
        array(0, 's', '?_', 'application/winhlp'),
        array(0, 's', '%PDF', 'application/pdf'),
        array(0, 's', 'FWS', 'application/x-shockwave-flash'),
        array(0, 'h', '6d', 'video/quicktime'),
        array(508, 'h', 'ffffffffeca5c100', 'application/msword'),
        array(508, 'h', 'fffffffffdffffff1f', 'application/msexcel'),
        array(508, 'h', 'fffffffffdffffffc3', 'application/mspowerpoint'),
        array(0, 'h', 'efbbbf', 'text/plain'), // UTF-8
        array(0, 'h', 'fffe', 'text/plain'), // UTF-16 LE
        array(0, 'h', 'feff', 'text/plain'), // UTF-16 BE
        array(0, 'h', 'fffe0000', 'text/plain'), // UTF-32 LE
        array(0, 'h', '0000feff', 'text/plain'),  // UTF-32 BE
        array(0, 'h', '00', 'image/tga'),
        array(0, 'h', 'ff', 'audio/mpeg'),
        array(0, 's', '<?php', 'application/x-httpd-php'),
        array(0, 's', '<!DOCTYPE HTML', 'text/html'),
        array(0, 's', '<!DOCTYPE html', 'text/html'),
        array(0, 's', '<!doctype html', 'text/html'),
        array(0, 's', '<HTML', 'text/html'),
        array(0, 's', '<html', 'text/html'),
        array(0, 's', '<?xml', 'text/xml')
    );

    // Extension reference table
    // Format: $extension => $mime
    protected static $extRefTable = array(
        'jpg' => 'image/jpeg',
        'jpeg' => 'image/jpeg',
        'jpe' => 'image/jpeg',
        'png' => 'image/png',
        'mng' => 'video/mng',
        'gif' => 'image/gif',
        'tga' => 'image/tga',
        'tif' => 'image/tif',
        'bmp' => 'image/bmp',
        'ico' => 'image/vnd.microsoft.icon',
        'psd' => 'image/photoshop',
        'avi' => 'video/avi',
        'wav' => 'audio/wav',
        'mid' => 'audio/mid',
        'midi' => 'audio/mid',
        'mp3' => 'audio/mpeg',
        'mpg' => 'video/mpeg',
        'mpeg' => 'video/mpeg',
        'ogg' => 'application/ogg',
        'ogm' => 'application/ogg',
        'ogv' => 'application/ogg',
        'oga' => 'application/ogg',
        '3gp' => 'video/3gpp',
        '3g2' => 'video/3gpp',
        'mov' => 'video/quicktime',
        'swf' => 'application/x-shockwave-flash',
        'zip' => 'application/x-zip-compressed',
        'rar' => 'application/x-rar-compressed',
        'r01' => 'application/x-rar-compressed',
        'ace' => 'application/x-ace-compressed',
        '7z' => 'application/x-7z-compressed',
        'jar' => 'application/java-archive',
        'bz2' => 'application/x-bzip',
        'tbz2' => 'application/x-bzip',
        'tb2' => 'application/x-bzip',
        'gz' => 'application/x-gzip',
        'tar' => 'application/x-tar',
        'exe' => 'application/octet-stream',
        'com' => 'application/octet-stream',
        'dll' => 'application/octet-stream',
        'pdf' => 'application/pdf',
        'doc' => 'application/msword',
        'xls' => 'application/msexcel',
        'ppt' => 'application/mspowerpoint',
        'ttf' => 'font/ttf',
        'otf' => 'font/otf',
        'htm' => 'text/html',
        'html' => 'text/html',
        'xhtml' => 'text/html',
        'xht' => 'text/html',
        'xml' => 'text/xml',
        'svg' => 'image/svg+xml',
        'php' => 'application/x-httpd-php',
        'class' => 'application/x-java-class',
        'txt' => 'text/plain',
        'log' => 'text/plain',
        'msg' => 'text/plain',
        'rtf' => 'text/plain',
        'nfo' => 'text/plain'
    );

    // Maximum number of bytes to read from the file starting at offset 0
    // (null until determineMaxLenght() has run)
    protected static $maxLength;

    /**
     * Determines the maximum length to read from files and caches it in self::$maxLength.
     *
     * For every ident this is its byte offset plus its length in bytes.
     * (The method name keeps its original spelling so existing subclasses keep working.)
     */
    protected static function determineMaxLenght() {
        $maxLength = 0;
        foreach (self::$identRefTable as $ident) {
            $len = strlen($ident[2]);
            // Two hex characters describe one byte; round up for nibble-level idents
            if ($ident[1][0] === 'h') $len = (int) ceil($len / 2);
            $len += $ident[0];
            if ($len > $maxLength) $maxLength = $len;
        }
        self::$maxLength = $maxLength;
    }

    /**
     * Attempts to figure out the file format by looking at the first bytes of the file.
     *
     * @param string $target Path to the file to inspect
     * @return string The mime type of the first matching ident, or '' when nothing matches
     * @throws Exception If $target is not a regular file, cannot be opened or cannot be read
     */
    public static function fromHeader($target) {

        // Determine max length to read from file if it hasn't been done already
        if (!isset(self::$maxLength)) self::determineMaxLenght();

        // Make sure the target is a file we can work with before opening it
        if (!is_file($target)) throw new Exception("'$target' is not a valid file.");
        $handle = @fopen($target, 'rb');
        if ($handle === false) throw new Exception("Could not open target file '$target'.");

        // Read data from target file; close the handle before any throw so it never leaks
        $bin = fread($handle, self::$maxLength);
        fclose($handle);
        if ($bin === false) throw new Exception("Unable to read data from '$target'.");

        $len = strlen($bin);

        // Hexadecimal representation of the header, used by the 'h' idents
        $hex = bin2hex($bin);

        // Compare data with each entry in the ident table
        foreach (self::$identRefTable as $ident) {
            list($offset, $representation, $pattern, $mime) = $ident;

            // Skip current if the data read doesn't reach the ident's offset
            if ($offset >= $len) continue;

            // Compare by string or hexadecimal representation?
            if ($representation[0] === 's') $cmp = substr($bin, $offset, strlen($pattern));
            else $cmp = substr($hex, $offset * 2, strlen($pattern));
            if ($cmp === false) continue;

            if (isset($representation[1])) {
                // A second character in the representation (r) marks a regex ident.
                // The 's' modifier lets '.' match any byte, including 0x0A, which
                // can legitimately occur in binary fields such as the RIFF size.
                if (preg_match('~^' . $pattern . '~s', $cmp) === 1) return $mime;
            } elseif ($cmp === $pattern) {
                // Strict comparison is required: with ==, PHP compares numeric-looking
                // strings as numbers, so e.g. '000001e2' == '00000100' would be true.
                return $mime;
            }
        }

        // No match found in table
        return '';

    }

    /**
     * Tries to guess the file format from the target's file extension.
     *
     * @param string $target File name or path
     * @return string The mime type, or '' if there is no extension or it is unknown
     */
    public static function fromExtension($target) {
        $ext = self::extractExtension($target);
        if ($ext === '' || !isset(self::$extRefTable[$ext]))
            return '';
        return self::$extRefTable[$ext];
    }

    /**
     * Extracts the file extension from the target path.
     *
     * @param string $target File name or path
     * @return string The lower-cased extension without the dot, or '' if there is none
     */
    public static function extractExtension($target) {
        preg_match('~(?>\.([A-Za-z0-9_]+))?$~', $target, $targetParts);
        // The extension group is optional, so index 1 is absent when there is no extension
        if (!isset($targetParts[1])) return '';
        return strtolower($targetParts[1]);
    }

    /**
     * Returns info (from the file format information table) for the specified mime type.
     *
     * @param string $mime
     * @return array array('name' => ...), or an empty array if the mime type is unknown
     */
    public static function infoFor($mime) {
        if (!isset(self::$formatInfoTable[$mime])) return array();
        return self::$formatInfoTable[$mime];
    }

    /**
     * Returns the possible extensions for the specified mime type.
     *
     * @param string $mime
     * @return array List of extensions (without dot); empty if none are registered
     */
    public static function extensionsFor($mime) {
        return array_keys(self::$extRefTable, $mime, true);
    }

}


Top
 Profile  
 
PostPosted: Wed Aug 08, 2012 5:50 pm 
Offline
Forum Newbie

Joined: Wed Aug 08, 2012 5:43 pm
Posts: 1


Top
 Profile  
 
Display posts from previous:  Sort by  
Post new topic Reply to topic  [ 11 posts ] 

All times are UTC - 5 hours


Who is online

Users browsing this forum: No registered users and 4 guests


You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot post attachments in this forum

Jump to:  
Powered by phpBB® Forum Software © phpBB Group