PHP Developers Network

A community of PHP developers offering assistance, advice, discussion, and friendship.
 
Loading
It is currently Tue Sep 26, 2017 6:12 am

All times are UTC - 5 hours




Post new topic Reply to topic  [ 8 posts ] 
Author Message
PostPosted: Mon Feb 09, 2009 11:25 pm 
Offline
Forum Commoner

Joined: Thu Aug 28, 2008 7:03 pm
Posts: 55
Here is my FixedBitNotation class. Read my next post for an explanation of when and how to use it. Please critique.

Syntax: [ Download ] [ Hide ]
<?php
/**
 * FixedBitNotation
 *
 * @author Andre DeMarre
 * @package FixedBitNotation
 */

 
/**
 * The FixedBitNotation class is for binary to text conversion. It
 * can handle many encoding schemes, formally defined or not, that
 * use a fixed number of bits (1 to 8) to encode each character.
 *
 * Bits are consumed from the raw string most-significant-bit first.
 * Each group of $bitsPerCharacter bits is used as an index into the
 * alphabet. The pad character is not checked against the alphabet;
 * choosing one that is not part of the alphabet is left to the caller.
 *
 * @package FixedBitNotation
 */

class FixedBitNotation
{
    /** @var string Alphabet; the character at offset N encodes the value N */
    protected $_chars;

    /** @var integer Number of raw bits carried by each encoded character (1 to 8) */
    protected $_bitsPerCharacter;

    /** @var integer Number of alphabet characters in use: 1 << $_bitsPerCharacter */
    protected $_radix;

    /** @var boolean TRUE: leftover final bits are left-aligned in the last character */
    protected $_rightPadFinalBits;

    /** @var boolean TRUE: encode() appends pad characters to complete the final group */
    protected $_padFinalGroup;

    /** @var string Single character appended by encode() and stripped by decode() */
    protected $_padCharacter;

    /** @var array|NULL Lazily built map of alphabet character => value, used by decode() */
    protected $_charmap;

    /**
     * Constructor
     *
     * Invalid arguments are corrected rather than rejected:
     *  - $chars that is not a string of at least two characters is replaced
     *    by the default 64 character alphabet;
     *  - $bitsPerCharacter is clamped to the range 1 to 8 and then reduced
     *    until the alphabet has at least pow(2, $bitsPerCharacter) characters;
     *  - $padCharacter that is not a non-empty string is replaced by "=";
     *    only its first character is used.
     *
     * @param integer $bitsPerCharacter Bits to use for each encoded character
     * @param string  $chars Base character alphabet
     * @param boolean $rightPadFinalBits How to encode last character
     * @param boolean $padFinalGroup Add padding to end of encoded output
     * @param string  $padCharacter Character to use for padding
     */

    public function __construct(
        $bitsPerCharacter, $chars = NULL, $rightPadFinalBits = FALSE,
        $padFinalGroup = FALSE, $padCharacter = '=')
    {
        // Ensure validity of $chars
        if (!is_string($chars) || ($charLength = strlen($chars)) < 2) {
            $chars = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-,';
            $charLength = 64;
        }

        // Clamp to 1..8 BEFORE shifting by it: a shift count of 63 or more
        // yields a negative or zero value, which would make the alphabet
        // length check below pass by accident.
        $bitsPerCharacter = (int) $bitsPerCharacter;

        if ($bitsPerCharacter < 1) {
            $bitsPerCharacter = 1;

        } elseif ($bitsPerCharacter > 8) {
            $bitsPerCharacter = 8;
        }

        // Reduce to the greatest value the alphabet can support.
        // $charLength is at least 2, so 1 bit per character always fits.
        while ($bitsPerCharacter > 1 && $charLength < (1 << $bitsPerCharacter)) {
            $bitsPerCharacter--;
        }

        // Only the first character of a valid pad string is used
        if (!is_string($padCharacter) || $padCharacter === '') {
            $padCharacter = '=';
        }

        $this->_chars = $chars;
        $this->_bitsPerCharacter = $bitsPerCharacter;
        $this->_radix = 1 << $bitsPerCharacter;
        $this->_rightPadFinalBits = (bool) $rightPadFinalBits;
        $this->_padFinalGroup = (bool) $padFinalGroup;
        $this->_padCharacter = $padCharacter[0];
    }

    /**
     * Encode a string
     *
     * Produces ceil(strlen($rawString) * 8 / bitsPerCharacter) encoded
     * characters. When the raw bits do not divide evenly, the last
     * character carries the leftover bits, positioned according to
     * $rightPadFinalBits, and is followed by padding if $padFinalGroup
     * is set. An empty input produces an empty output.
     *
     * @param  string $rawString Binary data to encode
     * @return string
     */

    public function encode($rawString)
    {
        $rawString = (string) $rawString;
        $byteCount = strlen($rawString);

        if ($byteCount === 0) {
            return '';
        }

        $chars = $this->_chars;
        $bitsPerCharacter = $this->_bitsPerCharacter;

        // May be fractional; the loop then runs one extra time for the leftover bits
        $characterCount = $byteCount * 8 / $bitsPerCharacter;

        $encodedString = '';
        $byteIndex = 0;                 // Offset of the byte currently being read
        $byte = ord($rawString[0]);
        $bitsRead = 0;                  // Bits already consumed from $byte

        // Generate encoded output; each loop produces one encoded character
        for ($c = 0; $c < $characterCount; $c++) {

            if ($bitsRead + $bitsPerCharacter > 8) {
                // Not enough bits remain in this byte for the current character
                // Save the remaining (low) bits before getting the next byte
                $oldBitCount = 8 - $bitsRead;
                $oldBits = $byte & ((1 << $oldBitCount) - 1);
                $newBitCount = $bitsPerCharacter - $oldBitCount;

                if ($byteIndex + 1 >= $byteCount) {
                    // Last bits; match final character and exit loop
                    if ($this->_rightPadFinalBits) {
                        $oldBits <<= $newBitCount;
                    }

                    $encodedString .= $chars[$oldBits];

                    if ($this->_padFinalGroup) {
                        // Lowest common multiples of $bitsPerCharacter and 8, divided by 8:
                        // the number of raw bytes that make up one complete group
                        $lcmMap = array(1 => 1, 2 => 1, 3 => 3, 4 => 1, 5 => 5, 6 => 3, 7 => 7, 8 => 1);
                        $bytesPerGroup = $lcmMap[$bitsPerCharacter];
                        $charactersPerGroup = $bytesPerGroup * 8 / $bitsPerCharacter;
                        $charactersInFinalGroup = ceil(($byteCount % $bytesPerGroup) * 8 / $bitsPerCharacter);
                        $pads = (int) ($charactersPerGroup - $charactersInFinalGroup);
                        $encodedString .= str_repeat($this->_padCharacter, $pads);
                    }

                    break;
                }

                // Get next byte
                $byte = ord($rawString[++$byteIndex]);
                $bitsRead = 0;

            } else {
                $oldBitCount = 0;
                $oldBits = 0;
                $newBitCount = $bitsPerCharacter;
            }

            // Read only the needed bits from this byte
            $bits = ($byte >> (8 - ($bitsRead + $newBitCount))) & ((1 << $newBitCount) - 1);
            $bitsRead += $newBitCount;

            if ($oldBitCount) {
                // Bits come from separate bytes; put $oldBits in front of $bits
                $bits |= $oldBits << $newBitCount;
            }

            $encodedString .= $chars[$bits];
        }

        return $encodedString;
    }

    /**
     * Decode a string
     *
     * Trailing pad characters are removed first, whether or not this
     * instance pads its own output. Characters that are not part of the
     * alphabet are skipped unless $strict is TRUE. The last decodable
     * character is treated as the final one, so ignored characters at
     * the end of the input (such as a line ending) do not affect the result.
     *
     * @param  string  $encodedString Data to decode
     * @param  boolean $caseSensitive If FALSE, a character missing from the alphabet
     *                                is retried in upper case, then in lower case
     * @param  boolean $strict Returns NULL if $encodedString contains an undecodable character
     * @return string|NULL Empty string if $encodedString is empty or not a string
     */

    public function decode($encodedString, $caseSensitive = TRUE, $strict = FALSE)
    {
        // Compare against '' explicitly: the string "0" is valid input
        if (!is_string($encodedString) || $encodedString === '') {
            return '';
        }

        $bitsPerCharacter = $this->_bitsPerCharacter;
        $rightPadFinalBits = $this->_rightPadFinalBits;

        // Get index of encoded characters, building it on first use
        if (!$this->_charmap) {
            $map = array();

            for ($i = 0; $i < $this->_radix; $i++) {
                $map[$this->_chars[$i]] = $i;
            }

            $this->_charmap = $map;
        }

        // Local copy; case-folded aliases added below are not cached
        $charmap = $this->_charmap;

        // Remove trailing padding characters
        $encodedString = rtrim($encodedString, $this->_padCharacter);

        // First pass: translate characters to their values
        $values = array();
        $length = strlen($encodedString);

        for ($i = 0; $i < $length; $i++) {
            $char = $encodedString[$i];

            if (!isset($charmap[$char]) && !$caseSensitive) {
                // Encoded character was not found; try other case
                if (isset($charmap[$upper = strtoupper($char)])) {
                    $charmap[$char] = $charmap[$upper];

                } elseif (isset($charmap[$lower = strtolower($char)])) {
                    $charmap[$char] = $charmap[$lower];
                }
            }

            if (isset($charmap[$char])) {
                $values[] = $charmap[$char];

            } elseif ($strict) {
                // Unable to decode character; abort
                return NULL;
            }
        }

        $lastIndex = count($values) - 1;

        $rawString = '';
        $byte = 0;
        $bitsWritten = 0;

        // Second pass: convert each value to a series of unencoded bits
        for ($c = 0; $c <= $lastIndex; $c++) {
            $value = $values[$c];
            $bitsNeeded = 8 - $bitsWritten;
            $unusedBitCount = $bitsPerCharacter - $bitsNeeded;

            // Get the new bits ready
            if ($bitsNeeded > $bitsPerCharacter) {
                // New bits aren't enough to complete a byte; shift them left into position
                $newBits = $value << ($bitsNeeded - $bitsPerCharacter);
                $bitsWritten += $bitsPerCharacter;

            } elseif ($c != $lastIndex || $rightPadFinalBits) {
                // Zero or more too many bits to complete a byte; shift right
                $newBits = $value >> $unusedBitCount;
                $bitsWritten = 8;

            } else {
                // Final bits were left-padded by encode(); they don't need to be shifted
                $newBits = $value;
                $bitsWritten = 8;
            }

            $byte |= $newBits;

            if ($bitsWritten == 8 || $c == $lastIndex) {
                // Byte is ready to be written
                $rawString .= pack('C', $byte);

                if ($c != $lastIndex) {
                    // Start the next byte with the bits this character had left over
                    $bitsWritten = $unusedBitCount;
                    $byte = ($value ^ ($newBits << $unusedBitCount)) << (8 - $bitsWritten);
                }
            }
        }

        return $rawString;
    }
}
 

EDITS: Added a missing $, changed default of decode()'s $strict argument to FALSE.


Last edited by André D on Thu Feb 19, 2009 2:07 am, edited 5 times in total.

Top
 Profile  
 
PostPosted: Mon Feb 09, 2009 11:26 pm 
Offline
Forum Commoner

Joined: Thu Aug 28, 2008 7:03 pm
Posts: 55
The FixedBitNotation class is for general purpose binary to text conversion with arbitrary encodings. You can use it to handle variants of many encodings such as Base64 or Base32.

Most binary to text encoding schemes use a fixed number of bits (up to 6) of binary data to generate each encoded character. The algorithms used for these encodings are very similar, so I set out to write a single algorithm that handles them all. (Note that Ascii85 does not work this way; it uses four bytes to generate five encoded characters, and each character is not derived from a fixed number of bits.)

These encodings are usually used to represent data in a notation that is safe for transport, but as the following examples show, there are other uses.

How it works

First, create an instance. The constructor accepts five arguments:

integer $bitsPerCharacter (required) - This is an integer specifying the number of bits from the raw binary string to use for each encoded character. The practical range is 1 to 6; you may use up to 8, but you will have to provide a base character string ($chars) that is at least pow(2, $bitsPerCharacter) characters long. So even with 7 bits per character you need to specify a value for $chars that is 128 characters long, which exceeds the number of printable ASCII characters.

The output's radix relates to the value of $bitsPerCharacter as follows:
1: base-2 (binary)
2: base-4
3: base-8 (octal)
4: base-16 (hexadecimal)
5: base-32
6: base-64
7: base-128
8: base-256

string $chars (optional) - This is a string that specifies the base alphabet to use in your notation. As explained above, the string length of $chars is related to the value of $bitsPerCharacter. If $chars is not long enough for $bitsPerCharacter, $bitsPerCharacter will be reduced to the greatest value supported by $chars. The default value of $chars is "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-,".

boolean $rightPadFinalBits (optional) - This boolean determines how to handle the bits in the last encoded character when the number of bits remaining is less than $bitsPerCharacter. If TRUE, empty bits will be added on the right as needed to fill the quota. If FALSE (the default), they will be on the left. For most content transfer encoding schemes you will set this to TRUE.

boolean $padFinalGroup (optional) - It's common to encode characters in groups. For example, Base64 (which is based on 6 bits per character) converts 3 raw bytes into 4 encoded characters. If not enough bytes remain at the end, the final group will be padded with "=" to complete a group of 4 characters, and the encoded character length is always a multiple of 4. Some programs rely on the padding for decoding; FixedBitNotation does not.

string $padCharacter (optional) - If $padFinalGroup is TRUE, this is the character to use. The default is "=".

The encode() method accepts one argument:

string $rawString (required) - This is the string that you want to encode.

The decode() method accepts three arguments:

string $encodedString (required) - This is the string that you want to decode.

boolean $caseSensitive (optional) - To decode in a case-sensitive manner. The default is TRUE.

boolean $strict (optional) - If TRUE, NULL will be returned if $encodedString contains an undecodable character (which may include whitespace; see below about handling whitespace). If FALSE (the default), unknown characters are simply ignored.

When to use FixedBitNotation

Use it when you want to use an encoding for which PHP does not provide a built-in function. PHP provides the base64_encode() and base64_decode() functions, but if you need to use a modified alphabet, you can either use strtr() to translate the base64_encode() output, or you can specify your own alphabet with FixedBitNotation.

To encode a string with modified Base64 for URLs and filenames, where the "+" and "/" are replaced with "-" and "_", you would do:
Syntax: [ Download ] [ Hide ]
<?php
// Base64 variant for URLs and filenames: "-" and "_" stand in for "+" and "/"
$urlSafeAlphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_';
$urlSafeBase64 = new FixedBitNotation(6, $urlSafeAlphabet, TRUE, TRUE);
$encoded = $urlSafeBase64->encode("encode this \xBF\xC2\xBF");
// Result: ZW5jb2RlIHRoaXMgv8K_
?>


PHP does not provide any Base32 encode or decode methods. By setting $bitsPerCharacter to 5 and specifying your desired alphabet in $chars, you can handle any variant of Base32:
Syntax: [ Download ] [ Hide ]
<?php
// Alphabet taken from RFC 4648 Base32; 5 bits per character
$rfc4648Alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567';
$base32Encoder = new FixedBitNotation(5, $rfc4648Alphabet, TRUE, TRUE);
$encoded = $base32Encoder->encode('encode this');
// Result: MVXGG33EMUQHI2DJOM======
?>


Octal notation:
Syntax: [ Download ] [ Hide ]
<?php
// 3 bits per character with the default alphabet gives the digits 0-7
$octalEncoder = new FixedBitNotation(3);
$encoded = $octalEncoder->encode('encode this');
// Result: 312671433366214510072150322711
?>


A convenient way to go back and forth between binary notation and a real binary string:
Syntax: [ Download ] [ Hide ]
<?php
// 1 bit per character with the default alphabet gives the digits 0 and 1
$bitNotation = new FixedBitNotation(1);

$encoded = $bitNotation->encode('encode this');
// Result: 0110010101101110011000110110111101100100011001010010000001110100011010000110100101110011

$decoded = $bitNotation->decode($encoded);
// Result: encode this
?>


PHP has its own fixed-bit notation that it uses to generate session identifiers. The default for $chars (see above) matches the alphabet PHP uses. The session.hash_bits_per_character php.ini configuration option accepts a value from 4 to 6. Since 4 results in standard hexadecimal, you don't need this class to emulate PHP's session IDs, but you do for 5 and 6. With the raw_output parameter of PHP's hashing functions, you can create unique IDs of the exact same form by choosing $bitsPerCharacter and setting $rightPadFinalBits to FALSE (the default):
Syntax: [ Download ] [ Hide ]
<?php
// Produce an ID shaped like a session ID generated with:
//   session.hash_function = 0
//   session.hash_bits_per_character = 5
$rawDigest = md5(uniqid(mt_rand(), TRUE), TRUE); // raw binary MD5 digest
$sessionNotation = new FixedBitNotation(5);
$id = $sessionNotation->encode($rawDigest);
// Example result: q3c8n4vqpq11i0vr6ucmafg1h3
?>

Syntax: [ Download ] [ Hide ]
<?php
// Produce an ID shaped like a session ID generated with:
//   session.hash_function = 1
//   session.hash_bits_per_character = 6
$rawDigest = sha1(uniqid(mt_rand(), TRUE), TRUE); // raw binary SHA-1 digest
$sessionNotation = new FixedBitNotation(6);
$id = $sessionNotation->encode($rawDigest);
// Example result: 7Hf91mVc,q-9W1VndNNh3evVN83
?>

(Let's not make this a discussion of the randomness of rand(), mt_rand() or uniqid(); that's not the point.)

I use the above technique to generate unique IDs for all kinds of things, or any time I want a hash digest in a notation other than hexadecimal. For some uses, the decode() method is valuable for converting notated hash digests back into their raw binary form for efficient data storage.

I've also found this FixedBitNotation class useful for creating promotion codes or auto-generated passwords with a carefully chosen alphabet. Whenever you generate codes that will be read and typed in by users, you should use distinct symbols that are not easily confused with others. With a full alphabet it's possible to inadvertently form offensive words. I like to use capital letters and omit vowels and zero. This leaves 30 alphanumeric characters, but we need 32 to use 5 bits per character, so two characters will be used twice in the base alphabet. I accept this because the result doesn't need to be reversible, and even with the bias toward two of the characters, the character distribution is well balanced. Keep the character bias in mind when choosing an output length that makes your codes sufficiently hard to guess.
Syntax: [ Download ] [ Hide ]
<?php
// Build an eight character password from an alphabet without vowels or zero.
// "H" and "Z" appear twice to bring the alphabet up to the 32 characters
// that 5 bits per character requires.
// NOTE(review): uniqid() and mt_rand() are not cryptographically secure
// sources; confirm that is acceptable before using this for real passwords.
$passwordAlphabet = '123456789BCDFGHJKLMNPQRSTVWXYZHZ';
$pwEncoder = new FixedBitNotation(5, $passwordAlphabet);
$rawDigest = md5(uniqid(mt_rand(), TRUE), TRUE);
$password = substr($pwEncoder->encode($rawDigest), 0, 8);
// Example result: HW42NMCP
?>


When not to use FixedBitNotation

Do not use FixedBitNotation when there is a native PHP function to suit your needs. If you're using Base64 encoding with the standard alphabet, use base64_encode() and base64_decode(); they're faster. For that reason, you might even prefer the strtr() suggestion I mentioned earlier for handling Base64 or hexadecimal with a modified alphabet.

Rather than using FixedBitNotation for encoding and decoding hexadecimal (like the binary example above), consider using bin2hex() and pack(); they're about 20 times faster:
Syntax: [ Download ] [ Hide ]
<?php
// Hexadecimal round trip using only native functions
$plain = 'encode this';
$encoded = bin2hex($plain);
// Result: 656e636f64652074686973
$decoded = pack('H*', $encoded);
// Result: encode this
?>


Finally, please understand that this is not encryption. Do not use this class to secure your data.

Handling whitespace

Variations of some content transfer encoding schemes specify a fixed or maximum line length. To add line endings to your encoded output, you can use chunk_split() or wordwrap(). To handle whitespace with decode(), you can simply set $strict to FALSE (the default) to ignore all characters that are not part of the base alphabet. But if you want to set $strict to TRUE, you can use str_replace() on the encoded string before trying to decode:
Syntax: [ Download ] [ Hide ]
<?php
// Strict decoding rejects any character outside the alphabet,
// so strip carriage returns and line feeds first
$lineBreaks = array("\r", "\n");
$encoded = str_replace($lineBreaks, '', $encoded);
$decoded = $fbnInstance->decode($encoded, TRUE, TRUE);
?>

Syntax: [ Download ] [ Hide ]
<?php
// Strict decoding rejects any character outside the alphabet,
// so strip space, tab, CR, LF, NUL and vertical tab first
$whitespace = array(" ", "\t", "\r", "\n", "\0", "\x0B");
$encoded = str_replace($whitespace, '', $encoded);
$decoded = $fbnInstance->decode($encoded, TRUE, TRUE);
?>


Top
 Profile  
 
PostPosted: Thu Feb 19, 2009 9:15 am 
Offline
DevNet Resident
User avatar

Joined: Sun Sep 03, 2006 5:19 am
Posts: 1579
Location: Sofia, Bulgaria
I haven't read the code in detail, so I won't comment on that, just a quick note that the pad character can be not in the encoding alphabet - is this intentional, or did you just forget to check?
A second point is go check the google chart API and provide shortcut implementations for their encodings (I reckon this will be a popular feature)


Top
 Profile  
 
PostPosted: Thu Feb 19, 2009 12:52 pm 
Offline
Forum Commoner

Joined: Thu Aug 28, 2008 7:03 pm
Posts: 55
Mordred wrote:
the pad character can be not in the encoding alphabet

Thanks for the comments, but where do you see the pad character in the alphabet?


Top
 Profile  
 
PostPosted: Thu Feb 19, 2009 3:38 pm 
Offline
Forum Commoner

Joined: Thu Aug 28, 2008 7:03 pm
Posts: 55
I may have misinterpreted what you were saying; sorry. You are right if you are saying that I don't check to make sure that the pad character is not in the alphabet. I left that up to the user, but maybe I should add a check. Good point.


Top
 Profile  
 
PostPosted: Fri Feb 20, 2009 12:40 am 
Offline
Breakbeat Nuttzer
User avatar

Joined: Wed Mar 24, 2004 8:57 am
Posts: 13098
Location: Melbourne, Australia
Just curious, is there such a thing as base128 that uses all 7 bits available for all US-ASCII characters?

Not that it would look very neat with all those WS/control characters in there :P


Top
 Profile  
 
PostPosted: Fri Feb 20, 2009 4:13 pm 
Offline
Forum Commoner

Joined: Thu Aug 28, 2008 7:03 pm
Posts: 55
Chris Corbyn wrote:
Just curious, is there such a thing as base128 that uses all 7 bits available for all US-ASCII characters?

Not that it would look very neat with all those WS/control characters in there :P

A base-128 encoding wouldn't be very practical as a content transfer encoding because there are fewer than 128 safe ASCII characters. You can do it though. Maybe someone else can think of a practical use case for it:

Syntax: [ Download ] [ Hide ]
<?php
// This example uses the 7-bit ASCII characters, each exactly once and in
// ascending order, so the character at offset N is chr(N).
// (The last row previously contained "\x69" in place of "\x79", which
// duplicated "i", omitted "y", and made the encoding non-reversible.)
$base128chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F"
              . "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
              . "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F"
              . "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3A\x3B\x3C\x3D\x3E\x3F"
              . "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4A\x4B\x4C\x4D\x4E\x4F"
              . "\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5A\x5B\x5C\x5D\x5E\x5F"
              . "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6A\x6B\x6C\x6D\x6E\x6F"
              . "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7A\x7B\x7C\x7D\x7E\x7F";

// No end padding because no 7-bit ASCII characters remain for the pad character
$base128 = new FixedBitNotation(7, $base128chars, TRUE);
$encoded = $base128->encode('encode this');
?>


Top
 Profile  
 
PostPosted: Fri May 17, 2013 7:43 pm 
Offline
Forum Commoner

Joined: Thu Aug 28, 2008 7:03 pm
Posts: 55
Since I first posted this code, it has found its way into a few open source projects, which is great.

This class, renamed Base2n, is now maintained in a Git repository: https://github.com/ademarre/binary-to-text-php

Check the repository if you want the latest version, under the MIT license.


Top
 Profile  
 
Display posts from previous:  Sort by  
Post new topic Reply to topic  [ 8 posts ] 

All times are UTC - 5 hours


Who is online

Users browsing this forum: No registered users and 1 guest


You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot post attachments in this forum

Jump to:  
cron
Powered by phpBB® Forum Software © phpBB Group