This section of the archives stores flipcode's complete Developer Toolbox collection, featuring a variety of mini-articles and source code contributions from our readers.

 

  Huffman Compression
  Submitted by



This is a little huffman compressor which I made recently. I was curious on how compression algorithms work and tried to implement one and the following code is what resulted. To my big surprise, the algorithm to build the huffman-tree is only a few lines of understandable code. Note that the compression algorithms to compress a file are more an example of how to use the huffman-tree and it's definitely not meant to be an illustration of brilliant software-design. Note that this implementation is not optimized and has a few nasty hacks and neither does it beat the compression ratio of the well-known compressors out there, but for people who want to take a quick peak how these compression algorithms work this might provide a nice start.

                uncompressed        this algorithm        zip (windows commander)
.pcx file          148.750                94.055                        72.555     
.ase file          821.472               492.978                     162.102
Roger Boerdijk


Currently browsing [huffman.zip] (5,379 bytes) - [huffman.h] - (3,785 bytes)

#ifndef _HUFFMAN_COMPRESSION_HPP
#define _HUFFMAN_COMPRESSION_HPP
#include "..\defines.h"
// _    _ _    _ ______ ______ __  __          _   _ 
//| |  | | |  | |  ____|  ____|  \/  |   /\   | \ | |
//| |__| | |  | | |__  | |__  | \  / |  /  \  |  \| |
//|  __  | |  | |  __| |  __| | |\/| | / /\ \ | . ` |
//| |  | | |__| | |    | |    | |  | |/ ____ \| |\  |
//|_|  |_|\____/|_|    |_|    |_|  |_/_/    \_\_| \_|
// Huffman data compression tree     
// Roger Boerdijk 30.06.2002
// email: kitt3n@home.nl                                              

// Use the code any way you wish; you assume all responsibility for its 
// correctness and suitability should you use it. 
 
//-------------------------------------------------------------------------------------------------

class cHuffmanNode
{
  friend class cHuffmanTree;

private: U8 ch; U8 reserved[3]; U32 frequency; class cHuffmanNode *parent, *left_child, *right_child;

public:

cHuffmanNode () { ch=0; frequency=0; reserved[0]=reserved[1]=reserved[2]=0; parent=left_child=right_child=NULL; }

int operator () (const cHuffmanNode& a, const cHuffmanNode& b) const; int operator () (const cHuffmanNode* a, const cHuffmanNode* b) const; };

//------------------------------------------------------------------------------------------------- class cHuffmanTree { // Note that it's up to the client libary to store the tree // to use it for later uncompression. Also note that using // the same frequency-table, the tree can be rebuilt, so you // can either store the tree, or the frequency-table! private: cHuffmanNode* root; std::vector<pair<U8,U32> > ft;

public:

cHuffmanTree () : root (NULL) {}; cHuffmanTree (U32 frequency_table[256]); cHuffmanTree (std::vector<pair<U8, U32> > frequency_table); cHuffmanTree (U8* data, U32 size); ~cHuffmanTree ();

void getStringTable (std::vector<std::string>& bitString);

U32 getCompressedSize (U8* udata, U32 len); U32 compress (U8* udata, U32 len, U8** cdata); U32 uncompress (U8* cdata, U32 len, U8** udata); U32 compress (const I8* inname, const I8* outfile); U32 uncompress (const I8* inname, const I8* outfile); U32 getTree (std::string& theTree); void setTree (const std::string& theTree);

private: bool buildTreeDo (const std::vector<pair<U8, U32> > frequency_table); void freeTree (void); void freeTreeR (cHuffmanNode* root); char* getCharacterPath (cHuffmanNode* n, int c); bool gcp (char* s, cHuffmanNode* n, int c);

bool write_bit (bool bit, U8& theByte, bool reset=false); bool read_bit (bool& bit, U8 byte, bool reset=false);

U32 getTreeR (std::string& theTree, cHuffmanNode* node); cHuffmanNode* setTreeR (const std::string& theTree, U32& idx);

};

#endif


Currently browsing [huffman.zip] (5,379 bytes) - [huffman.cpp] - (14,306 bytes)

#include "huffman.h"

// Roger Boerdijk 30.06.2002 // email: kitt3n@home.nl // Use the code any way you wish; you assume all responsibility for its // correctness and suitability should you use it. //------------------------------------------------------------------------------------------------- int cHuffmanNode::operator () (const cHuffmanNode& a, const cHuffmanNode& b) const { return a.frequency>b.frequency; }

int cHuffmanNode::operator () (const cHuffmanNode* a, const cHuffmanNode* b) const { return a->frequency>b->frequency; }

//------------------------------------------------------------------------------------------------- cHuffmanTree::cHuffmanTree (U32 frequency_table[256]) : root(NULL) { // Constructor for a byte-frequency table - usefull for compressing files ft.resize (256); for (int i=0; i<256; i++) { ft[i].first = i; ft[i].second=frequency_table[i]; } buildTreeDo (ft); }

cHuffmanTree::cHuffmanTree (std::vector<pair<U8, U32> > frequency_table) : root (NULL) { // more generic frequency table, can contain more/less ch's with // respective frequencies than the byte-frequency-table. // Nice if you have a limited character-set, for example only // A-Z and 0-9, instead of the entire ascii-table ft = frequency_table; buildTreeDo (ft); }

cHuffmanTree::cHuffmanTree (U8* data, U32 size) : root (NULL) { // very simular to cHuffmanTree::buildTree (U32 frequency_table[256]) // only this function will build the frequency-table itself pair<U8, U32> thePair; ft.resize (256); for (int z=0; z<ft.size(); z++) { thePair.first = z; thePair.second = 0; ft[z]=thePair; }

// count frequencies for (int i=0; i<size; i++) ft[data[i]].second++;

buildTreeDo (ft); }

cHuffmanTree ::~cHuffmanTree () { freeTree (); }

//------------------------------------------------------------------------------------------------- bool cHuffmanTree::buildTreeDo (const std::vector<pair<U8, U32> > frequency_table) { // If there was already a tree, cleanup and build the new one if (root) { freeTree (); }

ft = frequency_table; std::priority_queue<cHuffmanNode*, std::vector<cHuffmanNode*>, cHuffmanNode> h; cHuffmanNode* t = NULL;

// 1. Fill a heap with characters sorted on frequency // First in heap character with highest frequency for (int i=0; i<frequency_table.size(); i++) { // We push if the symbol has at least 1 occurance, // if the symbol has 0 occurances it's not usefull // to put it in the tree. if (frequency_table[i].second!=0) { cHuffmanNode* tmp = new cHuffmanNode; assert (tmp && "cHuffman::buildTree() Failed!"); tmp->ch = frequency_table[i].first; tmp->frequency = frequency_table[i].second; h.push (tmp); } }

// 2. Build the tree from the heap while (!h.empty()) { if (h.size()==1) { // If h contains only one character X, make X the root of T t = h.top(); h.pop(); } else { // Pick two characters X and Y with the lowest frequencies // and delete them from H cHuffmanNode* X=h.top(); h.pop(); cHuffmanNode* Y=h.top(); h.pop(); // replace X and Y with a new character Z, whose frequency // is the sum of the frequencies of X and Y cHuffmanNode* Z = new cHuffmanNode; Z->frequency = X->frequency+Y->frequency; Z->left_child = X; Z->right_child = Y; h.push (Z); } }

root = t; //buildCode (frequency_table, t) return true; }

//------------------------------------------------------------------------------------------------- void cHuffmanTree::freeTree (void) { if (root) { freeTreeR (root); root=NULL; } }

void cHuffmanTree::freeTreeR (cHuffmanNode* node) { if (!node) return;

if (!(node->right_child && node->left_child)) { delete node; return; }

if (node->left_child) freeTreeR (node->left_child); if (node->right_child) freeTreeR (node->right_child);

// we deleted all of this node's children, now delete // the node itself delete node; }

//------------------------------------------------------------------------------------------------- U32 cHuffmanTree::getTree (std::string& theTree) { return getTreeR (theTree, root); }

U32 cHuffmanTree::getTreeR (std::string& theTree, cHuffmanNode* node) { // This function stores reconstruction-data for our tree // Note that the "1" and "0" do NOT represent huffman-codes // but it tells if a node is a branch or a leaf // Note that we don't store frequency data for reconstruction! if (!node) return 0;

if (!(node->left_child && node->right_child)) { // leaf char ch = node->ch; theTree.append ("1"); // note: will give problems with >256 frequency-tables! theTree+=ch; return 2; } else { // Branch theTree.append ("0"); return 1+ getTreeR (theTree, node->left_child)+ getTreeR (theTree, node->right_child); }

std::vector<std::string> bs; getStringTable (bs); }

void cHuffmanTree::setTree (const std::string& theTree) { if (root) freeTree ();

U32 index=0; root = setTreeR (theTree, index); }

cHuffmanNode* cHuffmanTree::setTreeR (const std::string& theTree, U32& idx) { // This function reconstructs our tree // Note that we don't reconstruct frequency data! if (idx>=theTree.size()) return NULL;

cHuffmanNode* n = new cHuffmanNode; U8 b = theTree[idx];

if (b == '0') { // branch n->ch = 0; n->left_child = setTreeR (theTree, ++idx); n->right_child = setTreeR (theTree, ++idx); } else { // leaf idx++; n->ch = theTree[idx]; n->left_child = NULL; n->right_child = NULL; } return n; }

//------------------------------------------------------------------------------------------------- bool cHuffmanTree::gcp(char* s, cHuffmanNode* n, int c) { // get-character-path worker function. if (!n) return 0; if (!(n->left_child && n->right_child)) { if (c == n->ch) return true; else return false; } else { char _s[256]; if (gcp(s, n->left_child, c)) { strcpy(_s, s); strcpy(s, "0"); strcat(s, _s); return true; } else if (gcp(s, n->right_child, c)) { strcpy(_s, s); strcpy(s, "1"); strcat(s, _s); return true; } else return false; } }

char* cHuffmanTree::getCharacterPath (cHuffmanNode* n, int c) { // Get the path through the tree to find the bit-string // we need to encode / decode a character. Note that // we are using a static-member ie NOT an example of // good programming! static char string[256]; strcpy(string, ""); if (gcp(string, n, c)) return string; else return ""; }

void cHuffmanTree::getStringTable (std::vector<std::string>& bitString) { // Example function which will get all string encodings of each character. // Note that '0' is a 0-bit and '1' is a 1-bit bitString.resize (ft.size()); for (int i=0; i<ft.size(); i++) { char* tmp = getCharacterPath (root, ft[i].first); bitString[i]=tmp; } }

//-------------------------------------------------------------------------------------------------

bool cHuffmanTree::write_bit (bool bit, U8& theByte, bool reset) { // grow a byte, bit by bit, and then flush it theByte reset will // initialize the function to default-state. will return true as // soon as 8 bits were written and it is up to the called to save // the value in theByte. // note that if the function returns false, theByte does not // contain any usefull value static long bit_index = 7; static U8 byte = 0x00;

bool flush = false;

if (reset) { bit_index=7; byte = 0x00; return false; }

if(bit_index == -1) { // flush byte bit_index = 7; byte = 0x00; }

//byte |= (bit ? 1 : 0) << bit_index; byte |= bit << bit_index; bit_index--; if(bit_index == -1) { theByte = byte; flush=true; } return flush; }

bool cHuffmanTree::read_bit(bool& bit, U8 byte, bool reset) { // return one bit at a time, returns true when the // byte finished and it wants the next byte static long bit_index = 7; bool requestNextByte=false; int bit_value;

if (reset) { bit_index = 7; requestNextByte=false; return true; }

// bit_value = (byte & (1 << bit_index)) ? 1 : 0; bit_value = byte & (1 << bit_index); bit = bit_value!=0; bit_index--;

if(bit_index == -1) { requestNextByte=true; bit_index = 7; }

return requestNextByte; }

U32 cHuffmanTree::getCompressedSize (U8* udata, U32 len) { assert (root && udata);

// get stringtable with compression info int i; std::vector<std::string> bs; getStringTable (bs);

// calculate datasize U32 bitsEstimated=0; U32 byteEstimated=0; for (i=0; i<len; i++) { bitsEstimated+=bs[udata[i]].length(); assert (bs[udata[i]].length()!=0 && "cHuffmanTree::compress() Unknown ch"); }

// 4 bytes (U32) extra for uncompressed size byteEstimated=bitsEstimated/8; //+bitsEstimated%8==0?0:1+4; byteEstimated+=bitsEstimated%8==0?0:1; byteEstimated+=4;

return byteEstimated; }

//------------------------------------------------------------------------------------------------- U32 cHuffmanTree::compress (U8* udata, U32 len, U8** cdata) { // NOTE: Client has to deallocate memory which we allocate here! // (maybe we can somehow change this) assert (root);

// get stringtable with compression info int i; std::vector<std::string> bs; getStringTable (bs);

U32 byteEstimated = getCompressedSize (udata, len);

U8* data = new U8[byteEstimated]; if (data) { *cdata = data; *((U32*)data)=len; data+=4;

// compress data U8 cbyte; write_bit (false, cbyte, true); // reset for (i=0; i<len; i++) { U32 bitlen = bs[udata[i]].length(); const I8* str = bs[udata[i]].c_str();

for (int j=0; j<bitlen; j++) { // write the correct bit into our temp-buffer if (write_bit(((str[j] - '0')!=0), cbyte)) { // we wrote 8 bits, copy to our compressed buffer // and prepare for next byte *data=cbyte; data++; } } }

// fill last byte to end with 0 for (int fillbit=0; fillbit<8; fillbit++) { if (write_bit(0, cbyte)) { *data=cbyte; data++; break; } } return byteEstimated; } return 0; }

U32 cHuffmanTree::uncompress (U8* cdata, U32 len, U8** udata) { // NOTE: Client has to deallocate memory which we allocate here! // (maybe we can somehow change this) U32 ulen = *((U32*)cdata); cdata+=4;

U8* data = new U8[ulen]; if (data) { *udata=data;

bool bit=0; cHuffmanNode* curnode = root; U32 decodedU8s=0;

// make sure our readbit is in initialized state read_bit (bit, cdata[0], true);

while (decodedU8s<ulen) { if (cHuffmanTree::read_bit(bit, cdata[0])) { // next byte! cdata++; }

if (bit==0) { curnode = curnode->left_child; } else { curnode = curnode->right_child; }

// if we're at a leaf, output the character and start over if (!(curnode->left_child && curnode->right_child)) { data[decodedU8s]=(curnode->ch); decodedU8s++; curnode = root; } } return ulen; } return 0; }

//------------------------------------------------------------------------------------------------- U32 cHuffmanTree::compress (const I8* inname, const I8* outfile) { bool success=false; FILE* fp, *fpout; if ((fp=fopen((char*)inname, "rb"))!=NULL) { if ((fpout=fopen((char*)outfile, "wb"))!=NULL) { // determine filesize fseek (fp, 0, SEEK_END); U32 len = ftell (fp); fseek (fp, 0, SEEK_SET);

// read entire file into ram U8* ram = new U8[len]; U8* zip; if (ram) { fread (ram, len, 1, fp); }

// build huffman tree & compress cHuffmanTree tree2 (ram, len); U32 csize = tree2.compress (ram , len, &zip);

std::string htR; tree2.getTree (htR); U16 htS = htR.size(); U8 htC;

fwrite (&htS, sizeof (htS), 1, fpout); for (int i=0; i<htR.size(); i++) { htC = htR[i]; fwrite (&htC, 1, 1, fpout); } fwrite (zip, csize, 1, fpout);

delete zip; delete ram;

fclose (fpout); success=true; } fclose (fp); } return success; }

U32 cHuffmanTree::uncompress (const I8* inname, const I8* outfile) { bool success=false; FILE* fp, *fpout; if ((fp=fopen((char*)inname, "rb"))!=NULL) { if ((fpout=fopen((char*)outfile, "wb"))!=NULL) { // determine filesize fseek (fp, 0, SEEK_END); U32 len = ftell (fp); fseek (fp, 0, SEEK_SET);

// read entire file into ram U8* zip = new U8[len]; if (zip) { fread (zip, len, 1, fp); }

// build huffman tree & decompress U16 htS = *(U16*)(zip); std::string htR; htR.resize (htS); for (int i=0; i<htS; i++) { htR[i]=*(zip+i+2); } U32 htSize = sizeof(htS)+htR.size()*sizeof(U8);

U8* ram; cHuffmanTree tree2; tree2.setTree (htR); U32 usize = tree2.uncompress (zip+htSize, len-htSize, &ram);

fwrite (ram, usize, 1, fpout); delete zip; delete ram;

fclose (fpout); success=true; } fclose (fp); } return success; }


The zip file viewer built into the Developer Toolbox made use of the zlib library, as well as the zlibdll source additions.

 

Copyright 1999-2008 (C) FLIPCODE.COM and/or the original content author(s). All rights reserved.
Please read our Terms, Conditions, and Privacy information.