Wednesday, July 25, 2012

Modifying gzip to only do huffman coding

Recently I needed to compress certain types of files on a system with very limited memory. After looking at LZ77, LZ78, and derivatives I chose simple huffman coding as it gave me ~50% savings on the file, with limited resources.

The first possible candidate was http://entropyware.info/shcodec/index.html

In the end, I modified gzip to output only huffman compressed data. I modified puff.c (included with zlib) to only care about these types of blocks.

Patch to zlib

--- ../gzip-1.4-orig/deflate.c  2010-01-03 12:26:02.000000000 -0500
+++ deflate.c   2012-07-25 11:47:30.000000000 -0400
@@ -673,8 +673,11 @@ off_t deflate()
     int flush;               /* set if current block must be flushed */
     int match_available = 0; /* set if previous match exists */
     register unsigned match_length = MIN_MATCH-1; /* length of best match */
+    extern int huffonly;       /* gzip.c */

+    if (huffonly == 0) {
     if (compr_level <= 3) return deflate_fast(); /* optimized for speed */
+    }

     /* Process the input block. */
     while (lookahead != 0) {
@@ -690,7 +693,8 @@ off_t deflate()

         if (hash_head != NIL && prev_length < max_lazy_match &&
             strstart - hash_head <= MAX_DIST &&
-            strstart <= window_size - MIN_LOOKAHEAD) {
+            strstart <= window_size - MIN_LOOKAHEAD &&
+           huffonly==0) {
             /* To simplify the code, we prevent matches with the string
              * of window index 0 (in particular we have to avoid a match
              * of the string with itself at the start of the input file).
@@ -710,7 +714,8 @@ off_t deflate()
         /* If there was a match at the previous step and the current
          * match is not better, output the previous match:
          */
-        if (prev_length >= MIN_MATCH && match_length <= prev_length) {
+        if (prev_length >= MIN_MATCH && match_length <= prev_length
+               && huffonly==0) {

             check_match(strstart-1, prev_match, prev_length);

--- ../gzip-1.4-orig/gzip.c     2010-01-03 12:26:02.000000000 -0500
+++ gzip.c      2012-07-25 11:53:29.000000000 -0400
@@ -194,6 +194,7 @@ char *env;            /* contents of GZI
 char **args = NULL;   /* argv pointer if GZIP env variable defined */
 char const *z_suffix; /* default suffix (can be set with --suffix) */
 size_t z_len;         /* strlen(z_suffix) */
+int huffonly = 0;   /* only use huffman codes, no LZ77 */

 /* The set of signals that are caught.  */
 static sigset_t caught_signals;
@@ -271,6 +272,7 @@ struct option longopts[] =
     {"best",       0, 0, '9'}, /* compress better */
     {"lzw",        0, 0, 'Z'}, /* make output compatible with old compress */
     {"bits",       1, 0, 'b'}, /* max number of bits per code (implies -Z) */
+    {"huffonly",   0, 0, 'O'}, /* ascii text mode */
     { 0, 0, 0, 0 }
 };

@@ -352,6 +354,7 @@ local void help()
  "  -Z, --lzw         produce output compatible with old compress",
  "  -b, --bits=BITS   max number of bits per code (implies -Z)",
 #endif
+ "  -O, --huffonly    only compress using dynamic huffman codes (block type 2)",
  "",
  "With no FILE, or when FILE is -, read standard input.",
  "",
@@ -502,6 +505,9 @@ int main (int argc, char **argv)
            try_help ();
            break;
 #endif
+       case 'O':
+           huffonly = 1;
+           break;
        case '1':  case '2':  case '3':  case '4':
        case '5':  case '6':  case '7':  case '8':  case '9':
            level = optc - '0';     

Changes to puff.c

--- puff.orig.c 2010-04-25 05:04:16.000000000 -0400
+++ puff.new.c  2012-07-25 13:33:16.000000000 -0400
@@ -160,6 +160,7 @@ local int bits(struct state *s, int need
  * - A stored block can have zero length.  This is sometimes used to byte-align
  *   subsets of the compressed data for random access or partial recovery.
  */
+#ifndef CONFIG_ONLY_BLOCK_2
 local int stored(struct state *s)
 {
     unsigned len;       /* length of stored block */
@@ -194,6 +195,7 @@ local int stored(struct state *s)
     /* done with a valid stored block */
     return 0;
 }
+#endif

 /*
  * Huffman code decoding tables.  count[1..MAXBITS] is the number of symbols of
@@ -437,6 +439,7 @@ local int codes(struct state *s,
                 const struct huffman *distcode)
 {
     int symbol;         /* decoded symbol */
+#ifndef CONFIG_ONLY_BLOCK_2
     int len;            /* length for copy */
     unsigned dist;      /* distance for copy */
     static const short lens[29] = { /* Size base for length codes 257..285 */
@@ -453,6 +456,7 @@ local int codes(struct state *s,
         0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
         7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
         12, 12, 13, 13};
+#endif

     /* decode literals and length/distance pairs */
     do {
@@ -469,6 +473,10 @@ local int codes(struct state *s,
             s->outcnt++;
         }
         else if (symbol > 256) {        /* length */
+#ifdef CONFIG_ONLY_BLOCK_2
+           /* I never put out anything but literals */
+           return -11;
+#else /* CONFIG_ONLY_BLOCK_2 */
             /* get and compute length */
             symbol -= 257;
             if (symbol >= 29)
@@ -501,6 +509,7 @@ local int codes(struct state *s,
             }
             else
                 s->outcnt += len;
+#endif /* CONFIG_ONLY_BLOCK_2 */
         }
     } while (symbol != 256);            /* end of block symbol */

@@ -532,6 +541,7 @@ local int codes(struct state *s,
  *   length, this can be implemented as an incomplete code.  Then the invalid
  *   codes are detected while decoding.
  */
+#ifndef CONFIG_ONLY_BLOCK_2
 local int fixed(struct state *s)
 {
     static int virgin = 1;
@@ -573,6 +583,7 @@ local int fixed(struct state *s)
     /* decode data until end-of-block code */
     return codes(s, &lencode, &distcode);
 }
+#endif

 /*
  * Process a dynamic codes block.
@@ -668,7 +679,9 @@ local int dynamic(struct state *s)
     int err;                            /* construct() return value */
     short lengths[MAXCODES];            /* descriptor code lengths */
     short lencnt[MAXBITS+1], lensym[MAXLCODES];         /* lencode memory */
+#ifndef CONFIG_ONLY_BLOCK_2
     short distcnt[MAXBITS+1], distsym[MAXDCODES];       /* distcode memory */
+#endif
     struct huffman lencode, distcode;   /* length and distance codes */
     static const short order[19] =      /* permutation of code length codes */
         {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
@@ -676,8 +689,10 @@ local int dynamic(struct state *s)
     /* construct lencode and distcode */
     lencode.count = lencnt;
     lencode.symbol = lensym;
+#ifndef CONFIG_ONLY_BLOCK_2
     distcode.count = distcnt;
     distcode.symbol = distsym;
+#endif

     /* get number of lengths in each table, check lengths */
     nlen = bits(s, 5) + 257;
@@ -735,9 +750,11 @@ local int dynamic(struct state *s)
         return -7;      /* incomplete code ok only for single length 1 code */

     /* build huffman table for distance codes */
+#ifndef CONFIG_ONLY_BLOCK_2
     err = construct(&distcode, lengths + nlen, ndist);
     if (err && (err < 0 || ndist != distcode.count[0] + distcode.count[1]))
         return -8;      /* incomplete code ok only for single length 1 code */
+#endif

     /* decode data until end-of-block code */
     return codes(s, &lencode, &distcode);
@@ -816,6 +833,7 @@ int puff(unsigned char *dest,
         do {
             last = bits(&s, 1);         /* one if last block */
             type = bits(&s, 2);         /* block type 0..3 */
+#ifndef CONFIG_ONLY_BLOCK_2
             err = type == 0 ?
                     stored(&s) :
                     (type == 1 ?
@@ -823,6 +841,10 @@ int puff(unsigned char *dest,
                         (type == 2 ?
                             dynamic(&s) :
                             -1));       /* type == 3, invalid */
+#else
+           /* only support block type 2 */
+            err = type == 2 ? dynamic(&s) : -1;
+#endif
             if (err != 0)
                 break;                  /* return with error */
         } while (!last);

Wednesday, July 18, 2012

A second look at probability theory

I'm studying Principles of Communication Engineering by Wozencraft and Jacobs and the second chapter goes over probability theory, except it uses measure theoretic probability theory.
Here are my notes when reading about σ-algebras (sigma-algebra).

Thursday, July 5, 2012

40G Ethernet Blog

Fantastic website: 40G Ethernet Blog

Tuesday, July 3, 2012

Notes on pluggable optical I/O modules

Why have pluggable I/O interfaces

  • Users can pick transceiver & cable for specific application
  • Copper vs optical
  • Active vs passive cable

SFP+

  • Enhancement of SFP
  • Up to 10 Gbit/s

XFP

XFP vs SFP+

  • 10GbE can use both XFP & SFP+
  • TBD

Overview of different standards




(Originally from a Tyco Presentation )

40 GbE architecture


(Originally from The IEEE Std 802.3ba-2010 40Gb/s and 100Gb/s Architecture)

40 GbE implementation examples
(Originally from The IEEE Std 802.3ba-2010 40Gb/s and 100Gb/s Architecture)

QSFP

CFP

CXP

  • CXP
  • Active optical cable
  • TBD

Monday, July 2, 2012

Can I use this embedded processor?

At work I'm asked if we can use a particular processor for a new system. Other than mechanical, electrical, and I/O considerations, software and tool support is very important.

When selecting processors for medium-sized embedded systems, I ask:

Is it a well-known architecture? For our applications, we stick with ARM, MIPS. We evaluated Intel, but they didn't meet our I/O requirements.

Linux support? I check Wikipedia and the Linux repository.

GCC support? I check the GCC repository.

ICE/JTAG support? I check Mentor Graphics and Macraigor.