|
From: Bryan T. <tho...@us...> - 2007-07-10 16:13:04
|
Update of /cvsroot/cweb/lgpl-utils/src/java/it/unimi/dsi/mg4j/compression In directory sc8-pr-cvs4.sourceforge.net:/tmp/cvs-serv14570/src/java/it/unimi/dsi/mg4j/compression Added Files: CodedCharSequenceBooleanIterator.java package.html CodeWordCoder.java PrefixCoder.java HuTuckerCodec.java PrefixCodec.java Decoder.java TreeDecoder.java HuffmanCodec.java Coder.java Codec.java Log Message: Commit incorporates some clases from the mg4j project that I am trying out for order preserving (alphabetic) compression. I am in the process of writing test cases for the HuTuckerCodec that verify its use and order preserving properties. A minimal set of classes from mg4j has been imported to support compression. A fastutils jar has also been incorporated, but it can doubtless be prunned down even further. The import is from the 1.x version of mg4j. I should really re-import from 2.0 now that it has been released. --- NEW FILE: Decoder.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.booleans.BooleanIterator; import it.unimi.dsi.mg4j.io.InputBitStream; import java.io.IOException; /** Decoding methods for a specific compression technique. */ public interface Decoder { /** Decodes the next symbol from the given boolean iterator. * * <P>Note that {@link InputBitStream} implements {@link BooleanIterator}. * * @param iterator a boolean iterator. * @return the next symbol decoded from the bits emitted by <code>i</code> * @throws java.util.NoSuchElementException if <code>iterator</code> terminates before a symbol has been decoded. */ int decode( BooleanIterator iterator ); /** Decodes the next symbol from the given input bit stream. * * <P>Note that {@link InputBitStream} implements {@link BooleanIterator}. * * @param ibs an input bit stream. * @return the next symbol decoded from <code>ibs</code>. */ int decode( InputBitStream ibs ) throws IOException; } --- NEW FILE: Codec.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ /** An abstract factory corresponding to an instance of a specific compression technique. * * <P>An implementation of this interface provides coders and decoders. The * constructors must provide all data that is required to perform coding * and decoding. */ public interface Codec { /** Returns a coder for the compression technique represented by this coded. * * @return a coder for the compression technique represented by this codec. */ public Coder getCoder(); /** Returns a decoder for the compression technique represented by this coded. * * @return a decoder for the compression technique represented by this codec. */ public Decoder getDecoder(); } --- NEW FILE: PrefixCoder.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import cern.colt.bitvector.BitVector; /** A coder that additionally exposes the vector of codewords. * * <P>Not all coders are codeword-based (for instance, {@linkplain it.unimi.dsi.mg4j.io.ArithmeticCoder arithmetic coding} * is not codeword-based). However, coders that are codeword-based are invited * to return by means of {@link it.unimi.dsi.mg4j.compression.Codec#getCoder()} an * implementation of this interface. */ public interface PrefixCoder extends Coder { /** Provides access to the codewords. * @return the codewords. */ BitVector[] codeWords(); } --- NEW FILE: PrefixCodec.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import java.io.Serializable; import cern.colt.bitvector.BitVector; /** A generic prefix codec. * * <p>Several prefix codec work by building a vector of codewords for * coding, and a decoding tree. The common code is gathered here: you must * extend this class and implement just a constructor accepting a vector of frequencies * (and invoking the corresponding constructor of this class) and the <em>template method</em> {@link #initTree(int[])}, * which takes the vector of frequencies and must return the root of a decoding tree * built using {@link it.unimi.dsi.mg4j.compression.TreeDecoder.Node} for internal nodes * and {@link it.unimi.dsi.mg4j.compression.TreeDecoder.LeafNode} for leaves. This class will do the rest for you (populating the * codeword vector, returning the singleton coder/decoder, etc.). */ public abstract class PrefixCodec implements Codec, Serializable { private static final int BITS_PER_CHAR = 16; /** The number of symbols of this coder. */ public final int size; /** For each symbol, its codeword. */ public final BitVector[] codeWord; /** The root of the decoding tree. */ private final TreeDecoder.Node root; /** A cached singleton instance of the coder of this codec. */ private final Coder coder; /** A cached singleton instance of the decoder of this codec. */ private final Decoder decoder; private final static boolean DEBUG = false; /** Creates a new prefix codec. * * @param frequency a vector of frequencies. */ protected PrefixCodec( final int[] frequency ) { size = frequency.length; codeWord = new BitVector[ size ]; root = initTree( frequency ); buildCodes( root, new BitVector( 0 ) ); coder = new CodeWordCoder( codeWord ); decoder = new TreeDecoder( root ); if ( DEBUG ) { System.err.println( "Codes: " ); for( int i = 0; i < size; i++ ) System.err.println( i + " (" + codeWord[ i ].size() + " bits): " + codeWord[ i ] ); long totFreq = 0; for( int i = size; i-- != 0; ) totFreq += frequency[ i ]; long totBits = 0; for( int i = size; i-- != 0; ) totBits += frequency[ i ] * codeWord[ i ].size(); System.err.println( "Compression: " + totBits + " / " + totFreq * BITS_PER_CHAR + " = " + (double)totBits/(totFreq * BITS_PER_CHAR) ); } } /** A template method for building the decoding tree. * * @param frequency a vector of frequencies. * @return the root of a decoding tree for the given vector of frequencies. */ protected abstract TreeDecoder.Node initTree( final int[] frequency ); /** Populates the codeword vector by scanning recursively * the decoding tree. * * @param n a subtree of the decoding tree. * @param prefix the path leading to <code>n</code>. */ private void buildCodes( final TreeDecoder.Node n, final BitVector prefix ) { if ( n instanceof TreeDecoder.LeafNode ) { codeWord[ ((TreeDecoder.LeafNode)n).symbol ] = prefix; return; } BitVector bitVector = prefix.copy(); bitVector.setSize( bitVector.size() + 1 ); buildCodes( n.left, bitVector ); bitVector = prefix.copy(); bitVector.setSize( bitVector.size() + 1 ); bitVector.set( bitVector.size() - 1 ); buildCodes( n.right, bitVector ); } public Coder getCoder() { return coder; } public Decoder getDecoder() { return decoder; } } --- NEW FILE: package.html --- <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <html> <head> <title>MG4J: Managing Gigabytes for Java</title> </head> <body> <p>Word-based compression/decompression classes. <p>Classes in this package are somewhat experimental. They implement the basic code that is necessary for creating {@linkplain it.unimi.dsi.mg4j.util.ImmutableExternalTriePrefixDictionary trie prefix dictionaries}. In some cases they are significantly sub-optimal (for instance, the Huffman codec is not canonical). </body> </html> --- NEW FILE: CodedCharSequenceBooleanIterator.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.booleans.AbstractBooleanIterator; import it.unimi.dsi.fastutil.chars.Char2IntMap; import java.util.NoSuchElementException; import cern.colt.bitvector.BitVector; /** A wrapper for character sequences that exposes them as a boolean iterators * returning the bits obtained coding the sequence, using * a given map from character to symbols in a prefix coder. * * <P>This class is very lightweight: it just scans the string, concatenating * lazily the bits of each codeword. */ public class CodedCharSequenceBooleanIterator extends AbstractBooleanIterator { private final CharSequence s; private final int length; private final Char2IntMap char2code; private final BitVector[] codeWord; private BitVector char2codeWord( final char c ) { return codeWord[ char2code.get( c ) ]; } private int pos, index; private BitVector currCodeWord; /** Creates a new boolean iterator over a character sequence. * * @param s a character sequence. * @param prefixCoder a prefix coder that is able to code the characters in <code>s</code>, or possibly <code>null</code> if * <code>s</code> is empty. * @param char2code the map from characters in <code>s</code> to symbols in <code>coder</code>. */ public CodedCharSequenceBooleanIterator( final CharSequence s, final PrefixCoder prefixCoder, final Char2IntMap char2code ) { this.s = s; this.char2code = char2code; this.length = s.length(); if ( s.length() > 0 && prefixCoder == null ) throw new IllegalArgumentException( "Empty PrefixCoder with non empty string: " + s ); this.codeWord = prefixCoder != null ? prefixCoder.codeWords() : null; if ( length != 0 ) currCodeWord = char2codeWord( s.charAt( 0 ) ); } public boolean hasNext() { return pos < length; } public boolean nextBoolean() { if ( ! hasNext() ) throw new NoSuchElementException(); final boolean bit = currCodeWord.get( index++ ); if ( index == currCodeWord.size() ) { pos++; if ( pos < length ) { currCodeWord = char2codeWord( s.charAt( pos ) ); index = 0; } } return bit; } } --- NEW FILE: CodeWordCoder.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.booleans.BooleanIterator; import it.unimi.dsi.fastutil.booleans.BooleanIterators; import it.unimi.dsi.mg4j.io.OutputBitStream; import it.unimi.dsi.mg4j.util.BitVectorBooleanIterator; import java.io.IOException; import java.io.Serializable; import cern.colt.bitvector.BitVector; /** A standard coder for compression methods based on codewords. */ final class CodeWordCoder implements PrefixCoder, Serializable { private static final long serialVersionUID = 1L; private final BitVector[] codeWord; /** Creates a new codeword-based coder using the given vector of codewords. The * coder will be able to encode symbols numbered from 0 to <code>codeWord.length-1</code>, included. * * @param codeWord a vector of coderwords. */ public CodeWordCoder( final BitVector[] codeWord ) { this.codeWord = codeWord; } public BooleanIterator encode( final int symbol ) { return new BitVectorBooleanIterator( codeWord[ symbol ] ); } public int encode( final int symbol, final OutputBitStream obs ) throws IOException { final BitVector w = codeWord[ symbol ]; final int size = w.size(); for( int i = 0; i < size; i++ ) obs.writeBit( w.get( i ) ); return size; } public int flush( final OutputBitStream unused ) { return 0; } public BooleanIterator flush() { return BooleanIterators.EMPTY_ITERATOR; } public BitVector[] codeWords() { return codeWord; } } --- NEW FILE: TreeDecoder.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.booleans.BooleanIterator; import it.unimi.dsi.mg4j.io.InputBitStream; import java.io.IOException; import java.io.Serializable; /** A decoder that follows 0/1 labelled paths in a tree. */ final class TreeDecoder implements Decoder, Serializable { private static final long serialVersionUID = 1L; /** A internal node of the decoding tree. */ public static class Node implements Serializable { private static final long serialVersionUID = 1L; public Node left, right; } /** A leaf node of the decoding tree. */ public static class LeafNode extends Node { private static final long serialVersionUID = 1L; public final int symbol; /** Creates a leaf node. * @param symbol the symbol for this node. */ public LeafNode( final int symbol ) { this.symbol = symbol; } } /** The root of the decoding tree. */ private final Node root; /** Creates a new codeword-based coder using the given tree. It * is responsability of the caller that the tree is well-formed, * that is, that all internal nodes are instances of {@link TreeDecoder.Node} * and all leaf nodes are instances of {@link TreeDecoder.LeafNode}. * * @param root the root of a decoding tree. */ public TreeDecoder( final Node root ) { this.root = root; } public int decode( final BooleanIterator iterator ) { Node n = root; while( ! ( n instanceof LeafNode ) ) n = iterator.nextBoolean() ? n.right : n.left; return ((LeafNode)n).symbol; } public int decode( final InputBitStream ibs ) throws IOException { Node n = root; while( ! ( n instanceof LeafNode ) ) n = ibs.readBit() == 0 ? n.left : n.right; return ((LeafNode)n).symbol; } } --- NEW FILE: Coder.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.booleans.BooleanIterator; import it.unimi.dsi.mg4j.io.OutputBitStream; import java.io.IOException; /** Coding methods for a specific compression technique. */ public interface Coder { /** Encodes a symbol. * * @param symbol a symbol. * @return a boolean iterator returning the bits coding <code>symbol</code>. */ BooleanIterator encode( int symbol ); /** Encodes a symbol. * * @param symbol a symbol. * @param obs the output bit stream where the encoded symbol will be written. * @return the number of bits written. */ int encode( int symbol, OutputBitStream obs ) throws IOException; /** Flushes the coder. * * @param obs the output bit stream where the flushing bits will be written. * @return the number of bits written to flush the coder. */ int flush( OutputBitStream obs ); /** Flushes the coder. * * @return a boolean iterator returning the bits used to flush this coder. */ BooleanIterator flush(); } --- NEW FILE: HuTuckerCodec.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.booleans.BooleanArrays; import java.io.Serializable; /** An implementation of the Hu–Tucker optimal lexicographical prefix-free code. * * <p>The familiar Huffman coding technique can be extended so to preserve the order in which * symbols are given to the coder, in the sense that if <var>j</var><<var>k</var>, then the * <var>j</var>-th symbol will get a code lexicographically smaller than the one * assigned to the <var>k</var>-th symbol. This result can be obtained with a small loss in * code length (for more details, see the third volume of <i>The Art of Computer Programming</i>). * * <p>A Hu–Tucker coder is built given an array of frequencies corresponding to each * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code. * * <p>The implementation of this class is rather inefficient, and the time required to build * a Hu–Tucker code is quadratic in the number of symbols. * An <i>O</i>(<var>n</var> log <var>n</var>) implementation * is possible, but it requires very sophisticated data structures. */ public class HuTuckerCodec extends PrefixCodec implements Serializable { private static final long serialVersionUID = 1L; /** A node to be used for the tree construction: it records both the level and the index. */ private static final class LevelNode extends TreeDecoder.LeafNode { private static final long serialVersionUID = 1L; int level; private LevelNode( final int symbol ) { super( symbol ); } private LevelNode() { super( -1 ); } } public HuTuckerCodec( final int[] frequency ) { super( frequency ); } protected TreeDecoder.Node initTree( final int[] frequency ) { final boolean[] internal = new boolean[ size ]; final boolean[] removed = new boolean[ size ]; final long[] compoundFrequency = new long[ size ]; final LevelNode[] externalNode = new LevelNode[ size ], node = new LevelNode[ size ]; long currPri; int first, last, left, right, minLeft, minRight; LevelNode n; // We create a node with level information for each symbol for( int i = size; i-- != 0; ) { compoundFrequency[ i ] = frequency[ i ]; node[ i ] = externalNode[ i ] = new LevelNode( i ); } first = 0; last = size - 1; minLeft = 0; int currMinLeft; // First selection phase (see Knuth) for( int i = size; --i != 0; ) { currMinLeft = minLeft = minRight = -1; currPri = Long.MAX_VALUE; while( removed[ first ] ) first++; while( removed[ last ] ) last--; right = first; assert right < last; while( right < last ) { left = currMinLeft = right; do { right++; if ( ! removed[ right ] ) { if ( compoundFrequency[ currMinLeft ] + compoundFrequency[ right ] < currPri ) { currPri = compoundFrequency[ currMinLeft ] + compoundFrequency[ right ]; minLeft = currMinLeft; minRight = right; } if ( compoundFrequency[ right ] < compoundFrequency[ currMinLeft ] ) currMinLeft = right; } } while( ( removed[ right ] || internal[ right ] ) && right < last ); assert right == last || ( ! removed[ right ] && ! internal[ right ] ); assert left < right; } internal[ minLeft ] = true; removed[ minRight ] = true; n = new LevelNode(); n.left = node[ minLeft ]; n.right = node[ minRight ]; node[ minLeft ] = n; compoundFrequency[ minLeft ] += compoundFrequency[ minRight ]; } // Recursive marking markRec( node[ minLeft ], 0 ); // We now restart the aggregation process BooleanArrays.fill( removed, false ); System.arraycopy( externalNode, 0, node, 0, size ); int currLevel, leftLevel; first = 0; minLeft = 0; last = size - 1; for( int i = size; --i != 0; ) { while( removed[ first ] ) first++; while( removed[ last ] ) last--; left = first; currLevel = minLeft = minRight = -1; while( left < last ) { leftLevel = node[ left ].level; assert leftLevel > currLevel; for( right = left + 1; right <= last && removed[ right ]; right++ ); assert right <= last; assert ! removed[ right ]; if ( leftLevel == node[ right ].level ) { currLevel = leftLevel; minLeft = left; minRight = right; } do left++; while( left < last && ( removed[ left ] || node[ left ].level <= currLevel ) ); } removed[ minRight ] = true; n = new LevelNode(); n.left = node[ minLeft ]; n.right = node[ minRight ]; n.level = currLevel - 1; node[ minLeft ] = n; } return rebuildTree( node[ minLeft ] ); } /** We scan recursively the tree, making a copy that uses lightweight nodes. */ private TreeDecoder.Node rebuildTree( final LevelNode n ) { if ( n == null ) return null; if ( n.symbol != -1 ) return new TreeDecoder.LeafNode( n.symbol ); TreeDecoder.Node newNode = new TreeDecoder.Node(); newNode.left = rebuildTree( (LevelNode) n.left ); newNode.right = rebuildTree( (LevelNode) n.right ); return newNode; } /** Mark recursively the height of each node. */ private void markRec( final LevelNode n, final int height ) { if ( n == null ) return; n.level = height; markRec( (LevelNode) n.left, height + 1 ); markRec( (LevelNode) n.right, height + 1 ); } } --- NEW FILE: HuffmanCodec.java --- package it.unimi.dsi.mg4j.compression; /* * MG4J: Managing Gigabytes for Java * * Copyright (C) 2005-2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by the Free * Software Foundation; either version 2.1 of the License, or (at your option) * any later version. * * This library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * */ import it.unimi.dsi.fastutil.ints.IntIterators; import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue; import java.io.Serializable; /** An implementation of the Huffman optimal prefix-free code. * * <p>A Huffman coder is built given an array of frequencies corresponding to each * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code. */ public class HuffmanCodec extends PrefixCodec implements Serializable { private static final long serialVersionUID = 1L; /** Creates a new Huffman coder using the given vector of frequencies. * * @param frequency a vector of nonnnegative frequencies. */ public HuffmanCodec( final int[] frequency ) { super( frequency ); } protected TreeDecoder.Node initTree( final int[] frequency ) { final boolean[] internal = new boolean[ size ]; final boolean[] removed = new boolean[ size ]; final long[] compoundFrequency = new long[ size ]; final TreeDecoder.Node[] externalNode = new TreeDecoder.Node[ size ], node = new TreeDecoder.Node[ size ]; // We create a node for each symbol. for( int i = size; i-- != 0; ) { compoundFrequency[ i ] = frequency[ i ]; node[ i ] = externalNode[ i ] = new TreeDecoder.LeafNode( i ); } // We create a semi-indirect queue keeping track of the nodes with minimum compound frequency. final int[] a = new int[ size ]; IntIterators.unwrap( IntIterators.fromTo( 0, size ), a ); LongHeapSemiIndirectPriorityQueue queue = new LongHeapSemiIndirectPriorityQueue( compoundFrequency, a ); TreeDecoder.Node n; int min1, min2 = 0; for( int i = size; --i != 0; ) { min1 = queue.dequeue(); min2 = queue.first(); // We aggregate min1 and min2 into min2. internal[ min2 ] = true; removed[ min1 ] = true; n = new TreeDecoder.Node(); n.left = node[ min1 ]; n.right = node[ min2 ]; node[ min2 ] = n; compoundFrequency[ min2 ] += compoundFrequency[ min1 ]; queue.changed(); } return node[ min2 ]; } } |