001: package it.unimi.dsi.mg4j.compression;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Sebastiano Vigna
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023:
024: import java.io.Serializable;
025: import java.util.Arrays;
026:
027: import cern.colt.Sorting;
028: import cern.colt.bitvector.BitVector;
029: import cern.colt.function.IntComparator;
030:
031: /** An implementation of Huffman optimal prefix-free coding.
032: *
033: * <p>A Huffman coder is built starting from an array of frequencies corresponding to each
034: * symbol. Frequency 0 symbols are allowed, but they will degrade the resulting code.
035: *
036: * <p>Instances of this class compute a <em>canonical</em> Huffman code
037: * (Eugene S. Schwartz and Bruce Kallick, “Generating a Canonical Prefix Encoding”, <i>Commun. ACM</i> 7(3), pages 166−169, 1964), which can
038: * by {@linkplain CanonicalFast64CodeWordDecoder quickly decoded using table lookups}.
039: * The construction uses the most efficient one-pass in-place codelength computation procedure
040: * described by Alistair Moffat and Jyrki Katajainen in “In-Place Calculation of Minimum-Redundancy Codes”,
041: * <i>Algorithms and Data Structures, 4th International Workshop</i>,
042: * number 955 in Lecture Notes in Computer Science, pages 393−402, Springer-Verlag, 1995.
043: *
044: * <p>We note by passing that this coded uses a {@link CanonicalFast64CodeWordDecoder}, which does not support codelengths above 64.
045: * However, since the worst case for codelengths is given by Fibonacci numbers, and frequencies are to be provided as integers,
046: * no codeword longer than the base-[(5<sup>1/2</sup> + 1)/2] logarithm of 5<sup>1/2</sup> · 2<sup>31</sup> (less than 47) will ever be generated.
047: * @deprecated Moved to <code>dsiutils</code>.
048: */
049:
050: @Deprecated
051: public class HuffmanCodec implements PrefixCodec, Serializable {
052: private static final boolean DEBUG = false;
053: private static final boolean ASSERTS = false;
054: private static final long serialVersionUID = 2L;
055:
056: /** The number of symbols of this coder. */
057: public final int size;
058: /** The codewords for this coder. */
059: private final BitVector[] codeWord;
060: /** A cached singleton instance of the coder of this codec. */
061: private final Fast64CodeWordCoder coder;
062: /** A cached singleton instance of the decoder of this codec. */
063: private final CanonicalFast64CodeWordDecoder decoder;
064:
065: /** Creates a new Huffman codec using the given vector of frequencies.
066: *
067: * @param frequency a vector of nonnnegative frequencies.
068: */
069: public HuffmanCodec(final int[] frequency) {
070: size = frequency.length;
071:
072: if (size == 0 || size == 1) {
073: codeWord = new BitVector[size];
074: if (size == 1)
075: codeWord[0] = new BitVector(0);
076: coder = new Fast64CodeWordCoder(codeWord, new long[size]);
077: decoder = new CanonicalFast64CodeWordDecoder(new int[size],
078: new int[size]);
079: return;
080: }
081:
082: final long[] a = new long[size];
083: for (int i = size; i-- != 0;)
084: a[i] = frequency[i];
085: // Sort frequencies (this is the only n log n step).
086: Arrays.sort(a);
087:
088: // The following lines are from Moffat & Katajainen sample code. Please refer to their paper.
089:
090: // First pass, left to right, setting parent pointers.
091: a[0] += a[1];
092: int root = 0;
093: int leaf = 2;
094: for (int next = 1; next < size - 1; next++) {
095: // Select first item for a pairing.
096: if (leaf >= size || a[root] < a[leaf]) {
097: a[next] = a[root];
098: a[root++] = next;
099: } else
100: a[next] = a[leaf++];
101:
102: // Add on the second item.
103: if (leaf >= size || (root < next && a[root] < a[leaf])) {
104: a[next] += a[root];
105: a[root++] = next;
106: } else
107: a[next] += a[leaf++];
108: }
109:
110: // Second pass, right to left, setting internal depths.
111: a[size - 2] = 0;
112: for (int next = size - 3; next >= 0; next--)
113: a[next] = a[(int) a[next]] + 1;
114:
115: // Third pass, right to left, setting leaf depths.
116: int available = 1, used = 0, depth = 0;
117: root = size - 2;
118: int next = size - 1;
119: while (available > 0) {
120: while (root >= 0 && a[root] == depth) {
121: used++;
122: root--;
123: }
124: while (available > used) {
125: a[next--] = depth;
126: available--;
127: }
128: available = 2 * used;
129: depth++;
130: used = 0;
131: }
132:
133: // Reverse the order of symbol lengths, and store them into an int array.
134: final int[] length = new int[size];
135: for (int i = size; i-- != 0;)
136: length[i] = (int) a[size - 1 - i];
137:
138: // Sort symbols indices by decreasing frequencies (so symbols correspond to lengths).
139: final int[] symbol = new int[size];
140: for (int i = size; i-- != 0;)
141: symbol[i] = i;
142: Sorting.quickSort(symbol, 0, size, new IntComparator() {
143: public int compare(int x, int y) {
144: return frequency[y] - frequency[x];
145: }
146: });
147:
148: // Assign codewords (just for the coder--the decoder needs just the lengths).
149: int s = symbol[0];
150: int l = length[0];
151: long value = 0;
152: BitVector v;
153: codeWord = new BitVector[size];
154: final long[] longCodeWord = new long[size];
155: codeWord[s] = new BitVector(l);
156:
157: for (int i = 1; i < size; i++) {
158: s = symbol[i];
159: if (length[i] == l)
160: value++;
161: else {
162: value++;
163: value <<= length[i] - l;
164: if (ASSERTS)
165: assert length[i] > l;
166: l = length[i];
167: }
168: v = new BitVector(l);
169: for (int j = l; j-- != 0;)
170: if ((1L << j & value) != 0)
171: v.set(l - 1 - j);
172: codeWord[s] = v;
173: longCodeWord[s] = value;
174: }
175:
176: coder = new Fast64CodeWordCoder(codeWord, longCodeWord);
177: decoder = new CanonicalFast64CodeWordDecoder(length, symbol);
178:
179: if (DEBUG) {
180: final BitVector[] codeWord = codeWords();
181: System.err.println("Codes: ");
182: for (int i = 0; i < size; i++)
183: System.err.println(i + " (" + codeWord[i].size()
184: + " bits): " + codeWord[i]);
185:
186: long totFreq = 0;
187: for (int i = size; i-- != 0;)
188: totFreq += frequency[i];
189: long totBits = 0;
190: for (int i = size; i-- != 0;)
191: totBits += frequency[i] * codeWord[i].size();
192: System.err.println("Compression: " + totBits + " / "
193: + totFreq * Character.SIZE + " = "
194: + (double) totBits / (totFreq * Character.SIZE));
195: }
196: }
197:
198: public CodeWordCoder coder() {
199: return coder;
200: }
201:
202: public Decoder decoder() {
203: return decoder;
204: }
205:
206: public int size() {
207: return size;
208: }
209:
210: public BitVector[] codeWords() {
211: return coder.codeWords();
212: }
213:
214: @Deprecated
215: public PrefixCoder getCoder() {
216: return coder();
217: }
218:
219: @Deprecated
220: public Decoder getDecoder() {
221: return decoder();
222: }
223:
224: }
|