001: /* ====================================================================
002: Licensed to the Apache Software Foundation (ASF) under one or more
003: contributor license agreements. See the NOTICE file distributed with
004: this work for additional information regarding copyright ownership.
005: The ASF licenses this file to You under the Apache License, Version 2.0
006: (the "License"); you may not use this file except in compliance with
007: the License. You may obtain a copy of the License at
008:
009: http://www.apache.org/licenses/LICENSE-2.0
010:
011: Unless required by applicable law or agreed to in writing, software
012: distributed under the License is distributed on an "AS IS" BASIS,
013: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: See the License for the specific language governing permissions and
015: limitations under the License.
016: ==================================================================== */
017: package org.apache.poi.hdgf;
018:
019: import java.io.ByteArrayOutputStream;
020: import java.io.IOException;
021: import java.io.InputStream;
022: import java.io.OutputStream;
023:
024: /**
025: * A decoder for the crazy LZW implementation used
026: * in Visio.
027: * According to VSDump, "it's a slightly perverted version of LZW
028: * compression, with inverted meaning of flag byte and 0xFEE as an
029: * 'initial shift'". It uses 12 bit codes
030: * (http://www.gnome.ru/projects/vsdump_en.html)
031: *
032: * Two good resources on LZW are:
033: * http://en.wikipedia.org/wiki/LZW
034: * http://marknelson.us/1989/10/01/lzw-data-compression/
035: */
036: public class HDGFLZW {
037:
038: /**
039: * Given an integer, turn it into a java byte, handling
040: * the wrapping.
041: * This is a convenience method
042: */
043: public static byte fromInt(int b) {
044: if (b < 128)
045: return (byte) b;
046: return (byte) (b - 256);
047: }
048:
049: /**
050: * Given a java byte, turn it into an integer between 0
051: * and 255 (i.e. handle the unwrapping).
052: * This is a convenience method
053: */
054: public static int fromByte(byte b) {
055: if (b >= 0)
056: return (int) b;
057: return (int) (b + 256);
058: }
059:
060: /**
061: * Compress the given input stream, returning the array of bytes
062: * of the compressed input
063: */
064: public byte[] compress(InputStream src) throws IOException {
065: ByteArrayOutputStream res = new ByteArrayOutputStream();
066: compress(src, res);
067: return res.toByteArray();
068: }
069:
070: /**
071: * Decompresses the given input stream, returning the array of bytes
072: * of the decompressed input.
073: */
074: public byte[] decode(InputStream src) throws IOException {
075: ByteArrayOutputStream res = new ByteArrayOutputStream();
076: decode(src, res);
077: return res.toByteArray();
078: }
079:
080: /**
081: * Perform a streaming decompression of the input.
082: * Works by:
083: * 1) Reading a flag byte, the 8 bits of which tell you if the
084: * following 8 codes are compressed our un-compressed
085: * 2) Consider the 8 bits in turn
086: * 3) If the bit is set, the next code is un-compressed, so
087: * add it to the dictionary and output it
088: * 4) If the bit isn't set, then read in the length and start
089: * position in the dictionary, and output the bytes there
090: * 5) Loop until we've done all 8 bits, then read in the next
091: * flag byte
092: */
093: public void decode(InputStream src, OutputStream res)
094: throws IOException {
095: // We use 12 bit codes:
096: // * 0-255 are real bytes
097: // * 256-4095 are the substring codes
098: // Java handily initialises our buffer / dictionary
099: // to all zeros
100: byte[] buffer = new byte[4096];
101:
102: // How far through the output we've got
103: // (This is normally used &4095, so it nicely wraps)
104: int pos = 0;
105: // The flag byte is treated as its 8 individual
106: // bits, which tell us if the following 8 codes
107: // are compressed or un-compressed
108: int flag;
109: // The mask, between 1 and 255, which is used when
110: // processing each bit of the flag byte in turn
111: int mask;
112:
113: // This is a byte as looked up in the dictionary
114: // It needs to be signed, as it'll get passed on to
115: // the output stream
116: byte dataB;
117: // This is an unsigned byte read from the stream
118: // It needs to be unsigned, so that bit stuff works
119: int dataI;
120: // The compressed code sequence is held over 2 bytes
121: int dataIPt1, dataIPt2;
122: // How long a code sequence is, and where in the
123: // dictionary to start at
124: int len, pntr;
125:
126: while ((flag = src.read()) != -1) {
127: // Compare each bit in our flag byte in turn:
128: for (mask = 1; mask < 256; mask <<= 1) {
129: // Is this a new code (un-compressed), or
130: // the use of existing codes (compressed)?
131: if ((flag & mask) > 0) {
132: // Retrieve the un-compressed code
133: if ((dataI = src.read()) != -1) {
134: // Save the byte into the dictionary
135: buffer[(pos & 4095)] = fromInt(dataI);
136: pos++;
137: // And output the byte
138: res.write(new byte[] { fromInt(dataI) });
139: }
140: } else {
141: // We have a compressed sequence
142: // Grab the next 16 bits of data
143: dataIPt1 = src.read();
144: dataIPt2 = src.read();
145: if (dataIPt1 == -1 || dataIPt2 == -1)
146: break;
147:
148: // Build up how long the code sequence is, and
149: // what position of the code to start at
150: // (The position is the first 12 bits, the
151: // length is the last 4 bits)
152: len = (dataIPt2 & 15) + 3;
153: pntr = (dataIPt2 & 240) * 16 + dataIPt1;
154:
155: // If the pointer happens to be passed the end
156: // of our buffer, then wrap around
157: if (pntr > 4078) {
158: pntr = pntr - 4078;
159: } else {
160: pntr = pntr + 18;
161: }
162:
163: // Loop over the codes, outputting what they correspond to
164: for (int i = 0; i < len; i++) {
165: buffer[(pos + i) & 4095] = buffer[(pntr + i) & 4095];
166: dataB = buffer[(pntr + i) & 4095];
167: res.write(new byte[] { dataB });
168: }
169:
170: // Record how far along the stream we have moved
171: pos = pos + len;
172: }
173: }
174: }
175: }
176:
177: /**
178: * Performs the Visio compatible streaming LZW compression.
179: * TODO - Finish
180: */
181: public void compress(InputStream src, OutputStream res)
182: throws IOException {
183: Compressor c = new Compressor();
184: c.compress(src, res);
185: }
186:
187: /**
188: * Helper class to handle the Visio compatible
189: * streaming LZW compression.
190: * Need our own class to handle keeping track of the
191: * code buffer, pending bytes to write out etc.
192: */
193: private class Compressor {
194: // We use 12 bit codes:
195: // * 0-255 are real bytes
196: // * 256-4095 are the substring codes
197: // Java handily initialises our buffer / dictionary
198: // to all zeros
199: byte[] dict = new byte[4096];
200:
201: // The next block of data to be written out, minus
202: // its mask byte
203: byte[] buffer = new byte[16];
204: // And how long it is
205: // (Un-compressed codes are 1 byte each, compressed codes
206: // are two)
207: int bufferLen = 0;
208:
209: // The raw length of a code is limited to 4 bits
210: byte[] rawCode = new byte[16];
211: // And how much we're using
212: int rawCodeLen = 0;
213:
214: // How far through the input and output streams we are
215: int posInp = 0;
216: int posOut = 0;
217:
218: // What the next mask byte to output will be
219: int nextMask = 0;
220: // And how many bits we've already set
221: int maskBitsSet = 0;
222:
223: /**
224: * Returns the last place that the bytes from rawCode are found
225: * at in the buffer, or -1 if they can't be found
226: */
227: private int findRawCodeInBuffer() {
228: // Work our way back from the end
229: // (Visio always seems to use the last possible code)
230: for (int i = (buffer.length - rawCodeLen); i >= 0; i--) {
231: boolean matches = true;
232: for (int j = 0; matches && j < rawCodeLen; j++) {
233: if (buffer[i] == rawCode[j]) {
234: // Fits
235: } else {
236: // Doesn't fit, can't be a match
237: matches = false;
238: }
239: }
240:
241: // Was this position a match?
242: if (matches) {
243: return i;
244: }
245: }
246:
247: // Not found
248: return -1;
249: }
250:
251: /**
252: * Output the compressed representation for the bytes
253: * found in rawCode
254: */
255: private void outputCompressed(OutputStream res)
256: throws IOException {
257: // It's not worth compressing only 1 or two bytes,
258: // due to the overheads
259: // So if asked, just output uncompressed
260: if (rawCodeLen < 3) {
261: for (int i = 0; i < rawCodeLen; i++) {
262: outputUncompressed(rawCode[i], res);
263: }
264: return;
265: }
266:
267: // Increment the mask bit count, we've done another code
268: maskBitsSet++;
269: // Add the length+code to the buffer
270: // TODO
271: posOut += 2;
272:
273: // If we're now at 8 codes, output
274: if (maskBitsSet == 8) {
275: output8Codes(res);
276: }
277: }
278:
279: /**
280: * Output the un-compressed byte
281: */
282: private void outputUncompressed(byte b, OutputStream res)
283: throws IOException {
284: // Set the mask bit for us
285: nextMask += (1 << maskBitsSet);
286:
287: // And add us to the buffer + dictionary
288: buffer[bufferLen] = fromInt(b);
289: bufferLen++;
290: dict[(posOut & 4095)] = fromInt(b);
291: posOut++;
292:
293: // If we're now at 8 codes, output
294: if (maskBitsSet == 8) {
295: output8Codes(res);
296: }
297: }
298:
299: /**
300: * We've got 8 code worth to write out, so
301: * output along with the header
302: */
303: private void output8Codes(OutputStream res) throws IOException {
304: // Output the mask and the data
305: res.write(new byte[] { fromInt(nextMask) });
306: res.write(buffer, 0, bufferLen);
307:
308: // Reset things
309: nextMask = 0;
310: maskBitsSet = 0;
311: bufferLen = 0;
312: }
313:
314: /**
315: * Does the compression
316: */
317: private void compress(InputStream src, OutputStream res)
318: throws IOException {
319: // Have we hit the end of the file yet?
320: boolean going = true;
321:
322: // This is a byte as looked up in the dictionary
323: // It needs to be signed, as it'll get passed on to
324: // the output stream
325: byte dataB;
326: // This is an unsigned byte read from the stream
327: // It needs to be unsigned, so that bit stuff works
328: int dataI;
329:
330: while (going) {
331: dataI = src.read();
332: posInp++;
333: if (dataI == -1) {
334: going = false;
335: }
336: dataB = fromInt(dataI);
337:
338: // If we've run out of data, output anything that's
339: // pending then finish
340: if (!going && rawCodeLen > 0) {
341: outputCompressed(res);
342: break;
343: }
344:
345: // Try adding this new byte onto rawCode, and
346: // see if all of that is still found in the
347: // buffer dictionary or not
348: rawCode[rawCodeLen] = dataB;
349: rawCodeLen++;
350: int rawAt = findRawCodeInBuffer();
351:
352: // If we found it and are now at 16 bytes,
353: // we need to output our pending code block
354: if (rawCodeLen == 16 && rawAt > -1) {
355: outputCompressed(res);
356: rawCodeLen = 0;
357: continue;
358: }
359:
360: // If we did find all of rawCode with our new
361: // byte added on, we can wait to see what happens
362: // with the next byte
363: if (rawAt > -1) {
364: continue;
365: }
366:
367: // If there was something in rawCode before, then we
368: // need to output that
369: rawCodeLen--;
370: if (rawCodeLen > 0) {
371: // Output the old rawCode
372: outputCompressed(res);
373:
374: // Can this byte start a new rawCode, or does
375: // it need outputting itself?
376: rawCode[0] = dataB;
377: rawCodeLen = 1;
378: if (findRawCodeInBuffer() > -1) {
379: // Fits in, wait for next byte
380: continue;
381: } else {
382: // Doesn't fit, output
383: outputUncompressed(dataB, res);
384: rawCodeLen = 0;
385: }
386: } else {
387: // Nothing in rawCode before, so this byte
388: // isn't in the buffer dictionary
389: // Output it un-compressed
390: outputUncompressed(dataB, res);
391: }
392: }
393: }
394: }
395:
396: }
|