001: /*
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */
017: package org.apache.harmony.pack200;
018:
019: import java.io.IOException;
020: import java.io.InputStream;
021:
022: /**
023: * A codec allows a sequence of bytes to be decoded into integer values (or vice
024: * versa). It uses a variable-length encoding and a modified sign representation
025: * such that small numbers are represented as a single byte, whilst larger
026: * numbers take more bytes to encode. The number may be signed or unsigned; if
027: * it is unsigned, it can be weighted towards positive numbers or equally
028: * distributed using a one's complement. The codec also supports delta coding,
029: * where a sequence of numbers is represented as a series of first-order
030: * differences. So a delta encoding of the integers [1..10] would be represented
031: * as a sequence of 10x1s. This allows the absolute value of a coded integer to
032: * fall outside of the 'small number' range, whilst still being encoded as a
033: * single byte.
034: *
035: * A codec is configured with four parameters:
036: * <dl>
037: * <dt>B</dt>
038: * <dd>The maximum number of bytes that each value is encoded as. B must be a
039: * value between [1..5]. For a pass-through coding (where each byte is encoded
040: * as itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
041: * <dt>H</dt>
042: * <dd>The radix of the integer. Values are defined as a sequence of values,
043: * where value <code>n</code> is multiplied by <code>H^<sup>n</sup></code>.
044: * So the number 1234 may be represented as the sequence 4 3 2 1 with a radix
045: * (H) of 10. Note that other permutations are also possible; 43 2 1 will also
046: * encode 1234. The co-parameter L is defined as 256-H. This is important
047: * because only the last value in a sequence may be < L; all prior values
048: * must be > L.</dd>
049: * <dt>S</dt>
050: * <dd>Whether the codec represents signed values (or not). This may have 3
051: * values; 0 (unsigned), 1 (signed, ones complement) or 2 (signed, but not sure
052: * what the difference is) TODO Update documentation when I know what the
053: * difference is</dd>
054: * <dt>D</dt>
055: * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta)
056: * or 1 (delta encoding). A delta encoding of 1 indicates that values are
057: * cumulative; a sequence of <code>1 1 1 1 1</code> will represent the
058: * sequence <code>1 2 3 4 5</code>. For this reason, the codec supports two
059: * variants of decode; one {@link #decode(InputStream, long) with} and one
060: * {@link #decode(InputStream) without} a <code>last</code> parameter. If the
061: * codec is a non-delta encoding, then the value is ignored if passed. If the
062: * codec is a delta encoding, it is a run-time error to call the value without
063: * the extra parameter, and the previous value should be returned. (It was
064: * designed this way to support multi-threaded access without requiring a new
065: * instance of the Codec to be cloned for each use.)
066: * <dt>
067: * </dl>
068: *
069: * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero.
070: * Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The
071: * {@link #toString()} method prints out the condensed form of the encoding.
072: * Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5})
073: * gives a clue as to the B value. Those that start with U ({@link #UDELTA5},
074: * {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed.
075: * The presence of the word Delta ({@link #DELTA5}, {@link #UDELTA5})
076: * indicates a delta encoding is used.
077: *
078: * This codec is really quite cool for storing compressed information, and could
079: * be used entirely separately from the Pack200 implementation for efficient
080: * transfer of integer data if required.
081: *
082: * Note that all information is byte-oriented; for decoding float/double
083: * information, the bit values are converted (not cast) into a long type. Note
084: * that long values are used throughout even though most may be cast to ints;
085: * this is primarily to avoid having to worry about signed values, even if it
086: * would be more efficient to do so.
087: *
088: * There are a number of standard codecs ({@link #UDELTA5}, {@link #UNSIGNED5},
089: * {@link #BYTE1}, {@link #CHAR3}) that are used in the implementation of many
090: * bands; but there are a variety of other ones, and indeed the specification
091: * assumes that other combinations of values can result in more specific and
092: * efficient formats. There are also a sequence of canonical encodings defined
093: * by the Pack200 specification, which allow a codec to be referred to by
094: * canonical number. {@link CodecEncoding#canonicalCodec})
095: */
096: public abstract class Codec {
097: /**
098: * BCI5 = (5,4): Used for storing branching information in bytecode.
099: */
100: public static final BHSDCodec BCI5 = new BHSDCodec(5, 4);
101:
102: /**
103: * BRANCH5 = (5,4,2): Used for storing branching information in bytecode.
104: */
105: public static final BHSDCodec BRANCH5 = new BHSDCodec(5, 4, 2);
106:
107: /**
108: * BYTE1 = (1,256): Used for storing plain bytes.
109: */
110: public static final BHSDCodec BYTE1 = new BHSDCodec(1, 256);
111:
112: /**
113: * CHAR3 = (3,128): Used for storing text (UTF-8) strings. NB This isn't
114: * quite the same as UTF-8, but has similar properties; ASCII characters
115: * < 127 are stored in a single byte.
116: */
117: public static final BHSDCodec CHAR3 = new BHSDCodec(3, 128);
118:
119: /**
120: * DELTA5 = (5,64,1,1): Used for the majority of numerical codings where
121: * there is a correlated sequence of signed values.
122: */
123: public static final BHSDCodec DELTA5 = new BHSDCodec(5, 64, 1, 1);
124:
125: /**
126: * MDELTA5 = (5,64,2,1): Used for the majority of numerical codings where
127: * there is a correlated sequence of signed values, but where most of them
128: * are expected to be non-negative.
129: */
130: public static final BHSDCodec MDELTA5 = new BHSDCodec(5, 64, 2, 1);
131:
132: /**
133: * SIGNED5 = (5,64,1): Used for small signed values.
134: */
135: public static final BHSDCodec SIGNED5 = new BHSDCodec(5, 64, 1);
136:
137: /**
138: * UDELTA5 = (5,64,0,1): Used for the majority of numerical codings where
139: * there is a correlated sequence of unsigned values.
140: */
141: public static final BHSDCodec UDELTA5 = new BHSDCodec(5, 64, 0, 1);
142:
143: /**
144: * UNSIGNED5 = (5,64): Used for small unsigned values.
145: */
146: public static final BHSDCodec UNSIGNED5 = new BHSDCodec(5, 64);
147:
148: /**
149: * Decode a sequence of bytes from the given input stream, returning the
150: * value as a long. Note that this method can only be applied for non-delta
151: * encodings.
152: *
153: * @param in
154: * the input stream to read from
155: * @return the value as a long
156: * @throws IOException
157: * if there is a problem reading from the underlying input
158: * stream
159: * @throws Pack200Exception
160: * if the encoding is a delta encoding
161: */
162: public abstract long decode(InputStream in) throws IOException,
163: Pack200Exception;
164:
165: /**
166: * Decode a sequence of bytes from the given input stream, returning the
167: * value as a long. If this encoding is a delta encoding (d=1) then the
168: * previous value must be passed in as a parameter. If it is a non-delta
169: * encoding, then it does not matter what value is passed in, so it makes
170: * sense for the value to be passed in by default using code similar to:
171: *
172: * <pre>
173: * long last = 0;
174: * while (condition) {
175: * last = codec.decode(in, last);
176: * // do something with last
177: * }
178: * </pre>
179: *
180: * @param in
181: * the input stream to read from
182: * @param last
183: * the previous value read, which must be supplied if the codec
184: * is a delta encoding
185: * @return the value as a long
186: * @throws IOException
187: * if there is a problem reading from the underlying input
188: * stream
189: * @throws Pack200Exception
190: * if there is a problem decoding the value or that the value is
191: * invalid
192: */
193: public abstract long decode(InputStream in, long last)
194: throws IOException, Pack200Exception;
195:
196: /**
197: * Decodes a sequence of <code>n</code> values from <code>in</code>.
198: * This should probably be used in most cases, since some codecs
199: * (such as @{link PopCodec}) only work when the number of values
200: * to be read is known.
201: *
202: * @param n
203: * the number of values to decode
204: * @param in
205: * the input stream to read from
206: * @return an array of <code>long</code> values corresponding to values
207: * decoded
208: * @throws IOException
209: * if there is a problem reading from the underlying input
210: * stream
211: * @throws Pack200Exception
212: * if there is a problem decoding the value or that the value is
213: * invalid
214: */
215: public long[] decode(int n, InputStream in) throws IOException,
216: Pack200Exception {
217: long result[] = new long[n];
218: long last = 0;
219: for (int i = 0; i < n; i++) {
220: result[i] = last = decode(in, last);
221: }
222: return result;
223: }
224:
225: /**
226: * Decodes a sequence of <code>n</code> values from <code>in</code>.
227: *
228: * @param n
229: * the number of values to decode
230: * @param in
231: * the input stream to read from
232: * @param firstValue
233: * the first value in the band if it has already been read
234: * @return an array of <code>long</code> values corresponding to values
235: * decoded, with firstValue as the first value in the array.
236: * @throws IOException
237: * if there is a problem reading from the underlying input
238: * stream
239: * @throws Pack200Exception
240: * if there is a problem decoding the value or that the value is
241: * invalid
242: */
243: public long[] decode(int n, InputStream in, long firstValue)
244: throws IOException, Pack200Exception {
245: long result[] = new long[n + 1];
246: result[0] = firstValue;
247: long last = firstValue;
248: for (int i = 1; i < n + 1; i++) {
249: result[i] = last = decode(in, last);
250: }
251: return result;
252: }
253: }
|