View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.commons.compress.harmony.pack200;
20  
21  import java.io.EOFException;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.util.ArrayList;
25  import java.util.Arrays;
26  import java.util.List;
27  
28  import org.apache.commons.compress.utils.ExactMath;
29  
30  /**
31   * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
32   * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
33   * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
34   * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
35   * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
36   * range, whilst still being encoded as a single byte.
37   * <p>
38   * A BHSD codec is configured with four parameters:
39   * </p>
40   * <dl>
41   * <dt>B</dt>
42   * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
43   * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
44   * <dt>H</dt>
45   * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234
46   * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
47   * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be &lt; L; all prior values must be &gt; L.</dd>
48   * <dt>S</dt>
49   * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
50   * complement)</dd>
51   * <dt>D</dt>
52   * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
53   * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
54   * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
55   * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
56   * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd>
57   * </dl>
58   * <p>
59   * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
60   * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
61   * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
62   * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
63   * </p>
64   */
65  public final class BHSDCodec extends Codec {
66  
67      /**
68       * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
69       * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
70       */
71      private final int b;
72  
73      /**
74       * Whether delta encoding is used (0=false,1=true).
75       */
76      private final int d;
77  
78      /**
79       * The radix of the encoding.
80       */
81      private final int h;
82  
83      /**
84       * The co-parameter of h; 256-h.
85       */
86      private final int l;
87  
88      /**
89       * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
90       */
91      private final int s;
92  
93      private long cardinality;
94  
95      private final long smallest;
96  
97      private final long largest;
98  
99      /**
100      * radix^i powers
101      */
102     private final long[] powers;
103 
104     /**
105      * Constructs an unsigned, non-delta Codec with the given B and H values.
106      *
107      * @param b the maximum number of bytes that a value can be encoded as [1..5]
108      * @param h the radix of the encoding [1..256]
109      */
110     public BHSDCodec(final int b, final int h) {
111         this(b, h, 0, 0);
112     }
113 
114     /**
115      * Constructs a non-delta Codec with the given B, H and S values.
116      *
117      * @param b the maximum number of bytes that a value can be encoded as [1..5]
118      * @param h the radix of the encoding [1..256]
119      * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
120      */
121     public BHSDCodec(final int b, final int h, final int s) {
122         this(b, h, s, 0);
123     }
124 
125     /**
126      * Constructs a Codec with the given B, H, S and D values.
127      *
128      * @param b the maximum number of bytes that a value can be encoded as [1..5]
129      * @param h the radix of the encoding [1..256]
130      * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
131      * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
132      */
133     public BHSDCodec(final int b, final int h, final int s, final int d) {
134         if (b < 1 || b > 5) {
135             throw new IllegalArgumentException("1<=b<=5");
136         }
137         if (h < 1 || h > 256) {
138             throw new IllegalArgumentException("1<=h<=256");
139         }
140         if (s < 0 || s > 2) {
141             throw new IllegalArgumentException("0<=s<=2");
142         }
143         if (d < 0 || d > 1) {
144             throw new IllegalArgumentException("0<=d<=1");
145         }
146         if (b == 1 && h != 256) {
147             throw new IllegalArgumentException("b=1 -> h=256");
148         }
149         if (h == 256 && b == 5) {
150             throw new IllegalArgumentException("h=256 -> b!=5");
151         }
152         this.b = b;
153         this.h = h;
154         this.s = s;
155         this.d = d;
156         this.l = 256 - h;
157         if (h == 1) {
158             cardinality = b * 255 + 1;
159         } else {
160             cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
161         }
162         smallest = calculateSmallest();
163         largest = calculateLargest();
164 
165         powers = new long[b];
166         Arrays.setAll(powers, c -> (long) Math.pow(h, c));
167     }
168 
169     private long calculateLargest() {
170         final long result;
171         // TODO This can probably be optimized into a better mathematical statement
172         if (d == 1) {
173             return new BHSDCodec(b, h).largest();
174         }
175         switch (s) {
176         case 0:
177             result = cardinality() - 1;
178             break;
179         case 1:
180             result = cardinality() / 2 - 1;
181             break;
182         case 2:
183             result = 3L * cardinality() / 4 - 1;
184             break;
185         default:
186             throw new Error("Unknown s value");
187         }
188         return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
189     }
190 
191     private long calculateSmallest() {
192         final long result;
193         if (d == 1 || !isSigned()) {
194             if (cardinality >= 4294967296L) { // 2^32
195                 result = Integer.MIN_VALUE;
196             } else {
197                 result = 0;
198             }
199         } else {
200             result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
201         }
202         return result;
203     }
204 
205     /**
206      * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
207      *
208      * @return the cardinality of this codec
209      */
210     public long cardinality() {
211         return cardinality;
212     }
213 
214     @Override
215     public int decode(final InputStream in) throws IOException, Pack200Exception {
216         if (d != 0) {
217             throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
218         }
219         return decode(in, 0);
220     }
221 
222     @Override
223     public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
224         int n = 0;
225         long z = 0;
226         long x = 0;
227 
228         do {
229             x = in.read();
230             lastBandLength++;
231             z += x * powers[n];
232             n++;
233         } while (x >= l && n < b);
234 
235         if (x == -1) {
236             throw new EOFException("End of stream reached whilst decoding");
237         }
238 
239         if (isSigned()) {
240             final int u = (1 << s) - 1;
241             if ((z & u) == u) {
242                 z = z >>> s ^ -1L;
243             } else {
244                 z -= z >>> s;
245             }
246         }
247         // This algorithm does the same thing, but is probably slower. Leaving
248         // in for now for readability
249         // if (isSigned()) {
250         // long u = z;
251         // long twoPowS = (long) Math.pow(2, s);
252         // double twoPowSMinusOne = twoPowS-1;
253         // if (u % twoPowS < twoPowSMinusOne) {
254         // if (cardinality < Math.pow(2, 32)) {
255         // z = (long) (u - (Math.floor(u/ twoPowS)));
256         // } else {
257         // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
258         // }
259         // } else {
260         // z = (long) (-Math.floor(u/ twoPowS) - 1);
261         // }
262         // }
263         if (isDelta()) {
264             z += last;
265         }
266         return (int) z;
267     }
268 
269     // private long cast32(long u) {
270     // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
271     // Math.pow(2, 31));
272     // return u;
273     // }
274 
275     @Override
276     public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
277         final int[] band = super.decodeInts(n, in);
278         if (isDelta()) {
279             for (int i = 0; i < band.length; i++) {
280                 while (band[i] > largest) {
281                     band[i] -= cardinality;
282                 }
283                 while (band[i] < smallest) {
284                     band[i] = ExactMath.add(band[i], cardinality);
285                 }
286             }
287         }
288         return band;
289     }
290 
291     @Override
292     public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
293         final int[] band = super.decodeInts(n, in, firstValue);
294         if (isDelta()) {
295             for (int i = 0; i < band.length; i++) {
296                 while (band[i] > largest) {
297                     band[i] -= cardinality;
298                 }
299                 while (band[i] < smallest) {
300                     band[i] = ExactMath.add(band[i], cardinality);
301                 }
302             }
303         }
304         return band;
305     }
306 
307     @Override
308     public byte[] encode(final int value) throws Pack200Exception {
309         return encode(value, 0);
310     }
311 
312     @Override
313     public byte[] encode(final int value, final int last) throws Pack200Exception {
314         if (!encodes(value)) {
315             throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
316         }
317 
318         long z = value;
319         if (isDelta()) {
320             z -= last;
321         }
322         if (isSigned()) {
323             if (z < Integer.MIN_VALUE) {
324                 z += 4294967296L;
325             } else if (z > Integer.MAX_VALUE) {
326                 z -= 4294967296L;
327             }
328             if (z < 0) {
329                 z = (-z << s) - 1;
330             } else if (s == 1) {
331                 z = z << s;
332             } else {
333                 z += (z - z % 3) / 3;
334             }
335         } else if (z < 0) {
336             // Need to use integer overflow here to represent negatives.
337             // 4294967296L is the 1 << 32.
338             z += Math.min(cardinality, 4294967296L);
339         }
340         if (z < 0) {
341             throw new Pack200Exception("unable to encode");
342         }
343 
344         final List<Byte> byteList = new ArrayList<>();
345         for (int n = 0; n < b; n++) {
346             long byteN;
347             if (z < l) {
348                 byteN = z;
349             } else {
350                 byteN = z % h;
351                 while (byteN < l) {
352                     byteN += h;
353                 }
354             }
355             byteList.add(Byte.valueOf((byte) byteN));
356             if (byteN < l) {
357                 break;
358             }
359             z -= byteN;
360             z /= h;
361         }
362         final byte[] bytes = new byte[byteList.size()];
363         for (int i = 0; i < bytes.length; i++) {
364             bytes[i] = byteList.get(i).byteValue();
365         }
366         return bytes;
367     }
368 
369     /**
370      * True if this encoding can code the given value
371      *
372      * @param value the value to check
373      * @return {@code true} if the encoding can encode this value
374      */
375     public boolean encodes(final long value) {
376         return value >= smallest && value <= largest;
377     }
378 
379     @Override
380     public boolean equals(final Object o) {
381         if (o instanceof BHSDCodec) {
382             final BHSDCodec codec = (BHSDCodec) o;
383             return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
384         }
385         return false;
386     }
387 
388     /**
389      * Gets the B.
390      *
391      * @return the b
392      */
393     public int getB() {
394         return b;
395     }
396 
397     /**
398      * Gets the H.
399      *
400      * @return the h
401      */
402     public int getH() {
403         return h;
404     }
405 
406     /**
407      * Gets the L.
408      *
409      * @return the l
410      */
411     public int getL() {
412         return l;
413     }
414 
415     /**
416      * Gets the S.
417      *
418      * @return the s
419      */
420     public int getS() {
421         return s;
422     }
423 
424     @Override
425     public int hashCode() {
426         return ((b * 37 + h) * 37 + s) * 37 + d;
427     }
428 
429     /**
430      * Returns true if this codec is a delta codec
431      *
432      * @return true if this codec is a delta codec
433      */
434     public boolean isDelta() {
435         return d != 0;
436     }
437 
438     /**
439      * Returns true if this codec is a signed codec
440      *
441      * @return true if this codec is a signed codec
442      */
443     public boolean isSigned() {
444         return s != 0;
445     }
446 
447     /**
448      * Returns the largest value that this codec can represent.
449      *
450      * @return the largest value that this codec can represent.
451      */
452     public long largest() {
453         return largest;
454     }
455 
456     /**
457      * Returns the smallest value that this codec can represent.
458      *
459      * @return the smallest value that this codec can represent.
460      */
461     public long smallest() {
462         return smallest;
463     }
464 
465     /**
466      * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
467      */
468     @Override
469     public String toString() {
470         final StringBuilder buffer = new StringBuilder(11);
471         buffer.append('(');
472         buffer.append(b);
473         buffer.append(',');
474         buffer.append(h);
475         if (s != 0 || d != 0) {
476             buffer.append(',');
477             buffer.append(s);
478         }
479         if (d != 0) {
480             buffer.append(',');
481             buffer.append(d);
482         }
483         buffer.append(')');
484         return buffer.toString();
485     }
486 }