001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one or more
003 *  contributor license agreements.  See the NOTICE file distributed with
004 *  this work for additional information regarding copyright ownership.
005 *  The ASF licenses this file to You under the Apache License, Version 2.0
006 *  (the "License"); you may not use this file except in compliance with
007 *  the License.  You may obtain a copy of the License at
008 *
009 *     http://www.apache.org/licenses/LICENSE-2.0
010 *
011 *  Unless required by applicable law or agreed to in writing, software
012 *  distributed under the License is distributed on an "AS IS" BASIS,
013 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 *  See the License for the specific language governing permissions and
015 *  limitations under the License.
016 */
017package org.apache.commons.compress.harmony.pack200;
018
019import java.io.EOFException;
020import java.io.IOException;
021import java.io.InputStream;
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.List;
025
026import org.apache.commons.compress.utils.ExactMath;
027
028/**
029 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
030 * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
031 * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
032 * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
033 * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
034 * range, whilst still being encoded as a single byte.
035 * <p>
036 * A BHSD codec is configured with four parameters:
037 * </p>
038 * <dl>
039 * <dt>B</dt>
040 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
041 * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
042 * <dt>H</dt>
043 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234
044 * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
045 * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be &lt; L; all prior values must be &gt; L.</dd>
046 * <dt>S</dt>
047 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
048 * complement)</dd>
049 * <dt>D</dt>
050 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
051 * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
052 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
053 * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
054 * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd>
055 * </dl>
056 * <p>
057 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
058 * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
059 * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
060 * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
061 * </p>
062 */
063public final class BHSDCodec extends Codec {
064
065    /**
066     * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
067     * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
068     */
069    private final int b;
070
071    /**
072     * Whether delta encoding is used (0=false,1=true).
073     */
074    private final int d;
075
076    /**
077     * The radix of the encoding.
078     */
079    private final int h;
080
081    /**
082     * The co-parameter of h; 256-h.
083     */
084    private final int l;
085
086    /**
087     * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
088     */
089    private final int s;
090
091    private long cardinality;
092
093    private final long smallest;
094
095    private final long largest;
096
097    /**
098     * radix^i powers
099     */
100    private final long[] powers;
101
102    /**
103     * Constructs an unsigned, non-delta Codec with the given B and H values.
104     *
105     * @param b the maximum number of bytes that a value can be encoded as [1..5]
106     * @param h the radix of the encoding [1..256]
107     */
108    public BHSDCodec(final int b, final int h) {
109        this(b, h, 0, 0);
110    }
111
112    /**
113     * Constructs a non-delta Codec with the given B, H and S values.
114     *
115     * @param b the maximum number of bytes that a value can be encoded as [1..5]
116     * @param h the radix of the encoding [1..256]
117     * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
118     */
119    public BHSDCodec(final int b, final int h, final int s) {
120        this(b, h, s, 0);
121    }
122
123    /**
124     * Constructs a Codec with the given B, H, S and D values.
125     *
126     * @param b the maximum number of bytes that a value can be encoded as [1..5]
127     * @param h the radix of the encoding [1..256]
128     * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
129     * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
130     */
131    public BHSDCodec(final int b, final int h, final int s, final int d) {
132        if (b < 1 || b > 5) {
133            throw new IllegalArgumentException("1<=b<=5");
134        }
135        if (h < 1 || h > 256) {
136            throw new IllegalArgumentException("1<=h<=256");
137        }
138        if (s < 0 || s > 2) {
139            throw new IllegalArgumentException("0<=s<=2");
140        }
141        if (d < 0 || d > 1) {
142            throw new IllegalArgumentException("0<=d<=1");
143        }
144        if (b == 1 && h != 256) {
145            throw new IllegalArgumentException("b=1 -> h=256");
146        }
147        if (h == 256 && b == 5) {
148            throw new IllegalArgumentException("h=256 -> b!=5");
149        }
150        this.b = b;
151        this.h = h;
152        this.s = s;
153        this.d = d;
154        this.l = 256 - h;
155        if (h == 1) {
156            cardinality = b * 255 + 1;
157        } else {
158            cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
159        }
160        smallest = calculateSmallest();
161        largest = calculateLargest();
162
163        powers = new long[b];
164        Arrays.setAll(powers, c -> (long) Math.pow(h, c));
165    }
166
167    private long calculateLargest() {
168        long result;
169        // TODO This can probably be optimized into a better mathematical statement
170        if (d == 1) {
171            return new BHSDCodec(b, h).largest();
172        }
173        switch (s) {
174        case 0:
175            result = cardinality() - 1;
176            break;
177        case 1:
178            result = cardinality() / 2 - 1;
179            break;
180        case 2:
181            result = 3L * cardinality() / 4 - 1;
182            break;
183        default:
184            throw new Error("Unknown s value");
185        }
186        return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
187    }
188
189    private long calculateSmallest() {
190        long result;
191        if (d == 1 || !isSigned()) {
192            if (cardinality >= 4294967296L) { // 2^32
193                result = Integer.MIN_VALUE;
194            } else {
195                result = 0;
196            }
197        } else {
198            result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
199        }
200        return result;
201    }
202
203    /**
204     * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
205     *
206     * @return the cardinality of this codec
207     */
208    public long cardinality() {
209        return cardinality;
210    }
211
212    @Override
213    public int decode(final InputStream in) throws IOException, Pack200Exception {
214        if (d != 0) {
215            throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
216        }
217        return decode(in, 0);
218    }
219
220    @Override
221    public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
222        int n = 0;
223        long z = 0;
224        long x = 0;
225
226        do {
227            x = in.read();
228            lastBandLength++;
229            z += x * powers[n];
230            n++;
231        } while (x >= l && n < b);
232
233        if (x == -1) {
234            throw new EOFException("End of stream reached whilst decoding");
235        }
236
237        if (isSigned()) {
238            final int u = (1 << s) - 1;
239            if ((z & u) == u) {
240                z = z >>> s ^ -1L;
241            } else {
242                z -= z >>> s;
243            }
244        }
245        // This algorithm does the same thing, but is probably slower. Leaving
246        // in for now for readability
247        // if (isSigned()) {
248        // long u = z;
249        // long twoPowS = (long) Math.pow(2, s);
250        // double twoPowSMinusOne = twoPowS-1;
251        // if (u % twoPowS < twoPowSMinusOne) {
252        // if (cardinality < Math.pow(2, 32)) {
253        // z = (long) (u - (Math.floor(u/ twoPowS)));
254        // } else {
255        // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
256        // }
257        // } else {
258        // z = (long) (-Math.floor(u/ twoPowS) - 1);
259        // }
260        // }
261        if (isDelta()) {
262            z += last;
263        }
264        return (int) z;
265    }
266
267    // private long cast32(long u) {
268    // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
269    // Math.pow(2, 31));
270    // return u;
271    // }
272
273    @Override
274    public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
275        final int[] band = super.decodeInts(n, in);
276        if (isDelta()) {
277            for (int i = 0; i < band.length; i++) {
278                while (band[i] > largest) {
279                    band[i] -= cardinality;
280                }
281                while (band[i] < smallest) {
282                    band[i] = ExactMath.add(band[i], cardinality);
283                }
284            }
285        }
286        return band;
287    }
288
289    @Override
290    public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
291        final int[] band = super.decodeInts(n, in, firstValue);
292        if (isDelta()) {
293            for (int i = 0; i < band.length; i++) {
294                while (band[i] > largest) {
295                    band[i] -= cardinality;
296                }
297                while (band[i] < smallest) {
298                    band[i] = ExactMath.add(band[i], cardinality);
299                }
300            }
301        }
302        return band;
303    }
304
305    @Override
306    public byte[] encode(final int value) throws Pack200Exception {
307        return encode(value, 0);
308    }
309
310    @Override
311    public byte[] encode(final int value, final int last) throws Pack200Exception {
312        if (!encodes(value)) {
313            throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
314        }
315
316        long z = value;
317        if (isDelta()) {
318            z -= last;
319        }
320        if (isSigned()) {
321            if (z < Integer.MIN_VALUE) {
322                z += 4294967296L;
323            } else if (z > Integer.MAX_VALUE) {
324                z -= 4294967296L;
325            }
326            if (z < 0) {
327                z = (-z << s) - 1;
328            } else if (s == 1) {
329                z = z << s;
330            } else {
331                z += (z - z % 3) / 3;
332            }
333        } else if (z < 0) {
334            // Need to use integer overflow here to represent negatives.
335            // 4294967296L is the 1 << 32.
336            z += Math.min(cardinality, 4294967296L);
337        }
338        if (z < 0) {
339            throw new Pack200Exception("unable to encode");
340        }
341
342        final List<Byte> byteList = new ArrayList<>();
343        for (int n = 0; n < b; n++) {
344            long byteN;
345            if (z < l) {
346                byteN = z;
347            } else {
348                byteN = z % h;
349                while (byteN < l) {
350                    byteN += h;
351                }
352            }
353            byteList.add(Byte.valueOf((byte) byteN));
354            if (byteN < l) {
355                break;
356            }
357            z -= byteN;
358            z /= h;
359        }
360        final byte[] bytes = new byte[byteList.size()];
361        for (int i = 0; i < bytes.length; i++) {
362            bytes[i] = byteList.get(i).byteValue();
363        }
364        return bytes;
365    }
366
367    /**
368     * True if this encoding can code the given value
369     *
370     * @param value the value to check
371     * @return {@code true} if the encoding can encode this value
372     */
373    public boolean encodes(final long value) {
374        return value >= smallest && value <= largest;
375    }
376
377    @Override
378    public boolean equals(final Object o) {
379        if (o instanceof BHSDCodec) {
380            final BHSDCodec codec = (BHSDCodec) o;
381            return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
382        }
383        return false;
384    }
385
386    /**
387     * Gets the B.
388     *
389     * @return the b
390     */
391    public int getB() {
392        return b;
393    }
394
395    /**
396     * Gets the H.
397     *
398     * @return the h
399     */
400    public int getH() {
401        return h;
402    }
403
404    /**
405     * Gets the L.
406     *
407     * @return the l
408     */
409    public int getL() {
410        return l;
411    }
412
413    /**
414     * Gets the S.
415     *
416     * @return the s
417     */
418    public int getS() {
419        return s;
420    }
421
422    @Override
423    public int hashCode() {
424        return ((b * 37 + h) * 37 + s) * 37 + d;
425    }
426
427    /**
428     * Returns true if this codec is a delta codec
429     *
430     * @return true if this codec is a delta codec
431     */
432    public boolean isDelta() {
433        return d != 0;
434    }
435
436    /**
437     * Returns true if this codec is a signed codec
438     *
439     * @return true if this codec is a signed codec
440     */
441    public boolean isSigned() {
442        return s != 0;
443    }
444
445    /**
446     * Returns the largest value that this codec can represent.
447     *
448     * @return the largest value that this codec can represent.
449     */
450    public long largest() {
451        return largest;
452    }
453
454    /**
455     * Returns the smallest value that this codec can represent.
456     *
457     * @return the smallest value that this codec can represent.
458     */
459    public long smallest() {
460        return smallest;
461    }
462
463    /**
464     * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
465     */
466    @Override
467    public String toString() {
468        final StringBuilder buffer = new StringBuilder(11);
469        buffer.append('(');
470        buffer.append(b);
471        buffer.append(',');
472        buffer.append(h);
473        if (s != 0 || d != 0) {
474            buffer.append(',');
475            buffer.append(s);
476        }
477        if (d != 0) {
478            buffer.append(',');
479            buffer.append(d);
480        }
481        buffer.append(')');
482        return buffer.toString();
483    }
484}