001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.harmony.pack200;
020
021import java.io.EOFException;
022import java.io.IOException;
023import java.io.InputStream;
024import java.util.ArrayList;
025import java.util.Arrays;
026import java.util.List;
027
028import org.apache.commons.compress.utils.ExactMath;
029
030/**
031 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
032 * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
033 * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
034 * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
035 * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
036 * range, whilst still being encoded as a single byte.
037 * <p>
038 * A BHSD codec is configured with four parameters:
039 * </p>
040 * <dl>
041 * <dt>B</dt>
042 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
043 * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
044 * <dt>H</dt>
045 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234
046 * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
047 * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be &lt; L; all prior values must be &gt; L.</dd>
048 * <dt>S</dt>
049 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
050 * complement)</dd>
051 * <dt>D</dt>
052 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
053 * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
054 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
055 * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
056 * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd>
057 * </dl>
058 * <p>
059 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
060 * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
061 * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
062 * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
063 * </p>
064 */
065public final class BHSDCodec extends Codec {
066
067    /**
068     * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
069     * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
070     */
071    private final int b;
072
073    /**
074     * Whether delta encoding is used (0=false,1=true).
075     */
076    private final int d;
077
078    /**
079     * The radix of the encoding.
080     */
081    private final int h;
082
083    /**
084     * The co-parameter of h; 256-h.
085     */
086    private final int l;
087
088    /**
089     * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
090     */
091    private final int s;
092
093    private long cardinality;
094
095    private final long smallest;
096
097    private final long largest;
098
099    /**
100     * radix^i powers
101     */
102    private final long[] powers;
103
104    /**
105     * Constructs an unsigned, non-delta Codec with the given B and H values.
106     *
107     * @param b the maximum number of bytes that a value can be encoded as [1..5]
108     * @param h the radix of the encoding [1..256]
109     */
110    public BHSDCodec(final int b, final int h) {
111        this(b, h, 0, 0);
112    }
113
114    /**
115     * Constructs a non-delta Codec with the given B, H and S values.
116     *
117     * @param b the maximum number of bytes that a value can be encoded as [1..5]
118     * @param h the radix of the encoding [1..256]
119     * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
120     */
121    public BHSDCodec(final int b, final int h, final int s) {
122        this(b, h, s, 0);
123    }
124
125    /**
126     * Constructs a Codec with the given B, H, S and D values.
127     *
128     * @param b the maximum number of bytes that a value can be encoded as [1..5]
129     * @param h the radix of the encoding [1..256]
130     * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
131     * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
132     */
133    public BHSDCodec(final int b, final int h, final int s, final int d) {
134        if (b < 1 || b > 5) {
135            throw new IllegalArgumentException("1<=b<=5");
136        }
137        if (h < 1 || h > 256) {
138            throw new IllegalArgumentException("1<=h<=256");
139        }
140        if (s < 0 || s > 2) {
141            throw new IllegalArgumentException("0<=s<=2");
142        }
143        if (d < 0 || d > 1) {
144            throw new IllegalArgumentException("0<=d<=1");
145        }
146        if (b == 1 && h != 256) {
147            throw new IllegalArgumentException("b=1 -> h=256");
148        }
149        if (h == 256 && b == 5) {
150            throw new IllegalArgumentException("h=256 -> b!=5");
151        }
152        this.b = b;
153        this.h = h;
154        this.s = s;
155        this.d = d;
156        this.l = 256 - h;
157        if (h == 1) {
158            cardinality = b * 255 + 1;
159        } else {
160            cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
161        }
162        smallest = calculateSmallest();
163        largest = calculateLargest();
164
165        powers = new long[b];
166        Arrays.setAll(powers, c -> (long) Math.pow(h, c));
167    }
168
169    private long calculateLargest() {
170        final long result;
171        // TODO This can probably be optimized into a better mathematical statement
172        if (d == 1) {
173            return new BHSDCodec(b, h).largest();
174        }
175        switch (s) {
176        case 0:
177            result = cardinality() - 1;
178            break;
179        case 1:
180            result = cardinality() / 2 - 1;
181            break;
182        case 2:
183            result = 3L * cardinality() / 4 - 1;
184            break;
185        default:
186            throw new Error("Unknown s value");
187        }
188        return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
189    }
190
191    private long calculateSmallest() {
192        final long result;
193        if (d == 1 || !isSigned()) {
194            if (cardinality >= 4294967296L) { // 2^32
195                result = Integer.MIN_VALUE;
196            } else {
197                result = 0;
198            }
199        } else {
200            result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
201        }
202        return result;
203    }
204
205    /**
206     * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
207     *
208     * @return the cardinality of this codec
209     */
210    public long cardinality() {
211        return cardinality;
212    }
213
214    @Override
215    public int decode(final InputStream in) throws IOException, Pack200Exception {
216        if (d != 0) {
217            throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
218        }
219        return decode(in, 0);
220    }
221
222    @Override
223    public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
224        int n = 0;
225        long z = 0;
226        long x = 0;
227
228        do {
229            x = in.read();
230            lastBandLength++;
231            z += x * powers[n];
232            n++;
233        } while (x >= l && n < b);
234
235        if (x == -1) {
236            throw new EOFException("End of stream reached whilst decoding");
237        }
238
239        if (isSigned()) {
240            final int u = (1 << s) - 1;
241            if ((z & u) == u) {
242                z = z >>> s ^ -1L;
243            } else {
244                z -= z >>> s;
245            }
246        }
247        // This algorithm does the same thing, but is probably slower. Leaving
248        // in for now for readability
249        // if (isSigned()) {
250        // long u = z;
251        // long twoPowS = (long) Math.pow(2, s);
252        // double twoPowSMinusOne = twoPowS-1;
253        // if (u % twoPowS < twoPowSMinusOne) {
254        // if (cardinality < Math.pow(2, 32)) {
255        // z = (long) (u - (Math.floor(u/ twoPowS)));
256        // } else {
257        // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
258        // }
259        // } else {
260        // z = (long) (-Math.floor(u/ twoPowS) - 1);
261        // }
262        // }
263        if (isDelta()) {
264            z += last;
265        }
266        return (int) z;
267    }
268
269    // private long cast32(long u) {
270    // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
271    // Math.pow(2, 31));
272    // return u;
273    // }
274
275    @Override
276    public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
277        final int[] band = super.decodeInts(n, in);
278        if (isDelta()) {
279            for (int i = 0; i < band.length; i++) {
280                while (band[i] > largest) {
281                    band[i] -= cardinality;
282                }
283                while (band[i] < smallest) {
284                    band[i] = ExactMath.add(band[i], cardinality);
285                }
286            }
287        }
288        return band;
289    }
290
291    @Override
292    public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
293        final int[] band = super.decodeInts(n, in, firstValue);
294        if (isDelta()) {
295            for (int i = 0; i < band.length; i++) {
296                while (band[i] > largest) {
297                    band[i] -= cardinality;
298                }
299                while (band[i] < smallest) {
300                    band[i] = ExactMath.add(band[i], cardinality);
301                }
302            }
303        }
304        return band;
305    }
306
307    @Override
308    public byte[] encode(final int value) throws Pack200Exception {
309        return encode(value, 0);
310    }
311
312    @Override
313    public byte[] encode(final int value, final int last) throws Pack200Exception {
314        if (!encodes(value)) {
315            throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
316        }
317
318        long z = value;
319        if (isDelta()) {
320            z -= last;
321        }
322        if (isSigned()) {
323            if (z < Integer.MIN_VALUE) {
324                z += 4294967296L;
325            } else if (z > Integer.MAX_VALUE) {
326                z -= 4294967296L;
327            }
328            if (z < 0) {
329                z = (-z << s) - 1;
330            } else if (s == 1) {
331                z = z << s;
332            } else {
333                z += (z - z % 3) / 3;
334            }
335        } else if (z < 0) {
336            // Need to use integer overflow here to represent negatives.
337            // 4294967296L is the 1 << 32.
338            z += Math.min(cardinality, 4294967296L);
339        }
340        if (z < 0) {
341            throw new Pack200Exception("unable to encode");
342        }
343
344        final List<Byte> byteList = new ArrayList<>();
345        for (int n = 0; n < b; n++) {
346            long byteN;
347            if (z < l) {
348                byteN = z;
349            } else {
350                byteN = z % h;
351                while (byteN < l) {
352                    byteN += h;
353                }
354            }
355            byteList.add(Byte.valueOf((byte) byteN));
356            if (byteN < l) {
357                break;
358            }
359            z -= byteN;
360            z /= h;
361        }
362        final byte[] bytes = new byte[byteList.size()];
363        for (int i = 0; i < bytes.length; i++) {
364            bytes[i] = byteList.get(i).byteValue();
365        }
366        return bytes;
367    }
368
369    /**
370     * True if this encoding can code the given value
371     *
372     * @param value the value to check
373     * @return {@code true} if the encoding can encode this value
374     */
375    public boolean encodes(final long value) {
376        return value >= smallest && value <= largest;
377    }
378
379    @Override
380    public boolean equals(final Object o) {
381        if (o instanceof BHSDCodec) {
382            final BHSDCodec codec = (BHSDCodec) o;
383            return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
384        }
385        return false;
386    }
387
388    /**
389     * Gets the B.
390     *
391     * @return the b
392     */
393    public int getB() {
394        return b;
395    }
396
397    /**
398     * Gets the H.
399     *
400     * @return the h
401     */
402    public int getH() {
403        return h;
404    }
405
406    /**
407     * Gets the L.
408     *
409     * @return the l
410     */
411    public int getL() {
412        return l;
413    }
414
415    /**
416     * Gets the S.
417     *
418     * @return the s
419     */
420    public int getS() {
421        return s;
422    }
423
424    @Override
425    public int hashCode() {
426        return ((b * 37 + h) * 37 + s) * 37 + d;
427    }
428
429    /**
430     * Returns true if this codec is a delta codec
431     *
432     * @return true if this codec is a delta codec
433     */
434    public boolean isDelta() {
435        return d != 0;
436    }
437
438    /**
439     * Returns true if this codec is a signed codec
440     *
441     * @return true if this codec is a signed codec
442     */
443    public boolean isSigned() {
444        return s != 0;
445    }
446
447    /**
448     * Returns the largest value that this codec can represent.
449     *
450     * @return the largest value that this codec can represent.
451     */
452    public long largest() {
453        return largest;
454    }
455
456    /**
457     * Returns the smallest value that this codec can represent.
458     *
459     * @return the smallest value that this codec can represent.
460     */
461    public long smallest() {
462        return smallest;
463    }
464
465    /**
466     * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
467     */
468    @Override
469    public String toString() {
470        final StringBuilder buffer = new StringBuilder(11);
471        buffer.append('(');
472        buffer.append(b);
473        buffer.append(',');
474        buffer.append(h);
475        if (s != 0 || d != 0) {
476            buffer.append(',');
477            buffer.append(s);
478        }
479        if (d != 0) {
480            buffer.append(',');
481            buffer.append(d);
482        }
483        buffer.append(')');
484        return buffer.toString();
485    }
486}