BHSDCodec.java

  1. /*
  2.  *  Licensed to the Apache Software Foundation (ASF) under one or more
  3.  *  contributor license agreements.  See the NOTICE file distributed with
  4.  *  this work for additional information regarding copyright ownership.
  5.  *  The ASF licenses this file to You under the Apache License, Version 2.0
  6.  *  (the "License"); you may not use this file except in compliance with
  7.  *  the License.  You may obtain a copy of the License at
  8.  *
  9.  *     http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  *  Unless required by applicable law or agreed to in writing, software
  12.  *  distributed under the License is distributed on an "AS IS" BASIS,
  13.  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  *  See the License for the specific language governing permissions and
  15.  *  limitations under the License.
  16.  */
  17. package org.apache.commons.compress.harmony.pack200;

  18. import java.io.EOFException;
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.util.ArrayList;
  22. import java.util.Arrays;
  23. import java.util.List;

  24. import org.apache.commons.compress.utils.ExactMath;

  25. /**
  26.  * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
  27.  * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
  28.  * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
  29.  * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
  30.  * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
  31.  * range, whilst still being encoded as a single byte.
  32.  * <p>
  33.  * A BHSD codec is configured with four parameters:
  34.  * </p>
  35.  * <dl>
  36.  * <dt>B</dt>
  37.  * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
  38.  * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
  39.  * <dt>H</dt>
  40.  * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234
  41.  * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
  42.  * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be &lt; L; all prior values must be &gt; L.</dd>
  43.  * <dt>S</dt>
  44.  * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
  45.  * complement)</dd>
  46.  * <dt>D</dt>
  47.  * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
  48.  * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
  49.  * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
  50.  * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
  51.  * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd>
  52.  * </dl>
  53.  * <p>
  54.  * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
  55.  * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
  56.  * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
  57.  * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
  58.  * </p>
  59.  */
  60. public final class BHSDCodec extends Codec {

  61.     /**
  62.      * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
  63.      * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
  64.      */
  65.     private final int b;

  66.     /**
  67.      * Whether delta encoding is used (0=false,1=true).
  68.      */
  69.     private final int d;

  70.     /**
  71.      * The radix of the encoding.
  72.      */
  73.     private final int h;

  74.     /**
  75.      * The co-parameter of h; 256-h.
  76.      */
  77.     private final int l;

  78.     /**
  79.      * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
  80.      */
  81.     private final int s;

  82.     private long cardinality;

  83.     private final long smallest;

  84.     private final long largest;

  85.     /**
  86.      * radix^i powers
  87.      */
  88.     private final long[] powers;

  89.     /**
  90.      * Constructs an unsigned, non-delta Codec with the given B and H values.
  91.      *
  92.      * @param b the maximum number of bytes that a value can be encoded as [1..5]
  93.      * @param h the radix of the encoding [1..256]
  94.      */
  95.     public BHSDCodec(final int b, final int h) {
  96.         this(b, h, 0, 0);
  97.     }

  98.     /**
  99.      * Constructs a non-delta Codec with the given B, H and S values.
  100.      *
  101.      * @param b the maximum number of bytes that a value can be encoded as [1..5]
  102.      * @param h the radix of the encoding [1..256]
  103.      * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
  104.      */
  105.     public BHSDCodec(final int b, final int h, final int s) {
  106.         this(b, h, s, 0);
  107.     }

  108.     /**
  109.      * Constructs a Codec with the given B, H, S and D values.
  110.      *
  111.      * @param b the maximum number of bytes that a value can be encoded as [1..5]
  112.      * @param h the radix of the encoding [1..256]
  113.      * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
  114.      * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
  115.      */
  116.     public BHSDCodec(final int b, final int h, final int s, final int d) {
  117.         if (b < 1 || b > 5) {
  118.             throw new IllegalArgumentException("1<=b<=5");
  119.         }
  120.         if (h < 1 || h > 256) {
  121.             throw new IllegalArgumentException("1<=h<=256");
  122.         }
  123.         if (s < 0 || s > 2) {
  124.             throw new IllegalArgumentException("0<=s<=2");
  125.         }
  126.         if (d < 0 || d > 1) {
  127.             throw new IllegalArgumentException("0<=d<=1");
  128.         }
  129.         if (b == 1 && h != 256) {
  130.             throw new IllegalArgumentException("b=1 -> h=256");
  131.         }
  132.         if (h == 256 && b == 5) {
  133.             throw new IllegalArgumentException("h=256 -> b!=5");
  134.         }
  135.         this.b = b;
  136.         this.h = h;
  137.         this.s = s;
  138.         this.d = d;
  139.         this.l = 256 - h;
  140.         if (h == 1) {
  141.             cardinality = b * 255 + 1;
  142.         } else {
  143.             cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
  144.         }
  145.         smallest = calculateSmallest();
  146.         largest = calculateLargest();

  147.         powers = new long[b];
  148.         Arrays.setAll(powers, c -> (long) Math.pow(h, c));
  149.     }

  150.     private long calculateLargest() {
  151.         long result;
  152.         // TODO This can probably be optimized into a better mathematical statement
  153.         if (d == 1) {
  154.             return new BHSDCodec(b, h).largest();
  155.         }
  156.         switch (s) {
  157.         case 0:
  158.             result = cardinality() - 1;
  159.             break;
  160.         case 1:
  161.             result = cardinality() / 2 - 1;
  162.             break;
  163.         case 2:
  164.             result = 3L * cardinality() / 4 - 1;
  165.             break;
  166.         default:
  167.             throw new Error("Unknown s value");
  168.         }
  169.         return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
  170.     }

  171.     private long calculateSmallest() {
  172.         long result;
  173.         if (d == 1 || !isSigned()) {
  174.             if (cardinality >= 4294967296L) { // 2^32
  175.                 result = Integer.MIN_VALUE;
  176.             } else {
  177.                 result = 0;
  178.             }
  179.         } else {
  180.             result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
  181.         }
  182.         return result;
  183.     }

  184.     /**
  185.      * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
  186.      *
  187.      * @return the cardinality of this codec
  188.      */
  189.     public long cardinality() {
  190.         return cardinality;
  191.     }

  192.     @Override
  193.     public int decode(final InputStream in) throws IOException, Pack200Exception {
  194.         if (d != 0) {
  195.             throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
  196.         }
  197.         return decode(in, 0);
  198.     }

  199.     @Override
  200.     public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
  201.         int n = 0;
  202.         long z = 0;
  203.         long x = 0;

  204.         do {
  205.             x = in.read();
  206.             lastBandLength++;
  207.             z += x * powers[n];
  208.             n++;
  209.         } while (x >= l && n < b);

  210.         if (x == -1) {
  211.             throw new EOFException("End of stream reached whilst decoding");
  212.         }

  213.         if (isSigned()) {
  214.             final int u = (1 << s) - 1;
  215.             if ((z & u) == u) {
  216.                 z = z >>> s ^ -1L;
  217.             } else {
  218.                 z -= z >>> s;
  219.             }
  220.         }
  221.         // This algorithm does the same thing, but is probably slower. Leaving
  222.         // in for now for readability
  223.         // if (isSigned()) {
  224.         // long u = z;
  225.         // long twoPowS = (long) Math.pow(2, s);
  226.         // double twoPowSMinusOne = twoPowS-1;
  227.         // if (u % twoPowS < twoPowSMinusOne) {
  228.         // if (cardinality < Math.pow(2, 32)) {
  229.         // z = (long) (u - (Math.floor(u/ twoPowS)));
  230.         // } else {
  231.         // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
  232.         // }
  233.         // } else {
  234.         // z = (long) (-Math.floor(u/ twoPowS) - 1);
  235.         // }
  236.         // }
  237.         if (isDelta()) {
  238.             z += last;
  239.         }
  240.         return (int) z;
  241.     }

  242.     // private long cast32(long u) {
  243.     // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
  244.     // Math.pow(2, 31));
  245.     // return u;
  246.     // }

  247.     @Override
  248.     public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
  249.         final int[] band = super.decodeInts(n, in);
  250.         if (isDelta()) {
  251.             for (int i = 0; i < band.length; i++) {
  252.                 while (band[i] > largest) {
  253.                     band[i] -= cardinality;
  254.                 }
  255.                 while (band[i] < smallest) {
  256.                     band[i] = ExactMath.add(band[i], cardinality);
  257.                 }
  258.             }
  259.         }
  260.         return band;
  261.     }

  262.     @Override
  263.     public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
  264.         final int[] band = super.decodeInts(n, in, firstValue);
  265.         if (isDelta()) {
  266.             for (int i = 0; i < band.length; i++) {
  267.                 while (band[i] > largest) {
  268.                     band[i] -= cardinality;
  269.                 }
  270.                 while (band[i] < smallest) {
  271.                     band[i] = ExactMath.add(band[i], cardinality);
  272.                 }
  273.             }
  274.         }
  275.         return band;
  276.     }

  277.     @Override
  278.     public byte[] encode(final int value) throws Pack200Exception {
  279.         return encode(value, 0);
  280.     }

  281.     @Override
  282.     public byte[] encode(final int value, final int last) throws Pack200Exception {
  283.         if (!encodes(value)) {
  284.             throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
  285.         }

  286.         long z = value;
  287.         if (isDelta()) {
  288.             z -= last;
  289.         }
  290.         if (isSigned()) {
  291.             if (z < Integer.MIN_VALUE) {
  292.                 z += 4294967296L;
  293.             } else if (z > Integer.MAX_VALUE) {
  294.                 z -= 4294967296L;
  295.             }
  296.             if (z < 0) {
  297.                 z = (-z << s) - 1;
  298.             } else if (s == 1) {
  299.                 z = z << s;
  300.             } else {
  301.                 z += (z - z % 3) / 3;
  302.             }
  303.         } else if (z < 0) {
  304.             // Need to use integer overflow here to represent negatives.
  305.             // 4294967296L is the 1 << 32.
  306.             z += Math.min(cardinality, 4294967296L);
  307.         }
  308.         if (z < 0) {
  309.             throw new Pack200Exception("unable to encode");
  310.         }

  311.         final List<Byte> byteList = new ArrayList<>();
  312.         for (int n = 0; n < b; n++) {
  313.             long byteN;
  314.             if (z < l) {
  315.                 byteN = z;
  316.             } else {
  317.                 byteN = z % h;
  318.                 while (byteN < l) {
  319.                     byteN += h;
  320.                 }
  321.             }
  322.             byteList.add(Byte.valueOf((byte) byteN));
  323.             if (byteN < l) {
  324.                 break;
  325.             }
  326.             z -= byteN;
  327.             z /= h;
  328.         }
  329.         final byte[] bytes = new byte[byteList.size()];
  330.         for (int i = 0; i < bytes.length; i++) {
  331.             bytes[i] = byteList.get(i).byteValue();
  332.         }
  333.         return bytes;
  334.     }

  335.     /**
  336.      * True if this encoding can code the given value
  337.      *
  338.      * @param value the value to check
  339.      * @return {@code true} if the encoding can encode this value
  340.      */
  341.     public boolean encodes(final long value) {
  342.         return value >= smallest && value <= largest;
  343.     }

  344.     @Override
  345.     public boolean equals(final Object o) {
  346.         if (o instanceof BHSDCodec) {
  347.             final BHSDCodec codec = (BHSDCodec) o;
  348.             return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
  349.         }
  350.         return false;
  351.     }

  352.     /**
  353.      * Gets the B.
  354.      *
  355.      * @return the b
  356.      */
  357.     public int getB() {
  358.         return b;
  359.     }

  360.     /**
  361.      * Gets the H.
  362.      *
  363.      * @return the h
  364.      */
  365.     public int getH() {
  366.         return h;
  367.     }

  368.     /**
  369.      * Gets the L.
  370.      *
  371.      * @return the l
  372.      */
  373.     public int getL() {
  374.         return l;
  375.     }

  376.     /**
  377.      * Gets the S.
  378.      *
  379.      * @return the s
  380.      */
  381.     public int getS() {
  382.         return s;
  383.     }

  384.     @Override
  385.     public int hashCode() {
  386.         return ((b * 37 + h) * 37 + s) * 37 + d;
  387.     }

  388.     /**
  389.      * Returns true if this codec is a delta codec
  390.      *
  391.      * @return true if this codec is a delta codec
  392.      */
  393.     public boolean isDelta() {
  394.         return d != 0;
  395.     }

  396.     /**
  397.      * Returns true if this codec is a signed codec
  398.      *
  399.      * @return true if this codec is a signed codec
  400.      */
  401.     public boolean isSigned() {
  402.         return s != 0;
  403.     }

  404.     /**
  405.      * Returns the largest value that this codec can represent.
  406.      *
  407.      * @return the largest value that this codec can represent.
  408.      */
  409.     public long largest() {
  410.         return largest;
  411.     }

  412.     /**
  413.      * Returns the smallest value that this codec can represent.
  414.      *
  415.      * @return the smallest value that this codec can represent.
  416.      */
  417.     public long smallest() {
  418.         return smallest;
  419.     }

  420.     /**
  421.      * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
  422.      */
  423.     @Override
  424.     public String toString() {
  425.         final StringBuilder buffer = new StringBuilder(11);
  426.         buffer.append('(');
  427.         buffer.append(b);
  428.         buffer.append(',');
  429.         buffer.append(h);
  430.         if (s != 0 || d != 0) {
  431.             buffer.append(',');
  432.             buffer.append(s);
  433.         }
  434.         if (d != 0) {
  435.             buffer.append(',');
  436.             buffer.append(d);
  437.         }
  438.         buffer.append(')');
  439.         return buffer.toString();
  440.     }
  441. }