1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.commons.compress.harmony.pack200;
20
21 import java.io.EOFException;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.util.ArrayList;
25 import java.util.Arrays;
26 import java.util.List;
27
28 import org.apache.commons.compress.utils.ExactMath;
29
30 /**
31 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a
32 * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes
33 * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's
34 * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of
35 * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number'
36 * range, whilst still being encoded as a single byte.
37 * <p>
38 * A BHSD codec is configured with four parameters:
39 * </p>
40 * <dl>
41 * <dt>B</dt>
42 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as
43 * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd>
44 * <dt>H</dt>
45 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234
46 * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The
47 * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be < L; all prior values must be > L.</dd>
48 * <dt>S</dt>
49 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's
50 * complement)</dd>
51 * <dt>D</dt>
52 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are
53 * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one
54 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the
55 * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value
56 * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd>
57 * </dl>
58 * <p>
59 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()}
60 * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B
61 * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta
62 * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used.
63 * </p>
64 */
65 public final class BHSDCodec extends Codec {
66
67 /**
68 * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka
69 * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).
70 */
71 private final int b;
72
73 /**
74 * Whether delta encoding is used (0=false,1=true).
75 */
76 private final int d;
77
78 /**
79 * The radix of the encoding.
80 */
81 private final int h;
82
83 /**
84 * The co-parameter of h; 256-h.
85 */
86 private final int l;
87
88 /**
89 * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement).
90 */
91 private final int s;
92
93 private long cardinality;
94
95 private final long smallest;
96
97 private final long largest;
98
99 /**
100 * radix^i powers
101 */
102 private final long[] powers;
103
104 /**
105 * Constructs an unsigned, non-delta Codec with the given B and H values.
106 *
107 * @param b the maximum number of bytes that a value can be encoded as [1..5]
108 * @param h the radix of the encoding [1..256]
109 */
110 public BHSDCodec(final int b, final int h) {
111 this(b, h, 0, 0);
112 }
113
114 /**
115 * Constructs a non-delta Codec with the given B, H and S values.
116 *
117 * @param b the maximum number of bytes that a value can be encoded as [1..5]
118 * @param h the radix of the encoding [1..256]
119 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
120 */
121 public BHSDCodec(final int b, final int h, final int s) {
122 this(b, h, s, 0);
123 }
124
125 /**
126 * Constructs a Codec with the given B, H, S and D values.
127 *
128 * @param b the maximum number of bytes that a value can be encoded as [1..5]
129 * @param h the radix of the encoding [1..256]
130 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?)
131 * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta)
132 */
133 public BHSDCodec(final int b, final int h, final int s, final int d) {
134 if (b < 1 || b > 5) {
135 throw new IllegalArgumentException("1<=b<=5");
136 }
137 if (h < 1 || h > 256) {
138 throw new IllegalArgumentException("1<=h<=256");
139 }
140 if (s < 0 || s > 2) {
141 throw new IllegalArgumentException("0<=s<=2");
142 }
143 if (d < 0 || d > 1) {
144 throw new IllegalArgumentException("0<=d<=1");
145 }
146 if (b == 1 && h != 256) {
147 throw new IllegalArgumentException("b=1 -> h=256");
148 }
149 if (h == 256 && b == 5) {
150 throw new IllegalArgumentException("h=256 -> b!=5");
151 }
152 this.b = b;
153 this.h = h;
154 this.s = s;
155 this.d = d;
156 this.l = 256 - h;
157 if (h == 1) {
158 cardinality = b * 255 + 1;
159 } else {
160 cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b));
161 }
162 smallest = calculateSmallest();
163 largest = calculateLargest();
164
165 powers = new long[b];
166 Arrays.setAll(powers, c -> (long) Math.pow(h, c));
167 }
168
169 private long calculateLargest() {
170 final long result;
171 // TODO This can probably be optimized into a better mathematical statement
172 if (d == 1) {
173 return new BHSDCodec(b, h).largest();
174 }
175 switch (s) {
176 case 0:
177 result = cardinality() - 1;
178 break;
179 case 1:
180 result = cardinality() / 2 - 1;
181 break;
182 case 2:
183 result = 3L * cardinality() / 4 - 1;
184 break;
185 default:
186 throw new Error("Unknown s value");
187 }
188 return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result);
189 }
190
191 private long calculateSmallest() {
192 final long result;
193 if (d == 1 || !isSigned()) {
194 if (cardinality >= 4294967296L) { // 2^32
195 result = Integer.MIN_VALUE;
196 } else {
197 result = 0;
198 }
199 } else {
200 result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s));
201 }
202 return result;
203 }
204
205 /**
206 * Returns the cardinality of this codec; that is, the number of distinct values that it can contain.
207 *
208 * @return the cardinality of this codec
209 */
210 public long cardinality() {
211 return cardinality;
212 }
213
214 @Override
215 public int decode(final InputStream in) throws IOException, Pack200Exception {
216 if (d != 0) {
217 throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error");
218 }
219 return decode(in, 0);
220 }
221
222 @Override
223 public int decode(final InputStream in, final long last) throws IOException, Pack200Exception {
224 int n = 0;
225 long z = 0;
226 long x = 0;
227
228 do {
229 x = in.read();
230 lastBandLength++;
231 z += x * powers[n];
232 n++;
233 } while (x >= l && n < b);
234
235 if (x == -1) {
236 throw new EOFException("End of stream reached whilst decoding");
237 }
238
239 if (isSigned()) {
240 final int u = (1 << s) - 1;
241 if ((z & u) == u) {
242 z = z >>> s ^ -1L;
243 } else {
244 z -= z >>> s;
245 }
246 }
247 // This algorithm does the same thing, but is probably slower. Leaving
248 // in for now for readability
249 // if (isSigned()) {
250 // long u = z;
251 // long twoPowS = (long) Math.pow(2, s);
252 // double twoPowSMinusOne = twoPowS-1;
253 // if (u % twoPowS < twoPowSMinusOne) {
254 // if (cardinality < Math.pow(2, 32)) {
255 // z = (long) (u - (Math.floor(u/ twoPowS)));
256 // } else {
257 // z = cast32((long) (u - (Math.floor(u/ twoPowS))));
258 // }
259 // } else {
260 // z = (long) (-Math.floor(u/ twoPowS) - 1);
261 // }
262 // }
263 if (isDelta()) {
264 z += last;
265 }
266 return (int) z;
267 }
268
269 // private long cast32(long u) {
270 // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) -
271 // Math.pow(2, 31));
272 // return u;
273 // }
274
275 @Override
276 public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception {
277 final int[] band = super.decodeInts(n, in);
278 if (isDelta()) {
279 for (int i = 0; i < band.length; i++) {
280 while (band[i] > largest) {
281 band[i] -= cardinality;
282 }
283 while (band[i] < smallest) {
284 band[i] = ExactMath.add(band[i], cardinality);
285 }
286 }
287 }
288 return band;
289 }
290
291 @Override
292 public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception {
293 final int[] band = super.decodeInts(n, in, firstValue);
294 if (isDelta()) {
295 for (int i = 0; i < band.length; i++) {
296 while (band[i] > largest) {
297 band[i] -= cardinality;
298 }
299 while (band[i] < smallest) {
300 band[i] = ExactMath.add(band[i], cardinality);
301 }
302 }
303 }
304 return band;
305 }
306
307 @Override
308 public byte[] encode(final int value) throws Pack200Exception {
309 return encode(value, 0);
310 }
311
312 @Override
313 public byte[] encode(final int value, final int last) throws Pack200Exception {
314 if (!encodes(value)) {
315 throw new Pack200Exception("The codec " + this + " does not encode the value " + value);
316 }
317
318 long z = value;
319 if (isDelta()) {
320 z -= last;
321 }
322 if (isSigned()) {
323 if (z < Integer.MIN_VALUE) {
324 z += 4294967296L;
325 } else if (z > Integer.MAX_VALUE) {
326 z -= 4294967296L;
327 }
328 if (z < 0) {
329 z = (-z << s) - 1;
330 } else if (s == 1) {
331 z = z << s;
332 } else {
333 z += (z - z % 3) / 3;
334 }
335 } else if (z < 0) {
336 // Need to use integer overflow here to represent negatives.
337 // 4294967296L is the 1 << 32.
338 z += Math.min(cardinality, 4294967296L);
339 }
340 if (z < 0) {
341 throw new Pack200Exception("unable to encode");
342 }
343
344 final List<Byte> byteList = new ArrayList<>();
345 for (int n = 0; n < b; n++) {
346 long byteN;
347 if (z < l) {
348 byteN = z;
349 } else {
350 byteN = z % h;
351 while (byteN < l) {
352 byteN += h;
353 }
354 }
355 byteList.add(Byte.valueOf((byte) byteN));
356 if (byteN < l) {
357 break;
358 }
359 z -= byteN;
360 z /= h;
361 }
362 final byte[] bytes = new byte[byteList.size()];
363 for (int i = 0; i < bytes.length; i++) {
364 bytes[i] = byteList.get(i).byteValue();
365 }
366 return bytes;
367 }
368
369 /**
370 * True if this encoding can code the given value
371 *
372 * @param value the value to check
373 * @return {@code true} if the encoding can encode this value
374 */
375 public boolean encodes(final long value) {
376 return value >= smallest && value <= largest;
377 }
378
379 @Override
380 public boolean equals(final Object o) {
381 if (o instanceof BHSDCodec) {
382 final BHSDCodec codec = (BHSDCodec) o;
383 return codec.b == b && codec.h == h && codec.s == s && codec.d == d;
384 }
385 return false;
386 }
387
388 /**
389 * Gets the B.
390 *
391 * @return the b
392 */
393 public int getB() {
394 return b;
395 }
396
397 /**
398 * Gets the H.
399 *
400 * @return the h
401 */
402 public int getH() {
403 return h;
404 }
405
406 /**
407 * Gets the L.
408 *
409 * @return the l
410 */
411 public int getL() {
412 return l;
413 }
414
415 /**
416 * Gets the S.
417 *
418 * @return the s
419 */
420 public int getS() {
421 return s;
422 }
423
424 @Override
425 public int hashCode() {
426 return ((b * 37 + h) * 37 + s) * 37 + d;
427 }
428
429 /**
430 * Returns true if this codec is a delta codec
431 *
432 * @return true if this codec is a delta codec
433 */
434 public boolean isDelta() {
435 return d != 0;
436 }
437
438 /**
439 * Returns true if this codec is a signed codec
440 *
441 * @return true if this codec is a signed codec
442 */
443 public boolean isSigned() {
444 return s != 0;
445 }
446
447 /**
448 * Returns the largest value that this codec can represent.
449 *
450 * @return the largest value that this codec can represent.
451 */
452 public long largest() {
453 return largest;
454 }
455
456 /**
457 * Returns the smallest value that this codec can represent.
458 *
459 * @return the smallest value that this codec can represent.
460 */
461 public long smallest() {
462 return smallest;
463 }
464
465 /**
466 * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown.
467 */
468 @Override
469 public String toString() {
470 final StringBuilder buffer = new StringBuilder(11);
471 buffer.append('(');
472 buffer.append(b);
473 buffer.append(',');
474 buffer.append(h);
475 if (s != 0 || d != 0) {
476 buffer.append(',');
477 buffer.append(s);
478 }
479 if (d != 0) {
480 buffer.append(',');
481 buffer.append(d);
482 }
483 buffer.append(')');
484 return buffer.toString();
485 }
486 }