001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.harmony.pack200; 020 021import java.io.EOFException; 022import java.io.IOException; 023import java.io.InputStream; 024import java.util.ArrayList; 025import java.util.Arrays; 026import java.util.List; 027 028import org.apache.commons.compress.utils.ExactMath; 029 030/** 031 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" encoding mechanism. It uses a 032 * variable-length encoding and a modified sign representation such that small numbers are represented as a single byte, whilst larger numbers take more bytes 033 * to encode. The number may be signed or unsigned; if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's 034 * complement. The Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. So a delta encoding of 035 * the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute value of a coded integer to fall outside of the 'small number' 036 * range, whilst still being encoded as a single byte. 037 * <p> 038 * A BHSD codec is configured with four parameters: 039 * </p> 040 * <dl> 041 * <dt>B</dt> 042 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as 043 * itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd> 044 * <dt>H</dt> 045 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by {@code H^<sup>n</sup>}. So the number 1234 046 * may be represented as the sequence 4 3 2 1 with a radix (H) of 10. Note that other permutations are also possible; 43 2 1 will also encode 1234. The 047 * co-parameter L is defined as 256-H. This is important because only the last value in a sequence may be < L; all prior values must be > L.</dd> 048 * <dt>S</dt> 049 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's 050 * complement)</dd> 051 * <dt>D</dt> 052 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding of 1 indicates that values are 053 * cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one 054 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. If the codec is a non-delta encoding, then the 055 * value is ignored if passed. If the codec is a delta encoding, it is a run-time error to call the value without the extra parameter, and the previous value 056 * should be returned. (It was designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for each use.)</dd> 057 * </dl> 058 * <p> 059 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted (1,256,0,0) or (1,256). The {@link #toString()} 060 * method prints out the condensed form of the encoding. Often, the last character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B 061 * value. Those that start with U ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the word Delta 062 * ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used. 063 * </p> 064 */ 065public final class BHSDCodec extends Codec { 066 067 /** 068 * The maximum number of bytes in each coding word. B must be a value between [1..5]. For a pass-through coding (where each byte is encoded as itself, aka 069 * {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte). 070 */ 071 private final int b; 072 073 /** 074 * Whether delta encoding is used (0=false,1=true). 075 */ 076 private final int d; 077 078 /** 079 * The radix of the encoding. 080 */ 081 private final int h; 082 083 /** 084 * The co-parameter of h; 256-h. 085 */ 086 private final int l; 087 088 /** 089 * Represents signed numbers or not, 0 (unsigned), 1 (signed, one's complement) or 2 (signed, two's complement). 090 */ 091 private final int s; 092 093 private long cardinality; 094 095 private final long smallest; 096 097 private final long largest; 098 099 /** 100 * radix^i powers 101 */ 102 private final long[] powers; 103 104 /** 105 * Constructs an unsigned, non-delta Codec with the given B and H values. 106 * 107 * @param b the maximum number of bytes that a value can be encoded as [1..5] 108 * @param h the radix of the encoding [1..256] 109 */ 110 public BHSDCodec(final int b, final int h) { 111 this(b, h, 0, 0); 112 } 113 114 /** 115 * Constructs a non-delta Codec with the given B, H and S values. 116 * 117 * @param b the maximum number of bytes that a value can be encoded as [1..5] 118 * @param h the radix of the encoding [1..256] 119 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?) 120 */ 121 public BHSDCodec(final int b, final int h, final int s) { 122 this(b, h, s, 0); 123 } 124 125 /** 126 * Constructs a Codec with the given B, H, S and D values. 127 * 128 * @param b the maximum number of bytes that a value can be encoded as [1..5] 129 * @param h the radix of the encoding [1..256] 130 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 is signed with ?) 131 * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta) 132 */ 133 public BHSDCodec(final int b, final int h, final int s, final int d) { 134 if (b < 1 || b > 5) { 135 throw new IllegalArgumentException("1<=b<=5"); 136 } 137 if (h < 1 || h > 256) { 138 throw new IllegalArgumentException("1<=h<=256"); 139 } 140 if (s < 0 || s > 2) { 141 throw new IllegalArgumentException("0<=s<=2"); 142 } 143 if (d < 0 || d > 1) { 144 throw new IllegalArgumentException("0<=d<=1"); 145 } 146 if (b == 1 && h != 256) { 147 throw new IllegalArgumentException("b=1 -> h=256"); 148 } 149 if (h == 256 && b == 5) { 150 throw new IllegalArgumentException("h=256 -> b!=5"); 151 } 152 this.b = b; 153 this.h = h; 154 this.s = s; 155 this.d = d; 156 this.l = 256 - h; 157 if (h == 1) { 158 cardinality = b * 255 + 1; 159 } else { 160 cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b)); 161 } 162 smallest = calculateSmallest(); 163 largest = calculateLargest(); 164 165 powers = new long[b]; 166 Arrays.setAll(powers, c -> (long) Math.pow(h, c)); 167 } 168 169 private long calculateLargest() { 170 final long result; 171 // TODO This can probably be optimized into a better mathematical statement 172 if (d == 1) { 173 return new BHSDCodec(b, h).largest(); 174 } 175 switch (s) { 176 case 0: 177 result = cardinality() - 1; 178 break; 179 case 1: 180 result = cardinality() / 2 - 1; 181 break; 182 case 2: 183 result = 3L * cardinality() / 4 - 1; 184 break; 185 default: 186 throw new Error("Unknown s value"); 187 } 188 return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result); 189 } 190 191 private long calculateSmallest() { 192 final long result; 193 if (d == 1 || !isSigned()) { 194 if (cardinality >= 4294967296L) { // 2^32 195 result = Integer.MIN_VALUE; 196 } else { 197 result = 0; 198 } 199 } else { 200 result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s)); 201 } 202 return result; 203 } 204 205 /** 206 * Returns the cardinality of this codec; that is, the number of distinct values that it can contain. 207 * 208 * @return the cardinality of this codec 209 */ 210 public long cardinality() { 211 return cardinality; 212 } 213 214 @Override 215 public int decode(final InputStream in) throws IOException, Pack200Exception { 216 if (d != 0) { 217 throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error"); 218 } 219 return decode(in, 0); 220 } 221 222 @Override 223 public int decode(final InputStream in, final long last) throws IOException, Pack200Exception { 224 int n = 0; 225 long z = 0; 226 long x = 0; 227 228 do { 229 x = in.read(); 230 lastBandLength++; 231 z += x * powers[n]; 232 n++; 233 } while (x >= l && n < b); 234 235 if (x == -1) { 236 throw new EOFException("End of stream reached whilst decoding"); 237 } 238 239 if (isSigned()) { 240 final int u = (1 << s) - 1; 241 if ((z & u) == u) { 242 z = z >>> s ^ -1L; 243 } else { 244 z -= z >>> s; 245 } 246 } 247 // This algorithm does the same thing, but is probably slower. Leaving 248 // in for now for readability 249 // if (isSigned()) { 250 // long u = z; 251 // long twoPowS = (long) Math.pow(2, s); 252 // double twoPowSMinusOne = twoPowS-1; 253 // if (u % twoPowS < twoPowSMinusOne) { 254 // if (cardinality < Math.pow(2, 32)) { 255 // z = (long) (u - (Math.floor(u/ twoPowS))); 256 // } else { 257 // z = cast32((long) (u - (Math.floor(u/ twoPowS)))); 258 // } 259 // } else { 260 // z = (long) (-Math.floor(u/ twoPowS) - 1); 261 // } 262 // } 263 if (isDelta()) { 264 z += last; 265 } 266 return (int) z; 267 } 268 269 // private long cast32(long u) { 270 // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) - 271 // Math.pow(2, 31)); 272 // return u; 273 // } 274 275 @Override 276 public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception { 277 final int[] band = super.decodeInts(n, in); 278 if (isDelta()) { 279 for (int i = 0; i < band.length; i++) { 280 while (band[i] > largest) { 281 band[i] -= cardinality; 282 } 283 while (band[i] < smallest) { 284 band[i] = ExactMath.add(band[i], cardinality); 285 } 286 } 287 } 288 return band; 289 } 290 291 @Override 292 public int[] decodeInts(final int n, final InputStream in, final int firstValue) throws IOException, Pack200Exception { 293 final int[] band = super.decodeInts(n, in, firstValue); 294 if (isDelta()) { 295 for (int i = 0; i < band.length; i++) { 296 while (band[i] > largest) { 297 band[i] -= cardinality; 298 } 299 while (band[i] < smallest) { 300 band[i] = ExactMath.add(band[i], cardinality); 301 } 302 } 303 } 304 return band; 305 } 306 307 @Override 308 public byte[] encode(final int value) throws Pack200Exception { 309 return encode(value, 0); 310 } 311 312 @Override 313 public byte[] encode(final int value, final int last) throws Pack200Exception { 314 if (!encodes(value)) { 315 throw new Pack200Exception("The codec " + this + " does not encode the value " + value); 316 } 317 318 long z = value; 319 if (isDelta()) { 320 z -= last; 321 } 322 if (isSigned()) { 323 if (z < Integer.MIN_VALUE) { 324 z += 4294967296L; 325 } else if (z > Integer.MAX_VALUE) { 326 z -= 4294967296L; 327 } 328 if (z < 0) { 329 z = (-z << s) - 1; 330 } else if (s == 1) { 331 z = z << s; 332 } else { 333 z += (z - z % 3) / 3; 334 } 335 } else if (z < 0) { 336 // Need to use integer overflow here to represent negatives. 337 // 4294967296L is the 1 << 32. 338 z += Math.min(cardinality, 4294967296L); 339 } 340 if (z < 0) { 341 throw new Pack200Exception("unable to encode"); 342 } 343 344 final List<Byte> byteList = new ArrayList<>(); 345 for (int n = 0; n < b; n++) { 346 long byteN; 347 if (z < l) { 348 byteN = z; 349 } else { 350 byteN = z % h; 351 while (byteN < l) { 352 byteN += h; 353 } 354 } 355 byteList.add(Byte.valueOf((byte) byteN)); 356 if (byteN < l) { 357 break; 358 } 359 z -= byteN; 360 z /= h; 361 } 362 final byte[] bytes = new byte[byteList.size()]; 363 for (int i = 0; i < bytes.length; i++) { 364 bytes[i] = byteList.get(i).byteValue(); 365 } 366 return bytes; 367 } 368 369 /** 370 * True if this encoding can code the given value 371 * 372 * @param value the value to check 373 * @return {@code true} if the encoding can encode this value 374 */ 375 public boolean encodes(final long value) { 376 return value >= smallest && value <= largest; 377 } 378 379 @Override 380 public boolean equals(final Object o) { 381 if (o instanceof BHSDCodec) { 382 final BHSDCodec codec = (BHSDCodec) o; 383 return codec.b == b && codec.h == h && codec.s == s && codec.d == d; 384 } 385 return false; 386 } 387 388 /** 389 * Gets the B. 390 * 391 * @return the b 392 */ 393 public int getB() { 394 return b; 395 } 396 397 /** 398 * Gets the H. 399 * 400 * @return the h 401 */ 402 public int getH() { 403 return h; 404 } 405 406 /** 407 * Gets the L. 408 * 409 * @return the l 410 */ 411 public int getL() { 412 return l; 413 } 414 415 /** 416 * Gets the S. 417 * 418 * @return the s 419 */ 420 public int getS() { 421 return s; 422 } 423 424 @Override 425 public int hashCode() { 426 return ((b * 37 + h) * 37 + s) * 37 + d; 427 } 428 429 /** 430 * Returns true if this codec is a delta codec 431 * 432 * @return true if this codec is a delta codec 433 */ 434 public boolean isDelta() { 435 return d != 0; 436 } 437 438 /** 439 * Returns true if this codec is a signed codec 440 * 441 * @return true if this codec is a signed codec 442 */ 443 public boolean isSigned() { 444 return s != 0; 445 } 446 447 /** 448 * Returns the largest value that this codec can represent. 449 * 450 * @return the largest value that this codec can represent. 451 */ 452 public long largest() { 453 return largest; 454 } 455 456 /** 457 * Returns the smallest value that this codec can represent. 458 * 459 * @return the smallest value that this codec can represent. 460 */ 461 public long smallest() { 462 return smallest; 463 } 464 465 /** 466 * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown. 467 */ 468 @Override 469 public String toString() { 470 final StringBuilder buffer = new StringBuilder(11); 471 buffer.append('('); 472 buffer.append(b); 473 buffer.append(','); 474 buffer.append(h); 475 if (s != 0 || d != 0) { 476 buffer.append(','); 477 buffer.append(s); 478 } 479 if (d != 0) { 480 buffer.append(','); 481 buffer.append(d); 482 } 483 buffer.append(')'); 484 return buffer.toString(); 485 } 486}