001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.compress.harmony.pack200; 018 019import java.io.EOFException; 020import java.io.IOException; 021import java.io.InputStream; 022import java.util.ArrayList; 023import java.util.Arrays; 024import java.util.List; 025 026import org.apache.commons.compress.utils.ExactMath; 027 028/** 029 * A BHSD codec is a means of encoding integer values as a sequence of bytes or vice versa using a specified "BHSD" 030 * encoding mechanism. It uses a variable-length encoding and a modified sign representation such that small numbers are 031 * represented as a single byte, whilst larger numbers take more bytes to encode. The number may be signed or unsigned; 032 * if it is unsigned, it can be weighted towards positive numbers or equally distributed using a one's complement. The 033 * Codec also supports delta coding, where a sequence of numbers is represented as a series of first-order differences. 034 * So a delta encoding of the integers [1..10] would be represented as a sequence of 10x1s. This allows the absolute 035 * value of a coded integer to fall outside of the 'small number' range, whilst still being encoded as a single byte. 036 * 037 * A BHSD codec is configured with four parameters: 038 * <dl> 039 * <dt>B</dt> 040 * <dd>The maximum number of bytes that each value is encoded as. B must be a value between [1..5]. For a pass-through 041 * coding (where each byte is encoded as itself, aka {@link #BYTE1}, B is 1 (each byte takes a maximum of 1 byte).</dd> 042 * <dt>H</dt> 043 * <dd>The radix of the integer. Values are defined as a sequence of values, where value {@code n} is multiplied by 044 * {@code H^<sup>n</sup>}. So the number 1234 may be represented as the sequence 4 3 2 1 with a radix (H) of 10. 045 * Note that other permutations are also possible; 43 2 1 will also encode 1234. The co-parameter L is defined as 256-H. 046 * This is important because only the last value in a sequence may be < L; all prior values must be > L.</dd> 047 * <dt>S</dt> 048 * <dd>Whether the codec represents signed values (or not). This may have 3 values; 0 (unsigned), 1 (signed, one's 049 * complement) or 2 (signed, two's complement)</dd> 050 * <dt>D</dt> 051 * <dd>Whether the codec represents a delta encoding. This may be 0 (no delta) or 1 (delta encoding). A delta encoding 052 * of 1 indicates that values are cumulative; a sequence of {@code 1 1 1 1 1} will represent the sequence 053 * {@code 1 2 3 4 5}. For this reason, the codec supports two variants of decode; one 054 * {@link #decode(InputStream, long) with} and one {@link #decode(InputStream) without} a {@code last} parameter. 055 * If the codec is a non-delta encoding, then the value is ignored if passed. If the codec is a delta encoding, it is a 056 * run-time error to call the value without the extra parameter, and the previous value should be returned. (It was 057 * designed this way to support multi-threaded access without requiring a new instance of the Codec to be cloned for 058 * each use.) 059 * </dd> 060 * </dl> 061 * 062 * Codecs are notated as (B,H,S,D) and either D or S,D may be omitted if zero. Thus {@link #BYTE1} is denoted 063 * (1,256,0,0) or (1,256). The {@link #toString()} method prints out the condensed form of the encoding. Often, the last 064 * character in the name ({@link #BYTE1}, {@link #UNSIGNED5}) gives a clue as to the B value. Those that start with U 065 * ({@link #UDELTA5}, {@link #UNSIGNED5}) are unsigned; otherwise, in most cases, they are signed. The presence of the 066 * word Delta ({@link #DELTA5}, {@link #UDELTA5}) indicates a delta encoding is used. 067 */ 068public final class BHSDCodec extends Codec { 069 070 /** 071 * The maximum number of bytes in each coding word 072 */ 073 private final int b; 074 075 /** 076 * Whether delta encoding is used (0=false,1=true) 077 */ 078 private final int d; 079 080 /** 081 * The radix of the encoding 082 */ 083 private final int h; 084 085 /** 086 * The co-parameter of h; 256-h 087 */ 088 private final int l; 089 090 /** 091 * Represents signed numbers or not (0=unsigned,1/2=signed) 092 */ 093 private final int s; 094 095 private long cardinality; 096 097 private final long smallest; 098 099 private final long largest; 100 101 /** 102 * radix^i powers 103 */ 104 private final long[] powers; 105 106 /** 107 * Constructs an unsigned, non-delta Codec with the given B and H values. 108 * 109 * @param b the maximum number of bytes that a value can be encoded as [1..5] 110 * @param h the radix of the encoding [1..256] 111 */ 112 public BHSDCodec(final int b, final int h) { 113 this(b, h, 0, 0); 114 } 115 116 /** 117 * Constructs a non-delta Codec with the given B, H and S values. 118 * 119 * @param b the maximum number of bytes that a value can be encoded as [1..5] 120 * @param h the radix of the encoding [1..256] 121 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 122 * is signed with ?) 123 */ 124 public BHSDCodec(final int b, final int h, final int s) { 125 this(b, h, s, 0); 126 } 127 128 /** 129 * Constructs a Codec with the given B, H, S and D values. 130 * 131 * @param b the maximum number of bytes that a value can be encoded as [1..5] 132 * @param h the radix of the encoding [1..256] 133 * @param s whether the encoding represents signed numbers (s=0 is unsigned; s=1 is signed with 1s complement; s=2 134 * is signed with ?) 135 * @param d whether this is a delta encoding (d=0 is non-delta; d=1 is delta) 136 */ 137 public BHSDCodec(final int b, final int h, final int s, final int d) { 138 if (b < 1 || b > 5) { 139 throw new IllegalArgumentException("1<=b<=5"); 140 } 141 if (h < 1 || h > 256) { 142 throw new IllegalArgumentException("1<=h<=256"); 143 } 144 if (s < 0 || s > 2) { 145 throw new IllegalArgumentException("0<=s<=2"); 146 } 147 if (d < 0 || d > 1) { 148 throw new IllegalArgumentException("0<=d<=1"); 149 } 150 if (b == 1 && h != 256) { 151 throw new IllegalArgumentException("b=1 -> h=256"); 152 } 153 if (h == 256 && b == 5) { 154 throw new IllegalArgumentException("h=256 -> b!=5"); 155 } 156 this.b = b; 157 this.h = h; 158 this.s = s; 159 this.d = d; 160 this.l = 256 - h; 161 if (h == 1) { 162 cardinality = b * 255 + 1; 163 } else { 164 cardinality = (long) ((long) (l * (1 - Math.pow(h, b)) / (1 - h)) + Math.pow(h, b)); 165 } 166 smallest = calculateSmallest(); 167 largest = calculateLargest(); 168 169 powers = new long[b]; 170 Arrays.setAll(powers, c -> (long) Math.pow(h, c)); 171 } 172 173 private long calculateLargest() { 174 long result; 175 // TODO This can probably be optimized into a better mathematical 176 // statement 177 if (d == 1) { 178 final BHSDCodec bh0 = new BHSDCodec(b, h); 179 return bh0.largest(); 180 } 181 switch (s) { 182 case 0: 183 result = cardinality() - 1; 184 break; 185 case 1: 186 result = cardinality() / 2 - 1; 187 break; 188 case 2: 189 result = 3L * cardinality() / 4 - 1; 190 break; 191 default: 192 throw new Error("Unknown s value"); 193 } 194 return Math.min((s == 0 ? (long) Integer.MAX_VALUE << 1 : Integer.MAX_VALUE) - 1, result); 195 } 196 197 private long calculateSmallest() { 198 long result; 199 if (d == 1 || !isSigned()) { 200 if (cardinality >= 4294967296L) { // 2^32 201 result = Integer.MIN_VALUE; 202 } else { 203 result = 0; 204 } 205 } else { 206 result = Math.max(Integer.MIN_VALUE, -cardinality() / (1 << s)); 207 } 208 return result; 209 } 210 211 /** 212 * Returns the cardinality of this codec; that is, the number of distinct values that it can contain. 213 * 214 * @return the cardinality of this codec 215 */ 216 public long cardinality() { 217 return cardinality; 218 } 219 220 @Override 221 public int decode(final InputStream in) throws IOException, Pack200Exception { 222 if (d != 0) { 223 throw new Pack200Exception("Delta encoding used without passing in last value; this is a coding error"); 224 } 225 return decode(in, 0); 226 } 227 228 @Override 229 public int decode(final InputStream in, final long last) throws IOException, Pack200Exception { 230 int n = 0; 231 long z = 0; 232 long x = 0; 233 234 do { 235 x = in.read(); 236 lastBandLength++; 237 z += x * powers[n]; 238 n++; 239 } while (x >= l && n < b); 240 241 if (x == -1) { 242 throw new EOFException("End of stream reached whilst decoding"); 243 } 244 245 if (isSigned()) { 246 final int u = (1 << s) - 1; 247 if ((z & u) == u) { 248 z = z >>> s ^ -1L; 249 } else { 250 z = z - (z >>> s); 251 } 252 } 253 // This algorithm does the same thing, but is probably slower. Leaving 254 // in for now for readability 255 // if (isSigned()) { 256 // long u = z; 257 // long twoPowS = (long) Math.pow(2, s); 258 // double twoPowSMinusOne = twoPowS-1; 259 // if (u % twoPowS < twoPowSMinusOne) { 260 // if (cardinality < Math.pow(2, 32)) { 261 // z = (long) (u - (Math.floor(u/ twoPowS))); 262 // } else { 263 // z = cast32((long) (u - (Math.floor(u/ twoPowS)))); 264 // } 265 // } else { 266 // z = (long) (-Math.floor(u/ twoPowS) - 1); 267 // } 268 // } 269 if (isDelta()) { 270 z += last; 271 } 272 return (int) z; 273 } 274 275 // private long cast32(long u) { 276 // u = (long) ((long) ((u + Math.pow(2, 31)) % Math.pow(2, 32)) - 277 // Math.pow(2, 31)); 278 // return u; 279 // } 280 281 @Override 282 public int[] decodeInts(final int n, final InputStream in) throws IOException, Pack200Exception { 283 final int[] band = super.decodeInts(n, in); 284 if (isDelta()) { 285 for (int i = 0; i < band.length; i++) { 286 while (band[i] > largest) { 287 band[i] -= cardinality; 288 } 289 while (band[i] < smallest) { 290 band[i] = ExactMath.add(band[i], cardinality); 291 } 292 } 293 } 294 return band; 295 } 296 297 @Override 298 public int[] decodeInts(final int n, final InputStream in, final int firstValue) 299 throws IOException, Pack200Exception { 300 final int[] band = super.decodeInts(n, in, firstValue); 301 if (isDelta()) { 302 for (int i = 0; i < band.length; i++) { 303 while (band[i] > largest) { 304 band[i] -= cardinality; 305 } 306 while (band[i] < smallest) { 307 band[i] = ExactMath.add(band[i], cardinality); 308 } 309 } 310 } 311 return band; 312 } 313 314 @Override 315 public byte[] encode(final int value) throws Pack200Exception { 316 return encode(value, 0); 317 } 318 319 @Override 320 public byte[] encode(final int value, final int last) throws Pack200Exception { 321 if (!encodes(value)) { 322 throw new Pack200Exception("The codec " + this + " does not encode the value " + value); 323 } 324 325 long z = value; 326 if (isDelta()) { 327 z -= last; 328 } 329 if (isSigned()) { 330 if (z < Integer.MIN_VALUE) { 331 z += 4294967296L; 332 } else if (z > Integer.MAX_VALUE) { 333 z -= 4294967296L; 334 } 335 if (z < 0) { 336 z = (-z << s) - 1; 337 } else if (s == 1) { 338 z = z << s; 339 } else { 340 z += (z - z % 3) / 3; 341 } 342 } else if (z < 0) { 343 // Need to use integer overflow here to represent negatives. 344 // 4294967296L is the 1 << 32. 345 z += Math.min(cardinality, 4294967296L); 346 } 347 if (z < 0) { 348 throw new Pack200Exception("unable to encode"); 349 } 350 351 final List<Byte> byteList = new ArrayList<>(); 352 for (int n = 0; n < b; n++) { 353 long byteN; 354 if (z < l) { 355 byteN = z; 356 } else { 357 byteN = z % h; 358 while (byteN < l) { 359 byteN += h; 360 } 361 } 362 byteList.add(Byte.valueOf((byte) byteN)); 363 if (byteN < l) { 364 break; 365 } 366 z -= byteN; 367 z /= h; 368 } 369 final byte[] bytes = new byte[byteList.size()]; 370 for (int i = 0; i < bytes.length; i++) { 371 bytes[i] = byteList.get(i).byteValue(); 372 } 373 return bytes; 374 } 375 376 /** 377 * True if this encoding can code the given value 378 * 379 * @param value the value to check 380 * @return {@code true} if the encoding can encode this value 381 */ 382 public boolean encodes(final long value) { 383 return value >= smallest && value <= largest; 384 } 385 386 @Override 387 public boolean equals(final Object o) { 388 if (o instanceof BHSDCodec) { 389 final BHSDCodec codec = (BHSDCodec) o; 390 return codec.b == b && codec.h == h && codec.s == s && codec.d == d; 391 } 392 return false; 393 } 394 395 /** 396 * @return the b 397 */ 398 public int getB() { 399 return b; 400 } 401 402 /** 403 * @return the h 404 */ 405 public int getH() { 406 return h; 407 } 408 409 /** 410 * @return the l 411 */ 412 public int getL() { 413 return l; 414 } 415 416 /** 417 * @return the s 418 */ 419 public int getS() { 420 return s; 421 } 422 423 @Override 424 public int hashCode() { 425 return ((b * 37 + h) * 37 + s) * 37 + d; 426 } 427 428 /** 429 * Returns true if this codec is a delta codec 430 * 431 * @return true if this codec is a delta codec 432 */ 433 public boolean isDelta() { 434 return d != 0; 435 } 436 437 /** 438 * Returns true if this codec is a signed codec 439 * 440 * @return true if this codec is a signed codec 441 */ 442 public boolean isSigned() { 443 return s != 0; 444 } 445 446 /** 447 * Returns the largest value that this codec can represent. 448 * 449 * @return the largest value that this codec can represent. 450 */ 451 public long largest() { 452 return largest; 453 } 454 455 /** 456 * Returns the smallest value that this codec can represent. 457 * 458 * @return the smallest value that this codec can represent. 459 */ 460 public long smallest() { 461 return smallest; 462 } 463 464 /** 465 * Returns the codec in the form (1,256) or (1,64,1,1). Note that trailing zero fields are not shown. 466 */ 467 @Override 468 public String toString() { 469 final StringBuilder buffer = new StringBuilder(11); 470 buffer.append('('); 471 buffer.append(b); 472 buffer.append(','); 473 buffer.append(h); 474 if (s != 0 || d != 0) { 475 buffer.append(','); 476 buffer.append(s); 477 } 478 if (d != 0) { 479 buffer.append(','); 480 buffer.append(d); 481 } 482 buffer.append(')'); 483 return buffer.toString(); 484 } 485}