/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.codec.binary;

020 /**
021 * Provides Base32 encoding and decoding as defined by <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>.
022 *
023 * <p>
024 * The class can be parameterized in the following manner with various constructors:
025 * <ul>
026 * <li>Whether to use the "base32hex" variant instead of the default "base32"</li>
027 * <li>Line length: Default 76. Line length that aren't multiples of 8 will still essentially end up being multiples of
028 * 8 in the encoded data.
029 * <li>Line separator: Default is CRLF ("\r\n")</li>
030 * </ul>
031 * </p>
032 * <p>
033 * This class operates directly on byte streams, and not character streams.
034 * </p>
035 * <p>
036 * This class is thread-safe.
037 * </p>
038 *
039 * @see <a href="http://www.ietf.org/rfc/rfc4648.txt">RFC 4648</a>
040 *
041 * @since 1.5
042 * @version $Id: Base32.html 889935 2013-12-11 05:05:13Z ggregory $
043 */
044 public class Base32 extends BaseNCodec {
045
046 /**
047 * BASE32 characters are 5 bits in length.
048 * They are formed by taking a block of five octets to form a 40-bit string,
049 * which is converted into eight BASE32 characters.
050 */
051 private static final int BITS_PER_ENCODED_BYTE = 5;
052 private static final int BYTES_PER_ENCODED_BLOCK = 8;
053 private static final int BYTES_PER_UNENCODED_BLOCK = 5;
054
055 /**
056 * Chunk separator per RFC 2045 section 2.1.
057 *
058 * @see <a href="http://www.ietf.org/rfc/rfc2045.txt">RFC 2045 section 2.1</a>
059 */
060 private static final byte[] CHUNK_SEPARATOR = {'\r', '\n'};
061
062 /**
063 * This array is a lookup table that translates Unicode characters drawn from the "Base32 Alphabet" (as specified
064 * in Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the Base32
065 * alphabet but fall within the bounds of the array are translated to -1.
066 */
067 private static final byte[] DECODE_TABLE = {
068 // 0 1 2 3 4 5 6 7 8 9 A B C D E F
069 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
070 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
071 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
072 -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
073 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // 40-4f A-N
074 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, // 50-5a O-Z
075 };
076
077 /**
078 * This array is a lookup table that translates 5-bit positive integer index values into their "Base32 Alphabet"
079 * equivalents as specified in Table 3 of RFC 2045.
080 */
081 private static final byte[] ENCODE_TABLE = {
082 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
083 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
084 '2', '3', '4', '5', '6', '7',
085 };
086
087 /**
088 * This array is a lookup table that translates Unicode characters drawn from the "Base32 |Hex Alphabet" (as
089 * specified in Table 3 of RFC 2045) into their 5-bit positive integer equivalents. Characters that are not in the
090 * Base32 Hex alphabet but fall within the bounds of the array are translated to -1.
091 */
092 private static final byte[] HEX_DECODE_TABLE = {
093 // 0 1 2 3 4 5 6 7 8 9 A B C D E F
094 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 00-0f
095 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 10-1f
096 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 63, // 20-2f
097 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 30-3f 2-7
098 -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // 40-4f A-N
099 25, 26, 27, 28, 29, 30, 31, 32, // 50-57 O-V
100 };
101
102 /**
103 * This array is a lookup table that translates 5-bit positive integer index values into their
104 * "Base32 Hex Alphabet" equivalents as specified in Table 3 of RFC 2045.
105 */
106 private static final byte[] HEX_ENCODE_TABLE = {
107 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
108 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
109 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
110 };
111
112 /** Mask used to extract 5 bits, used when encoding Base32 bytes */
113 private static final int MASK_5BITS = 0x1f;
114
115 // The static final fields above are used for the original static byte[] methods on Base32.
116 // The private member fields below are used with the new streaming approach, which requires
117 // some state be preserved between calls of encode() and decode().
118
119 /**
120 * Place holder for the bytes we're dealing with for our based logic.
121 * Bitwise operations store and extract the encoding or decoding from this variable.
122 */
123
124 /**
125 * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
126 * <code>decodeSize = {@link #BYTES_PER_ENCODED_BLOCK} - 1 + lineSeparator.length;</code>
127 */
128 private final int decodeSize;
129
130 /**
131 * Decode table to use.
132 */
133 private final byte[] decodeTable;
134
135 /**
136 * Convenience variable to help us determine when our buffer is going to run out of room and needs resizing.
137 * <code>encodeSize = {@link #BYTES_PER_ENCODED_BLOCK} + lineSeparator.length;</code>
138 */
139 private final int encodeSize;
140
141 /**
142 * Encode table to use.
143 */
144 private final byte[] encodeTable;
145
146 /**
147 * Line separator for encoding. Not used when decoding. Only used if lineLength > 0.
148 */
149 private final byte[] lineSeparator;
150
151 /**
152 * Creates a Base32 codec used for decoding and encoding.
153 * <p>
154 * When encoding the line length is 0 (no chunking).
155 * </p>
156 *
157 */
158 public Base32() {
159 this(false);
160 }
161
162 /**
163 * Creates a Base32 codec used for decoding and encoding.
164 * <p>
165 * When encoding the line length is 0 (no chunking).
166 * </p>
167 * @param useHex if {@code true} then use Base32 Hex alphabet
168 */
169 public Base32(boolean useHex) {
170 this(0, null, useHex);
171 }
172
173 /**
174 * Creates a Base32 codec used for decoding and encoding.
175 * <p>
176 * When encoding the line length is given in the constructor, the line separator is CRLF.
177 * </p>
178 *
179 * @param lineLength
180 * Each line of encoded data will be at most of the given length (rounded down to nearest multiple of
181 * 8). If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when
182 * decoding.
183 */
184 public Base32(int lineLength) {
185 this(lineLength, CHUNK_SEPARATOR);
186 }
187
188 /**
189 * Creates a Base32 codec used for decoding and encoding.
190 * <p>
191 * When encoding the line length and line separator are given in the constructor.
192 * </p>
193 * <p>
194 * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
195 * </p>
196 *
197 * @param lineLength
198 * Each line of encoded data will be at most of the given length (rounded down to nearest multiple of
199 * 8). If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when
200 * decoding.
201 * @param lineSeparator
202 * Each line of encoded data will end with this sequence of bytes.
203 * @throws IllegalArgumentException
204 * The provided lineSeparator included some Base32 characters. That's not going to work!
205 */
206 public Base32(int lineLength, byte[] lineSeparator) {
207 this(lineLength, lineSeparator, false);
208 }
209
210 /**
211 * Creates a Base32 / Base32 Hex codec used for decoding and encoding.
212 * <p>
213 * When encoding the line length and line separator are given in the constructor.
214 * </p>
215 * <p>
216 * Line lengths that aren't multiples of 8 will still essentially end up being multiples of 8 in the encoded data.
217 * </p>
218 *
219 * @param lineLength
220 * Each line of encoded data will be at most of the given length (rounded down to nearest multiple of
221 * 8). If lineLength <= 0, then the output will not be divided into lines (chunks). Ignored when
222 * decoding.
223 * @param lineSeparator
224 * Each line of encoded data will end with this sequence of bytes.
225 * @param useHex
226 * if {@code true}, then use Base32 Hex alphabet, otherwise use Base32 alphabet
227 * @throws IllegalArgumentException
228 * The provided lineSeparator included some Base32 characters. That's not going to work! Or the
229 * lineLength > 0 and lineSeparator is null.
230 */
231 public Base32(int lineLength, byte[] lineSeparator, boolean useHex) {
232 super(BYTES_PER_UNENCODED_BLOCK, BYTES_PER_ENCODED_BLOCK,
233 lineLength,
234 lineSeparator == null ? 0 : lineSeparator.length);
235 if (useHex){
236 this.encodeTable = HEX_ENCODE_TABLE;
237 this.decodeTable = HEX_DECODE_TABLE;
238 } else {
239 this.encodeTable = ENCODE_TABLE;
240 this.decodeTable = DECODE_TABLE;
241 }
242 if (lineLength > 0) {
243 if (lineSeparator == null) {
244 throw new IllegalArgumentException("lineLength "+lineLength+" > 0, but lineSeparator is null");
245 }
246 // Must be done after initializing the tables
247 if (containsAlphabetOrPad(lineSeparator)) {
248 String sep = StringUtils.newStringUtf8(lineSeparator);
249 throw new IllegalArgumentException("lineSeparator must not contain Base32 characters: [" + sep + "]");
250 }
251 this.encodeSize = BYTES_PER_ENCODED_BLOCK + lineSeparator.length;
252 this.lineSeparator = new byte[lineSeparator.length];
253 System.arraycopy(lineSeparator, 0, this.lineSeparator, 0, lineSeparator.length);
254 } else {
255 this.encodeSize = BYTES_PER_ENCODED_BLOCK;
256 this.lineSeparator = null;
257 }
258 this.decodeSize = this.encodeSize - 1;
259 }
260
261 /**
262 * <p>
263 * Decodes all of the provided data, starting at inPos, for inAvail bytes. Should be called at least twice: once
264 * with the data to decode, and once with inAvail set to "-1" to alert decoder that EOF has been reached. The "-1"
265 * call is not necessary when decoding, but it doesn't hurt, either.
266 * </p>
267 * <p>
268 * Ignores all non-Base32 characters. This is how chunked (e.g. 76 character) data is handled, since CR and LF are
269 * silently ignored, but has implications for other bytes, too. This method subscribes to the garbage-in,
270 * garbage-out philosophy: it will not check the provided data for validity.
271 * </p>
272 *
273 * @param in
274 * byte[] array of ascii data to Base32 decode.
275 * @param inPos
276 * Position to start reading data from.
277 * @param inAvail
278 * Amount of bytes available from input for encoding.
279 * @param context the context to be used
280 *
281 * Output is written to {@link Context#buffer} as 8-bit octets, using {@link Context#pos} as the buffer position
282 */
283 @Override
284 void decode(byte[] in, int inPos, int inAvail, Context context) { // package protected for access from I/O streams
285 if (context.eof) {
286 return;
287 }
288 if (inAvail < 0) {
289 context.eof = true;
290 }
291 for (int i = 0; i < inAvail; i++) {
292 final byte b = in[inPos++];
293 if (b == PAD) {
294 // We're done.
295 context.eof = true;
296 break;
297 } else {
298 final byte[] buffer = ensureBufferSize(decodeSize, context);
299 if (b >= 0 && b < this.decodeTable.length) {
300 final int result = this.decodeTable[b];
301 if (result >= 0) {
302 context.modulus = (context.modulus+1) % BYTES_PER_ENCODED_BLOCK;
303 // collect decoded bytes
304 context.lbitWorkArea = (context.lbitWorkArea << BITS_PER_ENCODED_BYTE) + result;
305 if (context.modulus == 0) { // we can output the 5 bytes
306 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 32) & MASK_8BITS);
307 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 24) & MASK_8BITS);
308 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
309 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
310 buffer[context.pos++] = (byte) (context.lbitWorkArea & MASK_8BITS);
311 }
312 }
313 }
314 }
315 }
316
317 // Two forms of EOF as far as Base32 decoder is concerned: actual
318 // EOF (-1) and first time '=' character is encountered in stream.
319 // This approach makes the '=' padding characters completely optional.
320 if (context.eof && context.modulus >= 2) { // if modulus < 2, nothing to do
321 final byte[] buffer = ensureBufferSize(decodeSize, context);
322
323 // we ignore partial bytes, i.e. only multiples of 8 count
324 switch (context.modulus) {
325 case 2 : // 10 bits, drop 2 and output one byte
326 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 2) & MASK_8BITS);
327 break;
328 case 3 : // 15 bits, drop 7 and output 1 byte
329 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 7) & MASK_8BITS);
330 break;
331 case 4 : // 20 bits = 2*8 + 4
332 context.lbitWorkArea = context.lbitWorkArea >> 4; // drop 4 bits
333 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
334 buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
335 break;
336 case 5 : // 25bits = 3*8 + 1
337 context.lbitWorkArea = context.lbitWorkArea >> 1;
338 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
339 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
340 buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
341 break;
342 case 6 : // 30bits = 3*8 + 6
343 context.lbitWorkArea = context.lbitWorkArea >> 6;
344 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
345 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
346 buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
347 break;
348 case 7 : // 35 = 4*8 +3
349 context.lbitWorkArea = context.lbitWorkArea >> 3;
350 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 24) & MASK_8BITS);
351 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 16) & MASK_8BITS);
352 buffer[context.pos++] = (byte) ((context.lbitWorkArea >> 8) & MASK_8BITS);
353 buffer[context.pos++] = (byte) ((context.lbitWorkArea) & MASK_8BITS);
354 break;
355 default:
356 // modulus can be 0-7, and we excluded 0,1 already
357 throw new IllegalStateException("Impossible modulus "+context.modulus);
358 }
359 }
360 }
361
362 /**
363 * <p>
364 * Encodes all of the provided data, starting at inPos, for inAvail bytes. Must be called at least twice: once with
365 * the data to encode, and once with inAvail set to "-1" to alert encoder that EOF has been reached, so flush last
366 * remaining bytes (if not multiple of 5).
367 * </p>
368 *
369 * @param in
370 * byte[] array of binary data to Base32 encode.
371 * @param inPos
372 * Position to start reading data from.
373 * @param inAvail
374 * Amount of bytes available from input for encoding.
375 * @param context the context to be used
376 */
377 @Override
378 void encode(byte[] in, int inPos, int inAvail, Context context) { // package protected for access from I/O streams
379 if (context.eof) {
380 return;
381 }
382 // inAvail < 0 is how we're informed of EOF in the underlying data we're
383 // encoding.
384 if (inAvail < 0) {
385 context.eof = true;
386 if (0 == context.modulus && lineLength == 0) {
387 return; // no leftovers to process and not using chunking
388 }
389 final byte[] buffer = ensureBufferSize(encodeSize, context);
390 final int savedPos = context.pos;
391 switch (context.modulus) { // % 5
392 case 0 :
393 break;
394 case 1 : // Only 1 octet; take top 5 bits then remainder
395 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 3) & MASK_5BITS]; // 8-1*5 = 3
396 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea << 2) & MASK_5BITS]; // 5-3=2
397 buffer[context.pos++] = PAD;
398 buffer[context.pos++] = PAD;
399 buffer[context.pos++] = PAD;
400 buffer[context.pos++] = PAD;
401 buffer[context.pos++] = PAD;
402 buffer[context.pos++] = PAD;
403 break;
404 case 2 : // 2 octets = 16 bits to use
405 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 11) & MASK_5BITS]; // 16-1*5 = 11
406 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 6) & MASK_5BITS]; // 16-2*5 = 6
407 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 1) & MASK_5BITS]; // 16-3*5 = 1
408 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea << 4) & MASK_5BITS]; // 5-1 = 4
409 buffer[context.pos++] = PAD;
410 buffer[context.pos++] = PAD;
411 buffer[context.pos++] = PAD;
412 buffer[context.pos++] = PAD;
413 break;
414 case 3 : // 3 octets = 24 bits to use
415 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 19) & MASK_5BITS]; // 24-1*5 = 19
416 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 14) & MASK_5BITS]; // 24-2*5 = 14
417 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 9) & MASK_5BITS]; // 24-3*5 = 9
418 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 4) & MASK_5BITS]; // 24-4*5 = 4
419 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea << 1) & MASK_5BITS]; // 5-4 = 1
420 buffer[context.pos++] = PAD;
421 buffer[context.pos++] = PAD;
422 buffer[context.pos++] = PAD;
423 break;
424 case 4 : // 4 octets = 32 bits to use
425 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 27) & MASK_5BITS]; // 32-1*5 = 27
426 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 22) & MASK_5BITS]; // 32-2*5 = 22
427 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 17) & MASK_5BITS]; // 32-3*5 = 17
428 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 12) & MASK_5BITS]; // 32-4*5 = 12
429 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 7) & MASK_5BITS]; // 32-5*5 = 7
430 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 2) & MASK_5BITS]; // 32-6*5 = 2
431 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea << 3) & MASK_5BITS]; // 5-2 = 3
432 buffer[context.pos++] = PAD;
433 break;
434 default:
435 throw new IllegalStateException("Impossible modulus "+context.modulus);
436 }
437 context.currentLinePos += context.pos - savedPos; // keep track of current line position
438 // if currentPos == 0 we are at the start of a line, so don't add CRLF
439 if (lineLength > 0 && context.currentLinePos > 0){ // add chunk separator if required
440 System.arraycopy(lineSeparator, 0, buffer, context.pos, lineSeparator.length);
441 context.pos += lineSeparator.length;
442 }
443 } else {
444 for (int i = 0; i < inAvail; i++) {
445 final byte[] buffer = ensureBufferSize(encodeSize, context);
446 context.modulus = (context.modulus+1) % BYTES_PER_UNENCODED_BLOCK;
447 int b = in[inPos++];
448 if (b < 0) {
449 b += 256;
450 }
451 context.lbitWorkArea = (context.lbitWorkArea << 8) + b; // BITS_PER_BYTE
452 if (0 == context.modulus) { // we have enough bytes to create our output
453 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 35) & MASK_5BITS];
454 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 30) & MASK_5BITS];
455 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 25) & MASK_5BITS];
456 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 20) & MASK_5BITS];
457 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 15) & MASK_5BITS];
458 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 10) & MASK_5BITS];
459 buffer[context.pos++] = encodeTable[(int)(context.lbitWorkArea >> 5) & MASK_5BITS];
460 buffer[context.pos++] = encodeTable[(int)context.lbitWorkArea & MASK_5BITS];
461 context.currentLinePos += BYTES_PER_ENCODED_BLOCK;
462 if (lineLength > 0 && lineLength <= context.currentLinePos) {
463 System.arraycopy(lineSeparator, 0, buffer, context.pos, lineSeparator.length);
464 context.pos += lineSeparator.length;
465 context.currentLinePos = 0;
466 }
467 }
468 }
469 }
470 }
471
472 /**
473 * Returns whether or not the <code>octet</code> is in the Base32 alphabet.
474 *
475 * @param octet
476 * The value to test
477 * @return {@code true} if the value is defined in the the Base32 alphabet {@code false} otherwise.
478 */
479 @Override
480 public boolean isInAlphabet(byte octet) {
481 return octet >= 0 && octet < decodeTable.length && decodeTable[octet] != -1;
482 }
483 }