NioZipEncoding.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one
  3.  * or more contributor license agreements.  See the NOTICE file
  4.  * distributed with this work for additional information
  5.  * regarding copyright ownership.  The ASF licenses this file
  6.  * to you under the Apache License, Version 2.0 (the
  7.  * "License"); you may not use this file except in compliance
  8.  * with the License.  You may obtain a copy of the License at
  9.  *
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  *
  12.  * Unless required by applicable law or agreed to in writing,
  13.  * software distributed under the License is distributed on an
  14.  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15.  * KIND, either express or implied.  See the License for the
  16.  * specific language governing permissions and limitations
  17.  * under the License.
  18.  */

  19. package org.apache.commons.compress.archivers.zip;

  20. import java.io.IOException;
  21. import java.nio.ByteBuffer;
  22. import java.nio.CharBuffer;
  23. import java.nio.charset.Charset;
  24. import java.nio.charset.CharsetDecoder;
  25. import java.nio.charset.CharsetEncoder;
  26. import java.nio.charset.CoderResult;
  27. import java.nio.charset.CodingErrorAction;

  28. /**
  29.  * A ZipEncoding, which uses a {@link Charset} to encode names.
  30.  * <p>
  31.  * The methods of this class are reentrant.
  32.  * </p>
  33.  *
  34.  * @Immutable
  35.  */
  36. final class NioZipEncoding implements ZipEncoding, CharsetAccessor {

  37.     private static final char REPLACEMENT = '?';
  38.     private static final byte[] REPLACEMENT_BYTES = { (byte) REPLACEMENT };
  39.     private static final String REPLACEMENT_STRING = String.valueOf(REPLACEMENT);
  40.     private static final char[] HEX_CHARS = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };

  41.     private static ByteBuffer encodeFully(final CharsetEncoder enc, final CharBuffer cb, final ByteBuffer out) {
  42.         ByteBuffer o = out;
  43.         while (cb.hasRemaining()) {
  44.             final CoderResult result = enc.encode(cb, o, false);
  45.             if (result.isOverflow()) {
  46.                 final int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
  47.                 o = ZipEncodingHelper.growBufferBy(o, increment);
  48.             }
  49.         }
  50.         return o;
  51.     }

  52.     private static CharBuffer encodeSurrogate(final CharBuffer cb, final char c) {
  53.         cb.position(0).limit(6);
  54.         cb.put('%');
  55.         cb.put('U');

  56.         cb.put(HEX_CHARS[c >> 12 & 0x0f]);
  57.         cb.put(HEX_CHARS[c >> 8 & 0x0f]);
  58.         cb.put(HEX_CHARS[c >> 4 & 0x0f]);
  59.         cb.put(HEX_CHARS[c & 0x0f]);
  60.         cb.flip();
  61.         return cb;
  62.     }

  63.     /**
  64.      * Estimate the size needed for remaining characters
  65.      *
  66.      * @param enc       encoder to use for estimates
  67.      * @param charCount number of characters remaining
  68.      * @return estimated size in bytes.
  69.      */
  70.     private static int estimateIncrementalEncodingSize(final CharsetEncoder enc, final int charCount) {
  71.         return (int) Math.ceil(charCount * enc.averageBytesPerChar());
  72.     }

  73.     /**
  74.      * Estimate the initial encoded size (in bytes) for a character buffer.
  75.      * <p>
  76.      * The estimate assumes that one character consumes uses the maximum length encoding, whilst the rest use an average size encoding. This accounts for any
  77.      * BOM for UTF-16, at the expense of a couple of extra bytes for UTF-8 encoded ASCII.
  78.      * </p>
  79.      *
  80.      * @param enc        encoder to use for estimates
  81.      * @param charChount number of characters in string
  82.      * @return estimated size in bytes.
  83.      */
  84.     private static int estimateInitialBufferSize(final CharsetEncoder enc, final int charChount) {
  85.         final float first = enc.maxBytesPerChar();
  86.         final float rest = (charChount - 1) * enc.averageBytesPerChar();
  87.         return (int) Math.ceil(first + rest);
  88.     }

  89.     private final Charset charset;

  90.     private final boolean useReplacement;

  91.     /**
  92.      * Constructs an NioZipEncoding using the given charset.
  93.      *
  94.      * @param charset        The character set to use.
  95.      * @param useReplacement should invalid characters be replaced, or reported.
  96.      */
  97.     NioZipEncoding(final Charset charset, final boolean useReplacement) {
  98.         this.charset = charset;
  99.         this.useReplacement = useReplacement;
  100.     }

  101.     /**
  102.      * @see ZipEncoding#canEncode(String)
  103.      */
  104.     @Override
  105.     public boolean canEncode(final String name) {
  106.         return newEncoder().canEncode(name);
  107.     }

  108.     /**
  109.      * @see ZipEncoding#decode(byte[])
  110.      */
  111.     @Override
  112.     public String decode(final byte[] data) throws IOException {
  113.         return newDecoder().decode(ByteBuffer.wrap(data)).toString();
  114.     }

  115.     /**
  116.      * @see ZipEncoding#encode(String)
  117.      */
  118.     @Override
  119.     public ByteBuffer encode(final String name) {
  120.         final CharsetEncoder enc = newEncoder();

  121.         final CharBuffer cb = CharBuffer.wrap(name);
  122.         CharBuffer tmp = null;
  123.         ByteBuffer out = ByteBuffer.allocate(estimateInitialBufferSize(enc, cb.remaining()));

  124.         while (cb.hasRemaining()) {
  125.             final CoderResult res = enc.encode(cb, out, false);

  126.             if (res.isUnmappable() || res.isMalformed()) {

  127.                 // write the unmappable characters in utf-16
  128.                 // pseudo-URL encoding style to ByteBuffer.

  129.                 final int spaceForSurrogate = estimateIncrementalEncodingSize(enc, 6 * res.length());
  130.                 if (spaceForSurrogate > out.remaining()) {
  131.                     // if the destination buffer isn't oversized, assume that the presence of one
  132.                     // unmappable character makes it likely that there will be more. Find all the
  133.                     // un-encoded characters and allocate space based on those estimates.
  134.                     int charCount = 0;
  135.                     for (int i = cb.position(); i < cb.limit(); i++) {
  136.                         charCount += !enc.canEncode(cb.get(i)) ? 6 : 1;
  137.                     }
  138.                     final int totalExtraSpace = estimateIncrementalEncodingSize(enc, charCount);
  139.                     out = ZipEncodingHelper.growBufferBy(out, totalExtraSpace - out.remaining());
  140.                 }
  141.                 if (tmp == null) {
  142.                     tmp = CharBuffer.allocate(6);
  143.                 }
  144.                 for (int i = 0; i < res.length(); ++i) {
  145.                     out = encodeFully(enc, encodeSurrogate(tmp, cb.get()), out);
  146.                 }

  147.             } else if (res.isOverflow()) {
  148.                 final int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
  149.                 out = ZipEncodingHelper.growBufferBy(out, increment);

  150.             } else if (res.isUnderflow() || res.isError()) {
  151.                 break;
  152.             }
  153.         }
  154.         // tell the encoder we are done
  155.         enc.encode(cb, out, true);
  156.         // may have caused underflow, but that's been ignored traditionally

  157.         out.limit(out.position());
  158.         out.rewind();
  159.         return out;
  160.     }

  161.     @Override
  162.     public Charset getCharset() {
  163.         return charset;
  164.     }

  165.     private CharsetDecoder newDecoder() {
  166.         if (!useReplacement) {
  167.             return this.charset.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
  168.         }
  169.         return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE)
  170.                 .replaceWith(REPLACEMENT_STRING);
  171.     }

  172.     private CharsetEncoder newEncoder() {
  173.         if (useReplacement) {
  174.             return charset.newEncoder().onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE)
  175.                     .replaceWith(REPLACEMENT_BYTES);
  176.         }
  177.         return charset.newEncoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
  178.     }

  179. }