View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   * http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.compress.archivers.zip;
21  
22  import java.io.IOException;
23  import java.nio.ByteBuffer;
24  import java.nio.CharBuffer;
25  import java.nio.charset.Charset;
26  import java.nio.charset.CharsetDecoder;
27  import java.nio.charset.CharsetEncoder;
28  import java.nio.charset.CoderResult;
29  import java.nio.charset.CodingErrorAction;
30  
31  /**
32   * A ZipEncoding, which uses a {@link Charset} to encode names.
33   * <p>
34   * The methods of this class are reentrant.
35   * </p>
36   *
37   * @Immutable
38   */
39  final class NioZipEncoding implements ZipEncoding, CharsetAccessor {
40  
41      private static final char REPLACEMENT = '?';
42      private static final byte[] REPLACEMENT_BYTES = { (byte) REPLACEMENT };
43      private static final String REPLACEMENT_STRING = String.valueOf(REPLACEMENT);
44      private static final char[] HEX_CHARS = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
45  
46      private static ByteBuffer encodeFully(final CharsetEncoder enc, final CharBuffer cb, final ByteBuffer out) {
47          ByteBuffer o = out;
48          while (cb.hasRemaining()) {
49              final CoderResult result = enc.encode(cb, o, false);
50              if (result.isOverflow()) {
51                  final int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
52                  o = ZipEncodingHelper.growBufferBy(o, increment);
53              }
54          }
55          return o;
56      }
57  
58      private static CharBuffer encodeSurrogate(final CharBuffer cb, final char c) {
59          cb.position(0).limit(6);
60          cb.put('%');
61          cb.put('U');
62  
63          cb.put(HEX_CHARS[c >> 12 & 0x0f]);
64          cb.put(HEX_CHARS[c >> 8 & 0x0f]);
65          cb.put(HEX_CHARS[c >> 4 & 0x0f]);
66          cb.put(HEX_CHARS[c & 0x0f]);
67          cb.flip();
68          return cb;
69      }
70  
71      /**
72       * Estimate the size needed for remaining characters
73       *
74       * @param enc       encoder to use for estimates
75       * @param charCount number of characters remaining
76       * @return estimated size in bytes.
77       */
78      private static int estimateIncrementalEncodingSize(final CharsetEncoder enc, final int charCount) {
79          return (int) Math.ceil(charCount * enc.averageBytesPerChar());
80      }
81  
82      /**
83       * Estimate the initial encoded size (in bytes) for a character buffer.
84       * <p>
85       * The estimate assumes that one character consumes uses the maximum length encoding, whilst the rest use an average size encoding. This accounts for any
86       * BOM for UTF-16, at the expense of a couple of extra bytes for UTF-8 encoded ASCII.
87       * </p>
88       *
89       * @param enc        encoder to use for estimates
90       * @param charChount number of characters in string
91       * @return estimated size in bytes.
92       */
93      private static int estimateInitialBufferSize(final CharsetEncoder enc, final int charChount) {
94          final float first = enc.maxBytesPerChar();
95          final float rest = (charChount - 1) * enc.averageBytesPerChar();
96          return (int) Math.ceil(first + rest);
97      }
98  
99      private final Charset charset;
100 
101     private final boolean useReplacement;
102 
103     /**
104      * Constructs an NioZipEncoding using the given charset.
105      *
106      * @param charset        The character set to use.
107      * @param useReplacement should invalid characters be replaced, or reported.
108      */
109     NioZipEncoding(final Charset charset, final boolean useReplacement) {
110         this.charset = charset;
111         this.useReplacement = useReplacement;
112     }
113 
114     /**
115      * @see ZipEncoding#canEncode(String)
116      */
117     @Override
118     public boolean canEncode(final String name) {
119         return newEncoder().canEncode(name);
120     }
121 
122     /**
123      * @see ZipEncoding#decode(byte[])
124      */
125     @Override
126     public String decode(final byte[] data) throws IOException {
127         return newDecoder().decode(ByteBuffer.wrap(data)).toString();
128     }
129 
130     /**
131      * @see ZipEncoding#encode(String)
132      */
133     @Override
134     public ByteBuffer encode(final String name) {
135         final CharsetEncoder enc = newEncoder();
136 
137         final CharBuffer cb = CharBuffer.wrap(name);
138         CharBuffer tmp = null;
139         ByteBuffer out = ByteBuffer.allocate(estimateInitialBufferSize(enc, cb.remaining()));
140 
141         while (cb.hasRemaining()) {
142             final CoderResult res = enc.encode(cb, out, false);
143 
144             if (res.isUnmappable() || res.isMalformed()) {
145 
146                 // write the unmappable characters in utf-16
147                 // pseudo-URL encoding style to ByteBuffer.
148 
149                 final int spaceForSurrogate = estimateIncrementalEncodingSize(enc, 6 * res.length());
150                 if (spaceForSurrogate > out.remaining()) {
151                     // if the destination buffer isn't oversized, assume that the presence of one
152                     // unmappable character makes it likely that there will be more. Find all the
153                     // un-encoded characters and allocate space based on those estimates.
154                     int charCount = 0;
155                     for (int i = cb.position(); i < cb.limit(); i++) {
156                         charCount += !enc.canEncode(cb.get(i)) ? 6 : 1;
157                     }
158                     final int totalExtraSpace = estimateIncrementalEncodingSize(enc, charCount);
159                     out = ZipEncodingHelper.growBufferBy(out, totalExtraSpace - out.remaining());
160                 }
161                 if (tmp == null) {
162                     tmp = CharBuffer.allocate(6);
163                 }
164                 for (int i = 0; i < res.length(); ++i) {
165                     out = encodeFully(enc, encodeSurrogate(tmp, cb.get()), out);
166                 }
167 
168             } else if (res.isOverflow()) {
169                 final int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
170                 out = ZipEncodingHelper.growBufferBy(out, increment);
171 
172             } else if (res.isUnderflow() || res.isError()) {
173                 break;
174             }
175         }
176         // tell the encoder we are done
177         enc.encode(cb, out, true);
178         // may have caused underflow, but that's been ignored traditionally
179 
180         out.limit(out.position());
181         out.rewind();
182         return out;
183     }
184 
185     @Override
186     public Charset getCharset() {
187         return charset;
188     }
189 
190     private CharsetDecoder newDecoder() {
191         if (!useReplacement) {
192             return this.charset.newDecoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
193         }
194         return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE)
195                 .replaceWith(REPLACEMENT_STRING);
196     }
197 
198     private CharsetEncoder newEncoder() {
199         if (useReplacement) {
200             return charset.newEncoder().onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE)
201                     .replaceWith(REPLACEMENT_BYTES);
202         }
203         return charset.newEncoder().onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT);
204     }
205 
206 }