View Javadoc
1   /*
2    *  Licensed to the Apache Software Foundation (ASF) under one or more
3    *  contributor license agreements.  See the NOTICE file distributed with
4    *  this work for additional information regarding copyright ownership.
5    *  The ASF licenses this file to You under the Apache License, Version 2.0
6    *  (the "License"); you may not use this file except in compliance with
7    *  the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   *  Unless required by applicable law or agreed to in writing, software
12   *  distributed under the License is distributed on an "AS IS" BASIS,
13   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   *  See the License for the specific language governing permissions and
15   *  limitations under the License.
16   */
17  
18  package org.apache.commons.compress.archivers.zip;
19  
20  import static java.nio.charset.StandardCharsets.US_ASCII;
21  import static java.nio.charset.StandardCharsets.UTF_8;
22  import static org.junit.jupiter.api.Assertions.assertEquals;
23  import static org.junit.jupiter.api.Assertions.assertNotNull;
24  import static org.junit.jupiter.api.Assertions.assertNotSame;
25  import static org.junit.jupiter.api.Assertions.fail;
26  
27  import java.io.File;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.nio.ByteBuffer;
31  import java.nio.charset.Charset;
32  import java.nio.charset.StandardCharsets;
33  import java.nio.file.Files;
34  import java.util.Enumeration;
35  import java.util.zip.CRC32;
36  
37  import org.apache.commons.compress.AbstractTest;
38  import org.junit.jupiter.api.Test;
39  
40  public class UTF8ZipFilesTest extends AbstractTest {
41  
42      private static final String CP437 = "cp437";
43      private static final String ASCII_TXT = "ascii.txt";
44      private static final String EURO_FOR_DOLLAR_TXT = "\u20AC_for_Dollar.txt";
45      private static final String OIL_BARREL_TXT = "\u00D6lf\u00E4sser.txt";
46  
47      private static void assertRawNameOfAcsiiTxt(final ZipArchiveEntry ze) {
48          final byte[] b = ze.getRawName();
49          assertNotNull(b);
50          final int len = ASCII_TXT.length();
51          assertEquals(len, b.length);
52          for (int i = 0; i < len; i++) {
53              assertEquals((byte) ASCII_TXT.charAt(i), b[i], "Byte " + i);
54          }
55          assertNotSame(b, ze.getRawName());
56      }
57  
58      private static void assertUnicodeName(final ZipArchiveEntry ze, final String expectedName, final String encoding) throws IOException {
59          if (!expectedName.equals(ze.getName())) {
60              final UnicodePathExtraField ucpf = findUniCodePath(ze);
61              assertNotNull(ucpf);
62  
63              final ZipEncoding enc = ZipEncodingHelper.getZipEncoding(encoding);
64              final ByteBuffer ne = enc.encode(ze.getName());
65  
66              final CRC32 crc = new CRC32();
67              crc.update(ne.array(), ne.arrayOffset(), ne.limit() - ne.position());
68  
69              assertEquals(crc.getValue(), ucpf.getNameCRC32());
70              assertEquals(expectedName, new String(ucpf.getUnicodeName(), UTF_8));
71          }
72      }
73  
74      private static void createTestFile(final File file, final String encoding, final boolean withEFS, final boolean withExplicitUnicodeExtra)
75              throws IOException {
76  
77          final ZipEncoding zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
78  
79          try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(file)) {
80              zos.setEncoding(encoding);
81              zos.setUseLanguageEncodingFlag(withEFS);
82              zos.setCreateUnicodeExtraFields(
83                      withExplicitUnicodeExtra ? ZipArchiveOutputStream.UnicodeExtraFieldPolicy.NEVER : ZipArchiveOutputStream.UnicodeExtraFieldPolicy.ALWAYS);
84  
85              ZipArchiveEntry ze = new ZipArchiveEntry(OIL_BARREL_TXT);
86              if (withExplicitUnicodeExtra && !zipEncoding.canEncode(ze.getName())) {
87  
88                  final ByteBuffer en = zipEncoding.encode(ze.getName());
89  
90                  ze.addExtraField(new UnicodePathExtraField(ze.getName(), en.array(), en.arrayOffset(), en.limit() - en.position()));
91              }
92  
93              zos.putArchiveEntry(ze);
94              zos.write("Hello, world!".getBytes(US_ASCII));
95              zos.closeArchiveEntry();
96  
97              ze = new ZipArchiveEntry(EURO_FOR_DOLLAR_TXT);
98              if (withExplicitUnicodeExtra && !zipEncoding.canEncode(ze.getName())) {
99  
100                 final ByteBuffer en = zipEncoding.encode(ze.getName());
101 
102                 ze.addExtraField(new UnicodePathExtraField(ze.getName(), en.array(), en.arrayOffset(), en.limit() - en.position()));
103             }
104 
105             zos.putArchiveEntry(ze);
106             zos.write("Give me your money!".getBytes(US_ASCII));
107             zos.closeArchiveEntry();
108 
109             ze = new ZipArchiveEntry(ASCII_TXT);
110 
111             if (withExplicitUnicodeExtra && !zipEncoding.canEncode(ze.getName())) {
112 
113                 final ByteBuffer en = zipEncoding.encode(ze.getName());
114 
115                 ze.addExtraField(new UnicodePathExtraField(ze.getName(), en.array(), en.arrayOffset(), en.limit() - en.position()));
116             }
117 
118             zos.putArchiveEntry(ze);
119             zos.write("ascii".getBytes(US_ASCII));
120             zos.closeArchiveEntry();
121 
122             zos.finish();
123         }
124     }
125 
126     private static UnicodePathExtraField findUniCodePath(final ZipArchiveEntry ze) {
127         return (UnicodePathExtraField) ze.getExtraField(UnicodePathExtraField.UPATH_ID);
128     }
129 
130     private static void testFile(final File file, final String encoding) throws IOException {
131         try (ZipFile zf = ZipFile.builder().setFile(file).setCharset(encoding).setUseUnicodeExtraFields(false).get()) {
132             final Enumeration<ZipArchiveEntry> e = zf.getEntries();
133             while (e.hasMoreElements()) {
134                 final ZipArchiveEntry ze = e.nextElement();
135                 if (ze.getName().endsWith("sser.txt")) {
136                     assertUnicodeName(ze, OIL_BARREL_TXT, encoding);
137                 } else if (ze.getName().endsWith("_for_Dollar.txt")) {
138                     assertUnicodeName(ze, EURO_FOR_DOLLAR_TXT, encoding);
139                 } else if (!ze.getName().equals(ASCII_TXT)) {
140                     fail("Unrecognized ZIP entry with name [" + ze.getName() + "] found.");
141                 }
142             }
143         }
144     }
145 
146     private void assertCanRead(final ZipFile zf, final String fileName) throws IOException {
147         final ZipArchiveEntry entry = zf.getEntry(fileName);
148         assertNotNull(entry, "Entry doesn't exist");
149         try (InputStream is = zf.getInputStream(entry)) {
150             assertNotNull(is, "InputStream is null");
151             is.read();
152         }
153     }
154 
155     @Test
156     public void testASCIIFileRoundtripExplicitUnicodeExtra() throws IOException {
157         testFileRoundtrip(StandardCharsets.US_ASCII.name(), false, true);
158     }
159 
160     @Test
161     public void testASCIIFileRoundtripImplicitUnicodeExtra() throws IOException {
162         testFileRoundtrip(StandardCharsets.US_ASCII.name(), false, false);
163     }
164 
165     @Test
166     public void testCP437FileRoundtripExplicitUnicodeExtra() throws IOException {
167         testFileRoundtrip(CP437, false, true);
168     }
169 
170     @Test
171     public void testCP437FileRoundtripImplicitUnicodeExtra() throws IOException {
172         testFileRoundtrip(CP437, false, false);
173     }
174 
175     private void testFileRoundtrip(final String encoding, final boolean withEFS, final boolean withExplicitUnicodeExtra) throws IOException {
176         final File file = createTempFile(encoding + "-test", ".zip");
177         createTestFile(file, encoding, withEFS, withExplicitUnicodeExtra);
178         testFile(file, encoding);
179     }
180 
181     @Test
182     public void testRawNameReadFromStream() throws IOException {
183         try (ZipArchiveInputStream zi = new ZipArchiveInputStream(newInputStream("utf8-7zip-test.zip"), CP437, false)) {
184             assertRawNameOfAcsiiTxt(zi.getNextEntry());
185         }
186     }
187 
188     @Test
189     public void testRawNameReadFromZipFile() throws IOException {
190         final File archive = getFile("utf8-7zip-test.zip");
191         try (ZipFile zf = ZipFile.builder().setFile(archive).setCharset(CP437).setUseUnicodeExtraFields(false).get()) {
192             assertRawNameOfAcsiiTxt(zf.getEntry(ASCII_TXT));
193         }
194     }
195 
196     /*
197      * 7-ZIP created archive, uses EFS to signal UTF-8 file names.
198      *
199      * 7-ZIP doesn't use EFS for strings that can be encoded in CP437 - which is true for OIL_BARREL_TXT.
200      */
201     @Test
202     public void testRead7ZipArchive() throws IOException {
203         final File archive = getFile("utf8-7zip-test.zip");
204         try (ZipFile zf = new ZipFile(archive, CP437, false)) {
205             assertNotNull(zf.getEntry(ASCII_TXT));
206             assertNotNull(zf.getEntry(EURO_FOR_DOLLAR_TXT));
207             assertNotNull(zf.getEntry(OIL_BARREL_TXT));
208         }
209     }
210 
211     @Test
212     public void testRead7ZipArchiveForStream() throws IOException {
213         try (ZipArchiveInputStream zi = new ZipArchiveInputStream(newInputStream("utf8-7zip-test.zip"), CP437, false)) {
214             assertEquals(ASCII_TXT, zi.getNextEntry().getName());
215             assertEquals(OIL_BARREL_TXT, zi.getNextEntry().getName());
216             assertEquals(EURO_FOR_DOLLAR_TXT, zi.getNextEntry().getName());
217         }
218     }
219 
220     /*
221      * WinZIP created archive, uses Unicode Extra Fields but only in the central directory.
222      */
223     @Test
224     public void testReadWinZipArchive() throws IOException {
225         final File archive = getFile("utf8-winzip-test.zip");
226         // fix for test fails on Windows with default charset that is not UTF-8
227         String encoding = null;
228         if (Charset.defaultCharset() != UTF_8) {
229             encoding = UTF_8.name();
230         }
231         try (ZipFile zf = ZipFile.builder().setFile(archive).setCharset(encoding).setUseUnicodeExtraFields(true).get()) {
232             assertCanRead(zf, ASCII_TXT);
233             assertCanRead(zf, EURO_FOR_DOLLAR_TXT);
234             assertCanRead(zf, OIL_BARREL_TXT);
235         }
236     }
237 
238     @Test
239     public void testReadWinZipArchiveForStream() throws IOException {
240         // fix for test fails on Windows with default charset that is not UTF-8
241         String encoding = null;
242         if (Charset.defaultCharset() != UTF_8) {
243             encoding = UTF_8.name();
244         }
245         try (InputStream archive = newInputStream("utf8-winzip-test.zip");
246                 ZipArchiveInputStream zi = new ZipArchiveInputStream(archive, encoding, true)) {
247             assertEquals(EURO_FOR_DOLLAR_TXT, zi.getNextEntry().getName());
248             assertEquals(OIL_BARREL_TXT, zi.getNextEntry().getName());
249             assertEquals(ASCII_TXT, zi.getNextEntry().getName());
250         }
251     }
252 
253     /**
254      * @see <a href="https://issues.apache.org/jira/browse/COMPRESS-479">COMPRESS-479</a>
255      */
256     @Test
257     public void testStreamSkipsOverUnicodeExtraFieldWithUnsupportedVersion() throws IOException {
258         try (InputStream archive = newInputStream("COMPRESS-479.zip");
259                 ZipArchiveInputStream zi = new ZipArchiveInputStream(archive)) {
260             assertEquals(OIL_BARREL_TXT, zi.getNextEntry().getName());
261             assertEquals("%U20AC_for_Dollar.txt", zi.getNextEntry().getName());
262             assertEquals(ASCII_TXT, zi.getNextEntry().getName());
263         }
264     }
265 
266     @Test
267     public void testUtf8FileRoundtripExplicitUnicodeExtra() throws IOException {
268         testFileRoundtrip(StandardCharsets.UTF_8.name(), true, true);
269     }
270 
271     @Test
272     public void testUtf8FileRoundtripImplicitUnicodeExtra() throws IOException {
273         testFileRoundtrip(StandardCharsets.UTF_8.name(), true, false);
274     }
275 
276     @Test
277     public void testUtf8FileRoundtripNoEFSExplicitUnicodeExtra() throws IOException {
278         testFileRoundtrip(StandardCharsets.UTF_8.name(), false, true);
279     }
280 
281     @Test
282     public void testUtf8FileRoundtripNoEFSImplicitUnicodeExtra() throws IOException {
283         testFileRoundtrip(StandardCharsets.UTF_8.name(), false, false);
284     }
285 
286     @Test
287     public void testUtf8Interoperability() throws IOException {
288         final File file1 = getFile("utf8-7zip-test.zip");
289         final File file2 = getFile("utf8-winzip-test.zip");
290         testFile(file1, CP437);
291         testFile(file2, CP437);
292     }
293 
294     @Test
295     public void testZipArchiveInputStreamReadsUnicodeFields() throws IOException {
296         final File file = createTempFile("unicode-test", ".zip");
297         createTestFile(file, StandardCharsets.US_ASCII.name(), false, true);
298         try (ZipFile zf = ZipFile.builder().setFile(file).setCharset(StandardCharsets.US_ASCII).setUseUnicodeExtraFields(true).get()) {
299             assertNotNull(zf.getEntry(ASCII_TXT));
300             assertNotNull(zf.getEntry(EURO_FOR_DOLLAR_TXT));
301             assertNotNull(zf.getEntry(OIL_BARREL_TXT));
302         }
303     }
304 
305     @Test
306     public void testZipFileReadsUnicodeFields() throws IOException {
307         final File file = createTempFile("unicode-test", ".zip");
308         createTestFile(file, StandardCharsets.US_ASCII.name(), false, true);
309         try (ZipArchiveInputStream zi = new ZipArchiveInputStream(Files.newInputStream(file.toPath()), StandardCharsets.US_ASCII.name(), true)) {
310             assertEquals(OIL_BARREL_TXT, zi.getNextEntry().getName());
311             assertEquals(EURO_FOR_DOLLAR_TXT, zi.getNextEntry().getName());
312             assertEquals(ASCII_TXT, zi.getNextEntry().getName());
313         }
314     }
315 
316     /**
317      * @see <a href="https://issues.apache.org/jira/browse/COMPRESS-479">COMPRESS-479</a>
318      */
319     @Test
320     public void testZipFileSkipsOverUnicodeExtraFieldWithUnsupportedVersion() throws IOException {
321         try (ZipFile zf = ZipFile.builder().setFile(getFile("COMPRESS-479.zip")).get()) {
322             assertNotNull(zf.getEntry(ASCII_TXT));
323             assertNotNull(zf.getEntry("%U20AC_for_Dollar.txt"));
324             assertNotNull(zf.getEntry(OIL_BARREL_TXT));
325         }
326     }
327 }