View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.junit.jupiter.api.Assertions.assertArrayEquals;
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  import static org.junit.jupiter.api.Assertions.assertNotEquals;
22  import static org.junit.jupiter.api.Assertions.assertNotNull;
23  import static org.junit.jupiter.api.Assertions.assertThrows;
24  import static org.junit.jupiter.api.Assertions.assertTrue;
25  import static org.junit.jupiter.api.Assertions.fail;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.StringReader;
30  import java.nio.ByteBuffer;
31  import java.nio.CharBuffer;
32  import java.nio.charset.Charset;
33  import java.nio.charset.CharsetEncoder;
34  import java.nio.charset.CoderResult;
35  import java.nio.charset.CodingErrorAction;
36  import java.nio.charset.StandardCharsets;
37  import java.nio.charset.UnmappableCharacterException;
38  import java.util.Random;
39  
40  import org.apache.commons.io.CharsetsTest;
41  import org.apache.commons.io.IOUtils;
42  import org.apache.commons.lang3.StringUtils;
43  import org.junit.jupiter.api.Test;
44  import org.junit.jupiter.params.ParameterizedTest;
45  import org.junit.jupiter.params.provider.MethodSource;
46  
47  class CharSequenceInputStreamTest {
48  
    /** Name of the standard UTF-16 charset, as reported by {@link Charset#name()}. */
    private static final String UTF_16 = StandardCharsets.UTF_16.name();
    /** Name of the standard UTF-8 charset, as reported by {@link Charset#name()}. */
    private static final String UTF_8 = StandardCharsets.UTF_8.name();
    /** The 26 upper-case ASCII letters; input for the IO-356 mark/reset checks. */
    private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    /** French sample text containing accented (non-ASCII) letters. */
    private static final String TEST_STRING = "\u00e0 peine arriv\u00e9s nous entr\u00e2mes dans sa chambre";
    /** {@link #TEST_STRING} repeated 100 times. */
    private static final String LARGE_TEST_STRING = StringUtils.repeat(TEST_STRING, 100);

    /** Supplies the random offsets and lengths used by the buffered-read check; created without an explicit seed. */
    private final Random random = new Random();
56  
57      private int checkAvail(final InputStream is, final int min) throws Exception {
58          final int available = is.available();
59          assertTrue(available >= min, "avail should be >= " + min + ", but was " + available);
60          return available;
61      }
62  
63      private boolean isAvailabilityTestableForCharset(final String csName) {
64          return Charset.forName(csName).canEncode()
65                  && !"COMPOUND_TEXT".equalsIgnoreCase(csName) && !"x-COMPOUND_TEXT".equalsIgnoreCase(csName)
66                  && !isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(csName);
67      }
68  
69      private boolean isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(final String csName) {
70          return "x-IBM1388".equalsIgnoreCase(csName) ||
71                  "ISO-2022-CN".equalsIgnoreCase(csName) ||
72                  "ISO-2022-JP".equalsIgnoreCase(csName) ||
73                  "Shift_JIS".equalsIgnoreCase(csName);
74      }
75  
76      @ParameterizedTest(name = "{0}")
77      @MethodSource(CharsetsTest.AVAIL_CHARSETS)
78      void testAvailable(final String csName) throws Exception {
79          // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
80          // also try and avoid the following exception
81  //            java.lang.UnsupportedOperationException: null
82  //            at java.nio.CharBuffer.array(CharBuffer.java:940)
83  //            at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
84  //            at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
85  //            at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:120)
86  //            at org.apache.commons.io.input.CharSequenceInputStream.read(CharSequenceInputStream.java:151)
87  //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailableRead(CharSequenceInputStreamTest.java:412)
88  //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailable(CharSequenceInputStreamTest.java:424)
89  
90          try {
91              if (isAvailabilityTestableForCharset(csName)) {
92                  testAvailableSkip(csName);
93                  testAvailableRead(csName);
94              }
95          } catch (final UnsupportedOperationException e) {
96              fail("Operation not supported for " + csName);
97          }
98      }
99  
100     @Test
101     void testAvailableAfterClose() throws Exception {
102         final InputStream shadow;
103         try (InputStream in = CharSequenceInputStream.builder().setCharSequence("Hi").get()) {
104             assertTrue(in.available() > 0);
105             shadow = in;
106         }
107         assertEquals(0, shadow.available());
108     }
109 
110     /**
111      * IO-781 available() returns 2 but only 1 byte is read afterwards.
112      */
113     @Test
114     void testAvailableAfterOpen() throws IOException {
115         final Charset charset = Charset.forName("Big5");
116         try (CharSequenceInputStream in = new CharSequenceInputStream("\uD800\uDC00", charset)) {
117             final int available = in.available();
118             final byte[] data = new byte[available];
119             final int bytesRead = in.read(data);
120             assertEquals(available, bytesRead);
121         }
122     }
123 
124     private void testAvailableRead(final String csName) throws Exception {
125         final String input = "test";
126         try (InputStream r = new CharSequenceInputStream(input, csName)) {
127             int available = checkAvail(r, input.length());
128             assertEquals(available - 1, r.skip(available - 1)); // skip all but one
129             available = checkAvail(r, 1);
130             final byte[] buff = new byte[available];
131             assertEquals(available, r.read(buff, 0, available));
132         }
133     }
134 
135     private void testAvailableSkip(final String csName) throws Exception {
136         final String input = "test";
137         try (InputStream r = new CharSequenceInputStream(input, csName)) {
138             int available = checkAvail(r, input.length());
139             assertEquals(available - 1, r.skip(available - 1)); // skip all but one
140             available = checkAvail(r, 1);
141             assertEquals(1, r.skip(1));
142             available = checkAvail(r, 0);
143         }
144     }
145 
146     private void testBufferedRead(final String testString, final String charsetName) throws IOException {
147         final byte[] expected = testString.getBytes(charsetName);
148         try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
149             final byte[] buffer = new byte[128];
150             int offset = 0;            while (true) {
151                 int bufferOffset = random.nextInt(64);
152                 final int bufferLength = random.nextInt(64);
153                 int read = in.read(buffer, bufferOffset, bufferLength);
154                 if (read == -1) {
155                     assertEquals(expected.length, offset, "EOF: offset should equal length for charset " + charsetName);
156                     break;
157                 }
158                 assertTrue(read <= bufferLength, "Read " + read + " <= " + bufferLength);
159                 while (read > 0) {
160                     assertTrue(offset < expected.length,
161                             "offset for " + charsetName + " " + offset + " < " + expected.length);
162                     assertEquals(expected[offset], buffer[bufferOffset], "bytes should agree for " + charsetName);
163                     offset++;
164                     bufferOffset++;
165                     read--;
166                 }
167             }
168         }
169     }
170 
171     //    Unfortunately checking canEncode does not seem to work for all charsets:
172 //    testBufferedRead_AvailableCharset(org.apache.commons.io.input.CharSequenceInputStreamTest)  Time elapsed: 0.682 sec  <<< ERROR!
173 //    java.lang.UnsupportedOperationException: null
174 //        at java.nio.CharBuffer.array(CharBuffer.java:940)
175 //        at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
176 //        at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
177 //        at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:111)
178     @ParameterizedTest(name = "{0}")
179     @MethodSource(CharsetsTest.AVAIL_CHARSETS)
180     void testBufferedRead_AvailableCharset(final String csName) throws IOException {
181         // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
182         if (isAvailabilityTestableForCharset(csName)) {
183             testBufferedRead(TEST_STRING, csName);
184         }
185     }
186 
187     @ParameterizedTest
188     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
189     void testBufferedRead_RequiredCharset(final String csName) throws IOException {
190         testBufferedRead(TEST_STRING, csName);
191     }
192 
193     @Test
194     void testBufferedRead_UTF8() throws IOException {
195         testBufferedRead(TEST_STRING, UTF_8);
196     }
197 
198     @Test
199     void testCharacterCodingException() throws IOException {
200         final Charset charset = StandardCharsets.US_ASCII;
201         final CharSequenceInputStream in = CharSequenceInputStream.builder()
202             .setCharsetEncoder(charset.newEncoder().onUnmappableCharacter(CodingErrorAction.REPORT))
203             .setCharSequence("\u0080")
204             .get();
205         assertEquals(0, in.available());
206         assertThrows(UnmappableCharacterException.class, in::read);
207     }
208 
209     private void testCharsetMismatchInfiniteLoop(final String csName) throws IOException {
210         // Input is UTF-8 bytes: 0xE0 0xB2 0xA0
211         final char[] inputChars = { (char) 0xE0, (char) 0xB2, (char) 0xA0 };
212         final Charset charset = Charset.forName(csName); // infinite loop for US-ASCII, UTF-8 OK
213         try (InputStream stream = new CharSequenceInputStream(new String(inputChars), charset, 512)) {
214             IOUtils.toCharArray(stream, charset);
215         }
216         try (InputStream stream = CharSequenceInputStream.builder().setCharSequence(new String(inputChars)).setCharset(charset).setBufferSize(512).get()) {
217             IOUtils.toCharArray(stream, charset);
218         }
219     }
220 
221     @ParameterizedTest
222     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
223     void testCharsetMismatchInfiniteLoop_RequiredCharsets(final String csName) throws IOException {
224         testCharsetMismatchInfiniteLoop(csName);
225     }
226 
227     // Test is broken if readFirst > 0
228     // This is because the initial read fills the buffer from the CharSequence
229     // so data1 gets the first buffer full; data2 will get the next buffer full
230     private void testIO_356(final int bufferSize, final int dataSize, final int readFirst, final String csName) throws Exception {
231         final byte[] data1;
232         final byte[] data2;
233         try (CharSequenceInputStream is = new CharSequenceInputStream(ALPHABET, csName, bufferSize)) {
234             for (int i = 0; i < readFirst; i++) {
235                 final int ch = is.read();
236                 assertNotEquals(-1, ch);
237             }
238 
239             is.mark(dataSize);
240 
241             data1 = new byte[dataSize];
242             final int readCount1 = is.read(data1);
243             assertEquals(dataSize, readCount1);
244 
245             is.reset(); // should allow data to be re-read
246 
247             data2 = new byte[dataSize];
248             final int readCount2 = is.read(data2);
249             assertEquals(dataSize, readCount2);
250         }
251 
252         // data buffers should be identical
253         assertArrayEquals(data1, data2, "bufferSize=" + bufferSize + " dataSize=" + dataSize);
254     }
255 
256     @Test
257     void testIO_356_B10_D10_S0_UTF16() throws Exception {
258         testIO_356(10, 10, 0, UTF_16);
259     }
260 
261     @Test
262     void testIO_356_B10_D10_S0_UTF8() throws Exception {
263         testIO_356(10, 10, 0, UTF_8);
264     }
265 
266     @Test
267     void testIO_356_B10_D10_S1_UTF8() throws Exception {
268         testIO_356(10, 10, 1, UTF_8);
269     }
270 
271     @Test
272     void testIO_356_B10_D10_S2_UTF8() throws Exception {
273         testIO_356(10, 10, 2, UTF_8);
274     }
275 
276     @Test
277     void testIO_356_B10_D13_S0_UTF8() throws Exception {
278         testIO_356(10, 13, 0, UTF_8);
279     }
280 
281     @Test
282     void testIO_356_B10_D13_S1_UTF8() throws Exception {
283         testIO_356(10, 13, 1, UTF_8);
284     }
285 
286     @Test
287     void testIO_356_B10_D20_S0_UTF8() throws Exception {
288         testIO_356(10, 20, 0, UTF_8);
289     }
290 
291     private void testIO_356_Loop(final String csName, final int maxBytesPerChar) throws Exception {
292         for (int bufferSize = maxBytesPerChar; bufferSize <= 10; bufferSize++) {
293             for (int dataSize = 1; dataSize <= 20; dataSize++) {
294                 testIO_356(bufferSize, dataSize, 0, csName);
295             }
296         }
297     }
298 
299     @Test
300     void testIO_356_Loop_UTF16() throws Exception {
301         final Charset charset = StandardCharsets.UTF_16;
302         testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder()));
303     }
304 
305     @Test
306     void testIO_356_Loop_UTF8() throws Exception {
307         final Charset charset = StandardCharsets.UTF_8;
308         testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder()));
309     }
310 
311     @ParameterizedTest
312     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
313     void testLargeBufferedRead_RequiredCharsets(final String csName) throws IOException {
314         testBufferedRead(LARGE_TEST_STRING, csName);
315     }
316 
317     @Test
318     void testLargeBufferedRead_UTF8() throws IOException {
319         testBufferedRead(LARGE_TEST_STRING, UTF_8);
320     }
321 
322     @ParameterizedTest
323     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
324     void testLargeSingleByteRead_RequiredCharsets(final String csName) throws IOException {
325         testSingleByteRead(LARGE_TEST_STRING, csName);
326     }
327 
328     @Test
329     void testLargeSingleByteRead_UTF8() throws IOException {
330         testSingleByteRead(LARGE_TEST_STRING, UTF_8);
331     }
332 
333     // This test doesn't work for charsets that don't create a single byte for each char.
334     // Use testMarkResetMultiByteChars() instead for those cases.
335     private void testMarkReset(final String csName) throws Exception {
336         try (InputStream r = new CharSequenceInputStream("test", csName)) {
337             assertEquals(2, r.skip(2));
338             r.mark(0);
339             assertEquals('s', r.read(), csName);
340             assertEquals('t', r.read(), csName);
341             assertEquals(-1, r.read(), csName);
342             r.reset();
343             assertEquals('s', r.read(), csName);
344             assertEquals('t', r.read(), csName);
345             assertEquals(-1, r.read(), csName);
346             r.reset();
347             r.reset();
348         }
349     }
350 
351     @ParameterizedTest
352     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
353     void testMarkReset_RequiredCharsets(final String csName) throws Exception {
354         testMarkResetMultiByteChars(csName);
355     }
356 
357     @Test
358     void testMarkReset_USASCII() throws Exception {
359         testMarkReset(StandardCharsets.US_ASCII.name());
360     }
361 
362     @Test
363     void testMarkReset_UTF8() throws Exception {
364         testMarkReset(UTF_8);
365     }
366 
367     private void testMarkResetMultiByteChars(final String csName) throws IOException {
368         // This test quietly skips Charsets that can't handle multibyte characters like ASCII.
369         final String sequenceEnglish = "Test Sequence";
370         final String sequenceCJK = "\u4e01\u4f23\u5045\u5167\u5289\u53ab"; // Kanji text
371         final String[] sequences = {sequenceEnglish, sequenceCJK};
372         for (final String testSequence : sequences) {
373             final CharsetEncoder charsetEncoder = Charset.forName(csName).newEncoder();
374             final ByteBuffer byteBuffer = ByteBuffer.allocate(testSequence.length() * 3);
375             final CharBuffer charBuffer = CharBuffer.wrap(testSequence);
376             final CoderResult result = charsetEncoder.encode(charBuffer, byteBuffer, true);
377             if (result.isUnmappable()) {
378                 continue; // Skip character sets that can't handle multibyte characters.
379             }
380             final byte[] expectedBytes = byteBuffer.array();
381 
382             final int bLength = byteBuffer.position();
383             final int skip = bLength - 4;
384             try (InputStream r = new CharSequenceInputStream(testSequence, csName)) {
385                 assertEquals(skip, r.skip(skip));
386                 r.mark(0);
387                 assertEquals(expectedBytes[bLength - 4], (byte) r.read(), csName);
388                 assertEquals(expectedBytes[bLength - 3], (byte) r.read(), csName);
389                 assertEquals(expectedBytes[bLength - 2], (byte) r.read(), csName);
390                 assertEquals(expectedBytes[bLength - 1], (byte) r.read(), csName);
391                 assertEquals(-1, (byte) r.read(), csName);
392                 r.reset();
393                 assertEquals(expectedBytes[bLength - 4], (byte) r.read(), csName);
394                 assertEquals(expectedBytes[bLength - 3], (byte) r.read(), csName);
395                 assertEquals(expectedBytes[bLength - 2], (byte) r.read(), csName);
396                 assertEquals(expectedBytes[bLength - 1], (byte) r.read(), csName);
397                 assertEquals(-1, (byte) r.read(), csName);
398                 r.reset();
399                 assertEquals(expectedBytes[bLength - 4], (byte) r.read(), csName);
400                 assertEquals(expectedBytes[bLength - 3], (byte) r.read(), csName);
401                 assertEquals(expectedBytes[bLength - 2], (byte) r.read(), csName);
402                 assertEquals(expectedBytes[bLength - 1], (byte) r.read(), csName);
403                 assertEquals(-1, (byte) r.read(), csName);
404             }
405         }
406     }
407 
408     @Test
409     void testMarkSupported() throws Exception {
410         try (@SuppressWarnings("deprecation")
411         InputStream r = new CharSequenceInputStream("test", UTF_8)) {
412             assertTrue(r.markSupported());
413         }
414         try (InputStream r = CharSequenceInputStream.builder().setCharSequence("test").setCharset(UTF_8).get()) {
415             assertTrue(r.markSupported());
416         }
417     }
418 
419     @Test
420     void testNullCharset() throws IOException {
421         try (CharSequenceInputStream in = new CharSequenceInputStream("A", (Charset) null)) {
422             IOUtils.toByteArray(in);
423             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
424         }
425         try (CharSequenceInputStream in = CharSequenceInputStream.builder().setCharSequence("test").setCharset((Charset) null).get()) {
426             IOUtils.toByteArray(in);
427             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
428         }
429     }
430 
431     @Test
432     void testNullCharsetName() throws IOException {
433         try (CharSequenceInputStream in = new CharSequenceInputStream("A", (String) null)) {
434             IOUtils.toByteArray(in);
435             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
436         }
437         try (CharSequenceInputStream in = CharSequenceInputStream.builder().setCharSequence("test").setCharset((String) null).get()) {
438             IOUtils.toByteArray(in);
439             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
440         }
441     }
442 
443     @Test
444     void testReadAfterClose() throws Exception {
445         final InputStream shadow;
446         try (InputStream in = CharSequenceInputStream.builder().setCharSequence("Hi").get()) {
447             assertTrue(in.available() > 0);
448             shadow = in;
449         }
450         assertEquals(IOUtils.EOF, shadow.read());
451     }
452 
453     private void testReadZero(final String csName) throws Exception {
454         try (InputStream r = new CharSequenceInputStream("test", csName)) {
455             final byte[] bytes = new byte[30];
456             assertEquals(0, r.read(bytes, 0, 0));
457         }
458     }
459 
460     @Test
461     void testReadZero_EmptyString() throws Exception {
462         try (InputStream r = new CharSequenceInputStream("", UTF_8)) {
463             final byte[] bytes = new byte[30];
464             assertEquals(0, r.read(bytes, 0, 0));
465         }
466     }
467 
468     @ParameterizedTest
469     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
470     void testReadZero_RequiredCharsets(final String csName) throws Exception {
471         testReadZero(csName);
472     }
473 
474     private void testResetBeforeEnd(final CharSequenceInputStream inputStream) throws IOException {
475         inputStream.mark(1);
476         assertEquals('1', inputStream.read());
477         inputStream.reset();
478         assertEquals('1', inputStream.read());
479         assertEquals('2', inputStream.read());
480         inputStream.reset();
481         assertEquals('1', inputStream.read());
482         assertEquals('2', inputStream.read());
483         assertEquals('3', inputStream.read());
484         inputStream.reset();
485         assertEquals('1', inputStream.read());
486         assertEquals('2', inputStream.read());
487         assertEquals('3', inputStream.read());
488         assertEquals('4', inputStream.read());
489         inputStream.reset();
490         assertEquals('1', inputStream.read());
491     }
492 
493     @Test
494     void testResetBeforeEndSetCharSequence() throws IOException {
495         try (CharSequenceInputStream inputStream = CharSequenceInputStream.builder().setCharSequence("1234").get()) {
496             testResetBeforeEnd(inputStream);
497         }
498     }
499 
500     @Test
501     void testResetCharset() {
502         assertNotNull(CharSequenceInputStream.builder().setReader(new StringReader("\uD800")).setCharset((Charset) null).getCharset());
503     }
504 
505     @Test
506     void testResetCharsetEncoder() {
507         assertNotNull(CharSequenceInputStream.builder().setReader(new StringReader("\uD800")).setCharsetEncoder(null).getCharsetEncoder());
508     }
509 
510     @Test
511     void testResetCharsetName() {
512         assertNotNull(CharSequenceInputStream.builder().setReader(new StringReader("\uD800")).setCharset((String) null).getCharset());
513     }
514 
515     private void testSingleByteRead(final String testString, final String charsetName) throws IOException {
516         final byte[] bytes = testString.getBytes(charsetName);
517         try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
518             for (final byte b : bytes) {
519                 final int read = in.read();
520                 assertTrue(read >= 0, "read " + read + " >=0 ");
521                 assertTrue(read <= 255, "read " + read + " <= 255");
522                 assertEquals(b, (byte) read, "Should agree with input");
523             }
524             assertEquals(-1, in.read());
525         }
526     }
527 
528     @ParameterizedTest
529     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
530     void testSingleByteRead_RequiredCharsets(final String csName) throws IOException {
531         testSingleByteRead(TEST_STRING, csName);
532     }
533 
534     @Test
535     void testSingleByteRead_UTF16() throws IOException {
536         testSingleByteRead(TEST_STRING, UTF_16);
537     }
538 
539     @Test
540     void testSingleByteRead_UTF8() throws IOException {
541         testSingleByteRead(TEST_STRING, UTF_8);
542     }
543 
544     @ParameterizedTest
545     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
546     void testSkip_RequiredCharsets(final String csName) throws Exception {
547         try (InputStream r = new CharSequenceInputStream("test", csName)) {
548             assertEquals(1, r.skip(1));
549             assertEquals(2, r.skip(2));
550             r.skip(100);
551             assertEquals(-1, r.read(), csName);
552         }
553     }
554 }