View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.junit.jupiter.api.Assertions.assertArrayEquals;
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  import static org.junit.jupiter.api.Assertions.assertNotEquals;
22  import static org.junit.jupiter.api.Assertions.assertNotNull;
23  import static org.junit.jupiter.api.Assertions.assertThrows;
24  import static org.junit.jupiter.api.Assertions.assertTrue;
25  import static org.junit.jupiter.api.Assertions.fail;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.StringReader;
30  import java.nio.ByteBuffer;
31  import java.nio.CharBuffer;
32  import java.nio.charset.Charset;
33  import java.nio.charset.CharsetEncoder;
34  import java.nio.charset.CoderResult;
35  import java.nio.charset.CodingErrorAction;
36  import java.nio.charset.StandardCharsets;
37  import java.nio.charset.UnmappableCharacterException;
38  import java.util.Random;
39  
40  import org.apache.commons.io.CharsetsTest;
41  import org.apache.commons.io.IOUtils;
42  import org.junit.jupiter.api.Test;
43  import org.junit.jupiter.params.ParameterizedTest;
44  import org.junit.jupiter.params.provider.MethodSource;
45  
46  public class CharSequenceInputStreamTest {
47  
48      private static final String UTF_16 = StandardCharsets.UTF_16.name();
49      private static final String UTF_8 = StandardCharsets.UTF_8.name();
50      private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
51      private static final String LARGE_TEST_STRING;
52  
53      private static final String TEST_STRING = "\u00e0 peine arriv\u00e9s nous entr\u00e2mes dans sa chambre";
54  
55      static {
56          final StringBuilder buffer = new StringBuilder();
57          for (int i = 0; i < 100; i++) {
58              buffer.append(TEST_STRING);
59          }
60          LARGE_TEST_STRING = buffer.toString();
61      }
62  
63      private final Random random = new Random();
64  
65      private int checkAvail(final InputStream is, final int min) throws Exception {
66          final int available = is.available();
67          assertTrue(available >= min, "avail should be >= " + min + ", but was " + available);
68          return available;
69      }
70  
71      private boolean isAvailabilityTestableForCharset(final String csName) {
72          return Charset.forName(csName).canEncode()
73                  && !"COMPOUND_TEXT".equalsIgnoreCase(csName) && !"x-COMPOUND_TEXT".equalsIgnoreCase(csName)
74                  && !isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(csName);
75      }
76  
77      private boolean isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(final String csName) {
78          return "x-IBM1388".equalsIgnoreCase(csName) ||
79                  "ISO-2022-CN".equalsIgnoreCase(csName) ||
80                  "ISO-2022-JP".equalsIgnoreCase(csName) ||
81                  "Shift_JIS".equalsIgnoreCase(csName);
82      }
83  
84      /**
85       * IO-781 available() returns 2 but only 1 byte is read afterwards.
86       */
87      @Test
88      public void testAvailable() throws IOException {
89          final Charset charset = Charset.forName("Big5");
90          final CharSequenceInputStream in = new CharSequenceInputStream("\uD800\uDC00", charset);
91          final int available = in.available();
92          final byte[] data = new byte[available];
93          final int bytesRead = in.read(data);
94          assertEquals(available, bytesRead);
95      }
96  
97      @ParameterizedTest(name = "{0}")
98      @MethodSource(CharsetsTest.AVAIL_CHARSETS)
99      public void testAvailable(final String csName) throws Exception {
100         // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
101         // also try and avoid the following exception
102 //            java.lang.UnsupportedOperationException: null
103 //            at java.nio.CharBuffer.array(CharBuffer.java:940)
104 //            at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
105 //            at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
106 //            at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:120)
107 //            at org.apache.commons.io.input.CharSequenceInputStream.read(CharSequenceInputStream.java:151)
108 //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailableRead(CharSequenceInputStreamTest.java:412)
109 //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailable(CharSequenceInputStreamTest.java:424)
110 
111         try {
112             if (isAvailabilityTestableForCharset(csName)) {
113                 testAvailableSkip(csName);
114                 testAvailableRead(csName);
115             }
116         } catch (final UnsupportedOperationException e) {
117             fail("Operation not supported for " + csName);
118         }
119     }
120 
121     private void testAvailableRead(final String csName) throws Exception {
122         final String input = "test";
123         try (InputStream r = new CharSequenceInputStream(input, csName)) {
124             int available = checkAvail(r, input.length());
125             assertEquals(available - 1, r.skip(available - 1)); // skip all but one
126             available = checkAvail(r, 1);
127             final byte[] buff = new byte[available];
128             assertEquals(available, r.read(buff, 0, available));
129         }
130     }
131 
132     private void testAvailableSkip(final String csName) throws Exception {
133         final String input = "test";
134         try (InputStream r = new CharSequenceInputStream(input, csName)) {
135             int available = checkAvail(r, input.length());
136             assertEquals(available - 1, r.skip(available - 1)); // skip all but one
137             available = checkAvail(r, 1);
138             assertEquals(1, r.skip(1));
139             available = checkAvail(r, 0);
140         }
141     }
142 
143     private void testBufferedRead(final String testString, final String charsetName) throws IOException {
144         final byte[] expected = testString.getBytes(charsetName);
145         try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
146             final byte[] buffer = new byte[128];
147             int offset = 0;            while (true) {
148                 int bufferOffset = random.nextInt(64);
149                 final int bufferLength = random.nextInt(64);
150                 int read = in.read(buffer, bufferOffset, bufferLength);
151                 if (read == -1) {
152                     assertEquals(expected.length, offset, "EOF: offset should equal length for charset " + charsetName);
153                     break;
154                 }
155                 assertTrue(read <= bufferLength, "Read " + read + " <= " + bufferLength);
156                 while (read > 0) {
157                     assertTrue(offset < expected.length,
158                             "offset for " + charsetName + " " + offset + " < " + expected.length);
159                     assertEquals(expected[offset], buffer[bufferOffset], "bytes should agree for " + charsetName);
160                     offset++;
161                     bufferOffset++;
162                     read--;
163                 }
164             }
165         }
166     }
167 
168     //    Unfortunately checking canEncode does not seem to work for all charsets:
169 //    testBufferedRead_AvailableCharset(org.apache.commons.io.input.CharSequenceInputStreamTest)  Time elapsed: 0.682 sec  <<< ERROR!
170 //    java.lang.UnsupportedOperationException: null
171 //        at java.nio.CharBuffer.array(CharBuffer.java:940)
172 //        at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
173 //        at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
174 //        at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:111)
175     @ParameterizedTest(name = "{0}")
176     @MethodSource(CharsetsTest.AVAIL_CHARSETS)
177     public void testBufferedRead_AvailableCharset(final String csName) throws IOException {
178         // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
179         if (isAvailabilityTestableForCharset(csName)) {
180             testBufferedRead(TEST_STRING, csName);
181         }
182     }
183 
184     @ParameterizedTest
185     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
186     public void testBufferedRead_RequiredCharset(final String csName) throws IOException {
187         testBufferedRead(TEST_STRING, csName);
188     }
189 
190     @Test
191     public void testBufferedRead_UTF8() throws IOException {
192         testBufferedRead(TEST_STRING, UTF_8);
193     }
194 
195     @Test
196     public void testCharacterCodingException() throws IOException {
197         final Charset charset = StandardCharsets.US_ASCII;
198         final CharSequenceInputStream in = CharSequenceInputStream.builder()
199             .setCharsetEncoder(charset.newEncoder().onUnmappableCharacter(CodingErrorAction.REPORT))
200             .setCharSequence("\u0080")
201             .get();
202         assertEquals(0, in.available());
203         assertThrows(UnmappableCharacterException.class, in::read);
204     }
205 
206     private void testCharsetMismatchInfiniteLoop(final String csName) throws IOException {
207         // Input is UTF-8 bytes: 0xE0 0xB2 0xA0
208         final char[] inputChars = { (char) 0xE0, (char) 0xB2, (char) 0xA0 };
209         final Charset charset = Charset.forName(csName); // infinite loop for US-ASCII, UTF-8 OK
210         try (InputStream stream = new CharSequenceInputStream(new String(inputChars), charset, 512)) {
211             IOUtils.toCharArray(stream, charset);
212         }
213         try (InputStream stream = CharSequenceInputStream.builder().setCharSequence(new String(inputChars)).setCharset(charset).setBufferSize(512).get()) {
214             IOUtils.toCharArray(stream, charset);
215         }
216     }
217 
218     @ParameterizedTest
219     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
220     public void testCharsetMismatchInfiniteLoop_RequiredCharsets(final String csName) throws IOException {
221         testCharsetMismatchInfiniteLoop(csName);
222     }
223 
224     // Test is broken if readFirst > 0
225     // This is because the initial read fills the buffer from the CharSequence
226     // so data1 gets the first buffer full; data2 will get the next buffer full
227     private void testIO_356(final int bufferSize, final int dataSize, final int readFirst, final String csName) throws Exception {
228         final byte[] data1;
229         final byte[] data2;
230         try (CharSequenceInputStream is = new CharSequenceInputStream(ALPHABET, csName, bufferSize)) {
231             for (int i = 0; i < readFirst; i++) {
232                 final int ch = is.read();
233                 assertNotEquals(-1, ch);
234             }
235 
236             is.mark(dataSize);
237 
238             data1 = new byte[dataSize];
239             final int readCount1 = is.read(data1);
240             assertEquals(dataSize, readCount1);
241 
242             is.reset(); // should allow data to be re-read
243 
244             data2 = new byte[dataSize];
245             final int readCount2 = is.read(data2);
246             assertEquals(dataSize, readCount2);
247         }
248 
249         // data buffers should be identical
250         assertArrayEquals(data1, data2, "bufferSize=" + bufferSize + " dataSize=" + dataSize);
251     }
252 
253     @Test
254     public void testIO_356_B10_D10_S0_UTF16() throws Exception {
255         testIO_356(10, 10, 0, UTF_16);
256     }
257 
258     @Test
259     public void testIO_356_B10_D10_S0_UTF8() throws Exception {
260         testIO_356(10, 10, 0, UTF_8);
261     }
262 
263     @Test
264     public void testIO_356_B10_D10_S1_UTF8() throws Exception {
265         testIO_356(10, 10, 1, UTF_8);
266     }
267 
268     @Test
269     public void testIO_356_B10_D10_S2_UTF8() throws Exception {
270         testIO_356(10, 10, 2, UTF_8);
271     }
272 
273     @Test
274     public void testIO_356_B10_D13_S0_UTF8() throws Exception {
275         testIO_356(10, 13, 0, UTF_8);
276     }
277 
278     @Test
279     public void testIO_356_B10_D13_S1_UTF8() throws Exception {
280         testIO_356(10, 13, 1, UTF_8);
281     }
282 
283     @Test
284     public void testIO_356_B10_D20_S0_UTF8() throws Exception {
285         testIO_356(10, 20, 0, UTF_8);
286     }
287 
288     private void testIO_356_Loop(final String csName, final int maxBytesPerChar) throws Exception {
289         for (int bufferSize = maxBytesPerChar; bufferSize <= 10; bufferSize++) {
290             for (int dataSize = 1; dataSize <= 20; dataSize++) {
291                 testIO_356(bufferSize, dataSize, 0, csName);
292             }
293         }
294     }
295 
296     @Test
297     public void testIO_356_Loop_UTF16() throws Exception {
298         final Charset charset = StandardCharsets.UTF_16;
299         testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder()));
300     }
301 
302     @Test
303     public void testIO_356_Loop_UTF8() throws Exception {
304         final Charset charset = StandardCharsets.UTF_8;
305         testIO_356_Loop(charset.displayName(), (int) ReaderInputStream.minBufferSize(charset.newEncoder()));
306     }
307 
308     @ParameterizedTest
309     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
310     public void testLargeBufferedRead_RequiredCharsets(final String csName) throws IOException {
311         testBufferedRead(LARGE_TEST_STRING, csName);
312     }
313 
314     @Test
315     public void testLargeBufferedRead_UTF8() throws IOException {
316         testBufferedRead(LARGE_TEST_STRING, UTF_8);
317     }
318 
319     @ParameterizedTest
320     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
321     public void testLargeSingleByteRead_RequiredCharsets(final String csName) throws IOException {
322         testSingleByteRead(LARGE_TEST_STRING, csName);
323     }
324 
325     @Test
326     public void testLargeSingleByteRead_UTF8() throws IOException {
327         testSingleByteRead(LARGE_TEST_STRING, UTF_8);
328     }
329 
330     // This test doesn't work for charsets that don't create a single byte for each char.
331     // Use testMarkResetMultiByteChars() instead for those cases.
332     private void testMarkReset(final String csName) throws Exception {
333         try (InputStream r = new CharSequenceInputStream("test", csName)) {
334             assertEquals(2, r.skip(2));
335             r.mark(0);
336             assertEquals('s', r.read(), csName);
337             assertEquals('t', r.read(), csName);
338             assertEquals(-1, r.read(), csName);
339             r.reset();
340             assertEquals('s', r.read(), csName);
341             assertEquals('t', r.read(), csName);
342             assertEquals(-1, r.read(), csName);
343             r.reset();
344             r.reset();
345         }
346     }
347 
348     @ParameterizedTest
349     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
350     public void testMarkReset_RequiredCharsets(final String csName) throws Exception {
351         testMarkResetMultiByteChars(csName);
352     }
353 
354     @Test
355     public void testMarkReset_USASCII() throws Exception {
356         testMarkReset(StandardCharsets.US_ASCII.name());
357     }
358 
359     @Test
360     public void testMarkReset_UTF8() throws Exception {
361         testMarkReset(UTF_8);
362     }
363 
364     private void testMarkResetMultiByteChars(final String csName) throws IOException {
365         // This test quietly skips Charsets that can't handle multibyte characters like ASCII.
366         final String sequenceEnglish = "Test Sequence";
367         final String sequenceCJK = "\u4e01\u4f23\u5045\u5167\u5289\u53ab"; // Kanji text
368         final String[] sequences = {sequenceEnglish, sequenceCJK};
369         for (final String testSequence : sequences) {
370             final CharsetEncoder charsetEncoder = Charset.forName(csName).newEncoder();
371             final ByteBuffer byteBuffer = ByteBuffer.allocate(testSequence.length() * 3);
372             final CharBuffer charBuffer = CharBuffer.wrap(testSequence);
373             final CoderResult result = charsetEncoder.encode(charBuffer, byteBuffer, true);
374             if (result.isUnmappable()) {
375                 continue; // Skip character sets that can't handle multibyte characters.
376             }
377             final byte[] expectedBytes = byteBuffer.array();
378 
379             final int bLength = byteBuffer.position();
380             final int skip = bLength - 4;
381             try (InputStream r = new CharSequenceInputStream(testSequence, csName)) {
382                 assertEquals(skip, r.skip(skip));
383                 r.mark(0);
384                 assertEquals(expectedBytes[bLength - 4], (byte) r.read(), csName);
385                 assertEquals(expectedBytes[bLength - 3], (byte) r.read(), csName);
386                 assertEquals(expectedBytes[bLength - 2], (byte) r.read(), csName);
387                 assertEquals(expectedBytes[bLength - 1], (byte) r.read(), csName);
388                 assertEquals(-1, (byte) r.read(), csName);
389                 r.reset();
390                 assertEquals(expectedBytes[bLength - 4], (byte) r.read(), csName);
391                 assertEquals(expectedBytes[bLength - 3], (byte) r.read(), csName);
392                 assertEquals(expectedBytes[bLength - 2], (byte) r.read(), csName);
393                 assertEquals(expectedBytes[bLength - 1], (byte) r.read(), csName);
394                 assertEquals(-1, (byte) r.read(), csName);
395                 r.reset();
396                 assertEquals(expectedBytes[bLength - 4], (byte) r.read(), csName);
397                 assertEquals(expectedBytes[bLength - 3], (byte) r.read(), csName);
398                 assertEquals(expectedBytes[bLength - 2], (byte) r.read(), csName);
399                 assertEquals(expectedBytes[bLength - 1], (byte) r.read(), csName);
400                 assertEquals(-1, (byte) r.read(), csName);
401             }
402         }
403     }
404 
405     @Test
406     public void testMarkSupported() throws Exception {
407         try (@SuppressWarnings("deprecation")
408         InputStream r = new CharSequenceInputStream("test", UTF_8)) {
409             assertTrue(r.markSupported());
410         }
411         try (InputStream r = CharSequenceInputStream.builder().setCharSequence("test").setCharset(UTF_8).get()) {
412             assertTrue(r.markSupported());
413         }
414     }
415 
416     @Test
417     public void testNullCharset() throws IOException {
418         try (CharSequenceInputStream in = new CharSequenceInputStream("A", (Charset) null)) {
419             IOUtils.toByteArray(in);
420             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
421         }
422         try (CharSequenceInputStream in = CharSequenceInputStream.builder().setCharSequence("test").setCharset((Charset) null).get()) {
423             IOUtils.toByteArray(in);
424             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
425         }
426     }
427 
428     @Test
429     public void testNullCharsetName() throws IOException {
430         try (CharSequenceInputStream in = new CharSequenceInputStream("A", (String) null)) {
431             IOUtils.toByteArray(in);
432             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
433         }
434         try (CharSequenceInputStream in = CharSequenceInputStream.builder().setCharSequence("test").setCharset((String) null).get()) {
435             IOUtils.toByteArray(in);
436             assertEquals(Charset.defaultCharset(), in.getCharsetEncoder().charset());
437         }
438     }
439 
440     private void testReadZero(final String csName) throws Exception {
441         try (InputStream r = new CharSequenceInputStream("test", csName)) {
442             final byte[] bytes = new byte[30];
443             assertEquals(0, r.read(bytes, 0, 0));
444         }
445     }
446 
447     @Test
448     public void testReadZero_EmptyString() throws Exception {
449         try (InputStream r = new CharSequenceInputStream("", UTF_8)) {
450             final byte[] bytes = new byte[30];
451             assertEquals(0, r.read(bytes, 0, 0));
452         }
453     }
454 
455     @ParameterizedTest
456     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
457     public void testReadZero_RequiredCharsets(final String csName) throws Exception {
458         testReadZero(csName);
459     }
460 
461     private void testResetBeforeEnd(final CharSequenceInputStream inputStream) throws IOException {
462         inputStream.mark(1);
463         assertEquals('1', inputStream.read());
464         inputStream.reset();
465         assertEquals('1', inputStream.read());
466         assertEquals('2', inputStream.read());
467         inputStream.reset();
468         assertEquals('1', inputStream.read());
469         assertEquals('2', inputStream.read());
470         assertEquals('3', inputStream.read());
471         inputStream.reset();
472         assertEquals('1', inputStream.read());
473         assertEquals('2', inputStream.read());
474         assertEquals('3', inputStream.read());
475         assertEquals('4', inputStream.read());
476         inputStream.reset();
477         assertEquals('1', inputStream.read());
478     }
479 
480     @Test
481     public void testResetBeforeEndSetCharSequence() throws IOException {
482         try (final CharSequenceInputStream inputStream = CharSequenceInputStream.builder().setCharSequence("1234").get()) {
483             testResetBeforeEnd(inputStream);
484         }
485     }
486 
487     @Test
488     public void testResetCharset() {
489         assertNotNull(CharSequenceInputStream.builder().setReader(new StringReader("\uD800")).setCharset((Charset) null).getCharset());
490     }
491 
492     @Test
493     public void testResetCharsetEncoder() {
494         assertNotNull(CharSequenceInputStream.builder().setReader(new StringReader("\uD800")).setCharsetEncoder(null).getCharsetEncoder());
495     }
496 
497     @Test
498     public void testResetCharsetName() {
499         assertNotNull(CharSequenceInputStream.builder().setReader(new StringReader("\uD800")).setCharset((String) null).getCharset());
500     }
501 
502     private void testSingleByteRead(final String testString, final String charsetName) throws IOException {
503         final byte[] bytes = testString.getBytes(charsetName);
504         try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
505             for (final byte b : bytes) {
506                 final int read = in.read();
507                 assertTrue(read >= 0, "read " + read + " >=0 ");
508                 assertTrue(read <= 255, "read " + read + " <= 255");
509                 assertEquals(b, (byte) read, "Should agree with input");
510             }
511             assertEquals(-1, in.read());
512         }
513     }
514 
515     @ParameterizedTest
516     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
517     public void testSingleByteRead_RequiredCharsets(final String csName) throws IOException {
518         testSingleByteRead(TEST_STRING, csName);
519     }
520 
521     @Test
522     public void testSingleByteRead_UTF16() throws IOException {
523         testSingleByteRead(TEST_STRING, UTF_16);
524     }
525 
526     @Test
527     public void testSingleByteRead_UTF8() throws IOException {
528         testSingleByteRead(TEST_STRING, UTF_8);
529     }
530 
531     @ParameterizedTest
532     @MethodSource(CharsetsTest.REQUIRED_CHARSETS)
533     public void testSkip_RequiredCharsets(final String csName) throws Exception {
534         try (InputStream r = new CharSequenceInputStream("test", csName)) {
535             assertEquals(1, r.skip(1));
536             assertEquals(2, r.skip(2));
537             r.skip(100);
538             assertEquals(-1, r.read(), csName);
539         }
540     }
541 }