/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.commons.io.input;
18  
19  import static org.junit.Assert.assertArrayEquals;
20  import static org.junit.Assert.assertEquals;
21  import static org.junit.Assert.assertTrue;
22  import static org.junit.Assert.assertFalse;
23  import static org.junit.Assert.fail;
24  
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.nio.charset.Charset;
28  import java.util.Random;
29  import java.util.Set;
30  
31  import org.apache.commons.io.Charsets;
32  import org.junit.Ignore;
33  import org.junit.Test;
34  
35  public class CharSequenceInputStreamTest {
36  
37      private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
38      private static final String LARGE_TEST_STRING;
39  
40      private static final String TEST_STRING = "\u00e0 peine arriv\u00e9s nous entr\u00e2mes dans sa chambre";
41  
42      static {
43          final StringBuilder buffer = new StringBuilder();
44          for (int i = 0; i < 100; i++) {
45              buffer.append(TEST_STRING);
46          }
47          LARGE_TEST_STRING = buffer.toString();
48      }
49  
50      private final Random random = new Random();
51  
52      private Set<String> getRequiredCharsetNames() {
53          return Charsets.requiredCharsets().keySet();
54      }
55  
56      private void testBufferedRead(final String testString, final String charsetName) throws IOException {
57          final byte[] expected = testString.getBytes(charsetName);
58          try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
59              final byte[] buffer = new byte[128];
60              int offset = 0;
61              while (true) {
62                  int bufferOffset = random.nextInt(64);
63                  final int bufferLength = random.nextInt(64);
64                  int read = in.read(buffer, bufferOffset, bufferLength);
65                  if (read == -1) {
66                      assertEquals("EOF: offset should equal length for charset " + charsetName, expected.length, offset);
67                      break;
68                  } else {
69                      assertTrue("Read " + read + " <= " + bufferLength, read <= bufferLength);
70                      while (read > 0) {
71                          assertTrue("offset for " + charsetName + " " + offset + " < " + expected.length, offset <
72                                  expected.length);
73                          assertEquals("bytes should agree for " + charsetName, expected[offset], buffer[bufferOffset]);
74                          offset++;
75                          bufferOffset++;
76                          read--;
77                      }
78                  }
79              }
80          }
81      }
82  
83  //    Unfortunately checking canEncode does not seem to work for all charsets:
84  //    testBufferedRead_AvailableCharset(org.apache.commons.io.input.CharSequenceInputStreamTest)  Time elapsed: 0.682 sec  <<< ERROR!
85  //    java.lang.UnsupportedOperationException: null
86  //        at java.nio.CharBuffer.array(CharBuffer.java:940)
87  //        at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
88  //        at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
89  //        at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:111)
90      @Test
91      public void testBufferedRead_AvailableCharset() throws IOException {
92          for (final String csName : Charset.availableCharsets().keySet()) {
93              // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
94              if (isAvailabilityTestableForCharset(csName)) {
95                  testBufferedRead(TEST_STRING, csName);
96              }
97          }
98      }
99  
100     @Test
101     public void testBufferedRead_RequiredCharset() throws IOException {
102         for (final String csName : getRequiredCharsetNames()) {
103             testBufferedRead(TEST_STRING, csName);
104         }
105     }
106 
107     @Test
108     public void testBufferedRead_UTF8() throws IOException {
109         testBufferedRead(TEST_STRING, "UTF-8");
110     }
111 
112     private void testCharsetMismatchInfiniteLoop(final String csName) throws IOException {
113         // Input is UTF-8 bytes: 0xE0 0xB2 0xA0
114         final char[] inputChars = new char[] { (char) 0xE0, (char) 0xB2, (char) 0xA0 };
115         final Charset charset = Charset.forName(csName); // infinite loop for US-ASCII, UTF-8 OK
116         try (InputStream stream = new CharSequenceInputStream(new String(inputChars), charset, 512)) {
117             while (stream.read() != -1) {
118             }
119         }
120     }
121 
122     @Test
123     public void testCharsetMismatchInfiniteLoop_RequiredCharsets() throws IOException {
124         for (final String csName : getRequiredCharsetNames()) {
125             testCharsetMismatchInfiniteLoop(csName);
126         }
127     }
128 
129     // Test is broken if readFirst > 0
130     // This is because the initial read fills the buffer from the CharSequence
131     // so data1 gets the first buffer full; data2 will get the next buffer full
132     private void testIO_356(final int bufferSize, final int dataSize, final int readFirst, final String csName) throws Exception {
133         final CharSequenceInputStream is = new CharSequenceInputStream(ALPHABET, csName, bufferSize);
134 
135         for (int i = 0; i < readFirst; i++) {
136             final int ch = is.read();
137             assertFalse(ch == -1);
138         }
139 
140         is.mark(dataSize);
141 
142         final byte[] data1 = new byte[dataSize];
143         final int readCount1 = is.read(data1);
144         assertEquals(dataSize, readCount1);
145 
146         is.reset(); // should allow data to be re-read
147 
148         final byte[] data2 = new byte[dataSize];
149         final int readCount2 = is.read(data2);
150         assertEquals(dataSize, readCount2);
151 
152         is.close();
153 
154         // data buffers should be identical
155         assertArrayEquals("bufferSize=" + bufferSize + " dataSize=" + dataSize, data1, data2);
156     }
157 
158     @Test
159     public void testIO_356_B10_D10_S0_UTF16() throws Exception {
160         testIO_356(10, 10, 0, "UTF-16");
161     }
162 
163     @Test
164     public void testIO_356_B10_D10_S0_UTF8() throws Exception {
165         testIO_356(10, 10, 0, "UTF-8");
166     }
167 
168     @Test
169     public void testIO_356_B10_D10_S1_UTF8() throws Exception {
170         testIO_356(10, 10, 1, "UTF-8");
171     }
172 
173     @Test
174     public void testIO_356_B10_D10_S2_UTF8() throws Exception {
175         testIO_356(10, 10, 2, "UTF-8");
176     }
177 
178     @Test
179     public void testIO_356_B10_D13_S0_UTF8() throws Exception {
180         testIO_356(10, 13, 0, "UTF-8");
181     }
182 
183     @Test
184     public void testIO_356_B10_D13_S1_UTF8() throws Exception {
185         testIO_356(10, 13, 1, "UTF-8");
186     }
187 
188     @Test
189     public void testIO_356_B10_D20_S0_UTF8() throws Exception {
190         testIO_356(10, 20, 0, "UTF-8");
191     }
192 
193     private void testIO_356_Loop(final String csName, final int maxBytesPerChar) throws Exception {
194         for (int bufferSize = maxBytesPerChar; bufferSize <= 10; bufferSize++) {
195             for (int dataSize = 1; dataSize <= 20; dataSize++) {
196                 testIO_356(bufferSize, dataSize, 0, csName);
197             }
198         }
199     }
200 
201     @Test
202     public void testIO_356_Loop_UTF16() throws Exception {
203         testIO_356_Loop("UTF-16", 4);
204     }
205 
206     @Test
207     public void testIO_356_Loop_UTF8() throws Exception {
208         testIO_356_Loop("UTF-8", 4);
209     }
210 
211     @Test
212     public void testLargeBufferedRead_RequiredCharsets() throws IOException {
213         for (final String csName : getRequiredCharsetNames()) {
214             testBufferedRead(LARGE_TEST_STRING, csName);
215         }
216     }
217 
218     @Test
219     public void testLargeBufferedRead_UTF8() throws IOException {
220         testBufferedRead(LARGE_TEST_STRING, "UTF-8");
221     }
222 
223     @Test
224     public void testLargeSingleByteRead_RequiredCharsets() throws IOException {
225         for (final String csName : getRequiredCharsetNames()) {
226             testSingleByteRead(LARGE_TEST_STRING, csName);
227         }
228     }
229 
230     @Test
231     public void testLargeSingleByteRead_UTF8() throws IOException {
232         testSingleByteRead(LARGE_TEST_STRING, "UTF-8");
233     }
234 
235     // This test is broken for charsets that don't create a single byte for each char
236     private void testMarkReset(final String csName) throws Exception {
237         try (InputStream r = new CharSequenceInputStream("test", csName)) {
238             assertEquals(2, r.skip(2));
239             r.mark(0);
240             assertEquals(csName, 's', r.read());
241             assertEquals(csName, 't', r.read());
242             assertEquals(csName, -1, r.read());
243             r.reset();
244             assertEquals(csName, 's', r.read());
245             assertEquals(csName, 't', r.read());
246             assertEquals(csName, -1, r.read());
247             r.reset();
248             r.reset();
249         }
250     }
251 
252     @Test
253     @Ignore // Test broken for charsets that create multiple bytes for a single char
254     public void testMarkReset_RequiredCharsets() throws Exception {
255         for (final String csName : getRequiredCharsetNames()) {
256             testMarkReset(csName);
257         }
258     }
259 
260     @Test
261     public void testMarkReset_USASCII() throws Exception {
262         testMarkReset("US-ASCII");
263     }
264 
265     @Test
266     public void testMarkReset_UTF8() throws Exception {
267         testMarkReset("UTF-8");
268     }
269 
270     @Test
271     public void testMarkSupported() throws Exception {
272         try (InputStream r = new CharSequenceInputStream("test", "UTF-8")) {
273             assertTrue(r.markSupported());
274         }
275     }
276 
277     private void testReadZero(final String csName) throws Exception {
278         try (InputStream r = new CharSequenceInputStream("test", csName)) {
279             final byte[] bytes = new byte[30];
280             assertEquals(0, r.read(bytes, 0, 0));
281         }
282     }
283 
284     @Test
285     public void testReadZero_EmptyString() throws Exception {
286         try (InputStream r = new CharSequenceInputStream("", "UTF-8")) {
287             final byte[] bytes = new byte[30];
288             assertEquals(0, r.read(bytes, 0, 0));
289         }
290     }
291 
292     @Test
293     public void testReadZero_RequiredCharsets() throws Exception {
294         for (final String csName : getRequiredCharsetNames()) {
295             testReadZero(csName);
296         }
297     }
298 
299     private void testSingleByteRead(final String testString, final String charsetName) throws IOException {
300         final byte[] bytes = testString.getBytes(charsetName);
301         try (InputStream in = new CharSequenceInputStream(testString, charsetName, 512)) {
302             for (final byte b : bytes) {
303                 final int read = in.read();
304                 assertTrue("read " + read + " >=0 ", read >= 0);
305                 assertTrue("read " + read + " <= 255", read <= 255);
306                 assertEquals("Should agree with input", b, (byte) read);
307             }
308             assertEquals(-1, in.read());
309         }
310     }
311 
312     @Test
313     public void testSingleByteRead_RequiredCharsets() throws IOException {
314         for (final String csName : getRequiredCharsetNames()) {
315             testSingleByteRead(TEST_STRING, csName);
316         }
317     }
318 
319     @Test
320     public void testSingleByteRead_UTF16() throws IOException {
321         testSingleByteRead(TEST_STRING, "UTF-16");
322     }
323 
324     @Test
325     public void testSingleByteRead_UTF8() throws IOException {
326         testSingleByteRead(TEST_STRING, "UTF-8");
327     }
328 
329     // This is broken for charsets that don't map each char to a byte
330     private void testSkip(final String csName) throws Exception {
331         try (InputStream r = new CharSequenceInputStream("test", csName)) {
332             assertEquals(1, r.skip(1));
333             assertEquals(2, r.skip(2));
334             assertEquals(csName, 't', r.read());
335             r.skip(100);
336             assertEquals(csName, -1, r.read());
337         }
338     }
339 
340     @Test
341     @Ignore // test is broken for charsets that generate multiple bytes per char.
342     public void testSkip_RequiredCharsets() throws Exception {
343         for (final String csName : getRequiredCharsetNames()) {
344             testSkip(csName);
345         }
346     }
347 
348     @Test
349     public void testSkip_USASCII() throws Exception {
350         testSkip("US-ASCII");
351     }
352 
353     @Test
354     public void testSkip_UTF8() throws Exception {
355         testSkip("UTF-8");
356     }
357 
358     private int checkAvail(InputStream is, int min) throws Exception {
359         int available = is.available();
360         assertTrue("avail should be >= " + min + ", but was " + available, available >= min);
361         return available;
362     }
363 
364     private void testAvailableSkip(final String csName) throws Exception {
365         final String input = "test";
366         try (InputStream r = new CharSequenceInputStream(input, csName)) {
367             int available = checkAvail(r, input.length());
368             assertEquals(available - 1, r.skip(available - 1)); // skip all but one
369             available = checkAvail(r, 1);
370             assertEquals(1, r.skip(1));
371             available = checkAvail(r, 0);
372         }
373     }
374 
375     private void testAvailableRead(final String csName) throws Exception {
376         final String input = "test";
377         try (InputStream r = new CharSequenceInputStream(input, csName)) {
378             int available = checkAvail(r, input.length());
379             byte buff[] = new byte[available];
380             assertEquals(available - 1, r.skip(available - 1)); // skip all but one
381             available = checkAvail(r, 1);
382             buff = new byte[available];
383             assertEquals(available, r.read(buff, 0, available));
384         }
385     }
386 
387     @Test
388     public void testAvailable() throws Exception {
389         for (final String csName : Charset.availableCharsets().keySet()) {
390             // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
391             // also try and avoid the following Effor on Continuum
392 //            java.lang.UnsupportedOperationException: null
393 //            at java.nio.CharBuffer.array(CharBuffer.java:940)
394 //            at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
395 //            at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
396 //            at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:120)
397 //            at org.apache.commons.io.input.CharSequenceInputStream.read(CharSequenceInputStream.java:151)
398 //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailableRead(CharSequenceInputStreamTest.java:412)
399 //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailable(CharSequenceInputStreamTest.java:424)
400 
401             try {
402                 if (isAvailabilityTestableForCharset(csName)) {
403                     testAvailableSkip(csName);
404                     testAvailableRead(csName);
405                 }
406             } catch (UnsupportedOperationException e){
407                 fail("Operation not supported for " + csName);
408             }
409         }
410     }
411 
412     private boolean isAvailabilityTestableForCharset(final String csName) {
413         return Charset.forName(csName).canEncode()
414                 && !"COMPOUND_TEXT".equalsIgnoreCase(csName) && !"x-COMPOUND_TEXT".equalsIgnoreCase(csName)
415                 && !isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(csName);
416     }
417 
418     private boolean isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(String csName) {
419         return "x-IBM1388".equalsIgnoreCase(csName) ||
420                 "ISO-2022-CN".equalsIgnoreCase(csName) ||
421                 "ISO-2022-JP".equalsIgnoreCase(csName) ||
422                 "Shift_JIS".equalsIgnoreCase(csName);
423     }
424 }