View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import static org.junit.Assert.assertArrayEquals;
20  import static org.junit.Assert.assertEquals;
21  import static org.junit.Assert.assertTrue;
22  import static org.junit.Assert.assertFalse;
23  import static org.junit.Assert.fail;
24  
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.nio.charset.Charset;
28  import java.util.Random;
29  import java.util.Set;
30  
31  import org.apache.commons.io.Charsets;
32  import org.junit.Ignore;
33  import org.junit.Test;
34  
35  public class CharSequenceInputStreamTest {
36  
37      private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
38      private static final String LARGE_TEST_STRING;
39  
40      private static final String TEST_STRING = "\u00e0 peine arriv\u00e9s nous entr\u00e2mes dans sa chambre";
41  
42      static {
43          final StringBuilder buffer = new StringBuilder();
44          for (int i = 0; i < 100; i++) {
45              buffer.append(TEST_STRING);
46          }
47          LARGE_TEST_STRING = buffer.toString();
48      }
49  
50      private final Random random = new Random();
51  
52      private Set<String> getRequiredCharsetNames() {
53          return Charsets.requiredCharsets().keySet();
54      }
55  
56      private void testBufferedRead(final String testString, final String charsetName) throws IOException {
57          final byte[] expected = testString.getBytes(charsetName);
58          final InputStream in = new CharSequenceInputStream(testString, charsetName, 512);
59          try {
60              final byte[] buffer = new byte[128];
61              int offset = 0;
62              while (true) {
63                  int bufferOffset = random.nextInt(64);
64                  final int bufferLength = random.nextInt(64);
65                  int read = in.read(buffer, bufferOffset, bufferLength);
66                  if (read == -1) {
67                      assertEquals("EOF: offset should equal length for charset " + charsetName, expected.length, offset);
68                      break;
69                  } else {
70                      assertTrue("Read " + read + " <= " + bufferLength, read <= bufferLength);
71                      while (read > 0) {
72                          assertTrue("offset for " + charsetName +" " + offset + " < " + expected.length, offset < expected.length);
73                          assertEquals("bytes should agree for " + charsetName, expected[offset], buffer[bufferOffset]);
74                          offset++;
75                          bufferOffset++;
76                          read--;
77                      }
78                  }
79              }
80          } finally {
81              in.close();
82          }
83      }
84  
85  //    Unfortunately checking canEncode does not seem to work for all charsets:
86  //    testBufferedRead_AvailableCharset(org.apache.commons.io.input.CharSequenceInputStreamTest)  Time elapsed: 0.682 sec  <<< ERROR!
87  //    java.lang.UnsupportedOperationException: null
88  //        at java.nio.CharBuffer.array(CharBuffer.java:940)
89  //        at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
90  //        at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
91  //        at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:111)
92      @Test
93      public void testBufferedRead_AvailableCharset() throws IOException {
94          for (final String csName : Charset.availableCharsets().keySet()) {
95              // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
96              if (isAvailabilityTestableForCharset(csName)) {
97                  testBufferedRead(TEST_STRING, csName);
98              }
99          }
100     }
101 
102     @Test
103     public void testBufferedRead_RequiredCharset() throws IOException {
104         for (final String csName : getRequiredCharsetNames()) {
105             testBufferedRead(TEST_STRING, csName);
106         }
107     }
108 
109     @Test
110     public void testBufferedRead_UTF8() throws IOException {
111         testBufferedRead(TEST_STRING, "UTF-8");
112     }
113 
114     private void testCharsetMismatchInfiniteLoop(final String csName) throws IOException {
115         // Input is UTF-8 bytes: 0xE0 0xB2 0xA0
116         final char[] inputChars = new char[] { (char) 0xE0, (char) 0xB2, (char) 0xA0 };
117         final Charset charset = Charset.forName(csName); // infinite loop for US-ASCII, UTF-8 OK
118         final InputStream stream = new CharSequenceInputStream(new String(inputChars), charset, 512);
119         try {
120             while (stream.read() != -1) {
121             }
122         } finally {
123             stream.close();
124         }
125     }
126 
127     @Test
128     public void testCharsetMismatchInfiniteLoop_RequiredCharsets() throws IOException {
129         for (final String csName : getRequiredCharsetNames()) {
130             testCharsetMismatchInfiniteLoop(csName);
131         }
132     }
133 
134     // Test is broken if readFirst > 0
135     // This is because the initial read fills the buffer from the CharSequence
136     // so data1 gets the first buffer full; data2 will get the next buffer full
137     private void testIO_356(final int bufferSize, final int dataSize, final int readFirst, final String csName) throws Exception {
138         final CharSequenceInputStream is = new CharSequenceInputStream(ALPHABET, csName, bufferSize);
139 
140         for (int i = 0; i < readFirst; i++) {
141             final int ch = is.read();
142             assertFalse(ch == -1);
143         }
144 
145         is.mark(dataSize);
146 
147         final byte[] data1 = new byte[dataSize];
148         final int readCount1 = is.read(data1);
149         assertEquals(dataSize, readCount1);
150 
151         is.reset(); // should allow data to be re-read
152 
153         final byte[] data2 = new byte[dataSize];
154         final int readCount2 = is.read(data2);
155         assertEquals(dataSize, readCount2);
156 
157         is.close();
158 
159         // data buffers should be identical
160         assertArrayEquals("bufferSize=" + bufferSize + " dataSize=" + dataSize, data1, data2);
161     }
162 
163     @Test
164     public void testIO_356_B10_D10_S0_UTF16() throws Exception {
165         testIO_356(10, 10, 0, "UTF-16");
166     }
167 
168     @Test
169     public void testIO_356_B10_D10_S0_UTF8() throws Exception {
170         testIO_356(10, 10, 0, "UTF-8");
171     }
172 
173     @Test
174     public void testIO_356_B10_D10_S1_UTF8() throws Exception {
175         testIO_356(10, 10, 1, "UTF-8");
176     }
177 
178     @Test
179     public void testIO_356_B10_D10_S2_UTF8() throws Exception {
180         testIO_356(10, 10, 2, "UTF-8");
181     }
182 
183     @Test
184     public void testIO_356_B10_D13_S0_UTF8() throws Exception {
185         testIO_356(10, 13, 0, "UTF-8");
186     }
187 
188     @Test
189     public void testIO_356_B10_D13_S1_UTF8() throws Exception {
190         testIO_356(10, 13, 1, "UTF-8");
191     }
192 
193     @Test
194     public void testIO_356_B10_D20_S0_UTF8() throws Exception {
195         testIO_356(10, 20, 0, "UTF-8");
196     }
197 
198     private void testIO_356_Loop(final String csName, final int maxBytesPerChar) throws Exception {
199         for (int bufferSize = maxBytesPerChar; bufferSize <= 10; bufferSize++) {
200             for (int dataSize = 1; dataSize <= 20; dataSize++) {
201                 testIO_356(bufferSize, dataSize, 0, csName);
202             }
203         }
204     }
205 
206     @Test
207     public void testIO_356_Loop_UTF16() throws Exception {
208         testIO_356_Loop("UTF-16", 4);
209     }
210 
211     @Test
212     public void testIO_356_Loop_UTF8() throws Exception {
213         testIO_356_Loop("UTF-8", 4);
214     }
215 
216     @Test
217     public void testLargeBufferedRead_RequiredCharsets() throws IOException {
218         for (final String csName : getRequiredCharsetNames()) {
219             testBufferedRead(LARGE_TEST_STRING, csName);
220         }
221     }
222 
223     @Test
224     public void testLargeBufferedRead_UTF8() throws IOException {
225         testBufferedRead(LARGE_TEST_STRING, "UTF-8");
226     }
227 
228     @Test
229     public void testLargeSingleByteRead_RequiredCharsets() throws IOException {
230         for (final String csName : getRequiredCharsetNames()) {
231             testSingleByteRead(LARGE_TEST_STRING, csName);
232         }
233     }
234 
235     @Test
236     public void testLargeSingleByteRead_UTF8() throws IOException {
237         testSingleByteRead(LARGE_TEST_STRING, "UTF-8");
238     }
239 
240     // This test is broken for charsets that don't create a single byte for each char
241     private void testMarkReset(final String csName) throws Exception {
242         final InputStream r = new CharSequenceInputStream("test", csName);
243         try {
244             assertEquals(2, r.skip(2));
245             r.mark(0);
246             assertEquals(csName, 's', r.read());
247             assertEquals(csName, 't', r.read());
248             assertEquals(csName, -1, r.read());
249             r.reset();
250             assertEquals(csName, 's', r.read());
251             assertEquals(csName, 't', r.read());
252             assertEquals(csName, -1, r.read());
253             r.reset();
254             r.reset();
255         } finally {
256             r.close();
257         }
258     }
259 
260     @Test
261     @Ignore // Test broken for charsets that create multiple bytes for a single char
262     public void testMarkReset_RequiredCharsets() throws Exception {
263         for (final String csName : getRequiredCharsetNames()) {
264             testMarkReset(csName);
265         }
266     }
267 
268     @Test
269     public void testMarkReset_USASCII() throws Exception {
270         testMarkReset("US-ASCII");
271     }
272 
273     @Test
274     public void testMarkReset_UTF8() throws Exception {
275         testMarkReset("UTF-8");
276     }
277 
278     @Test
279     public void testMarkSupported() throws Exception {
280         final InputStream r = new CharSequenceInputStream("test", "UTF-8");
281         try {
282             assertTrue(r.markSupported());
283         } finally {
284             r.close();
285         }
286     }
287 
288     private void testReadZero(final String csName) throws Exception {
289         final InputStream r = new CharSequenceInputStream("test", csName);
290         try {
291             final byte[] bytes = new byte[30];
292             assertEquals(0, r.read(bytes, 0, 0));
293         } finally {
294             r.close();
295         }
296     }
297 
298     @Test
299     public void testReadZero_EmptyString() throws Exception {
300         final InputStream r = new CharSequenceInputStream("", "UTF-8");
301         try {
302             final byte[] bytes = new byte[30];
303             assertEquals(0, r.read(bytes, 0, 0));
304         } finally {
305             r.close();
306         }
307     }
308 
309     @Test
310     public void testReadZero_RequiredCharsets() throws Exception {
311         for (final String csName : getRequiredCharsetNames()) {
312             testReadZero(csName);
313         }
314     }
315 
316     private void testSingleByteRead(final String testString, final String charsetName) throws IOException {
317         final byte[] bytes = testString.getBytes(charsetName);
318         final InputStream in = new CharSequenceInputStream(testString, charsetName, 512);
319         try {
320             for (final byte b : bytes) {
321                 final int read = in.read();
322                 assertTrue("read " + read + " >=0 ", read >= 0);
323                 assertTrue("read " + read + " <= 255", read <= 255);
324                 assertEquals("Should agree with input", b, (byte) read);
325             }
326             assertEquals(-1, in.read());
327         } finally {
328             in.close();
329         }
330     }
331 
332     @Test
333     public void testSingleByteRead_RequiredCharsets() throws IOException {
334         for (final String csName : getRequiredCharsetNames()) {
335             testSingleByteRead(TEST_STRING, csName);
336         }
337     }
338 
339     @Test
340     public void testSingleByteRead_UTF16() throws IOException {
341         testSingleByteRead(TEST_STRING, "UTF-16");
342     }
343 
344     @Test
345     public void testSingleByteRead_UTF8() throws IOException {
346         testSingleByteRead(TEST_STRING, "UTF-8");
347     }
348 
349     // This is broken for charsets that don't map each char to a byte
350     private void testSkip(final String csName) throws Exception {
351         final InputStream r = new CharSequenceInputStream("test", csName);
352         try {
353             assertEquals(1, r.skip(1));
354             assertEquals(2, r.skip(2));
355             assertEquals(csName, 't', r.read());
356             r.skip(100);
357             assertEquals(csName, -1, r.read());
358         } finally {
359             r.close();
360         }
361     }
362 
363     @Test
364     @Ignore // test is broken for charsets that generate multiple bytes per char.
365     public void testSkip_RequiredCharsets() throws Exception {
366         for (final String csName : getRequiredCharsetNames()) {
367             testSkip(csName);
368         }
369     }
370 
371     @Test
372     public void testSkip_USASCII() throws Exception {
373         testSkip("US-ASCII");
374     }
375 
376     @Test
377     public void testSkip_UTF8() throws Exception {
378         testSkip("UTF-8");
379     }
380 
381     private int checkAvail(InputStream is, int min) throws Exception {
382         int available = is.available();
383         assertTrue("avail should be >= " + min + ", but was " + available, available >= min);
384         return available;
385     }
386 
387     private void testAvailableSkip(final String csName) throws Exception {
388         final String input = "test";
389         final InputStream r = new CharSequenceInputStream(input, csName);
390         try {
391             int available = checkAvail(r, input.length());
392             assertEquals(available - 1, r.skip(available-1)); // skip all but one
393             available = checkAvail(r, 1);
394             assertEquals(1, r.skip(1));
395             available = checkAvail(r, 0);
396         } finally {
397             r.close();
398         }
399     }
400 
401     private void testAvailableRead(final String csName) throws Exception {
402         final String input = "test";
403         final InputStream r = new CharSequenceInputStream(input, csName);
404         try {
405             int available = checkAvail(r, input.length());
406             byte buff[] = new byte[available];
407             assertEquals(available - 1, r.skip(available-1)); // skip all but one
408             available = checkAvail(r, 1);
409             buff = new byte[available];
410             assertEquals(available, r.read(buff, 0, available));
411         } finally {
412             r.close();
413         }
414     }
415 
416     @Test
417     public void testAvailable() throws Exception {
418         for (final String csName : Charset.availableCharsets().keySet()) {
419             // prevent java.lang.UnsupportedOperationException at sun.nio.cs.ext.ISO2022_CN.newEncoder.
420             // also try and avoid the following Effor on Continuum
421 //            java.lang.UnsupportedOperationException: null
422 //            at java.nio.CharBuffer.array(CharBuffer.java:940)
423 //            at sun.nio.cs.ext.COMPOUND_TEXT_Encoder.encodeLoop(COMPOUND_TEXT_Encoder.java:75)
424 //            at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
425 //            at org.apache.commons.io.input.CharSequenceInputStream.fillBuffer(CharSequenceInputStream.java:120)
426 //            at org.apache.commons.io.input.CharSequenceInputStream.read(CharSequenceInputStream.java:151)
427 //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailableRead(CharSequenceInputStreamTest.java:412)
428 //            at org.apache.commons.io.input.CharSequenceInputStreamTest.testAvailable(CharSequenceInputStreamTest.java:424)
429 
430             try {
431                 if (isAvailabilityTestableForCharset(csName)) {
432                     testAvailableSkip(csName);
433                     testAvailableRead(csName);
434                 }
435             } catch (UnsupportedOperationException e){
436                 fail("Operation not supported for " + csName);
437             }
438         }
439     }
440 
441     private boolean isAvailabilityTestableForCharset(final String csName) {
442         return Charset.forName(csName).canEncode()
443                 && !"COMPOUND_TEXT".equalsIgnoreCase(csName) && !"x-COMPOUND_TEXT".equalsIgnoreCase(csName)
444                 && !isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(csName);
445     }
446 
447     private boolean isOddBallLegacyCharsetThatDoesNotSupportFrenchCharacters(String csName) {
448         return "x-IBM1388".equalsIgnoreCase(csName) ||
449                 "ISO-2022-CN".equalsIgnoreCase(csName) ||
450                 "ISO-2022-JP".equalsIgnoreCase(csName) ||
451                 "Shift_JIS".equalsIgnoreCase(csName);
452     }
453 }