View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   https://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  
20  package org.apache.commons.csv;
21  
22  import static org.apache.commons.csv.Constants.CR;
23  import static org.apache.commons.csv.Constants.LF;
24  import static org.apache.commons.csv.Constants.UNDEFINED;
25  import static org.apache.commons.io.IOUtils.EOF;
26  
27  import java.io.IOException;
28  import java.io.Reader;
29  import java.nio.CharBuffer;
30  import java.nio.charset.CharacterCodingException;
31  import java.nio.charset.Charset;
32  import java.nio.charset.CharsetEncoder;
33  
34  import org.apache.commons.io.IOUtils;
35  import org.apache.commons.io.input.UnsynchronizedBufferedReader;
36  
37  /**
38   * A special buffered reader which supports sophisticated read access.
39   * <p>
40   * In particular the reader supports a look-ahead option, which allows you to see the next char returned by
41   * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
42   * </p>
43   */
44  final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
45  
46      /** The last char returned */
47      private int lastChar = UNDEFINED;
48      private int lastCharMark = UNDEFINED;
49  
50      /** The count of EOLs (CR/LF/CRLF) seen so far */
51      private long lineNumber;
52      private long lineNumberMark;
53  
54      /** The position, which is the number of characters read so far */
55      private long position;
56      private long positionMark;
57  
58      /** The number of bytes read so far. */
59      private long bytesRead;
60      private long bytesReadMark;
61  
62      /** Encoder for calculating the number of bytes for each character read. */
63      private final CharsetEncoder encoder;
64  
65      /**
66       * Constructs a new instance using the default buffer size.
67       */
68      ExtendedBufferedReader(final Reader reader) {
69          this(reader, null, false);
70      }
71  
72      /**
73       * Constructs a new instance with the specified reader, character set,
74       * and byte tracking option. Initializes an encoder if byte tracking is enabled
75       * and a character set is provided.
76       *
77       * @param reader the reader supports a look-ahead option.
78       * @param charset the character set for encoding, or {@code null} if not applicable.
79       * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
80       */
81      ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) {
82          super(reader);
83          encoder = charset != null && trackBytes ? charset.newEncoder() : null;
84      }
85  
86      /**
87       * Closes the stream.
88       *
89       * @throws IOException
90       *             If an I/O error occurs
91       */
92      @Override
93      public void close() throws IOException {
94          // Set ivars before calling super close() in case close() throws an IOException.
95          lastChar = EOF;
96          super.close();
97      }
98  
99      /**
100      * Gets the number of bytes read by the reader.
101      *
102      * @return the number of bytes read by the read
103      */
104     long getBytesRead() {
105         return this.bytesRead;
106     }
107 
108     /**
109      * Gets the byte length of the given character based on the the original Unicode
110      * specification, which defined characters as fixed-width 16-bit entities.
111      * <p>
112      * The Unicode characters are divided into two main ranges:
113      * <ul>
114      *   <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
115      *     <ul>
116      *       <li>Represented using a single 16-bit {@code char}.</li>
117      *       <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
118      *     </ul>
119      *   </li>
120      *   <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
121      *     <ul>
122      *       <li>Represented as a pair of {@code char}s:</li>
123      *       <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
124      *       <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
125      *       <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
126      *     </ul>
127      *   </li>
128      * </ul>
129      *
130      * @param current the current character to process.
131      * @return the byte length of the character.
132      * @throws CharacterCodingException if the character cannot be encoded.
133      */
134     private int getEncodedCharLength(final int current) throws CharacterCodingException {
135         final char cChar = (char) current;
136         final char lChar = (char) lastChar;
137         if (!Character.isSurrogate(cChar)) {
138             return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
139         }
140         if (Character.isHighSurrogate(cChar)) {
141             // Move on to the next char (low surrogate)
142             return 0;
143         } else if (Character.isSurrogatePair(lChar, cChar)) {
144             return encoder.encode(CharBuffer.wrap(new char[] { lChar, cChar })).limit();
145         } else {
146             throw new CharacterCodingException();
147         }
148     }
149 
150     /**
151      * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by
152      * any of the read methods. This will not include a character read using the {@link #peek()} method. If no
153      * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached
154      * on the last read then this will return {@link IOUtils#EOF}.
155      *
156      * @return the last character that was read
157      */
158     int getLastChar() {
159         return lastChar;
160     }
161 
162     /**
163      * Returns the current line number
164      *
165      * @return the current line number
166      */
167     long getLineNumber() {
168         // Check if we are at EOL or EOF or just starting
169         if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == EOF) {
170             return lineNumber; // counter is accurate
171         }
172         return lineNumber + 1; // Allow for counter being incremented only at EOL
173     }
174 
175     /**
176      * Gets the character position in the reader.
177      *
178      * @return the current position in the reader (counting characters, not bytes since this is a Reader)
179      */
180     long getPosition() {
181         return this.position;
182     }
183 
184     @Override
185     public void mark(final int readAheadLimit) throws IOException {
186         lineNumberMark = lineNumber;
187         lastCharMark = lastChar;
188         positionMark = position;
189         bytesReadMark = bytesRead;
190         super.mark(readAheadLimit);
191     }
192 
193     @Override
194     public int read() throws IOException {
195         final int current = super.read();
196         if (current == CR || current == LF && lastChar != CR ||
197             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
198             lineNumber++;
199         }
200         if (encoder != null) {
201             this.bytesRead += getEncodedCharLength(current);
202         }
203         lastChar = current;
204         position++;
205         return lastChar;
206     }
207 
208     @Override
209     public int read(final char[] buf, final int offset, final int length) throws IOException {
210         if (length == 0) {
211             return 0;
212         }
213         final int len = super.read(buf, offset, length);
214         if (len > 0) {
215             for (int i = offset; i < offset + len; i++) {
216                 final char ch = buf[i];
217                 if (ch == LF) {
218                     if (CR != (i > offset ? buf[i - 1] : lastChar)) {
219                         lineNumber++;
220                     }
221                 } else if (ch == CR) {
222                     lineNumber++;
223                 }
224             }
225             lastChar = buf[offset + len - 1];
226         } else if (len == EOF) {
227             lastChar = EOF;
228         }
229         position += len;
230         return len;
231     }
232 
233     /**
234      * Gets the next line, dropping the line terminator(s). This method should only be called when processing a
235      * comment, otherwise, information can be lost.
236      * <p>
237      * Increments {@link #lineNumber} and updates {@link #position}.
238      * </p>
239      * <p>
240      * Sets {@link #lastChar} to {@code Constants.EOF} at EOF, otherwise the last EOL character.
241      * </p>
242      *
243      * @return the line that was read, or null if reached EOF.
244      */
245     @Override
246     public String readLine() throws IOException {
247         if (peek() == EOF) {
248             return null;
249         }
250         final StringBuilder buffer = new StringBuilder();
251         while (true) {
252             final int current = read();
253             if (current == CR) {
254                 final int next = peek();
255                 if (next == LF) {
256                     read();
257                 }
258             }
259             if (current == EOF || current == LF || current == CR) {
260                 break;
261             }
262             buffer.append((char) current);
263         }
264         return buffer.toString();
265     }
266 
267     @Override
268     public void reset() throws IOException {
269         lineNumber = lineNumberMark;
270         lastChar = lastCharMark;
271         position = positionMark;
272         bytesRead = bytesReadMark;
273         super.reset();
274     }
275 
276 }