1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * https://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20 package org.apache.commons.csv;
21
22 import static org.apache.commons.csv.Constants.CR;
23 import static org.apache.commons.csv.Constants.LF;
24 import static org.apache.commons.csv.Constants.UNDEFINED;
25 import static org.apache.commons.io.IOUtils.EOF;
26
27 import java.io.IOException;
28 import java.io.Reader;
29 import java.nio.CharBuffer;
30 import java.nio.charset.CharacterCodingException;
31 import java.nio.charset.Charset;
32 import java.nio.charset.CharsetEncoder;
33
34 import org.apache.commons.io.IOUtils;
35 import org.apache.commons.io.input.UnsynchronizedBufferedReader;
36
37 /**
38 * A special buffered reader which supports sophisticated read access.
39 * <p>
40 * In particular the reader supports a look-ahead option, which allows you to see the next char returned by
41 * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}.
42 * </p>
43 */
44 final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
45
46 /** The last char returned */
47 private int lastChar = UNDEFINED;
48 private int lastCharMark = UNDEFINED;
49
50 /** The count of EOLs (CR/LF/CRLF) seen so far */
51 private long lineNumber;
52 private long lineNumberMark;
53
54 /** The position, which is the number of characters read so far */
55 private long position;
56 private long positionMark;
57
58 /** The number of bytes read so far. */
59 private long bytesRead;
60 private long bytesReadMark;
61
62 /** Encoder for calculating the number of bytes for each character read. */
63 private final CharsetEncoder encoder;
64
65 /**
66 * Constructs a new instance using the default buffer size.
67 */
68 ExtendedBufferedReader(final Reader reader) {
69 this(reader, null, false);
70 }
71
72 /**
73 * Constructs a new instance with the specified reader, character set,
74 * and byte tracking option. Initializes an encoder if byte tracking is enabled
75 * and a character set is provided.
76 *
77 * @param reader the reader supports a look-ahead option.
78 * @param charset the character set for encoding, or {@code null} if not applicable.
79 * @param trackBytes {@code true} to enable byte tracking; {@code false} to disable it.
80 */
81 ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean trackBytes) {
82 super(reader);
83 encoder = charset != null && trackBytes ? charset.newEncoder() : null;
84 }
85
86 /**
87 * Closes the stream.
88 *
89 * @throws IOException
90 * If an I/O error occurs
91 */
92 @Override
93 public void close() throws IOException {
94 // Set ivars before calling super close() in case close() throws an IOException.
95 lastChar = EOF;
96 super.close();
97 }
98
99 /**
100 * Gets the number of bytes read by the reader.
101 *
102 * @return the number of bytes read by the read
103 */
104 long getBytesRead() {
105 return this.bytesRead;
106 }
107
108 /**
109 * Gets the byte length of the given character based on the the original Unicode
110 * specification, which defined characters as fixed-width 16-bit entities.
111 * <p>
112 * The Unicode characters are divided into two main ranges:
113 * <ul>
114 * <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
115 * <ul>
116 * <li>Represented using a single 16-bit {@code char}.</li>
117 * <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
118 * </ul>
119 * </li>
120 * <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
121 * <ul>
122 * <li>Represented as a pair of {@code char}s:</li>
123 * <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
124 * <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
125 * <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
126 * </ul>
127 * </li>
128 * </ul>
129 *
130 * @param current the current character to process.
131 * @return the byte length of the character.
132 * @throws CharacterCodingException if the character cannot be encoded.
133 */
134 private int getEncodedCharLength(final int current) throws CharacterCodingException {
135 final char cChar = (char) current;
136 final char lChar = (char) lastChar;
137 if (!Character.isSurrogate(cChar)) {
138 return encoder.encode(CharBuffer.wrap(new char[] { cChar })).limit();
139 }
140 if (Character.isHighSurrogate(cChar)) {
141 // Move on to the next char (low surrogate)
142 return 0;
143 }
144 if (Character.isSurrogatePair(lChar, cChar)) {
145 return encoder.encode(CharBuffer.wrap(new char[] { lChar, cChar })).limit();
146 }
147 throw new CharacterCodingException();
148 }
149
150 /**
151 * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by
152 * any of the read methods. This will not include a character read using the {@link #peek()} method. If no
153 * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached
154 * on the last read then this will return {@link IOUtils#EOF}.
155 *
156 * @return the last character that was read
157 */
158 int getLastChar() {
159 return lastChar;
160 }
161
162 /**
163 * Returns the current line number
164 *
165 * @return the current line number
166 */
167 long getLineNumber() {
168 // Check if we are at EOL or EOF or just starting
169 if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == EOF) {
170 return lineNumber; // counter is accurate
171 }
172 return lineNumber + 1; // Allow for counter being incremented only at EOL
173 }
174
175 /**
176 * Gets the character position in the reader.
177 *
178 * @return the current position in the reader (counting characters, not bytes since this is a Reader)
179 */
180 long getPosition() {
181 return this.position;
182 }
183
184 @Override
185 public void mark(final int readAheadLimit) throws IOException {
186 lineNumberMark = lineNumber;
187 lastCharMark = lastChar;
188 positionMark = position;
189 bytesReadMark = bytesRead;
190 super.mark(readAheadLimit);
191 }
192
193 @Override
194 public int read() throws IOException {
195 final int current = super.read();
196 if (current == CR || current == LF && lastChar != CR ||
197 current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
198 lineNumber++;
199 }
200 if (encoder != null) {
201 this.bytesRead += getEncodedCharLength(current);
202 }
203 lastChar = current;
204 position++;
205 return lastChar;
206 }
207
208 @Override
209 public int read(final char[] buf, final int offset, final int length) throws IOException {
210 if (length == 0) {
211 return 0;
212 }
213 final int len = super.read(buf, offset, length);
214 if (len > 0) {
215 for (int i = offset; i < offset + len; i++) {
216 final char ch = buf[i];
217 if (ch == LF) {
218 if (CR != (i > offset ? buf[i - 1] : lastChar)) {
219 lineNumber++;
220 }
221 } else if (ch == CR) {
222 lineNumber++;
223 }
224 }
225 lastChar = buf[offset + len - 1];
226 } else if (len == EOF) {
227 lastChar = EOF;
228 }
229 position += len;
230 return len;
231 }
232
233 /**
234 * Gets the next line, dropping the line terminator(s). This method should only be called when processing a
235 * comment, otherwise, information can be lost.
236 * <p>
237 * Increments {@link #lineNumber} and updates {@link #position}.
238 * </p>
239 * <p>
240 * Sets {@link #lastChar} to {@code Constants.EOF} at EOF, otherwise the last EOL character.
241 * </p>
242 *
243 * @return the line that was read, or null if reached EOF.
244 */
245 @Override
246 public String readLine() throws IOException {
247 if (peek() == EOF) {
248 return null;
249 }
250 final StringBuilder buffer = new StringBuilder();
251 while (true) {
252 final int current = read();
253 if (current == CR) {
254 final int next = peek();
255 if (next == LF) {
256 read();
257 }
258 }
259 if (current == EOF || current == LF || current == CR) {
260 break;
261 }
262 buffer.append((char) current);
263 }
264 return buffer.toString();
265 }
266
267 @Override
268 public void reset() throws IOException {
269 lineNumber = lineNumberMark;
270 lastChar = lastCharMark;
271 position = positionMark;
272 bytesRead = bytesReadMark;
273 super.reset();
274 }
275
276 }