1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.jexl3.parser;
18
19 import java.util.Objects;
20
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (i.e. \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The only escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimals and
 * the backslash character - '\' - itself.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {

    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;

    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;

    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;

    /** The last 7bits ASCII character. */
    private static final char LAST_ASCII = 127;

    /** The first printable 7bits ASCII character. */
    private static final char FIRST_ASCII = 32;

    /**
     * Builds a regex pattern string, handles escaping '/' through '\/' syntax.
     * <p>
     * The first character of the input is skipped; the character that follows it is then
     * treated as the separator and removed from both ends.
     * </p>
     *
     * @param str the string to build from
     * @return the built string
     */
    public static String buildRegex(final CharSequence str) {
        return buildString(str.subSequence(1, str.length()), true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     *
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     *
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered;
     *               when true, the first and last characters are not copied to the result
     * @param esc whether escape characters are interpreted or escaped
     * @return the built string
     */
    private static String buildString(final CharSequence str, final boolean eatsep, final boolean esc) {
        final StringBuilder strb = new StringBuilder(str.length());
        final char sep = eatsep ? str.charAt(0) : 0;
        final int end = str.length() - (eatsep ? 1 : 0);
        final int begin = eatsep ? 1 : 0;
        read(strb, str, begin, end, sep, esc);
        return strb.toString();
    }

    /**
     * Builds a template, does not escape characters.
     *
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildTemplate(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, false);
    }

    /**
     * Adds an escape char ('\') where needed in the string form of an identifier.
     *
     * @param str the identifier un-escaped string
     * @return the string with added backslash character before space, quote, double-quote and backslash;
     *         the input itself when nothing needs escaping (including null)
     */
    public static String escapeIdentifier(final String str) {
        // the builder is only allocated when the first character needing an escape is met
        StringBuilder strb = null;
        if (str != null) {
            int n = 0;
            final int last = str.length();
            while (n < last) {
                final char c = str.charAt(n);
                switch (c) {
                    case ' ':
                    case '\'':
                    case '"':
                    case '\\': {
                        if (strb == null) {
                            strb = new StringBuilder(last);
                            // copy what was skipped so far
                            strb.append(str, 0, n);
                        }
                        strb.append('\\');
                        strb.append(c);
                        break;
                    }
                    default:
                        if (strb != null) {
                            strb.append(c);
                        }
                }
                n += 1;
            }
        }
        return Objects.toString(strb, str);
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * NUL characters (code 0) found in the input are dropped from the output.
     * </p>
     *
     * @param str the string to escape
     * @param delim the delimiter character (if 0, no delimiter is added)
     * @return the escaped representation, null if the input is null
     */
    public static String escapeString(final CharSequence str, final char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        final StringBuilder strb = new StringBuilder(length + 2);
        if (delim > 0) {
            strb.append(delim);
        }
        for (int i = 0; i < length; ++i) {
            final char c = str.charAt(i);
            switch (c) {
                case 0:
                    continue;
                case '\b':
                    strb.append('\\');
                    strb.append('b');
                    break;
                case '\t':
                    strb.append('\\');
                    strb.append('t');
                    break;
                case '\n':
                    strb.append('\\');
                    strb.append('n');
                    break;
                case '\f':
                    strb.append('\\');
                    strb.append('f');
                    break;
                case '\r':
                    strb.append('\\');
                    strb.append('r');
                    break;
                case '\\':
                    // we escape the backslash only if there is a delimiter
                    if (delim > 0) {
                        strb.append('\\');
                    }
                    strb.append('\\');
                    break;
                default:
                    if (c == delim) {
                        strb.append('\\');
                        strb.append(delim);
                    } else if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        final String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        if (delim > 0) {
            strb.append(delim);
        }
        return strb.toString();
    }

    /**
     * Reads the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * Reading stops after the first un-escaped separator, which is copied to the buffer, or when
     * the end offset is reached. A well-formed Unicode escape is decoded whatever the value of
     * {@code esc}. A lone backslash that is the last character read is not copied to the buffer.
     * </p>
     *
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string;
     *            if 0, both quote characters are considered escapable
     * @param esc whether escape characters are interpreted or escaped
     * @return the last character offset handled in origin
     */
    private static int read(final StringBuilder strb, final CharSequence str, final int begin, final int end, final char sep, final boolean esc) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            final char c = str.charAt(index);
            if (escape) {
                // the 4 hexadecimal digits must all lie before the end offset
                if (c == 'u' && index + UCHAR_LEN < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    final boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        if (!esc) {
                            strb.append('\\').append(c);
                        } else {
                            switch (c) {
                                // https://es5.github.io/x7.html#x7.8.4
                                case 'b':
                                    strb.append('\b');
                                    break; // backspace U+0008
                                case 't':
                                    strb.append('\t');
                                    break; // horizontal tab U+0009
                                case 'n':
                                    strb.append('\n');
                                    break; // line feed U+000A
                                // We don't support vertical tab. If needed, the unicode (U+000B) should be used instead
                                case 'f':
                                    strb.append('\f');
                                    break; // form feed U+000C
                                case 'r':
                                    strb.append('\r');
                                    break; // carriage return U+000D
                                default:
                                    strb.append('\\').append(c);
                            }
                        }
                    } else {
                        // escaped separator (or quote) or escaped backslash: keep the character only
                        strb.append(c);
                    }
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Reads a string from a given offset up to its end or to the first un-escaped separator,
     * handles escaping through '\' syntax.
     * <p>
     * When the separator is met, it is copied to the buffer and the returned offset is its position.
     * </p>
     *
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin
     */
    public static int readString(final StringBuilder strb, final CharSequence str, final int index, final char sep) {
        return read(strb, str, index, str.length(), sep, true);
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits.
     * </p>
     *
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the backslash and 'u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(final StringBuilder strb, final CharSequence str, final int begin) {
        char xc = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            final int value = hexValue(str.charAt(begin + offset));
            if (value < 0) {
                return 0;
            }
            // most significant nibble first
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Converts an ASCII hexadecimal digit to its numeric value.
     *
     * @param c the character, expected in 0-9, a-f or A-F
     * @return the value between 0 and 15, -1 if the character is not an hexadecimal digit
     */
    private static int hexValue(final char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + BASE10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + BASE10;
        }
        return -1;
    }

    /**
     * Remove escape char ('\') from an identifier.
     * <p>
     * Every backslash is removed, including one that is itself preceded by a backslash.
     * </p>
     *
     * @param str the identifier escaped string, ie with a backslash before space, quote, double-quote and backslash
     * @return the string with no '\\' character; the input itself when it contains none (including null)
     */
    public static String unescapeIdentifier(final String str) {
        // the builder is only allocated when the first backslash is met
        StringBuilder strb = null;
        if (str != null) {
            int n = 0;
            final int last = str.length();
            while (n < last) {
                final char c = str.charAt(n);
                if (c == '\\') {
                    if (strb == null) {
                        strb = new StringBuilder(last);
                        // copy what was skipped so far
                        strb.append(str, 0, n);
                    }
                } else if (strb != null) {
                    strb.append(c);
                }
                n += 1;
            }
        }
        return Objects.toString(strb, str);
    }

    /** Default constructor. */
    protected StringParser() {
        // nothing to initialize
    }
}