View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.jexl3.parser;
18  
19  import java.util.Objects;
20  
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes), to read Unicode hexadecimal encoded characters and to express a few
 * control characters.
 * </p>
 * <p>
 * The escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits,
 * the backslash character - '\' - itself and, when escapes are interpreted,
 * the letters 'b', 't', 'n', 'f' and 'r' (backspace, tab, line feed, form feed, carriage return).
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {

    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;

    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;

    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;

    /** The last 7bits ASCII character. */
    private static final char LAST_ASCII = 127;

    /** The first printable 7bits ASCII character. */
    private static final char FIRST_ASCII = 32;

    /**
     * Builds a regex pattern string, handles escaping '/' through '\/' syntax.
     * <p>
     * The first character of the input is skipped; the character that follows it is then
     * treated as the separator (presumably the input is a regex literal token whose leading
     * character is a marker - to be confirmed against the grammar).
     * </p>
     *
     * @param str the string to build from
     * @return the built string
     */
    public static String buildRegex(final CharSequence str) {
        return buildString(str.subSequence(1, str.length()), true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     *
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, true);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When {@code eatsep} is true, the first character is taken as the separator and both the
     * first and the last characters of the input are excluded from the output.
     * </p>
     *
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @param esc whether escape characters are interpreted or escaped
     * @return the built string
     */
    private static String buildString(final CharSequence str, final boolean eatsep, final boolean esc) {
        final StringBuilder strb = new StringBuilder(str.length());
        final char sep = eatsep ? str.charAt(0) : 0;
        final int end = str.length() - (eatsep ? 1 : 0);
        final int begin = eatsep ? 1 : 0;
        read(strb, str, begin, end, sep, esc);
        return strb.toString();
    }

    /**
     * Builds a template, does not interpret the control character escapes (b, t, n, f, r).
     *
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildTemplate(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, false);
    }

    /**
     * Adds an escape char ('\') where needed in the string form of an identifier.
     * <p>
     * No buffer is allocated when the identifier needs no escaping; the argument is returned as is.
     * </p>
     *
     * @param str the identifier un-escaped string
     * @return the string with added backslash character before space, quote, double-quote and backslash,
     *         null if the argument is null
     */
    public static String escapeIdentifier(final String str) {
        StringBuilder strb = null;
        if (str != null) {
            int n = 0;
            final int last = str.length();
            while (n < last) {
                final char c = str.charAt(n);
                switch (c) {
                    case ' ':
                    case '\'':
                    case '"':
                    case '\\': {
                        // first character needing an escape: copy what was seen so far
                        if (strb == null) {
                            strb = new StringBuilder(last);
                            strb.append(str, 0, n);
                        }
                        strb.append('\\');
                        strb.append(c);
                        break;
                    }
                    default:
                        if (strb != null) {
                            strb.append(c);
                        }
                }
                n += 1;
            }
        }
        return Objects.toString(strb, str);
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * Backspace, tab, line feed, form feed and carriage return are written as their letter
     * escape; NUL characters are skipped and do not appear in the output.
     * </p>
     *
     * @param delim the delimiter character (if 0, no delimiter is added)
     * @param str the string to escape
     * @return the escaped representation, null if the string is null
     */
    public static String escapeString(final CharSequence str, final char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        final StringBuilder strb = new StringBuilder(length + 2);
        if (delim > 0) {
            strb.append(delim);
        }
        for (int i = 0; i < length; ++i) {
            final char c = str.charAt(i);
            switch (c) {
                case 0:
                    // NUL is dropped from the output
                    continue;
                case '\b':
                    strb.append('\\');
                    strb.append('b');
                    break;
                case '\t':
                    strb.append('\\');
                    strb.append('t');
                    break;
                case '\n':
                    strb.append('\\');
                    strb.append('n');
                    break;
                case '\f':
                    strb.append('\\');
                    strb.append('f');
                    break;
                case '\r':
                    strb.append('\\');
                    strb.append('r');
                    break;
                case '\\':
                    // we escape the backslash only if there is a delimiter
                    if (delim > 0) {
                        strb.append('\\');
                    }
                    strb.append('\\');
                    break;
                default:
                    if (c == delim) {
                        strb.append('\\');
                        strb.append(delim);
                    } else if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeroes to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        final String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        if (delim > 0) {
            strb.append(delim);
        }
        return strb.toString();
    }

    /**
     * Reads the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * Reading stops at the first un-escaped separator (which is copied to the buffer) or
     * when the end offset is reached. A Unicode escape is decoded whatever the value of
     * {@code esc}; a malformed one is copied verbatim. A backslash that is the very last
     * character read is dropped.
     * </p>
     *
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string
     * @param esc whether escape characters are interpreted or escaped
     * @return the last character offset handled in origin
     */
    private static int read(final StringBuilder strb, final CharSequence str, final int begin, final int end, final char sep, final boolean esc) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            final char c = str.charAt(index);
            if (escape) {
                // the 4 digits after the 'u' must all lie before the end offset
                if (c == 'u' && index + UCHAR_LEN < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    final boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        if (!esc) {
                            strb.append('\\').append(c);
                        } else {
                            switch (c) {
                                // https://es5.github.io/x7.html#x7.8.4
                                case 'b':
                                    strb.append('\b');
                                    break; // backspace U+0008
                                case 't':
                                    strb.append('\t');
                                    break; // horizontal tab U+0009
                                case 'n':
                                    strb.append('\n');
                                    break; // line feed U+000A
                                // We don't support vertical tab. If needed, the unicode escape (U+000B) should be used instead
                                case 'f':
                                    strb.append('\f');
                                    break; // form feed U+000C
                                case 'r':
                                    strb.append('\r');
                                    break; // carriage return U+000D
                                default:
                                    strb.append('\\').append(c);
                            }
                        }
                    } else {
                        // escaped separator, quote or backslash: keep the character only
                        strb.append(c);
                    }
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Reads the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     *
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin
     */
    public static int readString(final StringBuilder strb, final CharSequence str, final int index, final char sep) {
        return read(strb, str, index, str.length(), sep, true);
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits
     * (0-9, a-f, A-F).
     * </p>
     *
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (just after the 'u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(final StringBuilder strb, final CharSequence str, final int begin) {
        char xc = 0;
        int bits = SHIFT;
        int value;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            final char c = str.charAt(begin + offset);
            if (c >= '0' && c <= '9') {
                value = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                value = c - 'a' + BASE10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + BASE10;
            } else {
                // not an hexadecimal digit: the sequence is not a Unicode escape
                return 0;
            }
            // most significant nibble first
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Remove escape char ('\') from an identifier.
     * <p>
     * Each backslash is removed and the character that follows it is kept literally, so an
     * escaped backslash yields one backslash and the result of {@link #escapeIdentifier(String)}
     * is restored to its original form. A trailing lone backslash is dropped.
     * </p>
     *
     * @param str the identifier escaped string, ie with a backslash before space, quote, double-quote and backslash
     * @return the un-escaped identifier, null if the argument is null
     */
    public static String unescapeIdentifier(final String str) {
        StringBuilder strb = null;
        if (str != null) {
            int n = 0;
            final int last = str.length();
            while (n < last) {
                final char c = str.charAt(n);
                if (c == '\\') {
                    // first escape met: copy what was seen so far
                    if (strb == null) {
                        strb = new StringBuilder(last);
                        strb.append(str, 0, n);
                    }
                    // skip the escape char and keep the escaped character, even another backslash
                    n += 1;
                    if (n < last) {
                        strb.append(str.charAt(n));
                    }
                } else if (strb != null) {
                    strb.append(c);
                }
                n += 1;
            }
        }
        return Objects.toString(strb, str);
    }

    /** Default constructor.  */
    protected StringParser() {
        // nothing to initialize
    }
}
356 }