View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.jexl3.parser;
18  
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits (0-9, a-f, A-F) and
 * the backslash character - '\' - itself. When building strings (as opposed to templates),
 * the control character escapes 'b', 't', 'n', 'f' and 'r' are interpreted as well.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;
    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;
    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;
    /** The last 7bits ascii character. */
    private static final char LAST_ASCII = 127;
    /** The first printable 7bits ascii character. */
    private static final char FIRST_ASCII = 32;

    /** Default constructor.  */
    protected StringParser() {
        // nothing to initialize
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, true);
    }

    /**
     * Builds a template, does not interpret the control character escapes
     * (a backslash before 'n', 't', etc. is copied as is).
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildTemplate(final CharSequence str, final boolean eatsep) {
        return buildString(str, eatsep, false);
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When <code>eatsep</code> is true, the first character is taken as the separator and
     * both the first and last characters are excluded from the result.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @param esc whether escape characters are interpreted or escaped
     * @return the built string
     */
    private static String buildString(final CharSequence str, final boolean eatsep, final boolean esc) {
        final StringBuilder strb = new StringBuilder(str.length());
        // a separator of 0 means 'no separator': both quote kinds are then escapable
        final char sep = eatsep ? str.charAt(0) : 0;
        final int begin = eatsep ? 1 : 0;
        final int end = str.length() - (eatsep ? 1 : 0);
        read(strb, str, begin, end, sep, esc);
        return strb.toString();
    }

    /**
     * Builds a regex pattern string, handles escaping '/' through '\/' syntax.
     * <p>
     * The first character of the input is skipped; the character that follows it is used
     * as the separator.
     * </p>
     * @param str the string to build from
     * @return the built string
     */
    public static String buildRegex(final CharSequence str) {
        return buildString(str.subSequence(1, str.length()), true);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * The separator that ends the read, if any, is copied into the destination buffer.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin of the un-escaped separator that stopped the read,
     * or the length of the origin if no such separator was found
     */
    public static int readString(final StringBuilder strb, final CharSequence str, final int index, final char sep) {
        return read(strb, str, index, str.length(), sep, true);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * A backslash that is the very last character read (nothing follows it before
     * <code>end</code>) is not copied to the destination buffer.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string; 0 means no separator
     * @param esc whether escape characters are interpreted or escaped
     * @return the last character offset handled in origin
     */
    private static int read(final StringBuilder strb, final CharSequence str, final int begin, final int end, final char sep, final boolean esc) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            final char c = str.charAt(index);
            if (escape) {
                // the 4 hexadecimal digits must all lie before 'end'
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    final boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        if (esc) {
                            appendEscaped(strb, c);
                        } else {
                            strb.append('\\').append(c);
                        }
                    } else {
                        // escaped separator or escaped backslash: only keep the character itself
                        strb.append(c);
                    }
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                // un-escaped separator: end of string, index is left pointing at it
                break;
            }
        }
        return index;
    }

    /**
     * Appends the character designated by a one-letter escape (the letter that followed a backslash).
     * <p>
     * Letters that do not designate a known escape are copied along with their leading backslash.
     * See http://es5.github.io/x7.html#x7.8.4
     * </p>
     * @param strb the destination buffer
     * @param c the character that followed the backslash
     */
    private static void appendEscaped(final StringBuilder strb, final char c) {
        switch (c) {
            case 'b':
                strb.append('\b'); // backspace U+0008
                break;
            case 't':
                strb.append('\t'); // horizontal tab U+0009
                break;
            case 'n':
                strb.append('\n'); // line feed U+000A
                break;
            // We don't support vertical tab. If needed, the unicode escape (U+000B) should be used instead
            case 'f':
                strb.append('\f'); // form feed U+000C
                break;
            case 'r':
                strb.append('\r'); // carriage return U+000D
                break;
            default:
                strb.append('\\').append(c);
        }
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the backslash and 'u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(final StringBuilder strb, final CharSequence str, final int begin) {
        char xc = 0;
        int bits = SHIFT;
        int value;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            final char c = str.charAt(begin + offset);
            if (c >= '0' && c <= '9') {
                value = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                // hexadecimal letters stop at 'f'; anything beyond would overflow the nibble
                value = c - 'a' + BASE10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + BASE10;
            } else {
                return 0;
            }
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is enclosed between two delimiter characters. Characters of value 0 are skipped.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character
     * @return the escaped representation, null if the string to escape was null
     */
    public static String escapeString(final String str, final char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        final StringBuilder strb = new StringBuilder(length + 2);
        strb.append(delim);
        for (int i = 0; i < length; ++i) {
            final char c = str.charAt(i);
            switch (c) {
                case 0:
                    continue;
                case '\b':
                    strb.append('\\').append('b');
                    break;
                case '\t':
                    strb.append('\\').append('t');
                    break;
                case '\n':
                    strb.append('\\').append('n');
                    break;
                case '\f':
                    strb.append('\\').append('f');
                    break;
                case '\r':
                    strb.append('\\').append('r');
                    break;
                case '\\':
                    strb.append('\\').append('\\');
                    break;
                default:
                    if (c == delim) {
                        strb.append('\\').append(delim);
                    } else if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                        strb.append('\\').append('u');
                        final String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        strb.append(delim);
        return strb.toString();
    }

    /**
     * Remove escape char ('\') from an identifier.
     * <p>
     * Every backslash is removed; the input instance itself is returned when it contains none.
     * </p>
     * @param str the identifier escaped string, ie with a backslash before space, quote, double-quote and backslash
     * @return the string with no '\\' character
     */
    public static String unescapeIdentifier(final String str) {
        StringBuilder strb = null;
        if (str != null) {
            final int last = str.length();
            for (int n = 0; n < last; ++n) {
                final char c = str.charAt(n);
                if (c == '\\') {
                    // first backslash found: lazily create the buffer and copy what precedes
                    if (strb == null) {
                        strb = new StringBuilder(last);
                        strb.append(str, 0, n);
                    }
                } else if (strb != null) {
                    strb.append(c);
                }
            }
        }
        return strb == null ? str : strb.toString();
    }

    /**
     * Adds an escape char ('\') where needed in a string form of an identifier.
     * <p>
     * The input instance itself is returned when no character needs escaping.
     * </p>
     * @param str the identifier un-escaped string
     * @return the string with added backslash character before space, quote, double-quote and backslash
     */
    public static String escapeIdentifier(final String str) {
        StringBuilder strb = null;
        if (str != null) {
            final int last = str.length();
            for (int n = 0; n < last; ++n) {
                final char c = str.charAt(n);
                switch (c) {
                    case ' ':
                    case '\'':
                    case '"':
                    case '\\': {
                        // first character to escape: lazily create the buffer and copy what precedes
                        if (strb == null) {
                            strb = new StringBuilder(last);
                            strb.append(str, 0, n);
                        }
                        strb.append('\\');
                        strb.append(c);
                        break;
                    }
                    default:
                        if (strb != null) {
                            strb.append(c);
                        }
                }
            }
        }
        return strb == null ? str : strb.toString();
    }
}