1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.jexl2.parser;
18
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and to read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The only escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits and
 * the backslash character - '\' - itself.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;
    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;
    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;
    /** The last 7bits ascii character. */
    private static final char LAST_ASCII = 127;
    /** The first printable 7bits ascii character. */
    private static final char FIRST_ASCII = 32;

    /** Default constructor. */
    public StringParser() {
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When <code>eatsep</code> is true, the first character is taken as the separator and
     * both the first and the last characters of the sequence are skipped; otherwise the
     * whole sequence is read and both quote characters are treated as escapable.
     * An empty sequence always yields an empty string.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(CharSequence str, boolean eatsep) {
        final int length = str.length();
        if (length == 0) {
            // nothing to read; also avoids charAt(0) failing when eatsep is requested
            return "";
        }
        StringBuilder strb = new StringBuilder(length);
        char sep = eatsep ? str.charAt(0) : 0;
        int begin = eatsep ? 1 : 0;
        int end = length - (eatsep ? 1 : 0);
        read(strb, str, begin, end, sep);
        return strb.toString();
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * The separator that stops the read, when found, is copied into the buffer as well.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin
     */
    public static int readString(StringBuilder strb, CharSequence str, int index, char sep) {
        return read(strb, str, index, str.length(), sep);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * Reading stops after the first unescaped occurrence of <code>sep</code> (which is
     * appended to the buffer) or when <code>end</code> is reached. A backslash that is the
     * very last character read is not copied to the buffer.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string
     * @return the last character offset handled in origin: the offset of the separator
     *         when one stopped the read, <code>end</code> otherwise
     */
    private static int read(StringBuilder strb, CharSequence str, int begin, int end, char sep) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            char c = str.charAt(index);
            if (escape) {
                // a unicode escape needs 'u' plus UCHAR_LEN hexadecimal digits before end
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it;
                    // without a separator (sep == 0), both quote characters are escapable
                    boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        strb.append('\\');
                    }
                    strb.append(c);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            if (c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Converts one hexadecimal digit to its numeric value.
     * @param c the character to convert
     * @return the value in [0, 15] or -1 if c is not one of 0-9, a-f, A-F
     */
    private static int hexValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + BASE10;
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + BASE10;
        }
        return -1;
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the '\\u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(StringBuilder strb, CharSequence str, int begin) {
        int xc = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            int value = hexValue(str.charAt(begin + offset));
            if (value < 0) {
                return 0;
            }
            // most significant nibble first
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append((char) xc);
        return UCHAR_LEN;
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is enclosed in <code>delim</code>. Within the string, only the delimiter and
     * the backslash are escaped with a backslash so that the other quote character is left
     * untouched and the result can be read back with the same delimiter.
     * The backspace, tab, newline, form-feed and carriage-return characters are written as
     * their usual two-character backslash sequences, NUL characters are dropped and any other
     * character outside the [32, 127] range is written as a 4 hexadecimal digits Unicode escape.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character used to enclose the escaped string
     * @return the escaped representation or null if str is null
     */
    public static String escapeString(String str, char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        StringBuilder strb = new StringBuilder(length + 2);
        strb.append(delim);
        for (int i = 0; i < length; ++i) {
            char c = str.charAt(i);
            switch (c) {
                case 0:
                    // NUL characters are skipped
                    continue;
                case '\b':
                    strb.append("\\b");
                    break;
                case '\t':
                    strb.append("\\t");
                    break;
                case '\n':
                    strb.append("\\n");
                    break;
                case '\f':
                    strb.append("\\f");
                    break;
                case '\r':
                    strb.append("\\r");
                    break;
                case '\\':
                    strb.append("\\\\");
                    break;
                default:
                    if (c == delim) {
                        // only the active delimiter needs neutralizing
                        strb.append('\\');
                        strb.append(c);
                    } else if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeros
                        strb.append('\\');
                        strb.append('u');
                        String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
                    break;
            }
        }
        strb.append(delim);
        return strb.toString();
    }
}