001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.jexl2.parser;
018    
/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and to read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The only escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits (0-9, a-f, A-F) and
 * the backslash character - '\' - itself.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence. */
    private static final int UCHAR_LEN = 4;
    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;
    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;
    /** The last 7bits ascii character. */
    private static final char LAST_ASCII = 127;
    /** The first printable 7bits ascii character. */
    private static final char FIRST_ASCII = 32;

    /** Default constructor.  */
    public StringParser() {
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When <code>eatsep</code> is true, the first character is taken as the string delimiter and
     * both the first and last characters of the sequence are left out of the result.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(CharSequence str, boolean eatsep) {
        StringBuilder strb = new StringBuilder(str.length());
        char sep = eatsep ? str.charAt(0) : 0;
        int end = str.length() - (eatsep ? 1 : 0);
        int begin = (eatsep ? 1 : 0);
        read(strb, str, begin, end, sep);
        return strb.toString();
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * The unescaped separator that ends the read is itself copied into the buffer.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin
     */
    public static int readString(StringBuilder strb, CharSequence str, int index, char sep) {
        return read(strb, str, index, str.length(), sep);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string
     * @return the last character offset handled in origin
     */
    private static int read(StringBuilder strb, CharSequence str, int begin, int end, char sep) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            char c = str.charAt(index);
            if (escape) {
                // a Unicode escape needs 4 more characters after the 'u' before 'end'
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it;
                    // without an explicit separator (sep == 0), both quote kinds are escapable
                    boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        strb.append('\\');
                    }
                    strb.append(c);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            // the character is copied before the separator test, so the closing
            // separator ends up in the buffer as well
            strb.append(c);
            if (c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Exactly 4 hexadecimal digits (0-9, a-f, A-F) are expected; nothing is written to the
     * builder unless all 4 are valid.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the '\\u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(StringBuilder strb, CharSequence str, int begin) {
        char xc = 0;
        int bits = SHIFT;
        int value = 0;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            char c = str.charAt(begin + offset);
            if (c >= '0' && c <= '9') {
                value = (c - '0');
            } else if (c >= 'a' && c <= 'f') {
                // hexadecimal digits stop at 'f'; anything beyond would overflow the nibble
                value = (c - 'a' + BASE10);
            } else if (c >= 'A' && c <= 'F') {
                value = (c - 'A' + BASE10);
            } else {
                return 0;
            }
            // digits are read most significant nibble first
            xc |= value << bits;
            bits -= UCHAR_LEN;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is enclosed between two <code>delim</code> characters. NUL characters (code 0)
     * are dropped; backspace, tab, newline, form-feed, carriage-return, both quote characters and
     * the backslash are written as a backslash followed by their usual letter or symbol; any other
     * character outside of the 32 to 127 range is written as a backslash, 'u' and 4 hexadecimal
     * digits.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character written before and after the escaped content
     * @return the escaped representation, or null if str is null
     */
    public static String escapeString(String str, char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        StringBuilder strb = new StringBuilder(length + 2);
        strb.append(delim);
        for (int i = 0; i < length; ++i) {
            char c = str.charAt(i);
            switch (c) {
                case 0:
                    continue;
                case '\b':
                    strb.append("\\b");
                    break;
                case '\t':
                    strb.append("\\t");
                    break;
                case '\n':
                    strb.append("\\n");
                    break;
                case '\f':
                    strb.append("\\f");
                    break;
                case '\r':
                    strb.append("\\r");
                    break;
                case '\"':
                    strb.append("\\\"");
                    break;
                case '\'':
                    strb.append("\\\'");
                    break;
                case '\\':
                    strb.append("\\\\");
                    break;
                default:
                    if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        strb.append(delim);
        return strb.toString();
    }
}