/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Common constant strings utilities.
 * <p>
 * The methods of this class read JEXL string literals and handle escaping through the
 * 'backslash' (ie: \) character. Escaping is used to neutralize string delimiters (the single
 * and double quotes) and read Unicode hexadecimal encoded characters.
 * </p>
 * <p>
 * The only escapable characters are the single and double quotes - ''' and '"' -,
 * a Unicode sequence starting with 'u' followed by 4 hexadecimal digits (0-9, a-f, A-F) and
 * the backslash character - '\' - itself.
 * </p>
 * <p>
 * A sequence where '\' occurs before any non-escapable character or sequence has no effect, the
 * sequence output being the same as the input.
 * </p>
 * <p>
 * All methods are static; the public constructor is kept only for compatibility.
 * </p>
 */
public class StringParser {
    /** The length of an escaped unicode sequence (number of hexadecimal digits). */
    private static final int UCHAR_LEN = 4;
    /** The number of bits encoded by one hexadecimal digit. */
    private static final int NIBBLE_BITS = 4;
    /** Initial shift value for composing a Unicode char from 4 nibbles (16 - 4). */
    private static final int SHIFT = 12;
    /** The base 10 offset used to convert hexa characters to decimal. */
    private static final int BASE10 = 10;
    /** The last 7bits ascii character. */
    private static final char LAST_ASCII = 127;
    /** The first printable 7bits ascii character. */
    private static final char FIRST_ASCII = 32;

    /** Default constructor. */
    public StringParser() {
    }

    /**
     * Builds a string, handles escaping through '\' syntax.
     * <p>
     * When <code>eatsep</code> is true, the first character is taken as the separator and both
     * the first and the last characters are excluded from the result. An empty input, like an
     * input made of a single character, then yields an empty string.
     * </p>
     * @param str the string to build from
     * @param eatsep whether the separator, the first character, should be considered
     * @return the built string
     */
    public static String buildString(CharSequence str, boolean eatsep) {
        final int length = str.length();
        if (eatsep && length == 0) {
            // no separator to read: charAt(0) would throw, there is nothing to build
            return "";
        }
        StringBuilder strb = new StringBuilder(length);
        // 0 means 'no separator' for read(...)
        char sep = eatsep ? str.charAt(0) : 0;
        int end = length - (eatsep ? 1 : 0);
        int begin = (eatsep ? 1 : 0);
        read(strb, str, begin, end, sep);
        return strb.toString();
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * The unescaped separator that ends the read, if any, is copied into the buffer as well.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param index the offset into the origin
     * @param sep the separator, single or double quote, marking end of string
     * @return the offset in origin: the index of the separator that stopped the read,
     * or the length of the origin if no unescaped separator was met
     */
    public static int readString(StringBuilder strb, CharSequence str, int index, char sep) {
        return read(strb, str, index, str.length(), sep);
    }

    /**
     * Read the remainder of a string till a given separator,
     * handles escaping through '\' syntax.
     * <p>
     * A backslash that is the very last character of the range is not copied.
     * </p>
     * @param strb the destination buffer to copy characters into
     * @param str the origin
     * @param begin the relative offset in str to begin reading
     * @param end the relative offset in str to end reading
     * @param sep the separator, single or double quote, marking end of string;
     * 0 means there is no separator, the whole range is read and both quotes are escapable
     * @return the last character offset handled in origin
     */
    private static int read(StringBuilder strb, CharSequence str, int begin, int end, char sep) {
        boolean escape = false;
        int index = begin;
        for (; index < end; ++index) {
            char c = str.charAt(index);
            if (escape) {
                // the 4 digits following 'u' must all lie before end
                if (c == 'u' && (index + UCHAR_LEN) < end && readUnicodeChar(strb, str, index + 1) > 0) {
                    index += UCHAR_LEN;
                } else {
                    // if c is not an escapable character, re-emit the backslash before it
                    boolean notSeparator = sep == 0 ? c != '\'' && c != '"' : c != sep;
                    if (notSeparator && c != '\\') {
                        strb.append('\\');
                    }
                    strb.append(c);
                }
                escape = false;
                continue;
            }
            if (c == '\\') {
                escape = true;
                continue;
            }
            strb.append(c);
            // sep == 0 means 'no separator': a NUL character in the input must not stop the read
            if (sep != 0 && c == sep) {
                break;
            }
        }
        return index;
    }

    /**
     * Reads a Unicode escape character.
     * <p>
     * Nothing is written to the builder unless all 4 characters are hexadecimal digits.
     * </p>
     * @param strb the builder to write the character to
     * @param str the sequence
     * @param begin the begin offset in sequence (after the '\\u')
     * @return 0 if char could not be read, 4 otherwise
     */
    private static int readUnicodeChar(StringBuilder strb, CharSequence str, int begin) {
        char xc = 0;
        int bits = SHIFT;
        for (int offset = 0; offset < UCHAR_LEN; ++offset) {
            char c = str.charAt(begin + offset);
            int value;
            if (c >= '0' && c <= '9') {
                value = c - '0';
            } else if (c >= 'a' && c <= 'f') {
                value = c - 'a' + BASE10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + BASE10;
            } else {
                // not an hexadecimal digit: the caller re-emits the sequence as is
                return 0;
            }
            // most significant nibble first
            xc |= value << bits;
            bits -= NIBBLE_BITS;
        }
        strb.append(xc);
        return UCHAR_LEN;
    }

    /**
     * Escapes a String representation, expand non-ASCII characters as Unicode escape sequence.
     * <p>
     * The result is enclosed between two occurrences of the delimiter. Both single and double
     * quotes as well as the backslash are escaped with a backslash whatever the delimiter is;
     * backspace, tab, newline, form-feed and carriage-return are written as their 2 characters
     * escape sequence. NUL characters (code 0) are dropped. Any other character outside of the
     * 32 to 127 range is written as '\\u' followed by 4 hexadecimal digits.
     * </p>
     * @param str the string to escape
     * @param delim the delimiter character written before and after the escaped content
     * @return the escaped representation, null if str is null
     */
    public static String escapeString(String str, char delim) {
        if (str == null) {
            return null;
        }
        final int length = str.length();
        StringBuilder strb = new StringBuilder(length + 2);
        strb.append(delim);
        for (int i = 0; i < length; ++i) {
            char c = str.charAt(i);
            switch (c) {
                case 0:
                    // NUL characters are skipped
                    continue;
                case '\b':
                    strb.append("\\b");
                    break;
                case '\t':
                    strb.append("\\t");
                    break;
                case '\n':
                    strb.append("\\n");
                    break;
                case '\f':
                    strb.append("\\f");
                    break;
                case '\r':
                    strb.append("\\r");
                    break;
                case '\"':
                    strb.append("\\\"");
                    break;
                case '\'':
                    strb.append("\\\'");
                    break;
                case '\\':
                    strb.append("\\\\");
                    break;
                default:
                    if (c >= FIRST_ASCII && c <= LAST_ASCII) {
                        strb.append(c);
                    } else {
                        // convert to Unicode escape sequence, left-padded with zeros to 4 digits
                        strb.append('\\');
                        strb.append('u');
                        String hex = Integer.toHexString(c);
                        for (int h = hex.length(); h < UCHAR_LEN; ++h) {
                            strb.append('0');
                        }
                        strb.append(hex);
                    }
            }
        }
        strb.append(delim);
        return strb.toString();
    }
}