/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3;

import java.io.IOException;
import java.io.Writer;

import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
import org.apache.commons.lang3.text.translate.OctalUnescaper;
import org.apache.commons.lang3.text.translate.UnicodeEscaper;
import org.apache.commons.lang3.text.translate.UnicodeUnescaper;

/**
 * <p>Escapes and unescapes {@code String}s for
 * Java, Java Script, HTML and XML.</p>
 *
 * <p>Every public method is a thin wrapper that hands its argument to one of
 * the {@code ESCAPE_*} / {@code UNESCAPE_*} translator constants declared
 * below; the constants are public so they can be reused as building blocks
 * for custom translators.</p>
 *
 * <p>#ThreadSafe#</p>
 * @since 2.0
 * @version $Id: StringEscapeUtils.java 1148164 2011-07-19 05:08:14Z bayard $
 */
public class StringEscapeUtils {

    /* ESCAPE TRANSLATORS */

    /**
     * Translator object for escaping Java.
     *
     * While {@link #escapeJava(String)} is the expected method of use, this
     * object allows the Java escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_JAVA =
          new LookupTranslator(
            new String[][] {
              {"\"", "\\\""},
              {"\\", "\\\\"},
          }).with(
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
          ).with(
            UnicodeEscaper.outsideOf(32, 0x7f)
        );

    /**
     * Translator object for escaping EcmaScript/JavaScript.
     *
     * While {@link #escapeEcmaScript(String)} is the expected method of use, this
     * object allows the EcmaScript escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_ECMASCRIPT =
        new AggregateTranslator(
            new LookupTranslator(
                      new String[][] {
                            {"'", "\\'"},
                            {"\"", "\\\""},
                            {"\\", "\\\\"},
                            {"/", "\\/"}
                      }),
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
            UnicodeEscaper.outsideOf(32, 0x7f)
        );

    /**
     * Translator object for escaping XML.
     *
     * While {@link #escapeXml(String)} is the expected method of use, this
     * object allows the XML escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_XML =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.APOS_ESCAPE())
        );

    /**
     * Translator object for escaping HTML version 3.0.
     *
     * While {@link #escapeHtml3(String)} is the expected method of use, this
     * object allows the HTML escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_HTML3 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
        );

    /**
     * Translator object for escaping HTML version 4.0.
     *
     * While {@link #escapeHtml4(String)} is the expected method of use, this
     * object allows the HTML escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_HTML4 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
        );

    /**
     * Translator object for escaping individual Comma Separated Values.
     *
     * While {@link #escapeCsv(String)} is the expected method of use, this
     * object allows the CSV escaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();

    // TODO: Create a parent class - 'SinglePassTranslator' ?
    //       It would handle the index checking + length returning,
    //       and could also have an optimization check method.
    /**
     * Translator that escapes one whole CSV column value in a single call.
     *
     * It only accepts being invoked at index 0 and reports the full input
     * length as its result, so it always processes the entire input at once.
     */
    static class CsvEscaper extends CharSequenceTranslator {

        private static final char CSV_DELIMITER = ',';
        private static final char CSV_QUOTE = '"';
        private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
        private static final char[] CSV_SEARCH_CHARS =
            new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};

        /**
         * Writes the CSV-escaped form of the whole {@code input} to {@code out}.
         *
         * @param input the column value to escape
         * @param index must be 0; this translator works on the whole input at once
         * @param out the writer receiving the escaped value
         * @return {@code input.length()}
         * @throws IOException if writing to {@code out} fails
         * @throws IllegalStateException if {@code index} is not 0
         */
        @Override
        public int translate(CharSequence input, int index, Writer out) throws IOException {

            if (index != 0) {
                throw new IllegalStateException("CsvEscaper should never reach the [1] index");
            }

            final String text = input.toString();
            if (StringUtils.containsNone(text, CSV_SEARCH_CHARS)) {
                // nothing that needs protecting: emit the value as it is
                out.write(text);
            } else {
                // wrap in quotes and double every embedded quote
                out.write(CSV_QUOTE);
                out.write(StringUtils.replace(text, CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
                out.write(CSV_QUOTE);
            }
            return input.length();
        }
    }

    /* UNESCAPE TRANSLATORS */

    /**
     * Translator object for unescaping escaped Java.
     *
     * While {@link #unescapeJava(String)} is the expected method of use, this
     * object allows the Java unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
    public static final CharSequenceTranslator UNESCAPE_JAVA =
        new AggregateTranslator(
            new OctalUnescaper(),     // .between('\1', '\377'),
            new UnicodeUnescaper(),
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
            new LookupTranslator(
                      new String[][] {
                            {"\\\\", "\\"},
                            {"\\\"", "\""},
                            {"\\'", "'"},
                            {"\\", ""}
                      })
        );

    /**
     * Translator object for unescaping escaped EcmaScript.
     *
     * While {@link #unescapeEcmaScript(String)} is the expected method of use, this
     * object allows the EcmaScript unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;

    /**
     * Translator object for unescaping escaped HTML 3.0.
     *
     * While {@link #unescapeHtml3(String)} is the expected method of use, this
     * object allows the HTML unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_HTML3 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new NumericEntityUnescaper()
        );

    /**
     * Translator object for unescaping escaped HTML 4.0.
     *
     * While {@link #unescapeHtml4(String)} is the expected method of use, this
     * object allows the HTML unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_HTML4 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
            new NumericEntityUnescaper()
        );

    /**
     * Translator object for unescaping escaped XML.
     *
     * While {@link #unescapeXml(String)} is the expected method of use, this
     * object allows the XML unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_XML =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
            new NumericEntityUnescaper()
        );

    /**
     * Translator object for unescaping escaped Comma Separated Value entries.
     *
     * While {@link #unescapeCsv(String)} is the expected method of use, this
     * object allows the CSV unescaping functionality to be used
     * as the foundation for a custom translator.
     *
     * @since 3.0
     */
    public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();

    /**
     * Translator that unescapes one whole CSV column value in a single call.
     *
     * It only accepts being invoked at index 0 and reports the full input
     * length as its result, so it always processes the entire input at once.
     */
    static class CsvUnescaper extends CharSequenceTranslator {

        private static final char CSV_DELIMITER = ',';
        private static final char CSV_QUOTE = '"';
        private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
        private static final char[] CSV_SEARCH_CHARS =
            new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};

        /**
         * Writes the unescaped form of the whole {@code input} to {@code out}.
         *
         * <p>The input is written unchanged unless it both starts and ends with
         * a double quote and the text between those quotes contains one of the
         * characters in {@code CSV_SEARCH_CHARS}; in that case the enclosing
         * quotes are dropped and each pair of double quotes is collapsed to one.</p>
         *
         * @param input the column value to unescape
         * @param index must be 0; this translator works on the whole input at once
         * @param out the writer receiving the unescaped value
         * @return {@code input.length()}
         * @throws IOException if writing to {@code out} fails
         * @throws IllegalStateException if {@code index} is not 0
         */
        @Override
        public int translate(CharSequence input, int index, Writer out) throws IOException {

            if (index != 0) {
                throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
            }

            final int length = input.length();
            final String text = input.toString();

            // A quoted value needs a distinct opening and closing quote. Without the
            // length check a lone '"' would count as both, and stripping the quotes
            // would ask for subSequence(1, 0), which is out of bounds.
            if (length < 2 || input.charAt(0) != CSV_QUOTE || input.charAt(length - 1) != CSV_QUOTE) {
                out.write(text);
                return length;
            }

            // strip quotes
            final String quoteless = input.subSequence(1, length - 1).toString();

            if (StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS)) {
                // deal with escaped quotes; ie) ""
                out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
            } else {
                // quoting was not required, so the value is passed through untouched
                out.write(text);
            }
            return length;
        }
    }

    /* Helper functions */

    /**
     * <p>{@code StringEscapeUtils} instances should NOT be constructed in
     * standard programming.</p>
     *
     * <p>Instead, the class should be used as:
     * <pre>StringEscapeUtils.escapeJava("foo");</pre></p>
     *
     * <p>This constructor is public to permit tools that require a JavaBean
     * instance to operate.</p>
     */
    public StringEscapeUtils() {
      super();
    }

    // Java and JavaScript
    //--------------------------------------------------------------------------
    /**
     * <p>Escapes the characters in a {@code String} using Java String rules.</p>
     *
     * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
     *
     * <p>So a tab becomes the characters {@code '\\'} and
     * {@code 't'}.</p>
     *
     * <p>The only difference between Java strings and JavaScript strings
     * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
     *
     * <p>Example:
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn't say, \"Stop!\"
     * </pre>
     * </p>
     *
     * @param input  String to escape values in, may be null
     * @return String with escaped values, {@code null} if null string input
     */
    public static final String escapeJava(String input) {
        return ESCAPE_JAVA.translate(input);
    }

    /**
     * <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p>
     * <p>Escapes any values it finds into their EcmaScript String form.
     * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
     *
     * <p>So a tab becomes the characters {@code '\\'} and
     * {@code 't'}.</p>
     *
     * <p>The only difference between Java strings and EcmaScript strings
     * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
     *
     * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p>
     *
     * <p>Example:
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn\'t say, \"Stop!\"
     * </pre>
     * </p>
     *
     * @param input  String to escape values in, may be null
     * @return String with escaped values, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String escapeEcmaScript(String input) {
        return ESCAPE_ECMASCRIPT.translate(input);
    }

    /**
     * <p>Unescapes any Java literals found in the {@code String}.
     * For example, it will turn a sequence of {@code '\'} and
     * {@code 'n'} into a newline character, unless the {@code '\'}
     * is preceded by another {@code '\'}.</p>
     *
     * @param input  the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     */
    public static final String unescapeJava(String input) {
        return UNESCAPE_JAVA.translate(input);
    }

    /**
     * <p>Unescapes any EcmaScript literals found in the {@code String}.</p>
     *
     * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
     * into a newline character, unless the {@code '\'} is preceded by another
     * {@code '\'}.</p>
     *
     * @see #unescapeJava(String)
     * @param input  the {@code String} to unescape, may be null
     * @return A new unescaped {@code String}, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String unescapeEcmaScript(String input) {
        return UNESCAPE_ECMASCRIPT.translate(input);
    }

    // HTML and XML
    //--------------------------------------------------------------------------
    /**
     * <p>Escapes the characters in a {@code String} using HTML entities.</p>
     *
     * <p>
     * For example:
     * </p>
     * <p><code>"bread" &amp; "butter"</code></p>
     * becomes:
     * <p>
     * <code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code>.
     * </p>
     *
     * <p>Supports all known HTML 4.0 entities, including funky accents.
     * Note that the commonly used apostrophe escape character (&amp;apos;)
     * is not a legal entity and so is not supported). </p>
     *
     * @param input  the {@code String} to escape, may be null
     * @return a new escaped {@code String}, {@code null} if null string input
     *
     * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
     * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
     * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
     * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
     * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
     *
     * @since 3.0
     */
    public static final String escapeHtml4(String input) {
        return ESCAPE_HTML4.translate(input);
    }

    /**
     * <p>Escapes the characters in a {@code String} using HTML entities.</p>
     * <p>Supports only the HTML 3.0 entities. </p>
     *
     * @param input  the {@code String} to escape, may be null
     * @return a new escaped {@code String}, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String escapeHtml3(String input) {
        return ESCAPE_HTML3.translate(input);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Unescapes a string containing entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes. Supports HTML 4.0 entities.</p>
     *
     * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
     * will become "&lt;Fran&ccedil;ais&gt;"</p>
     *
     * <p>If an entity is unrecognized, it is left alone, and inserted
     * verbatim into the result string. e.g. "&amp;gt;&amp;zzzz;x" will
     * become "&gt;&amp;zzzz;x".</p>
     *
     * @param input  the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String unescapeHtml4(String input) {
        return UNESCAPE_HTML4.translate(input);
    }

    /**
     * <p>Unescapes a string containing entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes. Supports only HTML 3.0 entities.</p>
     *
     * @param input  the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     *
     * @since 3.0
     */
    public static final String unescapeHtml3(String input) {
        return UNESCAPE_HTML3.translate(input);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Escapes the characters in a {@code String} using XML entities.</p>
     *
     * <p>For example: <tt>"bread" &amp; "butter"</tt> =&gt;
     * <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
     * </p>
     *
     * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
     * Does not support DTDs or external entities.</p>
     *
     * <p>Note that unicode characters greater than 0x7f are as of 3.0, no longer
     *    escaped. If you still wish this functionality, you can achieve it
     *    via the following:
     * {@code StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );}</p>
     *
     * @param input  the {@code String} to escape, may be null
     * @return a new escaped {@code String}, {@code null} if null string input
     * @see #unescapeXml(java.lang.String)
     */
    public static final String escapeXml(String input) {
        return ESCAPE_XML.translate(input);
    }


    //-----------------------------------------------------------------------
    /**
     * <p>Unescapes a string containing XML entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes.</p>
     *
     * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
     * Does not support DTDs or external entities.</p>
     *
     * <p>Note that numerical \\u unicode codes are unescaped to their respective
     *    unicode characters. This may change in future releases. </p>
     *
     * @param input  the {@code String} to unescape, may be null
     * @return a new unescaped {@code String}, {@code null} if null string input
     * @see #escapeXml(String)
     */
    public static final String unescapeXml(String input) {
        return UNESCAPE_XML.translate(input);
    }


    //-----------------------------------------------------------------------

    /**
     * <p>Returns a {@code String} value for a CSV column enclosed in double quotes,
     * if required.</p>
     *
     * <p>If the value contains a comma, newline or double quote, then the
     *    String value is returned enclosed in double quotes.</p>
     *
     * <p>Any double quote characters in the value are escaped with another double quote.</p>
     *
     * <p>If the value does not contain a comma, newline or double quote, then the
     *    String value is returned unchanged.</p>
     *
     * <p>see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
     * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.</p>
     *
     * @param input the input CSV column String, may be null
     * @return the input String, enclosed in double quotes if the value contains a comma,
     * newline or double quote, {@code null} if null string input
     * @since 2.4
     */
    public static final String escapeCsv(String input) {
        return ESCAPE_CSV.translate(input);
    }

    /**
     * <p>Returns a {@code String} value for an unescaped CSV column. </p>
     *
     * <p>If the value is enclosed in double quotes, and contains a comma, newline
     *    or double quote, then quotes are removed.
     * </p>
     *
     * <p>Any double quote escaped characters (a pair of double quotes) are unescaped
     *    to just one double quote. </p>
     *
     * <p>If the value is not enclosed in double quotes, or is and does not contain a
     *    comma, newline or double quote, then the String value is returned unchanged.
     *    A value consisting of a single double quote is not considered enclosed and
     *    is likewise returned unchanged.</p>
     *
     * <p>see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
     * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.</p>
     *
     * @param input the input CSV column String, may be null
     * @return the input String, with enclosing double quotes removed and embedded double
     * quotes unescaped, {@code null} if null string input
     * @since 2.4
     */
    public static final String unescapeCsv(String input) {
        return UNESCAPE_CSV.translate(input);
    }

}