View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    * 
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.lang3;
18  
19  import java.io.IOException;
20  import java.io.Writer;
21  
22  import org.apache.commons.lang3.text.translate.AggregateTranslator;
23  import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
24  import org.apache.commons.lang3.text.translate.EntityArrays;
25  import org.apache.commons.lang3.text.translate.JavaUnicodeEscaper;
26  import org.apache.commons.lang3.text.translate.LookupTranslator;
27  import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
28  import org.apache.commons.lang3.text.translate.OctalUnescaper;
29  import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
30  
31  /**
32   * <p>Escapes and unescapes {@code String}s for
33   * Java, Java Script, HTML and XML.</p>
34   *
35   * <p>#ThreadSafe#</p>
36   * @since 2.0
37   * @version $Id: StringEscapeUtils.java 1436770 2013-01-22 07:09:45Z ggregory $
38   */
39  public class StringEscapeUtils {
40  
41      /* ESCAPE TRANSLATORS */
42  
43      /**
44       * Translator object for escaping Java. 
45       * 
46       * While {@link #escapeJava(String)} is the expected method of use, this 
47       * object allows the Java escaping functionality to be used 
48       * as the foundation for a custom translator. 
49       *
50       * @since 3.0
51       */
52      public static final CharSequenceTranslator ESCAPE_JAVA = 
53            new LookupTranslator(
54              new String[][] { 
55                {"\"", "\\\""},
56                {"\\", "\\\\"},
57            }).with(
58              new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
59            ).with(
60              JavaUnicodeEscaper.outsideOf(32, 0x7f) 
61          );
62  
63      /**
64       * Translator object for escaping EcmaScript/JavaScript. 
65       * 
66       * While {@link #escapeEcmaScript(String)} is the expected method of use, this 
67       * object allows the EcmaScript escaping functionality to be used 
68       * as the foundation for a custom translator. 
69       *
70       * @since 3.0
71       */
72      public static final CharSequenceTranslator ESCAPE_ECMASCRIPT = 
73          new AggregateTranslator(
74              new LookupTranslator(
75                        new String[][] { 
76                              {"'", "\\'"},
77                              {"\"", "\\\""},
78                              {"\\", "\\\\"},
79                              {"/", "\\/"}
80                        }),
81              new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
82              JavaUnicodeEscaper.outsideOf(32, 0x7f) 
83          );
84              
85      /**
86       * Translator object for escaping XML.
87       * 
88       * While {@link #escapeXml(String)} is the expected method of use, this 
89       * object allows the XML escaping functionality to be used 
90       * as the foundation for a custom translator. 
91       *
92       * @since 3.0
93       */
94      public static final CharSequenceTranslator ESCAPE_XML = 
95          new AggregateTranslator(
96              new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
97              new LookupTranslator(EntityArrays.APOS_ESCAPE())
98          );
99  
100     /**
101      * Translator object for escaping HTML version 3.0.
102      * 
103      * While {@link #escapeHtml3(String)} is the expected method of use, this 
104      * object allows the HTML escaping functionality to be used 
105      * as the foundation for a custom translator. 
106      *
107      * @since 3.0
108      */
109     public static final CharSequenceTranslator ESCAPE_HTML3 = 
110         new AggregateTranslator(
111             new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
112             new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
113         );
114 
115     /**
116      * Translator object for escaping HTML version 4.0.
117      * 
118      * While {@link #escapeHtml4(String)} is the expected method of use, this 
119      * object allows the HTML escaping functionality to be used 
120      * as the foundation for a custom translator. 
121      *
122      * @since 3.0
123      */
124     public static final CharSequenceTranslator ESCAPE_HTML4 = 
125         new AggregateTranslator(
126             new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
127             new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
128             new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
129         );
130 
131     /**
132      * Translator object for escaping individual Comma Separated Values. 
133      * 
134      * While {@link #escapeCsv(String)} is the expected method of use, this 
135      * object allows the CSV escaping functionality to be used 
136      * as the foundation for a custom translator. 
137      *
138      * @since 3.0
139      */
140     public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();
141 
142     // TODO: Create a parent class - 'SinglePassTranslator' ?
143     //       It would handle the index checking + length returning, 
144     //       and could also have an optimization check method.
145     static class CsvEscaper extends CharSequenceTranslator {
146 
147         private static final char CSV_DELIMITER = ',';
148         private static final char CSV_QUOTE = '"';
149         private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
150         private static final char[] CSV_SEARCH_CHARS = 
151             new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
152 
153         @Override
154         public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
155 
156             if(index != 0) {
157                 throw new IllegalStateException("CsvEscaper should never reach the [1] index");
158             }
159 
160             if (StringUtils.containsNone(input.toString(), CSV_SEARCH_CHARS)) {
161                 out.write(input.toString());
162             } else {
163                 out.write(CSV_QUOTE);
164                 out.write(StringUtils.replace(input.toString(), CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
165                 out.write(CSV_QUOTE);
166             }
167             return input.length();
168         }
169     }
170 
171     /* UNESCAPE TRANSLATORS */
172 
173     /**
174      * Translator object for unescaping escaped Java. 
175      * 
176      * While {@link #unescapeJava(String)} is the expected method of use, this 
177      * object allows the Java unescaping functionality to be used 
178      * as the foundation for a custom translator. 
179      *
180      * @since 3.0
181      */
182     // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
183     public static final CharSequenceTranslator UNESCAPE_JAVA = 
184         new AggregateTranslator(
185             new OctalUnescaper(),     // .between('\1', '\377'),
186             new UnicodeUnescaper(),
187             new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
188             new LookupTranslator(
189                       new String[][] { 
190                             {"\\\\", "\\"},
191                             {"\\\"", "\""},
192                             {"\\'", "'"},
193                             {"\\", ""}
194                       })
195         );
196 
197     /**
198      * Translator object for unescaping escaped EcmaScript. 
199      * 
200      * While {@link #unescapeEcmaScript(String)} is the expected method of use, this 
201      * object allows the EcmaScript unescaping functionality to be used 
202      * as the foundation for a custom translator. 
203      *
204      * @since 3.0
205      */
206     public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;
207 
208     /**
209      * Translator object for unescaping escaped HTML 3.0. 
210      * 
211      * While {@link #unescapeHtml3(String)} is the expected method of use, this 
212      * object allows the HTML unescaping functionality to be used 
213      * as the foundation for a custom translator. 
214      *
215      * @since 3.0
216      */
217     public static final CharSequenceTranslator UNESCAPE_HTML3 = 
218         new AggregateTranslator(
219             new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
220             new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
221             new NumericEntityUnescaper()
222         );
223 
224     /**
225      * Translator object for unescaping escaped HTML 4.0. 
226      * 
227      * While {@link #unescapeHtml4(String)} is the expected method of use, this 
228      * object allows the HTML unescaping functionality to be used 
229      * as the foundation for a custom translator. 
230      *
231      * @since 3.0
232      */
233     public static final CharSequenceTranslator UNESCAPE_HTML4 = 
234         new AggregateTranslator(
235             new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
236             new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
237             new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
238             new NumericEntityUnescaper()
239         );
240 
241     /**
242      * Translator object for unescaping escaped XML.
243      * 
244      * While {@link #unescapeXml(String)} is the expected method of use, this 
245      * object allows the XML unescaping functionality to be used 
246      * as the foundation for a custom translator. 
247      *
248      * @since 3.0
249      */
250     public static final CharSequenceTranslator UNESCAPE_XML = 
251         new AggregateTranslator(
252             new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
253             new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
254             new NumericEntityUnescaper()
255         );
256 
257     /**
258      * Translator object for unescaping escaped Comma Separated Value entries.
259      * 
260      * While {@link #unescapeCsv(String)} is the expected method of use, this 
261      * object allows the CSV unescaping functionality to be used 
262      * as the foundation for a custom translator. 
263      *
264      * @since 3.0
265      */
266     public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();
267 
268     static class CsvUnescaper extends CharSequenceTranslator {
269 
270         private static final char CSV_DELIMITER = ',';
271         private static final char CSV_QUOTE = '"';
272         private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
273         private static final char[] CSV_SEARCH_CHARS = 
274             new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
275 
276         @Override
277         public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
278 
279             if(index != 0) {
280                 throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
281             }
282 
283             if ( input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE ) {
284                 out.write(input.toString());
285                 return input.length();
286             }
287 
288             // strip quotes
289             final String quoteless = input.subSequence(1, input.length() - 1).toString();
290 
291             if ( StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS) ) {
292                 // deal with escaped quotes; ie) ""
293                 out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
294             } else {
295                 out.write(input.toString());
296             }
297             return input.length();
298         }
299     }
300 
301     /* Helper functions */
302 
303     /**
304      * <p>{@code StringEscapeUtils} instances should NOT be constructed in
305      * standard programming.</p>
306      *
307      * <p>Instead, the class should be used as:
308      * <pre>StringEscapeUtils.escapeJava("foo");</pre></p>
309      *
310      * <p>This constructor is public to permit tools that require a JavaBean
311      * instance to operate.</p>
312      */
313     public StringEscapeUtils() {
314       super();
315     }
316 
317     // Java and JavaScript
318     //--------------------------------------------------------------------------
319     /**
320      * <p>Escapes the characters in a {@code String} using Java String rules.</p>
321      *
322      * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
323      *
324      * <p>So a tab becomes the characters {@code '\\'} and
325      * {@code 't'}.</p>
326      *
327      * <p>The only difference between Java strings and JavaScript strings
328      * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
329      *
330      * <p>Example:
331      * <pre>
332      * input string: He didn't say, "Stop!"
333      * output string: He didn't say, \"Stop!\"
334      * </pre>
335      * </p>
336      *
337      * @param input  String to escape values in, may be null
338      * @return String with escaped values, {@code null} if null string input
339      */
340     public static final String escapeJava(final String input) {
341         return ESCAPE_JAVA.translate(input);
342     }
343 
344     /**
345      * <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p>
346      * <p>Escapes any values it finds into their EcmaScript String form.
347      * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
348      *
349      * <p>So a tab becomes the characters {@code '\\'} and
350      * {@code 't'}.</p>
351      *
352      * <p>The only difference between Java strings and EcmaScript strings
353      * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
354      *
355      * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p>
356      *
357      * <p>Example:
358      * <pre>
359      * input string: He didn't say, "Stop!"
360      * output string: He didn\'t say, \"Stop!\"
361      * </pre>
362      * </p>
363      *
364      * @param input  String to escape values in, may be null
365      * @return String with escaped values, {@code null} if null string input
366      *
367      * @since 3.0
368      */
369     public static final String escapeEcmaScript(final String input) {
370         return ESCAPE_ECMASCRIPT.translate(input);
371     }
372 
373     /**
374      * <p>Unescapes any Java literals found in the {@code String}.
375      * For example, it will turn a sequence of {@code '\'} and
376      * {@code 'n'} into a newline character, unless the {@code '\'}
377      * is preceded by another {@code '\'}.</p>
378      * 
379      * @param input  the {@code String} to unescape, may be null
380      * @return a new unescaped {@code String}, {@code null} if null string input
381      */
382     public static final String unescapeJava(final String input) {
383         return UNESCAPE_JAVA.translate(input);
384     }
385 
386     /**
387      * <p>Unescapes any EcmaScript literals found in the {@code String}.</p>
388      *
389      * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
390      * into a newline character, unless the {@code '\'} is preceded by another
391      * {@code '\'}.</p>
392      *
393      * @see #unescapeJava(String)
394      * @param input  the {@code String} to unescape, may be null
395      * @return A new unescaped {@code String}, {@code null} if null string input
396      *
397      * @since 3.0
398      */
399     public static final String unescapeEcmaScript(final String input) {
400         return UNESCAPE_ECMASCRIPT.translate(input);
401     }
402 
403     // HTML and XML
404     //--------------------------------------------------------------------------
405     /**
406      * <p>Escapes the characters in a {@code String} using HTML entities.</p>
407      *
408      * <p>
409      * For example:
410      * </p> 
411      * <p><code>"bread" & "butter"</code></p>
412      * becomes:
413      * <p>
414      * <code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code>.
415      * </p>
416      *
417      * <p>Supports all known HTML 4.0 entities, including funky accents.
418      * Note that the commonly used apostrophe escape character (&amp;apos;)
419      * is not a legal entity and so is not supported). </p>
420      *
421      * @param input  the {@code String} to escape, may be null
422      * @return a new escaped {@code String}, {@code null} if null string input
423      * 
424      * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
425      * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
426      * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
427      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
428      * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
429      * 
430      * @since 3.0
431      */
432     public static final String escapeHtml4(final String input) {
433         return ESCAPE_HTML4.translate(input);
434     }
435 
436     /**
437      * <p>Escapes the characters in a {@code String} using HTML entities.</p>
438      * <p>Supports only the HTML 3.0 entities. </p>
439      *
440      * @param input  the {@code String} to escape, may be null
441      * @return a new escaped {@code String}, {@code null} if null string input
442      * 
443      * @since 3.0
444      */
445     public static final String escapeHtml3(final String input) {
446         return ESCAPE_HTML3.translate(input);
447     }
448                 
449     //-----------------------------------------------------------------------
450     /**
451      * <p>Unescapes a string containing entity escapes to a string
452      * containing the actual Unicode characters corresponding to the
453      * escapes. Supports HTML 4.0 entities.</p>
454      *
455      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
456      * will become "&lt;Fran&ccedil;ais&gt;"</p>
457      *
458      * <p>If an entity is unrecognized, it is left alone, and inserted
459      * verbatim into the result string. e.g. "&amp;gt;&amp;zzzz;x" will
460      * become "&gt;&amp;zzzz;x".</p>
461      *
462      * @param input  the {@code String} to unescape, may be null
463      * @return a new unescaped {@code String}, {@code null} if null string input
464      * 
465      * @since 3.0
466      */
467     public static final String unescapeHtml4(final String input) {
468         return UNESCAPE_HTML4.translate(input);
469     }
470 
471     /**
472      * <p>Unescapes a string containing entity escapes to a string
473      * containing the actual Unicode characters corresponding to the
474      * escapes. Supports only HTML 3.0 entities.</p>
475      *
476      * @param input  the {@code String} to unescape, may be null
477      * @return a new unescaped {@code String}, {@code null} if null string input
478      * 
479      * @since 3.0
480      */
481     public static final String unescapeHtml3(final String input) {
482         return UNESCAPE_HTML3.translate(input);
483     }
484 
485     //-----------------------------------------------------------------------
486     /**
487      * <p>Escapes the characters in a {@code String} using XML entities.</p>
488      *
489      * <p>For example: <tt>"bread" & "butter"</tt> =>
490      * <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
491      * </p>
492      *
493      * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
494      * Does not support DTDs or external entities.</p>
495      *
496      * <p>Note that Unicode characters greater than 0x7f are as of 3.0, no longer 
497      *    escaped. If you still wish this functionality, you can achieve it 
498      *    via the following: 
499      * {@code StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );}</p>
500      *
501      * @param input  the {@code String} to escape, may be null
502      * @return a new escaped {@code String}, {@code null} if null string input
503      * @see #unescapeXml(java.lang.String)
504      */
505     public static final String escapeXml(final String input) {
506         return ESCAPE_XML.translate(input);
507     }
508                 
509 
510     //-----------------------------------------------------------------------
511     /**
512      * <p>Unescapes a string containing XML entity escapes to a string
513      * containing the actual Unicode characters corresponding to the
514      * escapes.</p>
515      *
516      * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
517      * Does not support DTDs or external entities.</p>
518      *
519      * <p>Note that numerical \\u Unicode codes are unescaped to their respective 
520      *    Unicode characters. This may change in future releases. </p>
521      *
522      * @param input  the {@code String} to unescape, may be null
523      * @return a new unescaped {@code String}, {@code null} if null string input
524      * @see #escapeXml(String)
525      */
526     public static final String unescapeXml(final String input) {
527         return UNESCAPE_XML.translate(input);
528     }
529                 
530 
531     //-----------------------------------------------------------------------
532 
533     /**
534      * <p>Returns a {@code String} value for a CSV column enclosed in double quotes,
535      * if required.</p>
536      *
537      * <p>If the value contains a comma, newline or double quote, then the
538      *    String value is returned enclosed in double quotes.</p>
539      * </p>
540      *
541      * <p>Any double quote characters in the value are escaped with another double quote.</p>
542      *
543      * <p>If the value does not contain a comma, newline or double quote, then the
544      *    String value is returned unchanged.</p>
545      * </p>
546      *
547      * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
548      * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
549      *
550      * @param input the input CSV column String, may be null
551      * @return the input String, enclosed in double quotes if the value contains a comma,
552      * newline or double quote, {@code null} if null string input
553      * @since 2.4
554      */
555     public static final String escapeCsv(final String input) {
556         return ESCAPE_CSV.translate(input);
557     }
558 
559     /**
560      * <p>Returns a {@code String} value for an unescaped CSV column. </p>
561      *
562      * <p>If the value is enclosed in double quotes, and contains a comma, newline 
563      *    or double quote, then quotes are removed. 
564      * </p>
565      *
566      * <p>Any double quote escaped characters (a pair of double quotes) are unescaped 
567      *    to just one double quote. </p>
568      *
569      * <p>If the value is not enclosed in double quotes, or is and does not contain a 
570      *    comma, newline or double quote, then the String value is returned unchanged.</p>
571      * </p>
572      *
573      * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
574      * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
575      *
576      * @param input the input CSV column String, may be null
577      * @return the input String, with enclosing double quotes removed and embedded double 
578      * quotes unescaped, {@code null} if null string input
579      * @since 2.4
580      */
581     public static final String unescapeCsv(final String input) {
582         return UNESCAPE_CSV.translate(input);
583     }
584 
585 }