001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang3;
018    
019    import java.io.IOException;
020    import java.io.Writer;
021    
022    import org.apache.commons.lang3.text.translate.AggregateTranslator;
023    import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
024    import org.apache.commons.lang3.text.translate.EntityArrays;
025    import org.apache.commons.lang3.text.translate.LookupTranslator;
026    import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
027    import org.apache.commons.lang3.text.translate.OctalUnescaper;
028    import org.apache.commons.lang3.text.translate.UnicodeEscaper;
029    import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
030    
031    /**
 * <p>Escapes and unescapes {@code String}s for
 * Java, JavaScript (EcmaScript), HTML, XML and CSV.</p>
034     *
035     * <p>#ThreadSafe#</p>
036     * @since 2.0
037     * @version $Id: StringEscapeUtils.java 1148520 2011-07-19 20:53:23Z ggregory $
038     */
039    public class StringEscapeUtils {
040    
041        /* ESCAPE TRANSLATORS */
042    
043        /**
044         * Translator object for escaping Java. 
045         * 
046         * While {@link #escapeJava(String)} is the expected method of use, this 
047         * object allows the Java escaping functionality to be used 
048         * as the foundation for a custom translator. 
049         *
050         * @since 3.0
051         */
052        public static final CharSequenceTranslator ESCAPE_JAVA = 
053              new LookupTranslator(
054                new String[][] { 
055                  {"\"", "\\\""},
056                  {"\\", "\\\\"},
057              }).with(
058                new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
059              ).with(
060                UnicodeEscaper.outsideOf(32, 0x7f) 
061            );
062    
063        /**
064         * Translator object for escaping EcmaScript/JavaScript. 
065         * 
066         * While {@link #escapeEcmaScript(String)} is the expected method of use, this 
067         * object allows the EcmaScript escaping functionality to be used 
068         * as the foundation for a custom translator. 
069         *
070         * @since 3.0
071         */
072        public static final CharSequenceTranslator ESCAPE_ECMASCRIPT = 
073            new AggregateTranslator(
074                new LookupTranslator(
075                          new String[][] { 
076                                {"'", "\\'"},
077                                {"\"", "\\\""},
078                                {"\\", "\\\\"},
079                                {"/", "\\/"}
080                          }),
081                new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
082                UnicodeEscaper.outsideOf(32, 0x7f) 
083            );
084                
085        /**
086         * Translator object for escaping XML.
087         * 
088         * While {@link #escapeXml(String)} is the expected method of use, this 
089         * object allows the XML escaping functionality to be used 
090         * as the foundation for a custom translator. 
091         *
092         * @since 3.0
093         */
094        public static final CharSequenceTranslator ESCAPE_XML = 
095            new AggregateTranslator(
096                new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
097                new LookupTranslator(EntityArrays.APOS_ESCAPE())
098            );
099    
100        /**
101         * Translator object for escaping HTML version 3.0.
102         * 
103         * While {@link #escapeHtml3(String)} is the expected method of use, this 
104         * object allows the HTML escaping functionality to be used 
105         * as the foundation for a custom translator. 
106         *
107         * @since 3.0
108         */
109        public static final CharSequenceTranslator ESCAPE_HTML3 = 
110            new AggregateTranslator(
111                new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
112                new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
113            );
114    
115        /**
116         * Translator object for escaping HTML version 4.0.
117         * 
118         * While {@link #escapeHtml4(String)} is the expected method of use, this 
119         * object allows the HTML escaping functionality to be used 
120         * as the foundation for a custom translator. 
121         *
122         * @since 3.0
123         */
124        public static final CharSequenceTranslator ESCAPE_HTML4 = 
125            new AggregateTranslator(
126                new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
127                new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
128                new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
129            );
130    
131        /**
132         * Translator object for escaping individual Comma Separated Values. 
133         * 
134         * While {@link #escapeCsv(String)} is the expected method of use, this 
135         * object allows the CSV escaping functionality to be used 
136         * as the foundation for a custom translator. 
137         *
138         * @since 3.0
139         */
140        public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();
141    
142        // TODO: Create a parent class - 'SinglePassTranslator' ?
143        //       It would handle the index checking + length returning, 
144        //       and could also have an optimization check method.
145        static class CsvEscaper extends CharSequenceTranslator {
146    
147            private static final char CSV_DELIMITER = ',';
148            private static final char CSV_QUOTE = '"';
149            private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
150            private static final char[] CSV_SEARCH_CHARS = 
151                new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
152    
153            @Override
154            public int translate(CharSequence input, int index, Writer out) throws IOException {
155    
156                if(index != 0) {
157                    throw new IllegalStateException("CsvEscaper should never reach the [1] index");
158                }
159    
160                if (StringUtils.containsNone(input.toString(), CSV_SEARCH_CHARS)) {
161                    out.write(input.toString());
162                } else {
163                    out.write(CSV_QUOTE);
164                    out.write(StringUtils.replace(input.toString(), CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
165                    out.write(CSV_QUOTE);
166                }
167                return input.length();
168            }
169        }
170    
171        /* UNESCAPE TRANSLATORS */
172    
173        /**
174         * Translator object for unescaping escaped Java. 
175         * 
176         * While {@link #unescapeJava(String)} is the expected method of use, this 
177         * object allows the Java unescaping functionality to be used 
178         * as the foundation for a custom translator. 
179         *
180         * @since 3.0
181         */
182        // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
183        public static final CharSequenceTranslator UNESCAPE_JAVA = 
184            new AggregateTranslator(
185                new OctalUnescaper(),     // .between('\1', '\377'),
186                new UnicodeUnescaper(),
187                new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
188                new LookupTranslator(
189                          new String[][] { 
190                                {"\\\\", "\\"},
191                                {"\\\"", "\""},
192                                {"\\'", "'"},
193                                {"\\", ""}
194                          })
195            );
196    
197        /**
198         * Translator object for unescaping escaped EcmaScript. 
199         * 
200         * While {@link #unescapeEcmaScript(String)} is the expected method of use, this 
201         * object allows the EcmaScript unescaping functionality to be used 
202         * as the foundation for a custom translator. 
203         *
204         * @since 3.0
205         */
206        public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;
207    
208        /**
209         * Translator object for unescaping escaped HTML 3.0. 
210         * 
211         * While {@link #unescapeHtml3(String)} is the expected method of use, this 
212         * object allows the HTML unescaping functionality to be used 
213         * as the foundation for a custom translator. 
214         *
215         * @since 3.0
216         */
217        public static final CharSequenceTranslator UNESCAPE_HTML3 = 
218            new AggregateTranslator(
219                new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
220                new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
221                new NumericEntityUnescaper()
222            );
223    
224        /**
225         * Translator object for unescaping escaped HTML 4.0. 
226         * 
227         * While {@link #unescapeHtml4(String)} is the expected method of use, this 
228         * object allows the HTML unescaping functionality to be used 
229         * as the foundation for a custom translator. 
230         *
231         * @since 3.0
232         */
233        public static final CharSequenceTranslator UNESCAPE_HTML4 = 
234            new AggregateTranslator(
235                new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
236                new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
237                new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
238                new NumericEntityUnescaper()
239            );
240    
241        /**
242         * Translator object for unescaping escaped XML.
243         * 
244         * While {@link #unescapeXml(String)} is the expected method of use, this 
245         * object allows the XML unescaping functionality to be used 
246         * as the foundation for a custom translator. 
247         *
248         * @since 3.0
249         */
250        public static final CharSequenceTranslator UNESCAPE_XML = 
251            new AggregateTranslator(
252                new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
253                new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
254                new NumericEntityUnescaper()
255            );
256    
257        /**
258         * Translator object for unescaping escaped Comma Separated Value entries.
259         * 
260         * While {@link #unescapeCsv(String)} is the expected method of use, this 
261         * object allows the CSV unescaping functionality to be used 
262         * as the foundation for a custom translator. 
263         *
264         * @since 3.0
265         */
266        public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();
267    
268        static class CsvUnescaper extends CharSequenceTranslator {
269    
270            private static final char CSV_DELIMITER = ',';
271            private static final char CSV_QUOTE = '"';
272            private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
273            private static final char[] CSV_SEARCH_CHARS = 
274                new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
275    
276            @Override
277            public int translate(CharSequence input, int index, Writer out) throws IOException {
278    
279                if(index != 0) {
280                    throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
281                }
282    
283                if ( input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE ) {
284                    out.write(input.toString());
285                    return input.length();
286                }
287    
288                // strip quotes
289                String quoteless = input.subSequence(1, input.length() - 1).toString();
290    
291                if ( StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS) ) {
292                    // deal with escaped quotes; ie) ""
293                    out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
294                } else {
295                    out.write(input.toString());
296                }
297                return input.length();
298            }
299        }
300    
301        /* Helper functions */
302    
303        /**
304         * <p>{@code StringEscapeUtils} instances should NOT be constructed in
305         * standard programming.</p>
306         *
307         * <p>Instead, the class should be used as:
308         * <pre>StringEscapeUtils.escapeJava("foo");</pre></p>
309         *
310         * <p>This constructor is public to permit tools that require a JavaBean
311         * instance to operate.</p>
312         */
313        public StringEscapeUtils() {
314          super();
315        }
316    
317        // Java and JavaScript
318        //--------------------------------------------------------------------------
319        /**
320         * <p>Escapes the characters in a {@code String} using Java String rules.</p>
321         *
322         * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
323         *
324         * <p>So a tab becomes the characters {@code '\\'} and
325         * {@code 't'}.</p>
326         *
327         * <p>The only difference between Java strings and JavaScript strings
328         * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
329         *
330         * <p>Example:
331         * <pre>
332         * input string: He didn't say, "Stop!"
333         * output string: He didn't say, \"Stop!\"
334         * </pre>
335         * </p>
336         *
337         * @param input  String to escape values in, may be null
338         * @return String with escaped values, {@code null} if null string input
339         */
340        public static final String escapeJava(String input) {
341            return ESCAPE_JAVA.translate(input);
342        }
343    
344        /**
345         * <p>Escapes the characters in a {@code String} using EcmaScript String rules.</p>
346         * <p>Escapes any values it finds into their EcmaScript String form.
347         * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
348         *
349         * <p>So a tab becomes the characters {@code '\\'} and
350         * {@code 't'}.</p>
351         *
352         * <p>The only difference between Java strings and EcmaScript strings
353         * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
354         *
355         * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p>
356         *
357         * <p>Example:
358         * <pre>
359         * input string: He didn't say, "Stop!"
360         * output string: He didn\'t say, \"Stop!\"
361         * </pre>
362         * </p>
363         *
364         * @param input  String to escape values in, may be null
365         * @return String with escaped values, {@code null} if null string input
366         *
367         * @since 3.0
368         */
369        public static final String escapeEcmaScript(String input) {
370            return ESCAPE_ECMASCRIPT.translate(input);
371        }
372    
373        /**
374         * <p>Unescapes any Java literals found in the {@code String}.
375         * For example, it will turn a sequence of {@code '\'} and
376         * {@code 'n'} into a newline character, unless the {@code '\'}
377         * is preceded by another {@code '\'}.</p>
378         * 
379         * @param input  the {@code String} to unescape, may be null
380         * @return a new unescaped {@code String}, {@code null} if null string input
381         */
382        public static final String unescapeJava(String input) {
383            return UNESCAPE_JAVA.translate(input);
384        }
385    
386        /**
387         * <p>Unescapes any EcmaScript literals found in the {@code String}.</p>
388         *
389         * <p>For example, it will turn a sequence of {@code '\'} and {@code 'n'}
390         * into a newline character, unless the {@code '\'} is preceded by another
391         * {@code '\'}.</p>
392         *
393         * @see #unescapeJava(String)
394         * @param input  the {@code String} to unescape, may be null
395         * @return A new unescaped {@code String}, {@code null} if null string input
396         *
397         * @since 3.0
398         */
399        public static final String unescapeEcmaScript(String input) {
400            return UNESCAPE_ECMASCRIPT.translate(input);
401        }
402    
403        // HTML and XML
404        //--------------------------------------------------------------------------
405        /**
406         * <p>Escapes the characters in a {@code String} using HTML entities.</p>
407         *
408         * <p>
409         * For example:
410         * </p> 
411         * <p><code>"bread" & "butter"</code></p>
412         * becomes:
413         * <p>
414         * <code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code>.
415         * </p>
416         *
417         * <p>Supports all known HTML 4.0 entities, including funky accents.
418         * Note that the commonly used apostrophe escape character (&amp;apos;)
419         * is not a legal entity and so is not supported). </p>
420         *
421         * @param input  the {@code String} to escape, may be null
422         * @return a new escaped {@code String}, {@code null} if null string input
423         * 
424         * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
425         * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
426         * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
427         * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
428         * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
429         * 
430         * @since 3.0
431         */
432        public static final String escapeHtml4(String input) {
433            return ESCAPE_HTML4.translate(input);
434        }
435    
436        /**
437         * <p>Escapes the characters in a {@code String} using HTML entities.</p>
438         * <p>Supports only the HTML 3.0 entities. </p>
439         *
440         * @param input  the {@code String} to escape, may be null
441         * @return a new escaped {@code String}, {@code null} if null string input
442         * 
443         * @since 3.0
444         */
445        public static final String escapeHtml3(String input) {
446            return ESCAPE_HTML3.translate(input);
447        }
448                    
449        //-----------------------------------------------------------------------
450        /**
451         * <p>Unescapes a string containing entity escapes to a string
452         * containing the actual Unicode characters corresponding to the
453         * escapes. Supports HTML 4.0 entities.</p>
454         *
455         * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
456         * will become "&lt;Fran&ccedil;ais&gt;"</p>
457         *
458         * <p>If an entity is unrecognized, it is left alone, and inserted
459         * verbatim into the result string. e.g. "&amp;gt;&amp;zzzz;x" will
460         * become "&gt;&amp;zzzz;x".</p>
461         *
462         * @param input  the {@code String} to unescape, may be null
463         * @return a new unescaped {@code String}, {@code null} if null string input
464         * 
465         * @since 3.0
466         */
467        public static final String unescapeHtml4(String input) {
468            return UNESCAPE_HTML4.translate(input);
469        }
470    
471        /**
472         * <p>Unescapes a string containing entity escapes to a string
473         * containing the actual Unicode characters corresponding to the
474         * escapes. Supports only HTML 3.0 entities.</p>
475         *
476         * @param input  the {@code String} to unescape, may be null
477         * @return a new unescaped {@code String}, {@code null} if null string input
478         * 
479         * @since 3.0
480         */
481        public static final String unescapeHtml3(String input) {
482            return UNESCAPE_HTML3.translate(input);
483        }
484    
485        //-----------------------------------------------------------------------
486        /**
487         * <p>Escapes the characters in a {@code String} using XML entities.</p>
488         *
489         * <p>For example: <tt>"bread" & "butter"</tt> =>
490         * <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
491         * </p>
492         *
493         * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
494         * Does not support DTDs or external entities.</p>
495         *
496         * <p>Note that Unicode characters greater than 0x7f are as of 3.0, no longer 
497         *    escaped. If you still wish this functionality, you can achieve it 
498         *    via the following: 
499         * {@code StringEscapeUtils.ESCAPE_XML.with( NumericEntityEscaper.between(0x7f, Integer.MAX_VALUE) );}</p>
500         *
501         * @param input  the {@code String} to escape, may be null
502         * @return a new escaped {@code String}, {@code null} if null string input
503         * @see #unescapeXml(java.lang.String)
504         */
505        public static final String escapeXml(String input) {
506            return ESCAPE_XML.translate(input);
507        }
508                    
509    
510        //-----------------------------------------------------------------------
511        /**
512         * <p>Unescapes a string containing XML entity escapes to a string
513         * containing the actual Unicode characters corresponding to the
514         * escapes.</p>
515         *
516         * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
517         * Does not support DTDs or external entities.</p>
518         *
519         * <p>Note that numerical \\u Unicode codes are unescaped to their respective 
520         *    Unicode characters. This may change in future releases. </p>
521         *
522         * @param input  the {@code String} to unescape, may be null
523         * @return a new unescaped {@code String}, {@code null} if null string input
524         * @see #escapeXml(String)
525         */
526        public static final String unescapeXml(String input) {
527            return UNESCAPE_XML.translate(input);
528        }
529                    
530    
531        //-----------------------------------------------------------------------
532    
533        /**
534         * <p>Returns a {@code String} value for a CSV column enclosed in double quotes,
535         * if required.</p>
536         *
537         * <p>If the value contains a comma, newline or double quote, then the
538         *    String value is returned enclosed in double quotes.</p>
539         * </p>
540         *
541         * <p>Any double quote characters in the value are escaped with another double quote.</p>
542         *
543         * <p>If the value does not contain a comma, newline or double quote, then the
544         *    String value is returned unchanged.</p>
545         * </p>
546         *
547         * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
548         * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
549         *
550         * @param input the input CSV column String, may be null
551         * @return the input String, enclosed in double quotes if the value contains a comma,
552         * newline or double quote, {@code null} if null string input
553         * @since 2.4
554         */
555        public static final String escapeCsv(String input) {
556            return ESCAPE_CSV.translate(input);
557        }
558    
559        /**
560         * <p>Returns a {@code String} value for an unescaped CSV column. </p>
561         *
562         * <p>If the value is enclosed in double quotes, and contains a comma, newline 
563         *    or double quote, then quotes are removed. 
564         * </p>
565         *
566         * <p>Any double quote escaped characters (a pair of double quotes) are unescaped 
567         *    to just one double quote. </p>
568         *
569         * <p>If the value is not enclosed in double quotes, or is and does not contain a 
570         *    comma, newline or double quote, then the String value is returned unchanged.</p>
571         * </p>
572         *
573         * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
574         * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
575         *
576         * @param input the input CSV column String, may be null
577         * @return the input String, with enclosing double quotes removed and embedded double 
578         * quotes unescaped, {@code null} if null string input
579         * @since 2.4
580         */
581        public static final String unescapeCsv(String input) {
582            return UNESCAPE_CSV.translate(input);
583        }
584    
585    }