001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.tools;
018    
/**
 * Strips from text every character that an XML 1.0 parser would reject, so
 * that the remaining content can be parsed safely.  Legality is decided by
 * production [2] (Char) of the XML 1.0 specification; see
 * {@link #isXMLCharacter(char)}.
 *
 * This is a VERY non-portable class as it is meant to work just with
 * Xalan/Xerces and may remove more text and replace things that are
 * non-XML centric.
 *
 * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
 * @version $Id: XMLCleanser.java 159211 2005-03-27 23:24:21Z burton $
 */
public class XMLCleanser {

    /**
     * Returns a copy of the given string with all characters that are illegal
     * in XML 1.0 removed.  Well-formed surrogate pairs (which encode the legal
     * code points #x10000-#x10FFFF) are kept; unpaired surrogates are dropped.
     *
     * @param content the text to clean; must not be <code>null</code>.
     * @return the cleaned text, never <code>null</code>.
     */
    public static String cleanse( String content ) {

        return filter( content );

    }

    /**
     * Decodes the given bytes with the named encoding and returns the decoded
     * text with all characters that are illegal in XML 1.0 removed.
     *
     * @param content the encoded text; must not be <code>null</code>.
     * @param encoding name of the charset used to decode <code>content</code>.
     * @return the cleaned text, never <code>null</code>.
     * @throws Exception if the named encoding is not supported.
     */
    public static String cleanse( byte[] content, String encoding ) throws Exception {

        return filter( new String( content, encoding ) );

    }

    /**
     * Converts each byte to the character with the same unsigned value
     * (i.e. treats the input as ISO-8859-1) and returns only those characters
     * that are legal in XML 1.0.
     *
     * @param content the bytes to clean; must not be <code>null</code>.
     * @return a new array holding exactly the retained characters; its length
     * is the number of retained characters, with no padding.
     */
    public static char[] cleanseToCharArray( byte[] content ) {

        char[] buff = new char[ content.length ];

        int index = 0;

        for ( int i = 0; i < content.length; ++i ) {

            // Mask to avoid sign extension: a plain (char) cast would turn
            // byte 0xE9 into U+FFE9 rather than U+00E9.
            char c = (char)( content[ i ] & 0xFF );

            if ( isXMLCharacter( c ) ) {

                buff[ index ] = c;
                ++index;

            }

        }

        if ( index == buff.length ) {
            return buff;
        }

        // Trim the unused tail; leaving it would hand back trailing NUL
        // characters, which are themselves illegal in XML.
        char[] result = new char[ index ];
        System.arraycopy( buff, 0, result, 0, index );

        return result;

    }

    /**
     * Returns a copy of the given bytes without those whose unsigned value is
     * not a legal XML 1.0 character.  Each byte is judged on its own, so in
     * effect only the control bytes below 0x20 other than tab, line feed and
     * carriage return are removed.
     *
     * @param content the bytes to clean; must not be <code>null</code>.
     * @return a new array holding exactly the retained bytes; its length is
     * the number of retained bytes, with no padding.
     */
    public static byte[] cleanseToByteArray( byte[] content ) {

        byte[] buff = new byte[ content.length ];

        int index = 0;

        for ( int i = 0; i < content.length; ++i ) {

            // Mask to avoid sign extension: a plain (char) cast would map the
            // bytes 0xFE and 0xFF to U+FFFE and U+FFFF and drop them.
            char c = (char)( content[ i ] & 0xFF );

            if ( isXMLCharacter( c ) ) {

                buff[ index ] = content[ i ];
                ++index;

            }

        }

        if ( index == buff.length ) {
            return buff;
        }

        // Trim the unused tail; leaving it would hand back trailing zero
        // bytes, which are themselves illegal in XML.
        byte[] result = new byte[ index ];
        System.arraycopy( buff, 0, result, 0, index );

        return result;

    }

    /**
     * This is a utility function for determining whether a specified character
     * is a character according to production 2 of the XML 1.0 specification:
     *
     * <pre>
     * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
     *            | [#x10000-#x10FFFF]
     * </pre>
     *
     * A <code>char</code> is a single 16-bit code unit, so the last range can
     * never match here and every surrogate code unit (#xD800-#xDFFF) is
     * reported as illegal.  The <code>cleanse</code> methods that take text
     * handle surrogate pairs themselves.
     *
     * @param c <code>char</code> to check for XML compliance.
     * @return <code>boolean</code> - true if it's a character, false otherwise.
     */
    public static boolean isXMLCharacter( char c ) {

        if ( c == '\n' || c == '\r' || c == '\t' ) {
            return true;
        }

        // Every other control character below the space is illegal.
        if ( c < 0x20 ) {
            return false;
        }

        // NOTE: the range 0x80-0xFF is legal XML.  An earlier version rejected
        // it and ended up dropping latin characters.
        if ( c <= 0xD7FF ) {
            return true;
        }

        // Surrogate block.
        if ( c < 0xE000 ) {
            return false;
        }

        // Excludes the non-characters #xFFFE and #xFFFF.
        return c <= 0xFFFD;

    }

    /**
     * Shared implementation of the text-based <code>cleanse</code> methods:
     * copies every legal character and every well-formed surrogate pair.
     */
    private static String filter( String text ) {

        int length = text.length();

        StringBuffer buff = new StringBuffer( length );

        for ( int i = 0; i < length; ++i ) {

            char c = text.charAt( i );

            if ( isXMLCharacter( c ) ) {

                buff.append( c );

            } else if ( isHighSurrogate( c ) && i + 1 < length
                        && isLowSurrogate( text.charAt( i + 1 ) ) ) {

                // A high/low pair encodes a code point in #x10000-#x10FFFF,
                // which production 2 allows; copy both units together.
                buff.append( c );
                buff.append( text.charAt( i + 1 ) );
                ++i;

            }

        }

        return buff.toString();

    }

    /** True if the code unit is a leading (high) surrogate. */
    private static boolean isHighSurrogate( char c ) {
        return c >= 0xD800 && c <= 0xDBFF;
    }

    /** True if the code unit is a trailing (low) surrogate. */
    private static boolean isLowSurrogate( char c ) {
        return c >= 0xDC00 && c <= 0xDFFF;
    }

}