001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser;
018    
019    import java.util.HashMap;
020    import java.util.regex.Matcher;
021    import java.util.regex.Pattern;
022    
023    import org.apache.log4j.Logger;
024    
025    /**
026     *
027     * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
028     * @version $Id: FeedFilter.java 373614 2006-01-30 22:31:21Z mvdb $
029     */
030    public class FeedFilter {
031    
032        private static Logger log = Logger.getLogger( FeedFilter.class );
033    
034        public static boolean DO_REMOVE_LEADING_PROLOG = true;
035        public static boolean DO_DECODE_ENTITIES = true;
036    
037        public static HashMap LATIN1_ENTITIES = new HashMap();
038    
039        private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" );
040    
041        /**
042         * This is probably the wrong behavior.  I shouldn't call this method I
043         * think because assuming a content type is bad form.
044         *
045         * @deprecated Specify an encoding with #parse( bytes[], encoding )
046         * 
047         */
048        public static byte[] parse( byte[] bytes )
049            throws Exception {
050    
051            return parse( bytes, "UTF-8" );
052    
053        }
054    
055        public static byte[] parse( byte[] bytes, String encoding )
056            throws Exception {
057    
058            String content = new String( bytes, encoding );
059    
060            return parse( content, encoding );
061    
062        }
063    
064        /**
065         * Parse out an input string of content.
066         * 
067         * http://wiki.apache.org/jakarta-commons/FeedParser_2fStringAllocationConsideredHelpful
068         *
069         * 
070         */
071        public static byte[] parse( String content, String encoding )
072            throws Exception {
073    
074            //FIXME: return an object here so that I can flag a bozo bit.
075            
076            //remove leading prolog...
077            if ( DO_REMOVE_LEADING_PROLOG )
078                content = doRemoveLeadingProlog( content, encoding );
079    
080            //decode HTML entities that are referenced.
081            if ( DO_DECODE_ENTITIES )
082                content = doDecodeEntities( content );
083    
084            //TODO: undeclared namespace prefixes should be expanded to their common
085            //form. 'rdf, 'atom', 'xhtml' etc. Considering that they're will only be
086            //a handful H and then 4^36 different possibilities the probability will
087            //only be H in 4^36 which is pretty good that we won't have a false
088            //positive.
089            
090            return content.getBytes( encoding );
091    
092        }
093            
094        /**
095         * Removing prolog whitespace, comments, and other garbage from the
096         * beginning of a feed.
097         *
098         * 
099         */
100        private static String doRemoveLeadingProlog( String content, String encoding ) {
101    
102            // if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because
103            // it triggers a UTF-16 parse due to the BOM.
104            //
105            // FIXME: this isn't actually true.  We should leave the BOM and remove
106            // the prolog anyway due to the fact that this will still break the
107            // parser.  Come up with some tests for UTF-16 to see if I can get it to
108            // break and then update this method.
109    
110            if ( "UTF-16".equals( encoding ) ||
111                 "UTF-32".equals( encoding ) )
112                return content;
113            
114            //move to the beginning of the first element or comment.  When this is a
115            //processing instruction we will move to that
116            int begin = content.indexOf( "<" );
117    
118            if ( begin > 0 ) {
119                content = content.substring( begin, content.length() );
120                log.warn( "Skipped whitespace in prolog and moved towards first element." );
121            }
122    
123            //now skip to the XML processing instruction when necessary.  This is
124            //used to remove comments prior to <?xml which are not allowed.
125            
126            begin = content.indexOf( "<?xml" );
127    
128            if ( begin > 0 ) {
129                content = content.substring( begin, content.length() );
130                log.warn( "Removed prolog towards first processing instruction." );
131            }
132    
133            content = doRemoveElementProlog( content );
134    
135            return content;
136            
137        }
138    
139        /**
140         * Remove element content between:
141         *
142         * <?xml version="1.0"?>
143         *
144         * THIS IS BROKEN PROLOG
145         *
146         * <foo>
147         *
148         * 
149         */
150        private static String doRemoveElementProlog( String content ) {
151    
152            int end = content.lastIndexOf( "?>", 100 );
153    
154            if ( end == -1 )
155                return content;
156    
157            StringBuffer buff = new StringBuffer( content.length() );
158            end = end + 2;
159            buff.append( content.substring( 0, end ) );
160    
161            int begin = content.indexOf( "<", end );
162    
163            if ( begin != -1 ) {
164    
165                buff.append( "\n" );
166                buff.append( content.substring( begin, content.length() ) );
167                
168            }
169            
170            return buff.toString();
171            
172        }
173        
174        private static String doDecodeEntities( String content ) {
175    
176            StringBuffer buff = new StringBuffer( content.length() + 1000 );
177    
178            Matcher m = entity_pattern.matcher( content );
179    
180            int begin = 0;
181    
182            boolean hasFilterDecodedEntities = false;
183            boolean hasFilterFoundUnknownEntity = false;
184    
185            //FIXME: note that when I was benchmarking this code that this showed up
186            //as a MAJOR bottleneck so we might want to optimize it a little more.
187    
188            while ( m.find() ) {
189    
190                buff.append( content.substring( begin, m.start() ) );
191                
192                String entity = m.group( 1 );
193    
194                String value = (String)LATIN1_ENTITIES.get( entity );
195    
196                if ( value != null ) {
197                    buff.append( "&#" );
198                    buff.append( value );
199                    buff.append( ";" );
200    
201                    hasFilterDecodedEntities = true;
202    
203                } else {
204    
205                    //This is not a known entity so we have no way to correct it.
206                    //If this is done then we have a problem and the feed probably
207                    //still won't parse
208                    buff.append( "&" );
209                    buff.append( entity );
210                    buff.append( ";" );
211    
212                    hasFilterFoundUnknownEntity = true;
213                }
214    
215                begin = m.end( 0 );
216                
217            } 
218    
219            buff.append( content.substring( begin, content.length() ) );
220    
221            if ( hasFilterFoundUnknownEntity ) 
222                log.warn( "Filter encountered unknown entities" );
223    
224            if ( hasFilterDecodedEntities ) 
225                log.warn( "Filter has decoded latin1 entities." );
226    
227            return buff.toString();
228            
229        }
230        
231        public static void main( String[] args ) throws Exception {
232    
233            byte[] b = parse( "hello &eacute; world".getBytes() );
234    
235            String v = new String( b );
236    
237            System.out.println( "v: " + v );
238            
239        }
240        
241        static {
242    
243            // load the latin1 entity map.  We will replace latin1 entities with
244            // their char references directly.  For example if someone incorrectly
245            // references:
246            //
247            // &auml;
248            //
249            // we replace it with:
250            //
251            // &#228;
252            //
253            // Which is correct in Latin1
254    
255            // http://my.netscape.com/publish/formats/rss-0.91.dtd
256    
257            LATIN1_ENTITIES.put( "nbsp",      "160" );
258            LATIN1_ENTITIES.put( "iexcl",     "161" );
259            LATIN1_ENTITIES.put( "cent",      "162" );
260            LATIN1_ENTITIES.put( "pound",     "163" );
261            LATIN1_ENTITIES.put( "curren",    "164" );
262            LATIN1_ENTITIES.put( "yen",       "165" );
263            LATIN1_ENTITIES.put( "brvbar",    "166" );
264            LATIN1_ENTITIES.put( "sect",      "167" );
265            LATIN1_ENTITIES.put( "uml",       "168" );
266            LATIN1_ENTITIES.put( "copy",      "169" );
267            LATIN1_ENTITIES.put( "ordf",      "170" );
268            LATIN1_ENTITIES.put( "laquo",     "171" );
269            LATIN1_ENTITIES.put( "not",       "172" );
270            LATIN1_ENTITIES.put( "shy",       "173" );
271            LATIN1_ENTITIES.put( "reg",       "174" );
272            LATIN1_ENTITIES.put( "macr",      "175" );
273            LATIN1_ENTITIES.put( "deg",       "176" );
274            LATIN1_ENTITIES.put( "plusmn",    "177" );
275            LATIN1_ENTITIES.put( "sup2",      "178" );
276            LATIN1_ENTITIES.put( "sup3",      "179" );
277            LATIN1_ENTITIES.put( "acute",     "180" );
278            LATIN1_ENTITIES.put( "micro",     "181" );
279            LATIN1_ENTITIES.put( "para",      "182" );
280            LATIN1_ENTITIES.put( "middot",    "183" );
281            LATIN1_ENTITIES.put( "cedil",     "184" );
282            LATIN1_ENTITIES.put( "sup1",      "185" );
283            LATIN1_ENTITIES.put( "ordm",      "186" );
284            LATIN1_ENTITIES.put( "raquo",     "187" );
285            LATIN1_ENTITIES.put( "frac14",    "188" );
286            LATIN1_ENTITIES.put( "frac12",    "189" );
287            LATIN1_ENTITIES.put( "frac34",    "190" );
288            LATIN1_ENTITIES.put( "iquest",    "191" );
289            LATIN1_ENTITIES.put( "Agrave",    "192" );
290            LATIN1_ENTITIES.put( "Aacute",    "193" );
291            LATIN1_ENTITIES.put( "Acirc",     "194" );
292            LATIN1_ENTITIES.put( "Atilde",    "195" );
293            LATIN1_ENTITIES.put( "Auml",      "196" );
294            LATIN1_ENTITIES.put( "Aring",     "197" );
295            LATIN1_ENTITIES.put( "AElig",     "198" );
296            LATIN1_ENTITIES.put( "Ccedil",    "199" );
297            LATIN1_ENTITIES.put( "Egrave",    "200" );
298            LATIN1_ENTITIES.put( "Eacute",    "201" );
299            LATIN1_ENTITIES.put( "Ecirc",     "202" );
300            LATIN1_ENTITIES.put( "Euml",      "203" );
301            LATIN1_ENTITIES.put( "Igrave",    "204" );
302            LATIN1_ENTITIES.put( "Iacute",    "205" );
303            LATIN1_ENTITIES.put( "Icirc",     "206" );
304            LATIN1_ENTITIES.put( "Iuml",      "207" );
305            LATIN1_ENTITIES.put( "ETH",       "208" );
306            LATIN1_ENTITIES.put( "Ntilde",    "209" );
307            LATIN1_ENTITIES.put( "Ograve",    "210" );
308            LATIN1_ENTITIES.put( "Oacute",    "211" );
309            LATIN1_ENTITIES.put( "Ocirc",     "212" );
310            LATIN1_ENTITIES.put( "Otilde",    "213" );
311            LATIN1_ENTITIES.put( "Ouml",      "214" );
312            LATIN1_ENTITIES.put( "times",     "215" );
313            LATIN1_ENTITIES.put( "Oslash",    "216" );
314            LATIN1_ENTITIES.put( "Ugrave",    "217" );
315            LATIN1_ENTITIES.put( "Uacute",    "218" );
316            LATIN1_ENTITIES.put( "Ucirc",     "219" );
317            LATIN1_ENTITIES.put( "Uuml",      "220" );
318            LATIN1_ENTITIES.put( "Yacute",    "221" );
319            LATIN1_ENTITIES.put( "THORN",     "222" );
320            LATIN1_ENTITIES.put( "szlig",     "223" );
321            LATIN1_ENTITIES.put( "agrave",    "224" );
322            LATIN1_ENTITIES.put( "aacute",    "225" );
323            LATIN1_ENTITIES.put( "acirc",     "226" );
324            LATIN1_ENTITIES.put( "atilde",    "227" );
325            LATIN1_ENTITIES.put( "auml",      "228" );
326            LATIN1_ENTITIES.put( "aring",     "229" );
327            LATIN1_ENTITIES.put( "aelig",     "230" );
328            LATIN1_ENTITIES.put( "ccedil",    "231" );
329            LATIN1_ENTITIES.put( "egrave",    "232" );
330            LATIN1_ENTITIES.put( "eacute",    "233" );
331            LATIN1_ENTITIES.put( "ecirc",     "234" );
332            LATIN1_ENTITIES.put( "euml",      "235" );
333            LATIN1_ENTITIES.put( "igrave",    "236" );
334            LATIN1_ENTITIES.put( "iacute",    "237" );
335            LATIN1_ENTITIES.put( "icirc",     "238" );
336            LATIN1_ENTITIES.put( "iuml",      "239" );
337            LATIN1_ENTITIES.put( "eth",       "240" );
338            LATIN1_ENTITIES.put( "ntilde",    "241" );
339            LATIN1_ENTITIES.put( "ograve",    "242" );
340            LATIN1_ENTITIES.put( "oacute",    "243" );
341            LATIN1_ENTITIES.put( "ocirc",     "244" );
342            LATIN1_ENTITIES.put( "otilde",    "245" );
343            LATIN1_ENTITIES.put( "ouml",      "246" );
344            LATIN1_ENTITIES.put( "divide",    "247" );
345            LATIN1_ENTITIES.put( "oslash",    "248" );
346            LATIN1_ENTITIES.put( "ugrave",    "249" );
347            LATIN1_ENTITIES.put( "uacute",    "250" );
348            LATIN1_ENTITIES.put( "ucirc",     "251" );
349            LATIN1_ENTITIES.put( "uuml",      "252" );
350            LATIN1_ENTITIES.put( "yacute",    "253" );
351            LATIN1_ENTITIES.put( "thorn",     "254" );
352            LATIN1_ENTITIES.put( "yuml",      "255" );
353    
354        }
355        
356    }