001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser;
018    
019    import java.io.ByteArrayInputStream;
020    import java.io.ByteArrayOutputStream;
021    import java.io.IOException;
022    import java.io.InputStream;
023    
024    import org.apache.commons.feedparser.tools.XMLCleanser;
025    import org.apache.commons.feedparser.tools.XMLEncodingParser;
026    import org.apache.log4j.Logger;
027    import org.jdom.input.SAXBuilder;
028    
029    /**
030     * This FeedParser implementation is based on JDOM and Jaxen and is based around
031     * XPath and JDOM iteration.  While the implementation is straight forward it
032     * has not been optimized for performance.  A SAX based parser would certainly
033     * be less memory intensive but with the downside of being harder to develop.
034     *
035     * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
036     * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $
037     */
038    public class FeedParserImpl implements FeedParser {
039    
040        private static Logger log = Logger.getLogger(FeedParserImpl.class);
041    
042        /**
043         * Parse this feed.
044         *
045         * @param resource The URL of the feed being parsed.  This is optional and
046         *                 may be null but is used when an exception is thrown to aid debugging.
047         */
048        public void parse(FeedParserListener listener,
049                          InputStream is,
050                          String resource) throws FeedParserException {
051    
052            try {
053    
054                // Need to massage our XML support for UTF-8 to prevent the dreaded
055                // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
056                // default feeds.  This was tested a great deal under NewsMonster
057                // and I'm happy with the results.  Within FeedParser 2.0 we will be
058                // using SAX2 so this won't be as big of a problem.  In FeedParser
059                // 2.0 (or as soon as we use SAX) this code should be totally
060                // removed to use the original stream.
061    
062                is = getCorrectInputStream( is );
063    
064                //OK.  Now we have the right InputStream so we should build our DOM
065                //and exec.
066                SAXBuilder builder = new SAXBuilder();
067    
068                //NOTE: in b10 of JDOM this won't accept an InputStream and requires
069                //a org.w3c.dom.Document so we'll have to build one here.  Will this
070                //slow things down any?
071    
072                org.jdom.Document doc = builder.build( is );
073    
074                parse(listener, doc);
075    
076            } catch (FeedParserException fpe) {
077                //if an explicit FeedParserException is thrown just rethrow it..
078                throw fpe;
079            } catch (Throwable t) {
080    
081                //FIXME: when this is a JDOM or XML parser Exception we should
082                //detect when we're working with an XHTML or HTML file and then
083                //parse it with an XFN/XOXO event listener.
084    
085                throw new FeedParserException(t);
086            }
087    
088        }
089    
090        /**
091         * Perform the Xerces UTF8 correction and FeedFilter.
092         */
093        private InputStream getCorrectInputStream(InputStream is)
094                throws Exception {
095    
096            byte[] bytes = toByteArray(is);
097    
098            //FIXME: if we return the WRONG content type here we will break.
099            //getBytes()... UTF-16 and UTF-32 especially.  We should also perform
100            //HTTP Content-Type parsing here to preserve the content type.  This can
101            //be fixed by integrating our networking API from NewsMonster.
102    
103            String encoding = XMLEncodingParser.parse(bytes);
104    
105            if (encoding == null)
106                encoding = "UTF-8";
107    
108            if ( encoding.startsWith( "UTF" ) ) {
109    
110                String result = XMLCleanser.cleanse( bytes, encoding );
111                bytes = FeedFilter.parse( result, encoding );
112    
113            } else {
114    
115                bytes = FeedFilter.parse(bytes, encoding);
116    
117            }
118    
119            //remove prefix whitespace, intern HTML entities, etc.
120    
121            //build an input stream from the our bytes for parsing...
122            is = new ByteArrayInputStream( bytes );
123    
124            return is;
125    
126        }
127    
128        /**
129         * @deprecated Use #parse( FeedParserException, InputStream, String )
130         */
131        public void parse(FeedParserListener listener,
132                          InputStream is) throws FeedParserException {
133    
134            parse(listener, is, null);
135    
136        }
137    
138        /**
139         * Parse this feed.
140         */
141        public void parse(FeedParserListener listener,
142                          org.jdom.Document doc) throws FeedParserException {
143    
144            try {
145    
146                String root = doc.getRootElement().getName();
147    
148                //Handle OPML
149                if ("opml".equals(root)) {
150                    OPMLFeedParser.parse(listener, doc);
151                    return;
152                }
153    
154                //Handle changes.xml
155                if ("weblogUpdates".equals(root)) {
156                    ChangesFeedParser.parse(listener, doc);
157                    return;
158                }
159    
160                //Handle ATOM
161                if ( "feed".equals( root ) ) {
162                    AtomFeedParser.parse(listener, doc);
163                    return;
164                }
165    
166                //Handle FOAF
167                if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) {
168                    FOAFFeedParser.parse(listener, doc);
169                    return;
170                }
171    
172                //FIXME: if this is XHTML we need to handle this with either an XFN
173                //or an XOXO directory parser.  There might be more metadata we need
174                //to parse here.  (also I wonder if this could be a chance to do
175                //autodiscovery).
176    
177                //fall back on RDF and RSS parsing.
178    
179                //FIXME: if this is an UNKNOWN format We need to throw an
180                //UnsupportedFeedxception (which extends FeedParserException)
181                //
182                // In this situation the ROOT elements should be: rss or RDF
183    
184                RSSFeedParser.parse(listener, doc);
185    
186            } catch (FeedParserException fpe) {
187                //if an explicit FeedParserException is thrown just rethrow it..
188                throw fpe;
189            } catch (Throwable t) {
190                throw new FeedParserException(t);
191            }
192    
193        }
194    
195        /**
196         * Convert an InputStream to a byte array.
197         */
198        public byte[] toByteArray(InputStream is) throws IOException {
199    
200            //WARNING:
201            ByteArrayOutputStream bos = new ByteArrayOutputStream();
202    
203            //now process the Reader...
204            byte data[] = new byte[200];
205    
206            int readCount = 0;
207    
208            while ((readCount = is.read(data)) > 0) {
209    
210                bos.write(data, 0, readCount);
211            }
212    
213            is.close();
214            bos.close();
215    
216            return bos.toByteArray();
217    
218        }
219    
220    }
221