View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.ByteArrayOutputStream;
21  import java.io.IOException;
22  import java.io.InputStream;
23  
24  import org.apache.commons.feedparser.tools.XMLCleanser;
25  import org.apache.commons.feedparser.tools.XMLEncodingParser;
26  import org.apache.log4j.Logger;
27  import org.jdom.input.SAXBuilder;
28  
29  /**
30   * This FeedParser implementation is based on JDOM and Jaxen and is based around
31   * XPath and JDOM iteration.  While the implementation is straight forward it
32   * has not been optimized for performance.  A SAX based parser would certainly
33   * be less memory intensive but with the downside of being harder to develop.
34   *
35   * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
36   * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $
37   */
38  public class FeedParserImpl implements FeedParser {
39  
40      private static Logger log = Logger.getLogger(FeedParserImpl.class);
41  
42      /**
43       * Parse this feed.
44       *
45       * @param resource The URL of the feed being parsed.  This is optional and
46       *                 may be null but is used when an exception is thrown to aid debugging.
47       */
48      public void parse(FeedParserListener listener,
49                        InputStream is,
50                        String resource) throws FeedParserException {
51  
52          try {
53  
54              // Need to massage our XML support for UTF-8 to prevent the dreaded
55              // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
56              // default feeds.  This was tested a great deal under NewsMonster
57              // and I'm happy with the results.  Within FeedParser 2.0 we will be
58              // using SAX2 so this won't be as big of a problem.  In FeedParser
59              // 2.0 (or as soon as we use SAX) this code should be totally
60              // removed to use the original stream.
61  
62              is = getCorrectInputStream( is );
63  
64              //OK.  Now we have the right InputStream so we should build our DOM
65              //and exec.
66              SAXBuilder builder = new SAXBuilder();
67  
68              //NOTE: in b10 of JDOM this won't accept an InputStream and requires
69              //a org.w3c.dom.Document so we'll have to build one here.  Will this
70              //slow things down any?
71  
72              org.jdom.Document doc = builder.build( is );
73  
74              parse(listener, doc);
75  
76          } catch (FeedParserException fpe) {
77              //if an explicit FeedParserException is thrown just rethrow it..
78              throw fpe;
79          } catch (Throwable t) {
80  
81              //FIXME: when this is a JDOM or XML parser Exception we should
82              //detect when we're working with an XHTML or HTML file and then
83              //parse it with an XFN/XOXO event listener.
84  
85              throw new FeedParserException(t);
86          }
87  
88      }
89  
90      /**
91       * Perform the Xerces UTF8 correction and FeedFilter.
92       */
93      private InputStream getCorrectInputStream(InputStream is)
94              throws Exception {
95  
96          byte[] bytes = toByteArray(is);
97  
98          //FIXME: if we return the WRONG content type here we will break.
99          //getBytes()... UTF-16 and UTF-32 especially.  We should also perform
100         //HTTP Content-Type parsing here to preserve the content type.  This can
101         //be fixed by integrating our networking API from NewsMonster.
102 
103         String encoding = XMLEncodingParser.parse(bytes);
104 
105         if (encoding == null)
106             encoding = "UTF-8";
107 
108         if ( encoding.startsWith( "UTF" ) ) {
109 
110             String result = XMLCleanser.cleanse( bytes, encoding );
111             bytes = FeedFilter.parse( result, encoding );
112 
113         } else {
114 
115             bytes = FeedFilter.parse(bytes, encoding);
116 
117         }
118 
119         //remove prefix whitespace, intern HTML entities, etc.
120 
121         //build an input stream from the our bytes for parsing...
122         is = new ByteArrayInputStream( bytes );
123 
124         return is;
125 
126     }
127 
128     /**
129      * @deprecated Use #parse( FeedParserException, InputStream, String )
130      */
131     public void parse(FeedParserListener listener,
132                       InputStream is) throws FeedParserException {
133 
134         parse(listener, is, null);
135 
136     }
137 
138     /**
139      * Parse this feed.
140      */
141     public void parse(FeedParserListener listener,
142                       org.jdom.Document doc) throws FeedParserException {
143 
144         try {
145 
146             String root = doc.getRootElement().getName();
147 
148             //Handle OPML
149             if ("opml".equals(root)) {
150                 OPMLFeedParser.parse(listener, doc);
151                 return;
152             }
153 
154             //Handle changes.xml
155             if ("weblogUpdates".equals(root)) {
156                 ChangesFeedParser.parse(listener, doc);
157                 return;
158             }
159 
160             //Handle ATOM
161             if ( "feed".equals( root ) ) {
162                 AtomFeedParser.parse(listener, doc);
163                 return;
164             }
165 
166             //Handle FOAF
167             if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) {
168                 FOAFFeedParser.parse(listener, doc);
169                 return;
170             }
171 
172             //FIXME: if this is XHTML we need to handle this with either an XFN
173             //or an XOXO directory parser.  There might be more metadata we need
174             //to parse here.  (also I wonder if this could be a chance to do
175             //autodiscovery).
176 
177             //fall back on RDF and RSS parsing.
178 
179             //FIXME: if this is an UNKNOWN format We need to throw an
180             //UnsupportedFeedxception (which extends FeedParserException)
181             //
182             // In this situation the ROOT elements should be: rss or RDF
183 
184             RSSFeedParser.parse(listener, doc);
185 
186         } catch (FeedParserException fpe) {
187             //if an explicit FeedParserException is thrown just rethrow it..
188             throw fpe;
189         } catch (Throwable t) {
190             throw new FeedParserException(t);
191         }
192 
193     }
194 
195     /**
196      * Convert an InputStream to a byte array.
197      */
198     public byte[] toByteArray(InputStream is) throws IOException {
199 
200         //WARNING:
201         ByteArrayOutputStream bos = new ByteArrayOutputStream();
202 
203         //now process the Reader...
204         byte data[] = new byte[200];
205 
206         int readCount = 0;
207 
208         while ((readCount = is.read(data)) > 0) {
209 
210             bos.write(data, 0, readCount);
211         }
212 
213         is.close();
214         bos.close();
215 
216         return bos.toByteArray();
217 
218     }
219 
220 }
221