1 /*
2 * Copyright 1999,2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.commons.feedparser;
18
19 import java.io.ByteArrayInputStream;
20 import java.io.ByteArrayOutputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23
24 import org.apache.commons.feedparser.tools.XMLCleanser;
25 import org.apache.commons.feedparser.tools.XMLEncodingParser;
26 import org.apache.log4j.Logger;
27 import org.jdom.input.SAXBuilder;
28
29 /**
30 * This FeedParser implementation is based on JDOM and Jaxen and is based around
31 * XPath and JDOM iteration. While the implementation is straight forward it
32 * has not been optimized for performance. A SAX based parser would certainly
33 * be less memory intensive but with the downside of being harder to develop.
34 *
35 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
36 * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $
37 */
38 public class FeedParserImpl implements FeedParser {
39
40 private static Logger log = Logger.getLogger(FeedParserImpl.class);
41
42 /**
43 * Parse this feed.
44 *
45 * @param resource The URL of the feed being parsed. This is optional and
46 * may be null but is used when an exception is thrown to aid debugging.
47 */
48 public void parse(FeedParserListener listener,
49 InputStream is,
50 String resource) throws FeedParserException {
51
52 try {
53
54 // Need to massage our XML support for UTF-8 to prevent the dreaded
55 // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
56 // default feeds. This was tested a great deal under NewsMonster
57 // and I'm happy with the results. Within FeedParser 2.0 we will be
58 // using SAX2 so this won't be as big of a problem. In FeedParser
59 // 2.0 (or as soon as we use SAX) this code should be totally
60 // removed to use the original stream.
61
62 is = getCorrectInputStream( is );
63
64 //OK. Now we have the right InputStream so we should build our DOM
65 //and exec.
66 SAXBuilder builder = new SAXBuilder();
67
68 //NOTE: in b10 of JDOM this won't accept an InputStream and requires
69 //a org.w3c.dom.Document so we'll have to build one here. Will this
70 //slow things down any?
71
72 org.jdom.Document doc = builder.build( is );
73
74 parse(listener, doc);
75
76 } catch (FeedParserException fpe) {
77 //if an explicit FeedParserException is thrown just rethrow it..
78 throw fpe;
79 } catch (Throwable t) {
80
81 //FIXME: when this is a JDOM or XML parser Exception we should
82 //detect when we're working with an XHTML or HTML file and then
83 //parse it with an XFN/XOXO event listener.
84
85 throw new FeedParserException(t);
86 }
87
88 }
89
90 /**
91 * Perform the Xerces UTF8 correction and FeedFilter.
92 */
93 private InputStream getCorrectInputStream(InputStream is)
94 throws Exception {
95
96 byte[] bytes = toByteArray(is);
97
98 //FIXME: if we return the WRONG content type here we will break.
99 //getBytes()... UTF-16 and UTF-32 especially. We should also perform
100 //HTTP Content-Type parsing here to preserve the content type. This can
101 //be fixed by integrating our networking API from NewsMonster.
102
103 String encoding = XMLEncodingParser.parse(bytes);
104
105 if (encoding == null)
106 encoding = "UTF-8";
107
108 if ( encoding.startsWith( "UTF" ) ) {
109
110 String result = XMLCleanser.cleanse( bytes, encoding );
111 bytes = FeedFilter.parse( result, encoding );
112
113 } else {
114
115 bytes = FeedFilter.parse(bytes, encoding);
116
117 }
118
119 //remove prefix whitespace, intern HTML entities, etc.
120
121 //build an input stream from the our bytes for parsing...
122 is = new ByteArrayInputStream( bytes );
123
124 return is;
125
126 }
127
128 /**
129 * @deprecated Use #parse( FeedParserException, InputStream, String )
130 */
131 public void parse(FeedParserListener listener,
132 InputStream is) throws FeedParserException {
133
134 parse(listener, is, null);
135
136 }
137
138 /**
139 * Parse this feed.
140 */
141 public void parse(FeedParserListener listener,
142 org.jdom.Document doc) throws FeedParserException {
143
144 try {
145
146 String root = doc.getRootElement().getName();
147
148 //Handle OPML
149 if ("opml".equals(root)) {
150 OPMLFeedParser.parse(listener, doc);
151 return;
152 }
153
154 //Handle changes.xml
155 if ("weblogUpdates".equals(root)) {
156 ChangesFeedParser.parse(listener, doc);
157 return;
158 }
159
160 //Handle ATOM
161 if ( "feed".equals( root ) ) {
162 AtomFeedParser.parse(listener, doc);
163 return;
164 }
165
166 //Handle FOAF
167 if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) {
168 FOAFFeedParser.parse(listener, doc);
169 return;
170 }
171
172 //FIXME: if this is XHTML we need to handle this with either an XFN
173 //or an XOXO directory parser. There might be more metadata we need
174 //to parse here. (also I wonder if this could be a chance to do
175 //autodiscovery).
176
177 //fall back on RDF and RSS parsing.
178
179 //FIXME: if this is an UNKNOWN format We need to throw an
180 //UnsupportedFeedxception (which extends FeedParserException)
181 //
182 // In this situation the ROOT elements should be: rss or RDF
183
184 RSSFeedParser.parse(listener, doc);
185
186 } catch (FeedParserException fpe) {
187 //if an explicit FeedParserException is thrown just rethrow it..
188 throw fpe;
189 } catch (Throwable t) {
190 throw new FeedParserException(t);
191 }
192
193 }
194
195 /**
196 * Convert an InputStream to a byte array.
197 */
198 public byte[] toByteArray(InputStream is) throws IOException {
199
200 //WARNING:
201 ByteArrayOutputStream bos = new ByteArrayOutputStream();
202
203 //now process the Reader...
204 byte data[] = new byte[200];
205
206 int readCount = 0;
207
208 while ((readCount = is.read(data)) > 0) {
209
210 bos.write(data, 0, readCount);
211 }
212
213 is.close();
214 bos.close();
215
216 return bos.toByteArray();
217
218 }
219
220 }
221