1 /* 2 * Copyright 1999,2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.feedparser; 18 19 import java.io.ByteArrayInputStream; 20 import java.io.ByteArrayOutputStream; 21 import java.io.IOException; 22 import java.io.InputStream; 23 24 import org.apache.commons.feedparser.tools.XMLCleanser; 25 import org.apache.commons.feedparser.tools.XMLEncodingParser; 26 import org.apache.log4j.Logger; 27 import org.jdom.input.SAXBuilder; 28 29 /** 30 * This FeedParser implementation is based on JDOM and Jaxen and is based around 31 * XPath and JDOM iteration. While the implementation is straight forward it 32 * has not been optimized for performance. A SAX based parser would certainly 33 * be less memory intensive but with the downside of being harder to develop. 34 * 35 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a> 36 * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $ 37 */ 38 public class FeedParserImpl implements FeedParser { 39 40 private static Logger log = Logger.getLogger(FeedParserImpl.class); 41 42 /** 43 * Parse this feed. 44 * 45 * @param resource The URL of the feed being parsed. This is optional and 46 * may be null but is used when an exception is thrown to aid debugging. 47 */ 48 public void parse(FeedParserListener listener, 49 InputStream is, 50 String resource) throws FeedParserException { 51 52 try { 53 54 // Need to massage our XML support for UTF-8 to prevent the dreaded 55 // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some 56 // default feeds. This was tested a great deal under NewsMonster 57 // and I'm happy with the results. Within FeedParser 2.0 we will be 58 // using SAX2 so this won't be as big of a problem. In FeedParser 59 // 2.0 (or as soon as we use SAX) this code should be totally 60 // removed to use the original stream. 61 62 is = getCorrectInputStream( is ); 63 64 //OK. Now we have the right InputStream so we should build our DOM 65 //and exec. 66 SAXBuilder builder = new SAXBuilder(); 67 68 //NOTE: in b10 of JDOM this won't accept an InputStream and requires 69 //a org.w3c.dom.Document so we'll have to build one here. Will this 70 //slow things down any? 71 72 org.jdom.Document doc = builder.build( is ); 73 74 parse(listener, doc); 75 76 } catch (FeedParserException fpe) { 77 //if an explicit FeedParserException is thrown just rethrow it.. 78 throw fpe; 79 } catch (Throwable t) { 80 81 //FIXME: when this is a JDOM or XML parser Exception we should 82 //detect when we're working with an XHTML or HTML file and then 83 //parse it with an XFN/XOXO event listener. 84 85 throw new FeedParserException(t); 86 } 87 88 } 89 90 /** 91 * Perform the Xerces UTF8 correction and FeedFilter. 92 */ 93 private InputStream getCorrectInputStream(InputStream is) 94 throws Exception { 95 96 byte[] bytes = toByteArray(is); 97 98 //FIXME: if we return the WRONG content type here we will break. 99 //getBytes()... UTF-16 and UTF-32 especially. We should also perform 100 //HTTP Content-Type parsing here to preserve the content type. This can 101 //be fixed by integrating our networking API from NewsMonster. 102 103 String encoding = XMLEncodingParser.parse(bytes); 104 105 if (encoding == null) 106 encoding = "UTF-8"; 107 108 if ( encoding.startsWith( "UTF" ) ) { 109 110 String result = XMLCleanser.cleanse( bytes, encoding ); 111 bytes = FeedFilter.parse( result, encoding ); 112 113 } else { 114 115 bytes = FeedFilter.parse(bytes, encoding); 116 117 } 118 119 //remove prefix whitespace, intern HTML entities, etc. 120 121 //build an input stream from the our bytes for parsing... 122 is = new ByteArrayInputStream( bytes ); 123 124 return is; 125 126 } 127 128 /** 129 * @deprecated Use #parse( FeedParserException, InputStream, String ) 130 */ 131 public void parse(FeedParserListener listener, 132 InputStream is) throws FeedParserException { 133 134 parse(listener, is, null); 135 136 } 137 138 /** 139 * Parse this feed. 140 */ 141 public void parse(FeedParserListener listener, 142 org.jdom.Document doc) throws FeedParserException { 143 144 try { 145 146 String root = doc.getRootElement().getName(); 147 148 //Handle OPML 149 if ("opml".equals(root)) { 150 OPMLFeedParser.parse(listener, doc); 151 return; 152 } 153 154 //Handle changes.xml 155 if ("weblogUpdates".equals(root)) { 156 ChangesFeedParser.parse(listener, doc); 157 return; 158 } 159 160 //Handle ATOM 161 if ( "feed".equals( root ) ) { 162 AtomFeedParser.parse(listener, doc); 163 return; 164 } 165 166 //Handle FOAF 167 if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) { 168 FOAFFeedParser.parse(listener, doc); 169 return; 170 } 171 172 //FIXME: if this is XHTML we need to handle this with either an XFN 173 //or an XOXO directory parser. There might be more metadata we need 174 //to parse here. (also I wonder if this could be a chance to do 175 //autodiscovery). 176 177 //fall back on RDF and RSS parsing. 178 179 //FIXME: if this is an UNKNOWN format We need to throw an 180 //UnsupportedFeedxception (which extends FeedParserException) 181 // 182 // In this situation the ROOT elements should be: rss or RDF 183 184 RSSFeedParser.parse(listener, doc); 185 186 } catch (FeedParserException fpe) { 187 //if an explicit FeedParserException is thrown just rethrow it.. 188 throw fpe; 189 } catch (Throwable t) { 190 throw new FeedParserException(t); 191 } 192 193 } 194 195 /** 196 * Convert an InputStream to a byte array. 197 */ 198 public byte[] toByteArray(InputStream is) throws IOException { 199 200 //WARNING: 201 ByteArrayOutputStream bos = new ByteArrayOutputStream(); 202 203 //now process the Reader... 204 byte data[] = new byte[200]; 205 206 int readCount = 0; 207 208 while ((readCount = is.read(data)) > 0) { 209 210 bos.write(data, 0, readCount); 211 } 212 213 is.close(); 214 bos.close(); 215 216 return bos.toByteArray(); 217 218 } 219 220 } 221