001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser; 018 019 import java.io.ByteArrayInputStream; 020 import java.io.ByteArrayOutputStream; 021 import java.io.IOException; 022 import java.io.InputStream; 023 024 import org.apache.commons.feedparser.tools.XMLCleanser; 025 import org.apache.commons.feedparser.tools.XMLEncodingParser; 026 import org.apache.log4j.Logger; 027 import org.jdom.input.SAXBuilder; 028 029 /** 030 * This FeedParser implementation is based on JDOM and Jaxen and is based around 031 * XPath and JDOM iteration. While the implementation is straight forward it 032 * has not been optimized for performance. A SAX based parser would certainly 033 * be less memory intensive but with the downside of being harder to develop. 034 * 035 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a> 036 * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $ 037 */ 038 public class FeedParserImpl implements FeedParser { 039 040 private static Logger log = Logger.getLogger(FeedParserImpl.class); 041 042 /** 043 * Parse this feed. 044 * 045 * @param resource The URL of the feed being parsed. This is optional and 046 * may be null but is used when an exception is thrown to aid debugging. 047 */ 048 public void parse(FeedParserListener listener, 049 InputStream is, 050 String resource) throws FeedParserException { 051 052 try { 053 054 // Need to massage our XML support for UTF-8 to prevent the dreaded 055 // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some 056 // default feeds. This was tested a great deal under NewsMonster 057 // and I'm happy with the results. Within FeedParser 2.0 we will be 058 // using SAX2 so this won't be as big of a problem. In FeedParser 059 // 2.0 (or as soon as we use SAX) this code should be totally 060 // removed to use the original stream. 061 062 is = getCorrectInputStream( is ); 063 064 //OK. Now we have the right InputStream so we should build our DOM 065 //and exec. 066 SAXBuilder builder = new SAXBuilder(); 067 068 //NOTE: in b10 of JDOM this won't accept an InputStream and requires 069 //a org.w3c.dom.Document so we'll have to build one here. Will this 070 //slow things down any? 071 072 org.jdom.Document doc = builder.build( is ); 073 074 parse(listener, doc); 075 076 } catch (FeedParserException fpe) { 077 //if an explicit FeedParserException is thrown just rethrow it.. 078 throw fpe; 079 } catch (Throwable t) { 080 081 //FIXME: when this is a JDOM or XML parser Exception we should 082 //detect when we're working with an XHTML or HTML file and then 083 //parse it with an XFN/XOXO event listener. 084 085 throw new FeedParserException(t); 086 } 087 088 } 089 090 /** 091 * Perform the Xerces UTF8 correction and FeedFilter. 092 */ 093 private InputStream getCorrectInputStream(InputStream is) 094 throws Exception { 095 096 byte[] bytes = toByteArray(is); 097 098 //FIXME: if we return the WRONG content type here we will break. 099 //getBytes()... UTF-16 and UTF-32 especially. We should also perform 100 //HTTP Content-Type parsing here to preserve the content type. This can 101 //be fixed by integrating our networking API from NewsMonster. 102 103 String encoding = XMLEncodingParser.parse(bytes); 104 105 if (encoding == null) 106 encoding = "UTF-8"; 107 108 if ( encoding.startsWith( "UTF" ) ) { 109 110 String result = XMLCleanser.cleanse( bytes, encoding ); 111 bytes = FeedFilter.parse( result, encoding ); 112 113 } else { 114 115 bytes = FeedFilter.parse(bytes, encoding); 116 117 } 118 119 //remove prefix whitespace, intern HTML entities, etc. 120 121 //build an input stream from the our bytes for parsing... 122 is = new ByteArrayInputStream( bytes ); 123 124 return is; 125 126 } 127 128 /** 129 * @deprecated Use #parse( FeedParserException, InputStream, String ) 130 */ 131 public void parse(FeedParserListener listener, 132 InputStream is) throws FeedParserException { 133 134 parse(listener, is, null); 135 136 } 137 138 /** 139 * Parse this feed. 140 */ 141 public void parse(FeedParserListener listener, 142 org.jdom.Document doc) throws FeedParserException { 143 144 try { 145 146 String root = doc.getRootElement().getName(); 147 148 //Handle OPML 149 if ("opml".equals(root)) { 150 OPMLFeedParser.parse(listener, doc); 151 return; 152 } 153 154 //Handle changes.xml 155 if ("weblogUpdates".equals(root)) { 156 ChangesFeedParser.parse(listener, doc); 157 return; 158 } 159 160 //Handle ATOM 161 if ( "feed".equals( root ) ) { 162 AtomFeedParser.parse(listener, doc); 163 return; 164 } 165 166 //Handle FOAF 167 if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) { 168 FOAFFeedParser.parse(listener, doc); 169 return; 170 } 171 172 //FIXME: if this is XHTML we need to handle this with either an XFN 173 //or an XOXO directory parser. There might be more metadata we need 174 //to parse here. (also I wonder if this could be a chance to do 175 //autodiscovery). 176 177 //fall back on RDF and RSS parsing. 178 179 //FIXME: if this is an UNKNOWN format We need to throw an 180 //UnsupportedFeedxception (which extends FeedParserException) 181 // 182 // In this situation the ROOT elements should be: rss or RDF 183 184 RSSFeedParser.parse(listener, doc); 185 186 } catch (FeedParserException fpe) { 187 //if an explicit FeedParserException is thrown just rethrow it.. 188 throw fpe; 189 } catch (Throwable t) { 190 throw new FeedParserException(t); 191 } 192 193 } 194 195 /** 196 * Convert an InputStream to a byte array. 197 */ 198 public byte[] toByteArray(InputStream is) throws IOException { 199 200 //WARNING: 201 ByteArrayOutputStream bos = new ByteArrayOutputStream(); 202 203 //now process the Reader... 204 byte data[] = new byte[200]; 205 206 int readCount = 0; 207 208 while ((readCount = is.read(data)) > 0) { 209 210 bos.write(data, 0, readCount); 211 } 212 213 is.close(); 214 bos.close(); 215 216 return bos.toByteArray(); 217 218 } 219 220 } 221