001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.network; 018 019 import java.io.FileNotFoundException; 020 import java.io.IOException; 021 import java.io.InputStream; 022 import java.net.ProtocolException; 023 import java.net.URL; 024 import java.net.URLConnection; 025 import java.util.Iterator; 026 import java.util.zip.GZIPInputStream; 027 028 import org.apache.log4j.Logger; 029 030 import sun.net.www.protocol.http.HttpURLConnection; 031 032 /** 033 * ResourceRequest implementation that uses java.net.URL as the backend. 034 * 035 * Differences from other ResourceRequests. 036 * 037 * setRequestMethod() - Allows us to change the request type (HEAD, etc). 038 * 039 * getContentLength() - Returns the length/size of the content represented by 040 * this resource. Can be used by clients with setRequestMethod( "HEAD" ) to 041 * find the size of a remote resource without doing a full fetch. 042 * 043 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a> 044 * @version $Id: URLResourceRequest.java 561366 2007-07-31 15:58:29Z rahul $ 045 */ 046 public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest { 047 048 private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() ); 049 050 public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding"; 051 public static final String IF_NONE_MATCH_HEADER = "If-None-Match"; 052 public static final String GZIP_ENCODING = "gzip"; 053 public static final String USER_AGENT_HEADER = "User-Agent"; 054 055 /** 056 * 057 * Enable RFC 3228 HTTP Delta for feeds. 058 * 059 * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html 060 * 061 * http://bobwyman.pubsub.com/main/2004/09/implementations.html 062 * 063 */ 064 public static boolean ENABLE_HTTP_DELTA_FEED_IM = false; 065 066 public static String USER_AGENT 067 = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130"; 068 069 public static String USER_AGENT_MOZILLA 070 = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130"; 071 072 /** 073 * Not used anymore. Provided for historical reasons. 074 */ 075 public static final String REFERER 076 = "http://commons.apache.org/feedparser/?isAggregator=true"; 077 078 public static final int MAX_CONTENT_LENGTH = 1000000; 079 080 private URL _url = null; 081 082 private URLConnection _urlConnection = null; 083 084 private InputStream inputStream = null; 085 086 private boolean initConnection = false; 087 088 /** 089 * 090 * 091 */ 092 public void init() throws IOException { 093 094 String resource = this.getResource(); 095 096 //if we are offline... we don't need to init. 097 if ( ResourceRequestFactory.isOffline() ) { return; } 098 099 //pull from the HTCache if it is enabled and then short-circuit so that 100 //we don't fetch from the network. 101 102 //NOTE: currently removed because the htcache wasn't portable. I can OSS 103 //this in the future if necessary 104 105 // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() && 106 // HTCache.hasContentInCache( this.getResource() ) ) { 107 // 108 // //get the input stream we can use from the HTCache. 109 // this.inputStream = HTCache.getContentAsInputStream( resource ); 110 // return; 111 // 112 // } 113 114 _url = new URL( this.getResource() ); 115 _urlConnection = _url.openConnection(); 116 117 } 118 119 /** 120 * Init the actual connection. Should be called AFTER init() but before 121 * getInputStream() so that we can set any runtime params requestMethod, 122 * etc. If getInputStream() is called without an initConnection() we do 123 * this automatically. initConnection() might not want to be called when 124 * doing a HEAD request. 125 * 126 * 127 */ 128 public void initConnection() throws NetworkException { 129 130 long before = System.currentTimeMillis(); 131 132 initConnection = true; 133 134 this.fireInit(); 135 136 //FIXME: do smart user agent detection. if this is a .html file we can 137 //set it to us Mozilla and if not we can use NewsMonster 138 //_urlConnection.setRequestProperty( "Referer", REFERER ); 139 140 String resource = this.getResource(); 141 142 //set the user agent if it hasn't ALREADY been set by the caller. 143 if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) { 144 _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT ); 145 } 146 147 _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING ); 148 149 //copy over any headers set in the request.. 150 151 Iterator it = getRequestHeaderFields(); 152 153 while ( it.hasNext() ) { 154 155 String key = (String)it.next(); 156 157 _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) ); 158 159 } 160 161 if ( _urlConnection instanceof HttpURLConnection ) { 162 163 HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection; 164 165 httpURLConn.setFollowRedirects( getFollowRedirects() ); 166 httpURLConn.setInstanceFollowRedirects( getFollowRedirects() ); 167 168 if ( this.getIfModifiedSince() != -1 ) 169 httpURLConn.setIfModifiedSince( this.getIfModifiedSince() ); 170 171 if ( getEtag() != null ) { 172 httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() ); 173 174 //now support RFC3229 HTTP Delta 175 //A-IM: feed, gzip 176 177 if ( ENABLE_HTTP_DELTA_FEED_IM ) { 178 179 //note that this will return HTTP 226 if used. 180 // 181 182 httpURLConn.setRequestProperty( "A-IM", "feed, gzip" ); 183 184 } 185 186 } 187 188 try { 189 190 httpURLConn.connect(); 191 192 //setResource( getRedirectedResource() ); 193 194 this.setResponseCode( httpURLConn.getResponseCode() ); 195 196 } catch ( IOException e ) { 197 throw new NetworkException( e ); 198 } 199 200 } 201 202 int contentLength = _urlConnection.getContentLength(); 203 204 //bigger than 1 meg and it is a remote document (it is safe to process 205 //local documents) 206 if ( contentLength > MAX_CONTENT_LENGTH && 207 this.getResource().startsWith( "file:" ) == false ) { 208 209 //NOTE: make 100% sure this doens't just go ahead and download the 210 //file FIRST before doing a HEAD. I think that's what happens but I 211 //might be wrong. 212 213 throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() ); 214 215 } 216 217 long after = System.currentTimeMillis(); 218 219 log.debug( getResource() + " - init duration: " + (after-before) ); 220 221 } 222 223 java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null; 224 java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null; 225 226 /** 227 * This method used Reflection to pull out the redirected URL in 228 * java.net.URL. Internally sun.net.www.protocol.http.HttpURLConnection 229 * stores a reference to sun.net.www.http.HttpClient which then in turn does 230 * all the redirection and stores the redirect java.net.URL. We just use 231 * reflection to FETCH this URL and then call toString to get the correct 232 * value. 233 * 234 * Java needs the concept of readonly private variables. 235 * 236 * 237 */ 238 public String getResourceFromRedirect() { 239 240 try { 241 242 if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) { 243 244 //Note: when using a FILE URL this won't work! 245 FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" ); 246 FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true ); 247 248 } 249 250 Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection ); 251 252 //when java.net.URL has already cleaned itself up 'http' will be 253 //null here. 254 if ( http == null ) 255 return getResource(); 256 257 if ( FIELD_HTTP_CLIENT_URL == null ) { 258 259 FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" ); 260 FIELD_HTTP_CLIENT_URL.setAccessible( true ); 261 262 } 263 264 Object url = FIELD_HTTP_CLIENT_URL.get( http ); 265 266 //this will be a java.net.URL and now I can call the toString method 267 //on it which will return our full URI. 268 return url.toString(); 269 270 } catch ( Throwable t ) { 271 //log.error( t ); 272 return getResource(); 273 } 274 275 } 276 277 public InputStream getInputStream() throws NetworkException { 278 279 try { 280 return _getInputStream(); 281 282 } catch ( IOException e ) { 283 284 String message = null; 285 286 //the modern VM buries the FileNotFoundException which prevents a 287 //catch. Very very ugly. 288 if ( e.getCause() instanceof FileNotFoundException ) { 289 message = "File not found: " + e.getCause().getMessage(); 290 } else { 291 message = e.getMessage(); 292 } 293 294 throw new NetworkException( message, e, this, _url, _urlConnection ); 295 } 296 297 } 298 299 /** 300 * 301 * 302 * 303 */ 304 public InputStream _getInputStream() throws IOException { 305 306 if ( ! initConnection ) { initConnection(); } 307 308 String resource = this.getResource(); 309 310 //if we haven't pulled from the cache (as above) and we are offline we 311 //need to throw an exception. 312 if ( ResourceRequestFactory.isOffline() ) { 313 314 //see if we can return from the HTCache. 315 // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() && 316 // HTCache.hasContentInCache( resource ) ) 317 // return HTCache.getContentAsInputStream( resource ); 318 319 //if not we should throw an exception 320 throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " + 321 resource ); 322 323 } 324 325 //if we are using an input stream NOT from init() 326 if ( this.inputStream == null ) { 327 328 this.inputStream = _urlConnection.getInputStream(); 329 this.inputStream = new AdvancedInputStream( this.inputStream, this ); 330 331 //first decompress 332 if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) { 333 334 //note. the advanced input stream must be wrapped by a GZIP 335 //input stream and not vice-versa or we will end up with 336 //incorrect results. 337 338 this.inputStream = new GZIPInputStream( this.inputStream ); 339 340 } 341 342 // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) { 343 344 // System.out.println( "cache store for: " + 345 // resource + " as " + 346 // HTCache.getContentAsPath( resource ) ); 347 348 // //FIXME: performance improvement... don't write do disk and then 349 // //read from disk.? 350 351 // //store this content from the network and save it in the cache. Then fetch it and return 352 // HTCache.store( resource, this.inputStream ); 353 354 // return HTCache.getContentAsInputStream( resource ); 355 356 // } 357 358 } 359 360 setResource( getResourceFromRedirect() ); 361 362 //this is potentially teh cached input stream created if we have used 363 //the HTCache. 364 return inputStream; 365 366 } 367 368 /** 369 * Set the RequestMethod of this URLConnection. 370 * 371 * 372 */ 373 public void setRequestMethod( String method ) throws NetworkException { 374 375 try { 376 377 if ( _urlConnection instanceof HttpURLConnection ) { 378 379 ((HttpURLConnection)_urlConnection).setRequestMethod( method ); 380 381 } 382 383 } catch ( ProtocolException pe ) { 384 385 NetworkException ne = new NetworkException( pe.getMessage() ); 386 ne.initCause( pe ); 387 throw ne; 388 389 } 390 391 } 392 393 /** 394 * 395 * 396 * 397 */ 398 public int getContentLength() throws IOException { 399 400 if ( ! initConnection ) { initConnection(); } 401 402 //if ( _urlConnection instanceof HttpURLConnection ) { 403 404 return _urlConnection.getContentLength(); 405 406 } 407 408 public String getHeaderField( String name ) { 409 return _urlConnection.getHeaderField( name ); 410 } 411 412 }