View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser.network;
18  
19  import java.io.FileNotFoundException;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.ProtocolException;
23  import java.net.URL;
24  import java.net.URLConnection;
25  import java.util.Iterator;
26  import java.util.zip.GZIPInputStream;
27  
28  import org.apache.log4j.Logger;
29  
30  import sun.net.www.protocol.http.HttpURLConnection;
31  
32  /**
33   * ResourceRequest implementation that uses java.net.URL as the backend.
34   *
35   * Differences from other ResourceRequests.
36   *
37   * setRequestMethod() - Allows us to change the request type (HEAD, etc).
38   * 
39   * getContentLength() - Returns the length/size of the content represented by
40   * this resource.  Can be used by clients with setRequestMethod( "HEAD" ) to
41   * find the size of a remote resource without doing a full fetch.
42   *
43   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
44   * @version $Id: URLResourceRequest.java 561366 2007-07-31 15:58:29Z rahul $
45   */
46  public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest {
47  
48      private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() );
49  
50      public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding";
51      public static final String IF_NONE_MATCH_HEADER = "If-None-Match";
52      public static final String GZIP_ENCODING = "gzip";
53      public static final String USER_AGENT_HEADER = "User-Agent";
54  
55      /**
56       *
57       * Enable RFC 3228 HTTP Delta for feeds.
58       * 
59       * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html
60       * 
61       *  http://bobwyman.pubsub.com/main/2004/09/implementations.html
62       * 
63       */
64      public static boolean ENABLE_HTTP_DELTA_FEED_IM = false;
65      
66      public static String USER_AGENT
67          = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130";
68  
69      public static String USER_AGENT_MOZILLA
70          = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130";
71  
72      /**
73       * Not used anymore.  Provided for historical reasons.
74       */
75      public static final String REFERER
76          = "http://commons.apache.org/feedparser/?isAggregator=true";
77      
78      public static final int MAX_CONTENT_LENGTH = 1000000;
79      
80      private URL _url = null;
81  
82      private URLConnection _urlConnection = null;
83  
84      private InputStream inputStream = null;
85  
86      private boolean initConnection = false;
87      
88      /**
89       * 
90       * 
91       */
92      public void init() throws IOException {
93  
94          String resource = this.getResource();
95  
96          //if we are offline... we don't need to init.
97          if ( ResourceRequestFactory.isOffline() ) { return; } 
98  
99          //pull from the HTCache if it is enabled and then short-circuit so that
100         //we don't fetch from the network.
101 
102         //NOTE: currently removed because the htcache wasn't portable. I can OSS
103         //this in the future if necessary
104 
105         // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
106         //     HTCache.hasContentInCache( this.getResource() ) ) {
107         //
108         //    //get the input stream we can use from the HTCache.
109         //    this.inputStream = HTCache.getContentAsInputStream( resource );
110         //  return;
111         //    
112         // }
113 
114         _url = new URL( this.getResource() );
115         _urlConnection = _url.openConnection();
116 
117     }
118 
119     /**
120      * Init the actual connection.  Should be called AFTER init() but before
121      * getInputStream() so that we can set any runtime params requestMethod,
122      * etc.  If getInputStream() is called without an initConnection() we do
123      * this automatically.  initConnection() might not want to be called when
124      * doing a HEAD request.
125      * 
126      * 
127      */
128     public void initConnection() throws NetworkException {
129 
130         long before = System.currentTimeMillis();
131 
132         initConnection = true;
133 
134         this.fireInit();
135 
136         //FIXME: do smart user agent detection.  if this is a .html file we can
137         //set it to us Mozilla and if not we can use NewsMonster
138         //_urlConnection.setRequestProperty( "Referer", REFERER );
139 
140         String resource = this.getResource();
141 
142         //set the user agent if it hasn't ALREADY been set by the caller.
143         if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) {
144             _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT );
145         } 
146 
147         _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING );
148 
149         //copy over any headers set in the request..
150 
151         Iterator it = getRequestHeaderFields();
152 
153         while ( it.hasNext() ) {
154 
155             String key = (String)it.next();
156 
157             _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) );
158             
159         } 
160 
161         if ( _urlConnection instanceof HttpURLConnection ) {
162 
163             HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection;
164 
165             httpURLConn.setFollowRedirects( getFollowRedirects() );
166             httpURLConn.setInstanceFollowRedirects( getFollowRedirects() );
167 
168             if ( this.getIfModifiedSince() != -1 )
169                 httpURLConn.setIfModifiedSince( this.getIfModifiedSince() );
170 
171             if ( getEtag() != null ) {
172                 httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() );
173 
174                 //now support RFC3229 HTTP Delta
175                 //A-IM: feed, gzip
176 
177                 if ( ENABLE_HTTP_DELTA_FEED_IM ) {
178 
179                     //note that this will return HTTP 226 if used.
180                     //
181                     
182                     httpURLConn.setRequestProperty( "A-IM", "feed, gzip" );
183 
184                 }
185 
186             }
187             
188             try {
189 
190                 httpURLConn.connect();
191 
192                 //setResource( getRedirectedResource() );
193                 
194                 this.setResponseCode( httpURLConn.getResponseCode() ); 
195 
196             } catch ( IOException e ) {
197                 throw new NetworkException( e );
198             }
199 
200         } 
201 
202         int contentLength = _urlConnection.getContentLength();
203 
204         //bigger than 1 meg and it is a remote document (it is safe to process
205         //local documents)
206         if ( contentLength > MAX_CONTENT_LENGTH &&
207              this.getResource().startsWith( "file:" ) == false ) {
208 
209             //NOTE: make 100% sure this doens't just go ahead and download the
210             //file FIRST before doing a HEAD.  I think that's what happens but I
211             //might be wrong.
212             
213             throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() );
214             
215         } 
216 
217         long after = System.currentTimeMillis();
218         
219         log.debug( getResource() + " - init duration: " + (after-before) );
220         
221     }
222 
223     java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null;
224     java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null;
225     
226     /**
227      * This method used Reflection to pull out the redirected URL in
228      * java.net.URL.  Internally sun.net.www.protocol.http.HttpURLConnection
229      * stores a reference to sun.net.www.http.HttpClient which then in turn does
230      * all the redirection and stores the redirect java.net.URL.  We just use
231      * reflection to FETCH this URL and then call toString to get the correct
232      * value.
233      * 
234      * Java needs the concept of readonly private variables.
235      *
236      * 
237      */
238     public String getResourceFromRedirect() {
239 
240         try {
241 
242             if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) {
243 
244                 //Note: when using a FILE URL this won't work!                
245                 FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" );
246                 FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true );
247                 
248             }
249 
250             Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection );
251 
252             //when java.net.URL has already cleaned itself up 'http' will be
253             //null here.
254             if ( http == null )
255                 return getResource();
256 
257             if ( FIELD_HTTP_CLIENT_URL == null ) {
258 
259                 FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" );
260                 FIELD_HTTP_CLIENT_URL.setAccessible( true );
261                 
262             }
263             
264             Object url = FIELD_HTTP_CLIENT_URL.get( http );
265 
266             //this will be a java.net.URL and now I can call the toString method
267             //on it which will return our full URI.
268             return url.toString();
269             
270         } catch ( Throwable t ) {
271             //log.error( t );
272             return getResource();
273         }
274         
275     }
276 
277     public InputStream getInputStream() throws NetworkException {
278 
279         try {
280             return _getInputStream();
281 
282         } catch ( IOException e ) {
283 
284             String message = null;
285             
286             //the modern VM buries the FileNotFoundException which prevents a
287             //catch.  Very very ugly.
288             if ( e.getCause() instanceof FileNotFoundException ) {
289                 message = "File not found: " + e.getCause().getMessage();
290             } else {
291                 message = e.getMessage();
292             }
293 
294             throw new NetworkException( message, e, this, _url, _urlConnection );
295         }
296 
297     }
298     
299     /**
300      * 
301      *
302      * 
303      */
304     public InputStream _getInputStream() throws IOException {
305 
306         if ( ! initConnection ) { initConnection(); } 
307 
308         String resource = this.getResource();
309 
310         //if we haven't pulled from the cache (as above) and we are offline we
311         //need to throw an exception.
312         if ( ResourceRequestFactory.isOffline() ) {
313 
314             //see if we can return from the HTCache.
315             // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
316             //     HTCache.hasContentInCache( resource ) )
317             //    return HTCache.getContentAsInputStream( resource );
318 
319             //if not we should throw an exception
320             throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " +
321                                    resource );
322 
323         }
324 
325         //if we are using an input stream NOT from init() 
326         if ( this.inputStream == null ) {
327             
328             this.inputStream = _urlConnection.getInputStream();
329             this.inputStream = new AdvancedInputStream( this.inputStream, this );
330 
331             //first decompress
332             if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) {
333 
334                 //note.  the advanced input stream must be wrapped by a GZIP
335                 //input stream and not vice-versa or we will end up with
336                 //incorrect results.
337                 
338                 this.inputStream = new GZIPInputStream( this.inputStream );
339 
340             }
341         
342             // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) {
343                 
344             //     System.out.println( "cache store for: " +
345             //                         resource + " as " +
346             //                         HTCache.getContentAsPath( resource ) );
347 
348             //     //FIXME: performance improvement... don't write do disk and then
349             //     //read from disk.?
350                 
351             //     //store this content from the network and save it in the cache.  Then fetch it and return
352             //     HTCache.store( resource, this.inputStream );
353                 
354             //     return HTCache.getContentAsInputStream( resource );
355                 
356             // }
357 
358         }
359 
360         setResource( getResourceFromRedirect() );
361 
362         //this is potentially teh cached input stream created if we have used
363         //the HTCache.
364         return inputStream;
365         
366     }
367 
368     /**
369      * Set the RequestMethod of this URLConnection.
370      *
371      * 
372      */
373     public void setRequestMethod( String method ) throws NetworkException {
374 
375         try { 
376             
377             if ( _urlConnection instanceof HttpURLConnection ) {
378                 
379                 ((HttpURLConnection)_urlConnection).setRequestMethod( method );
380                 
381             } 
382             
383         } catch ( ProtocolException pe ) {
384             
385             NetworkException ne = new NetworkException( pe.getMessage() );
386             ne.initCause( pe );
387             throw ne;
388             
389         }
390 
391     }
392 
393     /**
394      * 
395      *
396      * 
397      */
398     public int getContentLength() throws IOException {
399 
400         if ( ! initConnection ) { initConnection(); } 
401 
402         //if ( _urlConnection instanceof HttpURLConnection ) {
403 
404         return  _urlConnection.getContentLength();
405         
406     }
407     
408     public String getHeaderField( String name ) {
409         return  _urlConnection.getHeaderField( name );
410     }
411 
412 }