View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser.network;
18  
19  import java.io.*;
20  import java.net.*;
21  import java.util.*;
22  import java.util.zip.*;
23  
24  import org.apache.log4j.*;
25  
26  import sun.net.www.protocol.http.HttpURLConnection;
27  
28  /***
29   * ResourceRequest implementation that uses java.net.URL as the backend.
30   *
31   * Differences from other ResourceRequests.
32   *
33   * setRequestMethod() - Allows us to change the request type (HEAD, etc).
34   * 
35   * getContentLength() - Returns the length/size of the content represented by
36   * this resource.  Can be used by clients with setRequestMethod( "HEAD" ) to
37   * find the size of a remote resource without doing a full fetch.
38   *
39   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
40   * @version $Id: URLResourceRequest.java 159213 2005-03-27 23:32:01Z burton $
41   */
42  public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest {
43  
44      private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() );
45  
46      public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding";
47      public static final String IF_NONE_MATCH_HEADER = "If-None-Match";
48      public static final String GZIP_ENCODING = "gzip";
49      public static final String USER_AGENT_HEADER = "User-Agent";
50  
51      /***
52       *
53       * Enable RFC 3229 HTTP Delta for feeds.
54       * 
55       * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html
56       * 
57       *  http://bobwyman.pubsub.com/main/2004/09/implementations.html
58       * 
59       */
60      public static boolean ENABLE_HTTP_DELTA_FEED_IM = false;
61      
62      public static String USER_AGENT
63          = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130";
64  
65      public static String USER_AGENT_MOZILLA
66          = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130";
67  
68      /***
69       * Not used anymore.  Provided for historical reasons.
70       */
71      public static final String REFERER
72          = "http://commons.apache.org/feedparser/?isAggregator=true";
73      
74      public static final int MAX_CONTENT_LENGTH = 1000000;
75      
76      private URL _url = null;
77  
78      private URLConnection _urlConnection = null;
79  
80      private InputStream inputStream = null;
81  
82      private boolean initConnection = false;
83      
84      /***
85       * 
86       * 
87       */
88      public void init() throws IOException {
89  
90          String resource = this.getResource();
91  
92          //if we are offline... we don't need to init.
93          if ( ResourceRequestFactory.isOffline() ) { return; } 
94  
95          //pull from the HTCache if it is enabled and then short-circuit so that
96          //we don't fetch from the network.
97  
98          //NOTE: currently removed because the htcache wasn't portable. I can OSS
99          //this in the future if necessary
100 
101         // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
102         //     HTCache.hasContentInCache( this.getResource() ) ) {
103         //
104         //    //get the input stream we can use from the HTCache.
105         //    this.inputStream = HTCache.getContentAsInputStream( resource );
106         //  return;
107         //    
108         // }
109 
110         _url = new URL( this.getResource() );
111         _urlConnection = _url.openConnection();
112 
113     }
114 
115     /***
116      * Init the actual connection.  Should be called AFTER init() but before
117      * getInputStream() so that we can set any runtime params requestMethod,
118      * etc.  If getInputStream() is called without an initConnection() we do
119      * this automatically.  initConnection() might not want to be called when
120      * doing a HEAD request.
121      * 
122      * 
123      */
124     public void initConnection() throws NetworkException {
125 
126         long before = System.currentTimeMillis();
127 
128         initConnection = true;
129 
130         this.fireInit();
131 
132         //FIXME: do smart user agent detection.  if this is a .html file we can
133         //set it to us Mozilla and if not we can use NewsMonster
134         //_urlConnection.setRequestProperty( "Referer", REFERER );
135 
136         String resource = this.getResource();
137 
138         //set the user agent if it hasn't ALREADY been set by the caller.
139         if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) {
140             _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT );
141         } 
142 
143         _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING );
144 
145         //copy over any headers set in the request..
146 
147         Iterator it = getRequestHeaderFields();
148 
149         while ( it.hasNext() ) {
150 
151             String key = (String)it.next();
152 
153             _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) );
154             
155         } 
156 
157         if ( _urlConnection instanceof HttpURLConnection ) {
158 
159             HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection;
160 
161             httpURLConn.setFollowRedirects( getFollowRedirects() );
162             httpURLConn.setInstanceFollowRedirects( getFollowRedirects() );
163 
164             if ( this.getIfModifiedSince() != -1 )
165                 httpURLConn.setIfModifiedSince( this.getIfModifiedSince() );
166 
167             if ( getEtag() != null ) {
168                 httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() );
169 
170                 //now support RFC3229 HTTP Delta
171                 //A-IM: feed, gzip
172 
173                 if ( ENABLE_HTTP_DELTA_FEED_IM ) {
174 
175                     //note that this will return HTTP 226 if used.
176                     //
177                     
178                     httpURLConn.setRequestProperty( "A-IM", "feed, gzip" );
179 
180                 }
181 
182             }
183             
184             try {
185 
186                 httpURLConn.connect();
187 
188                 //setResource( getRedirectedResource() );
189                 
190                 this.setResponseCode( httpURLConn.getResponseCode() ); 
191 
192             } catch ( IOException e ) {
193                 throw new NetworkException( e );
194             }
195 
196         } 
197 
198         int contentLength = _urlConnection.getContentLength();
199 
200         //bigger than 1 meg and it is a remote document (it is safe to process
201         //local documents)
202         if ( contentLength > MAX_CONTENT_LENGTH &&
203              this.getResource().startsWith( "file:" ) == false ) {
204 
205             //NOTE: make 100% sure this doens't just go ahead and download the
206             //file FIRST before doing a HEAD.  I think that's what happens but I
207             //might be wrong.
208             
209             throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() );
210             
211         } 
212 
213         long after = System.currentTimeMillis();
214         
215         log.debug( getResource() + " - init duration: " + (after-before) );
216         
217     }
218 
219     java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null;
220     java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null;
221     
222     /***
223      * This method used Reflection to pull out the redirected URL in
224      * java.net.URL.  Internally sun.net.www.protocol.http.HttpURLConnection
225      * stores a reference to sun.net.www.http.HttpClient which then in turn does
226      * all the redirection and stores the redirect java.net.URL.  We just use
227      * reflection to FETCH this URL and then call toString to get the correct
228      * value.
229      * 
230      * Java needs the concept of readonly private variables.
231      *
232      * 
233      */
234     public String getResourceFromRedirect() {
235 
236         try {
237 
238             if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) {
239 
240                 //Note: when using a FILE URL this won't work!                
241                 FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" );
242                 FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true );
243                 
244             }
245 
246             Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection );
247 
248             //when java.net.URL has already cleaned itself up 'http' will be
249             //null here.
250             if ( http == null )
251                 return getResource();
252 
253             if ( FIELD_HTTP_CLIENT_URL == null ) {
254 
255                 FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" );
256                 FIELD_HTTP_CLIENT_URL.setAccessible( true );
257                 
258             }
259             
260             Object url = FIELD_HTTP_CLIENT_URL.get( http );
261 
262             //this will be a java.net.URL and now I can call the toString method
263             //on it which will return our full URI.
264             return url.toString();
265             
266         } catch ( Throwable t ) {
267             //log.error( t );
268             return getResource();
269         }
270         
271     }
272 
273     public InputStream getInputStream() throws NetworkException {
274 
275         try {
276             return _getInputStream();
277 
278         } catch ( IOException e ) {
279 
280             String message = null;
281             
282             //the modern VM buries the FileNotFoundException which prevents a
283             //catch.  Very very ugly.
284             if ( e.getCause() instanceof FileNotFoundException ) {
285                 message = "File not found: " + e.getCause().getMessage();
286             } else {
287                 message = e.getMessage();
288             }
289 
290             throw new NetworkException( message, e, this, _url, _urlConnection );
291         }
292 
293     }
294     
295     /***
296      * 
297      *
298      * 
299      */
300     public InputStream _getInputStream() throws IOException {
301 
302         if ( ! initConnection ) { initConnection(); } 
303 
304         String resource = this.getResource();
305 
306         //if we haven't pulled from the cache (as above) and we are offline we
307         //need to throw an exception.
308         if ( ResourceRequestFactory.isOffline() ) {
309 
310             //see if we can return from the HTCache.
311             // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
312             //     HTCache.hasContentInCache( resource ) )
313             //    return HTCache.getContentAsInputStream( resource );
314 
315             //if not we should throw an exception
316             throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " +
317                                    resource );
318 
319         }
320 
321         //if we are using an input stream NOT from init() 
322         if ( this.inputStream == null ) {
323             
324             this.inputStream = _urlConnection.getInputStream();
325             this.inputStream = new AdvancedInputStream( this.inputStream, this );
326 
327             //first decompress
328             if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) {
329 
330                 //note.  the advanced input stream must be wrapped by a GZIP
331                 //input stream and not vice-versa or we will end up with
332                 //incorrect results.
333                 
334                 this.inputStream = new GZIPInputStream( this.inputStream );
335 
336             }
337         
338             // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) {
339                 
340             //     System.out.println( "cache store for: " +
341             //                         resource + " as " +
342             //                         HTCache.getContentAsPath( resource ) );
343 
344             //     //FIXME: performance improvement... don't write do disk and then
345             //     //read from disk.?
346                 
347             //     //store this content from the network and save it in the cache.  Then fetch it and return
348             //     HTCache.store( resource, this.inputStream );
349                 
350             //     return HTCache.getContentAsInputStream( resource );
351                 
352             // }
353 
354         }
355 
356         setResource( getResourceFromRedirect() );
357 
358         //this is potentially teh cached input stream created if we have used
359         //the HTCache.
360         return inputStream;
361         
362     }
363 
364     /***
365      * Set the RequestMethod of this URLConnection.
366      *
367      * 
368      */
369     public void setRequestMethod( String method ) throws NetworkException {
370 
371         try { 
372             
373             if ( _urlConnection instanceof HttpURLConnection ) {
374                 
375                 ((HttpURLConnection)_urlConnection).setRequestMethod( method );
376                 
377             } 
378             
379         } catch ( ProtocolException pe ) {
380             
381             NetworkException ne = new NetworkException( pe.getMessage() );
382             ne.initCause( pe );
383             throw ne;
384             
385         }
386 
387     }
388 
389     /***
390      * 
391      *
392      * 
393      */
394     public int getContentLength() throws IOException {
395 
396         if ( ! initConnection ) { initConnection(); } 
397 
398         //if ( _urlConnection instanceof HttpURLConnection ) {
399 
400         return  _urlConnection.getContentLength();
401         
402     }
403     
404     public String getHeaderField( String name ) {
405         return  _urlConnection.getHeaderField( name );
406     }
407 
408 }