001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.network;
018    
019    import java.io.FileNotFoundException;
020    import java.io.IOException;
021    import java.io.InputStream;
022    import java.net.ProtocolException;
023    import java.net.URL;
024    import java.net.URLConnection;
025    import java.util.Iterator;
026    import java.util.zip.GZIPInputStream;
027    
028    import org.apache.log4j.Logger;
029    
030    import sun.net.www.protocol.http.HttpURLConnection;
031    
032    /**
033     * ResourceRequest implementation that uses java.net.URL as the backend.
034     *
035     * Differences from other ResourceRequests.
036     *
037     * setRequestMethod() - Allows us to change the request type (HEAD, etc).
038     * 
039     * getContentLength() - Returns the length/size of the content represented by
040     * this resource.  Can be used by clients with setRequestMethod( "HEAD" ) to
041     * find the size of a remote resource without doing a full fetch.
042     *
043     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
044     * @version $Id: URLResourceRequest.java 561366 2007-07-31 15:58:29Z rahul $
045     */
046    public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest {
047    
048        private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() );
049    
050        public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding";
051        public static final String IF_NONE_MATCH_HEADER = "If-None-Match";
052        public static final String GZIP_ENCODING = "gzip";
053        public static final String USER_AGENT_HEADER = "User-Agent";
054    
055        /**
056         *
057         * Enable RFC 3228 HTTP Delta for feeds.
058         * 
059         * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html
060         * 
061         *  http://bobwyman.pubsub.com/main/2004/09/implementations.html
062         * 
063         */
064        public static boolean ENABLE_HTTP_DELTA_FEED_IM = false;
065        
066        public static String USER_AGENT
067            = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130";
068    
069        public static String USER_AGENT_MOZILLA
070            = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130";
071    
072        /**
073         * Not used anymore.  Provided for historical reasons.
074         */
075        public static final String REFERER
076            = "http://commons.apache.org/feedparser/?isAggregator=true";
077        
078        public static final int MAX_CONTENT_LENGTH = 1000000;
079        
080        private URL _url = null;
081    
082        private URLConnection _urlConnection = null;
083    
084        private InputStream inputStream = null;
085    
086        private boolean initConnection = false;
087        
088        /**
089         * 
090         * 
091         */
092        public void init() throws IOException {
093    
094            String resource = this.getResource();
095    
096            //if we are offline... we don't need to init.
097            if ( ResourceRequestFactory.isOffline() ) { return; } 
098    
099            //pull from the HTCache if it is enabled and then short-circuit so that
100            //we don't fetch from the network.
101    
102            //NOTE: currently removed because the htcache wasn't portable. I can OSS
103            //this in the future if necessary
104    
105            // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
106            //     HTCache.hasContentInCache( this.getResource() ) ) {
107            //
108            //    //get the input stream we can use from the HTCache.
109            //    this.inputStream = HTCache.getContentAsInputStream( resource );
110            //  return;
111            //    
112            // }
113    
114            _url = new URL( this.getResource() );
115            _urlConnection = _url.openConnection();
116    
117        }
118    
119        /**
120         * Init the actual connection.  Should be called AFTER init() but before
121         * getInputStream() so that we can set any runtime params requestMethod,
122         * etc.  If getInputStream() is called without an initConnection() we do
123         * this automatically.  initConnection() might not want to be called when
124         * doing a HEAD request.
125         * 
126         * 
127         */
128        public void initConnection() throws NetworkException {
129    
130            long before = System.currentTimeMillis();
131    
132            initConnection = true;
133    
134            this.fireInit();
135    
136            //FIXME: do smart user agent detection.  if this is a .html file we can
137            //set it to us Mozilla and if not we can use NewsMonster
138            //_urlConnection.setRequestProperty( "Referer", REFERER );
139    
140            String resource = this.getResource();
141    
142            //set the user agent if it hasn't ALREADY been set by the caller.
143            if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) {
144                _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT );
145            } 
146    
147            _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING );
148    
149            //copy over any headers set in the request..
150    
151            Iterator it = getRequestHeaderFields();
152    
153            while ( it.hasNext() ) {
154    
155                String key = (String)it.next();
156    
157                _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) );
158                
159            } 
160    
161            if ( _urlConnection instanceof HttpURLConnection ) {
162    
163                HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection;
164    
165                httpURLConn.setFollowRedirects( getFollowRedirects() );
166                httpURLConn.setInstanceFollowRedirects( getFollowRedirects() );
167    
168                if ( this.getIfModifiedSince() != -1 )
169                    httpURLConn.setIfModifiedSince( this.getIfModifiedSince() );
170    
171                if ( getEtag() != null ) {
172                    httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() );
173    
174                    //now support RFC3229 HTTP Delta
175                    //A-IM: feed, gzip
176    
177                    if ( ENABLE_HTTP_DELTA_FEED_IM ) {
178    
179                        //note that this will return HTTP 226 if used.
180                        //
181                        
182                        httpURLConn.setRequestProperty( "A-IM", "feed, gzip" );
183    
184                    }
185    
186                }
187                
188                try {
189    
190                    httpURLConn.connect();
191    
192                    //setResource( getRedirectedResource() );
193                    
194                    this.setResponseCode( httpURLConn.getResponseCode() ); 
195    
196                } catch ( IOException e ) {
197                    throw new NetworkException( e );
198                }
199    
200            } 
201    
202            int contentLength = _urlConnection.getContentLength();
203    
204            //bigger than 1 meg and it is a remote document (it is safe to process
205            //local documents)
206            if ( contentLength > MAX_CONTENT_LENGTH &&
207                 this.getResource().startsWith( "file:" ) == false ) {
208    
209                //NOTE: make 100% sure this doens't just go ahead and download the
210                //file FIRST before doing a HEAD.  I think that's what happens but I
211                //might be wrong.
212                
213                throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() );
214                
215            } 
216    
217            long after = System.currentTimeMillis();
218            
219            log.debug( getResource() + " - init duration: " + (after-before) );
220            
221        }
222    
223        java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null;
224        java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null;
225        
226        /**
227         * This method used Reflection to pull out the redirected URL in
228         * java.net.URL.  Internally sun.net.www.protocol.http.HttpURLConnection
229         * stores a reference to sun.net.www.http.HttpClient which then in turn does
230         * all the redirection and stores the redirect java.net.URL.  We just use
231         * reflection to FETCH this URL and then call toString to get the correct
232         * value.
233         * 
234         * Java needs the concept of readonly private variables.
235         *
236         * 
237         */
238        public String getResourceFromRedirect() {
239    
240            try {
241    
242                if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) {
243    
244                    //Note: when using a FILE URL this won't work!                
245                    FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" );
246                    FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true );
247                    
248                }
249    
250                Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection );
251    
252                //when java.net.URL has already cleaned itself up 'http' will be
253                //null here.
254                if ( http == null )
255                    return getResource();
256    
257                if ( FIELD_HTTP_CLIENT_URL == null ) {
258    
259                    FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" );
260                    FIELD_HTTP_CLIENT_URL.setAccessible( true );
261                    
262                }
263                
264                Object url = FIELD_HTTP_CLIENT_URL.get( http );
265    
266                //this will be a java.net.URL and now I can call the toString method
267                //on it which will return our full URI.
268                return url.toString();
269                
270            } catch ( Throwable t ) {
271                //log.error( t );
272                return getResource();
273            }
274            
275        }
276    
277        public InputStream getInputStream() throws NetworkException {
278    
279            try {
280                return _getInputStream();
281    
282            } catch ( IOException e ) {
283    
284                String message = null;
285                
286                //the modern VM buries the FileNotFoundException which prevents a
287                //catch.  Very very ugly.
288                if ( e.getCause() instanceof FileNotFoundException ) {
289                    message = "File not found: " + e.getCause().getMessage();
290                } else {
291                    message = e.getMessage();
292                }
293    
294                throw new NetworkException( message, e, this, _url, _urlConnection );
295            }
296    
297        }
298        
299        /**
300         * 
301         *
302         * 
303         */
304        public InputStream _getInputStream() throws IOException {
305    
306            if ( ! initConnection ) { initConnection(); } 
307    
308            String resource = this.getResource();
309    
310            //if we haven't pulled from the cache (as above) and we are offline we
311            //need to throw an exception.
312            if ( ResourceRequestFactory.isOffline() ) {
313    
314                //see if we can return from the HTCache.
315                // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
316                //     HTCache.hasContentInCache( resource ) )
317                //    return HTCache.getContentAsInputStream( resource );
318    
319                //if not we should throw an exception
320                throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " +
321                                       resource );
322    
323            }
324    
325            //if we are using an input stream NOT from init() 
326            if ( this.inputStream == null ) {
327                
328                this.inputStream = _urlConnection.getInputStream();
329                this.inputStream = new AdvancedInputStream( this.inputStream, this );
330    
331                //first decompress
332                if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) {
333    
334                    //note.  the advanced input stream must be wrapped by a GZIP
335                    //input stream and not vice-versa or we will end up with
336                    //incorrect results.
337                    
338                    this.inputStream = new GZIPInputStream( this.inputStream );
339    
340                }
341            
342                // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) {
343                    
344                //     System.out.println( "cache store for: " +
345                //                         resource + " as " +
346                //                         HTCache.getContentAsPath( resource ) );
347    
348                //     //FIXME: performance improvement... don't write do disk and then
349                //     //read from disk.?
350                    
351                //     //store this content from the network and save it in the cache.  Then fetch it and return
352                //     HTCache.store( resource, this.inputStream );
353                    
354                //     return HTCache.getContentAsInputStream( resource );
355                    
356                // }
357    
358            }
359    
360            setResource( getResourceFromRedirect() );
361    
362            //this is potentially teh cached input stream created if we have used
363            //the HTCache.
364            return inputStream;
365            
366        }
367    
368        /**
369         * Set the RequestMethod of this URLConnection.
370         *
371         * 
372         */
373        public void setRequestMethod( String method ) throws NetworkException {
374    
375            try { 
376                
377                if ( _urlConnection instanceof HttpURLConnection ) {
378                    
379                    ((HttpURLConnection)_urlConnection).setRequestMethod( method );
380                    
381                } 
382                
383            } catch ( ProtocolException pe ) {
384                
385                NetworkException ne = new NetworkException( pe.getMessage() );
386                ne.initCause( pe );
387                throw ne;
388                
389            }
390    
391        }
392    
393        /**
394         * 
395         *
396         * 
397         */
398        public int getContentLength() throws IOException {
399    
400            if ( ! initConnection ) { initConnection(); } 
401    
402            //if ( _urlConnection instanceof HttpURLConnection ) {
403    
404            return  _urlConnection.getContentLength();
405            
406        }
407        
408        public String getHeaderField( String name ) {
409            return  _urlConnection.getHeaderField( name );
410        }
411    
412    }