1 /*
2 * Copyright 1999,2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.commons.feedparser.network;
18
19 import java.io.FileNotFoundException;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.ProtocolException;
23 import java.net.URL;
24 import java.net.URLConnection;
25 import java.util.Iterator;
26 import java.util.zip.GZIPInputStream;
27
28 import org.apache.log4j.Logger;
29
30 import sun.net.www.protocol.http.HttpURLConnection;
31
32 /**
33 * ResourceRequest implementation that uses java.net.URL as the backend.
34 *
35 * Differences from other ResourceRequests.
36 *
37 * setRequestMethod() - Allows us to change the request type (HEAD, etc).
38 *
39 * getContentLength() - Returns the length/size of the content represented by
40 * this resource. Can be used by clients with setRequestMethod( "HEAD" ) to
41 * find the size of a remote resource without doing a full fetch.
42 *
43 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
44 * @version $Id: URLResourceRequest.java 561366 2007-07-31 15:58:29Z rahul $
45 */
46 public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest {
47
48 private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() );
49
50 public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding";
51 public static final String IF_NONE_MATCH_HEADER = "If-None-Match";
52 public static final String GZIP_ENCODING = "gzip";
53 public static final String USER_AGENT_HEADER = "User-Agent";
54
55 /**
56 *
57 * Enable RFC 3228 HTTP Delta for feeds.
58 *
59 * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html
60 *
61 * http://bobwyman.pubsub.com/main/2004/09/implementations.html
62 *
63 */
64 public static boolean ENABLE_HTTP_DELTA_FEED_IM = false;
65
66 public static String USER_AGENT
67 = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130";
68
69 public static String USER_AGENT_MOZILLA
70 = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130";
71
72 /**
73 * Not used anymore. Provided for historical reasons.
74 */
75 public static final String REFERER
76 = "http://commons.apache.org/feedparser/?isAggregator=true";
77
78 public static final int MAX_CONTENT_LENGTH = 1000000;
79
80 private URL _url = null;
81
82 private URLConnection _urlConnection = null;
83
84 private InputStream inputStream = null;
85
86 private boolean initConnection = false;
87
88 /**
89 *
90 *
91 */
92 public void init() throws IOException {
93
94 String resource = this.getResource();
95
96 //if we are offline... we don't need to init.
97 if ( ResourceRequestFactory.isOffline() ) { return; }
98
99 //pull from the HTCache if it is enabled and then short-circuit so that
100 //we don't fetch from the network.
101
102 //NOTE: currently removed because the htcache wasn't portable. I can OSS
103 //this in the future if necessary
104
105 // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
106 // HTCache.hasContentInCache( this.getResource() ) ) {
107 //
108 // //get the input stream we can use from the HTCache.
109 // this.inputStream = HTCache.getContentAsInputStream( resource );
110 // return;
111 //
112 // }
113
114 _url = new URL( this.getResource() );
115 _urlConnection = _url.openConnection();
116
117 }
118
119 /**
120 * Init the actual connection. Should be called AFTER init() but before
121 * getInputStream() so that we can set any runtime params requestMethod,
122 * etc. If getInputStream() is called without an initConnection() we do
123 * this automatically. initConnection() might not want to be called when
124 * doing a HEAD request.
125 *
126 *
127 */
128 public void initConnection() throws NetworkException {
129
130 long before = System.currentTimeMillis();
131
132 initConnection = true;
133
134 this.fireInit();
135
136 //FIXME: do smart user agent detection. if this is a .html file we can
137 //set it to us Mozilla and if not we can use NewsMonster
138 //_urlConnection.setRequestProperty( "Referer", REFERER );
139
140 String resource = this.getResource();
141
142 //set the user agent if it hasn't ALREADY been set by the caller.
143 if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) {
144 _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT );
145 }
146
147 _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING );
148
149 //copy over any headers set in the request..
150
151 Iterator it = getRequestHeaderFields();
152
153 while ( it.hasNext() ) {
154
155 String key = (String)it.next();
156
157 _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) );
158
159 }
160
161 if ( _urlConnection instanceof HttpURLConnection ) {
162
163 HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection;
164
165 httpURLConn.setFollowRedirects( getFollowRedirects() );
166 httpURLConn.setInstanceFollowRedirects( getFollowRedirects() );
167
168 if ( this.getIfModifiedSince() != -1 )
169 httpURLConn.setIfModifiedSince( this.getIfModifiedSince() );
170
171 if ( getEtag() != null ) {
172 httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() );
173
174 //now support RFC3229 HTTP Delta
175 //A-IM: feed, gzip
176
177 if ( ENABLE_HTTP_DELTA_FEED_IM ) {
178
179 //note that this will return HTTP 226 if used.
180 //
181
182 httpURLConn.setRequestProperty( "A-IM", "feed, gzip" );
183
184 }
185
186 }
187
188 try {
189
190 httpURLConn.connect();
191
192 //setResource( getRedirectedResource() );
193
194 this.setResponseCode( httpURLConn.getResponseCode() );
195
196 } catch ( IOException e ) {
197 throw new NetworkException( e );
198 }
199
200 }
201
202 int contentLength = _urlConnection.getContentLength();
203
204 //bigger than 1 meg and it is a remote document (it is safe to process
205 //local documents)
206 if ( contentLength > MAX_CONTENT_LENGTH &&
207 this.getResource().startsWith( "file:" ) == false ) {
208
209 //NOTE: make 100% sure this doens't just go ahead and download the
210 //file FIRST before doing a HEAD. I think that's what happens but I
211 //might be wrong.
212
213 throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() );
214
215 }
216
217 long after = System.currentTimeMillis();
218
219 log.debug( getResource() + " - init duration: " + (after-before) );
220
221 }
222
223 java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null;
224 java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null;
225
226 /**
227 * This method used Reflection to pull out the redirected URL in
228 * java.net.URL. Internally sun.net.www.protocol.http.HttpURLConnection
229 * stores a reference to sun.net.www.http.HttpClient which then in turn does
230 * all the redirection and stores the redirect java.net.URL. We just use
231 * reflection to FETCH this URL and then call toString to get the correct
232 * value.
233 *
234 * Java needs the concept of readonly private variables.
235 *
236 *
237 */
238 public String getResourceFromRedirect() {
239
240 try {
241
242 if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) {
243
244 //Note: when using a FILE URL this won't work!
245 FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" );
246 FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true );
247
248 }
249
250 Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection );
251
252 //when java.net.URL has already cleaned itself up 'http' will be
253 //null here.
254 if ( http == null )
255 return getResource();
256
257 if ( FIELD_HTTP_CLIENT_URL == null ) {
258
259 FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" );
260 FIELD_HTTP_CLIENT_URL.setAccessible( true );
261
262 }
263
264 Object url = FIELD_HTTP_CLIENT_URL.get( http );
265
266 //this will be a java.net.URL and now I can call the toString method
267 //on it which will return our full URI.
268 return url.toString();
269
270 } catch ( Throwable t ) {
271 //log.error( t );
272 return getResource();
273 }
274
275 }
276
277 public InputStream getInputStream() throws NetworkException {
278
279 try {
280 return _getInputStream();
281
282 } catch ( IOException e ) {
283
284 String message = null;
285
286 //the modern VM buries the FileNotFoundException which prevents a
287 //catch. Very very ugly.
288 if ( e.getCause() instanceof FileNotFoundException ) {
289 message = "File not found: " + e.getCause().getMessage();
290 } else {
291 message = e.getMessage();
292 }
293
294 throw new NetworkException( message, e, this, _url, _urlConnection );
295 }
296
297 }
298
299 /**
300 *
301 *
302 *
303 */
304 public InputStream _getInputStream() throws IOException {
305
306 if ( ! initConnection ) { initConnection(); }
307
308 String resource = this.getResource();
309
310 //if we haven't pulled from the cache (as above) and we are offline we
311 //need to throw an exception.
312 if ( ResourceRequestFactory.isOffline() ) {
313
314 //see if we can return from the HTCache.
315 // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
316 // HTCache.hasContentInCache( resource ) )
317 // return HTCache.getContentAsInputStream( resource );
318
319 //if not we should throw an exception
320 throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " +
321 resource );
322
323 }
324
325 //if we are using an input stream NOT from init()
326 if ( this.inputStream == null ) {
327
328 this.inputStream = _urlConnection.getInputStream();
329 this.inputStream = new AdvancedInputStream( this.inputStream, this );
330
331 //first decompress
332 if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) {
333
334 //note. the advanced input stream must be wrapped by a GZIP
335 //input stream and not vice-versa or we will end up with
336 //incorrect results.
337
338 this.inputStream = new GZIPInputStream( this.inputStream );
339
340 }
341
342 // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) {
343
344 // System.out.println( "cache store for: " +
345 // resource + " as " +
346 // HTCache.getContentAsPath( resource ) );
347
348 // //FIXME: performance improvement... don't write do disk and then
349 // //read from disk.?
350
351 // //store this content from the network and save it in the cache. Then fetch it and return
352 // HTCache.store( resource, this.inputStream );
353
354 // return HTCache.getContentAsInputStream( resource );
355
356 // }
357
358 }
359
360 setResource( getResourceFromRedirect() );
361
362 //this is potentially teh cached input stream created if we have used
363 //the HTCache.
364 return inputStream;
365
366 }
367
368 /**
369 * Set the RequestMethod of this URLConnection.
370 *
371 *
372 */
373 public void setRequestMethod( String method ) throws NetworkException {
374
375 try {
376
377 if ( _urlConnection instanceof HttpURLConnection ) {
378
379 ((HttpURLConnection)_urlConnection).setRequestMethod( method );
380
381 }
382
383 } catch ( ProtocolException pe ) {
384
385 NetworkException ne = new NetworkException( pe.getMessage() );
386 ne.initCause( pe );
387 throw ne;
388
389 }
390
391 }
392
393 /**
394 *
395 *
396 *
397 */
398 public int getContentLength() throws IOException {
399
400 if ( ! initConnection ) { initConnection(); }
401
402 //if ( _urlConnection instanceof HttpURLConnection ) {
403
404 return _urlConnection.getContentLength();
405
406 }
407
408 public String getHeaderField( String name ) {
409 return _urlConnection.getHeaderField( name );
410 }
411
412 }