View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.pipeline.stage;
19  
20  import java.io.BufferedInputStream;
21  import java.io.BufferedOutputStream;
22  import java.io.File;
23  import java.io.FileOutputStream;
24  import java.io.IOException;
25  import java.io.InputStream;
26  import java.io.OutputStream;
27  import java.net.HttpURLConnection;
28  import java.net.MalformedURLException;
29  import java.net.URL;
30  
31  import org.apache.commons.logging.Log;
32  import org.apache.commons.logging.LogFactory;
33  import org.apache.commons.pipeline.StageException;
34  import org.apache.commons.pipeline.validation.ConsumedTypes;
35  import org.apache.commons.pipeline.validation.ProducedTypes;
36  
37  
38  /**
39   * This {@link org.apache.commons.pipeline.Pipeline$Stage Stage} provides the 
40   * functionality needed to retrieve data from an HTTP URL. Multipart responses 
41   * are not yet supported.
42   */
43  @ConsumedTypes({URL.class, String.class})
44  @ProducedTypes({File.class})
45  public class HttpFileDownloadStage extends BaseStage {
46      private static final int BUFFER_SIZE = 10000;
47      private String workDir = null;
48      private Log log = LogFactory.getLog(HttpFileDownloadStage.class);
49      
50      public HttpFileDownloadStage() { }
51      
52      /**
53       * Creates a new HttpFileDownloadStage which will download files to the
54       * specified work directory.
55       * @param workDir the path to which files will be downloaded.
56       */
57      public HttpFileDownloadStage(String workDir) {
58          this.workDir = workDir;
59      }
60      
61      /**
62       * Removes a java.net.URL (an HTTP URL) or string representing a URL from 
63       * the input queue, and then retrieves the data at that URL and stores it
64       * in a temporary file. The file is stored in the directory specified by 
65       * {@link #setWorkDir(String) setWorkDir()}, or to the system default 
66       * temporary directory if no work directory is set.
67       *
68       * @param obj The URL from which to download data.
69       * @throws IllegalArgumentException if the parameter obj is not a string or 
70       * an instance of {@link java.net.URL}.
71       * @throws StageException if there is an error retrieving data from the 
72       * URL specified.
73       */
74      public void process(Object obj) throws StageException {
75          //Map params = new HashMap();
76          
77          URL url;
78          try {
79              if (obj instanceof String) {
80  //                String loc = (String) obj;
81  //                int paramIndex = loc.indexOf('?');
82  //                if (paramIndex > 0) {
83  //                    url = new URL(loc.substring(0, paramIndex));
84  //                    for (StringTokenizer st = new StringTokenizer(loc.substring(paramIndex + 1), "&"); st.hasMoreTokens();) {
85  //                        String tok = st.nextToken();
86  //                        int eqIndex = tok.indexOf('=');
87  //                        if (eqIndex > 0) {
88  //                            params.put(tok.substring(0, eqIndex), tok.substring(eqIndex + 1));
89  //                        }
90  //                        else {
91  //                            params.put(tok, null);
92  //                        }
93  //                    }
94  //                }
95  //                else {
96                  url = new URL((String) obj);
97  //                }
98              } else if (obj instanceof URL) {
99                  url = (URL) obj;
100             } else {
101                 throw new IllegalArgumentException("Unrecognized parameter class to process() for HttpFileDownload: " + obj.getClass().getName() + "; must be URL or String");
102             }
103         } catch (MalformedURLException e) {
104             throw new StageException(this, "Malformed URL: " + obj, e);
105         }
106         
107         log.debug("Retrieving data from " + url.toString());
108         
109 //        try {
110 //            url = handleRedirects(url);
111 //        }
112 //        catch (Exception e) { //catches MalformedURLException, IOException
113 //            throw new StageException("An error was encountered attempting to follow URL redirects from " + url.toString(), e);
114 //        }
115         
116         HttpURLConnection con = null;
117         try {
118             con = (java.net.HttpURLConnection) url.openConnection();
119 //            if (!params.isEmpty()) {
120 //                con.setRequestMethod("GET");
121 //                for (Iterator iter = params.entrySet().iterator(); iter.hasNext();) {
122 //                    Map.Entry entry = (Map.Entry) iter.next();
123 //                    con.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
124 //                }
125 //            }
126 
127             File workDir = (this.workDir == null) ? null : new File(this.workDir);
128             File workFile = File.createTempFile("http-file-download","tmp", workDir);
129             
130             InputStream in = new BufferedInputStream(con.getInputStream());
131             OutputStream out = new BufferedOutputStream(new FileOutputStream(workFile, false));
132             byte[] buffer = new byte[BUFFER_SIZE]; //attempt to read 10k at a time
133             for (int results = 0; (results = in.read(buffer)) != -1;) {
134                 out.write(buffer, 0, results);
135             }
136             out.close();
137             in.close();
138             
139             this.emit(workFile);
140         } catch (IOException e) {
141             throw new StageException(this, "An error occurred downloading a data file from " + url.toString(), e);
142         } finally {
143             con.disconnect();
144         }        
145     }
146     
147     
148     /**
149      * Sets the working directory for the file download. If the directory does
150      * not already exist, it will be created during the preprocess() step.
151      * If you do not set this directory, the work directory will be the
152      * default temporary directory for your machine type.
153      */
154     public void setWorkDir(String workDir) {
155         this.workDir = workDir;
156     }
157     
158     /**
159      * Returns the name of the file download directory.
160      */
161     public String getWorkDir() {
162         return this.workDir;
163     }
164     
165     /**
166      * Follows redirects from the specified URL and recursively returns the destination
167      * URL. This method does not check for circular redirects, so it is possible that a malicious
168      * site could force this method into infinite recursion.
169      *
170      * TODO: Add a max_hops parameterized version
171      */
172     public URL handleRedirects(URL url) throws IOException, MalformedURLException {
173         java.net.HttpURLConnection.setFollowRedirects(false);
174         HttpURLConnection con = (HttpURLConnection) url.openConnection();
175         int response = con.getResponseCode();
176         log.debug("Response code for " + url + " = " + response);
177         
178         if (response == java.net.HttpURLConnection.HTTP_MOVED_PERM || response == java.net.HttpURLConnection.HTTP_MOVED_TEMP) {
179             String location = con.getHeaderField("Location");
180             log.debug("Handling redirect to location: " + location);
181             
182             if (location.startsWith("http:")) {
183                 url = new URL(location);
184             } else if (location.startsWith("/")) {
185                 url = new URL("http://" + url.getHost() + location);
186             } else {
187                 url = new URL(con.getURL(), location);
188             }
189             
190             url = handleRedirects(url); // to handle nested redirections
191         }
192         
193         return url;
194     }
195 }