1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.commons.pipeline.stage;
19
20 import java.io.BufferedInputStream;
21 import java.io.BufferedOutputStream;
22 import java.io.File;
23 import java.io.FileOutputStream;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.OutputStream;
27 import java.net.HttpURLConnection;
28 import java.net.MalformedURLException;
29 import java.net.URL;
30
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.apache.commons.pipeline.StageException;
34 import org.apache.commons.pipeline.validation.ConsumedTypes;
35 import org.apache.commons.pipeline.validation.ProducedTypes;
36
37
38 /**
39 * This {@link org.apache.commons.pipeline.Pipeline$Stage Stage} provides the
40 * functionality needed to retrieve data from an HTTP URL. Multipart responses
41 * are not yet supported.
42 */
43 @ConsumedTypes({URL.class, String.class})
44 @ProducedTypes({File.class})
45 public class HttpFileDownloadStage extends BaseStage {
46 private static final int BUFFER_SIZE = 10000;
47 private String workDir = null;
48 private Log log = LogFactory.getLog(HttpFileDownloadStage.class);
49
50 public HttpFileDownloadStage() { }
51
52 /**
53 * Creates a new HttpFileDownloadStage which will download files to the
54 * specified work directory.
55 * @param workDir the path to which files will be downloaded.
56 */
57 public HttpFileDownloadStage(String workDir) {
58 this.workDir = workDir;
59 }
60
61 /**
62 * Removes a java.net.URL (an HTTP URL) or string representing a URL from
63 * the input queue, and then retrieves the data at that URL and stores it
64 * in a temporary file. The file is stored in the directory specified by
65 * {@link #setWorkDir(String) setWorkDir()}, or to the system default
66 * temporary directory if no work directory is set.
67 *
68 * @param obj The URL from which to download data.
69 * @throws IllegalArgumentException if the parameter obj is not a string or
70 * an instance of {@link java.net.URL}.
71 * @throws StageException if there is an error retrieving data from the
72 * URL specified.
73 */
74 public void process(Object obj) throws StageException {
75 //Map params = new HashMap();
76
77 URL url;
78 try {
79 if (obj instanceof String) {
80 // String loc = (String) obj;
81 // int paramIndex = loc.indexOf('?');
82 // if (paramIndex > 0) {
83 // url = new URL(loc.substring(0, paramIndex));
84 // for (StringTokenizer st = new StringTokenizer(loc.substring(paramIndex + 1), "&"); st.hasMoreTokens();) {
85 // String tok = st.nextToken();
86 // int eqIndex = tok.indexOf('=');
87 // if (eqIndex > 0) {
88 // params.put(tok.substring(0, eqIndex), tok.substring(eqIndex + 1));
89 // }
90 // else {
91 // params.put(tok, null);
92 // }
93 // }
94 // }
95 // else {
96 url = new URL((String) obj);
97 // }
98 } else if (obj instanceof URL) {
99 url = (URL) obj;
100 } else {
101 throw new IllegalArgumentException("Unrecognized parameter class to process() for HttpFileDownload: " + obj.getClass().getName() + "; must be URL or String");
102 }
103 } catch (MalformedURLException e) {
104 throw new StageException(this, "Malformed URL: " + obj, e);
105 }
106
107 log.debug("Retrieving data from " + url.toString());
108
109 // try {
110 // url = handleRedirects(url);
111 // }
112 // catch (Exception e) { //catches MalformedURLException, IOException
113 // throw new StageException("An error was encountered attempting to follow URL redirects from " + url.toString(), e);
114 // }
115
116 HttpURLConnection con = null;
117 try {
118 con = (java.net.HttpURLConnection) url.openConnection();
119 // if (!params.isEmpty()) {
120 // con.setRequestMethod("GET");
121 // for (Iterator iter = params.entrySet().iterator(); iter.hasNext();) {
122 // Map.Entry entry = (Map.Entry) iter.next();
123 // con.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
124 // }
125 // }
126
127 File workDir = (this.workDir == null) ? null : new File(this.workDir);
128 File workFile = File.createTempFile("http-file-download","tmp", workDir);
129
130 InputStream in = new BufferedInputStream(con.getInputStream());
131 OutputStream out = new BufferedOutputStream(new FileOutputStream(workFile, false));
132 byte[] buffer = new byte[BUFFER_SIZE]; //attempt to read 10k at a time
133 for (int results = 0; (results = in.read(buffer)) != -1;) {
134 out.write(buffer, 0, results);
135 }
136 out.close();
137 in.close();
138
139 this.emit(workFile);
140 } catch (IOException e) {
141 throw new StageException(this, "An error occurred downloading a data file from " + url.toString(), e);
142 } finally {
143 con.disconnect();
144 }
145 }
146
147
148 /**
149 * Sets the working directory for the file download. If the directory does
150 * not already exist, it will be created during the preprocess() step.
151 * If you do not set this directory, the work directory will be the
152 * default temporary directory for your machine type.
153 */
154 public void setWorkDir(String workDir) {
155 this.workDir = workDir;
156 }
157
158 /**
159 * Returns the name of the file download directory.
160 */
161 public String getWorkDir() {
162 return this.workDir;
163 }
164
165 /**
166 * Follows redirects from the specified URL and recursively returns the destination
167 * URL. This method does not check for circular redirects, so it is possible that a malicious
168 * site could force this method into infinite recursion.
169 *
170 * TODO: Add a max_hops parameterized version
171 */
172 public URL handleRedirects(URL url) throws IOException, MalformedURLException {
173 java.net.HttpURLConnection.setFollowRedirects(false);
174 HttpURLConnection con = (HttpURLConnection) url.openConnection();
175 int response = con.getResponseCode();
176 log.debug("Response code for " + url + " = " + response);
177
178 if (response == java.net.HttpURLConnection.HTTP_MOVED_PERM || response == java.net.HttpURLConnection.HTTP_MOVED_TEMP) {
179 String location = con.getHeaderField("Location");
180 log.debug("Handling redirect to location: " + location);
181
182 if (location.startsWith("http:")) {
183 url = new URL(location);
184 } else if (location.startsWith("/")) {
185 url = new URL("http://" + url.getHost() + location);
186 } else {
187 url = new URL(con.getURL(), location);
188 }
189
190 url = handleRedirects(url); // to handle nested redirections
191 }
192
193 return url;
194 }
195 }