001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.pipeline.stage;
019
020 import java.io.BufferedInputStream;
021 import java.io.BufferedOutputStream;
022 import java.io.File;
023 import java.io.FileOutputStream;
024 import java.io.IOException;
025 import java.io.InputStream;
026 import java.io.OutputStream;
027 import java.net.HttpURLConnection;
028 import java.net.MalformedURLException;
029 import java.net.URL;
030
031 import org.apache.commons.logging.Log;
032 import org.apache.commons.logging.LogFactory;
033 import org.apache.commons.pipeline.StageException;
034 import org.apache.commons.pipeline.validation.ConsumedTypes;
035 import org.apache.commons.pipeline.validation.ProducedTypes;
036
037
038 /**
039 * This {@link org.apache.commons.pipeline.Pipeline$Stage Stage} provides the
040 * functionality needed to retrieve data from an HTTP URL. Multipart responses
041 * are not yet supported.
042 */
043 @ConsumedTypes({URL.class, String.class})
044 @ProducedTypes({File.class})
045 public class HttpFileDownloadStage extends BaseStage {
046 private static final int BUFFER_SIZE = 10000;
047 private String workDir = null;
048 private Log log = LogFactory.getLog(HttpFileDownloadStage.class);
049
050 public HttpFileDownloadStage() { }
051
052 /**
053 * Creates a new HttpFileDownloadStage which will download files to the
054 * specified work directory.
055 * @param workDir the path to which files will be downloaded.
056 */
057 public HttpFileDownloadStage(String workDir) {
058 this.workDir = workDir;
059 }
060
061 /**
062 * Removes a java.net.URL (an HTTP URL) or string representing a URL from
063 * the input queue, and then retrieves the data at that URL and stores it
064 * in a temporary file. The file is stored in the directory specified by
065 * {@link #setWorkDir(String) setWorkDir()}, or to the system default
066 * temporary directory if no work directory is set.
067 *
068 * @param obj The URL from which to download data.
069 * @throws IllegalArgumentException if the parameter obj is not a string or
070 * an instance of {@link java.net.URL}.
071 * @throws StageException if there is an error retrieving data from the
072 * URL specified.
073 */
074 public void process(Object obj) throws StageException {
075 //Map params = new HashMap();
076
077 URL url;
078 try {
079 if (obj instanceof String) {
080 // String loc = (String) obj;
081 // int paramIndex = loc.indexOf('?');
082 // if (paramIndex > 0) {
083 // url = new URL(loc.substring(0, paramIndex));
084 // for (StringTokenizer st = new StringTokenizer(loc.substring(paramIndex + 1), "&"); st.hasMoreTokens();) {
085 // String tok = st.nextToken();
086 // int eqIndex = tok.indexOf('=');
087 // if (eqIndex > 0) {
088 // params.put(tok.substring(0, eqIndex), tok.substring(eqIndex + 1));
089 // }
090 // else {
091 // params.put(tok, null);
092 // }
093 // }
094 // }
095 // else {
096 url = new URL((String) obj);
097 // }
098 } else if (obj instanceof URL) {
099 url = (URL) obj;
100 } else {
101 throw new IllegalArgumentException("Unrecognized parameter class to process() for HttpFileDownload: " + obj.getClass().getName() + "; must be URL or String");
102 }
103 } catch (MalformedURLException e) {
104 throw new StageException(this, "Malformed URL: " + obj, e);
105 }
106
107 log.debug("Retrieving data from " + url.toString());
108
109 // try {
110 // url = handleRedirects(url);
111 // }
112 // catch (Exception e) { //catches MalformedURLException, IOException
113 // throw new StageException("An error was encountered attempting to follow URL redirects from " + url.toString(), e);
114 // }
115
116 HttpURLConnection con = null;
117 try {
118 con = (java.net.HttpURLConnection) url.openConnection();
119 // if (!params.isEmpty()) {
120 // con.setRequestMethod("GET");
121 // for (Iterator iter = params.entrySet().iterator(); iter.hasNext();) {
122 // Map.Entry entry = (Map.Entry) iter.next();
123 // con.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
124 // }
125 // }
126
127 File workDir = (this.workDir == null) ? null : new File(this.workDir);
128 File workFile = File.createTempFile("http-file-download","tmp", workDir);
129
130 InputStream in = new BufferedInputStream(con.getInputStream());
131 OutputStream out = new BufferedOutputStream(new FileOutputStream(workFile, false));
132 byte[] buffer = new byte[BUFFER_SIZE]; //attempt to read 10k at a time
133 for (int results = 0; (results = in.read(buffer)) != -1;) {
134 out.write(buffer, 0, results);
135 }
136 out.close();
137 in.close();
138
139 this.emit(workFile);
140 } catch (IOException e) {
141 throw new StageException(this, "An error occurred downloading a data file from " + url.toString(), e);
142 } finally {
143 con.disconnect();
144 }
145 }
146
147
148 /**
149 * Sets the working directory for the file download. If the directory does
150 * not already exist, it will be created during the preprocess() step.
151 * If you do not set this directory, the work directory will be the
152 * default temporary directory for your machine type.
153 */
154 public void setWorkDir(String workDir) {
155 this.workDir = workDir;
156 }
157
158 /**
159 * Returns the name of the file download directory.
160 */
161 public String getWorkDir() {
162 return this.workDir;
163 }
164
165 /**
166 * Follows redirects from the specified URL and recursively returns the destination
167 * URL. This method does not check for circular redirects, so it is possible that a malicious
168 * site could force this method into infinite recursion.
169 *
170 * TODO: Add a max_hops parameterized version
171 */
172 public URL handleRedirects(URL url) throws IOException, MalformedURLException {
173 java.net.HttpURLConnection.setFollowRedirects(false);
174 HttpURLConnection con = (HttpURLConnection) url.openConnection();
175 int response = con.getResponseCode();
176 log.debug("Response code for " + url + " = " + response);
177
178 if (response == java.net.HttpURLConnection.HTTP_MOVED_PERM || response == java.net.HttpURLConnection.HTTP_MOVED_TEMP) {
179 String location = con.getHeaderField("Location");
180 log.debug("Handling redirect to location: " + location);
181
182 if (location.startsWith("http:")) {
183 url = new URL(location);
184 } else if (location.startsWith("/")) {
185 url = new URL("http://" + url.getHost() + location);
186 } else {
187 url = new URL(con.getURL(), location);
188 }
189
190 url = handleRedirects(url); // to handle nested redirections
191 }
192
193 return url;
194 }
195 }