001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018 package org.apache.commons.pipeline.stage; 019 020 import java.io.BufferedInputStream; 021 import java.io.BufferedOutputStream; 022 import java.io.File; 023 import java.io.FileOutputStream; 024 import java.io.IOException; 025 import java.io.InputStream; 026 import java.io.OutputStream; 027 import java.net.HttpURLConnection; 028 import java.net.MalformedURLException; 029 import java.net.URL; 030 031 import org.apache.commons.logging.Log; 032 import org.apache.commons.logging.LogFactory; 033 import org.apache.commons.pipeline.StageException; 034 import org.apache.commons.pipeline.validation.ConsumedTypes; 035 import org.apache.commons.pipeline.validation.ProducedTypes; 036 037 038 /** 039 * This {@link org.apache.commons.pipeline.Pipeline$Stage Stage} provides the 040 * functionality needed to retrieve data from an HTTP URL. Multipart responses 041 * are not yet supported. 042 */ 043 @ConsumedTypes({URL.class, String.class}) 044 @ProducedTypes({File.class}) 045 public class HttpFileDownloadStage extends BaseStage { 046 private static final int BUFFER_SIZE = 10000; 047 private String workDir = null; 048 private Log log = LogFactory.getLog(HttpFileDownloadStage.class); 049 050 public HttpFileDownloadStage() { } 051 052 /** 053 * Creates a new HttpFileDownloadStage which will download files to the 054 * specified work directory. 055 * @param workDir the path to which files will be downloaded. 056 */ 057 public HttpFileDownloadStage(String workDir) { 058 this.workDir = workDir; 059 } 060 061 /** 062 * Removes a java.net.URL (an HTTP URL) or string representing a URL from 063 * the input queue, and then retrieves the data at that URL and stores it 064 * in a temporary file. The file is stored in the directory specified by 065 * {@link #setWorkDir(String) setWorkDir()}, or to the system default 066 * temporary directory if no work directory is set. 067 * 068 * @param obj The URL from which to download data. 069 * @throws IllegalArgumentException if the parameter obj is not a string or 070 * an instance of {@link java.net.URL}. 071 * @throws StageException if there is an error retrieving data from the 072 * URL specified. 073 */ 074 public void process(Object obj) throws StageException { 075 //Map params = new HashMap(); 076 077 URL url; 078 try { 079 if (obj instanceof String) { 080 // String loc = (String) obj; 081 // int paramIndex = loc.indexOf('?'); 082 // if (paramIndex > 0) { 083 // url = new URL(loc.substring(0, paramIndex)); 084 // for (StringTokenizer st = new StringTokenizer(loc.substring(paramIndex + 1), "&"); st.hasMoreTokens();) { 085 // String tok = st.nextToken(); 086 // int eqIndex = tok.indexOf('='); 087 // if (eqIndex > 0) { 088 // params.put(tok.substring(0, eqIndex), tok.substring(eqIndex + 1)); 089 // } 090 // else { 091 // params.put(tok, null); 092 // } 093 // } 094 // } 095 // else { 096 url = new URL((String) obj); 097 // } 098 } else if (obj instanceof URL) { 099 url = (URL) obj; 100 } else { 101 throw new IllegalArgumentException("Unrecognized parameter class to process() for HttpFileDownload: " + obj.getClass().getName() + "; must be URL or String"); 102 } 103 } catch (MalformedURLException e) { 104 throw new StageException(this, "Malformed URL: " + obj, e); 105 } 106 107 log.debug("Retrieving data from " + url.toString()); 108 109 // try { 110 // url = handleRedirects(url); 111 // } 112 // catch (Exception e) { //catches MalformedURLException, IOException 113 // throw new StageException("An error was encountered attempting to follow URL redirects from " + url.toString(), e); 114 // } 115 116 HttpURLConnection con = null; 117 try { 118 con = (java.net.HttpURLConnection) url.openConnection(); 119 // if (!params.isEmpty()) { 120 // con.setRequestMethod("GET"); 121 // for (Iterator iter = params.entrySet().iterator(); iter.hasNext();) { 122 // Map.Entry entry = (Map.Entry) iter.next(); 123 // con.setRequestProperty((String) entry.getKey(), (String) entry.getValue()); 124 // } 125 // } 126 127 File workDir = (this.workDir == null) ? null : new File(this.workDir); 128 File workFile = File.createTempFile("http-file-download","tmp", workDir); 129 130 InputStream in = new BufferedInputStream(con.getInputStream()); 131 OutputStream out = new BufferedOutputStream(new FileOutputStream(workFile, false)); 132 byte[] buffer = new byte[BUFFER_SIZE]; //attempt to read 10k at a time 133 for (int results = 0; (results = in.read(buffer)) != -1;) { 134 out.write(buffer, 0, results); 135 } 136 out.close(); 137 in.close(); 138 139 this.emit(workFile); 140 } catch (IOException e) { 141 throw new StageException(this, "An error occurred downloading a data file from " + url.toString(), e); 142 } finally { 143 con.disconnect(); 144 } 145 } 146 147 148 /** 149 * Sets the working directory for the file download. If the directory does 150 * not already exist, it will be created during the preprocess() step. 151 * If you do not set this directory, the work directory will be the 152 * default temporary directory for your machine type. 153 */ 154 public void setWorkDir(String workDir) { 155 this.workDir = workDir; 156 } 157 158 /** 159 * Returns the name of the file download directory. 160 */ 161 public String getWorkDir() { 162 return this.workDir; 163 } 164 165 /** 166 * Follows redirects from the specified URL and recursively returns the destination 167 * URL. This method does not check for circular redirects, so it is possible that a malicious 168 * site could force this method into infinite recursion. 169 * 170 * TODO: Add a max_hops parameterized version 171 */ 172 public URL handleRedirects(URL url) throws IOException, MalformedURLException { 173 java.net.HttpURLConnection.setFollowRedirects(false); 174 HttpURLConnection con = (HttpURLConnection) url.openConnection(); 175 int response = con.getResponseCode(); 176 log.debug("Response code for " + url + " = " + response); 177 178 if (response == java.net.HttpURLConnection.HTTP_MOVED_PERM || response == java.net.HttpURLConnection.HTTP_MOVED_TEMP) { 179 String location = con.getHeaderField("Location"); 180 log.debug("Handling redirect to location: " + location); 181 182 if (location.startsWith("http:")) { 183 url = new URL(location); 184 } else if (location.startsWith("/")) { 185 url = new URL("http://" + url.getHost() + location); 186 } else { 187 url = new URL(con.getURL(), location); 188 } 189 190 url = handleRedirects(url); // to handle nested redirections 191 } 192 193 return url; 194 } 195 }