001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *     http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.pipeline.stage;
019    
020    import java.io.BufferedInputStream;
021    import java.io.BufferedOutputStream;
022    import java.io.File;
023    import java.io.FileOutputStream;
024    import java.io.IOException;
025    import java.io.InputStream;
026    import java.io.OutputStream;
027    import java.net.HttpURLConnection;
028    import java.net.MalformedURLException;
029    import java.net.URL;
030    
031    import org.apache.commons.logging.Log;
032    import org.apache.commons.logging.LogFactory;
033    import org.apache.commons.pipeline.StageException;
034    import org.apache.commons.pipeline.validation.ConsumedTypes;
035    import org.apache.commons.pipeline.validation.ProducedTypes;
036    
037    
038    /**
039     * This {@link org.apache.commons.pipeline.Pipeline$Stage Stage} provides the 
040     * functionality needed to retrieve data from an HTTP URL. Multipart responses 
041     * are not yet supported.
042     */
043    @ConsumedTypes({URL.class, String.class})
044    @ProducedTypes({File.class})
045    public class HttpFileDownloadStage extends BaseStage {
046        private static final int BUFFER_SIZE = 10000;
047        private String workDir = null;
048        private Log log = LogFactory.getLog(HttpFileDownloadStage.class);
049        
050        public HttpFileDownloadStage() { }
051        
052        /**
053         * Creates a new HttpFileDownloadStage which will download files to the
054         * specified work directory.
055         * @param workDir the path to which files will be downloaded.
056         */
057        public HttpFileDownloadStage(String workDir) {
058            this.workDir = workDir;
059        }
060        
061        /**
062         * Removes a java.net.URL (an HTTP URL) or string representing a URL from 
063         * the input queue, and then retrieves the data at that URL and stores it
064         * in a temporary file. The file is stored in the directory specified by 
065         * {@link #setWorkDir(String) setWorkDir()}, or to the system default 
066         * temporary directory if no work directory is set.
067         *
068         * @param obj The URL from which to download data.
069         * @throws IllegalArgumentException if the parameter obj is not a string or 
070         * an instance of {@link java.net.URL}.
071         * @throws StageException if there is an error retrieving data from the 
072         * URL specified.
073         */
074        public void process(Object obj) throws StageException {
075            //Map params = new HashMap();
076            
077            URL url;
078            try {
079                if (obj instanceof String) {
080    //                String loc = (String) obj;
081    //                int paramIndex = loc.indexOf('?');
082    //                if (paramIndex > 0) {
083    //                    url = new URL(loc.substring(0, paramIndex));
084    //                    for (StringTokenizer st = new StringTokenizer(loc.substring(paramIndex + 1), "&"); st.hasMoreTokens();) {
085    //                        String tok = st.nextToken();
086    //                        int eqIndex = tok.indexOf('=');
087    //                        if (eqIndex > 0) {
088    //                            params.put(tok.substring(0, eqIndex), tok.substring(eqIndex + 1));
089    //                        }
090    //                        else {
091    //                            params.put(tok, null);
092    //                        }
093    //                    }
094    //                }
095    //                else {
096                    url = new URL((String) obj);
097    //                }
098                } else if (obj instanceof URL) {
099                    url = (URL) obj;
100                } else {
101                    throw new IllegalArgumentException("Unrecognized parameter class to process() for HttpFileDownload: " + obj.getClass().getName() + "; must be URL or String");
102                }
103            } catch (MalformedURLException e) {
104                throw new StageException(this, "Malformed URL: " + obj, e);
105            }
106            
107            log.debug("Retrieving data from " + url.toString());
108            
109    //        try {
110    //            url = handleRedirects(url);
111    //        }
112    //        catch (Exception e) { //catches MalformedURLException, IOException
113    //            throw new StageException("An error was encountered attempting to follow URL redirects from " + url.toString(), e);
114    //        }
115            
116            HttpURLConnection con = null;
117            try {
118                con = (java.net.HttpURLConnection) url.openConnection();
119    //            if (!params.isEmpty()) {
120    //                con.setRequestMethod("GET");
121    //                for (Iterator iter = params.entrySet().iterator(); iter.hasNext();) {
122    //                    Map.Entry entry = (Map.Entry) iter.next();
123    //                    con.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
124    //                }
125    //            }
126    
127                File workDir = (this.workDir == null) ? null : new File(this.workDir);
128                File workFile = File.createTempFile("http-file-download","tmp", workDir);
129                
130                InputStream in = new BufferedInputStream(con.getInputStream());
131                OutputStream out = new BufferedOutputStream(new FileOutputStream(workFile, false));
132                byte[] buffer = new byte[BUFFER_SIZE]; //attempt to read 10k at a time
133                for (int results = 0; (results = in.read(buffer)) != -1;) {
134                    out.write(buffer, 0, results);
135                }
136                out.close();
137                in.close();
138                
139                this.emit(workFile);
140            } catch (IOException e) {
141                throw new StageException(this, "An error occurred downloading a data file from " + url.toString(), e);
142            } finally {
143                con.disconnect();
144            }        
145        }
146        
147        
148        /**
149         * Sets the working directory for the file download. If the directory does
150         * not already exist, it will be created during the preprocess() step.
151         * If you do not set this directory, the work directory will be the
152         * default temporary directory for your machine type.
153         */
154        public void setWorkDir(String workDir) {
155            this.workDir = workDir;
156        }
157        
158        /**
159         * Returns the name of the file download directory.
160         */
161        public String getWorkDir() {
162            return this.workDir;
163        }
164        
165        /**
166         * Follows redirects from the specified URL and recursively returns the destination
167         * URL. This method does not check for circular redirects, so it is possible that a malicious
168         * site could force this method into infinite recursion.
169         *
170         * TODO: Add a max_hops parameterized version
171         */
172        public URL handleRedirects(URL url) throws IOException, MalformedURLException {
173            java.net.HttpURLConnection.setFollowRedirects(false);
174            HttpURLConnection con = (HttpURLConnection) url.openConnection();
175            int response = con.getResponseCode();
176            log.debug("Response code for " + url + " = " + response);
177            
178            if (response == java.net.HttpURLConnection.HTTP_MOVED_PERM || response == java.net.HttpURLConnection.HTTP_MOVED_TEMP) {
179                String location = con.getHeaderField("Location");
180                log.debug("Handling redirect to location: " + location);
181                
182                if (location.startsWith("http:")) {
183                    url = new URL(location);
184                } else if (location.startsWith("/")) {
185                    url = new URL("http://" + url.getHost() + location);
186                } else {
187                    url = new URL(con.getURL(), location);
188                }
189                
190                url = handleRedirects(url); // to handle nested redirections
191            }
192            
193            return url;
194        }
195    }