001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate;
018    
019    import java.util.regex.Pattern;
020    
021    import org.apache.log4j.Logger;
022    
/**
 * Static helpers for turning the links found in a document into absolute
 * URLs, given the absolute URL of the document (the "resource") they were
 * found in.
 *
 * <p>Only {@code http:}, {@code https:} and {@code file:} URLs are handled.
 * Scheme names are compared case-sensitively.  For {@code file:} URLs the
 * text after the scheme is treated as a plain path; no host is extracted.
 *
 * <p>All methods are stateless and have no side effects.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 * @version $Id: ResourceExpander.java 373622 2006-01-30 22:53:00Z mvdb $
 */
public class ResourceExpander {

    /**
     * A regexp to determine if a URL has a hierarchical scheme, such as
     * "http://foo.com".
     */
    protected static Pattern schemePattern = Pattern.compile("^\\w*://.*");

    /**
     * Matches the start of a link that names a scheme without "//", such as
     * "mailto:" or "javascript:".  Dots are deliberately not accepted in the
     * scheme name so that "page.html?x=a:b" is never mistaken for a scheme.
     */
    private static final Pattern OPAQUE_SCHEME_PATTERN =
        Pattern.compile("^[A-Za-z][A-Za-z0-9+\\-]*:");

    private static final String SCHEME_SEPARATOR = "://";

    private static final String FILE_PREFIX = "file:";

    /**
     * Expand a link relative to the current site.  This takes care of links
     * such as
     *
     * <pre>
     * /foo.html   becomes  http://site.com/foo.html
     * foo.html    becomes  http://site.com/base/foo.html
     * ../foo.html becomes  http://site.com/foo.html
     * #top        becomes  http://site.com/base/index.html#top
     * //cdn.com/x becomes  http://cdn.com/x (https: if the resource is https)
     * </pre>
     *
     * Links should <em>always</em> be expanded before they are used.
     *
     * <p>Whether the resource names a directory or a file is decided purely by
     * its trailing slash: {@code http://site.com/base/} is a directory, while
     * {@code http://site.com/base} is a file inside the site root.
     *
     * <p>Leading {@code ./} and {@code ../} segments of the link are resolved;
     * {@code ..} segments that would climb above the root of the resource are
     * ignored.  Dot segments in the middle of the link are left untouched.
     *
     * @param resource The absolute base URL that will be used to expand the
     * link, such as "http://www.codinginparadise.org".  When {@code null} the
     * link cannot be resolved and is returned unchanged (a "//host/path" link
     * is still given the "http:" scheme).
     * @param link The link to possibly expand, such as "/index.rdf" or
     * "http://www.somehost.com/somepage.html".
     * @return the expanded link; the link itself when it is already absolute
     * or uses a scheme this class does not handle (for example "mailto:");
     * {@code null} when the link is {@code null}.
     */
    public static String expand( String resource, String link ) {

        if ( link == null ) {
            return null;
        }

        // Foreign schemes are not ours to touch, and absolute links need no work.
        if ( !isValidScheme( link ) || isExpanded( link ) ) {
            return link;
        }

        // Network-path reference ("//host/path"): only the scheme is missing.
        if ( link.startsWith( "//" ) ) {
            return networkScheme( resource ) + link;
        }

        // Everything below needs a base to resolve against.
        if ( resource == null ) {
            return link;
        }

        if ( link.startsWith( "/" ) ) {
            return getSite( resource ) + link;
        }

        if ( link.startsWith( "#" ) ) {
            // Replace, rather than append to, any fragment already present.
            return cutAt( resource, '#' ) + link;
        }

        return resolveRelative( resource, link );

    }

    /**
     * Return true if the given link is ALREADY expanded, i.e. it is an
     * absolute http, https or file URL.
     *
     * @param resource the link to test; {@code null} yields {@code false}.
     * @return true when the link starts with "http://", "https://" or
     * "file://".
     */
    public static boolean isExpanded( String resource ) {
        if ( resource == null ) {
            return false;
        }
        return resource.startsWith( "http://" )
            || resource.startsWith( "https://" )
            || resource.startsWith( "file://" );
    }

    /**
     * Return true if this link uses a scheme this class can work with and
     * should therefore be expanded.
     *
     * @param resource the link to test; {@code null} yields {@code false}.
     * @return true for links without any scheme and for http:, https: and
     * file: links; false for every other scheme (such as "ftp://host",
     * "mailto:someone" or "javascript:void(0)").
     */
    public static boolean isValidScheme( String resource ) {

        if ( resource == null ) {
            return false;
        }

        if ( resource.startsWith( "http:" )
             || resource.startsWith( "https:" )
             || resource.startsWith( FILE_PREFIX ) ) {
            return true;
        }

        // No scheme at all means a relative link, which is what we expand.
        return !hasScheme( resource )
            && !OPAQUE_SCHEME_PATTERN.matcher( resource ).lookingAt();

    }

    /**
     * Determines if the given resource has a scheme. (i.e. does it start with
     * "http://foo.com" or does it just have "foo.com").
     *
     * @param resource the URL to test; {@code null} yields {@code false}.
     * @return true when the resource matches {@link #schemePattern}.
     */
    public static boolean hasScheme( String resource ) {
        return resource != null && schemePattern.matcher( resource ).matches();
    }

    /**
     * Get the site for this resource.  For example:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com
     *
     * for file: URLs we return file://
     *
     * @param resource the absolute URL to inspect; must not be {@code null}.
     * @return the scheme and authority of the resource, without a trailing
     * slash, query or fragment.
     */
    public static String getSite( String resource ) {

        if ( resource.startsWith( FILE_PREFIX ) ) {
            return "file://";
        }

        return resource.substring( 0, rootLength( resource ) );

    }

    /**
     * Given a URL get the domain name, taken to be the last two dot-separated
     * labels of the host: "http://www.foo.com/x" yields "foo.com".
     *
     * <p>A port, if present, is left attached to the result.  No public
     * suffix handling is attempted, so "a.b.co.uk" yields "co.uk".
     *
     * @param resource the absolute URL to inspect; must not be {@code null}.
     * @return the domain, or the empty string for file: URLs.
     */
    public static String getDomain( String resource ) {

        String site = getSite( resource );

        // Drop the scheme, whatever its length, instead of assuming "http://".
        int separator = site.indexOf( SCHEME_SEPARATOR );
        String host = ( separator == -1 )
            ? site
            : site.substring( separator + SCHEME_SEPARATOR.length() );

        int lastDot = host.lastIndexOf( '.' );
        int previousDot = ( lastDot <= 0 ) ? -1 : host.lastIndexOf( '.', lastDot - 1 );

        // Fewer than two dots: the whole host is the domain.
        return ( previousDot == -1 ) ? host : host.substring( previousDot + 1 );

    }

    /**
     * Get the base of this URL.  For example if we are given:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com/directory
     *
     * Any query string or fragment is discarded first.  A URL without a path,
     * such as http://www.cnn.com, is returned as is.
     *
     * @param resource the absolute URL to inspect.
     * @return the resource up to, but not including, the last slash of its
     * path, or {@code null} when the resource is {@code null}.
     */
    public static String getBase( String resource ) {

        if ( resource == null ) {
            return null;
        }

        String path = cutAt( cutAt( resource, '#' ), '?' );

        int lastSlash = path.lastIndexOf( '/' );

        // A last slash inside the scheme/authority means there is no path.
        if ( lastSlash < rootLength( path ) ) {
            return path;
        }

        return path.substring( 0, lastSlash );

    }

    public static void main( String[] args ) throws Exception {

        System.out.println( expand( "http://peerfear.org/foo/bar/", "../../blog" ) );

        System.out.println( expand( "http://peerfear.org/foo/bar/", "../../index.html" ) );

        System.out.println( expand( "http://peerfear.org/blog/", ".." ) );

        System.out.println( expand( "http://peerfear.org", "/blog" ) );
        System.out.println( expand( "http://peerfear.org", "http://peerfear.org" ) );

        System.out.println( expand( "http://peerfear.org", "blog" ) );
        System.out.println( expand( "http://peerfear.org/blog", "foo/bar" ) );

        System.out.println( expand( "file://projects/newsmonster/", "blog" ) );

        System.out.println( expand( "file:/projects/ksa/src/java/ksa/test/TestFeedTask_WithRelativePath.rss"
                                      , "/blog" ) );
    }

    /** Resolves a path-relative link (no scheme, no leading "/" or "#") against the resource. */
    private static String resolveRelative( String resource, String link ) {

        // Nothing at or before this index may be removed by "..".
        int root = rootLength( resource );

        String base = getBase( resource );
        String remaining = link;

        while ( true ) {

            if ( remaining.equals( ".." ) || remaining.startsWith( "../" ) ) {

                remaining = ( remaining.length() > 2 ) ? remaining.substring( 3 ) : "";

                // Drop the last directory of the base, but never climb above the root.
                int lastSlash = base.lastIndexOf( '/' );
                if ( lastSlash >= root ) {
                    base = base.substring( 0, lastSlash );
                }

            } else if ( remaining.equals( "." ) || remaining.startsWith( "./" ) ) {

                remaining = ( remaining.length() > 1 ) ? remaining.substring( 2 ) : "";

            } else {
                break;
            }

        }

        return base + "/" + remaining;

    }

    /**
     * Returns the length of the leading part of the resource that is not path:
     * the scheme plus authority for "scheme://host/..." URLs, and "file:" plus
     * all but the last of its leading slashes for file: URLs.
     */
    private static int rootLength( String resource ) {

        if ( resource.startsWith( FILE_PREFIX ) ) {

            int slashes = 0;
            while ( FILE_PREFIX.length() + slashes < resource.length()
                    && resource.charAt( FILE_PREFIX.length() + slashes ) == '/' ) {
                ++slashes;
            }

            // The final leading slash is the start of the path itself.
            return FILE_PREFIX.length() + Math.max( slashes - 1, 0 );

        }

        int position = hasScheme( resource )
            ? resource.indexOf( SCHEME_SEPARATOR ) + SCHEME_SEPARATOR.length()
            : 0;

        // The authority runs up to the first path, query or fragment delimiter.
        while ( position < resource.length()
                && "/?#".indexOf( resource.charAt( position ) ) == -1 ) {
            ++position;
        }

        return position;

    }

    /** Returns the scheme, including the colon, to put in front of a "//host/path" link. */
    private static String networkScheme( String resource ) {
        if ( resource != null && resource.startsWith( "https:" ) ) {
            return "https:";
        }
        return "http:";
    }

    /** Returns the text before the first occurrence of the delimiter, or all of it when absent. */
    private static String cutAt( String text, char delimiter ) {
        int index = text.indexOf( delimiter );
        return ( index == -1 ) ? text : text.substring( 0, index );
    }

}
304