View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser.locate;
18  
19  import java.util.regex.Pattern;
20  
21  import org.apache.log4j.Logger;
22  
/**
 * Static helpers that turn the links found in a document (relative paths,
 * site-absolute paths, fragments, protocol-relative links) into absolute URLs
 * by resolving them against the URL of the resource they were found in.
 *
 * <p>All methods are stateless and work purely on strings; no network or file
 * access is performed.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 * @version $Id: ResourceExpander.java 373622 2006-01-30 22:53:00Z mvdb $
 */
public class ResourceExpander {

    /**
     * A regexp to determine if a URL has a hierarchical scheme, such as
     * "http://foo.com".  Only the {@code scheme://} form is matched; opaque
     * schemes such as {@code mailto:} are not.
     */
    protected static Pattern schemePattern = Pattern.compile("^\\w*://.*");

    /**
     * Expand a link relative to the current site.  This takes care of links
     * such as
     *
     * <pre>
     * /foo.html     -&gt; http://site.com/foo.html
     * foo.html      -&gt; http://site.com/base/foo.html
     * ../foo.html   -&gt; http://site.com/foo.html
     * #top          -&gt; (resource)#top
     * //cdn.com/x   -&gt; http://cdn.com/x  (https if the resource is https)
     * </pre>
     *
     * Links should <em>always</em> be expanded before they are used.
     *
     * <p>Resource URLs are expected to carry correct trailing slashes: if the
     * URL does not end with "/" its last segment is treated as a file and is
     * dropped before the link is appended.
     *
     * <p>Links that already carry a scheme (absolute http/https/file URLs, but
     * also {@code mailto:}, {@code javascript:}, {@code ftp://} and the like)
     * are returned unchanged.  Leading {@code ./} segments are dropped and
     * leading {@code ../} segments remove one directory each; surplus
     * {@code ..} segments stop at the site root instead of eating into the
     * host name.
     *
     * @param resource The absolute base URL that will be used to expand the
     * link, such as "http://www.codinginparadise.org".  When null there is
     * nothing to resolve against and the link is returned unchanged.
     * @param link The link to possibly expand, such as "/index.rdf" or
     * "http://www.somehost.com/somepage.html".
     * @return the expanded link, or null when {@code link} is null.
     */
    public static String expand(String resource, String link) {

        if (link == null) {
            return null;
        }

        // Without a base there is nothing to resolve against.  (Previously a
        // plain relative link produced the literal string "null/<link>".)
        if (resource == null) {
            return link;
        }

        // Unsupported schemes are handed back untouched.
        if (!isValidScheme(link)) {
            return link;
        }

        // Nothing to do if the link is already absolute.
        if (isExpanded(link)) {
            return link;
        }

        // Anything else that starts with "scheme:" (e.g. "file:/x", "http:x")
        // is not a relative path and must not be glued onto the base.
        if (schemeLength(link) > 0) {
            return link;
        }

        // Network-path reference (RFC 1808 / RFC 3986): "//host/path" keeps
        // its own host and inherits the scheme.  Only http and https are
        // inherited; every other base keeps the historical "http:" prefix.
        if (link.startsWith("//")) {
            String scheme = resource.startsWith("https:") ? "https:" : "http:";
            return scheme + link;
        }

        // Site-absolute path.
        if (link.startsWith("/")) {
            return getSite(resource) + link;
        }

        // Fragment on the current resource.
        if (link.startsWith("#")) {
            return resource + link;
        }

        // Plain relative path: resolve against the directory of the resource,
        // consuming leading "./" and "../" segments on the way.
        String base = getBase(resource);
        int floor = rootLength(resource);

        while (true) {
            if (link.equals("..")) {
                link = "";
                base = parentOf(base, floor);
            } else if (link.startsWith("../")) {
                link = link.substring(3);
                base = parentOf(base, floor);
            } else if (link.equals(".")) {
                link = "";
            } else if (link.startsWith("./")) {
                link = link.substring(2);
            } else {
                break;
            }
        }

        return base + "/" + link;

    }

    /**
     * Return true if the given link is ALREADY expanded, i.e. it is an
     * absolute {@code http://}, {@code https://} or {@code file://} URL.
     *
     * @param resource the link to inspect; must not be null.
     */
    public static boolean isExpanded(String resource) {
        return resource.startsWith("http://")
            || resource.startsWith("https://")
            || resource.startsWith("file://");
    }

    /**
     * Return true if this link is something we are able to expand: either it
     * has no scheme at all (a relative link) or its scheme is one of
     * {@code http:}, {@code https:} or {@code file:}.  Links with any other
     * scheme ({@code ftp://}, {@code mailto:}, {@code javascript:} ...)
     * yield false.
     *
     * @param resource the link to inspect; must not be null.
     */
    public static boolean isValidScheme(String resource) {

        if (resource.startsWith("http:")
                || resource.startsWith("https:")
                || resource.startsWith("file:")) {
            return true;
        }

        // Valid only when there is no scheme of any kind, neither the
        // "scheme://" form nor an opaque "scheme:" prefix.
        return !hasScheme(resource) && schemeLength(resource) == 0;

    }

    /**
     * Determines if the given resource has a hierarchical scheme. (i.e. does
     * it start with "http://foo.com" or does it just have "foo.com").
     *
     * @param resource the URL to inspect; must not be null.
     */
    public static boolean hasScheme(String resource) {
        return schemePattern.matcher(resource).matches();
    }

    /**
     * Get the site for this resource.  For example:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com
     *
     * for file: URLs we return file://
     *
     * @param resource an absolute URL; must not be null.
     */
    public static String getSite(String resource) {

        if (resource.startsWith("file:")) {
            return "file://";
        }

        // Everything up to (not including) the first slash after the host.
        return resource.substring(0, rootLength(resource));

    }

    /**
     * Given a URL get the domain name, i.e. the last two dot-separated labels
     * of the host ("www.foo.com" gives "foo.com").  A port, if present, is
     * kept as part of the result.  For file: URLs the result is empty.
     *
     * @param resource an absolute URL; must not be null.
     */
    public static String getDomain(String resource) {

        String site = getSite(resource);

        // Where the host begins; derived from the URL instead of assuming the
        // 7 characters of "http://" so that https works as well.
        int hostStart = authorityStart(site);

        int lastDot = site.lastIndexOf('.');
        if (lastDot < hostStart) {
            // no dot in the host at all ("localhost")
            return site.substring(hostStart);
        }

        int secondDot = site.lastIndexOf('.', lastDot - 1);
        if (secondDot < hostStart) {
            // a single dot: the host already is the domain
            return site.substring(hostStart);
        }

        return site.substring(secondDot + 1);

    }

    /**
     * Get the base of this URL.  For example if we are given:
     *
     * http://www.foo.com/directory/index.html
     *
     * we will return
     *
     * http://www.foo.com/directory
     *
     * A URL without any path (http://www.cnn.com) is returned unchanged.  The
     * result never ends with the slash that separated the last segment.
     *
     * @param resource the URL to strip, may be null.
     * @return the base of the URL, or null when {@code resource} is null.
     */
    public static String getBase(String resource) {

        if (resource == null) {
            return null;
        }

        int end = resource.lastIndexOf('/');

        // A last slash that lies before the end of the host belongs to
        // "scheme://", so there is no path to strip.
        if (end < rootLength(resource)) {
            return resource;
        }

        return resource.substring(0, end);

    }

    /**
     * Length of a leading "scheme:" prefix (colon included), or 0 when the
     * string does not start with one.  A scheme is a letter followed by
     * letters, digits, '+', '-' or '.'.
     */
    private static int schemeLength(String url) {

        int colon = url.indexOf(':');
        if (colon <= 0 || !isAsciiLetter(url.charAt(0))) {
            return 0;
        }

        for (int i = 1; i < colon; i++) {
            char c = url.charAt(i);
            boolean allowed = isAsciiLetter(c)
                || (c >= '0' && c <= '9')
                || c == '+' || c == '-' || c == '.';
            if (!allowed) {
                return 0;
            }
        }

        return colon + 1;

    }

    /** True for the characters a-z and A-Z. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    /** Index at which the host starts: just past "scheme://", or past "scheme:" alone. */
    private static int authorityStart(String url) {

        int start = schemeLength(url);
        if (start > 0 && url.startsWith("//", start)) {
            start += 2;
        }
        return start;

    }

    /** Length of the "scheme://host" prefix, i.e. the index of the first path slash. */
    private static int rootLength(String url) {

        int slash = url.indexOf('/', authorityStart(url));
        return slash == -1 ? url.length() : slash;

    }

    /** Drop the last path segment of base, never cutting below the first floor characters. */
    private static String parentOf(String base, int floor) {

        int cut = base.lastIndexOf('/');
        return cut >= floor ? base.substring(0, cut) : base;

    }

    /**
     * Prints a handful of sample expansions to standard output.
     */
    public static void main(String[] args) throws Exception {

        System.out.println(expand("http://peerfear.org/foo/bar/", "../../blog"));

        System.out.println(expand("http://peerfear.org/foo/bar/", "../../index.html"));

        System.out.println(expand("http://peerfear.org/blog/", ".."));

        System.out.println(expand("http://peerfear.org", "/blog"));
        System.out.println(expand("http://peerfear.org", "http://peerfear.org"));

        System.out.println(expand("http://peerfear.org", "blog"));
        System.out.println(expand("http://peerfear.org/blog", "foo/bar"));

        System.out.println(expand("file://projects/newsmonster/", "blog"));

        System.out.println(expand("file:/projects/ksa/src/java/ksa/test/TestFeedTask_WithRelativePath.rss",
                                  "/blog"));
    }

}
304