001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.locate; 018 019 import java.util.regex.Pattern; 020 021 import org.apache.log4j.Logger; 022 023 /** 024 * 025 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a> 026 * @version $Id: ResourceExpander.java 373622 2006-01-30 22:53:00Z mvdb $ 027 */ 028 public class ResourceExpander { 029 030 private static Logger log = Logger.getLogger( ResourceExpander.class ); 031 032 /** A regexp to determine if a URL has a scheme, such as "http://foo.com". 033 */ 034 protected static Pattern schemePattern = Pattern.compile("^\\w*://.*"); 035 036 /** 037 * Expand a link relavant to the current site. This takes care of links 038 * such as 039 * 040 * /foo.html -> http://site.com/base/foo.html 041 * 042 * foo.html -> http://site.com/base/foo.html 043 * 044 * Links should *always* be expanded before they are used. 045 * 046 * This is because if we use the URL http://site.com/base then we don't know 047 * if it's a directory or a file. http://site.com/base/ would be a directory. 048 * 049 * Note that all resource URLs will have correct trailing slashes. If the URL 050 * does not end with / then it is a file URL and not a directory. 051 * 052 * @param resource The absolute base URL that will be used to expand the 053 * link, such as "http://www.codinginparadise.org". 054 * @param link The link to possibly expand, such as "/index.rdf" or 055 * "http://www.somehost.com/somepage.html". 056 * 057 * 058 */ 059 public static String expand( String resource, String link ) { 060 061 if ( link == null ) 062 return null; 063 064 //make sure we can use this. 065 if ( !isValidScheme( link ) ) 066 return link; 067 068 //nothing if ALREADY relativized 069 if ( isExpanded( link ) ) 070 return link; 071 072 // From: http://www.w3.org/Addressing/rfc1808.txt 073 // 074 // If the parse string begins with a double-slash "//", then the 075 // substring of characters after the double-slash and up to, but not 076 // including, the next slash "/" character is the network 077 // location/login (<net_loc>) of the URL. If no trailing slash "/" 078 // is present, the entire remaining parse string is assigned to 079 // <net_loc>. The double- slash and <net_loc> are removed from the 080 // parse string before 081 //FIXME: What happens if resource is a "file://" scheme? 082 if ( link.startsWith( "//" ) ) { 083 084 return "http:" + link; 085 086 } 087 088 //keep going 089 if ( link.startsWith( "/" ) ) { 090 091 link = getSite( resource ) + link; 092 093 return link; 094 095 } else if ( link.startsWith( "#" ) ) { 096 097 link = resource + link; 098 099 return link; 100 101 } else if ( link.startsWith( ".." ) ) { 102 103 //ok. We need to get rid of these .. directories. 104 105 String base = getBase( resource ) + "/"; 106 107 while ( link.startsWith( ".." ) ) { 108 109 //get rid of the first previous dir in the link 110 int begin = 2; 111 if ( link.length() > 2 && link.charAt( 2 ) == '/' ) 112 begin = 3; 113 114 link = link.substring( begin, link.length() ); 115 116 //get rid of the last directory in the resource 117 118 int end = base.length(); 119 120 if ( base.endsWith( "/" ) ) 121 --end; 122 123 base = base.substring( 0, base.lastIndexOf( "/", end - 1 ) ); 124 125 } 126 127 link = base + "/" + link; 128 129 return link; 130 131 } 132 133 // If the resource ends with a common file ending, then chop 134 // off the file ending before adding the link 135 // Is this rfc1808 compliant? Brad Neuberg, bkn3@columbia.edu 136 resource = getBase(resource); 137 if ( link.startsWith( "http://" ) == false ) { 138 139 link = resource + "/" + link; 140 log.debug("link="+link); 141 142 } 143 144 return link; 145 146 } 147 148 /** 149 * Return true if the given link is ALREADY relativized.. 150 * 151 * 152 */ 153 public static boolean isExpanded( String resource ) { 154 return (resource.startsWith( "http://" ) || 155 resource.startsWith( "file://" )); 156 } 157 158 /** 159 * Return true if this is an valid scheme and should be expanded. 160 * 161 * 162 */ 163 public static boolean isValidScheme( String resource ) { 164 if (hasScheme(resource) == false) 165 return true; 166 167 //only on file: and http: 168 169 if ( resource.startsWith( "http:" ) ) 170 return true; 171 172 if ( resource.startsWith( "file:" ) ) 173 return true; 174 175 return false; 176 177 } 178 179 /** 180 * Determines if the given resource has a scheme. (i.e. does it start with 181 * "http://foo.com" or does it just have "foo.com"). 182 */ 183 public static boolean hasScheme( String resource ) { 184 return schemePattern.matcher( resource ).matches(); 185 186 } 187 188 /** 189 * Get the site for this resource. For example: 190 * 191 * http://www.foo.com/directory/index.html 192 * 193 * we will return 194 * 195 * http://www.foo.com 196 * 197 * for file: URLs we return file:// 198 * 199 * 200 */ 201 public static String getSite( String resource ) { 202 203 if ( resource.startsWith( "file:" ) ) { 204 return "file://"; 205 } 206 207 //start at 8 which is the width of http:// 208 int end = resource.indexOf( "/", 8 ); 209 210 if ( end == -1 ) { 211 212 end = resource.length(); 213 214 } 215 216 return resource.substring( 0, end ); 217 218 } 219 220 /** 221 * Given a URL get the domain name. 222 * 223 * 224 */ 225 public static String getDomain( String resource ) { 226 227 String site = getSite( resource ); 228 229 int firstIndex = -1; 230 int indexCount = 0; 231 232 int index = site.length(); 233 234 while ( (index = site.lastIndexOf( ".", index-1 )) != -1 ) { 235 236 ++indexCount; 237 238 if ( indexCount == 2 ) 239 break; 240 241 } 242 243 int begin = 7; // http:// length 244 if ( indexCount >= 2 ) 245 begin = index + 1; 246 247 return site.substring( begin, site.length() ); 248 249 } 250 251 /** 252 * Get the base of this URL. For example if we are given: 253 * 254 * http://www.foo.com/directory/index.html 255 * 256 * we will return 257 * 258 * http://www.foo.com/directory 259 * 260 * 261 * 262 */ 263 public static String getBase( String resource ) { 264 265 //FIXME: Brad says this method is totally broken. 266 if ( resource == null ) 267 return null; 268 269 int begin = "http://".length() + 1; 270 271 int end = resource.lastIndexOf( "/" ); 272 273 if ( end == -1 || end <= begin ) { 274 //probaby a URL like http://www.cnn.com 275 276 end = resource.length(); 277 278 } 279 return resource.substring( 0, end ); 280 281 } 282 283 public static void main( String[] args ) throws Exception { 284 285 System.out.println( expand( "http://peerfear.org/foo/bar/", "../../blog" ) ); 286 287 System.out.println( expand( "http://peerfear.org/foo/bar/", "../../index.html" ) ); 288 289 System.out.println( expand( "http://peerfear.org/blog/", ".." ) ); 290 291 System.out.println( expand( "http://peerfear.org", "/blog" ) ); 292 System.out.println( expand( "http://peerfear.org", "http://peerfear.org" ) ); 293 294 System.out.println( expand( "http://peerfear.org", "blog" ) ); 295 System.out.println( expand( "http://peerfear.org/blog", "foo/bar" ) ); 296 297 System.out.println( expand( "file://projects/newsmonster/", "blog" ) ); 298 299 System.out.println( expand( "file:/projects/ksa/src/java/ksa/test/TestFeedTask_WithRelativePath.rss" 300 , "/blog" ) ); 301 } 302 303 } 304