001 /*
002 * Copyright 1999,2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package org.apache.commons.feedparser.locate;
018
019 import java.util.regex.Pattern;
020
021 import org.apache.log4j.Logger;
022
023 /**
024 *
025 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
026 * @version $Id: ResourceExpander.java 373622 2006-01-30 22:53:00Z mvdb $
027 */
028 public class ResourceExpander {
029
030 private static Logger log = Logger.getLogger( ResourceExpander.class );
031
032 /** A regexp to determine if a URL has a scheme, such as "http://foo.com".
033 */
034 protected static Pattern schemePattern = Pattern.compile("^\\w*://.*");
035
036 /**
037 * Expand a link relavant to the current site. This takes care of links
038 * such as
039 *
040 * /foo.html -> http://site.com/base/foo.html
041 *
042 * foo.html -> http://site.com/base/foo.html
043 *
044 * Links should *always* be expanded before they are used.
045 *
046 * This is because if we use the URL http://site.com/base then we don't know
047 * if it's a directory or a file. http://site.com/base/ would be a directory.
048 *
049 * Note that all resource URLs will have correct trailing slashes. If the URL
050 * does not end with / then it is a file URL and not a directory.
051 *
052 * @param resource The absolute base URL that will be used to expand the
053 * link, such as "http://www.codinginparadise.org".
054 * @param link The link to possibly expand, such as "/index.rdf" or
055 * "http://www.somehost.com/somepage.html".
056 *
057 *
058 */
059 public static String expand( String resource, String link ) {
060
061 if ( link == null )
062 return null;
063
064 //make sure we can use this.
065 if ( !isValidScheme( link ) )
066 return link;
067
068 //nothing if ALREADY relativized
069 if ( isExpanded( link ) )
070 return link;
071
072 // From: http://www.w3.org/Addressing/rfc1808.txt
073 //
074 // If the parse string begins with a double-slash "//", then the
075 // substring of characters after the double-slash and up to, but not
076 // including, the next slash "/" character is the network
077 // location/login (<net_loc>) of the URL. If no trailing slash "/"
078 // is present, the entire remaining parse string is assigned to
079 // <net_loc>. The double- slash and <net_loc> are removed from the
080 // parse string before
081 //FIXME: What happens if resource is a "file://" scheme?
082 if ( link.startsWith( "//" ) ) {
083
084 return "http:" + link;
085
086 }
087
088 //keep going
089 if ( link.startsWith( "/" ) ) {
090
091 link = getSite( resource ) + link;
092
093 return link;
094
095 } else if ( link.startsWith( "#" ) ) {
096
097 link = resource + link;
098
099 return link;
100
101 } else if ( link.startsWith( ".." ) ) {
102
103 //ok. We need to get rid of these .. directories.
104
105 String base = getBase( resource ) + "/";
106
107 while ( link.startsWith( ".." ) ) {
108
109 //get rid of the first previous dir in the link
110 int begin = 2;
111 if ( link.length() > 2 && link.charAt( 2 ) == '/' )
112 begin = 3;
113
114 link = link.substring( begin, link.length() );
115
116 //get rid of the last directory in the resource
117
118 int end = base.length();
119
120 if ( base.endsWith( "/" ) )
121 --end;
122
123 base = base.substring( 0, base.lastIndexOf( "/", end - 1 ) );
124
125 }
126
127 link = base + "/" + link;
128
129 return link;
130
131 }
132
133 // If the resource ends with a common file ending, then chop
134 // off the file ending before adding the link
135 // Is this rfc1808 compliant? Brad Neuberg, bkn3@columbia.edu
136 resource = getBase(resource);
137 if ( link.startsWith( "http://" ) == false ) {
138
139 link = resource + "/" + link;
140 log.debug("link="+link);
141
142 }
143
144 return link;
145
146 }
147
148 /**
149 * Return true if the given link is ALREADY relativized..
150 *
151 *
152 */
153 public static boolean isExpanded( String resource ) {
154 return (resource.startsWith( "http://" ) ||
155 resource.startsWith( "file://" ));
156 }
157
158 /**
159 * Return true if this is an valid scheme and should be expanded.
160 *
161 *
162 */
163 public static boolean isValidScheme( String resource ) {
164 if (hasScheme(resource) == false)
165 return true;
166
167 //only on file: and http:
168
169 if ( resource.startsWith( "http:" ) )
170 return true;
171
172 if ( resource.startsWith( "file:" ) )
173 return true;
174
175 return false;
176
177 }
178
179 /**
180 * Determines if the given resource has a scheme. (i.e. does it start with
181 * "http://foo.com" or does it just have "foo.com").
182 */
183 public static boolean hasScheme( String resource ) {
184 return schemePattern.matcher( resource ).matches();
185
186 }
187
188 /**
189 * Get the site for this resource. For example:
190 *
191 * http://www.foo.com/directory/index.html
192 *
193 * we will return
194 *
195 * http://www.foo.com
196 *
197 * for file: URLs we return file://
198 *
199 *
200 */
201 public static String getSite( String resource ) {
202
203 if ( resource.startsWith( "file:" ) ) {
204 return "file://";
205 }
206
207 //start at 8 which is the width of http://
208 int end = resource.indexOf( "/", 8 );
209
210 if ( end == -1 ) {
211
212 end = resource.length();
213
214 }
215
216 return resource.substring( 0, end );
217
218 }
219
220 /**
221 * Given a URL get the domain name.
222 *
223 *
224 */
225 public static String getDomain( String resource ) {
226
227 String site = getSite( resource );
228
229 int firstIndex = -1;
230 int indexCount = 0;
231
232 int index = site.length();
233
234 while ( (index = site.lastIndexOf( ".", index-1 )) != -1 ) {
235
236 ++indexCount;
237
238 if ( indexCount == 2 )
239 break;
240
241 }
242
243 int begin = 7; // http:// length
244 if ( indexCount >= 2 )
245 begin = index + 1;
246
247 return site.substring( begin, site.length() );
248
249 }
250
251 /**
252 * Get the base of this URL. For example if we are given:
253 *
254 * http://www.foo.com/directory/index.html
255 *
256 * we will return
257 *
258 * http://www.foo.com/directory
259 *
260 *
261 *
262 */
263 public static String getBase( String resource ) {
264
265 //FIXME: Brad says this method is totally broken.
266 if ( resource == null )
267 return null;
268
269 int begin = "http://".length() + 1;
270
271 int end = resource.lastIndexOf( "/" );
272
273 if ( end == -1 || end <= begin ) {
274 //probaby a URL like http://www.cnn.com
275
276 end = resource.length();
277
278 }
279 return resource.substring( 0, end );
280
281 }
282
283 public static void main( String[] args ) throws Exception {
284
285 System.out.println( expand( "http://peerfear.org/foo/bar/", "../../blog" ) );
286
287 System.out.println( expand( "http://peerfear.org/foo/bar/", "../../index.html" ) );
288
289 System.out.println( expand( "http://peerfear.org/blog/", ".." ) );
290
291 System.out.println( expand( "http://peerfear.org", "/blog" ) );
292 System.out.println( expand( "http://peerfear.org", "http://peerfear.org" ) );
293
294 System.out.println( expand( "http://peerfear.org", "blog" ) );
295 System.out.println( expand( "http://peerfear.org/blog", "foo/bar" ) );
296
297 System.out.println( expand( "file://projects/newsmonster/", "blog" ) );
298
299 System.out.println( expand( "file:/projects/ksa/src/java/ksa/test/TestFeedTask_WithRelativePath.rss"
300 , "/blog" ) );
301 }
302
303 }
304