001 /*
002 * Copyright 1999,2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package org.apache.commons.feedparser.locate.blogservice;
018
019 import java.net.MalformedURLException;
020 import java.util.regex.*;
021
022 import org.apache.commons.feedparser.FeedParserException;
023 import org.apache.commons.feedparser.locate.*;
024
025 /**
026 * Models the Blosxom blog service, encapsulating whether a given weblog
027 * is this type of service and where it usually keeps its feeds.
028 *
029 * @author Brad Neuberg, bkn3@columbia.edu
030 */
031 public class Blosxom extends BlogService {
032
033 /** A pattern used to discover Blosxom blogs. */
034 private static Pattern blosxomPattern =
035 Pattern.compile("alt=[\"' ]powered by blosxom[\"' ]",
036 Pattern.CASE_INSENSITIVE);
037
038 /** Returns whether we can trust the results of this blog service's
039 * autodiscovery links. For example, TextAmerica returns invalid
040 * autodiscovery results.
041 */
042 public boolean hasValidAutoDiscovery() {
043 return true;
044 }
045
046 /** Returns whether we should follow HTTP redirects for this blog service.
047 * Some services don't implement HTTP redirects correctly, while others,
048 * like Xanga, require it.
049 */
050 public boolean followRedirects() {
051 return false;
052 }
053
054 /** Determines if the weblog at the given resource and with the given
055 * content is this blog service.
056 * @param resource A full URI to this resource, such as
057 * "http://www.codinginparadise.org".
058 * @param content The full HTML content at the resource's URL.
059 * @throws FeedParserException Thrown if an error occurs while
060 * determining the type of this weblog.
061 */
062 public boolean isThisService(String resource, String content)
063 throws FeedParserException {
064 boolean results = false;
065
066 // This is the only kind of blog that we need to check for a
067 // 'Powered by Blosxom'. We do this with the alt= value on the
068 // Powered By image.
069 // FIXME: This might be fragile, but it is used across all of the
070 // Blosxom blogs I have looked at so far. Brad Neuberg, bkn3@columbia.edu
071
072 Matcher blosxomMatcher = blosxomPattern.matcher(content);
073 results = blosxomMatcher.find();
074
075 return results;
076 }
077
078 /**
079 * Returns an array of FeedReferences that contains information on the
080 * usual locations this blog service contains its feed. The feeds should
081 * be ordered by quality, so that higher quality feeds come before lower
082 * quality ones (i.e. you would want to have an Atom FeedReference
083 * object come before an RSS 0.91 FeedReference object in this list).
084 * @param resource A URL to the given weblog that might be used to build
085 * up where feeds are usually located.
086 * @param content The full content of the resource URL, which might
087 * be useful to determine where feeds are usually located. This can be
088 * null.
089 * @throws FeedParserException Thrown if an error occurs while trying
090 * to determine the usual locations of feeds for this service.
091 */
092 public FeedReference[] getFeedLocations(String resource,
093 String content)
094 throws FeedParserException {
095 // there is sometimes an index.rss20 file, but Blosxom has a bug where
096 // it incorrectly responds to HTTP HEAD requests for that file,
097 // saying that it exists when it doesn't. Most sites don't seem
098 // to have this file so we don't include it here.
099 // Brad Neuberg, bkn3@columbia.edu
100 FeedReference[] blosxomLocations =
101 { new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE) };
102
103 return blosxomLocations;
104 }
105
106 /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php",
107 * and gets the path necessary to build up a feed, such as
108 * "http://www.codinginparadise.org/". Basicly it appends a slash
109 * to the end if there is not one, and removes any file names that
110 * might be at the end, such as "myweblog.php".
111 *
112 * There is a special exception for some Blosxom blogs,
113 * which have things inside of a cgi-script and 'hang' their RSS files
114 * off of this cgi-bin. For example,
115 * http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file
116 * at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so
117 * we must return the blosxom.cgi at the end as well for this method.
118 *
119 * @throws MalformedURLException Thrown if the given resource's URL is
120 * incorrectly formatted.
121 */
122 public String getBaseFeedPath( String resource ) {
123
124 // strip off any query string or anchors
125 int end = resource.lastIndexOf( "#" );
126
127 if ( end != -1 )
128 resource = resource.substring( 0, end );
129
130 end = resource.lastIndexOf( "?" );
131
132 if ( end != -1 )
133 resource = resource.substring( 0, end );
134
135 if ( ! resource.endsWith( "/" ) ) {
136 resource = resource + "/";
137 }
138
139 return resource;
140 }
141 }