001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate.blogservice;
018    
019    import java.net.MalformedURLException;
020    import java.util.regex.*;
021    
022    import org.apache.commons.feedparser.FeedParserException;
023    import org.apache.commons.feedparser.locate.*;
024    
025    /**
026     * Models the Blosxom blog service, encapsulating whether a given weblog
027     * is this type of service and where it usually keeps its feeds.
028     * 
029     * @author Brad Neuberg, bkn3@columbia.edu
030     */
031    public class Blosxom extends BlogService {
032        
033        /** A pattern used to discover Blosxom blogs. */
034        private static Pattern blosxomPattern =
035                    Pattern.compile("alt=[\"' ]powered by blosxom[\"' ]",
036                                    Pattern.CASE_INSENSITIVE);
037            
038        /** Returns whether we can trust the results of this blog service's 
039         *  autodiscovery links.  For example, TextAmerica returns invalid 
040         *  autodiscovery results.
041         */
042        public boolean hasValidAutoDiscovery() {
043            return true;
044        }
045        
046        /** Returns whether we should follow HTTP redirects for this blog service.
047         *  Some services don't implement HTTP redirects correctly, while others,
048         *  like Xanga, require it.
049         */
050        public boolean followRedirects() {
051            return false;
052        }
053        
054        /** Determines if the weblog at the given resource and with the given
055         *  content is this blog service.
056         * @param resource A full URI to this resource, such as 
057         * "http://www.codinginparadise.org".
058         * @param content The full HTML content at the resource's URL.
059         * @throws FeedParserException Thrown if an error occurs while 
060         * determining the type of this weblog.
061         */
062        public boolean isThisService(String resource, String content)
063                                                    throws FeedParserException {
064            boolean results = false;
065            
066            // This is the only kind of blog that we need to check for a 
067            // 'Powered by Blosxom'.  We do this with the alt= value on the
068            // Powered By image.
069            // FIXME: This might be fragile, but it is used across all of the
070            // Blosxom blogs I have looked at so far. Brad Neuberg, bkn3@columbia.edu
071            
072            Matcher blosxomMatcher = blosxomPattern.matcher(content);
073            results = blosxomMatcher.find();
074            
075            return results;
076        }
077    
078        /**
079         * Returns an array of FeedReferences that contains information on the
080         * usual locations this blog service contains its feed.  The feeds should
081         * be ordered by quality, so that higher quality feeds come before lower
082         * quality ones (i.e. you would want to have an Atom FeedReference
083         * object come before an RSS 0.91 FeedReference object in this list).
084         * @param resource A URL to the given weblog that might be used to build
085         * up where feeds are usually located.
086         * @param content The full content of the resource URL, which might
087         * be useful to determine where feeds are usually located.  This can be
088         * null.
089         * @throws FeedParserException Thrown if an error occurs while trying
090         * to determine the usual locations of feeds for this service.
091         */
092        public FeedReference[] getFeedLocations(String resource,
093                                                String content)
094                                                    throws FeedParserException {
095            // there is sometimes an index.rss20 file, but Blosxom has a bug where
096            // it incorrectly responds to HTTP HEAD requests for that file,
097            // saying that it exists when it doesn't.  Most sites don't seem
098            // to have this file so we don't include it here. 
099            // Brad Neuberg, bkn3@columbia.edu
100            FeedReference[] blosxomLocations = 
101                { new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE) };
102            
103            return blosxomLocations;
104        }
105        
106        /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php",
107         *  and gets the path necessary to build up a feed, such as 
108         *  "http://www.codinginparadise.org/".  Basicly it appends a slash 
109         *  to the end if there is not one, and removes any file names that 
110         *  might be at the end, such as "myweblog.php".
111         *
112         *  There is a special exception for some Blosxom blogs,
113         *  which have things inside of a cgi-script and 'hang' their RSS files
114         *  off of this cgi-bin.  For example, 
115         *  http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file
116         *  at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so
117         *  we must return the blosxom.cgi at the end as well for this method.
118         * 
119         *  @throws MalformedURLException Thrown if the given resource's URL is 
120         *  incorrectly formatted.
121         */
122        public String getBaseFeedPath( String resource ) {
123            
124            // strip off any query string or anchors
125            int end = resource.lastIndexOf( "#" );
126            
127            if ( end != -1 )
128                resource = resource.substring( 0, end );
129    
130            end = resource.lastIndexOf( "?" );
131    
132            if ( end != -1 )
133                resource = resource.substring( 0, end );
134            
135            if ( ! resource.endsWith( "/" ) ) {
136                resource = resource + "/";
137            }
138            
139            return resource;
140        }
141    }