001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate.blogservice;
018    
019    import java.net.*;
020    import java.util.*;
021    import java.util.regex.*;
022    
023    import org.apache.commons.feedparser.*;
024    import org.apache.commons.feedparser.locate.*;
025    
026    /**
027     * Models the different kinds of blog services that are available.  This
028     * is needed for two reasons.  First, sometimes it is useful to simply
029     * know what provider a given weblog is being hosted by, such as Blogger
030     * or PMachine, in order to use special, non-standard capabilities.  Second,
031     * many services have "quirks" that don't follow the standards, such as
032     * supporting autodiscovery or supporting it in an incorrect way, and we
033     * therefore need to know what service we are dealing with so that we
034     * can find its feed.
035     * 
036     * The BlogService object encapsulates how to determine if a given
037     * weblog is of that type and how to find its feeds.  Concrete subclasses,
038     * such as org.apache.commons.feedparser.locate.blogservice.Blogger,
039     * fill in this class and provide the actual way to determine these
040     * things for each blog service type.
041     * 
042     * @author Brad Neuberg, bkn3@columbia.edu
043     */
044    public abstract class BlogService {
045        protected static List blogServices = new ArrayList();
046        
047        /** Subclasses should have a static block similar to the following:
048         *  <code>
049         *      {
050         *          BlogService.addBlogService(new MyBlogService());
051         *      }
052         *  </code>
053         */
054        
055        /** Locates all the generator meta tags
056         *  (i.e. <meta content="generator" content="someGenerator"/>)
057         */
058        protected static Pattern metaTagsPattern = 
059                    Pattern.compile("<[\\s]*meta[\\w\\s=\"']*name=['\" ]generator[\"' ][\\w\\s=\"']*[^>]*",
060                                    Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
061        
062        /**
063         * A regex to find any trailing filename and strip it
064         */
065        protected static Pattern patternToStrip = Pattern.compile("[^/](/\\w*\\.\\w*$)"); 
066            
067        /** Returns whether we can trust the results of this blog service's 
068         *  autodiscovery links.  For example, TextAmerica returns invalid 
069         *  autodiscovery results.
070         */
071        public abstract boolean hasValidAutoDiscovery();
072        
073        /** Returns whether we should follow HTTP redirects for this blog service.
074         *  Some services don't implement HTTP redirects correctly, while others,
075         *  like Xanga, require it.
076         */
077        public abstract boolean followRedirects();
078        
079        /** Determines if the weblog at the given resource and with the given
080         *  content is this blog service.
081         * @param resource A full URI to this resource, such as 
082         * "http://www.codinginparadise.org".
083         * @param content The full HTML content at the resource's URL.
084         * @throws FeedParserException Thrown if an error occurs while 
085         * determining the type of this weblog.
086         */
087        public abstract boolean isThisService(String resource, String content)
088                                                    throws FeedParserException;
089    
090        /**
091         * Returns an array of FeedReferences that contains information on the
092         * usual locations this blog service contains its feed.  The feeds should
093         * be ordered by quality, so that higher quality feeds come before lower
094         * quality ones (i.e. you would want to have an Atom FeedReference
095         * object come before an RSS 0.91 FeedReference object in this list).
096         * @param resource A URL to the given weblog that might be used to build
097         * up where feeds are usually located.
098         * @param content The full content of the resource URL, which might
099         * be useful to determine where feeds are usually located.  This can be
100         * null.
101         * @throws FeedParserException Thrown if an error occurs while trying
102         * to determine the usual locations of feeds for this service.
103         */
104        public abstract FeedReference[] getFeedLocations(String resource,
105                                                         String content)
106                                                    throws FeedParserException;
107        
108        /** Determines if the weblog at the given resource is this blog service.
109         *  @param resource A full URI to this resource, such as 
110         *  "http://www.codinginparadise.org".
111         *  @throws FeedParserException Thrown if an error occurs while 
112         *  determining the type of this weblog.
113         */
114        public boolean isThisService(String resource) throws FeedParserException {
115            return isThisService(resource, null);
116        }
117        
118        /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php",
119         *  and gets the path necessary to build up a feed, such as 
120         *  "http://www.codinginparadise.org/".  Basicly it appends a slash 
121         *  to the end if there is not one, and removes any file names that 
122         *  might be at the end, such as "myweblog.php".
123         *
124         *  There is a special exception for some Blosxom blogs,
125         *  which have things inside of a cgi-script and 'hang' their RSS files
126         *  off of this cgi-bin.  For example, 
127         *  http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file
128         *  at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so
129         *  we must return the blosxom.cgi at the end as well for this method.
130         * 
131         *  @throws MalformedURLException Thrown if the given resource's URL is 
132         *  incorrectly formatted.
133         */
134        public String getBaseFeedPath( String resource ) {
135            // strip off any query string or anchors
136            int end = resource.lastIndexOf( "#" );
137            
138            if ( end != -1 )
139                resource = resource.substring( 0, end );
140    
141            end = resource.lastIndexOf( "?" );
142    
143            if ( end != -1 )
144                resource = resource.substring( 0, end );
145    
146            Matcher fileMatcher = patternToStrip.matcher(resource);
147            if (fileMatcher.find()) {
148                String stringToStrip = fileMatcher.group(1);
149                int startStrip = resource.indexOf(stringToStrip);
150                resource = resource.substring(0, startStrip);
151            }
152            
153            if ( ! resource.endsWith( "/" ) ) {
154                resource = resource + "/";
155            }
156            
157            return resource;
158        }
159    
160        public String toString() {
161            return this.getClass().getName();
162        }
163        
164        public boolean equals(Object obj) {
165            if (obj == null)
166                return false;
167            
168            if (obj instanceof BlogService == false)
169                return false;
170            
171            return (obj.getClass().equals(this.getClass()));
172        }
173        
174        public int hashCode() {
175            return this.getClass().hashCode();
176        }
177        
178        /** Gets an array of all of the available BlogService implementations. */
179        public static BlogService[] getBlogServices() {
180            if (blogServices.size() == 0)
181                initializeBlogServices();
182            
183            BlogService[] results = new BlogService[blogServices.size()];
184            
185            return (BlogService[])blogServices.toArray(results);
186        }
187    
188        // **** util code ***********************************************************
189        // These methods are useful for non-abstract subclasses of this object
190        // to actually implement their functionality.
191        
192        /** Determines if the given resource contains the given domain name
193         *  fragment.
194         */
195        protected boolean containsDomain(String resource, String domain) {
196            return (resource.indexOf(domain) != -1);
197        }
198        
199        /**
200         * Determines if the given content was generated by the given generator.
201    
202         * Example. This document contains a meta tag with name="generator" and
203         * content equal to the generatorType).
204         */
205        protected boolean hasGenerator(String content, String generatorType) {
206            if (content == null) {
207                return false;
208            }
209            
210            Matcher metaTagsMatcher = metaTagsPattern.matcher(content);
211            if (metaTagsMatcher.find()) {
212                String metaTag = metaTagsMatcher.group(0).toLowerCase();
213                generatorType = generatorType.toLowerCase();
214                return (metaTag.indexOf(generatorType) != -1);
215            }
216            else {
217                return false;
218            }
219        }
220        
221        protected static void initializeBlogServices() {
222            blogServices.add(new AOLJournal());
223            blogServices.add(new Blogger());
224            blogServices.add(new Blosxom());
225            blogServices.add(new DiaryLand());
226            blogServices.add(new ExpressionEngine());
227            blogServices.add(new Flickr());
228            blogServices.add(new GreyMatter());
229            blogServices.add(new iBlog());
230            blogServices.add(new LiveJournal());
231            blogServices.add(new Manila());
232            blogServices.add(new MovableType());
233            blogServices.add(new PMachine());
234            blogServices.add(new RadioUserland());
235            blogServices.add(new TextAmerica());
236            blogServices.add(new TextPattern());
237            blogServices.add(new Typepad());
238            blogServices.add(new WordPress());
239            blogServices.add(new Xanga());
240            blogServices.add(new YahooGroups());
241        }
242    }