View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser.locate.blogservice;
18  
19  import java.net.*;
20  import java.util.*;
21  import java.util.regex.*;
22  
23  import org.apache.commons.feedparser.*;
24  import org.apache.commons.feedparser.locate.*;
25  
26  /**
27   * Models the different kinds of blog services that are available.  This
28   * is needed for two reasons.  First, sometimes it is useful to simply
29   * know what provider a given weblog is being hosted by, such as Blogger
30   * or PMachine, in order to use special, non-standard capabilities.  Second,
31   * many services have "quirks" that don't follow the standards, such as
32   * supporting autodiscovery or supporting it in an incorrect way, and we
33   * therefore need to know what service we are dealing with so that we
34   * can find its feed.
35   * 
36   * The BlogService object encapsulates how to determine if a given
37   * weblog is of that type and how to find its feeds.  Concrete subclasses,
38   * such as org.apache.commons.feedparser.locate.blogservice.Blogger,
39   * fill in this class and provide the actual way to determine these
40   * things for each blog service type.
41   * 
42   * @author Brad Neuberg, bkn3@columbia.edu
43   */
44  public abstract class BlogService {
45      protected static List blogServices = new ArrayList();
46      
47      /** Subclasses should have a static block similar to the following:
48       *  <code>
49       *      {
50       *          BlogService.addBlogService(new MyBlogService());
51       *      }
52       *  </code>
53       */
54      
55      /** Locates all the generator meta tags
56       *  (i.e. <meta content="generator" content="someGenerator"/>)
57       */
58      protected static Pattern metaTagsPattern = 
59                  Pattern.compile("<[\\s]*meta[\\w\\s=\"']*name=['\" ]generator[\"' ][\\w\\s=\"']*[^>]*",
60                                  Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
61      
62      /**
63       * A regex to find any trailing filename and strip it
64       */
65      protected static Pattern patternToStrip = Pattern.compile("[^/](/\\w*\\.\\w*$)"); 
66          
67      /** Returns whether we can trust the results of this blog service's 
68       *  autodiscovery links.  For example, TextAmerica returns invalid 
69       *  autodiscovery results.
70       */
71      public abstract boolean hasValidAutoDiscovery();
72      
73      /** Returns whether we should follow HTTP redirects for this blog service.
74       *  Some services don't implement HTTP redirects correctly, while others,
75       *  like Xanga, require it.
76       */
77      public abstract boolean followRedirects();
78      
79      /** Determines if the weblog at the given resource and with the given
80       *  content is this blog service.
81       * @param resource A full URI to this resource, such as 
82       * "http://www.codinginparadise.org".
83       * @param content The full HTML content at the resource's URL.
84       * @throws FeedParserException Thrown if an error occurs while 
85       * determining the type of this weblog.
86       */
87      public abstract boolean isThisService(String resource, String content)
88                                                  throws FeedParserException;
89  
90      /**
91       * Returns an array of FeedReferences that contains information on the
92       * usual locations this blog service contains its feed.  The feeds should
93       * be ordered by quality, so that higher quality feeds come before lower
94       * quality ones (i.e. you would want to have an Atom FeedReference
95       * object come before an RSS 0.91 FeedReference object in this list).
96       * @param resource A URL to the given weblog that might be used to build
97       * up where feeds are usually located.
98       * @param content The full content of the resource URL, which might
99       * be useful to determine where feeds are usually located.  This can be
100      * null.
101      * @throws FeedParserException Thrown if an error occurs while trying
102      * to determine the usual locations of feeds for this service.
103      */
104     public abstract FeedReference[] getFeedLocations(String resource,
105                                                      String content)
106                                                 throws FeedParserException;
107     
108     /** Determines if the weblog at the given resource is this blog service.
109      *  @param resource A full URI to this resource, such as 
110      *  "http://www.codinginparadise.org".
111      *  @throws FeedParserException Thrown if an error occurs while 
112      *  determining the type of this weblog.
113      */
114     public boolean isThisService(String resource) throws FeedParserException {
115         return isThisService(resource, null);
116     }
117     
118     /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php" target="alexandria_uri">http://www.codinginparadise.org/myweblog.php",
119      *  and gets the path necessary to build up a feed, such as 
120      *  "http://www.codinginparadise.org/".  Basicly it appends a slash 
121      *  to the end if there is not one, and removes any file names that 
122      *  might be at the end, such as "myweblog.php".
123      *
124      *  There is a special exception for some Blosxom blogs,
125      *  which have things inside of a cgi-script and 'hang' their RSS files
126      *  off of this cgi-bin.  For example, 
127      *  http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file
128      *  at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so
129      *  we must return the blosxom.cgi at the end as well for this method.
130      * 
131      *  @throws MalformedURLException Thrown if the given resource's URL is 
132      *  incorrectly formatted.
133      */
134     public String getBaseFeedPath( String resource ) {
135         // strip off any query string or anchors
136         int end = resource.lastIndexOf( "#" );
137         
138         if ( end != -1 )
139             resource = resource.substring( 0, end );
140 
141         end = resource.lastIndexOf( "?" );
142 
143         if ( end != -1 )
144             resource = resource.substring( 0, end );
145 
146         Matcher fileMatcher = patternToStrip.matcher(resource);
147         if (fileMatcher.find()) {
148             String stringToStrip = fileMatcher.group(1);
149             int startStrip = resource.indexOf(stringToStrip);
150             resource = resource.substring(0, startStrip);
151         }
152         
153         if ( ! resource.endsWith( "/" ) ) {
154             resource = resource + "/";
155         }
156         
157         return resource;
158     }
159 
160     public String toString() {
161         return this.getClass().getName();
162     }
163     
164     public boolean equals(Object obj) {
165         if (obj == null)
166             return false;
167         
168         if (obj instanceof BlogService == false)
169             return false;
170         
171         return (obj.getClass().equals(this.getClass()));
172     }
173     
174     public int hashCode() {
175         return this.getClass().hashCode();
176     }
177     
178     /** Gets an array of all of the available BlogService implementations. */
179     public static BlogService[] getBlogServices() {
180         if (blogServices.size() == 0)
181             initializeBlogServices();
182         
183         BlogService[] results = new BlogService[blogServices.size()];
184         
185         return (BlogService[])blogServices.toArray(results);
186     }
187 
188     // **** util code ***********************************************************
189     // These methods are useful for non-abstract subclasses of this object
190     // to actually implement their functionality.
191     
192     /** Determines if the given resource contains the given domain name
193      *  fragment.
194      */
195     protected boolean containsDomain(String resource, String domain) {
196         return (resource.indexOf(domain) != -1);
197     }
198     
199     /**
200      * Determines if the given content was generated by the given generator.
201 
202      * Example. This document contains a meta tag with name="generator" and
203      * content equal to the generatorType).
204      */
205     protected boolean hasGenerator(String content, String generatorType) {
206         if (content == null) {
207             return false;
208         }
209         
210         Matcher metaTagsMatcher = metaTagsPattern.matcher(content);
211         if (metaTagsMatcher.find()) {
212             String metaTag = metaTagsMatcher.group(0).toLowerCase();
213             generatorType = generatorType.toLowerCase();
214             return (metaTag.indexOf(generatorType) != -1);
215         }
216         else {
217             return false;
218         }
219     }
220     
221     protected static void initializeBlogServices() {
222         blogServices.add(new AOLJournal());
223         blogServices.add(new Blogger());
224         blogServices.add(new Blosxom());
225         blogServices.add(new DiaryLand());
226         blogServices.add(new ExpressionEngine());
227         blogServices.add(new Flickr());
228         blogServices.add(new GreyMatter());
229         blogServices.add(new iBlog());
230         blogServices.add(new LiveJournal());
231         blogServices.add(new Manila());
232         blogServices.add(new MovableType());
233         blogServices.add(new PMachine());
234         blogServices.add(new RadioUserland());
235         blogServices.add(new TextAmerica());
236         blogServices.add(new TextPattern());
237         blogServices.add(new Typepad());
238         blogServices.add(new WordPress());
239         blogServices.add(new Xanga());
240         blogServices.add(new YahooGroups());
241     }
242 }