001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.locate.blogservice; 018 019 import java.net.*; 020 import java.util.*; 021 import java.util.regex.*; 022 023 import org.apache.commons.feedparser.*; 024 import org.apache.commons.feedparser.locate.*; 025 026 /** 027 * Models the different kinds of blog services that are available. This 028 * is needed for two reasons. First, sometimes it is useful to simply 029 * know what provider a given weblog is being hosted by, such as Blogger 030 * or PMachine, in order to use special, non-standard capabilities. Second, 031 * many services have "quirks" that don't follow the standards, such as 032 * supporting autodiscovery or supporting it in an incorrect way, and we 033 * therefore need to know what service we are dealing with so that we 034 * can find its feed. 035 * 036 * The BlogService object encapsulates how to determine if a given 037 * weblog is of that type and how to find its feeds. Concrete subclasses, 038 * such as org.apache.commons.feedparser.locate.blogservice.Blogger, 039 * fill in this class and provide the actual way to determine these 040 * things for each blog service type. 041 * 042 * @author Brad Neuberg, bkn3@columbia.edu 043 */ 044 public abstract class BlogService { 045 protected static List blogServices = new ArrayList(); 046 047 /** Subclasses should have a static block similar to the following: 048 * <code> 049 * { 050 * BlogService.addBlogService(new MyBlogService()); 051 * } 052 * </code> 053 */ 054 055 /** Locates all the generator meta tags 056 * (i.e. <meta content="generator" content="someGenerator"/>) 057 */ 058 protected static Pattern metaTagsPattern = 059 Pattern.compile("<[\\s]*meta[\\w\\s=\"']*name=['\" ]generator[\"' ][\\w\\s=\"']*[^>]*", 060 Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); 061 062 /** 063 * A regex to find any trailing filename and strip it 064 */ 065 protected static Pattern patternToStrip = Pattern.compile("[^/](/\\w*\\.\\w*$)"); 066 067 /** Returns whether we can trust the results of this blog service's 068 * autodiscovery links. For example, TextAmerica returns invalid 069 * autodiscovery results. 070 */ 071 public abstract boolean hasValidAutoDiscovery(); 072 073 /** Returns whether we should follow HTTP redirects for this blog service. 074 * Some services don't implement HTTP redirects correctly, while others, 075 * like Xanga, require it. 076 */ 077 public abstract boolean followRedirects(); 078 079 /** Determines if the weblog at the given resource and with the given 080 * content is this blog service. 081 * @param resource A full URI to this resource, such as 082 * "http://www.codinginparadise.org". 083 * @param content The full HTML content at the resource's URL. 084 * @throws FeedParserException Thrown if an error occurs while 085 * determining the type of this weblog. 086 */ 087 public abstract boolean isThisService(String resource, String content) 088 throws FeedParserException; 089 090 /** 091 * Returns an array of FeedReferences that contains information on the 092 * usual locations this blog service contains its feed. The feeds should 093 * be ordered by quality, so that higher quality feeds come before lower 094 * quality ones (i.e. you would want to have an Atom FeedReference 095 * object come before an RSS 0.91 FeedReference object in this list). 096 * @param resource A URL to the given weblog that might be used to build 097 * up where feeds are usually located. 098 * @param content The full content of the resource URL, which might 099 * be useful to determine where feeds are usually located. This can be 100 * null. 101 * @throws FeedParserException Thrown if an error occurs while trying 102 * to determine the usual locations of feeds for this service. 103 */ 104 public abstract FeedReference[] getFeedLocations(String resource, 105 String content) 106 throws FeedParserException; 107 108 /** Determines if the weblog at the given resource is this blog service. 109 * @param resource A full URI to this resource, such as 110 * "http://www.codinginparadise.org". 111 * @throws FeedParserException Thrown if an error occurs while 112 * determining the type of this weblog. 113 */ 114 public boolean isThisService(String resource) throws FeedParserException { 115 return isThisService(resource, null); 116 } 117 118 /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php", 119 * and gets the path necessary to build up a feed, such as 120 * "http://www.codinginparadise.org/". Basicly it appends a slash 121 * to the end if there is not one, and removes any file names that 122 * might be at the end, such as "myweblog.php". 123 * 124 * There is a special exception for some Blosxom blogs, 125 * which have things inside of a cgi-script and 'hang' their RSS files 126 * off of this cgi-bin. For example, 127 * http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file 128 * at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so 129 * we must return the blosxom.cgi at the end as well for this method. 130 * 131 * @throws MalformedURLException Thrown if the given resource's URL is 132 * incorrectly formatted. 133 */ 134 public String getBaseFeedPath( String resource ) { 135 // strip off any query string or anchors 136 int end = resource.lastIndexOf( "#" ); 137 138 if ( end != -1 ) 139 resource = resource.substring( 0, end ); 140 141 end = resource.lastIndexOf( "?" ); 142 143 if ( end != -1 ) 144 resource = resource.substring( 0, end ); 145 146 Matcher fileMatcher = patternToStrip.matcher(resource); 147 if (fileMatcher.find()) { 148 String stringToStrip = fileMatcher.group(1); 149 int startStrip = resource.indexOf(stringToStrip); 150 resource = resource.substring(0, startStrip); 151 } 152 153 if ( ! resource.endsWith( "/" ) ) { 154 resource = resource + "/"; 155 } 156 157 return resource; 158 } 159 160 public String toString() { 161 return this.getClass().getName(); 162 } 163 164 public boolean equals(Object obj) { 165 if (obj == null) 166 return false; 167 168 if (obj instanceof BlogService == false) 169 return false; 170 171 return (obj.getClass().equals(this.getClass())); 172 } 173 174 public int hashCode() { 175 return this.getClass().hashCode(); 176 } 177 178 /** Gets an array of all of the available BlogService implementations. */ 179 public static BlogService[] getBlogServices() { 180 if (blogServices.size() == 0) 181 initializeBlogServices(); 182 183 BlogService[] results = new BlogService[blogServices.size()]; 184 185 return (BlogService[])blogServices.toArray(results); 186 } 187 188 // **** util code *********************************************************** 189 // These methods are useful for non-abstract subclasses of this object 190 // to actually implement their functionality. 191 192 /** Determines if the given resource contains the given domain name 193 * fragment. 194 */ 195 protected boolean containsDomain(String resource, String domain) { 196 return (resource.indexOf(domain) != -1); 197 } 198 199 /** 200 * Determines if the given content was generated by the given generator. 201 202 * Example. This document contains a meta tag with name="generator" and 203 * content equal to the generatorType). 204 */ 205 protected boolean hasGenerator(String content, String generatorType) { 206 if (content == null) { 207 return false; 208 } 209 210 Matcher metaTagsMatcher = metaTagsPattern.matcher(content); 211 if (metaTagsMatcher.find()) { 212 String metaTag = metaTagsMatcher.group(0).toLowerCase(); 213 generatorType = generatorType.toLowerCase(); 214 return (metaTag.indexOf(generatorType) != -1); 215 } 216 else { 217 return false; 218 } 219 } 220 221 protected static void initializeBlogServices() { 222 blogServices.add(new AOLJournal()); 223 blogServices.add(new Blogger()); 224 blogServices.add(new Blosxom()); 225 blogServices.add(new DiaryLand()); 226 blogServices.add(new ExpressionEngine()); 227 blogServices.add(new Flickr()); 228 blogServices.add(new GreyMatter()); 229 blogServices.add(new iBlog()); 230 blogServices.add(new LiveJournal()); 231 blogServices.add(new Manila()); 232 blogServices.add(new MovableType()); 233 blogServices.add(new PMachine()); 234 blogServices.add(new RadioUserland()); 235 blogServices.add(new TextAmerica()); 236 blogServices.add(new TextPattern()); 237 blogServices.add(new Typepad()); 238 blogServices.add(new WordPress()); 239 blogServices.add(new Xanga()); 240 blogServices.add(new YahooGroups()); 241 } 242 }