1 /* 2 * Copyright 1999,2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package org.apache.commons.feedparser.locate.blogservice; 18 19 import java.net.*; 20 import java.util.*; 21 import java.util.regex.*; 22 23 import org.apache.commons.feedparser.*; 24 import org.apache.commons.feedparser.locate.*; 25 26 /** 27 * Models the different kinds of blog services that are available. This 28 * is needed for two reasons. First, sometimes it is useful to simply 29 * know what provider a given weblog is being hosted by, such as Blogger 30 * or PMachine, in order to use special, non-standard capabilities. Second, 31 * many services have "quirks" that don't follow the standards, such as 32 * supporting autodiscovery or supporting it in an incorrect way, and we 33 * therefore need to know what service we are dealing with so that we 34 * can find its feed. 35 * 36 * The BlogService object encapsulates how to determine if a given 37 * weblog is of that type and how to find its feeds. Concrete subclasses, 38 * such as org.apache.commons.feedparser.locate.blogservice.Blogger, 39 * fill in this class and provide the actual way to determine these 40 * things for each blog service type. 41 * 42 * @author Brad Neuberg, bkn3@columbia.edu 43 */ 44 public abstract class BlogService { 45 protected static List blogServices = new ArrayList(); 46 47 /** Subclasses should have a static block similar to the following: 48 * <code> 49 * { 50 * BlogService.addBlogService(new MyBlogService()); 51 * } 52 * </code> 53 */ 54 55 /** Locates all the generator meta tags 56 * (i.e. <meta content="generator" content="someGenerator"/>) 57 */ 58 protected static Pattern metaTagsPattern = 59 Pattern.compile("<[\\s]*meta[\\w\\s=\"']*name=['\" ]generator[\"' ][\\w\\s=\"']*[^>]*", 60 Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); 61 62 /** 63 * A regex to find any trailing filename and strip it 64 */ 65 protected static Pattern patternToStrip = Pattern.compile("[^/](/\\w*\\.\\w*$)"); 66 67 /** Returns whether we can trust the results of this blog service's 68 * autodiscovery links. For example, TextAmerica returns invalid 69 * autodiscovery results. 70 */ 71 public abstract boolean hasValidAutoDiscovery(); 72 73 /** Returns whether we should follow HTTP redirects for this blog service. 74 * Some services don't implement HTTP redirects correctly, while others, 75 * like Xanga, require it. 76 */ 77 public abstract boolean followRedirects(); 78 79 /** Determines if the weblog at the given resource and with the given 80 * content is this blog service. 81 * @param resource A full URI to this resource, such as 82 * "http://www.codinginparadise.org". 83 * @param content The full HTML content at the resource's URL. 84 * @throws FeedParserException Thrown if an error occurs while 85 * determining the type of this weblog. 86 */ 87 public abstract boolean isThisService(String resource, String content) 88 throws FeedParserException; 89 90 /** 91 * Returns an array of FeedReferences that contains information on the 92 * usual locations this blog service contains its feed. The feeds should 93 * be ordered by quality, so that higher quality feeds come before lower 94 * quality ones (i.e. you would want to have an Atom FeedReference 95 * object come before an RSS 0.91 FeedReference object in this list). 96 * @param resource A URL to the given weblog that might be used to build 97 * up where feeds are usually located. 98 * @param content The full content of the resource URL, which might 99 * be useful to determine where feeds are usually located. This can be 100 * null. 101 * @throws FeedParserException Thrown if an error occurs while trying 102 * to determine the usual locations of feeds for this service. 103 */ 104 public abstract FeedReference[] getFeedLocations(String resource, 105 String content) 106 throws FeedParserException; 107 108 /** Determines if the weblog at the given resource is this blog service. 109 * @param resource A full URI to this resource, such as 110 * "http://www.codinginparadise.org". 111 * @throws FeedParserException Thrown if an error occurs while 112 * determining the type of this weblog. 113 */ 114 public boolean isThisService(String resource) throws FeedParserException { 115 return isThisService(resource, null); 116 } 117 118 /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php" target="alexandria_uri">http://www.codinginparadise.org/myweblog.php", 119 * and gets the path necessary to build up a feed, such as 120 * "http://www.codinginparadise.org/". Basicly it appends a slash 121 * to the end if there is not one, and removes any file names that 122 * might be at the end, such as "myweblog.php". 123 * 124 * There is a special exception for some Blosxom blogs, 125 * which have things inside of a cgi-script and 'hang' their RSS files 126 * off of this cgi-bin. For example, 127 * http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi has its RSS file 128 * at http://www.bitbucketheaven.com/cgi-bin/blosxom.cgi/index.rss, so 129 * we must return the blosxom.cgi at the end as well for this method. 130 * 131 * @throws MalformedURLException Thrown if the given resource's URL is 132 * incorrectly formatted. 133 */ 134 public String getBaseFeedPath( String resource ) { 135 // strip off any query string or anchors 136 int end = resource.lastIndexOf( "#" ); 137 138 if ( end != -1 ) 139 resource = resource.substring( 0, end ); 140 141 end = resource.lastIndexOf( "?" ); 142 143 if ( end != -1 ) 144 resource = resource.substring( 0, end ); 145 146 Matcher fileMatcher = patternToStrip.matcher(resource); 147 if (fileMatcher.find()) { 148 String stringToStrip = fileMatcher.group(1); 149 int startStrip = resource.indexOf(stringToStrip); 150 resource = resource.substring(0, startStrip); 151 } 152 153 if ( ! resource.endsWith( "/" ) ) { 154 resource = resource + "/"; 155 } 156 157 return resource; 158 } 159 160 public String toString() { 161 return this.getClass().getName(); 162 } 163 164 public boolean equals(Object obj) { 165 if (obj == null) 166 return false; 167 168 if (obj instanceof BlogService == false) 169 return false; 170 171 return (obj.getClass().equals(this.getClass())); 172 } 173 174 public int hashCode() { 175 return this.getClass().hashCode(); 176 } 177 178 /** Gets an array of all of the available BlogService implementations. */ 179 public static BlogService[] getBlogServices() { 180 if (blogServices.size() == 0) 181 initializeBlogServices(); 182 183 BlogService[] results = new BlogService[blogServices.size()]; 184 185 return (BlogService[])blogServices.toArray(results); 186 } 187 188 // **** util code *********************************************************** 189 // These methods are useful for non-abstract subclasses of this object 190 // to actually implement their functionality. 191 192 /** Determines if the given resource contains the given domain name 193 * fragment. 194 */ 195 protected boolean containsDomain(String resource, String domain) { 196 return (resource.indexOf(domain) != -1); 197 } 198 199 /** 200 * Determines if the given content was generated by the given generator. 201 202 * Example. This document contains a meta tag with name="generator" and 203 * content equal to the generatorType). 204 */ 205 protected boolean hasGenerator(String content, String generatorType) { 206 if (content == null) { 207 return false; 208 } 209 210 Matcher metaTagsMatcher = metaTagsPattern.matcher(content); 211 if (metaTagsMatcher.find()) { 212 String metaTag = metaTagsMatcher.group(0).toLowerCase(); 213 generatorType = generatorType.toLowerCase(); 214 return (metaTag.indexOf(generatorType) != -1); 215 } 216 else { 217 return false; 218 } 219 } 220 221 protected static void initializeBlogServices() { 222 blogServices.add(new AOLJournal()); 223 blogServices.add(new Blogger()); 224 blogServices.add(new Blosxom()); 225 blogServices.add(new DiaryLand()); 226 blogServices.add(new ExpressionEngine()); 227 blogServices.add(new Flickr()); 228 blogServices.add(new GreyMatter()); 229 blogServices.add(new iBlog()); 230 blogServices.add(new LiveJournal()); 231 blogServices.add(new Manila()); 232 blogServices.add(new MovableType()); 233 blogServices.add(new PMachine()); 234 blogServices.add(new RadioUserland()); 235 blogServices.add(new TextAmerica()); 236 blogServices.add(new TextPattern()); 237 blogServices.add(new Typepad()); 238 blogServices.add(new WordPress()); 239 blogServices.add(new Xanga()); 240 blogServices.add(new YahooGroups()); 241 } 242 }