001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.locate; 018 019 import java.util.ArrayList; 020 import java.util.HashSet; 021 import java.util.Iterator; 022 import java.util.List; 023 import java.util.Set; 024 025 import org.apache.commons.feedparser.FeedList; 026 import org.apache.commons.feedparser.locate.blogservice.BlogService; 027 import org.apache.commons.feedparser.locate.blogservice.Unknown; 028 import org.apache.commons.feedparser.network.ResourceRequest; 029 import org.apache.commons.feedparser.network.ResourceRequestFactory; 030 import org.apache.log4j.Logger; 031 032 /** 033 * Locator which uses Link probing. It also attempts to determine the type of 034 * blog service provider it is dealing with, such as BlogSpot, Blogsxom, etc., 035 * in order to find feed URLs that are not specified through autodiscovery. 036 * 037 * If ProbeLocator.AGGRESIVE_PROBING_ENABLED is true (by default it is false), 038 * then we probe for links. 039 * 040 * 041 * 042 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a> 043 */ 044 public class ProbeLocator { 045 046 private static Logger log = Logger.getLogger( ProbeLocator.class ); 047 048 /** If true, then we aggresively probe a site if it doesn't have 049 * autodiscovery. This includes trying to determine what the blog provider 050 * is, trying individual locations based on a blog provider, and probing 051 * in several locations if the blog provider is unknown. 052 * 053 * The default value for this should be false. This should only be 054 * used on server-side aggregators that generate few requests, and 055 * _never_ on client-side aggregators. The level of traffic for 056 * client-side aggregators would be too great. 057 */ 058 public static boolean AGGRESIVE_PROBING_ENABLED = false; 059 060 /** If true, then after discovering what a site's blog provider is we 061 * probe in select locations for feeds based on the provider. This 062 * is useful if autodiscovery is not enabled on this blog and we don't 063 * want to do the full aggresive probing. 064 * 065 * The default value for this should be false. This should only 066 * be used on server-side aggregators that generate few requests, 067 * and _never_ on client-side aggregators. The level of traffic 068 * for client-side aggregators would be too great. 069 */ 070 public static boolean BLOG_SERVICE_PROBING_ENABLED = false; 071 072 073 /** 074 * 075 * 076 */ 077 public static final List locate( String resource, String content, FeedList list ) 078 throws Exception { 079 log.debug("ProbeLocator, resource="+resource+", list="+list); 080 081 // determine what blog service we are dealing with 082 BlogService blogService = BlogServiceDiscovery.discover( resource, content ); 083 log.debug("blogService="+blogService); 084 log.debug("blogService.hasValidAutoDiscovery="+blogService.hasValidAutoDiscovery()); 085 // fail-fast if we already have some results and if we determine that 086 // we can trust the results (TextAmerica has invalid autodiscovery, 087 // for example) 088 if ( list.size() > 0 && blogService.hasValidAutoDiscovery() ) 089 return list; 090 else if ( blogService.hasValidAutoDiscovery() == false ) { 091 // clear out the list so far since we can't trust the results 092 list.clear(); 093 } 094 095 if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) { 096 log.debug("PROBING!!"); 097 List servicesToTry = new ArrayList(); 098 servicesToTry.add(blogService); 099 // only try the Unknown service if we want aggresive probing 100 if (AGGRESIVE_PROBING_ENABLED) 101 servicesToTry.add(new Unknown()); 102 Iterator iter = servicesToTry.iterator(); 103 Set previousAttempts = new HashSet(); 104 105 while (iter.hasNext() && list.size() == 0) { 106 BlogService currentService = (BlogService)iter.next(); 107 FeedReference[] mapping = currentService.getFeedLocations(resource, content); 108 log.debug( "mapping = " + mapping ); 109 110 // try out each mapping 111 for (int i = 0; i < mapping.length; i++) { 112 String baseFeedPath = currentService.getBaseFeedPath(resource); 113 String pathToTest ; 114 // build up our path to test differently if we are a 115 // relative or an exact path; needed because some 116 // blog services rewrite the domain name, such as 117 // Yahoo Groups 118 if (mapping[i].isRelative()) 119 pathToTest = baseFeedPath + mapping[i].resource; 120 else 121 pathToTest = mapping[i].resource; 122 123 log.debug( "pathToTest = " + pathToTest ); 124 125 if ( !previousAttempts.contains( pathToTest ) 126 && feedExists( pathToTest, currentService ) ) { 127 log.debug("Feed exists"); 128 FeedReference feedReference = new FeedReference( pathToTest, 129 mapping[i].type ); 130 feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY; 131 previousAttempts.add( pathToTest ); 132 onFeedReference( feedReference, list ); 133 } 134 135 // record this attempt so we don't repeat it again if 136 // we are doing aggresive probing 137 previousAttempts.add( pathToTest ); 138 } 139 } 140 141 log.info( "Using aggresive probing, found the following:" ); 142 log.info( "Blog service: " + blogService ); 143 } 144 145 log.info( "List: " + list ); 146 log.info( "RSS feed: " + list.getAdRSSFeed() ); 147 log.info( "Atom feed: " + list.getAdAtomFeed() ); 148 return list; 149 150 } 151 152 /** 153 * Called each time we find a feed so that we can set the Ad method. 154 * 155 * FIXME: This doesn't seem like the right place for this. Can you 156 * document this more? It's cryptic. Brad Neuberg, bkn3@columbia.edu. 157 * 158 */ 159 private static void onFeedReference( FeedReference ref, FeedList list ) { 160 161 if ( list.getAdAtomFeed() == null && 162 FeedReference.ATOM_MEDIA_TYPE.equals( ref.type ) ) { 163 164 list.setAdAtomFeed( ref ); 165 166 } else if ( list.getAdRSSFeed() == null && 167 FeedReference.RSS_MEDIA_TYPE.equals( ref.type ) ) { 168 169 list.setAdRSSFeed( ref ); 170 171 } 172 173 list.add( ref ); 174 175 } 176 177 /** Does an HTTP HEAD to see if the given resource exists. 178 * 179 * @param resource The full URI to the resource to check for. 180 * 181 * 182 */ 183 protected static boolean feedExists( String resource, 184 BlogService blogService) 185 throws Exception { 186 187 log.debug("feedExists, resource="+resource); 188 ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource ); 189 190 request.setRequestMethod( "HEAD" ); 191 192 // Some services need to follow redirects; others block if you do. 193 // Ask the blog service itself what to do. 194 request.setFollowRedirects( blogService.followRedirects() ); 195 196 // the call below actually causes the connection to be made 197 request.getContentLength(); 198 199 long response = request.getResponseCode(); 200 log.debug("response="+response); 201 202 return response == 200; 203 } 204 205 206 207 }