001 /*
002 * Copyright 1999,2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package org.apache.commons.feedparser.locate;
018
019 import java.util.ArrayList;
020 import java.util.HashSet;
021 import java.util.Iterator;
022 import java.util.List;
023 import java.util.Set;
024
025 import org.apache.commons.feedparser.FeedList;
026 import org.apache.commons.feedparser.locate.blogservice.BlogService;
027 import org.apache.commons.feedparser.locate.blogservice.Unknown;
028 import org.apache.commons.feedparser.network.ResourceRequest;
029 import org.apache.commons.feedparser.network.ResourceRequestFactory;
030 import org.apache.log4j.Logger;
031
032 /**
033 * Locator which uses Link probing. It also attempts to determine the type of
034 * blog service provider it is dealing with, such as BlogSpot, Blogsxom, etc.,
035 * in order to find feed URLs that are not specified through autodiscovery.
036 *
037 * If ProbeLocator.AGGRESIVE_PROBING_ENABLED is true (by default it is false),
038 * then we probe for links.
039 *
040 *
041 *
042 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
043 */
044 public class ProbeLocator {
045
046 private static Logger log = Logger.getLogger( ProbeLocator.class );
047
048 /** If true, then we aggresively probe a site if it doesn't have
049 * autodiscovery. This includes trying to determine what the blog provider
050 * is, trying individual locations based on a blog provider, and probing
051 * in several locations if the blog provider is unknown.
052 *
053 * The default value for this should be false. This should only be
054 * used on server-side aggregators that generate few requests, and
055 * _never_ on client-side aggregators. The level of traffic for
056 * client-side aggregators would be too great.
057 */
058 public static boolean AGGRESIVE_PROBING_ENABLED = false;
059
060 /** If true, then after discovering what a site's blog provider is we
061 * probe in select locations for feeds based on the provider. This
062 * is useful if autodiscovery is not enabled on this blog and we don't
063 * want to do the full aggresive probing.
064 *
065 * The default value for this should be false. This should only
066 * be used on server-side aggregators that generate few requests,
067 * and _never_ on client-side aggregators. The level of traffic
068 * for client-side aggregators would be too great.
069 */
070 public static boolean BLOG_SERVICE_PROBING_ENABLED = false;
071
072
073 /**
074 *
075 *
076 */
077 public static final List locate( String resource, String content, FeedList list )
078 throws Exception {
079 log.debug("ProbeLocator, resource="+resource+", list="+list);
080
081 // determine what blog service we are dealing with
082 BlogService blogService = BlogServiceDiscovery.discover( resource, content );
083 log.debug("blogService="+blogService);
084 log.debug("blogService.hasValidAutoDiscovery="+blogService.hasValidAutoDiscovery());
085 // fail-fast if we already have some results and if we determine that
086 // we can trust the results (TextAmerica has invalid autodiscovery,
087 // for example)
088 if ( list.size() > 0 && blogService.hasValidAutoDiscovery() )
089 return list;
090 else if ( blogService.hasValidAutoDiscovery() == false ) {
091 // clear out the list so far since we can't trust the results
092 list.clear();
093 }
094
095 if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) {
096 log.debug("PROBING!!");
097 List servicesToTry = new ArrayList();
098 servicesToTry.add(blogService);
099 // only try the Unknown service if we want aggresive probing
100 if (AGGRESIVE_PROBING_ENABLED)
101 servicesToTry.add(new Unknown());
102 Iterator iter = servicesToTry.iterator();
103 Set previousAttempts = new HashSet();
104
105 while (iter.hasNext() && list.size() == 0) {
106 BlogService currentService = (BlogService)iter.next();
107 FeedReference[] mapping = currentService.getFeedLocations(resource, content);
108 log.debug( "mapping = " + mapping );
109
110 // try out each mapping
111 for (int i = 0; i < mapping.length; i++) {
112 String baseFeedPath = currentService.getBaseFeedPath(resource);
113 String pathToTest ;
114 // build up our path to test differently if we are a
115 // relative or an exact path; needed because some
116 // blog services rewrite the domain name, such as
117 // Yahoo Groups
118 if (mapping[i].isRelative())
119 pathToTest = baseFeedPath + mapping[i].resource;
120 else
121 pathToTest = mapping[i].resource;
122
123 log.debug( "pathToTest = " + pathToTest );
124
125 if ( !previousAttempts.contains( pathToTest )
126 && feedExists( pathToTest, currentService ) ) {
127 log.debug("Feed exists");
128 FeedReference feedReference = new FeedReference( pathToTest,
129 mapping[i].type );
130 feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
131 previousAttempts.add( pathToTest );
132 onFeedReference( feedReference, list );
133 }
134
135 // record this attempt so we don't repeat it again if
136 // we are doing aggresive probing
137 previousAttempts.add( pathToTest );
138 }
139 }
140
141 log.info( "Using aggresive probing, found the following:" );
142 log.info( "Blog service: " + blogService );
143 }
144
145 log.info( "List: " + list );
146 log.info( "RSS feed: " + list.getAdRSSFeed() );
147 log.info( "Atom feed: " + list.getAdAtomFeed() );
148 return list;
149
150 }
151
152 /**
153 * Called each time we find a feed so that we can set the Ad method.
154 *
155 * FIXME: This doesn't seem like the right place for this. Can you
156 * document this more? It's cryptic. Brad Neuberg, bkn3@columbia.edu.
157 *
158 */
159 private static void onFeedReference( FeedReference ref, FeedList list ) {
160
161 if ( list.getAdAtomFeed() == null &&
162 FeedReference.ATOM_MEDIA_TYPE.equals( ref.type ) ) {
163
164 list.setAdAtomFeed( ref );
165
166 } else if ( list.getAdRSSFeed() == null &&
167 FeedReference.RSS_MEDIA_TYPE.equals( ref.type ) ) {
168
169 list.setAdRSSFeed( ref );
170
171 }
172
173 list.add( ref );
174
175 }
176
177 /** Does an HTTP HEAD to see if the given resource exists.
178 *
179 * @param resource The full URI to the resource to check for.
180 *
181 *
182 */
183 protected static boolean feedExists( String resource,
184 BlogService blogService)
185 throws Exception {
186
187 log.debug("feedExists, resource="+resource);
188 ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
189
190 request.setRequestMethod( "HEAD" );
191
192 // Some services need to follow redirects; others block if you do.
193 // Ask the blog service itself what to do.
194 request.setFollowRedirects( blogService.followRedirects() );
195
196 // the call below actually causes the connection to be made
197 request.getContentLength();
198
199 long response = request.getResponseCode();
200 log.debug("response="+response);
201
202 return response == 200;
203 }
204
205
206
207 }