001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate;
018    
019    import java.util.ArrayList;
020    import java.util.HashSet;
021    import java.util.Iterator;
022    import java.util.List;
023    import java.util.Set;
024    
025    import org.apache.commons.feedparser.FeedList;
026    import org.apache.commons.feedparser.locate.blogservice.BlogService;
027    import org.apache.commons.feedparser.locate.blogservice.Unknown;
028    import org.apache.commons.feedparser.network.ResourceRequest;
029    import org.apache.commons.feedparser.network.ResourceRequestFactory;
030    import org.apache.log4j.Logger;
031    
032    /**
033     * Locator which uses Link probing.  It also attempts to determine the type of
034     * blog service provider it is dealing with, such as BlogSpot, Blogsxom, etc.,
035     * in order to find feed URLs that are not specified through autodiscovery.
036     * 
037     * If ProbeLocator.AGGRESIVE_PROBING_ENABLED is true (by default it is false),
038     * then we probe for links.
039     * 
040     * 
041     * 
042     * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
043     */
044    public class ProbeLocator {
045    
046        private static Logger log = Logger.getLogger( ProbeLocator.class );
047    
048        /** If true, then we aggresively probe a site if it doesn't have
049         *  autodiscovery.  This includes trying to determine what the blog provider
050         *  is, trying individual locations based on a blog provider, and probing
051         *  in several locations if the blog provider is unknown.
052         * 
053         *  The default value for this should be false.  This should only be 
054         *  used on server-side aggregators that generate few requests, and 
055         *  _never_ on client-side aggregators.  The level of traffic for 
056         *  client-side aggregators would be too great.
057         */
058        public static boolean AGGRESIVE_PROBING_ENABLED = false;
059        
060        /** If true, then after discovering what a site's blog provider is we
061         *  probe in select locations for feeds based on the provider.  This
062         *  is useful if autodiscovery is not enabled on this blog and we don't
063         *  want to do the full aggresive probing.
064         * 
065         *  The default value for this should be false.  This should only 
066         *  be used on server-side aggregators that generate few requests, 
067         *  and _never_ on client-side aggregators.  The level of traffic 
068         *  for client-side aggregators would be too great.
069         */
070        public static boolean BLOG_SERVICE_PROBING_ENABLED = false;
071    
072        
073        /**
074         *
075         * 
076         */
077        public static final List locate( String resource, String content, FeedList list )
078            throws Exception {
079            log.debug("ProbeLocator, resource="+resource+", list="+list);
080    
081            // determine what blog service we are dealing with
082            BlogService blogService = BlogServiceDiscovery.discover( resource, content );  
083            log.debug("blogService="+blogService);
084            log.debug("blogService.hasValidAutoDiscovery="+blogService.hasValidAutoDiscovery());
085            // fail-fast if we already have some results and if we determine that
086            // we can trust the results (TextAmerica has invalid autodiscovery,
087            // for example)
088            if ( list.size() > 0 && blogService.hasValidAutoDiscovery() )
089                return list;
090            else if ( blogService.hasValidAutoDiscovery() == false ) {
091                // clear out the list so far since we can't trust the results
092                list.clear();
093            }
094    
095            if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) {
096                log.debug("PROBING!!");
097                List servicesToTry = new ArrayList();
098                servicesToTry.add(blogService);
099                // only try the Unknown service if we want aggresive probing
100                if (AGGRESIVE_PROBING_ENABLED)
101                    servicesToTry.add(new Unknown());
102                Iterator iter = servicesToTry.iterator();
103                Set previousAttempts = new HashSet();
104                
105                while (iter.hasNext() && list.size() == 0) {
106                    BlogService currentService = (BlogService)iter.next();
107                    FeedReference[] mapping = currentService.getFeedLocations(resource, content);
108                    log.debug( "mapping = " + mapping );
109                
110                    // try out each mapping
111                    for (int i = 0; i < mapping.length; i++) {
112                        String baseFeedPath = currentService.getBaseFeedPath(resource);
113                        String pathToTest ;
114                        // build up our path to test differently if we are a
115                        // relative or an exact path; needed because some
116                        // blog services rewrite the domain name, such as
117                        // Yahoo Groups
118                        if (mapping[i].isRelative())
119                            pathToTest = baseFeedPath + mapping[i].resource;
120                        else
121                            pathToTest = mapping[i].resource;
122                        
123                        log.debug( "pathToTest = " + pathToTest );
124    
125                        if ( !previousAttempts.contains( pathToTest ) 
126                             && feedExists( pathToTest, currentService ) ) {
127                            log.debug("Feed exists");
128                            FeedReference feedReference = new FeedReference( pathToTest,
129                                                                             mapping[i].type );
130                            feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;       
131                            previousAttempts.add( pathToTest );
132                            onFeedReference( feedReference, list );
133                        }
134                    
135                        // record this attempt so we don't repeat it again if
136                        // we are doing aggresive probing
137                        previousAttempts.add( pathToTest );
138                    }
139                }
140    
141                log.info( "Using aggresive probing, found the following:" );
142                log.info( "Blog service: " + blogService );
143            }
144    
145            log.info( "List: " + list );
146            log.info( "RSS feed: " + list.getAdRSSFeed() );
147            log.info( "Atom feed: " + list.getAdAtomFeed() );
148            return list;
149    
150        }
151    
152        /**
153         * Called each time we find a feed so that we can set the Ad method.
154         * 
155         * FIXME: This doesn't seem like the right place for this.  Can you
156         * document this more? It's cryptic.  Brad Neuberg, bkn3@columbia.edu.
157         * 
158         */
159        private static void onFeedReference( FeedReference ref, FeedList list ) {
160    
161            if ( list.getAdAtomFeed() == null &&
162                 FeedReference.ATOM_MEDIA_TYPE.equals( ref.type ) ) {
163    
164                list.setAdAtomFeed( ref );
165    
166            } else if ( list.getAdRSSFeed() == null &&
167                        FeedReference.RSS_MEDIA_TYPE.equals( ref.type ) ) {
168    
169                list.setAdRSSFeed( ref );
170    
171            }
172    
173            list.add( ref );
174            
175        }
176    
177        /** Does an HTTP HEAD to see if the given resource exists.
178         * 
179         *  @param resource The full URI to the resource to check for.
180         * 
181         * 
182         */
183        protected static boolean feedExists( String resource,
184                                             BlogService blogService) 
185            throws Exception {
186            
187            log.debug("feedExists, resource="+resource);
188            ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
189    
190            request.setRequestMethod( "HEAD" );
191            
192            // Some services need to follow redirects; others block if you do.
193            // Ask the blog service itself what to do.
194            request.setFollowRedirects( blogService.followRedirects() );
195            
196            // the call below actually causes the connection to be made
197            request.getContentLength();
198            
199            long response = request.getResponseCode();
200            log.debug("response="+response);
201    
202            return response == 200;
203        }
204        
205        
206    
207    }