001 /*
002 * Copyright 1999,2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package org.apache.commons.feedparser.locate;
018
019 import java.util.Iterator;
020
021 import org.apache.commons.feedparser.FeedList;
022 import org.apache.commons.feedparser.network.ResourceRequest;
023 import org.apache.commons.feedparser.network.ResourceRequestFactory;
024 import org.apache.log4j.Logger;
025
026 /**
027 * Method to determine feed URLs from a given resource URI. For example,
028 * you would pass in the URI:
029 *
030 * http://www.codinginparadise.org
031 *
032 * and this class would pass back a List with one address of the feed URL,
033 * which is
034 *
035 * http://www.codinginparadise.org/weblog/atom.xml"
036 *
037 * <code>
038 * String resource = "http://www.codinginparadise.org";
039 * FeedList l = FeedLocator.locate( resource );
040 * </code>
041 *
042 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
043 */
044 public class FeedLocator {
045
046 private static Logger log = Logger.getLogger( FeedLocator.class );
047
048 /**
049 * Locate all feeds within the given resource. The resource should be a link
050 * to an (X)HTML document, usually a weblog or a website.
051 *
052 * Example: http://peerfear.org
053 *
054 * @param resource The weblog we need to discover
055 *
056 */
057 public static final FeedList locate( String resource ) throws Exception {
058 // \: Use my network library when it's migrated into Apache.
059
060 //fetch content
061 ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
062
063 String content = request.getInputStreamAsString();
064
065 //return resources
066 return locate( resource, content );
067
068 }
069
070 /**
071 * Locate the feed with the given content.
072 *
073 *
074 */
075 public static final FeedList locate( String resource, String content ) throws Exception {
076
077 log.info( "Locating " + resource + "..." );
078
079 FeedList list = new FeedList();
080
081 //FIXME: if we were GIVEN an RSS/Atom/OPML/etc file then we should just
082 //attempt to use this and return a FeedList with just one entry. Parse
083 //it first I think to make sure its valid XML and then move forward.
084 //The downside here is that it would be wasted CPU if its HTML content.
085
086 log.debug( "Using DiscoveryLocator..." );
087 DiscoveryLocator.locate( resource, content, list );
088 log.debug("after discoverylocator, list="+list);
089
090 log.debug( "Using LinkLocator..." );
091 //this failed... try looking for links
092 LinkLocator.locate( resource, content, list );
093 log.debug("after linklocator, list="+list);
094
095 //this failed... try probe location. This is more reliable than
096 //LinkLocation but requires a few more HTTP gets.
097 log.debug( "Using ProbeLocator..." );
098 ProbeLocator.locate( resource, content, list );
099 log.debug("after probelocator, list="+list);
100
101 log.info( "After locating, list="+list );
102
103 return list;
104
105 }
106
107 public static void main( String[] args ) throws Exception {
108
109 //This should find http://www.electoral-vote.com/index.rss
110 //String resource = "http://brendonwilson.com/";
111
112 //String resource = "file:///projects/feedparser/tests/locate4.html";
113 //String resource = "file:///projects/feedparser/tests/locate5.html";
114 //String resource = "file:///projects/feedparser/tests/locate6.html";
115
116 //FIXME: add UNIT TESTS for Yahoo Groups and Flickr
117
118 String resource = "http://craigslist.org/w4m/";
119
120 //String resource = "http://groups.yahoo.com/group/aggregators/";
121
122 //String resource = "http://flickr.com/photos/tags/cats";
123
124 //String resource = "file:///projects/feedparser/tests/locate8.html";
125
126 //String resource = "http://blogs.sun.com/roller/page/gonzo";
127
128 //String resource = "http://gonze.com/weblog/";
129
130 //String resource = "http://codinginparadise.org/";
131
132 // String resource = "http://bucsfishingreport.com/pMachine/weblog.php";
133
134 //String resource = "http://www.livejournal.com/community/indiexiankids/";
135 //String resource= "http://www.thealarmclock.com/mt/";
136
137 //String resource = "http://guinness.joeuser.com";
138
139 //String resource = "http://georgewbush.com/blog";
140
141 //String resource = "http://carolinascl.blogspot.com/";
142
143 //String resource = "http://www.corante.com/strange/";
144 //String resource = "http://peerfear.org";
145
146 ProbeLocator.BLOG_SERVICE_PROBING_ENABLED = true;
147 ProbeLocator.AGGRESIVE_PROBING_ENABLED = true;
148
149 FeedList l = locate( resource );
150
151 Iterator it = l.iterator();
152
153 if ( it.hasNext() == false ) {
154 System.out.println( "NO LINKS FOUND" );
155 }
156
157 System.out.println( "AD RSS: " + l.getAdRSSFeed() );
158 System.out.println( "AD Atom: " + l.getAdAtomFeed() );
159
160 while ( it.hasNext() ) {
161
162 FeedReference ref = (FeedReference)it.next();
163
164 System.out.println( ref.resource );
165
166 }
167
168 }
169
170 }