/*
 * Copyright 1999,2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.feedparser.locate;

import java.util.Iterator;

import org.apache.commons.feedparser.FeedList;
import org.apache.commons.feedparser.network.ResourceRequest;
import org.apache.commons.feedparser.network.ResourceRequestFactory;
import org.apache.log4j.Logger;

/**
 * Determines feed URLs from a given resource URI.  For example, you would
 * pass in the URI:
 *
 * http://www.codinginparadise.org
 *
 * and this class would pass back a list with one address of the feed URL,
 * which is
 *
 * http://www.codinginparadise.org/weblog/atom.xml
 *
 * <pre>
 * String resource = "http://www.codinginparadise.org";
 * FeedList l = FeedLocator.locate( resource );
 * </pre>
 *
 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
 */
public class FeedLocator {

    /** Class-wide logger; never reassigned, hence final. */
    private static final Logger log = Logger.getLogger( FeedLocator.class );

    /**
     * Locate all feeds within the given resource.  The resource should be a
     * link to an (X)HTML document, usually a weblog or a website.
     *
     * Example: http://peerfear.org
     *
     * The content of the resource is fetched through a
     * {@link ResourceRequest} and then handed to
     * {@link #locate(String, String)}.
     *
     * @param resource The weblog we need to discover
     * @return the list produced by {@link #locate(String, String)}
     * @throws Exception propagated from the resource request or from
     *         {@link #locate(String, String)}
     */
    public static final FeedList locate( String resource ) throws Exception {

        // TODO: Use my network library when it's migrated into Apache.

        //fetch content
        ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
        String content = request.getInputStreamAsString();

        //return resources
        return locate( resource, content );

    }

    /**
     * Locate the feeds for the given resource using content that has already
     * been fetched.
     *
     * A new {@link FeedList} is created and passed, in order, to
     * DiscoveryLocator, LinkLocator and ProbeLocator.  All three are invoked
     * unconditionally from here and operate on the same list instance.
     *
     * @param resource the URI the content was obtained from
     * @param content the body of the resource
     * @return the list handed to the three locators
     * @throws Exception propagated from any of the locators
     */
    public static final FeedList locate( String resource, String content ) throws Exception {

        log.info( "Locating " + resource + "..." );

        FeedList list = new FeedList();

        //FIXME: if we were GIVEN an RSS/Atom/OPML/etc file then we should just
        //attempt to use this and return a FeedList with just one entry.  Parse
        //it first I think to make sure its valid XML and then move forward.
        //The downside here is that it would be wasted CPU if its HTML content.

        log.debug( "Using DiscoveryLocator..." );
        DiscoveryLocator.locate( resource, content, list );
        log.debug( "after discoverylocator, list=" + list );

        //Next, look for links.
        //NOTE(review): the original comments described the following two steps
        //as fallbacks ("this failed... try"), but they are called
        //unconditionally here; whether a locator skips work when the list is
        //already populated is decided inside that locator -- confirm.
        log.debug( "Using LinkLocator..." );
        LinkLocator.locate( resource, content, list );
        log.debug( "after linklocator, list=" + list );

        //Finally, try probe location.  This is more reliable than
        //LinkLocation but requires a few more HTTP gets.
        log.debug( "Using ProbeLocator..." );
        ProbeLocator.locate( resource, content, list );
        log.debug( "after probelocator, list=" + list );

        log.info( "After locating, list=" + list );

        return list;

    }

    /**
     * Manual test driver: enables blog-service and aggressive probing on
     * ProbeLocator, locates the feeds of a hard-coded resource and prints the
     * results to standard output.
     *
     * @param args ignored
     * @throws Exception propagated from {@link #locate(String)}
     */
    public static void main( String[] args ) throws Exception {

        //This should find http://www.electoral-vote.com/index.rss
        //String resource = "http://brendonwilson.com/";

        //String resource = "file:///projects/feedparser/tests/locate4.html";
        //String resource = "file:///projects/feedparser/tests/locate5.html";
        //String resource = "file:///projects/feedparser/tests/locate6.html";

        //FIXME: add UNIT TESTS for Yahoo Groups and Flickr

        String resource = "http://craigslist.org/w4m/";

        //String resource = "http://groups.yahoo.com/group/aggregators/";

        //String resource = "http://flickr.com/photos/tags/cats";

        //String resource = "file:///projects/feedparser/tests/locate8.html";

        //String resource = "http://blogs.sun.com/roller/page/gonzo";

        //String resource = "http://gonze.com/weblog/";

        //String resource = "http://codinginparadise.org/";

        //String resource = "http://bucsfishingreport.com/pMachine/weblog.php";

        //String resource = "http://www.livejournal.com/community/indiexiankids/";
        //String resource = "http://www.thealarmclock.com/mt/";

        //String resource = "http://guinness.joeuser.com";

        //String resource = "http://georgewbush.com/blog";

        //String resource = "http://carolinascl.blogspot.com/";

        //String resource = "http://www.corante.com/strange/";
        //String resource = "http://peerfear.org";

        ProbeLocator.BLOG_SERVICE_PROBING_ENABLED = true;
        ProbeLocator.AGGRESIVE_PROBING_ENABLED = true;

        FeedList l = locate( resource );

        Iterator it = l.iterator();

        if ( !it.hasNext() ) {
            System.out.println( "NO LINKS FOUND" );
        }

        System.out.println( "AD RSS: " + l.getAdRSSFeed() );
        System.out.println( "AD Atom: " + l.getAdAtomFeed() );

        while ( it.hasNext() ) {

            FeedReference ref = (FeedReference)it.next();

            System.out.println( ref.resource );

        }

    }

}