/*
 * Copyright 1999,2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.feedparser.locate;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.List;

import org.apache.commons.feedparser.FeedList;

/**
 * Find feed links by parsing the anchors in raw HTML.  Only links that are on
 * the same site as the page being scanned are considered, and of those only
 * the ones that look like feeds (<code>.rss</code>, <code>.atom</code>,
 * <code>.xml</code>, <code>.rdf</code>, ...) are recorded.
 *
 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
 */
public class LinkLocator {

    /**
     * Scan the anchors found in <code>content</code> and record every anchor
     * that looks like a feed in <code>list</code>.
     *
     * @param resource URL of the page the content came from; relative hrefs
     *                 are expanded against it and it determines which host
     *                 counts as the "same site".
     * @param content  raw HTML to scan for anchors.
     * @param list     feed list to update; feeds that were already advertised
     *                 on it are preserved in preference to guessed ones.
     * @return the same <code>list</code> instance that was passed in.
     * @throws Exception declared by the original signature; failures raised
     *                   while parsing the anchors are propagated.
     */
    public static final List locate( String resource,
                                     String content,
                                     final FeedList list )
        throws Exception {

        // When we have been given feeds at a higher level (via <link rel>
        // tags) we should prefer these.
        final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;

        // This used to test getAdRSSFeed() a second time (copy/paste slip), so
        // an advertised RSS feed suppressed Atom discovery and an advertised
        // Atom feed could be overwritten.
        final boolean hasExplicitAtomFeed = list.getAdAtomFeed() != null;

        AnchorParserListener listener = new AnchorParserListener() {

            /** URL of the page being scanned; supplied via setContext. */
            String baseResource = null;

            /** Host of the page being scanned, as computed by getSite. */
            String site = null;

            /** Expanded URLs already handled, so each is processed once. */
            HashSet seen = new HashSet();

            boolean hasFoundRSSFeed = false;

            // Recorded for symmetry with hasFoundRSSFeed; nothing reads it yet.
            boolean hasFoundAtomFeed = false;

            public void setContext( Object context ) {

                baseResource = (String)context;

                //pass in the resource of the blog
                site = getSite( baseResource );

            }

            public Object getResult() {
                return list;
            }

            public boolean onAnchor( String href, String rel, String title ) {

                String current = ResourceExpander.expand( baseResource, href );

                if ( current == null )
                    return true; //obviously not

                //FIXME: if it's at the same directory level we should
                //prioritize it.  For example:
                //
                // http://peerfear.org/blog/
                //
                // http://peerfear.org/blog/index.rdf
                //
                // instead of
                //
                // http://peerfear.org/index.rdf

                //See if the link is on a different site.  When the site of the
                //page itself could not be determined (malformed resource) we
                //cannot prove the link is local, so skip it instead of failing
                //with a NullPointerException.
                if ( site == null || ! site.equals( getSite( current ) ) ) {
                    return true;
                }

                //Radio style feed.  Screw that.
                //FIXME: What happens if the Feed Parser is used to
                //aggregate feeds on the localhost?  This will break that.
                //Brad Neuberg, bkn3@columbia.edu
                if ( current.startsWith( "http://127" ) )
                    return true;

                if ( seen.contains( current ) ) {
                    return true;
                }

                seen.add( current );

                //FIXME: we should assert that these feeds are from the SAME
                //domain not a link to another feed.

                boolean isRSSLink = current.endsWith( ".rss" );

                //support ROLLER RSS links and explicit link discovery by
                //non-extensions.
                if ( isRSSLink == false ) {

                    isRSSLink =
                        title != null &&
                        title.equalsIgnoreCase( "rss" ) &&
                        href.indexOf( "rss" ) != -1;

                }

                if ( isRSSLink ) {

                    //this is an RSS feed.
                    FeedReference ref = new FeedReference( current,
                                                           FeedReference.RSS_MEDIA_TYPE );

                    //make sure we haven't already discovered this feed
                    //through a different process
                    if ( list.contains( ref ) )
                        return true;

                    //Make sure to preserve existing AD feeds first.
                    if ( ! hasExplicitRSSFeed )
                        list.setAdRSSFeed( ref );

                    list.add( ref );

                    hasFoundRSSFeed = true;

                }

                if ( current.endsWith( ".atom" ) ) {

                    //NOTE(review): the Atom reference is registered with
                    //RSS_MEDIA_TYPE.  An Atom-specific media type is probably
                    //intended if FeedReference defines one -- confirm before
                    //changing.
                    FeedReference ref = new FeedReference( current,
                                                           FeedReference.RSS_MEDIA_TYPE );

                    //make sure we haven't already discovered this feed
                    //through a different process
                    if ( list.contains( ref ) )
                        return true;

                    //Make sure to preserve existing AD feeds first.
                    if ( ! hasExplicitAtomFeed )
                        list.setAdAtomFeed( ref );

                    list.add( ref );

                    hasFoundAtomFeed = true;

                }

                if ( current.endsWith( ".xml" ) ||
                     current.endsWith( ".rdf" ) ) {

                    //NOTE that we do allow autodiscovery for index.xml and
                    //index.rdf files but we don't prefer them since these
                    //extensions are generic.  We would prefer to use
                    //index.rss or even Atom (though people tend to use Atom
                    //autodiscovery now).  This is important because if we
                    //spit back an index.xml file that's NOT RSS or worse an
                    //index.rdf file that's FOAF then we might break callers.

                    FeedReference ref = new FeedReference( current,
                                                           FeedReference.RSS_MEDIA_TYPE );

                    //make sure we haven't already discovered this feed
                    //through a different process
                    if ( list.contains( ref ) )
                        return true;

                    //see if we should RESORT to using this.

                    if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {

                        //NOTE: when we have found an existing RDF file use
                        //that instead.  This is probably RSS 1.0 which is
                        //much better than RSS 0.91

                        if ( list.getAdRSSFeed() == null ||
                             list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {

                            list.setAdRSSFeed( ref );

                        }

                    }

                    //feed for this blog.
                    list.add( ref );
                    return true;

                }

                //for coderman's blog at http://www.peertech.org
                //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
                if ( current.endsWith( "/node/feed" ) ) {

                    //Every other branch stores FeedReference objects; the raw
                    //String that used to be added here would break consumers
                    //that cast list entries.  The RSS media type is an
                    //assumption for this kind of link -- TODO confirm.
                    FeedReference ref = new FeedReference( current,
                                                           FeedReference.RSS_MEDIA_TYPE );

                    if ( ! list.contains( ref ) )
                        list.add( ref );

                }

                return true;

            }

        };

        listener.setContext( resource );
        AnchorParser.parseAnchors( content, listener );

        return list;

    }

    /**
     * Extract the host name of a URL so that two resources can be compared
     * for being on the same site.
     *
     * @param resource an absolute URL string.
     * @return the host part of <code>resource</code>, or <code>null</code>
     *         when <code>resource</code> cannot be parsed as a URL.
     */
    public static String getSite( String resource ) {

        try {

            String site = new URL( resource ).getHost();

            //NOTE(review): getHost() yields only the host name, so the
            //"http://www" pattern below does not appear to ever match and a
            //"www." prefix is NOT stripped.  Kept as-is so that same-site
            //matching behaves exactly as before -- confirm the intent.
            return site.replaceAll( "http://www", "http://" );

        } catch ( MalformedURLException e ) {
            //an unparseable resource has no site; callers must handle null.
            return null;
        }

    }

}