1 /*
2 * Copyright 1999,2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.commons.feedparser.locate;
18
19 import java.net.MalformedURLException;
20 import java.net.URL;
21 import java.util.HashSet;
22 import java.util.List;
23
24 import org.apache.commons.feedparser.FeedList;
25
26 /**
27 * Find feed links by parsing the anchors in the raw HTML. We only return links
28 * that are on the same site and that look like feeds (/index.rdf and so forth).
29 *
30 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
31 */
32 public class LinkLocator {
33
34 /**
35 *
36 *
37 *
38 */
39 public static final List locate( String resource,
40 String content,
41 final FeedList list )
42 throws Exception {
43
44 /**
45 * When we have been given feeds at a higher level (via <link rel> tags
46 * we should prefer these.
47 */
48 final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;
49 final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null;
50
51 AnchorParserListener listener = new AnchorParserListener() {
52
53 String resource = null;
54
55 String site = null;
56
57 HashSet seen = new HashSet();
58
59 boolean hasFoundRSSFeed = false;
60 boolean hasFoundAtomFeed = false;
61
62 public void setContext( Object context ) {
63
64 resource = (String)context;
65
66 //pass in the resource of the blog
67 site = getSite( resource );
68
69 }
70
71 public Object getResult() {
72 return list;
73 }
74
75 public boolean onAnchor( String href, String rel, String title ) {
76 String current = ResourceExpander.expand( resource, href );
77 if ( current == null )
78 return true; //obviously not
79
80 //FIXME: if it's at the same directory level we should prioritize it.
81 //for example:
82 //
83 // http://peerfear.org/blog/
84 //
85 // http://peerfear.org/blog/index.rdf
86 //
87 // instead of
88 //
89 // http://peerfear.org/index.rdf
90
91 //see if the link is on a different site
92 if ( ! site.equals( getSite( current ) ) ) {
93 return true;
94 }
95
96 //Radio style feed. Screw that.
97 //FIXME: What happens if the Feed Parser is used to
98 //aggregate feeds on the localhost? This will break that.
99 //Brad Neuberg, bkn3@columbia.edu
100 if ( current.startsWith( "http://127" ) )
101 return true;
102
103 if ( seen.contains( current ) ) {
104 return true;
105 }
106
107 seen.add( current );
108
109 //FIXME: we should assert tha that these feeds are from the SAME
110 //domain not a link to another feed.
111
112 boolean isRSSLink = current.endsWith( ".rss" );
113
114 //support ROLLER RSS links and explicit link discovery by
115 //non-extensions.
116 if ( isRSSLink == false ) {
117
118 isRSSLink =
119 title != null &&
120 title.equalsIgnoreCase( "rss" ) &&
121 href.indexOf( "rss" ) != -1;
122
123 }
124
125 if ( isRSSLink ) {
126
127 //this is an RSS feed.
128 FeedReference ref = new FeedReference( current,
129 FeedReference.RSS_MEDIA_TYPE );
130
131
132 //make sure we haven't already discovered this feed
133 //through a different process
134 if (list.contains(ref))
135 return true;
136
137 //Make sure to preserve existing AD feeds first.
138 if ( ! hasExplicitRSSFeed )
139 list.setAdRSSFeed( ref );
140
141 list.add( ref );
142
143 hasFoundRSSFeed = true;
144
145 }
146
147 if ( current.endsWith( ".atom" ) ) {
148
149 FeedReference ref = new FeedReference( current,
150 FeedReference.RSS_MEDIA_TYPE );
151
152 //make sure we haven't already discovered this feed
153 //through a different process
154 if (list.contains(ref))
155 return true;
156
157 //Make sure to preserve existing AD feeds first.
158 if ( ! hasExplicitAtomFeed )
159 list.setAdAtomFeed( ref );
160
161 list.add( ref );
162
163 hasFoundAtomFeed = true;
164
165 }
166
167 if ( current.endsWith( ".xml" ) ||
168 current.endsWith( ".rdf" ) ) {
169
170 //NOTE that we do allow autodiscovery forfor index.xml
171 //and index.rdf files but we don't prefer them since
172 //these extensions are generic. We would prefer to use
173 //index.rss or even Atom (though people tend to use Atom
174 //autodiscovery now). This is important because if we
175 //spit back an index.xml file thats NOT RSS or worse an
176 //index.rdf file thats FOAF then we might break callers.
177
178 FeedReference ref = new FeedReference( current,
179 FeedReference.RSS_MEDIA_TYPE );
180
181 //make sure we haven't already discovered this feed
182 //through a different process
183 if (list.contains(ref))
184 return true;
185
186 //see if we should RESORT to using this.
187
188 if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {
189
190 //NOTE: when we have found an existing RDF file use
191 //that instead.. This is probably RSS 1.0 which is
192 //much better than RSS 0.91
193
194 if ( list.getAdRSSFeed() == null ||
195 list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {
196
197 list.setAdRSSFeed( ref );
198
199 }
200
201 }
202
203 //feed for this blog.
204 list.add( ref );
205 return true;
206
207 }
208
209 //for coderman's blog at http://www.peertech.org
210 //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
211 if ( current.endsWith( "/node/feed" ) )
212 list.add( current );
213
214 return true;
215
216 }
217
218 };
219
220 listener.setContext( resource );
221 AnchorParser.parseAnchors( content, listener );
222
223 return list;
224
225 }
226
227 public static String getSite( String resource ) {
228
229 try {
230
231 String site = new URL( resource ).getHost();
232 return site.replaceAll( "http://www", "http://" );
233
234 } catch ( MalformedURLException e ) {
235 return null;
236 }
237
238 }
239
240 }