LinkLocator xref

View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser.locate;
18  
19  import java.net.MalformedURLException;
20  import java.net.URL;
21  import java.util.HashSet;
22  import java.util.List;
23  
24  import org.apache.commons.feedparser.FeedList;
25  
26  /**
27   * Find links by parsing the raw HTML.  We only return links that are on the
28   * same site and link to /index.rdf LINKS and so forth.
29   *
30   * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
31   */
32  public class LinkLocator {
33  
34      /**
35       * 
36       *
37       * 
38       */
39      public static final List locate( String resource,
40                                       String content,
41                                       final FeedList list )
42          throws Exception {
43  
44          /**
45           * When we have been given feeds at a higher level (via <link rel> tags
46           * we should prefer these.
47           */
48          final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;
49          final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null;
50  
51          AnchorParserListener listener = new AnchorParserListener() {
52  
53                  String resource = null;
54                  
55                  String site = null;
56  
57                  HashSet seen = new HashSet();
58  
59                  boolean hasFoundRSSFeed = false;
60                  boolean hasFoundAtomFeed = false;
61                  
62                  public void setContext( Object context ) {
63  
64                      resource = (String)context;
65                      
66                      //pass in the resource of the blog
67                      site = getSite( resource );
68                      
69                  }
70  
71                  public Object getResult() {
72                      return list;
73                  }
74  
75                  public boolean onAnchor( String href, String rel, String title ) {
76                      String current = ResourceExpander.expand( resource, href );
77                      if ( current == null )
78                          return true; //obviously not
79  
80                      //FIXME: if it's at the same directory level we should prioritize it.
81                      //for example:
82                      //
83                      // http://peerfear.org/blog/
84                      //
85                      // http://peerfear.org/blog/index.rdf
86                      //
87                      // instead of 
88                      //
89                      // http://peerfear.org/index.rdf
90  
91                      //see if the link is on a different site
92                      if ( ! site.equals( getSite( current ) ) ) {
93                          return true;
94                      }
95  
96                      //Radio style feed.  Screw that.
97                      //FIXME: What happens if the Feed Parser is used to
98                      //aggregate feeds on the localhost? This will break that.
99                      //Brad Neuberg, bkn3@columbia.edu
100                     if ( current.startsWith( "http://127" ) ) 
101                         return true;
102 
103                     if ( seen.contains( current ) ) {
104                         return true;
105                     } 
106 
107                     seen.add( current );
108 
109                     //FIXME: we should assert tha that these feeds are from the SAME
110                     //domain not a link to another feed.
111 
112                     boolean isRSSLink = current.endsWith( ".rss" );
113 
114                     //support ROLLER RSS links and explicit link discovery by
115                     //non-extensions.
116                     if ( isRSSLink == false ) {
117 
118                         isRSSLink =
119                             title != null &&
120                             title.equalsIgnoreCase( "rss" ) &&
121                             href.indexOf( "rss" ) != -1;
122 
123                     } 
124 
125                     if ( isRSSLink ) {
126 
127                         //this is an RSS feed.
128                         FeedReference ref = new FeedReference( current,
129                                                                FeedReference.RSS_MEDIA_TYPE );
130 
131                         
132                         //make sure we haven't already discovered this feed
133                         //through a different process
134                         if (list.contains(ref))
135                             return true;
136 
137                         //Make sure to preserve existing AD feeds first.
138                         if ( ! hasExplicitRSSFeed )
139                             list.setAdRSSFeed( ref );
140 
141                         list.add( ref );
142 
143                         hasFoundRSSFeed = true;
144                         
145                     }
146 
147                     if ( current.endsWith( ".atom" ) ) {
148 
149                         FeedReference ref = new FeedReference( current,
150                                                                FeedReference.RSS_MEDIA_TYPE );
151 
152                         //make sure we haven't already discovered this feed
153                         //through a different process
154                         if (list.contains(ref))
155                             return true;
156                         
157                         //Make sure to preserve existing AD feeds first.
158                         if ( ! hasExplicitAtomFeed )
159                             list.setAdAtomFeed( ref );
160 
161                         list.add( ref );
162 
163                         hasFoundAtomFeed = true;
164 
165                     }
166 
167                     if ( current.endsWith( ".xml" ) ||
168                          current.endsWith( ".rdf" ) ) {
169 
170                         //NOTE that we do allow autodiscovery forfor index.xml
171                         //and index.rdf files but we don't prefer them since
172                         //these extensions are generic.  We would prefer to use
173                         //index.rss or even Atom (though people tend to use Atom
174                         //autodiscovery now).  This is important because if we
175                         //spit back an index.xml file thats NOT RSS or worse an
176                         //index.rdf file thats FOAF then we might break callers.
177 
178                         FeedReference ref = new FeedReference( current,
179                                                                FeedReference.RSS_MEDIA_TYPE );
180                         
181                         //make sure we haven't already discovered this feed
182                         //through a different process
183                         if (list.contains(ref))
184                             return true;
185 
186                         //see if we should RESORT to using this.
187 
188                         if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {
189 
190                             //NOTE: when we have found an existing RDF file use
191                             //that instead..  This is probably RSS 1.0 which is
192                             //much better than RSS 0.91
193 
194                             if ( list.getAdRSSFeed() == null ||
195                                  list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {
196 
197                                 list.setAdRSSFeed( ref );
198 
199                             }
200 
201                         }
202 
203                         //feed for this blog.
204                         list.add( ref );
205                         return true;
206                         
207                     } 
208 
209                     //for coderman's blog at http://www.peertech.org
210                     //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
211                     if ( current.endsWith( "/node/feed" ) )
212                         list.add( current );
213 
214                     return true;
215                     
216                 }
217 
218             };
219 
220         listener.setContext( resource );
221         AnchorParser.parseAnchors( content, listener );
222         
223         return list;
224         
225     }
226 
227     public static String getSite( String resource ) {
228 
229         try {
230 
231             String site = new URL( resource ).getHost();
232             return site.replaceAll( "http://www", "http://" );
233             
234         } catch ( MalformedURLException e ) {
235             return null;
236         }
237         
238     }
239 
240 }