001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate;
018    
019    import java.net.MalformedURLException;
020    import java.net.URL;
021    import java.util.HashSet;
022    import java.util.List;
023    
024    import org.apache.commons.feedparser.FeedList;
025    
026    /**
027     * Find links by parsing the raw HTML.  We only return links that are on the
028     * same site and link to /index.rdf LINKS and so forth.
029     *
030     * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
031     */
032    public class LinkLocator {
033    
034        /**
035         * 
036         *
037         * 
038         */
039        public static final List locate( String resource,
040                                         String content,
041                                         final FeedList list )
042            throws Exception {
043    
044            /**
045             * When we have been given feeds at a higher level (via <link rel> tags
046             * we should prefer these.
047             */
048            final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;
049            final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null;
050    
051            AnchorParserListener listener = new AnchorParserListener() {
052    
053                    String resource = null;
054                    
055                    String site = null;
056    
057                    HashSet seen = new HashSet();
058    
059                    boolean hasFoundRSSFeed = false;
060                    boolean hasFoundAtomFeed = false;
061                    
062                    public void setContext( Object context ) {
063    
064                        resource = (String)context;
065                        
066                        //pass in the resource of the blog
067                        site = getSite( resource );
068                        
069                    }
070    
071                    public Object getResult() {
072                        return list;
073                    }
074    
075                    public boolean onAnchor( String href, String rel, String title ) {
076                        String current = ResourceExpander.expand( resource, href );
077                        if ( current == null )
078                            return true; //obviously not
079    
080                        //FIXME: if it's at the same directory level we should prioritize it.
081                        //for example:
082                        //
083                        // http://peerfear.org/blog/
084                        //
085                        // http://peerfear.org/blog/index.rdf
086                        //
087                        // instead of 
088                        //
089                        // http://peerfear.org/index.rdf
090    
091                        //see if the link is on a different site
092                        if ( ! site.equals( getSite( current ) ) ) {
093                            return true;
094                        }
095    
096                        //Radio style feed.  Screw that.
097                        //FIXME: What happens if the Feed Parser is used to
098                        //aggregate feeds on the localhost? This will break that.
099                        //Brad Neuberg, bkn3@columbia.edu
100                        if ( current.startsWith( "http://127" ) ) 
101                            return true;
102    
103                        if ( seen.contains( current ) ) {
104                            return true;
105                        } 
106    
107                        seen.add( current );
108    
109                        //FIXME: we should assert tha that these feeds are from the SAME
110                        //domain not a link to another feed.
111    
112                        boolean isRSSLink = current.endsWith( ".rss" );
113    
114                        //support ROLLER RSS links and explicit link discovery by
115                        //non-extensions.
116                        if ( isRSSLink == false ) {
117    
118                            isRSSLink =
119                                title != null &&
120                                title.equalsIgnoreCase( "rss" ) &&
121                                href.indexOf( "rss" ) != -1;
122    
123                        } 
124    
125                        if ( isRSSLink ) {
126    
127                            //this is an RSS feed.
128                            FeedReference ref = new FeedReference( current,
129                                                                   FeedReference.RSS_MEDIA_TYPE );
130    
131                            
132                            //make sure we haven't already discovered this feed
133                            //through a different process
134                            if (list.contains(ref))
135                                return true;
136    
137                            //Make sure to preserve existing AD feeds first.
138                            if ( ! hasExplicitRSSFeed )
139                                list.setAdRSSFeed( ref );
140    
141                            list.add( ref );
142    
143                            hasFoundRSSFeed = true;
144                            
145                        }
146    
147                        if ( current.endsWith( ".atom" ) ) {
148    
149                            FeedReference ref = new FeedReference( current,
150                                                                   FeedReference.RSS_MEDIA_TYPE );
151    
152                            //make sure we haven't already discovered this feed
153                            //through a different process
154                            if (list.contains(ref))
155                                return true;
156                            
157                            //Make sure to preserve existing AD feeds first.
158                            if ( ! hasExplicitAtomFeed )
159                                list.setAdAtomFeed( ref );
160    
161                            list.add( ref );
162    
163                            hasFoundAtomFeed = true;
164    
165                        }
166    
167                        if ( current.endsWith( ".xml" ) ||
168                             current.endsWith( ".rdf" ) ) {
169    
170                            //NOTE that we do allow autodiscovery forfor index.xml
171                            //and index.rdf files but we don't prefer them since
172                            //these extensions are generic.  We would prefer to use
173                            //index.rss or even Atom (though people tend to use Atom
174                            //autodiscovery now).  This is important because if we
175                            //spit back an index.xml file thats NOT RSS or worse an
176                            //index.rdf file thats FOAF then we might break callers.
177    
178                            FeedReference ref = new FeedReference( current,
179                                                                   FeedReference.RSS_MEDIA_TYPE );
180                            
181                            //make sure we haven't already discovered this feed
182                            //through a different process
183                            if (list.contains(ref))
184                                return true;
185    
186                            //see if we should RESORT to using this.
187    
188                            if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {
189    
190                                //NOTE: when we have found an existing RDF file use
191                                //that instead..  This is probably RSS 1.0 which is
192                                //much better than RSS 0.91
193    
194                                if ( list.getAdRSSFeed() == null ||
195                                     list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {
196    
197                                    list.setAdRSSFeed( ref );
198    
199                                }
200    
201                            }
202    
203                            //feed for this blog.
204                            list.add( ref );
205                            return true;
206                            
207                        } 
208    
209                        //for coderman's blog at http://www.peertech.org
210                        //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
211                        if ( current.endsWith( "/node/feed" ) )
212                            list.add( current );
213    
214                        return true;
215                        
216                    }
217    
218                };
219    
220            listener.setContext( resource );
221            AnchorParser.parseAnchors( content, listener );
222            
223            return list;
224            
225        }
226    
227        public static String getSite( String resource ) {
228    
229            try {
230    
231                String site = new URL( resource ).getHost();
232                return site.replaceAll( "http://www", "http://" );
233                
234            } catch ( MalformedURLException e ) {
235                return null;
236            }
237            
238        }
239    
240    }