001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    package org.apache.commons.feedparser.locate;
017    
018    import java.util.HashMap;
019    
020    /**
021     *
022     * Given a string of HTML content, parse out anchors and fire events with all
023     * the data when they are found.
024     *
025     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
026     */
027    public class AnchorParser {
028    
029        public static void parse( String content,
030                                  AnchorParserListener listener )
031            throws AnchorParserException {
032    
033            //FIXME: we do NOT obey base right now and this is a BIG problem!
034            
035            parseAnchors( content, listener );
036            
037        }
038    
039        /**
040         * Get links from the given html with included titles and other metainfo.
041         *
042         * @deprecated use HTParser
043         * 
044         */
045        public static void parseAnchors( String content,
046                                         AnchorParserListener listener )
047            throws AnchorParserException {
048    
049            int index = 0;
050            int begin = 0;
051            int end = 0;
052    
053            //FIXME: what if there are HTML comments here?  We would parse links
054            //within comments which isn't what we want.
055    
056            // FIXME: how do we pass back the content of the href?
057            //
058            // <a href=''> this is the content </a>
059            //
060            // which would pass a string "this is the content"
061    
062            //Matcher m = pattern.matcher( content );
063    
064            while ( (begin = content.indexOf( "<a", index )) != -1 ) {
065    
066                index = begin;
067    
068                end = content.indexOf( "</a>", index );
069                if ( end == -1 )
070                    break;
071                index = end + 1;
072                
073                String match =  content.substring( begin, end );
074                
075                HashMap map = DiscoveryLocator.getAttributes( match );
076                //String resource = EntityDecoder.decode( m.group( 1 ) );
077    
078                //FIXME: we SHOULD be using this but its not working right now.
079                String resource = (String)map.get( "href" );
080                
081                if ( resource == null || resource.equals( "" ) ) {
082                    continue;
083                }
084    
085                String title = (String)map.get( "title" );
086    
087                if ( title != null )
088                    title = EntityDecoder.decode( title );
089                    
090                String rel = (String)map.get( "rel" );
091                
092                if ( ! listener.onAnchor( resource, rel, title ) )
093                    return;
094    
095            } 
096    
097        }
098    
099        public static void main( String[] args ) throws Exception {
100    
101            AnchorParserListener listener = new AnchorParserListener() {
102    
103                    public boolean onAnchor( String href, String rel, String title ) {
104    
105                        System.out.println( "href: " + href );
106                        System.out.println( "rel: " + rel );
107                        System.out.println( "title: " + title );
108                        return true;
109                    }
110    
111                    public Object getResult() {
112                        return null;
113                    }
114                    public void setContext( Object context ) {}
115                    
116                };
117    
118            //FIXME: won't work with single quotes
119            //FIXME: won't work with <a />
120            //parse( "<a href=\"http://peerfear.org\" rel=\"linux\" title=\"linux\" >adf</a>", listener );
121    
122            //parse( "<a rel=\"linux\" href=\"http://peerfear.org\" title=\"linux\" >adf</a>", listener );
123            //parse( "<a title=\"linux\" rel=\"linux\" href=\"http://peerfear.org\" >adf</a>", listener );
124    
125            //parse( "<a href='http://peerfear.org' rel='linux' title='linux' >adf</a>", listener );
126    
127            parse( "<a href='mailto:burton@rojo.com' rel='linux' title='linux' ><img src='' /></a>", listener );
128    
129        }
130    
131    }