/*
 * Copyright 1999,2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.feedparser.locate;

import java.util.HashMap;

/**
 *
 * Given a string of HTML content, parse out anchors and fire events with all
 * the data when they are found.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public class AnchorParser {

    /** Literal text that starts an opening anchor tag (matched case-sensitively). */
    private static final String ANCHOR_OPEN = "<a";

    /** Literal text of a closing anchor tag (matched case-sensitively). */
    private static final String ANCHOR_CLOSE = "</a>";

    /**
     * Scan the given HTML for anchors and report each one to the listener.
     * Currently this simply delegates to
     * {@link #parseAnchors(String, AnchorParserListener)}.
     *
     * @param content HTML text to scan; null is treated as containing no
     * anchors.
     * @param listener receives one onAnchor call per anchor that has a
     * non-empty href.
     * @throws AnchorParserException declared for callers; nothing in this
     * class throws it directly.
     */
    public static void parse( String content,
                              AnchorParserListener listener )
        throws AnchorParserException {

        //FIXME: we do NOT obey base right now and this is a BIG problem!

        parseAnchors( content, listener );

    }

    /**
     * Get links from the given html with included titles and other metainfo.
     *
     * For every opening anchor tag that is followed by a closing anchor tag,
     * the attributes of the opening tag are extracted with
     * DiscoveryLocator.getAttributes and the "href", "rel" and "title" values
     * are handed to the listener.  Anchors whose href is missing or empty are
     * skipped.  A non-null title is passed through EntityDecoder.decode first;
     * href and rel are passed on exactly as returned by getAttributes.
     * Scanning stops as soon as the listener returns false, or when an opening
     * tag has no closing tag after it.
     *
     * @param content HTML text to scan; null is treated as containing no
     * anchors.
     * @param listener receives one onAnchor call per usable anchor.
     * @throws AnchorParserException declared for callers; nothing in this
     * method throws it directly.
     *
     * @deprecated use HTParser
     *
     */
    public static void parseAnchors( String content,
                                     AnchorParserListener listener )
        throws AnchorParserException {

        if ( content == null ) {
            //nothing to scan; previously this was a NullPointerException.
            return;
        }

        //FIXME: what if there are HTML comments here?  We would parse links
        //within comments which isn't what we want.

        // FIXME: how do we pass back the content of the href?
        //
        // <a href=''> this is the content </a>
        //
        // which would pass a string "this is the content"

        int index = 0;
        int begin;

        while ( (begin = content.indexOf( ANCHOR_OPEN, index )) != -1 ) {

            if ( ! isAnchorStart( content, begin ) ) {
                //Some other element whose name merely starts with 'a'
                //(abbr, address, area, ...).  Skip past the prefix so it is
                //not paired with a later closing anchor tag.
                index = begin + ANCHOR_OPEN.length();
                continue;
            }

            int end = content.indexOf( ANCHOR_CLOSE, begin );
            if ( end == -1 ) {
                break;
            }

            //resume the next search just inside the closing tag we consumed.
            index = end + 1;

            //Only the opening tag is inspected so that attributes of markup
            //nested inside the anchor (e.g. an img title) are not mistaken
            //for attributes of the anchor itself.
            String openTag =
                content.substring( begin, findOpenTagEnd( content, begin, end ) );

            HashMap map = DiscoveryLocator.getAttributes( openTag );

            //FIXME: we SHOULD be entity decoding the href but its not working
            //right now.
            String resource = (String)map.get( "href" );

            if ( resource == null || resource.equals( "" ) ) {
                continue;
            }

            String title = (String)map.get( "title" );

            if ( title != null ) {
                title = EntityDecoder.decode( title );
            }

            String rel = (String)map.get( "rel" );

            if ( ! listener.onAnchor( resource, rel, title ) ) {
                return;
            }

        }

    }

    /**
     * Tell whether the anchor prefix found at the given offset is really the
     * start of an anchor tag, i.e. the tag name ends right after the 'a'.
     *
     * @param content text being scanned.
     * @param begin offset at which the anchor prefix was found.
     * @return true when the character following the prefix is whitespace,
     * '>' or '/'; false otherwise, including when the prefix is the last
     * thing in the content.
     */
    private static boolean isAnchorStart( String content, int begin ) {

        int next = begin + ANCHOR_OPEN.length();

        if ( next >= content.length() ) {
            return false;
        }

        char c = content.charAt( next );

        return Character.isWhitespace( c ) || c == '>' || c == '/';

    }

    /**
     * Find where the opening tag that starts at begin ends.  A '>' inside a
     * single or double quoted attribute value does not terminate the tag.
     *
     * @param content text being scanned.
     * @param begin offset of the '<' of the opening tag.
     * @param limit exclusive upper bound of the search (the offset of the
     * closing anchor tag).
     * @return the offset just past the terminating '>', or limit when no
     * terminator is found before it, in which case the caller ends up with
     * the whole span up to the closing tag.
     */
    private static int findOpenTagEnd( String content, int begin, int limit ) {

        //0 while outside a quoted value, otherwise the quote character that
        //opened the value we are currently inside.
        char quote = 0;

        for ( int i = begin + ANCHOR_OPEN.length(); i < limit; ++i ) {

            char c = content.charAt( i );

            if ( quote != 0 ) {
                if ( c == quote ) {
                    quote = 0;
                }
            } else if ( c == '"' || c == '\'' ) {
                quote = c;
            } else if ( c == '>' ) {
                return i + 1;
            }

        }

        return limit;

    }

    /**
     * Manual smoke test: parses one hard-coded anchor and prints the href,
     * rel and title it reports to standard output.
     *
     * @param args ignored.
     * @throws Exception whatever parse throws.
     */
    public static void main( String[] args ) throws Exception {

        AnchorParserListener listener = new AnchorParserListener() {

                public boolean onAnchor( String href, String rel, String title ) {

                    System.out.println( "href: " + href );
                    System.out.println( "rel: " + rel );
                    System.out.println( "title: " + title );
                    return true;
                }

                public Object getResult() {
                    return null;
                }
                public void setContext( Object context ) {}

            };

        //FIXME: won't work with single quotes
        //FIXME: won't work with <a />
        //parse( "<a href=\"http://peerfear.org\" rel=\"linux\" title=\"linux\" >adf</a>", listener );

        //parse( "<a rel=\"linux\" href=\"http://peerfear.org\" title=\"linux\" >adf</a>", listener );
        //parse( "<a title=\"linux\" rel=\"linux\" href=\"http://peerfear.org\" >adf</a>", listener );

        //parse( "<a href='http://peerfear.org' rel='linux' title='linux' >adf</a>", listener );

        parse( "<a href='mailto:burton@rojo.com' rel='linux' title='linux' ><img src='' /></a>", listener );

    }

}