001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.locate; 018 019 import java.util.HashMap; 020 import java.util.HashSet; 021 import java.util.List; 022 import java.util.regex.Matcher; 023 import java.util.regex.Pattern; 024 025 import org.apache.commons.feedparser.FeedList; 026 import org.apache.log4j.Logger; 027 028 /** 029 * 030 * http://www.ietf.org/internet-drafts/draft-ietf-atompub-autodiscovery-00.txt 031 * 032 * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a> 033 */ 034 public class DiscoveryLocator { 035 036 private static Logger log = Logger.getLogger( DiscoveryLocator.class ); 037 038 /** 039 * Get a FULL link within the content. We then pull the attributes out of 040 * this. 041 */ 042 static Pattern element_pattern = 043 Pattern.compile( "<link[^>]+", 044 Pattern.CASE_INSENSITIVE ); 045 046 /** 047 * Regex to match on attributes. 048 * 049 * Implementation: Mon Mar 14 2005 01:59 PM (burton@rojo.com): this is a 050 * pretty difficult regexp to grok. 051 * 052 * There's are two regexps here. One for attributes with quotes and one 053 * without. Each regexp has two groups - 1 is the name and 2 is the value. 054 * You can split the regexp on | to better understand each individual 055 * regexp. 056 */ 057 058 // > Attribute values MUST be one of the following: enclosed in double 059 // > quotes, enclosed in single quotes, or not enclosed in quotes at all. 060 // 061 // 062 static String ATTR_REGEXP = "([a-zA-Z]+)=[\"']([^\"']+)[\"']|([a-zA-Z]+)=([^\"'>\r\n\t ]+)"; 063 064 static Pattern ATTR_PATTERN = Pattern.compile( ATTR_REGEXP, 065 Pattern.CASE_INSENSITIVE ); 066 067 static HashSet mediatypes = new HashSet(); 068 069 static { 070 071 mediatypes.add( FeedReference.ATOM_MEDIA_TYPE ); 072 mediatypes.add( FeedReference.RSS_MEDIA_TYPE ); 073 mediatypes.add( FeedReference.XML_MEDIA_TYPE ); 074 075 } 076 077 /** 078 * Locate a feed via RSS/Atom auto-discovery. If both Atom and RSS are 079 * listed we return both. Actually we return all Atom/RSS or XML feeds 080 * including FOAF. It's up to the caller to use the correct feed. 081 * 082 * 083 */ 084 public static final List locate( String resource, 085 String content, 086 FeedList list ) 087 throws Exception { 088 089 //this mechanism is easier but it isn't efficient. I should just parse 090 //elements forward until I discover </head>. Also note that this isn't 091 //doing all feed URLs just the first ones it finds. 092 093 Matcher m = element_pattern.matcher( content ); 094 095 while( m.find() ) { 096 //the value of the link element XML... example: 097 098 // <link rel="alternate" 099 // href="http://www.codinginparadise.org/weblog/atom.xml" 100 // type="application/atom+xml" 101 // title="ATOM" /> 102 103 String element = m.group( 0 ); 104 105 HashMap attributes = getAttributes( element ); 106 107 String type = (String)attributes.get( "type" ); 108 if (type != null) 109 type = type.toLowerCase(); 110 111 if ( mediatypes.contains( type ) ) { 112 113 //expand the href 114 String href = (String)attributes.get( "href" ); 115 log.debug("href="+href); 116 117 // http://xml.coverpages.org/draft-ietf-atompub-autodiscovery-00.txt 118 119 // > The href attribute MUST be present in an Atom autodiscovery element, 120 // > and its value MUST be the URI [RFC2396] of an Atom feed. The value 121 // > MAY be a relative URI, and if so, clients MUST resolve it to a full 122 // > URI (section 5 of [RFC2396]) using the document's base URI (section 123 // > 12.4 of HTML 4 [W3C.REC-html401-19991224]). 124 125 href = ResourceExpander.expand( resource, href ); 126 127 FeedReference feedReference = new FeedReference( href, type ); 128 129 feedReference.title = (String)attributes.get( "title" ); 130 131 list.add( feedReference ); 132 133 if ( type.equals( FeedReference.ATOM_MEDIA_TYPE ) ) 134 list.setFirstAdAtomFeed( feedReference ); 135 136 if ( type.equals( FeedReference.RSS_MEDIA_TYPE ) ) 137 list.setFirstAdRSSFeed( feedReference ); 138 139 } 140 141 } 142 143 return list; 144 145 } 146 147 /** 148 * Parse attributes within elements into a hashmap. 149 * 150 * 151 */ 152 public static HashMap getAttributes( String content ) { 153 154 HashMap map = new HashMap(); 155 156 Matcher m = ATTR_PATTERN.matcher( content ); 157 158 int index = 0; 159 160 while ( m.find( index ) ) { 161 162 String name = m.group( 1 ); 163 String value = null; 164 165 //Since we use an OR regexp the first match will be 1/2 and the 166 //second will be 3/4 167 if ( name != null ) { 168 value = m.group( 2 ); 169 } else { 170 name = m.group( 3 ); 171 value = m.group( 4 ); 172 } 173 174 //String value = m.group( 2 ).toLowerCase().trim(); 175 name = name.toLowerCase().trim(); 176 // Some services, such as AOL LiveJournal, are case sensitive 177 // on their resource names; can't do a toLowerCase. 178 // Brad Neuberg, bkn3@columbia.edu 179 // String value = m.group( 2 ).toLowerCase().trim(); 180 value = value.trim(); 181 182 if ( "".equals( value ) ) 183 value = null; 184 185 map.put( name, value ); 186 187 index = m.end(); 188 189 } 190 191 return map; 192 193 } 194 195 }