001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate;
018    
019    import java.util.HashMap;
020    import java.util.HashSet;
021    import java.util.List;
022    import java.util.regex.Matcher;
023    import java.util.regex.Pattern;
024    
025    import org.apache.commons.feedparser.FeedList;
026    import org.apache.log4j.Logger;
027    
028    /**
029     *
030     * http://www.ietf.org/internet-drafts/draft-ietf-atompub-autodiscovery-00.txt
031     * 
032     * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
033     */
034    public class DiscoveryLocator {
035        
036        private static Logger log = Logger.getLogger( DiscoveryLocator.class );
037    
038        /**
039         * Get a FULL link within the content. We then pull the attributes out of
040         * this.
041         */
042        static Pattern element_pattern =
043            Pattern.compile( "<link[^>]+",
044                             Pattern.CASE_INSENSITIVE );
045    
046        /**
047         * Regex to match on attributes.
048         * 
049         * Implementation: Mon Mar 14 2005 01:59 PM (burton@rojo.com): this is a
050         * pretty difficult regexp to grok.
051         * 
052         * There's are two regexps here.  One for attributes with quotes and one
053         * without. Each regexp has two groups - 1 is the name and 2 is the value.
054         * You can split the regexp on | to better understand each individual
055         * regexp.
056         */
057    
058        // > Attribute values MUST be one of the following: enclosed in double
059        // > quotes, enclosed in single quotes, or not enclosed in quotes at all.
060        //
061        // 
062        static String ATTR_REGEXP = "([a-zA-Z]+)=[\"']([^\"']+)[\"']|([a-zA-Z]+)=([^\"'>\r\n\t ]+)";
063        
064        static Pattern ATTR_PATTERN = Pattern.compile( ATTR_REGEXP,
065                                                       Pattern.CASE_INSENSITIVE );
066    
067        static HashSet mediatypes = new HashSet();
068    
069        static {
070    
071            mediatypes.add( FeedReference.ATOM_MEDIA_TYPE );
072            mediatypes.add( FeedReference.RSS_MEDIA_TYPE );
073            mediatypes.add( FeedReference.XML_MEDIA_TYPE );
074            
075        }
076    
077        /**
078         * Locate a feed via RSS/Atom auto-discovery.  If both Atom and RSS are
079         * listed we return both.  Actually we return all Atom/RSS or XML feeds
080         * including FOAF.  It's up to the caller to use the correct feed.
081         *
082         * 
083         */
084        public static final List locate( String resource,
085                                         String content,
086                                         FeedList list )
087            throws Exception {
088    
089            //this mechanism is easier but it isn't efficient.  I should just parse
090            //elements forward until I discover </head>.  Also note that this isn't
091            //doing all feed URLs just the first ones it finds.  
092    
093            Matcher m = element_pattern.matcher( content );
094    
095            while( m.find() ) {
096                //the value of the link element XML... example:
097                
098                // <link rel="alternate" 
099                //      href="http://www.codinginparadise.org/weblog/atom.xml"
100                //      type="application/atom+xml" 
101                //      title="ATOM" />
102                     
103                String element = m.group( 0 );
104    
105                HashMap attributes = getAttributes( element );
106                
107                String type = (String)attributes.get( "type" );
108                if (type != null)
109                    type = type.toLowerCase();
110    
111                if ( mediatypes.contains( type )  ) {
112    
113                    //expand the href
114                    String href = (String)attributes.get( "href" );
115                    log.debug("href="+href);
116    
117                    // http://xml.coverpages.org/draft-ietf-atompub-autodiscovery-00.txt
118                    
119                    // > The href attribute MUST be present in an Atom autodiscovery element,
120                    // > and its value MUST be the URI [RFC2396] of an Atom feed.  The value
121                    // > MAY be a relative URI, and if so, clients MUST resolve it to a full
122                    // > URI (section 5 of [RFC2396]) using the document's base URI (section
123                    // > 12.4 of HTML 4 [W3C.REC-html401-19991224]).
124    
125                    href = ResourceExpander.expand( resource, href );
126    
127                    FeedReference feedReference = new FeedReference( href, type );
128                    
129                    feedReference.title = (String)attributes.get( "title" );
130                    
131                    list.add( feedReference );
132    
133                    if ( type.equals( FeedReference.ATOM_MEDIA_TYPE ) )
134                        list.setFirstAdAtomFeed( feedReference );
135                        
136                    if ( type.equals( FeedReference.RSS_MEDIA_TYPE ) )
137                        list.setFirstAdRSSFeed( feedReference );
138    
139                }
140                
141            }
142            
143            return list;
144            
145        }
146    
147        /**
148         * Parse attributes within elements into a hashmap.
149         *
150         * 
151         */
152        public static HashMap getAttributes( String content ) {
153    
154            HashMap map = new HashMap();
155    
156            Matcher m = ATTR_PATTERN.matcher( content );
157    
158            int index = 0;
159    
160            while ( m.find( index ) ) {
161    
162                String name = m.group( 1 );
163                String value = null;
164    
165                //Since we use an OR regexp the first match will be 1/2 and the
166                //second will be 3/4
167                if ( name != null ) {
168                    value = m.group( 2 );
169                } else {
170                    name = m.group( 3 );
171                    value = m.group( 4 );
172                }
173    
174                //String value = m.group( 2 ).toLowerCase().trim();
175                name = name.toLowerCase().trim();
176                // Some services, such as AOL LiveJournal, are case sensitive
177                // on their resource names; can't do a toLowerCase.
178                // Brad Neuberg, bkn3@columbia.edu
179                // String value = m.group( 2 ).toLowerCase().trim();
180                value = value.trim();
181    
182                if ( "".equals( value ) ) 
183                    value = null; 
184    
185                map.put( name, value );
186                
187                index =  m.end();
188                
189            } 
190    
191            return map;
192            
193        }
194    
195    }