001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser;
018    
019    
020    /**
021     * Given the RAW content of a URL, determine if we're looking at an RSS file or
022     * an HTML file.  We also return the given RSS version or Atom version.
023     * 
024     * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
025     * @version $Id: ContentDetector.java 373614 2006-01-30 22:31:21Z mvdb $
026     */
027    public class ContentDetector {
028    
029        /**
030         * Return true if the given content seems to be RSS.  This is going to be a
031         * cheat because really we have no way of telling if this is RSS other than if
032         * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl
033         *
034         * 
035         */
036        public static ContentDetectorResult detect( String content ) throws Exception {
037    
038            ContentDetectorResult result = new ContentDetectorResult();
039            
040            result.isHTML = isHTMLContent( content );
041            result.isRSS = ( isRSS_1_0_Content( content ) ||
042                             isRSS_2_0_Content( content ) ||
043                             isRSS_0_9_0_Content( content ) ||
044                             isRSS_0_9_1_Content( content ) ||
045                             isRSS_0_9_2_Content( content ) );
046    
047            result.isAtom = isAtomContent( content );
048    
049            result.isFeed = result.isRSS || result.isAtom;
050    
051            return result;
052                             
053        }
054    
055        /**
056         * Return true if this is RSS 1.0 content
057         *
058         * 
059         */
060        public static boolean isRSS_1_0_Content( String content ) throws Exception {
061    
062            //do a search for the RSS 1.0 namespace.  This is a bit of a trick right
063            //now.
064    
065            return content.indexOf( "http://purl.org/rss/1.0/" ) != -1;
066            
067        }
068    
069        /**
070         * Return true if this is RSS 2.0 content
071         *
072         * 
073         */
074        public static boolean isRSS_0_9_1_Content( String content ) throws Exception {
075    
076            //look for the beginning of the RSS element
077            return content.indexOf( "<rss" ) != -1;
078    
079        }
080    
081        /**
082         * Return true if this is RSS 0.9.2 content
083         *
084         * 
085         */
086        public static boolean isRSS_0_9_2_Content( String content ) throws Exception {
087    
088            //same check for RSS 0.9.1
089            return isRSS_0_9_1_Content( content );
090            
091        }
092    
093        /**
094         * Return true if this is RSS 2.0 content
095         *
096         * 
097         */
098        public static boolean isRSS_2_0_Content( String content ) throws Exception {
099    
100            return isRSS_0_9_1_Content( content );
101    
102        }
103    
104        /**
105         * Return true if this is RSS 2.0 content
106         *
107         * 
108         */
109        public static boolean isRSS_0_9_0_Content( String content ) throws Exception {
110    
111            //FIXME: look for the RDF namespace and the RSS DTD namespace
112            return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1;
113    
114        }
115    
116        public static boolean isAtomContent( String content ) throws Exception {
117    
118            return content.indexOf( "http://purl.org/atom/ns#" ) != -1;
119    
120        }
121    
122        /**
123         * Return true if this is RSS 2.0 content
124         *
125         * 
126         */
127        public static boolean isHTMLContent( String content ) throws Exception {
128    
129            //look for the beginning of the RSS element
130            return content.indexOf( "<html" ) != -1;
131    
132        }
133    
134        public static void main( String[] args ) {
135    
136            try { 
137                
138                //System.out.println( RSSContentVerifier.isRSSContent( new URL( args[0] ) ) );
139                
140            } catch ( Throwable t ) {
141                
142                t.printStackTrace();
143                
144            }
145    
146        }
147        
148    }