View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser;
18  
19  
20  /**
21   * Given the RAW content of a URL, determine if we're looking at an RSS file or
22   * an HTML file.  We also return the given RSS version or Atom version.
23   * 
24   * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
25   * @version $Id: ContentDetector.java 373614 2006-01-30 22:31:21Z mvdb $
26   */
27  public class ContentDetector {
28  
29      /**
30       * Return true if the given content seems to be RSS.  This is going to be a
31       * cheat because really we have no way of telling if this is RSS other than if
32       * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl
33       *
34       * 
35       */
36      public static ContentDetectorResult detect( String content ) throws Exception {
37  
38          ContentDetectorResult result = new ContentDetectorResult();
39          
40          result.isHTML = isHTMLContent( content );
41          result.isRSS = ( isRSS_1_0_Content( content ) ||
42                           isRSS_2_0_Content( content ) ||
43                           isRSS_0_9_0_Content( content ) ||
44                           isRSS_0_9_1_Content( content ) ||
45                           isRSS_0_9_2_Content( content ) );
46  
47          result.isAtom = isAtomContent( content );
48  
49          result.isFeed = result.isRSS || result.isAtom;
50  
51          return result;
52                           
53      }
54  
55      /**
56       * Return true if this is RSS 1.0 content
57       *
58       * 
59       */
60      public static boolean isRSS_1_0_Content( String content ) throws Exception {
61  
62          //do a search for the RSS 1.0 namespace.  This is a bit of a trick right
63          //now.
64  
65          return content.indexOf( "http://purl.org/rss/1.0/" ) != -1;
66          
67      }
68  
69      /**
70       * Return true if this is RSS 2.0 content
71       *
72       * 
73       */
74      public static boolean isRSS_0_9_1_Content( String content ) throws Exception {
75  
76          //look for the beginning of the RSS element
77          return content.indexOf( "<rss" ) != -1;
78  
79      }
80  
81      /**
82       * Return true if this is RSS 0.9.2 content
83       *
84       * 
85       */
86      public static boolean isRSS_0_9_2_Content( String content ) throws Exception {
87  
88          //same check for RSS 0.9.1
89          return isRSS_0_9_1_Content( content );
90          
91      }
92  
93      /**
94       * Return true if this is RSS 2.0 content
95       *
96       * 
97       */
98      public static boolean isRSS_2_0_Content( String content ) throws Exception {
99  
100         return isRSS_0_9_1_Content( content );
101 
102     }
103 
104     /**
105      * Return true if this is RSS 2.0 content
106      *
107      * 
108      */
109     public static boolean isRSS_0_9_0_Content( String content ) throws Exception {
110 
111         //FIXME: look for the RDF namespace and the RSS DTD namespace
112         return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1;
113 
114     }
115 
116     public static boolean isAtomContent( String content ) throws Exception {
117 
118         return content.indexOf( "http://purl.org/atom/ns#" ) != -1;
119 
120     }
121 
122     /**
123      * Return true if this is RSS 2.0 content
124      *
125      * 
126      */
127     public static boolean isHTMLContent( String content ) throws Exception {
128 
129         //look for the beginning of the RSS element
130         return content.indexOf( "<html" ) != -1;
131 
132     }
133 
134     public static void main( String[] args ) {
135 
136         try { 
137             
138             //System.out.println( RSSContentVerifier.isRSSContent( new URL( args[0] ) ) );
139             
140         } catch ( Throwable t ) {
141             
142             t.printStackTrace();
143             
144         }
145 
146     }
147     
148 }