View Javadoc

1   /*
2    * Copyright 1999,2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.commons.feedparser;
18  
19  import java.net.URL;
20  
21  /***
22   * Given the RAW content of a URL, determine if we're looking at an RSS file or
23   * an HTML file.  We also return the given RSS version or Atom version.
24   * 
25   * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
26   * @version $Id: ContentDetector.java 159215 2005-03-27 23:36:16Z burton $
27   */
28  public class ContentDetector {
29  
30      /***
31       * Return true if the given content seems to be RSS.  This is going to be a
32       * cheat because really we have no way of telling if this is RSS other than if
33       * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl
34       *
35       * 
36       */
37      public static ContentDetectorResult detect( String content ) throws Exception {
38  
39          ContentDetectorResult result = new ContentDetectorResult();
40          
41          result.isHTML = isHTMLContent( content );
42          result.isRSS = ( isRSS_1_0_Content( content ) ||
43                           isRSS_2_0_Content( content ) ||
44                           isRSS_0_9_0_Content( content ) ||
45                           isRSS_0_9_1_Content( content ) ||
46                           isRSS_0_9_2_Content( content ) );
47  
48          result.isAtom = isAtomContent( content );
49  
50          result.isFeed = result.isRSS || result.isAtom;
51  
52          return result;
53                           
54      }
55  
56      /***
57       * Return true if this is RSS 1.0 content
58       *
59       * 
60       */
61      public static boolean isRSS_1_0_Content( String content ) throws Exception {
62  
63          //do a search for the RSS 1.0 namespace.  This is a bit of a trick right
64          //now.
65  
66          return content.indexOf( "http://purl.org/rss/1.0/" ) != -1;
67          
68      }
69  
70      /***
71       * Return true if this is RSS 2.0 content
72       *
73       * 
74       */
75      public static boolean isRSS_0_9_1_Content( String content ) throws Exception {
76  
77          //look for the beginning of the RSS element
78          return content.indexOf( "<rss" ) != -1;
79  
80      }
81  
82      /***
83       * Return true if this is RSS 0.9.2 content
84       *
85       * 
86       */
87      public static boolean isRSS_0_9_2_Content( String content ) throws Exception {
88  
89          //same check for RSS 0.9.1
90          return isRSS_0_9_1_Content( content );
91          
92      }
93  
94      /***
95       * Return true if this is RSS 2.0 content
96       *
97       * 
98       */
99      public static boolean isRSS_2_0_Content( String content ) throws Exception {
100 
101         return isRSS_0_9_1_Content( content );
102 
103     }
104 
105     /***
106      * Return true if this is RSS 2.0 content
107      *
108      * 
109      */
110     public static boolean isRSS_0_9_0_Content( String content ) throws Exception {
111 
112         //FIXME: look for the RDF namespace and the RSS DTD namespace
113         return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1;
114 
115     }
116 
117     public static boolean isAtomContent( String content ) throws Exception {
118 
119         return content.indexOf( "http://purl.org/atom/ns#" ) != -1;
120 
121     }
122 
123     /***
124      * Return true if this is RSS 2.0 content
125      *
126      * 
127      */
128     public static boolean isHTMLContent( String content ) throws Exception {
129 
130         //look for the beginning of the RSS element
131         return content.indexOf( "<html" ) != -1;
132 
133     }
134 
135     public static void main( String[] args ) {
136 
137         try { 
138             
139             //System.out.println( RSSContentVerifier.isRSSContent( new URL( args[0] ) ) );
140             
141         } catch ( Throwable t ) {
142             
143             t.printStackTrace();
144             
145         }
146 
147     }
148     
149 }