001 /*
002 * Copyright 1999,2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package org.apache.commons.feedparser;
018
019
020 /**
021 * Given the RAW content of a URL, determine if we're looking at an RSS file or
022 * an HTML file. We also return the given RSS version or Atom version.
023 *
024 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
025 * @version $Id: ContentDetector.java 373614 2006-01-30 22:31:21Z mvdb $
026 */
027 public class ContentDetector {
028
029 /**
030 * Return true if the given content seems to be RSS. This is going to be a
031 * cheat because really we have no way of telling if this is RSS other than if
032 * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl
033 *
034 *
035 */
036 public static ContentDetectorResult detect( String content ) throws Exception {
037
038 ContentDetectorResult result = new ContentDetectorResult();
039
040 result.isHTML = isHTMLContent( content );
041 result.isRSS = ( isRSS_1_0_Content( content ) ||
042 isRSS_2_0_Content( content ) ||
043 isRSS_0_9_0_Content( content ) ||
044 isRSS_0_9_1_Content( content ) ||
045 isRSS_0_9_2_Content( content ) );
046
047 result.isAtom = isAtomContent( content );
048
049 result.isFeed = result.isRSS || result.isAtom;
050
051 return result;
052
053 }
054
055 /**
056 * Return true if this is RSS 1.0 content
057 *
058 *
059 */
060 public static boolean isRSS_1_0_Content( String content ) throws Exception {
061
062 //do a search for the RSS 1.0 namespace. This is a bit of a trick right
063 //now.
064
065 return content.indexOf( "http://purl.org/rss/1.0/" ) != -1;
066
067 }
068
069 /**
070 * Return true if this is RSS 2.0 content
071 *
072 *
073 */
074 public static boolean isRSS_0_9_1_Content( String content ) throws Exception {
075
076 //look for the beginning of the RSS element
077 return content.indexOf( "<rss" ) != -1;
078
079 }
080
081 /**
082 * Return true if this is RSS 0.9.2 content
083 *
084 *
085 */
086 public static boolean isRSS_0_9_2_Content( String content ) throws Exception {
087
088 //same check for RSS 0.9.1
089 return isRSS_0_9_1_Content( content );
090
091 }
092
093 /**
094 * Return true if this is RSS 2.0 content
095 *
096 *
097 */
098 public static boolean isRSS_2_0_Content( String content ) throws Exception {
099
100 return isRSS_0_9_1_Content( content );
101
102 }
103
104 /**
105 * Return true if this is RSS 2.0 content
106 *
107 *
108 */
109 public static boolean isRSS_0_9_0_Content( String content ) throws Exception {
110
111 //FIXME: look for the RDF namespace and the RSS DTD namespace
112 return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1;
113
114 }
115
116 public static boolean isAtomContent( String content ) throws Exception {
117
118 return content.indexOf( "http://purl.org/atom/ns#" ) != -1;
119
120 }
121
122 /**
123 * Return true if this is RSS 2.0 content
124 *
125 *
126 */
127 public static boolean isHTMLContent( String content ) throws Exception {
128
129 //look for the beginning of the RSS element
130 return content.indexOf( "<html" ) != -1;
131
132 }
133
134 public static void main( String[] args ) {
135
136 try {
137
138 //System.out.println( RSSContentVerifier.isRSSContent( new URL( args[0] ) ) );
139
140 } catch ( Throwable t ) {
141
142 t.printStackTrace();
143
144 }
145
146 }
147
148 }