001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser; 018 019 020 /** 021 * Given the RAW content of a URL, determine if we're looking at an RSS file or 022 * an HTML file. We also return the given RSS version or Atom version. 023 * 024 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a> 025 * @version $Id: ContentDetector.java 373614 2006-01-30 22:31:21Z mvdb $ 026 */ 027 public class ContentDetector { 028 029 /** 030 * Return true if the given content seems to be RSS. This is going to be a 031 * cheat because really we have no way of telling if this is RSS other than if 032 * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl 033 * 034 * 035 */ 036 public static ContentDetectorResult detect( String content ) throws Exception { 037 038 ContentDetectorResult result = new ContentDetectorResult(); 039 040 result.isHTML = isHTMLContent( content ); 041 result.isRSS = ( isRSS_1_0_Content( content ) || 042 isRSS_2_0_Content( content ) || 043 isRSS_0_9_0_Content( content ) || 044 isRSS_0_9_1_Content( content ) || 045 isRSS_0_9_2_Content( content ) ); 046 047 result.isAtom = isAtomContent( content ); 048 049 result.isFeed = result.isRSS || result.isAtom; 050 051 return result; 052 053 } 054 055 /** 056 * Return true if this is RSS 1.0 content 057 * 058 * 059 */ 060 public static boolean isRSS_1_0_Content( String content ) throws Exception { 061 062 //do a search for the RSS 1.0 namespace. This is a bit of a trick right 063 //now. 064 065 return content.indexOf( "http://purl.org/rss/1.0/" ) != -1; 066 067 } 068 069 /** 070 * Return true if this is RSS 2.0 content 071 * 072 * 073 */ 074 public static boolean isRSS_0_9_1_Content( String content ) throws Exception { 075 076 //look for the beginning of the RSS element 077 return content.indexOf( "<rss" ) != -1; 078 079 } 080 081 /** 082 * Return true if this is RSS 0.9.2 content 083 * 084 * 085 */ 086 public static boolean isRSS_0_9_2_Content( String content ) throws Exception { 087 088 //same check for RSS 0.9.1 089 return isRSS_0_9_1_Content( content ); 090 091 } 092 093 /** 094 * Return true if this is RSS 2.0 content 095 * 096 * 097 */ 098 public static boolean isRSS_2_0_Content( String content ) throws Exception { 099 100 return isRSS_0_9_1_Content( content ); 101 102 } 103 104 /** 105 * Return true if this is RSS 2.0 content 106 * 107 * 108 */ 109 public static boolean isRSS_0_9_0_Content( String content ) throws Exception { 110 111 //FIXME: look for the RDF namespace and the RSS DTD namespace 112 return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1; 113 114 } 115 116 public static boolean isAtomContent( String content ) throws Exception { 117 118 return content.indexOf( "http://purl.org/atom/ns#" ) != -1; 119 120 } 121 122 /** 123 * Return true if this is RSS 2.0 content 124 * 125 * 126 */ 127 public static boolean isHTMLContent( String content ) throws Exception { 128 129 //look for the beginning of the RSS element 130 return content.indexOf( "<html" ) != -1; 131 132 } 133 134 public static void main( String[] args ) { 135 136 try { 137 138 //System.out.println( RSSContentVerifier.isRSSContent( new URL( args[0] ) ) ); 139 140 } catch ( Throwable t ) { 141 142 t.printStackTrace(); 143 144 } 145 146 } 147 148 }