1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.feedparser;
18
19 import java.net.URL;
20
21 /***
22 * Given the RAW content of a URL, determine if we're looking at an RSS file or
23 * an HTML file. We also return the given RSS version or Atom version.
24 *
25 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
26 * @version $Id: ContentDetector.java 159215 2005-03-27 23:36:16Z burton $
27 */
28 public class ContentDetector {
29
30 /***
31 * Return true if the given content seems to be RSS. This is going to be a
32 * cheat because really we have no way of telling if this is RSS other than if
33 * it is XML and it starts with an RSS 1.0, 2.0, 0.91 or 0.9 decl
34 *
35 *
36 */
37 public static ContentDetectorResult detect( String content ) throws Exception {
38
39 ContentDetectorResult result = new ContentDetectorResult();
40
41 result.isHTML = isHTMLContent( content );
42 result.isRSS = ( isRSS_1_0_Content( content ) ||
43 isRSS_2_0_Content( content ) ||
44 isRSS_0_9_0_Content( content ) ||
45 isRSS_0_9_1_Content( content ) ||
46 isRSS_0_9_2_Content( content ) );
47
48 result.isAtom = isAtomContent( content );
49
50 result.isFeed = result.isRSS || result.isAtom;
51
52 return result;
53
54 }
55
56 /***
57 * Return true if this is RSS 1.0 content
58 *
59 *
60 */
61 public static boolean isRSS_1_0_Content( String content ) throws Exception {
62
63
64
65
66 return content.indexOf( "http://purl.org/rss/1.0/" ) != -1;
67
68 }
69
70 /***
71 * Return true if this is RSS 2.0 content
72 *
73 *
74 */
75 public static boolean isRSS_0_9_1_Content( String content ) throws Exception {
76
77
78 return content.indexOf( "<rss" ) != -1;
79
80 }
81
82 /***
83 * Return true if this is RSS 0.9.2 content
84 *
85 *
86 */
87 public static boolean isRSS_0_9_2_Content( String content ) throws Exception {
88
89
90 return isRSS_0_9_1_Content( content );
91
92 }
93
94 /***
95 * Return true if this is RSS 2.0 content
96 *
97 *
98 */
99 public static boolean isRSS_2_0_Content( String content ) throws Exception {
100
101 return isRSS_0_9_1_Content( content );
102
103 }
104
105 /***
106 * Return true if this is RSS 2.0 content
107 *
108 *
109 */
110 public static boolean isRSS_0_9_0_Content( String content ) throws Exception {
111
112
113 return content.indexOf( "http://my.netscape.com/rdf/simple/0.9/" ) != -1;
114
115 }
116
117 public static boolean isAtomContent( String content ) throws Exception {
118
119 return content.indexOf( "http://purl.org/atom/ns#" ) != -1;
120
121 }
122
123 /***
124 * Return true if this is RSS 2.0 content
125 *
126 *
127 */
128 public static boolean isHTMLContent( String content ) throws Exception {
129
130
131 return content.indexOf( "<html" ) != -1;
132
133 }
134
135 public static void main( String[] args ) {
136
137 try {
138
139
140
141 } catch ( Throwable t ) {
142
143 t.printStackTrace();
144
145 }
146
147 }
148
149 }