001 /*
002 * Copyright 1999,2004 The Apache Software Foundation.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package org.apache.commons.feedparser;
018
019 import java.util.HashMap;
020 import java.util.regex.Matcher;
021 import java.util.regex.Pattern;
022
023 import org.apache.log4j.Logger;
024
025 /**
026 *
027 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
028 * @version $Id: FeedFilter.java 373614 2006-01-30 22:31:21Z mvdb $
029 */
030 public class FeedFilter {
031
032 private static Logger log = Logger.getLogger( FeedFilter.class );
033
034 public static boolean DO_REMOVE_LEADING_PROLOG = true;
035 public static boolean DO_DECODE_ENTITIES = true;
036
037 public static HashMap LATIN1_ENTITIES = new HashMap();
038
039 private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" );
040
041 /**
042 * This is probably the wrong behavior. I shouldn't call this method I
043 * think because assuming a content type is bad form.
044 *
045 * @deprecated Specify an encoding with #parse( bytes[], encoding )
046 *
047 */
048 public static byte[] parse( byte[] bytes )
049 throws Exception {
050
051 return parse( bytes, "UTF-8" );
052
053 }
054
055 public static byte[] parse( byte[] bytes, String encoding )
056 throws Exception {
057
058 String content = new String( bytes, encoding );
059
060 return parse( content, encoding );
061
062 }
063
064 /**
065 * Parse out an input string of content.
066 *
067 * http://wiki.apache.org/jakarta-commons/FeedParser_2fStringAllocationConsideredHelpful
068 *
069 *
070 */
071 public static byte[] parse( String content, String encoding )
072 throws Exception {
073
074 //FIXME: return an object here so that I can flag a bozo bit.
075
076 //remove leading prolog...
077 if ( DO_REMOVE_LEADING_PROLOG )
078 content = doRemoveLeadingProlog( content, encoding );
079
080 //decode HTML entities that are referenced.
081 if ( DO_DECODE_ENTITIES )
082 content = doDecodeEntities( content );
083
084 //TODO: undeclared namespace prefixes should be expanded to their common
085 //form. 'rdf, 'atom', 'xhtml' etc. Considering that they're will only be
086 //a handful H and then 4^36 different possibilities the probability will
087 //only be H in 4^36 which is pretty good that we won't have a false
088 //positive.
089
090 return content.getBytes( encoding );
091
092 }
093
094 /**
095 * Removing prolog whitespace, comments, and other garbage from the
096 * beginning of a feed.
097 *
098 *
099 */
100 private static String doRemoveLeadingProlog( String content, String encoding ) {
101
102 // if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because
103 // it triggers a UTF-16 parse due to the BOM.
104 //
105 // FIXME: this isn't actually true. We should leave the BOM and remove
106 // the prolog anyway due to the fact that this will still break the
107 // parser. Come up with some tests for UTF-16 to see if I can get it to
108 // break and then update this method.
109
110 if ( "UTF-16".equals( encoding ) ||
111 "UTF-32".equals( encoding ) )
112 return content;
113
114 //move to the beginning of the first element or comment. When this is a
115 //processing instruction we will move to that
116 int begin = content.indexOf( "<" );
117
118 if ( begin > 0 ) {
119 content = content.substring( begin, content.length() );
120 log.warn( "Skipped whitespace in prolog and moved towards first element." );
121 }
122
123 //now skip to the XML processing instruction when necessary. This is
124 //used to remove comments prior to <?xml which are not allowed.
125
126 begin = content.indexOf( "<?xml" );
127
128 if ( begin > 0 ) {
129 content = content.substring( begin, content.length() );
130 log.warn( "Removed prolog towards first processing instruction." );
131 }
132
133 content = doRemoveElementProlog( content );
134
135 return content;
136
137 }
138
139 /**
140 * Remove element content between:
141 *
142 * <?xml version="1.0"?>
143 *
144 * THIS IS BROKEN PROLOG
145 *
146 * <foo>
147 *
148 *
149 */
150 private static String doRemoveElementProlog( String content ) {
151
152 int end = content.lastIndexOf( "?>", 100 );
153
154 if ( end == -1 )
155 return content;
156
157 StringBuffer buff = new StringBuffer( content.length() );
158 end = end + 2;
159 buff.append( content.substring( 0, end ) );
160
161 int begin = content.indexOf( "<", end );
162
163 if ( begin != -1 ) {
164
165 buff.append( "\n" );
166 buff.append( content.substring( begin, content.length() ) );
167
168 }
169
170 return buff.toString();
171
172 }
173
174 private static String doDecodeEntities( String content ) {
175
176 StringBuffer buff = new StringBuffer( content.length() + 1000 );
177
178 Matcher m = entity_pattern.matcher( content );
179
180 int begin = 0;
181
182 boolean hasFilterDecodedEntities = false;
183 boolean hasFilterFoundUnknownEntity = false;
184
185 //FIXME: note that when I was benchmarking this code that this showed up
186 //as a MAJOR bottleneck so we might want to optimize it a little more.
187
188 while ( m.find() ) {
189
190 buff.append( content.substring( begin, m.start() ) );
191
192 String entity = m.group( 1 );
193
194 String value = (String)LATIN1_ENTITIES.get( entity );
195
196 if ( value != null ) {
197 buff.append( "&#" );
198 buff.append( value );
199 buff.append( ";" );
200
201 hasFilterDecodedEntities = true;
202
203 } else {
204
205 //This is not a known entity so we have no way to correct it.
206 //If this is done then we have a problem and the feed probably
207 //still won't parse
208 buff.append( "&" );
209 buff.append( entity );
210 buff.append( ";" );
211
212 hasFilterFoundUnknownEntity = true;
213 }
214
215 begin = m.end( 0 );
216
217 }
218
219 buff.append( content.substring( begin, content.length() ) );
220
221 if ( hasFilterFoundUnknownEntity )
222 log.warn( "Filter encountered unknown entities" );
223
224 if ( hasFilterDecodedEntities )
225 log.warn( "Filter has decoded latin1 entities." );
226
227 return buff.toString();
228
229 }
230
231 public static void main( String[] args ) throws Exception {
232
233 byte[] b = parse( "hello é world".getBytes() );
234
235 String v = new String( b );
236
237 System.out.println( "v: " + v );
238
239 }
240
241 static {
242
243 // load the latin1 entity map. We will replace latin1 entities with
244 // their char references directly. For example if someone incorrectly
245 // references:
246 //
247 // ä
248 //
249 // we replace it with:
250 //
251 // ä
252 //
253 // Which is correct in Latin1
254
255 // http://my.netscape.com/publish/formats/rss-0.91.dtd
256
257 LATIN1_ENTITIES.put( "nbsp", "160" );
258 LATIN1_ENTITIES.put( "iexcl", "161" );
259 LATIN1_ENTITIES.put( "cent", "162" );
260 LATIN1_ENTITIES.put( "pound", "163" );
261 LATIN1_ENTITIES.put( "curren", "164" );
262 LATIN1_ENTITIES.put( "yen", "165" );
263 LATIN1_ENTITIES.put( "brvbar", "166" );
264 LATIN1_ENTITIES.put( "sect", "167" );
265 LATIN1_ENTITIES.put( "uml", "168" );
266 LATIN1_ENTITIES.put( "copy", "169" );
267 LATIN1_ENTITIES.put( "ordf", "170" );
268 LATIN1_ENTITIES.put( "laquo", "171" );
269 LATIN1_ENTITIES.put( "not", "172" );
270 LATIN1_ENTITIES.put( "shy", "173" );
271 LATIN1_ENTITIES.put( "reg", "174" );
272 LATIN1_ENTITIES.put( "macr", "175" );
273 LATIN1_ENTITIES.put( "deg", "176" );
274 LATIN1_ENTITIES.put( "plusmn", "177" );
275 LATIN1_ENTITIES.put( "sup2", "178" );
276 LATIN1_ENTITIES.put( "sup3", "179" );
277 LATIN1_ENTITIES.put( "acute", "180" );
278 LATIN1_ENTITIES.put( "micro", "181" );
279 LATIN1_ENTITIES.put( "para", "182" );
280 LATIN1_ENTITIES.put( "middot", "183" );
281 LATIN1_ENTITIES.put( "cedil", "184" );
282 LATIN1_ENTITIES.put( "sup1", "185" );
283 LATIN1_ENTITIES.put( "ordm", "186" );
284 LATIN1_ENTITIES.put( "raquo", "187" );
285 LATIN1_ENTITIES.put( "frac14", "188" );
286 LATIN1_ENTITIES.put( "frac12", "189" );
287 LATIN1_ENTITIES.put( "frac34", "190" );
288 LATIN1_ENTITIES.put( "iquest", "191" );
289 LATIN1_ENTITIES.put( "Agrave", "192" );
290 LATIN1_ENTITIES.put( "Aacute", "193" );
291 LATIN1_ENTITIES.put( "Acirc", "194" );
292 LATIN1_ENTITIES.put( "Atilde", "195" );
293 LATIN1_ENTITIES.put( "Auml", "196" );
294 LATIN1_ENTITIES.put( "Aring", "197" );
295 LATIN1_ENTITIES.put( "AElig", "198" );
296 LATIN1_ENTITIES.put( "Ccedil", "199" );
297 LATIN1_ENTITIES.put( "Egrave", "200" );
298 LATIN1_ENTITIES.put( "Eacute", "201" );
299 LATIN1_ENTITIES.put( "Ecirc", "202" );
300 LATIN1_ENTITIES.put( "Euml", "203" );
301 LATIN1_ENTITIES.put( "Igrave", "204" );
302 LATIN1_ENTITIES.put( "Iacute", "205" );
303 LATIN1_ENTITIES.put( "Icirc", "206" );
304 LATIN1_ENTITIES.put( "Iuml", "207" );
305 LATIN1_ENTITIES.put( "ETH", "208" );
306 LATIN1_ENTITIES.put( "Ntilde", "209" );
307 LATIN1_ENTITIES.put( "Ograve", "210" );
308 LATIN1_ENTITIES.put( "Oacute", "211" );
309 LATIN1_ENTITIES.put( "Ocirc", "212" );
310 LATIN1_ENTITIES.put( "Otilde", "213" );
311 LATIN1_ENTITIES.put( "Ouml", "214" );
312 LATIN1_ENTITIES.put( "times", "215" );
313 LATIN1_ENTITIES.put( "Oslash", "216" );
314 LATIN1_ENTITIES.put( "Ugrave", "217" );
315 LATIN1_ENTITIES.put( "Uacute", "218" );
316 LATIN1_ENTITIES.put( "Ucirc", "219" );
317 LATIN1_ENTITIES.put( "Uuml", "220" );
318 LATIN1_ENTITIES.put( "Yacute", "221" );
319 LATIN1_ENTITIES.put( "THORN", "222" );
320 LATIN1_ENTITIES.put( "szlig", "223" );
321 LATIN1_ENTITIES.put( "agrave", "224" );
322 LATIN1_ENTITIES.put( "aacute", "225" );
323 LATIN1_ENTITIES.put( "acirc", "226" );
324 LATIN1_ENTITIES.put( "atilde", "227" );
325 LATIN1_ENTITIES.put( "auml", "228" );
326 LATIN1_ENTITIES.put( "aring", "229" );
327 LATIN1_ENTITIES.put( "aelig", "230" );
328 LATIN1_ENTITIES.put( "ccedil", "231" );
329 LATIN1_ENTITIES.put( "egrave", "232" );
330 LATIN1_ENTITIES.put( "eacute", "233" );
331 LATIN1_ENTITIES.put( "ecirc", "234" );
332 LATIN1_ENTITIES.put( "euml", "235" );
333 LATIN1_ENTITIES.put( "igrave", "236" );
334 LATIN1_ENTITIES.put( "iacute", "237" );
335 LATIN1_ENTITIES.put( "icirc", "238" );
336 LATIN1_ENTITIES.put( "iuml", "239" );
337 LATIN1_ENTITIES.put( "eth", "240" );
338 LATIN1_ENTITIES.put( "ntilde", "241" );
339 LATIN1_ENTITIES.put( "ograve", "242" );
340 LATIN1_ENTITIES.put( "oacute", "243" );
341 LATIN1_ENTITIES.put( "ocirc", "244" );
342 LATIN1_ENTITIES.put( "otilde", "245" );
343 LATIN1_ENTITIES.put( "ouml", "246" );
344 LATIN1_ENTITIES.put( "divide", "247" );
345 LATIN1_ENTITIES.put( "oslash", "248" );
346 LATIN1_ENTITIES.put( "ugrave", "249" );
347 LATIN1_ENTITIES.put( "uacute", "250" );
348 LATIN1_ENTITIES.put( "ucirc", "251" );
349 LATIN1_ENTITIES.put( "uuml", "252" );
350 LATIN1_ENTITIES.put( "yacute", "253" );
351 LATIN1_ENTITIES.put( "thorn", "254" );
352 LATIN1_ENTITIES.put( "yuml", "255" );
353
354 }
355
356 }