001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser; 018 019 import java.util.HashMap; 020 import java.util.regex.Matcher; 021 import java.util.regex.Pattern; 022 023 import org.apache.log4j.Logger; 024 025 /** 026 * 027 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a> 028 * @version $Id: FeedFilter.java 373614 2006-01-30 22:31:21Z mvdb $ 029 */ 030 public class FeedFilter { 031 032 private static Logger log = Logger.getLogger( FeedFilter.class ); 033 034 public static boolean DO_REMOVE_LEADING_PROLOG = true; 035 public static boolean DO_DECODE_ENTITIES = true; 036 037 public static HashMap LATIN1_ENTITIES = new HashMap(); 038 039 private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" ); 040 041 /** 042 * This is probably the wrong behavior. I shouldn't call this method I 043 * think because assuming a content type is bad form. 044 * 045 * @deprecated Specify an encoding with #parse( bytes[], encoding ) 046 * 047 */ 048 public static byte[] parse( byte[] bytes ) 049 throws Exception { 050 051 return parse( bytes, "UTF-8" ); 052 053 } 054 055 public static byte[] parse( byte[] bytes, String encoding ) 056 throws Exception { 057 058 String content = new String( bytes, encoding ); 059 060 return parse( content, encoding ); 061 062 } 063 064 /** 065 * Parse out an input string of content. 066 * 067 * http://wiki.apache.org/jakarta-commons/FeedParser_2fStringAllocationConsideredHelpful 068 * 069 * 070 */ 071 public static byte[] parse( String content, String encoding ) 072 throws Exception { 073 074 //FIXME: return an object here so that I can flag a bozo bit. 075 076 //remove leading prolog... 077 if ( DO_REMOVE_LEADING_PROLOG ) 078 content = doRemoveLeadingProlog( content, encoding ); 079 080 //decode HTML entities that are referenced. 081 if ( DO_DECODE_ENTITIES ) 082 content = doDecodeEntities( content ); 083 084 //TODO: undeclared namespace prefixes should be expanded to their common 085 //form. 'rdf, 'atom', 'xhtml' etc. Considering that they're will only be 086 //a handful H and then 4^36 different possibilities the probability will 087 //only be H in 4^36 which is pretty good that we won't have a false 088 //positive. 089 090 return content.getBytes( encoding ); 091 092 } 093 094 /** 095 * Removing prolog whitespace, comments, and other garbage from the 096 * beginning of a feed. 097 * 098 * 099 */ 100 private static String doRemoveLeadingProlog( String content, String encoding ) { 101 102 // if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because 103 // it triggers a UTF-16 parse due to the BOM. 104 // 105 // FIXME: this isn't actually true. We should leave the BOM and remove 106 // the prolog anyway due to the fact that this will still break the 107 // parser. Come up with some tests for UTF-16 to see if I can get it to 108 // break and then update this method. 109 110 if ( "UTF-16".equals( encoding ) || 111 "UTF-32".equals( encoding ) ) 112 return content; 113 114 //move to the beginning of the first element or comment. When this is a 115 //processing instruction we will move to that 116 int begin = content.indexOf( "<" ); 117 118 if ( begin > 0 ) { 119 content = content.substring( begin, content.length() ); 120 log.warn( "Skipped whitespace in prolog and moved towards first element." ); 121 } 122 123 //now skip to the XML processing instruction when necessary. This is 124 //used to remove comments prior to <?xml which are not allowed. 125 126 begin = content.indexOf( "<?xml" ); 127 128 if ( begin > 0 ) { 129 content = content.substring( begin, content.length() ); 130 log.warn( "Removed prolog towards first processing instruction." ); 131 } 132 133 content = doRemoveElementProlog( content ); 134 135 return content; 136 137 } 138 139 /** 140 * Remove element content between: 141 * 142 * <?xml version="1.0"?> 143 * 144 * THIS IS BROKEN PROLOG 145 * 146 * <foo> 147 * 148 * 149 */ 150 private static String doRemoveElementProlog( String content ) { 151 152 int end = content.lastIndexOf( "?>", 100 ); 153 154 if ( end == -1 ) 155 return content; 156 157 StringBuffer buff = new StringBuffer( content.length() ); 158 end = end + 2; 159 buff.append( content.substring( 0, end ) ); 160 161 int begin = content.indexOf( "<", end ); 162 163 if ( begin != -1 ) { 164 165 buff.append( "\n" ); 166 buff.append( content.substring( begin, content.length() ) ); 167 168 } 169 170 return buff.toString(); 171 172 } 173 174 private static String doDecodeEntities( String content ) { 175 176 StringBuffer buff = new StringBuffer( content.length() + 1000 ); 177 178 Matcher m = entity_pattern.matcher( content ); 179 180 int begin = 0; 181 182 boolean hasFilterDecodedEntities = false; 183 boolean hasFilterFoundUnknownEntity = false; 184 185 //FIXME: note that when I was benchmarking this code that this showed up 186 //as a MAJOR bottleneck so we might want to optimize it a little more. 187 188 while ( m.find() ) { 189 190 buff.append( content.substring( begin, m.start() ) ); 191 192 String entity = m.group( 1 ); 193 194 String value = (String)LATIN1_ENTITIES.get( entity ); 195 196 if ( value != null ) { 197 buff.append( "&#" ); 198 buff.append( value ); 199 buff.append( ";" ); 200 201 hasFilterDecodedEntities = true; 202 203 } else { 204 205 //This is not a known entity so we have no way to correct it. 206 //If this is done then we have a problem and the feed probably 207 //still won't parse 208 buff.append( "&" ); 209 buff.append( entity ); 210 buff.append( ";" ); 211 212 hasFilterFoundUnknownEntity = true; 213 } 214 215 begin = m.end( 0 ); 216 217 } 218 219 buff.append( content.substring( begin, content.length() ) ); 220 221 if ( hasFilterFoundUnknownEntity ) 222 log.warn( "Filter encountered unknown entities" ); 223 224 if ( hasFilterDecodedEntities ) 225 log.warn( "Filter has decoded latin1 entities." ); 226 227 return buff.toString(); 228 229 } 230 231 public static void main( String[] args ) throws Exception { 232 233 byte[] b = parse( "hello é world".getBytes() ); 234 235 String v = new String( b ); 236 237 System.out.println( "v: " + v ); 238 239 } 240 241 static { 242 243 // load the latin1 entity map. We will replace latin1 entities with 244 // their char references directly. For example if someone incorrectly 245 // references: 246 // 247 // ä 248 // 249 // we replace it with: 250 // 251 // ä 252 // 253 // Which is correct in Latin1 254 255 // http://my.netscape.com/publish/formats/rss-0.91.dtd 256 257 LATIN1_ENTITIES.put( "nbsp", "160" ); 258 LATIN1_ENTITIES.put( "iexcl", "161" ); 259 LATIN1_ENTITIES.put( "cent", "162" ); 260 LATIN1_ENTITIES.put( "pound", "163" ); 261 LATIN1_ENTITIES.put( "curren", "164" ); 262 LATIN1_ENTITIES.put( "yen", "165" ); 263 LATIN1_ENTITIES.put( "brvbar", "166" ); 264 LATIN1_ENTITIES.put( "sect", "167" ); 265 LATIN1_ENTITIES.put( "uml", "168" ); 266 LATIN1_ENTITIES.put( "copy", "169" ); 267 LATIN1_ENTITIES.put( "ordf", "170" ); 268 LATIN1_ENTITIES.put( "laquo", "171" ); 269 LATIN1_ENTITIES.put( "not", "172" ); 270 LATIN1_ENTITIES.put( "shy", "173" ); 271 LATIN1_ENTITIES.put( "reg", "174" ); 272 LATIN1_ENTITIES.put( "macr", "175" ); 273 LATIN1_ENTITIES.put( "deg", "176" ); 274 LATIN1_ENTITIES.put( "plusmn", "177" ); 275 LATIN1_ENTITIES.put( "sup2", "178" ); 276 LATIN1_ENTITIES.put( "sup3", "179" ); 277 LATIN1_ENTITIES.put( "acute", "180" ); 278 LATIN1_ENTITIES.put( "micro", "181" ); 279 LATIN1_ENTITIES.put( "para", "182" ); 280 LATIN1_ENTITIES.put( "middot", "183" ); 281 LATIN1_ENTITIES.put( "cedil", "184" ); 282 LATIN1_ENTITIES.put( "sup1", "185" ); 283 LATIN1_ENTITIES.put( "ordm", "186" ); 284 LATIN1_ENTITIES.put( "raquo", "187" ); 285 LATIN1_ENTITIES.put( "frac14", "188" ); 286 LATIN1_ENTITIES.put( "frac12", "189" ); 287 LATIN1_ENTITIES.put( "frac34", "190" ); 288 LATIN1_ENTITIES.put( "iquest", "191" ); 289 LATIN1_ENTITIES.put( "Agrave", "192" ); 290 LATIN1_ENTITIES.put( "Aacute", "193" ); 291 LATIN1_ENTITIES.put( "Acirc", "194" ); 292 LATIN1_ENTITIES.put( "Atilde", "195" ); 293 LATIN1_ENTITIES.put( "Auml", "196" ); 294 LATIN1_ENTITIES.put( "Aring", "197" ); 295 LATIN1_ENTITIES.put( "AElig", "198" ); 296 LATIN1_ENTITIES.put( "Ccedil", "199" ); 297 LATIN1_ENTITIES.put( "Egrave", "200" ); 298 LATIN1_ENTITIES.put( "Eacute", "201" ); 299 LATIN1_ENTITIES.put( "Ecirc", "202" ); 300 LATIN1_ENTITIES.put( "Euml", "203" ); 301 LATIN1_ENTITIES.put( "Igrave", "204" ); 302 LATIN1_ENTITIES.put( "Iacute", "205" ); 303 LATIN1_ENTITIES.put( "Icirc", "206" ); 304 LATIN1_ENTITIES.put( "Iuml", "207" ); 305 LATIN1_ENTITIES.put( "ETH", "208" ); 306 LATIN1_ENTITIES.put( "Ntilde", "209" ); 307 LATIN1_ENTITIES.put( "Ograve", "210" ); 308 LATIN1_ENTITIES.put( "Oacute", "211" ); 309 LATIN1_ENTITIES.put( "Ocirc", "212" ); 310 LATIN1_ENTITIES.put( "Otilde", "213" ); 311 LATIN1_ENTITIES.put( "Ouml", "214" ); 312 LATIN1_ENTITIES.put( "times", "215" ); 313 LATIN1_ENTITIES.put( "Oslash", "216" ); 314 LATIN1_ENTITIES.put( "Ugrave", "217" ); 315 LATIN1_ENTITIES.put( "Uacute", "218" ); 316 LATIN1_ENTITIES.put( "Ucirc", "219" ); 317 LATIN1_ENTITIES.put( "Uuml", "220" ); 318 LATIN1_ENTITIES.put( "Yacute", "221" ); 319 LATIN1_ENTITIES.put( "THORN", "222" ); 320 LATIN1_ENTITIES.put( "szlig", "223" ); 321 LATIN1_ENTITIES.put( "agrave", "224" ); 322 LATIN1_ENTITIES.put( "aacute", "225" ); 323 LATIN1_ENTITIES.put( "acirc", "226" ); 324 LATIN1_ENTITIES.put( "atilde", "227" ); 325 LATIN1_ENTITIES.put( "auml", "228" ); 326 LATIN1_ENTITIES.put( "aring", "229" ); 327 LATIN1_ENTITIES.put( "aelig", "230" ); 328 LATIN1_ENTITIES.put( "ccedil", "231" ); 329 LATIN1_ENTITIES.put( "egrave", "232" ); 330 LATIN1_ENTITIES.put( "eacute", "233" ); 331 LATIN1_ENTITIES.put( "ecirc", "234" ); 332 LATIN1_ENTITIES.put( "euml", "235" ); 333 LATIN1_ENTITIES.put( "igrave", "236" ); 334 LATIN1_ENTITIES.put( "iacute", "237" ); 335 LATIN1_ENTITIES.put( "icirc", "238" ); 336 LATIN1_ENTITIES.put( "iuml", "239" ); 337 LATIN1_ENTITIES.put( "eth", "240" ); 338 LATIN1_ENTITIES.put( "ntilde", "241" ); 339 LATIN1_ENTITIES.put( "ograve", "242" ); 340 LATIN1_ENTITIES.put( "oacute", "243" ); 341 LATIN1_ENTITIES.put( "ocirc", "244" ); 342 LATIN1_ENTITIES.put( "otilde", "245" ); 343 LATIN1_ENTITIES.put( "ouml", "246" ); 344 LATIN1_ENTITIES.put( "divide", "247" ); 345 LATIN1_ENTITIES.put( "oslash", "248" ); 346 LATIN1_ENTITIES.put( "ugrave", "249" ); 347 LATIN1_ENTITIES.put( "uacute", "250" ); 348 LATIN1_ENTITIES.put( "ucirc", "251" ); 349 LATIN1_ENTITIES.put( "uuml", "252" ); 350 LATIN1_ENTITIES.put( "yacute", "253" ); 351 LATIN1_ENTITIES.put( "thorn", "254" ); 352 LATIN1_ENTITIES.put( "yuml", "255" ); 353 354 } 355 356 }