/*
 * Copyright 1999,2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Given a string of HTML content, decodes the named entities it contains.
 *
 * <p>Only the entity names registered in the static table of this class are
 * replaced; any other entity reference is copied to the output untouched.
 * Entity names are matched by {@link #pattern}, which accepts lowercase ASCII
 * letters only, so numeric references and names containing digits or
 * uppercase letters are never decoded.
 *
 * <p>NOTE: Currently this is a trivial implementation and we need to go
 * through and make sure all HTML entities are correctly supported.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 * @version $Id: EntityDecoder.java 373622 2006-01-30 22:53:00Z mvdb $
 */
public class EntityDecoder {

    //FIXME: see FeedFilter.java for a list of all valid HTML entities. I
    //should replace them with character literals in this situation.

    /**
     * Entity name (without the leading ampersand and trailing semicolon)
     * mapped to its replacement text. Filled once by the static initializer
     * below and only read afterwards.
     */
    private static final HashMap entities = new HashMap();

    /**
     * Matches one named entity reference: an ampersand, one or more lowercase
     * ASCII letters (captured as group 1) and a terminating semicolon.
     */
    static final Pattern pattern = Pattern.compile( "&([a-z]+);" );

    static {

        //FIXME: there are a LOT more of these and we need an exhaustive
        //collection.

        entities.put( "gt", ">" );
        entities.put( "lt", "<" );
        entities.put( "amp", "&" );
        entities.put( "apos", "'" );
        entities.put( "quot", "\"" );

        // Written as unicode escapes so the values survive any source
        // encoding: right and left pointing double angle quotation marks.
        entities.put( "raquo", "\u00BB" );
        entities.put( "laquo", "\u00AB" );

    }

    /**
     * Decodes the known named entities in the given content.
     *
     * <p>The input is scanned once from left to right, so text produced by a
     * replacement is never decoded a second time: {@code &amp;amp;} becomes
     * {@code &amp;}.
     *
     * @param content the text to decode; may be null
     * @return the content with every known entity replaced by its literal
     *         text and every unknown entity left as it was, or null if
     *         content is null
     */
    public static String decode( String content ) {

        if ( content == null ) {
            return null;
        }

        StringBuffer buff = new StringBuffer( content.length() );

        Matcher m = pattern.matcher( content );

        // Offset of the first character that has not been copied to buff yet.
        int index = 0;

        while ( m.find() ) {

            // Copy the literal text between the previous match and this one.
            buff.append( content.substring( index, m.start() ) );

            String replacement = (String) entities.get( m.group( 1 ) );

            if ( replacement != null ) {
                buff.append( replacement );
            } else {
                // An entity we know nothing about: keep the original text
                // rather than dropping or guessing at it.
                buff.append( m.group() );
            }

            index = m.end();

        }

        // Copy whatever follows the last entity (or the whole input if no
        // entity was found).
        buff.append( content.substring( index ) );

        return buff.toString();

    }

    /**
     * Small manual smoke test that prints the result of decoding a few
     * strings. None of these inputs contains a complete entity reference, so
     * each one is printed unchanged.
     *
     * @param args ignored
     */
    public static void main( String[] args ) {

        System.out.println( decode( "&" ) );
        System.out.println( decode( "asdf&asdf" ) );

        System.out.println( decode( "asdf&" ) );

        System.out.println( decode( "&asdf" ) );

    }

}