001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.locate;
018    
019    import java.util.HashMap;
020    import java.util.regex.Matcher;
021    import java.util.regex.Pattern;
022    
/**
 * Decodes named HTML entities (for example <code>&amp;amp;</code>) found in a
 * string of HTML content, replacing each known entity with the character it
 * stands for.
 *
 * NOTE: Currently this is a trivial implementation: only a small table of
 * named entities is supported, and numeric references such as
 * <code>&amp;#39;</code> are not matched at all.  We need to go through and
 * make sure all HTML entities are correctly supported.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 * @version $Id: EntityDecoder.java 373622 2006-01-30 22:53:00Z mvdb $
 */
public class EntityDecoder {

    //FIXME: see FeedFilter.java for a list of all valid HTML entities.  I
    //should replace them with character literals in this situation.

    /**
     * Maps an entity name (without the leading '&amp;' and trailing ';') to
     * the text it decodes to.  Populated once in the static initializer
     * below and only read afterwards.
     */
    private static final HashMap entities = new HashMap();

    /**
     * Matches one named entity whose name consists solely of lowercase ASCII
     * letters.  Group 0 is the whole entity, group 1 is the bare name.
     */
    static Pattern pattern = Pattern.compile( "&([a-z]+);" );

    static {

        //FIXME: there are a LOT more of these and we need an exhaustive
        //collection.

        entities.put( "gt", ">" );
        entities.put( "lt", "<" );
        entities.put( "amp", "&" );

        //Previously mapped to ">" by mistake; &apos; is the apostrophe.
        entities.put( "apos", "'" );

        //Right and left pointing double angle quotation marks (U+00BB and
        //U+00AB).  Written as Unicode escapes so the result does not depend
        //on the encoding the source file is compiled with.
        entities.put( "raquo", "\u00BB" );
        entities.put( "laquo", "\u00AB" );

    }

    /**
     * Decode the named entities in the given content.
     *
     * Every substring matching {@link #pattern} whose name is present in the
     * entity table is replaced by its decoded text.  Entities we know nothing
     * about are copied to the output unchanged, as is all other text.
     *
     * @param content the HTML text to decode; may be null
     * @return the decoded text, or null if <code>content</code> was null
     */
    public static String decode( String content ) {

        if ( content == null ) {
            return null;
        }

        //FIXME(performance): do I have existing code that does this more
        //efficiently?

        StringBuffer buff = new StringBuffer( content.length() );

        Matcher m = pattern.matcher( content );

        //offset of the first character not yet copied into buff
        int index = 0;

        while ( m.find() ) {

            //copy the literal text between the previous entity and this one
            buff.append( content.substring( index, m.start() ) );

            //the table never stores null values, so a null result means the
            //entity is unknown and a single lookup is sufficient
            Object replacement = entities.get( m.group( 1 ) );

            if ( replacement != null ) {
                buff.append( replacement );
            } else {
                //found an entity we know NOTHING about; keep it verbatim.
                //Should we warn?
                buff.append( m.group() );
            }

            index = m.end();

        }

        //copy whatever follows the last entity (or everything when no entity
        //was found)
        buff.append( content.substring( index ) );

        return buff.toString();

    }

    /**
     * Small manual smoke test: prints the decoded form of a few sample
     * strings to standard output.
     *
     * @param args ignored
     */
    public static void main( String[] args ) throws Exception {

        System.out.println( decode( "&amp;" ) );
        System.out.println( decode( "asdf&amp;asdf" ) );

        System.out.println( decode( "asdf&amp;" ) );

        System.out.println( decode( "&amp;asdf" ) );

    }

}