001    /*
002     * Copyright 1999,2004 The Apache Software Foundation.
003     * 
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     * 
008     *      http://www.apache.org/licenses/LICENSE-2.0
009     * 
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.apache.commons.feedparser.tools;
018    
019    
/**
 * Sniffs the character encoding of an XML document from its leading bytes.
 *
 * <p>Detection order: a byte order mark (UTF-32, UTF-8, UTF-16) wins if one is
 * present; otherwise the <code>encoding</code> pseudo-attribute is looked up in
 * the text before the first <code>&gt;</code> found within the first 100
 * bytes; otherwise the default (UTF-8) is returned.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public class XMLEncodingParser {

    /**
     * Literal prefix of a double-quoted encoding pseudo-attribute. No longer
     * used by {@link #parse(byte[])} itself (which also accepts single quotes
     * and whitespace around <code>=</code>) but kept for existing callers.
     */
    public static final String ENCODING = "encoding=\"";

    /** Name of the pseudo-attribute searched for in the declaration. */
    private static final String ENCODING_NAME = "encoding";

    /** Encoding reported when nothing else can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Only this many leading bytes are scanned for the declaration. */
    private static final int MAX_PROLOG_BYTES = 100;

    /**
     * Determines the encoding of the given XML document.
     *
     * @param content raw bytes of the document; <code>null</code> or empty
     *        input yields the default encoding
     * @return the upper-cased encoding name (<code>UTF8</code> is normalised
     *         to <code>UTF-8</code>), <code>UTF-16</code>/<code>UTF-32</code>/
     *         <code>UTF-8</code> when a matching BOM is present, or
     *         <code>UTF-8</code> when no usable declaration is found
     * @throws Exception if the ISO-8859-1 charset is unavailable in the
     *         runtime (kept broad for signature compatibility)
     */
    public static String parse( byte[] content ) throws Exception {

        if ( content == null || content.length == 0 ) {
            return DEFAULT_ENCODING;
        }

        String fromBom = getEncodingFromBOM( content );

        if ( fromBom != null ) {
            return fromBom;
        }

        //this isn't really pretty but it is fast: just look at the first
        //100 bytes.  ISO-8859-1 maps every byte to exactly one char, so the
        //scan does not depend on the platform default charset.
        int length = Math.min( content.length, MAX_PROLOG_BYTES );
        String str = new String( content, 0, length, "ISO-8859-1" );

        int end = str.indexOf( '>' );

        if ( end == -1 ) {
            return DEFAULT_ENCODING;
        }

        String encoding = findDeclaredEncoding( str.substring( 0, end ) );

        if ( encoding == null || encoding.length() == 0 ) {
            return DEFAULT_ENCODING;
        }

        //explicit locale: with a Turkish default locale "iso-..." would
        //otherwise be upper-cased with a dotted capital I.
        encoding = encoding.toUpperCase( java.util.Locale.ENGLISH );

        if ( "UTF8".equals( encoding ) ) {
            encoding = DEFAULT_ENCODING;
        }

        return encoding;

    }

    /**
     * Extracts the value of the first well-formed <code>encoding</code>
     * pseudo-attribute in <code>decl</code>.  Both quote styles and optional
     * whitespace around <code>=</code> are accepted.
     *
     * @return the raw value between the quotes, or <code>null</code> when no
     *         such attribute exists or its closing quote is missing
     */
    private static String findDeclaredEncoding( String decl ) {

        int from = 0;

        while ( true ) {

            int nameAt = decl.indexOf( ENCODING_NAME, from );

            if ( nameAt == -1 ) {
                return null;
            }

            int pos = skipWhitespace( decl, nameAt + ENCODING_NAME.length() );

            if ( pos < decl.length() && decl.charAt( pos ) == '=' ) {

                pos = skipWhitespace( decl, pos + 1 );

                if ( pos < decl.length() ) {

                    char quote = decl.charAt( pos );

                    if ( quote == '"' || quote == '\'' ) {

                        int close = decl.indexOf( quote, pos + 1 );

                        if ( close == -1 ) {
                            return null;
                        }

                        return decl.substring( pos + 1, close );
                    }
                }
            }

            //not followed by ="..." or ='...': keep looking further right
            from = nameAt + 1;
        }

    }

    /**
     * Returns the index of the first character at or after <code>pos</code>
     * that is not a space, tab, carriage return or line feed (or the length
     * of <code>s</code> when there is none).
     */
    private static int skipWhitespace( String s, int pos ) {

        while ( pos < s.length() ) {

            char c = s.charAt( pos );

            if ( c != ' ' && c != '\t' && c != '\r' && c != '\n' ) {
                break;
            }

            ++pos;
        }

        return pos;

    }

    /**
     * Maps a leading byte order mark to an encoding family.
     *
     * @return <code>UTF-32</code>, <code>UTF-8</code> or <code>UTF-16</code>
     *         when the content starts with the corresponding BOM, otherwise
     *         <code>null</code>
     */
    private static String getEncodingFromBOM( byte[] content ) {

        // The byte order itself is not reported; callers only get the
        // encoding family.
        //
        // http://www.unicode.org/faq/utf_bom.html#BOM

        //UTF-32 must be tested before UTF-16: the UTF-32 little endian BOM
        //(FF FE 00 00) starts with the UTF-16 little endian BOM (FF FE).
        if ( content.length >= 4 ) {

            //big endian: 00 00 FE FF
            if ( content[0] == 0 &&
                 content[1] == 0 &&
                 content[2] == (byte) 0xFE &&
                 content[3] == (byte) 0xFF ) {
                return "UTF-32";
            }

            //little endian: FF FE 00 00
            if ( content[0] == (byte) 0xFF &&
                 content[1] == (byte) 0xFE &&
                 content[2] == 0 &&
                 content[3] == 0 ) {
                return "UTF-32";
            }

        }

        if ( content.length >= 3 ) {

            //UTF-8 signature: EF BB BF
            if ( content[0] == (byte) 0xEF &&
                 content[1] == (byte) 0xBB &&
                 content[2] == (byte) 0xBF ) {
                return DEFAULT_ENCODING;
            }

        }

        if ( content.length >= 2 ) {

            //little endian: FF FE
            if ( content[0] == (byte) 0xFF &&
                 content[1] == (byte) 0xFE ) {
                return "UTF-16";
            }

            //big endian: FE FF
            if ( content[0] == (byte) 0xFE &&
                 content[1] == (byte) 0xFF ) {
                return "UTF-16";
            }

        }

        return null;

    }

    /**
     * Small manual smoke test: prints the detected encoding for a few sample
     * declarations.
     */
    public static void main( String[] args ) throws Exception {

        System.out.println( parse( "<?xml encoding=\"utf-8\"?>".getBytes() ) );
        System.out.println( parse( "<?xml encoding=\"UTF-8\"?>".getBytes() ) );
        System.out.println( parse( "<?xml encoding=\"utf8\"?>".getBytes() ) );

    }

}