001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.tools; 018 019 020 /** 021 * 022 * Given an XML document pull out the encoding or the default (UTF-8) if not 023 * specified. 024 * 025 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a> 026 */ 027 public class XMLEncodingParser { 028 029 public static final String ENCODING = "encoding=\""; 030 031 /** 032 * 033 * 034 */ 035 public static String parse( byte[] content ) throws Exception { 036 037 //this isn't really pretty but it is fast. 038 039 //just use the first 100 bytes 040 041 String str; 042 043 if ( content.length > 100 ) { 044 str = new String( content, 0, 100 ); 045 } else { 046 str = new String( content ); 047 } 048 049 String result = getEncodingFromBOM( content ); 050 051 if ( result != null ) 052 return result; 053 054 int end = str.indexOf( ">" ); 055 056 if ( end == -1 ) 057 return "UTF-8"; 058 059 String decl = str.substring( 0, end ); 060 061 int index = decl.indexOf( ENCODING ); 062 063 if ( index != -1 ) { 064 065 String encoding = decl.substring( index + ENCODING.length(), 066 decl.length() ); 067 068 end = encoding.indexOf( "\"" ); 069 070 if ( end == -1 ) 071 return "UTF-8"; 072 073 encoding = encoding.substring( 0, end); 074 encoding = encoding.toUpperCase(); 075 076 if ( "UTF8".equals( encoding ) ) 077 encoding = "UTF-8"; 078 079 return encoding; 080 081 } 082 083 return "UTF-8"; 084 085 } 086 087 private static String getEncodingFromBOM( byte[] content ) { 088 089 // Technically speaking if we see a BOM is specified we're supposed to 090 // return UTF-16 or UTF-32 but because we only care about anything UTF 091 // returning UTF-8 is incorrect but acceptable. 092 // 093 // http://www.unicode.org/faq/utf_bom.html#BOM 094 095 if ( content.length > 2 ) { 096 097 //perform UTF-16 tests 098 if ( content[0] == -1 && 099 content[1] == -2 ) 100 return "UTF-16"; 101 102 if ( content[0] == -2 && 103 content[1] == -1 ) 104 return "UTF-16"; 105 106 } 107 108 if ( content.length > 4 ) { 109 110 //perform UTF-16 tests 111 if ( content[0] == 0 && 112 content[1] == 0 && 113 content[2] == -2 && 114 content[3] == -1 ) 115 return "UTF-32"; 116 117 if ( content[0] == -1 && 118 content[1] == -2 && 119 content[2] == 0 && 120 content[3] == 0 ) 121 return "UTF-32"; 122 123 } 124 125 return null; 126 127 } 128 129 public static void main( String[] args ) throws Exception { 130 131 System.out.println( parse( "<?xml encoding=\"utf-8\"?>".getBytes() ) ); 132 System.out.println( parse( "<?xml encoding=\"UTF-8\"?>".getBytes() ) ); 133 System.out.println( parse( "<?xml encoding=\"utf8\"?>".getBytes() ) ); 134 135 } 136 137 }