001 /* 002 * Copyright 1999,2004 The Apache Software Foundation. 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017 package org.apache.commons.feedparser.tools; 018 019 /** 020 * Class that can cleanse a string so that nothing can be present to break an 021 * XML parser. This is a VERY non-portable class as it is meant to work just 022 * with Xalan/Xerces and may remove more text and replace things that are 023 * non-XML centric. 024 * 025 * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a> 026 * @version $Id: XMLCleanser.java 159211 2005-03-27 23:24:21Z burton $ 027 */ 028 public class XMLCleanser { 029 030 public static String cleanse( String content ) { 031 032 StringBuffer buff = new StringBuffer( content.length() ); 033 034 for ( int i = 0; i < content.length(); ++i ) { 035 036 char c = content.charAt( i ); 037 038 if ( isXMLCharacter( c ) ) { 039 040 buff.append( c ); 041 042 } 043 044 } 045 046 return buff.toString(); 047 048 } 049 050 /** 051 * Copy based on a byte array. 052 * 053 * 054 */ 055 public static String cleanse( byte[] content, String encoding ) throws Exception { 056 057 String s = new String( content, encoding ); 058 059 StringBuffer buff = new StringBuffer( content.length ); 060 061 for ( int i = 0; i < s.length(); ++i ) { 062 063 char c = s.charAt( i ); 064 065 if ( isXMLCharacter( c ) ) { 066 067 buff.append( c ); 068 069 } 070 071 } 072 073 return buff.toString(); 074 075 } 076 077 public static char[] cleanseToCharArray( byte[] content ) { 078 079 char[] buff = new char[content.length]; 080 081 int index = 0; 082 083 for ( int i = 0; i < content.length; ++i ) { 084 085 char c = (char)content[ i ]; 086 087 if ( isXMLCharacter( c ) ) { 088 089 buff[index] = c; 090 091 ++index; 092 } 093 094 } 095 096 return buff; 097 098 } 099 100 /** 101 * Copy based on a byte array. 102 * 103 * 104 */ 105 public static byte[] cleanseToByteArray( byte[] content ) { 106 107 byte[] buff = new byte[ content.length ]; 108 109 int index = 0; 110 for ( int i = 0; i < content.length; ++i ) { 111 112 char c = (char)content[ i ]; 113 114 if ( isXMLCharacter( c ) ) { 115 116 //buff.append( c ); 117 buff[index] = content[ i ]; 118 ++index; 119 } 120 121 } 122 123 return buff; 124 125 } 126 127 /* 128 * This is a utility function for determining whether a specified character 129 * is a character according to production 2 of the XML 1.0 specification. 130 * 131 * @param c <code>char</code> to check for XML compliance. 132 133 * @return <code>boolean</code> - true if it's a character, false otherwise. 134 */ 135 public static boolean isXMLCharacter( char c ) { 136 137 // A parsed entity contains text, a sequence of characters, which may 138 // represent markup or character data. A character is an atomic unit of 139 // text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters 140 // are tab, carriage return, line feed, and the legal graphic characters 141 // of Unicode and ISO/IEC 10646. The use of "compatibility characters", 142 // as defined in section 6.8 of [Unicode], is discouraged. 143 144 // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | 145 // [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate 146 // blocks, FFFE, and FFFF. */ 147 148 if (c == '\n') return true; 149 if (c == '\r') return true; 150 if (c == '\t') return true; 151 152 //NOTE: this was BROKEN! The range between 0x80 and 0xFF is valid XML 153 //and would end up dropping latin characters in UTF-8. Why did I want 154 //to return false here again? 155 156 //if (c < 0x20) return false; if (c < 0x80) return true; 157 //if (c < 0xFF) return false; if (c <= 0xD7FF) return true; 158 159 if (c < 0x20) return false; if (c <= 0xD7FF) return true; 160 if (c < 0xE000) return false; if (c <= 0xFFFD) return true; 161 if (c < 0x10000) return false; if (c <= 0x10FFFF) return true; 162 163 return false; 164 165 } 166 167 }