/*
 * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons-sandbox//xmlio/src/java/org/apache/commons/xmlio/in/SimpleImporter.java,v 1.1 2004/10/08 11:56:20 ozeigermann Exp $
 * $Revision: 155476 $
 * $Date: 2005-02-26 13:31:24 +0000 (Sat, 26 Feb 2005) $
 *
 * ====================================================================
 *
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.apache.commons.xmlio.in;

import java.io.*;
import java.util.*;
import java.net.*;

import org.xml.sax.*;
import org.xml.sax.helpers.*;
import javax.xml.parsers.*;

/**
 * <b>Simple</b> and <b>fast</b> importer for XML configuration or import files. <br>
 * <br>
 * It is based on SAX and can be considered an extension to it. This means it is
 * callback oriented and does not build an internal data structure like the DOM.
 * While SAX is simple, fast, and memory friendly it might be a bit too
 * rudimentary for most tasks. <code>SimpleImporter</code> adds more high level
 * means for importing XML while preserving the SAX's benefits. <br>
 * <br>
 * As with SAX you register a callback handler ({@link SimpleImportHandler})
 * that is called upon events. Consider the following example implementation
 * of a {@link SimpleImportHandler}:<br><br>
 * <code><pre>
 * public class DemoHandler implements SimpleImportHandler {
 *     public void startDocument() { }
 *     public void endDocument() { }
 *
 *     public void cData(SimplePath path, String cdata) { }
 *
 *     public void startElement(SimplePath path, String name, AttributesImpl attributes, String leadingCDdata) {
 *         if (path.matches("/root/interesting-element")) {
 *             System.out.println(leadingCDdata);
 *         }
 *     }
 *     public void endElement(SimplePath path, String name) { }
 *
 * }
 * </pre></code>
 *
 * Registering this class with {@link #addSimpleImportHandler} and calling
 * {@link #parse} on an input stream or {@link #parseUrlOrFile} will dump
 * the leading text of the element matching the path ({@link SimplePath})
 * "/root/interesting-element".<br>
 * <br>
 * <em>Note</em>: {@link #parse} and {@link #parseUrlOrFile} are synchronized, so a single
 * instance runs at most one parse at a time, and the handler list is guarded by its own lock.
 * The property setters are not synchronized; configure the importer before starting a parse.
 *
 */
public class SimpleImporter {

    // properties
    private boolean trimContent = true;
    private boolean makeCopy = false;
    private boolean zeroLengthIsNull = true;
    private boolean includeLeadingCDataIntoStartElementCallback = true;
    private boolean fullDebug = false;
    private boolean useQName = true;
    private boolean buildComplexPath = false;

    protected SAXParserFactory factory;

    protected List callbackHandlerList = new ArrayList();

    // internal state
    protected StringBuffer currentMixedPCData = null;
    protected boolean foundMixedPCData = false;
    // the first (leading) CDATA is exactly the part between a start tag
    // and any other tagging
    protected StringBuffer firstPCData = null;
    protected boolean isFirstPCData = true;

    // remember start element for later flushing
    protected ParseElement currentElement = null;

    protected PathStack parseStack = new PathStack();

    protected String debugBuffer = null;

    /** Creates a new SimpleImporter object having default property settings. It is recommended
     * to set all properties explicitly for clarity.
     * <br>
     * The SAX parser factory is obtained from {@link SAXParserFactory#newInstance()} and is left
     * at its defaults apart from namespace awareness, which {@link #parse} sets.
     * NOTE(review): external entity / DTD handling is therefore whatever the parser defaults to;
     * harden {@link #factory} before parsing untrusted XML.
     */
    public SimpleImporter() {
        factory = SAXParserFactory.newInstance();
    }

    /** Determines if we have found any mixed content while parsing.
     * The flag is set when character data is seen after an end tag and is never cleared by this class.
     */
    public boolean getFoundMixedPCData() {
        return foundMixedPCData;
    }

    /**
     * Determines if the path shall be assembled of the full qualified names. <code>true</code> is the default.
     */
    public boolean getUseQName() {
        return useQName;
    }

    /**
     * Sets if the path shall be assembled of the full qualified names. <code>true</code> is the default.
     */
    public void setUseQName(boolean useQName) {
        this.useQName = useQName;
    }

    /**
     * Determines if the simple path created will have complex additional info.
     */
    public boolean getBuildComplexPath() {
        return buildComplexPath;
    }

    /**
     * Sets if the simple path created will have complex additional info.
     * Do not change this while a parse is running: the element stack stores a different
     * element type depending on this flag and casts according to its current value.
     */
    public void setBuildComplexPath(boolean buildComplexPath) {
        this.buildComplexPath = buildComplexPath;
    }

    /** Sets the full debug mode which enables us to get the parsed stream
     * as string via the {@link #getParsedStreamForDebug()}
     * method even if an error occurred.
     */
    public void setFullDebugMode(boolean fullDebug) {
        this.fullDebug = fullDebug;
    }

    /** Gets the property described in
     * {@link #setFullDebugMode}.
     */
    public boolean getFullDebugMode() {
        return fullDebug;
    }

    /** Gets the whole stream parsed in the {@link #parse} method. As this requires some actions
     * significantly slowing down the whole parse, this only works if it has been enabled
     * by the {@link #setFullDebugMode} method.
     *
     * @return the buffered text of the last parse, or <code>null</code> if full debug mode is off
     *         or nothing has been buffered
     */
    public String getParsedStreamForDebug() {
        if (!getFullDebugMode()) {
            return null;
        }
        return debugBuffer;
    }

    /** Gets property telling importer to return any leading CDATA, i.e.
     * CDATA directly following a start tag before any other tagging,
     * along with the start element
     * method. If set to false leading CDATA will be returned using method
     * {@link SimpleImportHandler#cData} just like any CDATA in a mixed
     * content. <br>
     *
     * @see SimpleImportHandler#startElement
     * @see #setIncludeLeadingCDataIntoStartElementCallback
     */
    public boolean getIncludeLeadingCDataIntoStartElementCallback() {
        return includeLeadingCDataIntoStartElementCallback;
    }

    /** Sets the property described in
     * {@link #getIncludeLeadingCDataIntoStartElementCallback}.
     */
    public void setIncludeLeadingCDataIntoStartElementCallback(boolean includeLeadingCDataIntoStartElementCallback) {
        this.includeLeadingCDataIntoStartElementCallback = includeLeadingCDataIntoStartElementCallback;
    }

    /** Gets the property described in
     * {@link #setTrimContent}.
     */
    public boolean getTrimContent() {
        return trimContent;
    }

    /** Sets whether all content shall be trimmed.
     * If set in conjunction with {@link #setZeroLengthIsNull} all whitespace data will not be
     * reported to callback handlers.
     */
    public void setTrimContent(boolean trimContent) {
        this.trimContent = trimContent;
    }

    /** Gets property: When finding zero length content should it be treated as null data?
     * If it is treated as null data nothing is reported to handlers when finding zero length data.
     */
    public boolean getZeroLengthIsNull() {
        return zeroLengthIsNull;
    }

    /** Sets the property described in
     * {@link #getZeroLengthIsNull}.
     */
    public void setZeroLengthIsNull(boolean zeroLengthIsNull) {
        this.zeroLengthIsNull = zeroLengthIsNull;
    }

    /** Gets the property describing if every callback handler gets a fresh copy of the parsed data.
     * This is only important when there is more than one callback handler. If so and it is not set,
     * all handlers will get <em>identical</em> objects. This is bad if you expect them to change any
     * of that data.
     */
    public boolean getMakeCopy() {
        return makeCopy;
    }

    /** Sets the property described in {@link #getMakeCopy}. */
    public void setMakeCopy(boolean makeCopy) {
        this.makeCopy = makeCopy;
    }

    /** Adds a new callback handler if it is not in the callback list, yet.
     * This can be dynamically done while parsing; a handler added from within a callback
     * starts receiving events with the next event.
     * @see #removeSimpleImportHandler
     */
    public void addSimpleImportHandler(SimpleImportHandler callbackHandler) {
        synchronized (callbackHandlerList) {
            if (!callbackHandlerList.contains(callbackHandler)) {
                callbackHandlerList.add(callbackHandler);
            }
        }
    }

    /** Removes a callback handler if it is in the callback list.
     * This can be dynamically done while parsing; a handler removed from within a callback
     * may still receive the event that is currently being dispatched.
     * @see #addSimpleImportHandler
     */
    public void removeSimpleImportHandler(SimpleImportHandler callbackHandler) {
        synchronized (callbackHandlerList) {
            callbackHandlerList.remove(callbackHandler);
        }
    }

    /** Tries to parse the file or URL named by parameter <code>urlOrFileName</code>.
     * First it tries to open it as URL, if this does not work, it tries to open it as file.
     * If one option works, {@link #parse} will be called with the opened input stream, and the
     * stream is closed afterwards, whether or not the parse succeeded.
     * If neither works, an exception is thrown.
     *
     * @param urlOrFileName URL or file name of the XML document
     * @throws SimpleImporterException if the name can be opened neither as URL nor as file
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the document can not be parsed
     * @throws IOException if reading the document fails
     * @see #parse
     */
    public synchronized void parseUrlOrFile(String urlOrFileName)
        throws ParserConfigurationException, SAXException, IOException, SimpleImporterException {
        Throwable urlException = null;
        Throwable fileException = null;
        InputStream in = null;
        try {
            URL url = new URL(urlOrFileName);
            URLConnection urlConnection = url.openConnection();
            in = urlConnection.getInputStream();
        } catch (MalformedURLException mue) {
            urlException = mue;
        } catch (IOException ioe) {
            urlException = ioe;
        }

        // Only fall back to the file system when the URL attempt failed; otherwise a
        // successfully opened URL stream would be replaced and leaked.
        if (in == null) {
            try {
                in = new FileInputStream(urlOrFileName);
            } catch (IOException ioe) {
                fileException = ioe;
            }
        }

        if (in == null) {
            // both attempts failed, so both exceptions are set here
            throw new SimpleImporterException(
                "Could not parse "
                    + urlOrFileName
                    + ", is neither URL ("
                    + urlException.getMessage()
                    + ") nor file ("
                    + fileException.getMessage()
                    + ").");
        }

        try {
            parse(new InputSource(new BufferedInputStream(in)));
        } finally {
            // we opened the stream, so we release it; a failure to close must not
            // hide the outcome of the parse
            try {
                in.close();
            } catch (IOException ignored) {
                // nothing sensible left to do
            }
        }
    }

    /** Parses the input source using the standard SAX parser and calls back the callback handlers.
     * If enabled with {@link #setFullDebugMode} the source will be verbosely copied first.<br>
     *<br>
     * All per-parse state is reset first, so a previous parse that ended with an exception
     * does not influence this one.<br>
     *<br>
     * <em>Note</em>: This method is synchronized, so you can not have two concurrent parses.
     *
     * @param is the source to read the XML document from
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the document can not be parsed
     * @throws IOException if reading the document fails
     */
    public synchronized void parse(InputSource is) throws ParserConfigurationException, SAXException, IOException {
        firstPCData = null;
        currentMixedPCData = null;
        currentElement = null;
        isFirstPCData = true;
        parseStack = new PathStack();
        debugBuffer = null;

        factory.setNamespaceAware(!useQName || buildComplexPath);
        SAXParser parser = factory.newSAXParser();
        InputSource source = getFullDebugMode() ? bufferParserStream(is) : is;
        parser.parse(source, new SAXHandler());
    }

    /**
     * Reads the whole input into memory, stores its text in {@link #debugBuffer} and returns
     * a replacement source the parser can read from.
     * <br>
     * A byte stream is handed to the parser as the very same bytes (together with the declared
     * encoding), so encoding detection works exactly as without debug mode. Public and system
     * id are carried over. If the source has neither character nor byte stream there is
     * nothing to buffer and the source is returned unchanged.
     */
    private InputSource bufferParserStream(InputSource is) throws IOException {
        InputSource buffered;
        Reader characterStream = is.getCharacterStream();
        if (characterStream != null) {
            debugBuffer = readLines(characterStream);
            buffered = new InputSource(new StringReader(debugBuffer));
        } else {
            InputStream byteStream = is.getByteStream();
            if (byteStream == null) {
                // only a system id is given; let the parser resolve it on its own
                return is;
            }
            byte[] rawBytes = readBytes(byteStream);
            String encoding = is.getEncoding();
            Reader decoder;
            if (encoding != null) {
                decoder = new InputStreamReader(new ByteArrayInputStream(rawBytes), encoding);
            } else {
                decoder = new InputStreamReader(new ByteArrayInputStream(rawBytes));
            }
            debugBuffer = readLines(decoder);
            buffered = new InputSource(new ByteArrayInputStream(rawBytes));
            buffered.setEncoding(encoding);
        }
        buffered.setPublicId(is.getPublicId());
        buffered.setSystemId(is.getSystemId());
        return buffered;
    }

    // reads all lines from the reader, terminating each of them with a single '\n'
    private static String readLines(Reader reader) throws IOException {
        BufferedReader bufferedReader;
        if (reader instanceof BufferedReader) {
            bufferedReader = (BufferedReader) reader;
        } else {
            bufferedReader = new BufferedReader(reader);
        }
        StringBuffer buf = new StringBuffer();
        for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
            buf.append(line).append('\n');
        }
        return buf.toString();
    }

    // reads the stream until its end and returns everything read
    private static byte[] readBytes(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int read;
        while ((read = in.read(chunk)) != -1) {
            out.write(chunk, 0, read);
        }
        return out.toByteArray();
    }

    // Copies the registered handlers under the list lock. Callbacks iterate the copy, so
    // handlers may add or remove handlers from within a callback without breaking the iteration.
    private SimpleImportHandler[] snapshotHandlers() {
        synchronized (callbackHandlerList) {
            return (SimpleImportHandler[]) callbackHandlerList.toArray(
                new SimpleImportHandler[callbackHandlerList.size()]);
        }
    }

    // builds the path of the element currently on top of the parse stack
    private SimplePath currentPath() {
        if (buildComplexPath) {
            List items = parseStack.getPathList();
            return new SimplePath(parseStack.getPath(), (Item[]) items.toArray(new Item[items.size()]));
        }
        return new SimplePath(parseStack.getPath());
    }

    // chooses the element name to report: local name unless qualified names are wanted and available
    private String elementName(String localName, String qName) {
        if (!useQName || qName == null || qName.length() == 0) {
            return localName;
        }
        return qName;
    }

    // callback handlers with start element method when there is data
    private void callBackStartElementWhenReady() {
        ParseElement element = currentElement;
        if (element == null) {
            return;
        }

        String content = getFirstPCData();
        SimplePath path;
        if (buildComplexPath) {
            List items = element.pathList;
            path = new SimplePath(element.path, (Item[]) items.toArray(new Item[items.size()]));
        } else {
            path = new SimplePath(element.path);
        }

        SimpleImportHandler[] handlers = snapshotHandlers();
        for (int i = 0; i < handlers.length; i++) {
            if (getMakeCopy()) {
                // string is constant any way, no need to make a copy
                handlers[i].startElement(
                    new SimplePath(path),
                    element.name,
                    new AttributesImpl(element.attributes),
                    content);
            } else {
                handlers[i].startElement(path, element.name, element.attributes, content);
            }
        }

        firstPCData = null;
        currentElement = null;
    }

    // routes character data either to the leading text of the pending start element
    // or to the buffer for ordinary (mixed) character data
    private void sendCharacters(String text) {
        if (text == null) {
            return;
        }

        if (isFirstPCData) {
            if (includeLeadingCDataIntoStartElementCallback) {
                addToFirstPCData(text);
            } else {
                sendCData(text);
            }
        } else {
            foundMixedPCData = true;
            sendCData(text);
        }
    }

    // flushes the pending start element and then the buffered character data, if any
    private void callBackCDATAWhenReady() {
        callBackStartElementWhenReady();
        if (currentMixedPCData == null) {
            return;
        }
        String text = trimPCData(currentMixedPCData.toString());
        // the buffer is consumed now, no matter if there is anything left to report
        currentMixedPCData = null;
        if (text == null) {
            return;
        }

        SimplePath path = currentPath();
        SimpleImportHandler[] handlers = snapshotHandlers();
        for (int i = 0; i < handlers.length; i++) {
            if (getMakeCopy()) {
                // string is constant any way, no need to make a copy
                handlers[i].cData(new SimplePath(path), text);
            } else {
                handlers[i].cData(path, text);
            }
        }
    }

    // send normal (not leading) CDATA to handlers
    private void sendCData(String text) {
        // defer sending it until we have a maximum chunk, i.e. until
        // next tagging occurs
        if (currentMixedPCData == null) {
            currentMixedPCData = new StringBuffer(text.length());
        }
        currentMixedPCData.append(text);
    }

    // collects leading text that will be passed along with the start element callback
    private void addToFirstPCData(String text) {
        if (firstPCData == null) {
            firstPCData = new StringBuffer(text.length());
        }
        firstPCData.append(text);
    }

    // returns the collected leading text after trimming, or null if there is none
    private String getFirstPCData() {
        if (firstPCData == null) {
            return null;
        }
        return trimPCData(firstPCData.toString());
    }

    // trim text depending on settings of properties
    private String trimPCData(String pcData) {
        if (pcData == null) {
            return null;
        }
        if (getTrimContent()) {
            pcData = pcData.trim();
        }
        if (pcData.length() == 0 && getZeroLengthIsNull()) {
            return null;
        }
        return pcData;
    }

    // used to temporarily save an element
    private final static class ParseElement {
        public String name, path;
        public List pathList;
        public AttributesImpl attributes;

        public ParseElement(String name, String path, List pathList, AttributesImpl attributes) {
            this.name = name;
            this.path = path;
            this.attributes = attributes;
            this.pathList = pathList;
        }
    }

    // Translates SAX events into SimpleImportHandler callbacks. Start element callbacks are
    // deferred until the next event so that leading text can be delivered together with them.
    private final class SAXHandler extends DefaultHandler {
        public void startDocument() {
            SimpleImportHandler[] handlers = snapshotHandlers();
            for (int i = 0; i < handlers.length; i++) {
                handlers[i].startDocument();
            }
        }

        public void endDocument() {
            // flush any pending start elements and character data, as now the show is over
            callBackStartElementWhenReady();
            callBackCDATAWhenReady();
            SimpleImportHandler[] handlers = snapshotHandlers();
            for (int i = 0; i < handlers.length; i++) {
                handlers[i].endDocument();
            }
        }

        public void characters(char ch[], int start, int length) {
            if (length < 1) {
                return;
            }
            sendCharacters(new String(ch, start, length));
        }

        public void endElement(String namespaceURI, String localName, String qName) {
            // be sure to have any pending start elements and character data flushed before
            // sending end tag to keep right sequence of callbacks
            callBackStartElementWhenReady();
            callBackCDATAWhenReady();
            String name = elementName(localName, qName);

            SimplePath path = currentPath();
            SimpleImportHandler[] handlers = snapshotHandlers();
            for (int i = 0; i < handlers.length; i++) {
                if (getMakeCopy()) {
                    // string is constant any way, no need to make a copy
                    handlers[i].endElement(new SimplePath(path), name);
                } else {
                    handlers[i].endElement(path, name);
                }
            }

            // this must never be
            if (parseStack.empty()) {
                throw new SimpleImporterException("Umatchted end tag: " + name);
            }
            Object top = parseStack.peek();
            String topName;
            if (buildComplexPath) {
                topName = ((Item) top).getName();
            } else {
                topName = (String) top;
            }
            if (!name.equals(topName)) {
                throw new SimpleImporterException(
                    "End tag " + name + " does not match start tag " + top);
            }
            parseStack.pop();

            // any CDATA following can't be leading
            isFirstPCData = false;
        }

        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
            // be sure to have any pending start elements and character data flushed before
            // opening a new one to keep right sequence of callbacks
            callBackStartElementWhenReady();
            callBackCDATAWhenReady();
            String name = elementName(localName, qName);
            parseStack.push(namespaceURI, name);
            // Defer callback to handlers as it is not clear now how
            // much (if any) CDATA has to be passed over with start element method.
            // The attributes are copied as the parser's object is only read during this call.
            AttributesImpl attributesCopy = new AttributesImpl(atts);
            currentElement = new ParseElement(name, parseStack.getPath(), parseStack.getPathList(), attributesCopy);
            // Any CDATA (can be more than one SAX event) following is leading
            // until next tag. Actually it is sufficient to switch this off
            // in end tag not in start tag, as it would be turned on again
            // immediately.
            isFirstPCData = true;
        }
    }

    // Notion of a stack representing a path. Holds plain names, or Item objects
    // when complex paths are built.
    private final class PathStack {

        private List pathStack;

        public PathStack(int initialCapacity) {
            pathStack = new ArrayList(initialCapacity);
        }

        public PathStack() {
            pathStack = new ArrayList();
        }

        // renders the stack as "/name1/name2/"; an empty stack gives "/"
        public String getPath() {
            StringBuffer path = new StringBuffer(100);
            // this is always there as root
            path.append('/');
            for (Iterator it = pathStack.iterator(); it.hasNext();) {
                Object element = it.next();
                String pathElement;
                if (buildComplexPath) {
                    pathElement = ((Item) element).getName();
                } else {
                    pathElement = (String) element;
                }
                path.append(pathElement).append('/');
            }
            return path.toString();
        }

        // returns the live backing list, not a copy
        public List getPathList() {
            return pathStack;
        }

        public String toString() {
            return getPath();
        }

        public void push(String namespaceURI, String name) {
            if (buildComplexPath) {
                pathStack.add(new Item(name, namespaceURI));
            } else {
                pathStack.add(name);
            }
        }

        public int size() {
            return pathStack.size();
        }

        public boolean empty() {
            return pathStack.isEmpty();
        }

        // returns the top element without removing it, or null if the stack is empty
        public Object peek() {
            int size = pathStack.size();
            if (size == 0) {
                return null;
            }
            return pathStack.get(size - 1);
        }

        // removes and returns the top element, or returns null if the stack is empty
        public Object pop() {
            int size = pathStack.size();
            if (size == 0) {
                return null;
            }
            return pathStack.remove(size - 1);
        }

    }
}