001 /*
002 * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons-sandbox//xmlio/src/java/org/apache/commons/xmlio/in/SimpleImporter.java,v 1.1 2004/10/08 11:56:20 ozeigermann Exp $
003 * $Revision: 155476 $
004 * $Date: 2005-02-26 13:31:24 +0000 (Sat, 26 Feb 2005) $
005 *
006 * ====================================================================
007 *
008 * Copyright 2004 The Apache Software Foundation
009 *
010 * Licensed under the Apache License, Version 2.0 (the "License");
011 * you may not use this file except in compliance with the License.
012 * You may obtain a copy of the License at
013 *
014 * http://www.apache.org/licenses/LICENSE-2.0
015 *
016 * Unless required by applicable law or agreed to in writing, software
017 * distributed under the License is distributed on an "AS IS" BASIS,
018 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
019 * See the License for the specific language governing permissions and
020 * limitations under the License.
021 *
022 */
023
024 package org.apache.commons.xmlio.in;
025
026 import java.io.*;
027 import java.util.*;
028 import java.net.*;
029
030 import org.xml.sax.*;
031 import org.xml.sax.helpers.*;
032 import javax.xml.parsers.*;
033
034 /**
035 * <b>Simple</b> and <b>fast</b> importer for XML configuration or import files. <br>
036 * <br>
037 * It is based on SAX and can be considered an extension to it. This means it is
038 * callback oriented and does not build an internal data structure like the DOM.
039 * While SAX is simple, fast, and memory friendly it might be a bit too
040 * rudimentary for most tasks. <code>SimpleImporter</code> adds more high level
041 * means for importing XML while preserving the SAX's benefits. <br>
042 * <br>
043 * As with SAX you register a callback handler ({@link SimpleImportHandler})
044 * that is called upon events. Consider the following example implementation
045 * of a {@link SimpleImportHandler}:<br><br>
046 * <code><pre>
047 * public class DemoHandler implements SimpleImportHandler {
048 * public void startDocument() { }
049 * public void endDocument() { }
050 *
051 * public void cData(SimplePath path, String cdata) { }
052 *
053 * public void startElement(SimplePath path, String name, AttributesImpl attributes, String leadingCDdata) {
054 * if (path.matches("/root/interesting-element")) {
055 * System.out.println(leadingCDdata);
056 * }
057 * }
058 * public void endElement(SimplePath path, String name) { }
059 *
060 * }
061 * </pre></code>
062 *
 * Registering an instance of this class with {@link #addSimpleImportHandler} and then calling
 * {@link #parse} on an input source (or {@link #parseUrlOrFile} on a URL or file name) will dump
 * the leading text of the element matching the path ({@link SimplePath})
 * "/root/interesting-element".<br>
067 * <br>
068 * <em>Note</em>: This class is thread safe.
069 *
070 */
071 public class SimpleImporter {
072
073 // properties
074 private boolean trimContent = true;
075 private boolean makeCopy = false;
076 private boolean zeroLengthIsNull = true;
077 private boolean includeLeadingCDataIntoStartElementCallback = true;
078 private boolean fullDebug = false;
079 private boolean useQName = true;
080 private boolean buildComplexPath = false;
081
082 protected SAXParserFactory factory;
083
084 protected List callbackHandlerList = new ArrayList();
085
086 // internal state
087 protected StringBuffer currentMixedPCData = null;
088 protected boolean foundMixedPCData = false;
089 // the first (leading) CDATA is exacly the part between a start tag
090 // and any other tagging
091 protected StringBuffer firstPCData = null;
092 protected boolean isFirstPCData = true;
093
094 // remember start element for later flushing
095 protected ParseElement currentElement = null;
096
097 protected PathStack parseStack = new PathStack();
098
099 protected String debugBuffer = null;
100
/**
 * Creates a new SimpleImporter with default property settings and a
 * {@link SAXParserFactory} obtained via {@link SAXParserFactory#newInstance()}.
 * Setting all properties explicitly is recommended for clarity.
 */
public SimpleImporter() {
    this.factory = SAXParserFactory.newInstance();
}
107
108 /** Determines if we have found any mixed content while parsing. */
109 public boolean getFoundMixedPCData() {
110 return foundMixedPCData;
111 }
112
113 /**
114 * Determines if the path shall be assembled of the full qualified names. <code>true</code> is the default.
115 */
116 public boolean getUseQName() {
117 return useQName;
118 }
119
120 /**
121 * Sets if the path shall be assembled of the full qualified names. <code>true</code> is the default.
122 */
123 public void setUseQName(boolean useQName) {
124 this.useQName = useQName;
125 }
126
127 /**
128 * Determines if the simple path created will have complex additional info.
129 */
130 public boolean getBuildComplexPath() {
131 return buildComplexPath;
132 }
133
134 /**
135 * Sets if the simple path created will have complex additional info.
136 */
137 public void setBuildComplexPath(boolean buildComplexPath) {
138 this.buildComplexPath = buildComplexPath;
139 }
140
141 /** Sets the full debug mode which enables us to get the parsed stream
142 * as string via the {@link #getParsedStreamForDebug()}
143 * method even if an error occured.
144 */
145 public void setFullDebugMode(boolean fullDebug) {
146 this.fullDebug = fullDebug;
147 }
148
149 /** Gets the property described in
150 * {@link #setFullDebugMode}.
151 */
152 public boolean getFullDebugMode() {
153 return fullDebug;
154 }
155
156 /** Gets the whole stream parsed in the {@link #parse} method. As this requires some actions
157 * significantly slowing down the whole parse, this only works if it has been enabled
158 * by the the {@link #setFullDebugMode} method.
159 */
160 public String getParsedStreamForDebug() {
161 if (!getFullDebugMode()) {
162 return null;
163 } else {
164 return debugBuffer;
165 }
166 }
167
168 /** Gets property telling importer to return any leading CDATA, i.e.
169 * CDATA directly following a start tag before any other tagging,
170 * along with the start element
171 * method. If set to false leading CDATA will be returned using method
172 * {@link SimpleImportHandler#cData} just like any CDATA in a mixed
173 * content. <br>
174 *
175 * @see SimpleImportHandler#startElement
176 * @see #setIncludeLeadingCDataIntoStartElementCallback
177 */
178 public boolean getIncludeLeadingCDataIntoStartElementCallback() {
179 return includeLeadingCDataIntoStartElementCallback;
180 }
181
182 /** Sets the property described in
183 * {@link #getIncludeLeadingCDataIntoStartElementCallback}.
184 */
185 public void setIncludeLeadingCDataIntoStartElementCallback(boolean includeLeadingCDataIntoStartElementCallback) {
186 this.includeLeadingCDataIntoStartElementCallback = includeLeadingCDataIntoStartElementCallback;
187 }
188
189 /** Sets the property described in
190 * {@link #setTrimContent}.
191 */
192 public boolean getTrimContent() {
193 return trimContent;
194 }
195
196 /** Sets when all content shall be trimed.
197 * If set in conjunction with {@link #setZeroLengthIsNull} all whitespace data will not be
198 * reported to callback handlers.
199 */
200 public void setTrimContent(boolean trimContent) {
201 this.trimContent = trimContent;
202 }
203
204 /** Gets property: When findind zero length content should it be treated as null data?
205 * If it is treated as null data nothing is reported to handlers when finding zero length data.
206 */
207 public boolean getZeroLengthIsNull() {
208 return zeroLengthIsNull;
209 }
210
211 /** Sets the property described in
212 * {@link #getZeroLengthIsNull}.
213 */
214 public void setZeroLengthIsNull(boolean zeroLengthIsNull) {
215 this.zeroLengthIsNull = zeroLengthIsNull;
216 }
217
218 /** Gets the property describing if every callback handler gets a fresh copy of the parsed data.
219 * This is only important when there is more than one callback handler. If so and it is not set,
220 * all handlers will get <em>identical</em> objects. This is bad if you expect them to change any
221 * of that data.
222 */
223 public boolean getMakeCopy() {
224 return makeCopy;
225 }
226
227 /** Sets the property described in {@link #getMakeCopy}. */
228 public void setMakeCopy(boolean makeCopy) {
229 this.makeCopy = makeCopy;
230 }
231
232 /** Adds a new callback handler if it is not in the callback list, yet.
233 * This can be dynamically done while parsing.
234 * @see #removeSimpleImportHandler
235 */
236 public void addSimpleImportHandler(SimpleImportHandler callbackHandler) {
237 synchronized (callbackHandlerList) {
238 if (!callbackHandlerList.contains(callbackHandler)) {
239 callbackHandlerList.add(callbackHandler);
240 }
241 }
242 }
243
244 /** Removes a callback handler if it is in the callback list.
245 * This can be dynamically done while parsing.
246 * @see #addSimpleImportHandler
247 */
248 public void removeSimpleImportHandler(SimpleImportHandler callbackHandler) {
249 synchronized (callbackHandlerList) {
250 callbackHandlerList.remove(callbackHandler);
251 }
252 }
253
254 /** Tries to parse the file or URL named by parameter <code>urlOrFileName</code>.
255 * First it tries to parse it as URL, if this does not work, it tries to parse it as file.
256 * If one option works, an input stream will be opened and {@link #parse} will be called with it.
257 * If both does not work, an exception is thrown.
258 *
259 * @see #parse
260 */
261 public synchronized void parseUrlOrFile(String urlOrFileName)
262 throws ParserConfigurationException, SAXException, IOException, SimpleImporterException {
263 Throwable urlException = null;
264 Throwable fileException = null;
265 InputStream in = null;
266 try {
267 URL url = new URL(urlOrFileName);
268 URLConnection urlConnection = url.openConnection();
269 in = urlConnection.getInputStream();
270 } catch (MalformedURLException mue) {
271 urlException = mue;
272 } catch (IOException ioe) {
273 urlException = ioe;
274 }
275
276 try {
277 in = new FileInputStream(urlOrFileName);
278 } catch (IOException ioe) {
279 fileException = ioe;
280 }
281
282 if (in != null) {
283 parse(new InputSource(new BufferedInputStream(in)));
284 } else {
285 throw new SimpleImporterException(
286 "Could not parse "
287 + urlOrFileName
288 + ", is neither URL ("
289 + urlException.getMessage()
290 + ") nor file ("
291 + fileException.getMessage()
292 + ").");
293 }
294 }
295
296 /** Parses the input source using the standard SAX parser and calls back the callback handlers.
297 * If enabled with {@link #setFullDebugMode} the source will be verbosely copied first.<br>
298 *<br>
299 * <em>Note</em>: This method is synchronized, so you can not have two concurrent parses.
300 */
301 public synchronized void parse(InputSource is) throws ParserConfigurationException, SAXException, IOException {
302 firstPCData = null;
303 currentElement = null;
304 factory.setNamespaceAware(!useQName || buildComplexPath);
305 SAXParser parser = factory.newSAXParser();
306 if (getFullDebugMode()) {
307 InputSource preReadIn = bufferParserStream(is);
308 parser.parse(preReadIn, new SAXHandler());
309 } else {
310 parser.parse(is, new SAXHandler());
311 }
312 }
313
314 private InputSource bufferParserStream(InputSource is) throws IOException {
315 StringBuffer buf = new StringBuffer();
316 Reader reader;
317 BufferedReader bufferedReader;
318 if (is.getCharacterStream() != null) {
319 reader = is.getCharacterStream();
320 } else {
321 String encoding = is.getEncoding();
322 if (encoding != null) {
323 reader = new InputStreamReader(is.getByteStream(), encoding);
324 } else {
325 reader = new InputStreamReader(is.getByteStream());
326 }
327 }
328 if (reader instanceof BufferedReader) {
329 bufferedReader = (BufferedReader) reader;
330 } else {
331 bufferedReader = new BufferedReader(reader);
332 }
333
334 while (true) {
335 String line = bufferedReader.readLine();
336 if (line == null) {
337 break;
338 } else {
339 buf.append(line).append('\n');
340 }
341 }
342 debugBuffer = buf.toString();
343 return new InputSource(new StringReader(debugBuffer));
344 }
345
346 // callback handlers with start element method when there is data
347 private void callBackStartElementWhenReady() {
348 if (currentElement != null) {
349 String content = getFirstPCData();
350 SimplePath path;
351 if (buildComplexPath) {
352 path =
353 new SimplePath(
354 currentElement.path,
355 (Item[]) currentElement.pathList.toArray(new Item[currentElement.pathList.size()]));
356 } else {
357 path = new SimplePath(currentElement.path);
358
359 }
360
361 synchronized (callbackHandlerList) {
362 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
363 SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
364 if (getMakeCopy()) {
365 // string is constant any way, no need to make a copy
366 callbackHandler.startElement(
367 new SimplePath(path),
368 currentElement.name,
369 new AttributesImpl(currentElement.attributes),
370 content);
371 } else {
372 callbackHandler.startElement(path, currentElement.name, currentElement.attributes, content);
373 }
374 }
375 }
376
377 firstPCData = null;
378 currentElement = null;
379 }
380 }
381
382 private void sendCharacters(String text) {
383 if (text == null)
384 return;
385
386 if (isFirstPCData) {
387 if (includeLeadingCDataIntoStartElementCallback) {
388 addToFirstPCData(text);
389 } else {
390 sendCData(text);
391 }
392 } else {
393 foundMixedPCData = true;
394 sendCData(text);
395 }
396 }
397
398 private void callBackCDATAWhenReady() {
399 callBackStartElementWhenReady();
400 if (currentMixedPCData == null) {
401 return;
402 }
403 String text = currentMixedPCData.toString();
404 text = trimPCData(text);
405 if (text == null) {
406 return;
407 }
408
409 SimplePath path;
410 if (buildComplexPath) {
411 path =
412 new SimplePath(
413 parseStack.getPath(),
414 (Item[]) parseStack.getPathList().toArray(new Item[parseStack.getPathList().size()]));
415 } else {
416 path = new SimplePath(parseStack.getPath());
417
418 }
419
420 synchronized (callbackHandlerList) {
421 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
422 SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
423 if (getMakeCopy()) {
424 // string is constant any way, no need to make a copy
425 callbackHandler.cData(new SimplePath(path), text);
426 } else {
427 callbackHandler.cData(path, text);
428 }
429 }
430 }
431 currentMixedPCData = null;
432 }
433
434 // send normal (not leading) CDATA to handlers
435 private void sendCData(String text) {
436 // defer sending it until we have a maximum chunck, i.e. until
437 // next tagging occurs
438 if (currentMixedPCData == null) {
439 currentMixedPCData = new StringBuffer(text.length());
440 }
441 currentMixedPCData.append(text);
442 }
443
444 private void addToFirstPCData(String text) {
445 if (firstPCData == null) {
446 firstPCData = new StringBuffer(text.length());
447 }
448 firstPCData.append(text);
449 }
450
451 private String getFirstPCData() {
452 if (firstPCData == null) {
453 return null;
454 } else {
455 String text = firstPCData.toString();
456 return trimPCData(text);
457 }
458 }
459
460 // trim text depending on settings of properties
461 private String trimPCData(String pcData) {
462 if (pcData == null) {
463 return null;
464 } else {
465 if (getTrimContent()) {
466 pcData = pcData.trim();
467 }
468 if (pcData.length() == 0 && getZeroLengthIsNull()) {
469 return null;
470 } else {
471 return pcData;
472 }
473 }
474 }
475
476 // use to temporarily save a an element
477 private final static class ParseElement {
478 public String name, path;
479 public List pathList;
480 public AttributesImpl attributes;
481
482 public ParseElement(String name, String path, List pathList, AttributesImpl attributes) {
483 this.name = name;
484 this.path = path;
485 this.attributes = attributes;
486 this.pathList = pathList;
487 }
488 }
489
490 private final class SAXHandler extends DefaultHandler {
491 public void startDocument() {
492 synchronized (callbackHandlerList) {
493 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
494 SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
495 callbackHandler.startDocument();
496 }
497 }
498 }
499
500 public void endDocument() {
501 // flush any pending start elements and character data, as now the show is over
502 callBackStartElementWhenReady();
503 callBackCDATAWhenReady();
504 synchronized (callbackHandlerList) {
505 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
506 SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
507 callbackHandler.endDocument();
508 }
509 }
510 }
511
512 public void characters(char ch[], int start, int length) {
513 if (length < 1)
514 return;
515 String text = new String(ch, start, length);
516 sendCharacters(text);
517 }
518
519 public void endElement(String namespaceURI, String localName, String qName) {
520 // be sure to have any pending start elements and character data flushed before
521 // sending end tag to keep right sequence of callbacks
522 callBackStartElementWhenReady();
523 callBackCDATAWhenReady();
524 String name;
525 if (!useQName || qName == null || qName.length() == 0) {
526 name = localName;
527 } else {
528 name = qName;
529 }
530
531 SimplePath path;
532 if (buildComplexPath) {
533 path =
534 new SimplePath(
535 parseStack.getPath(),
536 (Item[]) parseStack.getPathList().toArray(new Item[parseStack.getPathList().size()]));
537 } else {
538 path = new SimplePath(parseStack.getPath());
539
540 }
541
542 synchronized (callbackHandlerList) {
543 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
544 SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
545 if (getMakeCopy()) {
546 // string is constant any way, no need to make a copy
547 callbackHandler.endElement(new SimplePath(path), name);
548 } else {
549 callbackHandler.endElement(path, name);
550 }
551 }
552 }
553
554 // this must never be
555 if (parseStack.empty()) {
556 throw new SimpleImporterException("Umatchted end tag: " + name);
557 } else {
558 Object top = parseStack.peek();
559 String topName;
560 if (buildComplexPath) {
561 topName = ((Item)top).getName();
562 } else {
563 topName = (String)top;
564 }
565 if (!name.equals(topName)) {
566 throw new SimpleImporterException(
567 "End tag " + name + " does not match start tag " + top);
568 } else {
569 parseStack.pop();
570 }
571 }
572 // any CDATA following can't be leading
573 isFirstPCData = false;
574 }
575
576 public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
577 // be sure to have any pending start elements and character data flushed before
578 // opening a new one to keep right sequence of callbacks
579 callBackStartElementWhenReady();
580 callBackCDATAWhenReady();
581 String name;
582 if (!useQName || qName == null || qName.length() == 0) {
583 name = localName;
584 } else {
585 name = qName;
586 }
587 parseStack.push(namespaceURI, name);
588 // Defer callback to handlers as it is not clear now how
589 // much (if any) CDATA has to be passed over with start element method.
590 AttributesImpl attributesCopy = new AttributesImpl(atts);
591 currentElement = new ParseElement(name, parseStack.getPath(), parseStack.getPathList(), attributesCopy);
592 // Any CDATA (can be more the one SAX event) following is leading
593 // until next tag. Actually it is sufficient to switch this off
594 // in end tag not in start tag, as it would be turned on again
595 // immediately.
596 isFirstPCData = true;
597 }
598 }
599
600 // Notion of a stack representing a path.
601 private final class PathStack {
602
603 private List pathStack;
604
605 public PathStack(int initialCapacity) {
606 pathStack = new ArrayList(initialCapacity);
607 }
608
609 public PathStack() {
610 pathStack = new ArrayList();
611 }
612
613 public String getPath() {
614 StringBuffer path = new StringBuffer(100);
615 // this is always there as root
616 path.append('/');
617 for (Iterator it = pathStack.iterator(); it.hasNext();) {
618 Object element = it.next();
619 String pathElement;
620 if (buildComplexPath) {
621 pathElement = ((Item) element).getName();
622 } else {
623 pathElement = (String) element;
624 }
625 path.append(pathElement).append('/');
626 }
627 return path.toString();
628 }
629
630 public List getPathList() {
631 return pathStack;
632 }
633
634 public String toString() {
635 return getPath();
636 }
637
638 public void push(String namespaceURI, String name) {
639 if (buildComplexPath) {
640 pathStack.add(new Item(name, namespaceURI));
641 } else {
642 pathStack.add(name);
643 }
644 }
645
646 public int size() {
647 return pathStack.size();
648 }
649
650 public boolean empty() {
651 return (pathStack.size() <= 0);
652 }
653
654 public Object peek() {
655 int size = pathStack.size();
656 if (size > 0) {
657 return pathStack.get(size - 1);
658 } else {
659 return null;
660 }
661 }
662
663 public Object pop() {
664 int size = pathStack.size();
665 if (size > 0) {
666 Object o = pathStack.get(size - 1);
667 pathStack.remove(size - 1);
668 return o;
669 } else {
670 return null;
671 }
672 }
673
674 }
675 }