View Javadoc

1   /*
2    * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons-sandbox//xmlio/src/java/org/apache/commons/xmlio/in/SimpleImporter.java,v 1.1 2004/10/08 11:56:20 ozeigermann Exp $
3    * $Revision: 155476 $
4    * $Date: 2005-02-26 13:31:24 +0000 (Sat, 26 Feb 2005) $
5    *
6    * ====================================================================
7    *
8    * Copyright 2004 The Apache Software Foundation 
9    *
10   * Licensed under the Apache License, Version 2.0 (the "License");
11   * you may not use this file except in compliance with the License.
12   * You may obtain a copy of the License at
13   *
14   *     http://www.apache.org/licenses/LICENSE-2.0
15   *
16   * Unless required by applicable law or agreed to in writing, software
17   * distributed under the License is distributed on an "AS IS" BASIS,
18   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19   * See the License for the specific language governing permissions and
20   * limitations under the License.
21   *
22   */
23  
24  package org.apache.commons.xmlio.in;
25  
26  import java.io.*;
27  import java.util.*;
28  import java.net.*;
29  
30  import org.xml.sax.*;
31  import org.xml.sax.helpers.*;
32  import javax.xml.parsers.*;
33  
34  /**
35   * <b>Simple</b> and <b>fast</b> importer for XML configuration or import files. <br>
36   * <br>
37   * It is based on SAX and can be considered an extension to it. This means it is
38   * callback oriented and does not build an internal data structure like the DOM.
39   * While SAX is simple, fast, and memory friendly it might be a bit too 
40   * rudimentary for most tasks. <code>SimpleImporter</code> adds more high level
41   * means for importing XML while preserving the SAX's benefits. <br>
42   * <br>
43   * As with SAX you register a callback handler ({@link SimpleImportHandler})
44   * that is called upon events. Consider the following example implementation
45   * of a {@link SimpleImportHandler}:<br><br>
46   * <code><pre>
47   * public class DemoHandler implements SimpleImportHandler { 
48   * public void startDocument() { }
49   * public void endDocument() { }
50   * 
51   * public void cData(SimplePath path, String cdata) { }
52   * 
53   * public void startElement(SimplePath path, String name, AttributesImpl attributes, String leadingCDdata) {
54   * &nbsp;&nbsp;if (path.matches("/root/interesting-element")) {
55   * &nbsp;&nbsp;&nbsp;&nbsp;System.out.println(leadingCDdata);
56   * &nbsp;&nbsp;}
57   * }
58   * public void endElement(SimplePath path, String name) { }
59   * 
60   * }
61   * </pre></code>
62   * 
63   * Registering this class with {@link #addSimpleImportHandler} and call
64   * {@link #parse} on an input stream or {@link #parseUrlOrFile} will dump 
65   * the leading text of the element matching the path ({@link SimplePath}) 
66   * "/root/interesting-element".<br>
67   * <br>
68   * <em>Note</em>: This class is thread safe.
69   *
70   */
71  public class SimpleImporter {
72  
73      // properties
74      private boolean trimContent = true;
75      private boolean makeCopy = false;
76      private boolean zeroLengthIsNull = true;
77      private boolean includeLeadingCDataIntoStartElementCallback = true;
78      private boolean fullDebug = false;
79      private boolean useQName = true;
80      private boolean buildComplexPath = false;
81  
82      protected SAXParserFactory factory;
83  
84      protected List callbackHandlerList = new ArrayList();
85  
86      // internal state
87      protected StringBuffer currentMixedPCData = null;
88      protected boolean foundMixedPCData = false;
89      // the first (leading) CDATA is exacly the part between a start tag
90      // and any other tagging
91      protected StringBuffer firstPCData = null;
92      protected boolean isFirstPCData = true;
93  
94      // remember start element for later flushing
95      protected ParseElement currentElement = null;
96  
97      protected PathStack parseStack = new PathStack();
98  
99      protected String debugBuffer = null;
100 
101     /** Creates a new SimpleImporter object having default property settings. It is recommended
102      * to set all properties explicitly for clearity.
103      */
104     public SimpleImporter() {
105         factory = SAXParserFactory.newInstance();
106     }
107 
108     /** Determines if we have found any mixed content while parsing. */
109     public boolean getFoundMixedPCData() {
110         return foundMixedPCData;
111     }
112 
113     /**
114      * Determines if the path shall be assembled of the full qualified names. <code>true</code> is the default.
115      */
116     public boolean getUseQName() {
117         return useQName;
118     }
119 
120     /**
121      * Sets if the path shall be assembled of the full qualified names. <code>true</code> is the default.
122      */
123     public void setUseQName(boolean useQName) {
124         this.useQName = useQName;
125     }
126 
127     /**
128      * Determines if the simple path created will have complex additional info.  
129      */
130     public boolean getBuildComplexPath() {
131         return buildComplexPath;
132     }
133 
134     /**
135      * Sets if the simple path created will have complex additional info.  
136      */
137     public void setBuildComplexPath(boolean buildComplexPath) {
138         this.buildComplexPath = buildComplexPath;
139     }
140 
141     /** Sets the full debug mode which enables us to get the parsed stream
142      * as string via the {@link #getParsedStreamForDebug()}
143      * method even if an error occured.
144      */
145     public void setFullDebugMode(boolean fullDebug) {
146         this.fullDebug = fullDebug;
147     }
148 
149     /** Gets the property described in
150      * {@link #setFullDebugMode}.
151      */
152     public boolean getFullDebugMode() {
153         return fullDebug;
154     }
155 
156     /** Gets the whole stream parsed in the {@link #parse} method. As this requires some actions 
157      * significantly slowing down the whole parse, this only works if it has been enabled 
158      * by the the {@link #setFullDebugMode} method. 
159      */
160     public String getParsedStreamForDebug() {
161         if (!getFullDebugMode()) {
162             return null;
163         } else {
164             return debugBuffer;
165         }
166     }
167 
168     /** Gets property telling importer to return any leading CDATA, i.e.
169      * CDATA directly following a start tag before any other tagging,
170      * along with the start element
171      * method. If set to false leading CDATA will be returned using method
172      * {@link SimpleImportHandler#cData} just like any CDATA in a mixed
173      * content. <br>
174      *
175      * @see SimpleImportHandler#startElement
176      * @see #setIncludeLeadingCDataIntoStartElementCallback
177      */
178     public boolean getIncludeLeadingCDataIntoStartElementCallback() {
179         return includeLeadingCDataIntoStartElementCallback;
180     }
181 
182     /** Sets the property described in
183      * {@link #getIncludeLeadingCDataIntoStartElementCallback}.
184      */
185     public void setIncludeLeadingCDataIntoStartElementCallback(boolean includeLeadingCDataIntoStartElementCallback) {
186         this.includeLeadingCDataIntoStartElementCallback = includeLeadingCDataIntoStartElementCallback;
187     }
188 
189     /** Sets the property described in
190      * {@link #setTrimContent}.
191      */
192     public boolean getTrimContent() {
193         return trimContent;
194     }
195 
196     /** Sets when all content shall be trimed. 
197      * If set in conjunction with {@link #setZeroLengthIsNull} all whitespace data will not be
198      * reported to callback handlers. 
199      */
200     public void setTrimContent(boolean trimContent) {
201         this.trimContent = trimContent;
202     }
203 
204     /** Gets property: When findind zero length content should it be treated as null data? 
205      * If it is treated as null data nothing is reported to handlers when finding zero length data. 
206      */
207     public boolean getZeroLengthIsNull() {
208         return zeroLengthIsNull;
209     }
210 
211     /** Sets the property described in
212      * {@link #getZeroLengthIsNull}.
213      */
214     public void setZeroLengthIsNull(boolean zeroLengthIsNull) {
215         this.zeroLengthIsNull = zeroLengthIsNull;
216     }
217 
218     /** Gets the property describing if every callback handler gets a fresh copy of the parsed data. 
219      * This is only important when there is more than one callback handler. If so and it is not set,
220      * all handlers will get <em>identical</em> objects. This is bad if you expect them to change any
221      * of that data.
222      */
223     public boolean getMakeCopy() {
224         return makeCopy;
225     }
226 
227     /** Sets the property described in {@link #getMakeCopy}. */
228     public void setMakeCopy(boolean makeCopy) {
229         this.makeCopy = makeCopy;
230     }
231 
232     /** Adds a new callback handler if it is not in the callback list, yet. 
233      * This can be dynamically done while parsing. 
234      * @see #removeSimpleImportHandler
235      */
236     public void addSimpleImportHandler(SimpleImportHandler callbackHandler) {
237         synchronized (callbackHandlerList) {
238             if (!callbackHandlerList.contains(callbackHandler)) {
239                 callbackHandlerList.add(callbackHandler);
240             }
241         }
242     }
243 
244     /** Removes a callback handler if it is in the callback list. 
245      * This can be dynamically done while parsing. 
246      * @see #addSimpleImportHandler
247      */
248     public void removeSimpleImportHandler(SimpleImportHandler callbackHandler) {
249         synchronized (callbackHandlerList) {
250             callbackHandlerList.remove(callbackHandler);
251         }
252     }
253 
254     /** Tries to parse the file or URL named by parameter <code>urlOrFileName</code>. 
255      * First it tries to parse it as URL, if this does not work, it tries to parse it as file. 
256      * If one option works, an input stream will be opened and {@link #parse} will be called with it.
257      * If both does not work, an exception is thrown.
258      * 
259      * @see #parse
260      */
261     public synchronized void parseUrlOrFile(String urlOrFileName)
262         throws ParserConfigurationException, SAXException, IOException, SimpleImporterException {
263         Throwable urlException = null;
264         Throwable fileException = null;
265         InputStream in = null;
266         try {
267             URL url = new URL(urlOrFileName);
268             URLConnection urlConnection = url.openConnection();
269             in = urlConnection.getInputStream();
270         } catch (MalformedURLException mue) {
271             urlException = mue;
272         } catch (IOException ioe) {
273             urlException = ioe;
274         }
275 
276         try {
277             in = new FileInputStream(urlOrFileName);
278         } catch (IOException ioe) {
279             fileException = ioe;
280         }
281 
282         if (in != null) {
283             parse(new InputSource(new BufferedInputStream(in)));
284         } else {
285             throw new SimpleImporterException(
286                 "Could not parse "
287                     + urlOrFileName
288                     + ", is neither URL ("
289                     + urlException.getMessage()
290                     + ") nor file ("
291                     + fileException.getMessage()
292                     + ").");
293         }
294     }
295 
296     /** Parses the input source using the standard SAX parser and calls back the callback handlers.
297      * If enabled with {@link #setFullDebugMode} the source will be verbosely copied first.<br>
298      *<br>
299      * <em>Note</em>: This method is synchronized, so you can not have two concurrent parses.
300      */
301     public synchronized void parse(InputSource is) throws ParserConfigurationException, SAXException, IOException {
302         firstPCData = null;
303         currentElement = null;
304         factory.setNamespaceAware(!useQName || buildComplexPath);
305         SAXParser parser = factory.newSAXParser();
306         if (getFullDebugMode()) {
307             InputSource preReadIn = bufferParserStream(is);
308             parser.parse(preReadIn, new SAXHandler());
309         } else {
310             parser.parse(is, new SAXHandler());
311         }
312     }
313 
314     private InputSource bufferParserStream(InputSource is) throws IOException {
315         StringBuffer buf = new StringBuffer();
316         Reader reader;
317         BufferedReader bufferedReader;
318         if (is.getCharacterStream() != null) {
319             reader = is.getCharacterStream();
320         } else {
321             String encoding = is.getEncoding();
322             if (encoding != null) {
323                 reader = new InputStreamReader(is.getByteStream(), encoding);
324             } else {
325                 reader = new InputStreamReader(is.getByteStream());
326             }
327         }
328         if (reader instanceof BufferedReader) {
329             bufferedReader = (BufferedReader) reader;
330         } else {
331             bufferedReader = new BufferedReader(reader);
332         }
333 
334         while (true) {
335             String line = bufferedReader.readLine();
336             if (line == null) {
337                 break;
338             } else {
339                 buf.append(line).append('\n');
340             }
341         }
342         debugBuffer = buf.toString();
343         return new InputSource(new StringReader(debugBuffer));
344     }
345 
346     // callback handlers with start element method when there is data
347     private void callBackStartElementWhenReady() {
348         if (currentElement != null) {
349             String content = getFirstPCData();
350             SimplePath path;
351             if (buildComplexPath) {
352                 path =
353                     new SimplePath(
354                         currentElement.path,
355                         (Item[]) currentElement.pathList.toArray(new Item[currentElement.pathList.size()]));
356             } else {
357                 path = new SimplePath(currentElement.path);
358 
359             }
360 
361             synchronized (callbackHandlerList) {
362                 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
363                     SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
364                     if (getMakeCopy()) {
365                         // string is constant any way, no need to make a copy
366                         callbackHandler.startElement(
367                             new SimplePath(path),
368                             currentElement.name,
369                             new AttributesImpl(currentElement.attributes),
370                             content);
371                     } else {
372                         callbackHandler.startElement(path, currentElement.name, currentElement.attributes, content);
373                     }
374                 }
375             }
376 
377             firstPCData = null;
378             currentElement = null;
379         }
380     }
381 
382     private void sendCharacters(String text) {
383         if (text == null)
384             return;
385 
386         if (isFirstPCData) {
387             if (includeLeadingCDataIntoStartElementCallback) {
388                 addToFirstPCData(text);
389             } else {
390                 sendCData(text);
391             }
392         } else {
393             foundMixedPCData = true;
394             sendCData(text);
395         }
396     }
397 
398     private void callBackCDATAWhenReady() {
399         callBackStartElementWhenReady();
400         if (currentMixedPCData == null) {
401             return;
402         }
403         String text = currentMixedPCData.toString();
404         text = trimPCData(text);
405         if (text == null) {
406             return;
407         }
408 
409         SimplePath path;
410         if (buildComplexPath) {
411             path =
412                 new SimplePath(
413                     parseStack.getPath(),
414                     (Item[]) parseStack.getPathList().toArray(new Item[parseStack.getPathList().size()]));
415         } else {
416             path = new SimplePath(parseStack.getPath());
417 
418         }
419         
420         synchronized (callbackHandlerList) {
421             for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
422                 SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
423                 if (getMakeCopy()) {
424                     // string is constant any way, no need to make a copy
425                     callbackHandler.cData(new SimplePath(path), text);
426                 } else {
427                     callbackHandler.cData(path, text);
428                 }
429             }
430         }
431         currentMixedPCData = null;
432     }
433 
434     // send normal (not leading) CDATA to handlers
435     private void sendCData(String text) {
436         // defer sending it until we have a maximum chunck, i.e. until
437         // next tagging occurs
438         if (currentMixedPCData == null) {
439             currentMixedPCData = new StringBuffer(text.length());
440         }
441         currentMixedPCData.append(text);
442     }
443 
444     private void addToFirstPCData(String text) {
445         if (firstPCData == null) {
446             firstPCData = new StringBuffer(text.length());
447         }
448         firstPCData.append(text);
449     }
450 
451     private String getFirstPCData() {
452         if (firstPCData == null) {
453             return null;
454         } else {
455             String text = firstPCData.toString();
456             return trimPCData(text);
457         }
458     }
459 
460     // trim text depending on settings of properties
461     private String trimPCData(String pcData) {
462         if (pcData == null) {
463             return null;
464         } else {
465             if (getTrimContent()) {
466                 pcData = pcData.trim();
467             }
468             if (pcData.length() == 0 && getZeroLengthIsNull()) {
469                 return null;
470             } else {
471                 return pcData;
472             }
473         }
474     }
475 
476     // use to temporarily save a an element
477     private final static class ParseElement {
478         public String name, path;
479         public List pathList; 
480         public AttributesImpl attributes;
481 
482         public ParseElement(String name, String path, List pathList, AttributesImpl attributes) {
483             this.name = name;
484             this.path = path;
485             this.attributes = attributes;
486             this.pathList = pathList;
487         }
488     }
489 
490     private final class SAXHandler extends DefaultHandler {
491         public void startDocument() {
492             synchronized (callbackHandlerList) {
493                 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
494                     SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
495                     callbackHandler.startDocument();
496                 }
497             }
498         }
499 
500         public void endDocument() {
501             // flush any pending start elements and character data, as now the show is over
502             callBackStartElementWhenReady();
503             callBackCDATAWhenReady();
504             synchronized (callbackHandlerList) {
505                 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
506                     SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
507                     callbackHandler.endDocument();
508                 }
509             }
510         }
511 
512         public void characters(char ch[], int start, int length) {
513             if (length < 1)
514                 return;
515             String text = new String(ch, start, length);
516             sendCharacters(text);
517         }
518 
519         public void endElement(String namespaceURI, String localName, String qName) {
520             // be sure to have any pending start elements and character data flushed before
521             // sending end tag to keep right sequence of callbacks
522             callBackStartElementWhenReady();
523             callBackCDATAWhenReady();
524             String name;
525             if (!useQName || qName == null || qName.length() == 0) {
526                 name = localName;
527             } else {
528                 name = qName;
529             }
530 
531             SimplePath path;
532             if (buildComplexPath) {
533                 path =
534                     new SimplePath(
535                         parseStack.getPath(),
536                         (Item[]) parseStack.getPathList().toArray(new Item[parseStack.getPathList().size()]));
537             } else {
538                 path = new SimplePath(parseStack.getPath());
539 
540             }
541 
542             synchronized (callbackHandlerList) {
543                 for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
544                     SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
545                     if (getMakeCopy()) {
546                         // string is constant any way, no need to make a copy
547                         callbackHandler.endElement(new SimplePath(path), name);
548                     } else {
549                         callbackHandler.endElement(path, name);
550                     }
551                 }
552             }
553 
554             // this must never be
555             if (parseStack.empty()) {
556                 throw new SimpleImporterException("Umatchted end tag: " + name);
557             } else {
558                 Object top = parseStack.peek();
559                 String topName;
560                 if (buildComplexPath) {
561                     topName = ((Item)top).getName();
562                 } else {
563                     topName = (String)top;
564                 }
565                 if (!name.equals(topName)) {
566                     throw new SimpleImporterException(
567                         "End tag " + name + " does not match start tag " + top);
568                 } else {
569                     parseStack.pop();
570                 }
571             }
572             // any CDATA following can't be leading
573             isFirstPCData = false;
574         }
575 
576         public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
577             // be sure to have any pending start elements and character data flushed before
578             // opening a new one to keep right sequence of callbacks
579             callBackStartElementWhenReady();
580             callBackCDATAWhenReady();
581             String name;
582             if (!useQName || qName == null || qName.length() == 0) {
583                 name = localName;
584             } else {
585                 name = qName;
586             }
587             parseStack.push(namespaceURI, name);
588             // Defer callback to handlers as it is not clear now how
589             // much (if any) CDATA has to be passed over with start element method.
590             AttributesImpl attributesCopy = new AttributesImpl(atts);
591             currentElement = new ParseElement(name, parseStack.getPath(), parseStack.getPathList(), attributesCopy);
592             // Any CDATA (can be more the one SAX event) following is leading
593             // until next tag. Actually it is sufficient to switch this off
594             // in end tag not in start tag, as it would be turned on again
595             // immediately.
596             isFirstPCData = true;
597         }
598     }
599 
600     // Notion of a stack representing a path.
601     private final class PathStack {
602 
603         private List pathStack;
604 
605         public PathStack(int initialCapacity) {
606             pathStack = new ArrayList(initialCapacity);
607         }
608 
609         public PathStack() {
610             pathStack = new ArrayList();
611         }
612 
613         public String getPath() {
614             StringBuffer path = new StringBuffer(100);
615             // this is always there as root
616             path.append('/');
617             for (Iterator it = pathStack.iterator(); it.hasNext();) {
618                 Object element = it.next();
619                 String pathElement;
620                 if (buildComplexPath) {
621                     pathElement = ((Item) element).getName();
622                 } else {
623                     pathElement = (String) element;
624                 }
625                 path.append(pathElement).append('/');
626             }
627             return path.toString();
628         }
629 
630         public List getPathList() {
631             return pathStack;
632         }
633 
634         public String toString() {
635             return getPath();
636         }
637 
638         public void push(String namespaceURI, String name) {
639             if (buildComplexPath) {
640                 pathStack.add(new Item(name, namespaceURI));
641             } else {
642                 pathStack.add(name);
643             }
644         }
645 
646         public int size() {
647             return pathStack.size();
648         }
649 
650         public boolean empty() {
651             return (pathStack.size() <= 0);
652         }
653 
654         public Object peek() {
655             int size = pathStack.size();
656             if (size > 0) {
657                 return pathStack.get(size - 1);
658             } else {
659                 return null;
660             }
661         }
662 
663         public Object pop() {
664             int size = pathStack.size();
665             if (size > 0) {
666                 Object o = pathStack.get(size - 1);
667                 pathStack.remove(size - 1);
668                 return o;
669             } else {
670                 return null;
671             }
672         }
673 
674     }
675 }