001    /*
002     * $Header: /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons-sandbox//xmlio/src/java/org/apache/commons/xmlio/in/SimpleImporter.java,v 1.1 2004/10/08 11:56:20 ozeigermann Exp $
003     * $Revision: 155476 $
004     * $Date: 2005-02-26 13:31:24 +0000 (Sat, 26 Feb 2005) $
005     *
006     * ====================================================================
007     *
008     * Copyright 2004 The Apache Software Foundation 
009     *
010     * Licensed under the Apache License, Version 2.0 (the "License");
011     * you may not use this file except in compliance with the License.
012     * You may obtain a copy of the License at
013     *
014     *     http://www.apache.org/licenses/LICENSE-2.0
015     *
016     * Unless required by applicable law or agreed to in writing, software
017     * distributed under the License is distributed on an "AS IS" BASIS,
018     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
019     * See the License for the specific language governing permissions and
020     * limitations under the License.
021     *
022     */
023    
024    package org.apache.commons.xmlio.in;
025    
026    import java.io.*;
027    import java.util.*;
028    import java.net.*;
029    
030    import org.xml.sax.*;
031    import org.xml.sax.helpers.*;
032    import javax.xml.parsers.*;
033    
034    /**
035     * <b>Simple</b> and <b>fast</b> importer for XML configuration or import files. <br>
036     * <br>
037     * It is based on SAX and can be considered an extension to it. This means it is
038     * callback oriented and does not build an internal data structure like the DOM.
039     * While SAX is simple, fast, and memory friendly it might be a bit too 
040     * rudimentary for most tasks. <code>SimpleImporter</code> adds more high level
041     * means for importing XML while preserving the SAX's benefits. <br>
042     * <br>
043     * As with SAX you register a callback handler ({@link SimpleImportHandler})
044     * that is called upon events. Consider the following example implementation
045     * of a {@link SimpleImportHandler}:<br><br>
046     * <code><pre>
047     * public class DemoHandler implements SimpleImportHandler { 
048     * public void startDocument() { }
049     * public void endDocument() { }
050     * 
051     * public void cData(SimplePath path, String cdata) { }
052     * 
053     * public void startElement(SimplePath path, String name, AttributesImpl attributes, String leadingCDdata) {
054     * &nbsp;&nbsp;if (path.matches("/root/interesting-element")) {
055     * &nbsp;&nbsp;&nbsp;&nbsp;System.out.println(leadingCDdata);
056     * &nbsp;&nbsp;}
057     * }
058     * public void endElement(SimplePath path, String name) { }
059     * 
060     * }
061     * </pre></code>
062     * 
063     * Registering this class with {@link #addSimpleImportHandler} and call
064     * {@link #parse} on an input stream or {@link #parseUrlOrFile} will dump 
065     * the leading text of the element matching the path ({@link SimplePath}) 
066     * "/root/interesting-element".<br>
067     * <br>
068     * <em>Note</em>: This class is thread safe.
069     *
070     */
071    public class SimpleImporter {
072    
073        // properties
074        private boolean trimContent = true;
075        private boolean makeCopy = false;
076        private boolean zeroLengthIsNull = true;
077        private boolean includeLeadingCDataIntoStartElementCallback = true;
078        private boolean fullDebug = false;
079        private boolean useQName = true;
080        private boolean buildComplexPath = false;
081    
082        protected SAXParserFactory factory;
083    
084        protected List callbackHandlerList = new ArrayList();
085    
086        // internal state
087        protected StringBuffer currentMixedPCData = null;
088        protected boolean foundMixedPCData = false;
089        // the first (leading) CDATA is exacly the part between a start tag
090        // and any other tagging
091        protected StringBuffer firstPCData = null;
092        protected boolean isFirstPCData = true;
093    
094        // remember start element for later flushing
095        protected ParseElement currentElement = null;
096    
097        protected PathStack parseStack = new PathStack();
098    
099        protected String debugBuffer = null;
100    
101        /** Creates a new SimpleImporter object having default property settings. It is recommended
102         * to set all properties explicitly for clearity.
103         */
104        public SimpleImporter() {
105            factory = SAXParserFactory.newInstance();
106        }
107    
108        /** Determines if we have found any mixed content while parsing. */
109        public boolean getFoundMixedPCData() {
110            return foundMixedPCData;
111        }
112    
113        /**
114         * Determines if the path shall be assembled of the full qualified names. <code>true</code> is the default.
115         */
116        public boolean getUseQName() {
117            return useQName;
118        }
119    
120        /**
121         * Sets if the path shall be assembled of the full qualified names. <code>true</code> is the default.
122         */
123        public void setUseQName(boolean useQName) {
124            this.useQName = useQName;
125        }
126    
127        /**
128         * Determines if the simple path created will have complex additional info.  
129         */
130        public boolean getBuildComplexPath() {
131            return buildComplexPath;
132        }
133    
134        /**
135         * Sets if the simple path created will have complex additional info.  
136         */
137        public void setBuildComplexPath(boolean buildComplexPath) {
138            this.buildComplexPath = buildComplexPath;
139        }
140    
141        /** Sets the full debug mode which enables us to get the parsed stream
142         * as string via the {@link #getParsedStreamForDebug()}
143         * method even if an error occured.
144         */
145        public void setFullDebugMode(boolean fullDebug) {
146            this.fullDebug = fullDebug;
147        }
148    
149        /** Gets the property described in
150         * {@link #setFullDebugMode}.
151         */
152        public boolean getFullDebugMode() {
153            return fullDebug;
154        }
155    
156        /** Gets the whole stream parsed in the {@link #parse} method. As this requires some actions 
157         * significantly slowing down the whole parse, this only works if it has been enabled 
158         * by the the {@link #setFullDebugMode} method. 
159         */
160        public String getParsedStreamForDebug() {
161            if (!getFullDebugMode()) {
162                return null;
163            } else {
164                return debugBuffer;
165            }
166        }
167    
168        /** Gets property telling importer to return any leading CDATA, i.e.
169         * CDATA directly following a start tag before any other tagging,
170         * along with the start element
171         * method. If set to false leading CDATA will be returned using method
172         * {@link SimpleImportHandler#cData} just like any CDATA in a mixed
173         * content. <br>
174         *
175         * @see SimpleImportHandler#startElement
176         * @see #setIncludeLeadingCDataIntoStartElementCallback
177         */
178        public boolean getIncludeLeadingCDataIntoStartElementCallback() {
179            return includeLeadingCDataIntoStartElementCallback;
180        }
181    
182        /** Sets the property described in
183         * {@link #getIncludeLeadingCDataIntoStartElementCallback}.
184         */
185        public void setIncludeLeadingCDataIntoStartElementCallback(boolean includeLeadingCDataIntoStartElementCallback) {
186            this.includeLeadingCDataIntoStartElementCallback = includeLeadingCDataIntoStartElementCallback;
187        }
188    
189        /** Sets the property described in
190         * {@link #setTrimContent}.
191         */
192        public boolean getTrimContent() {
193            return trimContent;
194        }
195    
196        /** Sets when all content shall be trimed. 
197         * If set in conjunction with {@link #setZeroLengthIsNull} all whitespace data will not be
198         * reported to callback handlers. 
199         */
200        public void setTrimContent(boolean trimContent) {
201            this.trimContent = trimContent;
202        }
203    
204        /** Gets property: When findind zero length content should it be treated as null data? 
205         * If it is treated as null data nothing is reported to handlers when finding zero length data. 
206         */
207        public boolean getZeroLengthIsNull() {
208            return zeroLengthIsNull;
209        }
210    
211        /** Sets the property described in
212         * {@link #getZeroLengthIsNull}.
213         */
214        public void setZeroLengthIsNull(boolean zeroLengthIsNull) {
215            this.zeroLengthIsNull = zeroLengthIsNull;
216        }
217    
218        /** Gets the property describing if every callback handler gets a fresh copy of the parsed data. 
219         * This is only important when there is more than one callback handler. If so and it is not set,
220         * all handlers will get <em>identical</em> objects. This is bad if you expect them to change any
221         * of that data.
222         */
223        public boolean getMakeCopy() {
224            return makeCopy;
225        }
226    
227        /** Sets the property described in {@link #getMakeCopy}. */
228        public void setMakeCopy(boolean makeCopy) {
229            this.makeCopy = makeCopy;
230        }
231    
232        /** Adds a new callback handler if it is not in the callback list, yet. 
233         * This can be dynamically done while parsing. 
234         * @see #removeSimpleImportHandler
235         */
236        public void addSimpleImportHandler(SimpleImportHandler callbackHandler) {
237            synchronized (callbackHandlerList) {
238                if (!callbackHandlerList.contains(callbackHandler)) {
239                    callbackHandlerList.add(callbackHandler);
240                }
241            }
242        }
243    
244        /** Removes a callback handler if it is in the callback list. 
245         * This can be dynamically done while parsing. 
246         * @see #addSimpleImportHandler
247         */
248        public void removeSimpleImportHandler(SimpleImportHandler callbackHandler) {
249            synchronized (callbackHandlerList) {
250                callbackHandlerList.remove(callbackHandler);
251            }
252        }
253    
254        /** Tries to parse the file or URL named by parameter <code>urlOrFileName</code>. 
255         * First it tries to parse it as URL, if this does not work, it tries to parse it as file. 
256         * If one option works, an input stream will be opened and {@link #parse} will be called with it.
257         * If both does not work, an exception is thrown.
258         * 
259         * @see #parse
260         */
261        public synchronized void parseUrlOrFile(String urlOrFileName)
262            throws ParserConfigurationException, SAXException, IOException, SimpleImporterException {
263            Throwable urlException = null;
264            Throwable fileException = null;
265            InputStream in = null;
266            try {
267                URL url = new URL(urlOrFileName);
268                URLConnection urlConnection = url.openConnection();
269                in = urlConnection.getInputStream();
270            } catch (MalformedURLException mue) {
271                urlException = mue;
272            } catch (IOException ioe) {
273                urlException = ioe;
274            }
275    
276            try {
277                in = new FileInputStream(urlOrFileName);
278            } catch (IOException ioe) {
279                fileException = ioe;
280            }
281    
282            if (in != null) {
283                parse(new InputSource(new BufferedInputStream(in)));
284            } else {
285                throw new SimpleImporterException(
286                    "Could not parse "
287                        + urlOrFileName
288                        + ", is neither URL ("
289                        + urlException.getMessage()
290                        + ") nor file ("
291                        + fileException.getMessage()
292                        + ").");
293            }
294        }
295    
296        /** Parses the input source using the standard SAX parser and calls back the callback handlers.
297         * If enabled with {@link #setFullDebugMode} the source will be verbosely copied first.<br>
298         *<br>
299         * <em>Note</em>: This method is synchronized, so you can not have two concurrent parses.
300         */
301        public synchronized void parse(InputSource is) throws ParserConfigurationException, SAXException, IOException {
302            firstPCData = null;
303            currentElement = null;
304            factory.setNamespaceAware(!useQName || buildComplexPath);
305            SAXParser parser = factory.newSAXParser();
306            if (getFullDebugMode()) {
307                InputSource preReadIn = bufferParserStream(is);
308                parser.parse(preReadIn, new SAXHandler());
309            } else {
310                parser.parse(is, new SAXHandler());
311            }
312        }
313    
314        private InputSource bufferParserStream(InputSource is) throws IOException {
315            StringBuffer buf = new StringBuffer();
316            Reader reader;
317            BufferedReader bufferedReader;
318            if (is.getCharacterStream() != null) {
319                reader = is.getCharacterStream();
320            } else {
321                String encoding = is.getEncoding();
322                if (encoding != null) {
323                    reader = new InputStreamReader(is.getByteStream(), encoding);
324                } else {
325                    reader = new InputStreamReader(is.getByteStream());
326                }
327            }
328            if (reader instanceof BufferedReader) {
329                bufferedReader = (BufferedReader) reader;
330            } else {
331                bufferedReader = new BufferedReader(reader);
332            }
333    
334            while (true) {
335                String line = bufferedReader.readLine();
336                if (line == null) {
337                    break;
338                } else {
339                    buf.append(line).append('\n');
340                }
341            }
342            debugBuffer = buf.toString();
343            return new InputSource(new StringReader(debugBuffer));
344        }
345    
346        // callback handlers with start element method when there is data
347        private void callBackStartElementWhenReady() {
348            if (currentElement != null) {
349                String content = getFirstPCData();
350                SimplePath path;
351                if (buildComplexPath) {
352                    path =
353                        new SimplePath(
354                            currentElement.path,
355                            (Item[]) currentElement.pathList.toArray(new Item[currentElement.pathList.size()]));
356                } else {
357                    path = new SimplePath(currentElement.path);
358    
359                }
360    
361                synchronized (callbackHandlerList) {
362                    for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
363                        SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
364                        if (getMakeCopy()) {
365                            // string is constant any way, no need to make a copy
366                            callbackHandler.startElement(
367                                new SimplePath(path),
368                                currentElement.name,
369                                new AttributesImpl(currentElement.attributes),
370                                content);
371                        } else {
372                            callbackHandler.startElement(path, currentElement.name, currentElement.attributes, content);
373                        }
374                    }
375                }
376    
377                firstPCData = null;
378                currentElement = null;
379            }
380        }
381    
382        private void sendCharacters(String text) {
383            if (text == null)
384                return;
385    
386            if (isFirstPCData) {
387                if (includeLeadingCDataIntoStartElementCallback) {
388                    addToFirstPCData(text);
389                } else {
390                    sendCData(text);
391                }
392            } else {
393                foundMixedPCData = true;
394                sendCData(text);
395            }
396        }
397    
398        private void callBackCDATAWhenReady() {
399            callBackStartElementWhenReady();
400            if (currentMixedPCData == null) {
401                return;
402            }
403            String text = currentMixedPCData.toString();
404            text = trimPCData(text);
405            if (text == null) {
406                return;
407            }
408    
409            SimplePath path;
410            if (buildComplexPath) {
411                path =
412                    new SimplePath(
413                        parseStack.getPath(),
414                        (Item[]) parseStack.getPathList().toArray(new Item[parseStack.getPathList().size()]));
415            } else {
416                path = new SimplePath(parseStack.getPath());
417    
418            }
419            
420            synchronized (callbackHandlerList) {
421                for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
422                    SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
423                    if (getMakeCopy()) {
424                        // string is constant any way, no need to make a copy
425                        callbackHandler.cData(new SimplePath(path), text);
426                    } else {
427                        callbackHandler.cData(path, text);
428                    }
429                }
430            }
431            currentMixedPCData = null;
432        }
433    
434        // send normal (not leading) CDATA to handlers
435        private void sendCData(String text) {
436            // defer sending it until we have a maximum chunck, i.e. until
437            // next tagging occurs
438            if (currentMixedPCData == null) {
439                currentMixedPCData = new StringBuffer(text.length());
440            }
441            currentMixedPCData.append(text);
442        }
443    
444        private void addToFirstPCData(String text) {
445            if (firstPCData == null) {
446                firstPCData = new StringBuffer(text.length());
447            }
448            firstPCData.append(text);
449        }
450    
451        private String getFirstPCData() {
452            if (firstPCData == null) {
453                return null;
454            } else {
455                String text = firstPCData.toString();
456                return trimPCData(text);
457            }
458        }
459    
460        // trim text depending on settings of properties
461        private String trimPCData(String pcData) {
462            if (pcData == null) {
463                return null;
464            } else {
465                if (getTrimContent()) {
466                    pcData = pcData.trim();
467                }
468                if (pcData.length() == 0 && getZeroLengthIsNull()) {
469                    return null;
470                } else {
471                    return pcData;
472                }
473            }
474        }
475    
476        // use to temporarily save a an element
477        private final static class ParseElement {
478            public String name, path;
479            public List pathList; 
480            public AttributesImpl attributes;
481    
482            public ParseElement(String name, String path, List pathList, AttributesImpl attributes) {
483                this.name = name;
484                this.path = path;
485                this.attributes = attributes;
486                this.pathList = pathList;
487            }
488        }
489    
490        private final class SAXHandler extends DefaultHandler {
491            public void startDocument() {
492                synchronized (callbackHandlerList) {
493                    for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
494                        SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
495                        callbackHandler.startDocument();
496                    }
497                }
498            }
499    
500            public void endDocument() {
501                // flush any pending start elements and character data, as now the show is over
502                callBackStartElementWhenReady();
503                callBackCDATAWhenReady();
504                synchronized (callbackHandlerList) {
505                    for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
506                        SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
507                        callbackHandler.endDocument();
508                    }
509                }
510            }
511    
512            public void characters(char ch[], int start, int length) {
513                if (length < 1)
514                    return;
515                String text = new String(ch, start, length);
516                sendCharacters(text);
517            }
518    
519            public void endElement(String namespaceURI, String localName, String qName) {
520                // be sure to have any pending start elements and character data flushed before
521                // sending end tag to keep right sequence of callbacks
522                callBackStartElementWhenReady();
523                callBackCDATAWhenReady();
524                String name;
525                if (!useQName || qName == null || qName.length() == 0) {
526                    name = localName;
527                } else {
528                    name = qName;
529                }
530    
531                SimplePath path;
532                if (buildComplexPath) {
533                    path =
534                        new SimplePath(
535                            parseStack.getPath(),
536                            (Item[]) parseStack.getPathList().toArray(new Item[parseStack.getPathList().size()]));
537                } else {
538                    path = new SimplePath(parseStack.getPath());
539    
540                }
541    
542                synchronized (callbackHandlerList) {
543                    for (Iterator it = callbackHandlerList.iterator(); it.hasNext();) {
544                        SimpleImportHandler callbackHandler = (SimpleImportHandler) it.next();
545                        if (getMakeCopy()) {
546                            // string is constant any way, no need to make a copy
547                            callbackHandler.endElement(new SimplePath(path), name);
548                        } else {
549                            callbackHandler.endElement(path, name);
550                        }
551                    }
552                }
553    
554                // this must never be
555                if (parseStack.empty()) {
556                    throw new SimpleImporterException("Umatchted end tag: " + name);
557                } else {
558                    Object top = parseStack.peek();
559                    String topName;
560                    if (buildComplexPath) {
561                        topName = ((Item)top).getName();
562                    } else {
563                        topName = (String)top;
564                    }
565                    if (!name.equals(topName)) {
566                        throw new SimpleImporterException(
567                            "End tag " + name + " does not match start tag " + top);
568                    } else {
569                        parseStack.pop();
570                    }
571                }
572                // any CDATA following can't be leading
573                isFirstPCData = false;
574            }
575    
576            public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
577                // be sure to have any pending start elements and character data flushed before
578                // opening a new one to keep right sequence of callbacks
579                callBackStartElementWhenReady();
580                callBackCDATAWhenReady();
581                String name;
582                if (!useQName || qName == null || qName.length() == 0) {
583                    name = localName;
584                } else {
585                    name = qName;
586                }
587                parseStack.push(namespaceURI, name);
588                // Defer callback to handlers as it is not clear now how
589                // much (if any) CDATA has to be passed over with start element method.
590                AttributesImpl attributesCopy = new AttributesImpl(atts);
591                currentElement = new ParseElement(name, parseStack.getPath(), parseStack.getPathList(), attributesCopy);
592                // Any CDATA (can be more the one SAX event) following is leading
593                // until next tag. Actually it is sufficient to switch this off
594                // in end tag not in start tag, as it would be turned on again
595                // immediately.
596                isFirstPCData = true;
597            }
598        }
599    
600        // Notion of a stack representing a path.
601        private final class PathStack {
602    
603            private List pathStack;
604    
605            public PathStack(int initialCapacity) {
606                pathStack = new ArrayList(initialCapacity);
607            }
608    
609            public PathStack() {
610                pathStack = new ArrayList();
611            }
612    
613            public String getPath() {
614                StringBuffer path = new StringBuffer(100);
615                // this is always there as root
616                path.append('/');
617                for (Iterator it = pathStack.iterator(); it.hasNext();) {
618                    Object element = it.next();
619                    String pathElement;
620                    if (buildComplexPath) {
621                        pathElement = ((Item) element).getName();
622                    } else {
623                        pathElement = (String) element;
624                    }
625                    path.append(pathElement).append('/');
626                }
627                return path.toString();
628            }
629    
630            public List getPathList() {
631                return pathStack;
632            }
633    
634            public String toString() {
635                return getPath();
636            }
637    
638            public void push(String namespaceURI, String name) {
639                if (buildComplexPath) {
640                    pathStack.add(new Item(name, namespaceURI));
641                } else {
642                    pathStack.add(name);
643                }
644            }
645    
646            public int size() {
647                return pathStack.size();
648            }
649    
650            public boolean empty() {
651                return (pathStack.size() <= 0);
652            }
653    
654            public Object peek() {
655                int size = pathStack.size();
656                if (size > 0) {
657                    return pathStack.get(size - 1);
658                } else {
659                    return null;
660                }
661            }
662    
663            public Object pop() {
664                int size = pathStack.size();
665                if (size > 0) {
666                    Object o = pathStack.get(size - 1);
667                    pathStack.remove(size - 1);
668                    return o;
669                } else {
670                    return null;
671                }
672            }
673    
674        }
675    }