001    package org.apache.commons.digester3;
002    
003    /*
004     * Licensed to the Apache Software Foundation (ASF) under one
005     * or more contributor license agreements.  See the NOTICE file
006     * distributed with this work for additional information
007     * regarding copyright ownership.  The ASF licenses this file
008     * to you under the Apache License, Version 2.0 (the
009     * "License"); you may not use this file except in compliance
010     * with the License.  You may obtain a copy of the License at
011     *
012     *   http://www.apache.org/licenses/LICENSE-2.0
013     *
014     * Unless required by applicable law or agreed to in writing,
015     * software distributed under the License is distributed on an
016     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017     * KIND, either express or implied.  See the License for the
018     * specific language governing permissions and limitations
019     * under the License.
020     */
021    
022    import java.util.ArrayList;
023    import java.util.Collections;
024    import java.util.Comparator;
025    import java.util.HashMap;
026    import java.util.Iterator;
027    import java.util.List;
028    import java.util.Map;
029    
030    import org.xml.sax.Attributes;
031    
032    /**
033     * <p>
034     * Extension of {@link RulesBase} for complex schema.
035     * </p>
036     * <p>
037     * This is an extension of the basic pattern matching scheme intended to improve support for mapping complex xml-schema.
038     * It is intended to be a minimal extension of the standard rules big enough to support complex schema but without the
039     * full generality offered by more exotic matching pattern rules.
040     * </p>
041     * <h4>When should you use this rather than the original?</h4>
042     * <p>
043     * This pattern-matching engine is complex and slower than the basic default RulesBase class, but offers more
044     * functionality:
045     * <ul>
046     * <li>Universal patterns allow patterns to be specified which will match regardless of whether there are
047     * "better matching" patterns available.</li>
048     * <li>Parent-match patterns (eg "a/b/?") allow matching for all direct children of a specified element.</li>
049     * <li>Ancestor-match patterns (eg "a/b/*") allow matching all elements nested within a specified element to any nesting
050     * depth.</li>
051     * <li>Completely-wild patterns ("*" or "!*") allow matching all elements.</li>
052     * </ul>
053     * </p>
054     * <h4>Universal Match Patterns</h4>
055     * <p>
056     * The default RulesBase pattern-matching engine always attempts to find the "best matching pattern", and will ignore
057     * rules associated with other patterns that match but are not "as good". As an example, if the pattern "a/b/c" is
058     * associated with rules 1 and 2, and "*&#47;c" is associated with rules 3 and 4 then element "a/b/c" will cause only
059     * rules 1 and 2 to execute. Rules 3 and 4 do have matching patterns, but because the patterns are shorter and include
060     * wildcard characters they are regarded as being "not as good" as a direct match. In general, exact patterns are better
061     * than wildcard patterns, and among multiple patterns with wildcards, the longest is preferred. See the RulesBase class
062     * for more information.
063     * </p>
064     * <p>
065     * This feature of preferring "better" patterns can be a powerful tool. However it also means that patterns can interact
066     * in unexpected ways.
067     * </p>
068     * <p>
069     * When using the ExtendedBaseRules, any pattern prefixed with '!' bypasses the "best match" feature. Even if there is
070     * an exact match or a longer wildcard match, patterns prefixed by '!' will still be tested to see if they match, and if
071     * so their associated Rule objects will be included in the set of rules to be executed in the normal manner.
072     * </p>
073     * <ul>
074     * <li>Pattern <code>"!*&#47;a/b"</code> matches whenever an 'b' element is inside an 'a'.</li>
075     * <li>Pattern <code>"!a/b/?"</code> matches any child of a parent matching <code>"a/b"</code> (see
076     * "Parent Match Patterns").</li>
077     * <li>Pattern <code>"!*&#47;a/b/?"</code> matches any child of a parent matching <code>"!*&#47;a/b"</code> (see
078     * "Parent Match Patterns").</li>
079     * <li>Pattern <code>"!a/b/*"</code> matches any element whose path starts with "a" then "b" (see
080     * "Ancestor Match Patterns").</li>
081     * <li>Pattern <code>"!*&#47;a/b/*"</code> matches any elements whose path contains 'a/b' (see
082     * "Ancestor Match Patterns").</li>
083     * </ul>
084     * <h4>Parent Match Patterns</h4>
085     * <p>
086     * These will match direct child elements of a particular parent element.
087     * <ul>
088     * <li>
089     *  <code>"a/b/c/?"</code> matches any child whose parent matches <code>"a/b/c"</code>. Exact parent rules take
090     * precedence over Ancestor Match patterns.</li>
091     * <li>
092     *  <code>"*&#47;a/b/c/?"</code> matches any child whose parent matches <code>"*&#47;a/b/c"</code>. The longest
093     * matching still applies to parent matches but the length excludes the '?', which effectively means that standard
094     * wildcard matches with the same level of depth are chosen in preference.</li>
095     * </ul>
096     * </p>
097     * <h4>Ancestor Match Patterns</h4>
098     * <p>
099     * These will match elements whose parentage includes a particular sequence of elements.
100     * <ul>
101     * <li>
102     *  <code>"a/b/*"</code> matches any element whose path starts with 'a' then 'b'. Exact parent and parent match rules
103     * take precedence. The longest ancestor match will take precedence.</li>
104     * <li>
105     *  <code>"*&#47;a/b/*"</code> matches any elements whose path contains an element 'a' followed by an element 'b'.
106     * The longest matching still applies but the length excludes the '*' at the end.</li>
107     * </ul>
108     * </p>
109     * <h4>Completely Wild Patterns</h4>
110     * <p>
111     * Pattern <code>"*"</code> matches every pattern that isn't matched by any other basic rule.
112     * </p>
113     * <p>
114     * Pattern <code>"!*"</code> matches every pattern.
115     * </p>
116     * <h4>Using The Extended Rules</h4>
117     * <p>
118     * By default, a Digester instance uses a {@link RulesBase} instance as its pattern matching engine. To use an
119     * ExtendedBaseRules instance, call the Digester.setRules method before adding any Rule objects to the digester
120     * instance:
121     *
122     * <pre>
123     * Digester digester = new Digester();
124     * digester.setRules( new ExtendedBaseRules() );
125     * </pre>
126     *
127     * </p>
128     * <p>
129     * The most important thing to remember when using the extended rules is that universal and non-universal patterns are
130     * completely independent. Universal patterns are never affected by the addition of new patterns or the removal of
131     * existing ones. Non-universal patterns are never affected by the addition of new <em>universal</em> patterns or the
132     * removal of existing <em>universal</em> patterns. As in the basic matching rules, non-universal (basic) patterns
133     * <strong>can</strong> be affected by the addition of new <em>non-universal</em> patterns or the removal of existing
134     * <em>non-universal</em> patterns, because only rules associated with the "best matching" pattern for each xml element
135     * are executed.
136     * <p>
137     * This means that you can use universal patterns to build up the simple parts of your structure - for example defining
138     * universal creation and property setting rules. More sophisticated and complex mapping will require non-universal
139     * patterns and this might mean that some of the universal rules will need to be replaced by a series of special cases
140     * using non-universal rules. But by using universal rules as your backbone, these additions should not break your
141     * existing rules.
142     * </p>
143     */
144    public class ExtendedBaseRules
145        extends RulesBase
146    {
147    
148        // ----------------------------------------------------- Instance Variables
149    
150        /**
151         * Counts the entry number for the rules.
152         */
153        private int counter = 0;
154    
155        /**
156         * The decision algorithm used (unfortunately) doesn't preserve the entry order. This map is used by a comparator
157         * which orders the list of matches before it's returned. This map stores the entry number keyed by the rule.
158         */
159        private Map<Rule, Integer> order = new HashMap<Rule, Integer>();
160    
161        // --------------------------------------------------------- Public Methods
162    
163        /**
164         * {@inheritDoc}
165         */
166        @Override
167        protected void registerRule( String pattern, Rule rule )
168        {
169            super.registerRule( pattern, rule );
170            counter++;
171            order.put( rule, counter );
172        }
173    
174        /**
175         * {@inheritDoc}
176         */
177        @Override
178        public List<Rule> match( String namespaceURI, String pattern, String name, Attributes attributes )
179        {
180            // calculate the pattern of the parent
181            // (if the element has one)
182            String parentPattern = "";
183            int lastIndex = pattern.lastIndexOf( '/' );
184    
185            boolean hasParent = true;
186            if ( lastIndex == -1 )
187            {
188                // element has no parent
189                hasParent = false;
190            }
191            else
192            {
193                // calculate the pattern of the parent
194                parentPattern = pattern.substring( 0, lastIndex );
195            }
196    
197            // we keep the list of universal matches separate
198            List<Rule> universalList = new ArrayList<Rule>( counter );
199    
200            // Universal wildcards ('*') in the middle of the pattern-string
201            List<Rule> recList = null;
202            // temporary parentPattern
203            // we don't want to change anything....
204            String tempParentPattern = parentPattern;
205            int parentLastIndex = tempParentPattern.lastIndexOf( '/' );
206            // look for pattern. Here, we search the whole
207            // parent. Not ideal, but does the thing....
208            while ( parentLastIndex > -1 && recList == null )
209            {
210                recList = this.cache.get( tempParentPattern + "/*/" + pattern.substring( lastIndex + 1 ) );
211                if ( recList != null )
212                {
213                    // when /*/-pattern-string is found, add method
214                    // list to universalList.
215                    // Digester will do the rest
216                    universalList.addAll( recList );
217                }
218                else
219                {
220                    // if not, shorten tempParent to move /*/ one position
221                    // to the left.
222                    // as last part of patttern is always added
223                    // we make sure pattern is allowed anywhere.
224                    tempParentPattern = parentPattern.substring( 0, parentLastIndex );
225                }
226    
227                parentLastIndex = tempParentPattern.lastIndexOf( '/' );
228            }
229    
230            // Universal all wildards ('!*')
231            // These are always matched so always add them
232            List<Rule> tempList = this.cache.get( "!*" );
233            if ( tempList != null )
234            {
235                universalList.addAll( tempList );
236            }
237    
238            // Universal exact parent match
239            // need to get this now since only wildcards are considered later
240            tempList = this.cache.get( "!" + parentPattern + "/?" );
241            if ( tempList != null )
242            {
243                universalList.addAll( tempList );
244            }
245    
246            // base behaviour means that if we certain matches, we don't continue
247            // but we just have a single combined loop and so we have to set
248            // a variable
249            boolean ignoreBasicMatches = false;
250    
251            // see if we have an exact basic pattern match
252            List<Rule> rulesList = this.cache.get( pattern );
253            if ( rulesList != null )
254            {
255                // we have a match!
256                // so ignore all basic matches from now on
257                ignoreBasicMatches = true;
258    
259            }
260            else
261            {
262    
263                // see if we have an exact child match
264                if ( hasParent )
265                {
266                    // matching children takes preference
267                    rulesList = this.cache.get( parentPattern + "/?" );
268                    if ( rulesList != null )
269                    {
270                        // we have a match!
271                        // so ignore all basic matches from now on
272                        ignoreBasicMatches = true;
273    
274                    }
275                    else
276                    {
277                        // we don't have a match yet - so try exact ancester
278                        //
279                        rulesList = findExactAncesterMatch( pattern );
280                        if ( rulesList != null )
281                        {
282                            // we have a match!
283                            // so ignore all basic matches from now on
284                            ignoreBasicMatches = true;
285                        }
286                    }
287                }
288            }
289    
290            // OK - we're ready for the big loop!
291            // Unlike the basic rules case,
292            // we have to go through for all those universal rules in all cases.
293    
294            // Find the longest key, ie more discriminant
295            int longKeyLength = 0;
296    
297            for ( String key : this.cache.keySet() )
298            {
299    
300                // find out if it's a univeral pattern
301                // set a flag
302                boolean isUniversal = key.startsWith( "!" );
303                if ( isUniversal )
304                {
305                    // and find the underlying key
306                    key = key.substring( 1, key.length() );
307                }
308    
309                // don't need to check exact matches
310                boolean wildcardMatchStart = key.startsWith( "*/" );
311                boolean wildcardMatchEnd = key.endsWith( "/*" );
312                if ( wildcardMatchStart || ( isUniversal && wildcardMatchEnd ) )
313                {
314    
315                    boolean parentMatched = false;
316                    boolean basicMatched = false;
317                    boolean ancesterMatched = false;
318    
319                    boolean parentMatchEnd = key.endsWith( "/?" );
320                    if ( parentMatchEnd )
321                    {
322                        // try for a parent match
323                        parentMatched = parentMatch( key, parentPattern );
324    
325                    }
326                    else if ( wildcardMatchEnd )
327                    {
328                        // check for ancester match
329                        if ( wildcardMatchStart )
330                        {
331                            String patternBody = key.substring( 2, key.length() - 2 );
332                            if ( pattern.endsWith( patternBody ) )
333                            {
334                                ancesterMatched = true;
335                            }
336                            else
337                            {
338                                ancesterMatched = ( pattern.indexOf( patternBody + "/" ) > -1 );
339                            }
340                        }
341                        else
342                        {
343                            String bodyPattern = key.substring( 0, key.length() - 2 );
344                            if ( pattern.startsWith( bodyPattern ) )
345                            {
346                                if ( pattern.length() == bodyPattern.length() )
347                                {
348                                    // exact match
349                                    ancesterMatched = true;
350                                }
351                                else
352                                {
353                                    ancesterMatched = ( pattern.charAt( bodyPattern.length() ) == '/' );
354                                }
355                            }
356                            else
357                            {
358                                ancesterMatched = false;
359                            }
360                        }
361                    }
362                    else
363                    {
364                        // try for a base match
365                        basicMatched = basicMatch( key, pattern );
366                    }
367    
368                    if ( parentMatched || basicMatched || ancesterMatched )
369                    {
370                        if ( isUniversal )
371                        {
372                            // universal rules go straight in
373                            // (no longest matching rule)
374                            tempList = this.cache.get( "!" + key );
375                            if ( tempList != null )
376                            {
377                                universalList.addAll( tempList );
378                            }
379    
380                        }
381                        else
382                        {
383                            if ( !ignoreBasicMatches )
384                            {
385                                // ensure that all parent matches are SHORTER
386                                // than rules with same level of matching.
387                                //
388                                // the calculations below don't work for universal
389                                // matching, but we don't care because in that case
390                                // this if-stmt is not entered.
391                                int keyLength = key.length();
392                                if ( wildcardMatchStart )
393                                {
394                                    --keyLength;
395                                }
396                                if ( wildcardMatchEnd )
397                                {
398                                    --keyLength;
399                                }
400                                else if ( parentMatchEnd )
401                                {
402                                    --keyLength;
403                                }
404    
405                                if ( keyLength > longKeyLength )
406                                {
407                                    rulesList = this.cache.get( key );
408                                    longKeyLength = keyLength;
409                                }
410                            }
411                        }
412                    }
413                }
414            }
415    
416            // '*' works in practice as a default matching
417            // (this is because anything is a deeper match!)
418            if ( rulesList == null )
419            {
420                rulesList = this.cache.get( "*" );
421            }
422    
423            // if we've matched a basic pattern, then add to the universal list
424            if ( rulesList != null )
425            {
426                universalList.addAll( rulesList );
427            }
428    
429            // don't filter if namespace is null
430            if ( namespaceURI != null )
431            {
432                // remove invalid namespaces
433                Iterator<Rule> it = universalList.iterator();
434                while ( it.hasNext() )
435                {
436                    Rule rule = it.next();
437                    String nsUri = rule.getNamespaceURI();
438                    if ( nsUri != null && !nsUri.equals( namespaceURI ) )
439                    {
440                        it.remove();
441                    }
442                }
443            }
444    
445            // need to make sure that the collection is sort in the order
446            // of addition. We use a custom comparator for this
447            Collections.sort( universalList, new Comparator<Rule>()
448            {
449    
450                public int compare( Rule r1, Rule r2 )
451                {
452                    // Get the entry order from the map
453                    Integer i1 = order.get( r1 );
454                    Integer i2 = order.get( r2 );
455    
456                    // and use that to perform the comparison
457                    if ( i1 == null )
458                    {
459                        if ( i2 == null )
460                        {
461    
462                            return 0;
463    
464                        }
465                        return -1;
466                    }
467                    else if ( i2 == null )
468                    {
469                        return 1;
470                    }
471    
472                    return ( i1.intValue() - i2.intValue() );
473                }
474            } );
475    
476            return universalList;
477        }
478    
479        /**
480         * Checks the input parentPattern contains the input key at the end.
481         *
482         * @param key The key to be found
483         * @param parentPattern The pattern where looking for the key
484         * @return true, if {@code key} is found inside {@code parentPattern}, false otherwise
485         */
486        private boolean parentMatch( String key, String parentPattern )
487        {
488            return parentPattern.endsWith( key.substring( 1, key.length() - 2 ) );
489        }
490    
491        /**
492         * Standard match. Matches the end of the pattern to the key.
493         *
494         * @param key The key to be found
495         * @param pattern The pattern where looking for the key
496         * @return true, if {@code key} is found inside {@code pattern}, false otherwise
497         */
498        private boolean basicMatch( String key, String pattern )
499        {
500            return ( pattern.equals( key.substring( 2 ) ) || pattern.endsWith( key.substring( 1 ) ) );
501        }
502    
503        /**
504         * Finds an exact ancester match for given pattern
505         *
506         * @param parentPattern The input pattern
507         * @return A list of {@code Rule} related to the input pattern
508         */
509        private List<Rule> findExactAncesterMatch( String parentPattern )
510        {
511            List<Rule> matchingRules = null;
512            int lastIndex = parentPattern.length();
513            while ( lastIndex-- > 0 )
514            {
515                lastIndex = parentPattern.lastIndexOf( '/', lastIndex );
516                if ( lastIndex > 0 )
517                {
518                    matchingRules = this.cache.get( parentPattern.substring( 0, lastIndex ) + "/*" );
519                    if ( matchingRules != null )
520                    {
521                        return matchingRules;
522                    }
523                }
524            }
525            return null;
526        }
527    
528    }