001package org.apache.commons.digester3;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import java.util.ArrayList;
023import java.util.Collections;
024import java.util.Comparator;
025import java.util.HashMap;
026import java.util.Iterator;
027import java.util.List;
028import java.util.Map;
029
030import org.xml.sax.Attributes;
031
032/**
033 * <p>
034 * Extension of {@link RulesBase} for complex schema.
035 * </p>
036 * <p>
037 * This is an extension of the basic pattern matching scheme intended to improve support for mapping complex xml-schema.
038 * It is intended to be a minimal extension of the standard rules big enough to support complex schema but without the
039 * full generality offered by more exotic matching pattern rules.
040 * </p>
041 * <h4>When should you use this rather than the original?</h4>
042 * <p>
043 * This pattern-matching engine is complex and slower than the basic default RulesBase class, but offers more
044 * functionality:
045 * <ul>
046 * <li>Universal patterns allow patterns to be specified which will match regardless of whether there are
047 * "better matching" patterns available.</li>
048 * <li>Parent-match patterns (eg "a/b/?") allow matching for all direct children of a specified element.</li>
049 * <li>Ancestor-match patterns (eg "a/b/*") allow matching all elements nested within a specified element to any nesting
050 * depth.</li>
051 * <li>Completely-wild patterns ("*" or "!*") allow matching all elements.</li>
052 * </ul>
053 * </p>
054 * <h4>Universal Match Patterns</h4>
055 * <p>
056 * The default RulesBase pattern-matching engine always attempts to find the "best matching pattern", and will ignore
057 * rules associated with other patterns that match but are not "as good". As an example, if the pattern "a/b/c" is
058 * associated with rules 1 and 2, and "*&#47;c" is associated with rules 3 and 4 then element "a/b/c" will cause only
059 * rules 1 and 2 to execute. Rules 3 and 4 do have matching patterns, but because the patterns are shorter and include
060 * wildcard characters they are regarded as being "not as good" as a direct match. In general, exact patterns are better
061 * than wildcard patterns, and among multiple patterns with wildcards, the longest is preferred. See the RulesBase class
062 * for more information.
063 * </p>
064 * <p>
065 * This feature of preferring "better" patterns can be a powerful tool. However it also means that patterns can interact
066 * in unexpected ways.
067 * </p>
068 * <p>
069 * When using the ExtendedBaseRules, any pattern prefixed with '!' bypasses the "best match" feature. Even if there is
070 * an exact match or a longer wildcard match, patterns prefixed by '!' will still be tested to see if they match, and if
071 * so their associated Rule objects will be included in the set of rules to be executed in the normal manner.
072 * </p>
073 * <ul>
074 * <li>Pattern <code>"!*&#47;a/b"</code> matches whenever an 'b' element is inside an 'a'.</li>
075 * <li>Pattern <code>"!a/b/?"</code> matches any child of a parent matching <code>"a/b"</code> (see
076 * "Parent Match Patterns").</li>
077 * <li>Pattern <code>"!*&#47;a/b/?"</code> matches any child of a parent matching <code>"!*&#47;a/b"</code> (see
078 * "Parent Match Patterns").</li>
079 * <li>Pattern <code>"!a/b/*"</code> matches any element whose path starts with "a" then "b" (see
080 * "Ancestor Match Patterns").</li>
081 * <li>Pattern <code>"!*&#47;a/b/*"</code> matches any elements whose path contains 'a/b' (see
082 * "Ancestor Match Patterns").</li>
083 * </ul>
084 * <h4>Parent Match Patterns</h4>
085 * <p>
086 * These will match direct child elements of a particular parent element.
087 * <ul>
088 * <li>
089 *  <code>"a/b/c/?"</code> matches any child whose parent matches <code>"a/b/c"</code>. Exact parent rules take
090 * precedence over Ancestor Match patterns.</li>
091 * <li>
092 *  <code>"*&#47;a/b/c/?"</code> matches any child whose parent matches <code>"*&#47;a/b/c"</code>. The longest
093 * matching still applies to parent matches but the length excludes the '?', which effectively means that standard
094 * wildcard matches with the same level of depth are chosen in preference.</li>
095 * </ul>
096 * </p>
097 * <h4>Ancestor Match Patterns</h4>
098 * <p>
099 * These will match elements whose parentage includes a particular sequence of elements.
100 * <ul>
101 * <li>
102 *  <code>"a/b/*"</code> matches any element whose path starts with 'a' then 'b'. Exact parent and parent match rules
103 * take precedence. The longest ancestor match will take precedence.</li>
104 * <li>
105 *  <code>"*&#47;a/b/*"</code> matches any elements whose path contains an element 'a' followed by an element 'b'.
106 * The longest matching still applies but the length excludes the '*' at the end.</li>
107 * </ul>
108 * </p>
109 * <h4>Completely Wild Patterns</h4>
110 * <p>
111 * Pattern <code>"*"</code> matches every pattern that isn't matched by any other basic rule.
112 * </p>
113 * <p>
114 * Pattern <code>"!*"</code> matches every pattern.
115 * </p>
116 * <h4>Using The Extended Rules</h4>
117 * <p>
118 * By default, a Digester instance uses a {@link RulesBase} instance as its pattern matching engine. To use an
119 * ExtendedBaseRules instance, call the Digester.setRules method before adding any Rule objects to the digester
120 * instance:
121 *
122 * <pre>
123 * Digester digester = new Digester();
124 * digester.setRules( new ExtendedBaseRules() );
125 * </pre>
126 *
127 * </p>
128 * <p>
129 * The most important thing to remember when using the extended rules is that universal and non-universal patterns are
130 * completely independent. Universal patterns are never affected by the addition of new patterns or the removal of
131 * existing ones. Non-universal patterns are never affected by the addition of new <em>universal</em> patterns or the
132 * removal of existing <em>universal</em> patterns. As in the basic matching rules, non-universal (basic) patterns
133 * <strong>can</strong> be affected by the addition of new <em>non-universal</em> patterns or the removal of existing
134 * <em>non-universal</em> patterns, because only rules associated with the "best matching" pattern for each xml element
135 * are executed.
136 * <p>
137 * This means that you can use universal patterns to build up the simple parts of your structure - for example defining
138 * universal creation and property setting rules. More sophisticated and complex mapping will require non-universal
139 * patterns and this might mean that some of the universal rules will need to be replaced by a series of special cases
140 * using non-universal rules. But by using universal rules as your backbone, these additions should not break your
141 * existing rules.
142 * </p>
143 */
144public class ExtendedBaseRules
145    extends RulesBase
146{
147
148    // ----------------------------------------------------- Instance Variables
149
150    /**
151     * Counts the entry number for the rules.
152     */
153    private int counter = 0;
154
155    /**
156     * The decision algorithm used (unfortunately) doesn't preserve the entry order. This map is used by a comparator
157     * which orders the list of matches before it's returned. This map stores the entry number keyed by the rule.
158     */
159    private Map<Rule, Integer> order = new HashMap<Rule, Integer>();
160
161    // --------------------------------------------------------- Public Methods
162
163    /**
164     * {@inheritDoc}
165     */
166    @Override
167    protected void registerRule( String pattern, Rule rule )
168    {
169        super.registerRule( pattern, rule );
170        counter++;
171        order.put( rule, counter );
172    }
173
174    /**
175     * {@inheritDoc}
176     */
177    @Override
178    public List<Rule> match( String namespaceURI, String pattern, String name, Attributes attributes )
179    {
180        // calculate the pattern of the parent
181        // (if the element has one)
182        String parentPattern = "";
183        int lastIndex = pattern.lastIndexOf( '/' );
184
185        boolean hasParent = true;
186        if ( lastIndex == -1 )
187        {
188            // element has no parent
189            hasParent = false;
190        }
191        else
192        {
193            // calculate the pattern of the parent
194            parentPattern = pattern.substring( 0, lastIndex );
195        }
196
197        // we keep the list of universal matches separate
198        List<Rule> universalList = new ArrayList<Rule>( counter );
199
200        // Universal wildcards ('*') in the middle of the pattern-string
201        List<Rule> recList = null;
202        // temporary parentPattern
203        // we don't want to change anything....
204        String tempParentPattern = parentPattern;
205        int parentLastIndex = tempParentPattern.lastIndexOf( '/' );
206        // look for pattern. Here, we search the whole
207        // parent. Not ideal, but does the thing....
208        while ( parentLastIndex > -1 && recList == null )
209        {
210            recList = this.cache.get( tempParentPattern + "/*/" + pattern.substring( lastIndex + 1 ) );
211            if ( recList != null )
212            {
213                // when /*/-pattern-string is found, add method
214                // list to universalList.
215                // Digester will do the rest
216                universalList.addAll( recList );
217            }
218            else
219            {
220                // if not, shorten tempParent to move /*/ one position
221                // to the left.
222                // as last part of patttern is always added
223                // we make sure pattern is allowed anywhere.
224                tempParentPattern = parentPattern.substring( 0, parentLastIndex );
225            }
226
227            parentLastIndex = tempParentPattern.lastIndexOf( '/' );
228        }
229
230        // Universal all wildards ('!*')
231        // These are always matched so always add them
232        List<Rule> tempList = this.cache.get( "!*" );
233        if ( tempList != null )
234        {
235            universalList.addAll( tempList );
236        }
237
238        // Universal exact parent match
239        // need to get this now since only wildcards are considered later
240        tempList = this.cache.get( "!" + parentPattern + "/?" );
241        if ( tempList != null )
242        {
243            universalList.addAll( tempList );
244        }
245
246        // base behaviour means that if we certain matches, we don't continue
247        // but we just have a single combined loop and so we have to set
248        // a variable
249        boolean ignoreBasicMatches = false;
250
251        // see if we have an exact basic pattern match
252        List<Rule> rulesList = this.cache.get( pattern );
253        if ( rulesList != null )
254        {
255            // we have a match!
256            // so ignore all basic matches from now on
257            ignoreBasicMatches = true;
258
259        }
260        else
261        {
262
263            // see if we have an exact child match
264            if ( hasParent )
265            {
266                // matching children takes preference
267                rulesList = this.cache.get( parentPattern + "/?" );
268                if ( rulesList != null )
269                {
270                    // we have a match!
271                    // so ignore all basic matches from now on
272                    ignoreBasicMatches = true;
273
274                }
275                else
276                {
277                    // we don't have a match yet - so try exact ancester
278                    //
279                    rulesList = findExactAncesterMatch( pattern );
280                    if ( rulesList != null )
281                    {
282                        // we have a match!
283                        // so ignore all basic matches from now on
284                        ignoreBasicMatches = true;
285                    }
286                }
287            }
288        }
289
290        // OK - we're ready for the big loop!
291        // Unlike the basic rules case,
292        // we have to go through for all those universal rules in all cases.
293
294        // Find the longest key, ie more discriminant
295        int longKeyLength = 0;
296
297        for ( String key : this.cache.keySet() )
298        {
299
300            // find out if it's a univeral pattern
301            // set a flag
302            boolean isUniversal = key.startsWith( "!" );
303            if ( isUniversal )
304            {
305                // and find the underlying key
306                key = key.substring( 1, key.length() );
307            }
308
309            // don't need to check exact matches
310            boolean wildcardMatchStart = key.startsWith( "*/" );
311            boolean wildcardMatchEnd = key.endsWith( "/*" );
312            if ( wildcardMatchStart || ( isUniversal && wildcardMatchEnd ) )
313            {
314
315                boolean parentMatched = false;
316                boolean basicMatched = false;
317                boolean ancesterMatched = false;
318
319                boolean parentMatchEnd = key.endsWith( "/?" );
320                if ( parentMatchEnd )
321                {
322                    // try for a parent match
323                    parentMatched = parentMatch( key, parentPattern );
324
325                }
326                else if ( wildcardMatchEnd )
327                {
328                    // check for ancester match
329                    if ( wildcardMatchStart )
330                    {
331                        String patternBody = key.substring( 2, key.length() - 2 );
332                        if ( pattern.endsWith( patternBody ) )
333                        {
334                            ancesterMatched = true;
335                        }
336                        else
337                        {
338                            ancesterMatched = ( pattern.indexOf( patternBody + "/" ) > -1 );
339                        }
340                    }
341                    else
342                    {
343                        String bodyPattern = key.substring( 0, key.length() - 2 );
344                        if ( pattern.startsWith( bodyPattern ) )
345                        {
346                            if ( pattern.length() == bodyPattern.length() )
347                            {
348                                // exact match
349                                ancesterMatched = true;
350                            }
351                            else
352                            {
353                                ancesterMatched = ( pattern.charAt( bodyPattern.length() ) == '/' );
354                            }
355                        }
356                        else
357                        {
358                            ancesterMatched = false;
359                        }
360                    }
361                }
362                else
363                {
364                    // try for a base match
365                    basicMatched = basicMatch( key, pattern );
366                }
367
368                if ( parentMatched || basicMatched || ancesterMatched )
369                {
370                    if ( isUniversal )
371                    {
372                        // universal rules go straight in
373                        // (no longest matching rule)
374                        tempList = this.cache.get( "!" + key );
375                        if ( tempList != null )
376                        {
377                            universalList.addAll( tempList );
378                        }
379
380                    }
381                    else
382                    {
383                        if ( !ignoreBasicMatches )
384                        {
385                            // ensure that all parent matches are SHORTER
386                            // than rules with same level of matching.
387                            //
388                            // the calculations below don't work for universal
389                            // matching, but we don't care because in that case
390                            // this if-stmt is not entered.
391                            int keyLength = key.length();
392                            if ( wildcardMatchStart )
393                            {
394                                --keyLength;
395                            }
396                            if ( wildcardMatchEnd )
397                            {
398                                --keyLength;
399                            }
400                            else if ( parentMatchEnd )
401                            {
402                                --keyLength;
403                            }
404
405                            if ( keyLength > longKeyLength )
406                            {
407                                rulesList = this.cache.get( key );
408                                longKeyLength = keyLength;
409                            }
410                        }
411                    }
412                }
413            }
414        }
415
416        // '*' works in practice as a default matching
417        // (this is because anything is a deeper match!)
418        if ( rulesList == null )
419        {
420            rulesList = this.cache.get( "*" );
421        }
422
423        // if we've matched a basic pattern, then add to the universal list
424        if ( rulesList != null )
425        {
426            universalList.addAll( rulesList );
427        }
428
429        // don't filter if namespace is null
430        if ( namespaceURI != null )
431        {
432            // remove invalid namespaces
433            Iterator<Rule> it = universalList.iterator();
434            while ( it.hasNext() )
435            {
436                Rule rule = it.next();
437                String nsUri = rule.getNamespaceURI();
438                if ( nsUri != null && !nsUri.equals( namespaceURI ) )
439                {
440                    it.remove();
441                }
442            }
443        }
444
445        // need to make sure that the collection is sort in the order
446        // of addition. We use a custom comparator for this
447        Collections.sort( universalList, new Comparator<Rule>()
448        {
449
450            public int compare( Rule r1, Rule r2 )
451            {
452                // Get the entry order from the map
453                Integer i1 = order.get( r1 );
454                Integer i2 = order.get( r2 );
455
456                // and use that to perform the comparison
457                if ( i1 == null )
458                {
459                    if ( i2 == null )
460                    {
461
462                        return 0;
463
464                    }
465                    return -1;
466                }
467                else if ( i2 == null )
468                {
469                    return 1;
470                }
471
472                return ( i1.intValue() - i2.intValue() );
473            }
474        } );
475
476        return universalList;
477    }
478
479    /**
480     * Checks the input parentPattern contains the input key at the end.
481     *
482     * @param key The key to be found
483     * @param parentPattern The pattern where looking for the key
484     * @return true, if {@code key} is found inside {@code parentPattern}, false otherwise
485     */
486    private boolean parentMatch( String key, String parentPattern )
487    {
488        return parentPattern.endsWith( key.substring( 1, key.length() - 2 ) );
489    }
490
491    /**
492     * Standard match. Matches the end of the pattern to the key.
493     *
494     * @param key The key to be found
495     * @param pattern The pattern where looking for the key
496     * @return true, if {@code key} is found inside {@code pattern}, false otherwise
497     */
498    private boolean basicMatch( String key, String pattern )
499    {
500        return ( pattern.equals( key.substring( 2 ) ) || pattern.endsWith( key.substring( 1 ) ) );
501    }
502
503    /**
504     * Finds an exact ancester match for given pattern
505     *
506     * @param parentPattern The input pattern
507     * @return A list of {@code Rule} related to the input pattern
508     */
509    private List<Rule> findExactAncesterMatch( String parentPattern )
510    {
511        List<Rule> matchingRules = null;
512        int lastIndex = parentPattern.length();
513        while ( lastIndex-- > 0 )
514        {
515            lastIndex = parentPattern.lastIndexOf( '/', lastIndex );
516            if ( lastIndex > 0 )
517            {
518                matchingRules = this.cache.get( parentPattern.substring( 0, lastIndex ) + "/*" );
519                if ( matchingRules != null )
520                {
521                    return matchingRules;
522                }
523            }
524        }
525        return null;
526    }
527
528}