View Javadoc

1   package org.apache.commons.digester3;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.Comparator;
25  import java.util.HashMap;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  
30  import org.xml.sax.Attributes;
31  
32  /**
33   * <p>
34   * Extension of {@link RulesBase} for complex schema.
35   * </p>
36   * <p>
37   * This is an extension of the basic pattern matching scheme intended to improve support for mapping complex xml-schema.
38   * It is intended to be a minimal extension of the standard rules big enough to support complex schema but without the
39   * full generality offered by more exotic matching pattern rules.
40   * </p>
41   * <h4>When should you use this rather than the original?</h4>
42   * <p>
43   * This pattern-matching engine is complex and slower than the basic default RulesBase class, but offers more
44   * functionality:
45   * <ul>
46   * <li>Universal patterns allow patterns to be specified which will match regardless of whether there are
47   * "better matching" patterns available.</li>
48   * <li>Parent-match patterns (eg "a/b/?") allow matching for all direct children of a specified element.</li>
49   * <li>Ancestor-match patterns (eg "a/b/*") allow matching all elements nested within a specified element to any nesting
50   * depth.</li>
51   * <li>Completely-wild patterns ("*" or "!*") allow matching all elements.</li>
52   * </ul>
53   * </p>
54   * <h4>Universal Match Patterns</h4>
55   * <p>
56   * The default RulesBase pattern-matching engine always attempts to find the "best matching pattern", and will ignore
57   * rules associated with other patterns that match but are not "as good". As an example, if the pattern "a/b/c" is
58   * associated with rules 1 and 2, and "*&#47;c" is associated with rules 3 and 4 then element "a/b/c" will cause only
59   * rules 1 and 2 to execute. Rules 3 and 4 do have matching patterns, but because the patterns are shorter and include
60   * wildcard characters they are regarded as being "not as good" as a direct match. In general, exact patterns are better
61   * than wildcard patterns, and among multiple patterns with wildcards, the longest is preferred. See the RulesBase class
62   * for more information.
63   * </p>
64   * <p>
65   * This feature of preferring "better" patterns can be a powerful tool. However it also means that patterns can interact
66   * in unexpected ways.
67   * </p>
68   * <p>
69   * When using the ExtendedBaseRules, any pattern prefixed with '!' bypasses the "best match" feature. Even if there is
70   * an exact match or a longer wildcard match, patterns prefixed by '!' will still be tested to see if they match, and if
71   * so their associated Rule objects will be included in the set of rules to be executed in the normal manner.
72   * </p>
73   * <ul>
74   * <li>Pattern <code>"!*&#47;a/b"</code> matches whenever an 'b' element is inside an 'a'.</li>
75   * <li>Pattern <code>"!a/b/?"</code> matches any child of a parent matching <code>"a/b"</code> (see
76   * "Parent Match Patterns").</li>
77   * <li>Pattern <code>"!*&#47;a/b/?"</code> matches any child of a parent matching <code>"!*&#47;a/b"</code> (see
78   * "Parent Match Patterns").</li>
79   * <li>Pattern <code>"!a/b/*"</code> matches any element whose path starts with "a" then "b" (see
80   * "Ancestor Match Patterns").</li>
81   * <li>Pattern <code>"!*&#47;a/b/*"</code> matches any elements whose path contains 'a/b' (see
82   * "Ancestor Match Patterns").</li>
83   * </ul>
84   * <h4>Parent Match Patterns</h4>
85   * <p>
86   * These will match direct child elements of a particular parent element.
87   * <ul>
88   * <li>
89   *  <code>"a/b/c/?"</code> matches any child whose parent matches <code>"a/b/c"</code>. Exact parent rules take
90   * precedence over Ancestor Match patterns.</li>
91   * <li>
92   *  <code>"*&#47;a/b/c/?"</code> matches any child whose parent matches <code>"*&#47;a/b/c"</code>. The longest
93   * matching still applies to parent matches but the length excludes the '?', which effectively means that standard
94   * wildcard matches with the same level of depth are chosen in preference.</li>
95   * </ul>
96   * </p>
97   * <h4>Ancestor Match Patterns</h4>
98   * <p>
99   * These will match elements whose parentage includes a particular sequence of elements.
100  * <ul>
101  * <li>
102  *  <code>"a/b/*"</code> matches any element whose path starts with 'a' then 'b'. Exact parent and parent match rules
103  * take precedence. The longest ancestor match will take precedence.</li>
104  * <li>
105  *  <code>"*&#47;a/b/*"</code> matches any elements whose path contains an element 'a' followed by an element 'b'.
106  * The longest matching still applies but the length excludes the '*' at the end.</li>
107  * </ul>
108  * </p>
109  * <h4>Completely Wild Patterns</h4>
110  * <p>
111  * Pattern <code>"*"</code> matches every pattern that isn't matched by any other basic rule.
112  * </p>
113  * <p>
114  * Pattern <code>"!*"</code> matches every pattern.
115  * </p>
116  * <h4>Using The Extended Rules</h4>
117  * <p>
118  * By default, a Digester instance uses a {@link RulesBase} instance as its pattern matching engine. To use an
119  * ExtendedBaseRules instance, call the Digester.setRules method before adding any Rule objects to the digester
120  * instance:
121  *
122  * <pre>
123  * Digester digester = new Digester();
124  * digester.setRules( new ExtendedBaseRules() );
125  * </pre>
126  *
127  * </p>
128  * <p>
129  * The most important thing to remember when using the extended rules is that universal and non-universal patterns are
130  * completely independent. Universal patterns are never affected by the addition of new patterns or the removal of
131  * existing ones. Non-universal patterns are never affected by the addition of new <em>universal</em> patterns or the
132  * removal of existing <em>universal</em> patterns. As in the basic matching rules, non-universal (basic) patterns
133  * <strong>can</strong> be affected by the addition of new <em>non-universal</em> patterns or the removal of existing
134  * <em>non-universal</em> patterns, because only rules associated with the "best matching" pattern for each xml element
135  * are executed.
136  * <p>
137  * This means that you can use universal patterns to build up the simple parts of your structure - for example defining
138  * universal creation and property setting rules. More sophisticated and complex mapping will require non-universal
139  * patterns and this might mean that some of the universal rules will need to be replaced by a series of special cases
140  * using non-universal rules. But by using universal rules as your backbone, these additions should not break your
141  * existing rules.
142  * </p>
143  */
144 public class ExtendedBaseRules
145     extends RulesBase
146 {
147 
148     // ----------------------------------------------------- Instance Variables
149 
150     /**
151      * Counts the entry number for the rules.
152      */
153     private int counter = 0;
154 
155     /**
156      * The decision algorithm used (unfortunately) doesn't preserve the entry order. This map is used by a comparator
157      * which orders the list of matches before it's returned. This map stores the entry number keyed by the rule.
158      */
159     private Map<Rule, Integer> order = new HashMap<Rule, Integer>();
160 
161     // --------------------------------------------------------- Public Methods
162 
163     /**
164      * {@inheritDoc}
165      */
166     @Override
167     protected void registerRule( String pattern, Rule rule )
168     {
169         super.registerRule( pattern, rule );
170         counter++;
171         order.put( rule, counter );
172     }
173 
174     /**
175      * {@inheritDoc}
176      */
177     @Override
178     public List<Rule> match( String namespaceURI, String pattern, String name, Attributes attributes )
179     {
180         // calculate the pattern of the parent
181         // (if the element has one)
182         String parentPattern = "";
183         int lastIndex = pattern.lastIndexOf( '/' );
184 
185         boolean hasParent = true;
186         if ( lastIndex == -1 )
187         {
188             // element has no parent
189             hasParent = false;
190         }
191         else
192         {
193             // calculate the pattern of the parent
194             parentPattern = pattern.substring( 0, lastIndex );
195         }
196 
197         // we keep the list of universal matches separate
198         List<Rule> universalList = new ArrayList<Rule>( counter );
199 
200         // Universal wildcards ('*') in the middle of the pattern-string
201         List<Rule> recList = null;
202         // temporary parentPattern
203         // we don't want to change anything....
204         String tempParentPattern = parentPattern;
205         int parentLastIndex = tempParentPattern.lastIndexOf( '/' );
206         // look for pattern. Here, we search the whole
207         // parent. Not ideal, but does the thing....
208         while ( parentLastIndex > -1 && recList == null )
209         {
210             recList = this.cache.get( tempParentPattern + "/*/" + pattern.substring( lastIndex + 1 ) );
211             if ( recList != null )
212             {
213                 // when /*/-pattern-string is found, add method
214                 // list to universalList.
215                 // Digester will do the rest
216                 universalList.addAll( recList );
217             }
218             else
219             {
220                 // if not, shorten tempParent to move /*/ one position
221                 // to the left.
222                 // as last part of patttern is always added
223                 // we make sure pattern is allowed anywhere.
224                 tempParentPattern = parentPattern.substring( 0, parentLastIndex );
225             }
226 
227             parentLastIndex = tempParentPattern.lastIndexOf( '/' );
228         }
229 
230         // Universal all wildards ('!*')
231         // These are always matched so always add them
232         List<Rule> tempList = this.cache.get( "!*" );
233         if ( tempList != null )
234         {
235             universalList.addAll( tempList );
236         }
237 
238         // Universal exact parent match
239         // need to get this now since only wildcards are considered later
240         tempList = this.cache.get( "!" + parentPattern + "/?" );
241         if ( tempList != null )
242         {
243             universalList.addAll( tempList );
244         }
245 
246         // base behaviour means that if we certain matches, we don't continue
247         // but we just have a single combined loop and so we have to set
248         // a variable
249         boolean ignoreBasicMatches = false;
250 
251         // see if we have an exact basic pattern match
252         List<Rule> rulesList = this.cache.get( pattern );
253         if ( rulesList != null )
254         {
255             // we have a match!
256             // so ignore all basic matches from now on
257             ignoreBasicMatches = true;
258 
259         }
260         else
261         {
262 
263             // see if we have an exact child match
264             if ( hasParent )
265             {
266                 // matching children takes preference
267                 rulesList = this.cache.get( parentPattern + "/?" );
268                 if ( rulesList != null )
269                 {
270                     // we have a match!
271                     // so ignore all basic matches from now on
272                     ignoreBasicMatches = true;
273 
274                 }
275                 else
276                 {
277                     // we don't have a match yet - so try exact ancester
278                     //
279                     rulesList = findExactAncesterMatch( pattern );
280                     if ( rulesList != null )
281                     {
282                         // we have a match!
283                         // so ignore all basic matches from now on
284                         ignoreBasicMatches = true;
285                     }
286                 }
287             }
288         }
289 
290         // OK - we're ready for the big loop!
291         // Unlike the basic rules case,
292         // we have to go through for all those universal rules in all cases.
293 
294         // Find the longest key, ie more discriminant
295         int longKeyLength = 0;
296 
297         for ( String key : this.cache.keySet() )
298         {
299 
300             // find out if it's a univeral pattern
301             // set a flag
302             boolean isUniversal = key.startsWith( "!" );
303             if ( isUniversal )
304             {
305                 // and find the underlying key
306                 key = key.substring( 1, key.length() );
307             }
308 
309             // don't need to check exact matches
310             boolean wildcardMatchStart = key.startsWith( "*/" );
311             boolean wildcardMatchEnd = key.endsWith( "/*" );
312             if ( wildcardMatchStart || ( isUniversal && wildcardMatchEnd ) )
313             {
314 
315                 boolean parentMatched = false;
316                 boolean basicMatched = false;
317                 boolean ancesterMatched = false;
318 
319                 boolean parentMatchEnd = key.endsWith( "/?" );
320                 if ( parentMatchEnd )
321                 {
322                     // try for a parent match
323                     parentMatched = parentMatch( key, parentPattern );
324 
325                 }
326                 else if ( wildcardMatchEnd )
327                 {
328                     // check for ancester match
329                     if ( wildcardMatchStart )
330                     {
331                         String patternBody = key.substring( 2, key.length() - 2 );
332                         if ( pattern.endsWith( patternBody ) )
333                         {
334                             ancesterMatched = true;
335                         }
336                         else
337                         {
338                             ancesterMatched = ( pattern.indexOf( patternBody + "/" ) > -1 );
339                         }
340                     }
341                     else
342                     {
343                         String bodyPattern = key.substring( 0, key.length() - 2 );
344                         if ( pattern.startsWith( bodyPattern ) )
345                         {
346                             if ( pattern.length() == bodyPattern.length() )
347                             {
348                                 // exact match
349                                 ancesterMatched = true;
350                             }
351                             else
352                             {
353                                 ancesterMatched = ( pattern.charAt( bodyPattern.length() ) == '/' );
354                             }
355                         }
356                         else
357                         {
358                             ancesterMatched = false;
359                         }
360                     }
361                 }
362                 else
363                 {
364                     // try for a base match
365                     basicMatched = basicMatch( key, pattern );
366                 }
367 
368                 if ( parentMatched || basicMatched || ancesterMatched )
369                 {
370                     if ( isUniversal )
371                     {
372                         // universal rules go straight in
373                         // (no longest matching rule)
374                         tempList = this.cache.get( "!" + key );
375                         if ( tempList != null )
376                         {
377                             universalList.addAll( tempList );
378                         }
379 
380                     }
381                     else
382                     {
383                         if ( !ignoreBasicMatches )
384                         {
385                             // ensure that all parent matches are SHORTER
386                             // than rules with same level of matching.
387                             //
388                             // the calculations below don't work for universal
389                             // matching, but we don't care because in that case
390                             // this if-stmt is not entered.
391                             int keyLength = key.length();
392                             if ( wildcardMatchStart )
393                             {
394                                 --keyLength;
395                             }
396                             if ( wildcardMatchEnd )
397                             {
398                                 --keyLength;
399                             }
400                             else if ( parentMatchEnd )
401                             {
402                                 --keyLength;
403                             }
404 
405                             if ( keyLength > longKeyLength )
406                             {
407                                 rulesList = this.cache.get( key );
408                                 longKeyLength = keyLength;
409                             }
410                         }
411                     }
412                 }
413             }
414         }
415 
416         // '*' works in practice as a default matching
417         // (this is because anything is a deeper match!)
418         if ( rulesList == null )
419         {
420             rulesList = this.cache.get( "*" );
421         }
422 
423         // if we've matched a basic pattern, then add to the universal list
424         if ( rulesList != null )
425         {
426             universalList.addAll( rulesList );
427         }
428 
429         // don't filter if namespace is null
430         if ( namespaceURI != null )
431         {
432             // remove invalid namespaces
433             Iterator<Rule> it = universalList.iterator();
434             while ( it.hasNext() )
435             {
436                 Rule rule = it.next();
437                 String nsUri = rule.getNamespaceURI();
438                 if ( nsUri != null && !nsUri.equals( namespaceURI ) )
439                 {
440                     it.remove();
441                 }
442             }
443         }
444 
445         // need to make sure that the collection is sort in the order
446         // of addition. We use a custom comparator for this
447         Collections.sort( universalList, new Comparator<Rule>()
448         {
449 
450             public int compare( Rule r1, Rule r2 )
451             {
452                 // Get the entry order from the map
453                 Integer i1 = order.get( r1 );
454                 Integer i2 = order.get( r2 );
455 
456                 // and use that to perform the comparison
457                 if ( i1 == null )
458                 {
459                     if ( i2 == null )
460                     {
461 
462                         return 0;
463 
464                     }
465                     return -1;
466                 }
467                 else if ( i2 == null )
468                 {
469                     return 1;
470                 }
471 
472                 return ( i1.intValue() - i2.intValue() );
473             }
474         } );
475 
476         return universalList;
477     }
478 
479     /**
480      * Checks the input parentPattern contains the input key at the end.
481      *
482      * @param key The key to be found
483      * @param parentPattern The pattern where looking for the key
484      * @return true, if {@code key} is found inside {@code parentPattern}, false otherwise
485      */
486     private boolean parentMatch( String key, String parentPattern )
487     {
488         return parentPattern.endsWith( key.substring( 1, key.length() - 2 ) );
489     }
490 
491     /**
492      * Standard match. Matches the end of the pattern to the key.
493      *
494      * @param key The key to be found
495      * @param pattern The pattern where looking for the key
496      * @return true, if {@code key} is found inside {@code pattern}, false otherwise
497      */
498     private boolean basicMatch( String key, String pattern )
499     {
500         return ( pattern.equals( key.substring( 2 ) ) || pattern.endsWith( key.substring( 1 ) ) );
501     }
502 
503     /**
504      * Finds an exact ancester match for given pattern
505      *
506      * @param parentPattern The input pattern
507      * @return A list of {@code Rule} related to the input pattern
508      */
509     private List<Rule> findExactAncesterMatch( String parentPattern )
510     {
511         List<Rule> matchingRules = null;
512         int lastIndex = parentPattern.length();
513         while ( lastIndex-- > 0 )
514         {
515             lastIndex = parentPattern.lastIndexOf( '/', lastIndex );
516             if ( lastIndex > 0 )
517             {
518                 matchingRules = this.cache.get( parentPattern.substring( 0, lastIndex ) + "/*" );
519                 if ( matchingRules != null )
520                 {
521                     return matchingRules;
522                 }
523             }
524         }
525         return null;
526     }
527 
528 }