001package org.apache.commons.digester3; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import java.util.ArrayList; 023import java.util.Collections; 024import java.util.Comparator; 025import java.util.HashMap; 026import java.util.Iterator; 027import java.util.List; 028import java.util.Map; 029 030import org.xml.sax.Attributes; 031 032/** 033 * <p> 034 * Extension of {@link RulesBase} for complex schema. 035 * </p> 036 * <p> 037 * This is an extension of the basic pattern matching scheme intended to improve support for mapping complex xml-schema. 038 * It is intended to be a minimal extension of the standard rules big enough to support complex schema but without the 039 * full generality offered by more exotic matching pattern rules. 040 * </p> 041 * <h4>When should you use this rather than the original?</h4> 042 * <p> 043 * This pattern-matching engine is complex and slower than the basic default RulesBase class, but offers more 044 * functionality: 045 * <ul> 046 * <li>Universal patterns allow patterns to be specified which will match regardless of whether there are 047 * "better matching" patterns available.</li> 048 * <li>Parent-match patterns (eg "a/b/?") allow matching for all direct children of a specified element.</li> 049 * <li>Ancestor-match patterns (eg "a/b/*") allow matching all elements nested within a specified element to any nesting 050 * depth.</li> 051 * <li>Completely-wild patterns ("*" or "!*") allow matching all elements.</li> 052 * </ul> 053 * </p> 054 * <h4>Universal Match Patterns</h4> 055 * <p> 056 * The default RulesBase pattern-matching engine always attempts to find the "best matching pattern", and will ignore 057 * rules associated with other patterns that match but are not "as good". As an example, if the pattern "a/b/c" is 058 * associated with rules 1 and 2, and "*/c" is associated with rules 3 and 4 then element "a/b/c" will cause only 059 * rules 1 and 2 to execute. Rules 3 and 4 do have matching patterns, but because the patterns are shorter and include 060 * wildcard characters they are regarded as being "not as good" as a direct match. In general, exact patterns are better 061 * than wildcard patterns, and among multiple patterns with wildcards, the longest is preferred. See the RulesBase class 062 * for more information. 063 * </p> 064 * <p> 065 * This feature of preferring "better" patterns can be a powerful tool. However it also means that patterns can interact 066 * in unexpected ways. 067 * </p> 068 * <p> 069 * When using the ExtendedBaseRules, any pattern prefixed with '!' bypasses the "best match" feature. Even if there is 070 * an exact match or a longer wildcard match, patterns prefixed by '!' will still be tested to see if they match, and if 071 * so their associated Rule objects will be included in the set of rules to be executed in the normal manner. 072 * </p> 073 * <ul> 074 * <li>Pattern <code>"!*/a/b"</code> matches whenever an 'b' element is inside an 'a'.</li> 075 * <li>Pattern <code>"!a/b/?"</code> matches any child of a parent matching <code>"a/b"</code> (see 076 * "Parent Match Patterns").</li> 077 * <li>Pattern <code>"!*/a/b/?"</code> matches any child of a parent matching <code>"!*/a/b"</code> (see 078 * "Parent Match Patterns").</li> 079 * <li>Pattern <code>"!a/b/*"</code> matches any element whose path starts with "a" then "b" (see 080 * "Ancestor Match Patterns").</li> 081 * <li>Pattern <code>"!*/a/b/*"</code> matches any elements whose path contains 'a/b' (see 082 * "Ancestor Match Patterns").</li> 083 * </ul> 084 * <h4>Parent Match Patterns</h4> 085 * <p> 086 * These will match direct child elements of a particular parent element. 087 * <ul> 088 * <li> 089 * <code>"a/b/c/?"</code> matches any child whose parent matches <code>"a/b/c"</code>. Exact parent rules take 090 * precedence over Ancestor Match patterns.</li> 091 * <li> 092 * <code>"*/a/b/c/?"</code> matches any child whose parent matches <code>"*/a/b/c"</code>. The longest 093 * matching still applies to parent matches but the length excludes the '?', which effectively means that standard 094 * wildcard matches with the same level of depth are chosen in preference.</li> 095 * </ul> 096 * </p> 097 * <h4>Ancestor Match Patterns</h4> 098 * <p> 099 * These will match elements whose parentage includes a particular sequence of elements. 100 * <ul> 101 * <li> 102 * <code>"a/b/*"</code> matches any element whose path starts with 'a' then 'b'. Exact parent and parent match rules 103 * take precedence. The longest ancestor match will take precedence.</li> 104 * <li> 105 * <code>"*/a/b/*"</code> matches any elements whose path contains an element 'a' followed by an element 'b'. 106 * The longest matching still applies but the length excludes the '*' at the end.</li> 107 * </ul> 108 * </p> 109 * <h4>Completely Wild Patterns</h4> 110 * <p> 111 * Pattern <code>"*"</code> matches every pattern that isn't matched by any other basic rule. 112 * </p> 113 * <p> 114 * Pattern <code>"!*"</code> matches every pattern. 115 * </p> 116 * <h4>Using The Extended Rules</h4> 117 * <p> 118 * By default, a Digester instance uses a {@link RulesBase} instance as its pattern matching engine. To use an 119 * ExtendedBaseRules instance, call the Digester.setRules method before adding any Rule objects to the digester 120 * instance: 121 * 122 * <pre> 123 * Digester digester = new Digester(); 124 * digester.setRules( new ExtendedBaseRules() ); 125 * </pre> 126 * 127 * </p> 128 * <p> 129 * The most important thing to remember when using the extended rules is that universal and non-universal patterns are 130 * completely independent. Universal patterns are never affected by the addition of new patterns or the removal of 131 * existing ones. Non-universal patterns are never affected by the addition of new <em>universal</em> patterns or the 132 * removal of existing <em>universal</em> patterns. As in the basic matching rules, non-universal (basic) patterns 133 * <strong>can</strong> be affected by the addition of new <em>non-universal</em> patterns or the removal of existing 134 * <em>non-universal</em> patterns, because only rules associated with the "best matching" pattern for each xml element 135 * are executed. 136 * <p> 137 * This means that you can use universal patterns to build up the simple parts of your structure - for example defining 138 * universal creation and property setting rules. More sophisticated and complex mapping will require non-universal 139 * patterns and this might mean that some of the universal rules will need to be replaced by a series of special cases 140 * using non-universal rules. But by using universal rules as your backbone, these additions should not break your 141 * existing rules. 142 * </p> 143 */ 144public class ExtendedBaseRules 145 extends RulesBase 146{ 147 148 // ----------------------------------------------------- Instance Variables 149 150 /** 151 * Counts the entry number for the rules. 152 */ 153 private int counter = 0; 154 155 /** 156 * The decision algorithm used (unfortunately) doesn't preserve the entry order. This map is used by a comparator 157 * which orders the list of matches before it's returned. This map stores the entry number keyed by the rule. 158 */ 159 private Map<Rule, Integer> order = new HashMap<Rule, Integer>(); 160 161 // --------------------------------------------------------- Public Methods 162 163 /** 164 * {@inheritDoc} 165 */ 166 @Override 167 protected void registerRule( String pattern, Rule rule ) 168 { 169 super.registerRule( pattern, rule ); 170 counter++; 171 order.put( rule, counter ); 172 } 173 174 /** 175 * {@inheritDoc} 176 */ 177 @Override 178 public List<Rule> match( String namespaceURI, String pattern, String name, Attributes attributes ) 179 { 180 // calculate the pattern of the parent 181 // (if the element has one) 182 String parentPattern = ""; 183 int lastIndex = pattern.lastIndexOf( '/' ); 184 185 boolean hasParent = true; 186 if ( lastIndex == -1 ) 187 { 188 // element has no parent 189 hasParent = false; 190 } 191 else 192 { 193 // calculate the pattern of the parent 194 parentPattern = pattern.substring( 0, lastIndex ); 195 } 196 197 // we keep the list of universal matches separate 198 List<Rule> universalList = new ArrayList<Rule>( counter ); 199 200 // Universal wildcards ('*') in the middle of the pattern-string 201 List<Rule> recList = null; 202 // temporary parentPattern 203 // we don't want to change anything.... 204 String tempParentPattern = parentPattern; 205 int parentLastIndex = tempParentPattern.lastIndexOf( '/' ); 206 // look for pattern. Here, we search the whole 207 // parent. Not ideal, but does the thing.... 208 while ( parentLastIndex > -1 && recList == null ) 209 { 210 recList = this.cache.get( tempParentPattern + "/*/" + pattern.substring( lastIndex + 1 ) ); 211 if ( recList != null ) 212 { 213 // when /*/-pattern-string is found, add method 214 // list to universalList. 215 // Digester will do the rest 216 universalList.addAll( recList ); 217 } 218 else 219 { 220 // if not, shorten tempParent to move /*/ one position 221 // to the left. 222 // as last part of patttern is always added 223 // we make sure pattern is allowed anywhere. 224 tempParentPattern = parentPattern.substring( 0, parentLastIndex ); 225 } 226 227 parentLastIndex = tempParentPattern.lastIndexOf( '/' ); 228 } 229 230 // Universal all wildards ('!*') 231 // These are always matched so always add them 232 List<Rule> tempList = this.cache.get( "!*" ); 233 if ( tempList != null ) 234 { 235 universalList.addAll( tempList ); 236 } 237 238 // Universal exact parent match 239 // need to get this now since only wildcards are considered later 240 tempList = this.cache.get( "!" + parentPattern + "/?" ); 241 if ( tempList != null ) 242 { 243 universalList.addAll( tempList ); 244 } 245 246 // base behaviour means that if we certain matches, we don't continue 247 // but we just have a single combined loop and so we have to set 248 // a variable 249 boolean ignoreBasicMatches = false; 250 251 // see if we have an exact basic pattern match 252 List<Rule> rulesList = this.cache.get( pattern ); 253 if ( rulesList != null ) 254 { 255 // we have a match! 256 // so ignore all basic matches from now on 257 ignoreBasicMatches = true; 258 259 } 260 else 261 { 262 263 // see if we have an exact child match 264 if ( hasParent ) 265 { 266 // matching children takes preference 267 rulesList = this.cache.get( parentPattern + "/?" ); 268 if ( rulesList != null ) 269 { 270 // we have a match! 271 // so ignore all basic matches from now on 272 ignoreBasicMatches = true; 273 274 } 275 else 276 { 277 // we don't have a match yet - so try exact ancester 278 // 279 rulesList = findExactAncesterMatch( pattern ); 280 if ( rulesList != null ) 281 { 282 // we have a match! 283 // so ignore all basic matches from now on 284 ignoreBasicMatches = true; 285 } 286 } 287 } 288 } 289 290 // OK - we're ready for the big loop! 291 // Unlike the basic rules case, 292 // we have to go through for all those universal rules in all cases. 293 294 // Find the longest key, ie more discriminant 295 int longKeyLength = 0; 296 297 for ( String key : this.cache.keySet() ) 298 { 299 300 // find out if it's a univeral pattern 301 // set a flag 302 boolean isUniversal = key.startsWith( "!" ); 303 if ( isUniversal ) 304 { 305 // and find the underlying key 306 key = key.substring( 1, key.length() ); 307 } 308 309 // don't need to check exact matches 310 boolean wildcardMatchStart = key.startsWith( "*/" ); 311 boolean wildcardMatchEnd = key.endsWith( "/*" ); 312 if ( wildcardMatchStart || ( isUniversal && wildcardMatchEnd ) ) 313 { 314 315 boolean parentMatched = false; 316 boolean basicMatched = false; 317 boolean ancesterMatched = false; 318 319 boolean parentMatchEnd = key.endsWith( "/?" ); 320 if ( parentMatchEnd ) 321 { 322 // try for a parent match 323 parentMatched = parentMatch( key, parentPattern ); 324 325 } 326 else if ( wildcardMatchEnd ) 327 { 328 // check for ancester match 329 if ( wildcardMatchStart ) 330 { 331 String patternBody = key.substring( 2, key.length() - 2 ); 332 if ( pattern.endsWith( patternBody ) ) 333 { 334 ancesterMatched = true; 335 } 336 else 337 { 338 ancesterMatched = ( pattern.indexOf( patternBody + "/" ) > -1 ); 339 } 340 } 341 else 342 { 343 String bodyPattern = key.substring( 0, key.length() - 2 ); 344 if ( pattern.startsWith( bodyPattern ) ) 345 { 346 if ( pattern.length() == bodyPattern.length() ) 347 { 348 // exact match 349 ancesterMatched = true; 350 } 351 else 352 { 353 ancesterMatched = ( pattern.charAt( bodyPattern.length() ) == '/' ); 354 } 355 } 356 else 357 { 358 ancesterMatched = false; 359 } 360 } 361 } 362 else 363 { 364 // try for a base match 365 basicMatched = basicMatch( key, pattern ); 366 } 367 368 if ( parentMatched || basicMatched || ancesterMatched ) 369 { 370 if ( isUniversal ) 371 { 372 // universal rules go straight in 373 // (no longest matching rule) 374 tempList = this.cache.get( "!" + key ); 375 if ( tempList != null ) 376 { 377 universalList.addAll( tempList ); 378 } 379 380 } 381 else 382 { 383 if ( !ignoreBasicMatches ) 384 { 385 // ensure that all parent matches are SHORTER 386 // than rules with same level of matching. 387 // 388 // the calculations below don't work for universal 389 // matching, but we don't care because in that case 390 // this if-stmt is not entered. 391 int keyLength = key.length(); 392 if ( wildcardMatchStart ) 393 { 394 --keyLength; 395 } 396 if ( wildcardMatchEnd ) 397 { 398 --keyLength; 399 } 400 else if ( parentMatchEnd ) 401 { 402 --keyLength; 403 } 404 405 if ( keyLength > longKeyLength ) 406 { 407 rulesList = this.cache.get( key ); 408 longKeyLength = keyLength; 409 } 410 } 411 } 412 } 413 } 414 } 415 416 // '*' works in practice as a default matching 417 // (this is because anything is a deeper match!) 418 if ( rulesList == null ) 419 { 420 rulesList = this.cache.get( "*" ); 421 } 422 423 // if we've matched a basic pattern, then add to the universal list 424 if ( rulesList != null ) 425 { 426 universalList.addAll( rulesList ); 427 } 428 429 // don't filter if namespace is null 430 if ( namespaceURI != null ) 431 { 432 // remove invalid namespaces 433 Iterator<Rule> it = universalList.iterator(); 434 while ( it.hasNext() ) 435 { 436 Rule rule = it.next(); 437 String nsUri = rule.getNamespaceURI(); 438 if ( nsUri != null && !nsUri.equals( namespaceURI ) ) 439 { 440 it.remove(); 441 } 442 } 443 } 444 445 // need to make sure that the collection is sort in the order 446 // of addition. We use a custom comparator for this 447 Collections.sort( universalList, new Comparator<Rule>() 448 { 449 450 public int compare( Rule r1, Rule r2 ) 451 { 452 // Get the entry order from the map 453 Integer i1 = order.get( r1 ); 454 Integer i2 = order.get( r2 ); 455 456 // and use that to perform the comparison 457 if ( i1 == null ) 458 { 459 if ( i2 == null ) 460 { 461 462 return 0; 463 464 } 465 return -1; 466 } 467 else if ( i2 == null ) 468 { 469 return 1; 470 } 471 472 return ( i1.intValue() - i2.intValue() ); 473 } 474 } ); 475 476 return universalList; 477 } 478 479 /** 480 * Checks the input parentPattern contains the input key at the end. 481 * 482 * @param key The key to be found 483 * @param parentPattern The pattern where looking for the key 484 * @return true, if {@code key} is found inside {@code parentPattern}, false otherwise 485 */ 486 private boolean parentMatch( String key, String parentPattern ) 487 { 488 return parentPattern.endsWith( key.substring( 1, key.length() - 2 ) ); 489 } 490 491 /** 492 * Standard match. Matches the end of the pattern to the key. 493 * 494 * @param key The key to be found 495 * @param pattern The pattern where looking for the key 496 * @return true, if {@code key} is found inside {@code pattern}, false otherwise 497 */ 498 private boolean basicMatch( String key, String pattern ) 499 { 500 return ( pattern.equals( key.substring( 2 ) ) || pattern.endsWith( key.substring( 1 ) ) ); 501 } 502 503 /** 504 * Finds an exact ancester match for given pattern 505 * 506 * @param parentPattern The input pattern 507 * @return A list of {@code Rule} related to the input pattern 508 */ 509 private List<Rule> findExactAncesterMatch( String parentPattern ) 510 { 511 List<Rule> matchingRules = null; 512 int lastIndex = parentPattern.length(); 513 while ( lastIndex-- > 0 ) 514 { 515 lastIndex = parentPattern.lastIndexOf( '/', lastIndex ); 516 if ( lastIndex > 0 ) 517 { 518 matchingRules = this.cache.get( parentPattern.substring( 0, lastIndex ) + "/*" ); 519 if ( matchingRules != null ) 520 { 521 return matchingRules; 522 } 523 } 524 } 525 return null; 526 } 527 528}