View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.vfs2.util;
18  
19  import java.util.BitSet;
20  
21  import org.apache.commons.lang3.util.FluentBitSet;
22  import org.apache.commons.vfs2.provider.GenericURLFileName;
23  
24  /**
25   * Internal URI encoding {@link BitSet} definitions.
26   * <p>
27   * This was forked from the {@link BitSet}s in {@code org.apache.commons.httpclient.URI}, in order to not be dependent
28   * on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s, but it should work with any different
29   * HTTP backend provider implementations.
30   * </p>
31   */
32  final class URIBitSets {
33  
34      /**
35       * The percent "%" character always has the reserved purpose of being the escape indicator, it must be escaped as "%25"
36       * in order to be used as data within a URI.
37       */
38      static final FluentBitSet PERCENT = bitSet('%');
39  
40      /**
41       * BitSet for digit.
42       * <p>
43       * <blockquote>
44       *
45       * <pre>
46       * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
47       * </pre>
48       *
49       * </blockquote>
50       * </p>
51       */
52      static final FluentBitSet DIGIT = bitSet().setInclusive('0', '9');
53  
54      /**
55       * BitSet for alpha.
56       * <p>
57       * <blockquote>
58       *
59       * <pre>
60       * alpha = lowalpha | upalpha
61       * </pre>
62       *
63       * </blockquote>
64       * </p>
65       */
66      static final FluentBitSet ALPHA = bitSet().setInclusive('a', 'z').setInclusive('A', 'Z');
67  
68      /**
69       * BitSet for alphanum (join of alpha &amp; digit).
70       * <p>
71       * <blockquote>
72       *
73       * <pre>
74       * alphanum = alpha | digit
75       * </pre>
76       *
77       * </blockquote>
78       * </p>
79       */
80      static final FluentBitSet ALPHANUM = bitSet().or(ALPHA, DIGIT);
81  
82      /**
83       * BitSet for hex.
84       * <p>
85       * <blockquote>
86       *
87       * <pre>
88       * hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | "a" | "b" | "c" | "d" | "e" | "f"
89       * </pre>
90       *
91       * </blockquote>
92       * </p>
93       */
94      static final FluentBitSet HEX = bitSet().or(DIGIT).setInclusive('a', 'f').setInclusive('A', 'F');
95  
96      /**
97       * BitSet for escaped.
98       * <p>
99       * <blockquote>
100      *
101      * <pre>
102      * escaped       = "%" hex hex
103      * </pre>
104      *
105      * </blockquote>
106      * </p>
107      */
108     static final FluentBitSet ESCAPED = bitSet().or(PERCENT, HEX);
109 
110     /**
111      * BitSet for mark.
112      * <p>
113      * <blockquote>
114      *
115      * <pre>
116      * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
117      * </pre>
118      *
119      * </blockquote>
120      * </p>
121      */
122     static final FluentBitSet MARK = bitSet('-', '_', '.', '!', '~', '*', '\'', '(', ')');
123 
124     /**
125      * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved.
126      * <p>
127      * <blockquote>
128      *
129      * <pre>
130      * unreserved = alphanum | mark
131      * </pre>
132      *
133      * </blockquote>
134      * </p>
135      */
136     static final FluentBitSet UNRESERVED = bitSet().or(ALPHANUM, MARK);
137 
138     /**
139      * BitSet for reserved.
140      * <p>
141      * <blockquote>
142      *
143      * <pre>
144      * reserved = ";" | "/" | "?" | ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
145      * </pre>
146      *
147      * </blockquote>
148      * </p>
149      */
150     static final FluentBitSet RESERVED = bitSet(';', '/', '?', ':', '@', '&', '=', '+', '$', ',');
151 
152     /**
153      * BitSet for uric.
154      * <p>
155      * <blockquote>
156      *
157      * <pre>
158      * uric = reserved | unreserved | escaped
159      * </pre>
160      *
161      * </blockquote>
162      * </p>
163      */
164     static final FluentBitSet URIC = bitSet().or(RESERVED, UNRESERVED, ESCAPED);
165 
166     /**
167      * BitSet for fragment (alias for uric).
168      * <p>
169      * <blockquote>
170      *
171      * <pre>
172      * fragment      = *uric
173      * </pre>
174      *
175      * </blockquote>
176      * </p>
177      */
178     static final FluentBitSet FRAGMENT = URIC;
179 
180     /**
181      * BitSet for query (alias for uric).
182      * <p>
183      * <blockquote>
184      *
185      * <pre>
186      * query         = *uric
187      * </pre>
188      *
189      * </blockquote>
190      * </p>
191      */
192     static final FluentBitSet QUERY = URIC;
193 
194     /**
195      * BitSet for pchar.
196      * <p>
197      * <blockquote>
198      *
199      * <pre>
200      * pchar = unreserved | escaped | ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
201      * </pre>
202      *
203      * </blockquote>
204      * </p>
205      */
206     static final FluentBitSet PCHAR = bitSet(':', '@', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
207 
208     /**
209      * BitSet for param (alias for pchar).
210      * <p>
211      * <blockquote>
212      *
213      * <pre>
214      * param         = *pchar
215      * </pre>
216      *
217      * </blockquote>
218      * </p>
219      */
220     static final FluentBitSet PARAM = PCHAR;
221 
222     /**
223      * BitSet for segment.
224      * <p>
225      * <blockquote>
226      *
227      * <pre>
228      * segment       = *pchar *( ";" param )
229      * </pre>
230      *
231      * </blockquote>
232      * </p>
233      */
234     static final FluentBitSet SEGMENT = bitSet(';').or(PCHAR, PARAM);
235 
236     /**
237      * BitSet for path segments.
238      * <p>
239      * <blockquote>
240      *
241      * <pre>
242      * path_segments = segment *( "/" segment )
243      * </pre>
244      *
245      * </blockquote>
246      * </p>
247      */
248     static final FluentBitSet PATH_SEGMENTS = bitSet('/').or(SEGMENT);
249 
250     /**
251      * URI absolute path.
252      * <p>
253      * <blockquote>
254      *
255      * <pre>
256      * abs_path      = "/"  path_segments
257      * </pre>
258      *
259      * </blockquote>
260      * </p>
261      */
262     static final FluentBitSet ABS_PATH = bitSet('/').or(PATH_SEGMENTS);
263 
264     /**
265      * URI bitset for encoding typical non-slash characters.
266      * <p>
267      * <blockquote>
268      *
269      * <pre>
270      * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
271      * </pre>
272      *
273      * </blockquote>
274      * </p>
275      */
276     static final FluentBitSet URIC_NO_SLASH = bitSet(';', '?', ';', '@', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
277 
278     /**
279      * URI bitset that combines uric_no_slash and uric.
280      * <p>
281      * <blockquote>
282      *
283      * <pre>
284      * opaque_part = uric_no_slash * uric
285      * </pre>
286      *
287      * </blockquote>
288      * </p>
289      */
290     static final FluentBitSet OPAQUE_PART = bitSet().or(URIC_NO_SLASH, URIC);
291 
292     /**
293      * URI bitset that combines absolute path and opaque part.
294      * <p>
295      * <blockquote>
296      *
297      * <pre>
298      * path          = [ abs_path | opaque_part ]
299      * </pre>
300      *
301      * </blockquote>
302      * </p>
303      */
304     static final FluentBitSet PATH = bitSet().or(ABS_PATH, OPAQUE_PART);
305 
306     /**
307      * Port, a logical alias for digit.
308      */
309     static final FluentBitSet PORT = DIGIT;
310 
311     /**
312      * Bitset that combines digit and dot fo IPv$address.
313      * <p>
314      * <blockquote>
315      *
316      * <pre>
317      * IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
318      * </pre>
319      *
320      * </blockquote>
321      * </p>
322      */
323     static final FluentBitSet IPV4ADDRESS = bitSet('.').or(DIGIT);
324 
325     /**
326      * RFC 2373.
327      * <p>
328      * <blockquote>
329      *
330      * <pre>
331      * IPv6address = hexpart [ ":" IPv4address ]
332      * </pre>
333      *
334      * </blockquote>
335      * </p>
336      */
337     static final FluentBitSet IPV6ADDRESS = bitSet(':').or(HEX, IPV4ADDRESS);
338 
339     /**
340      * RFC 2732, 2373.
341      * <p>
342      * <blockquote>
343      *
344      * <pre>
345      * IPv6reference   = "[" IPv6address "]"
346      * </pre>
347      *
348      * </blockquote>
349      * </p>
350      */
351     static final FluentBitSet IPV6REFERENCE = bitSet('[', ']').or(IPV6ADDRESS);
352 
353     /**
354      * BitSet for toplabel.
355      * <p>
356      * <blockquote>
357      *
358      * <pre>
359      * toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
360      * </pre>
361      *
362      * </blockquote>
363      * </p>
364      */
365     static final FluentBitSet TOPLABEL = bitSet('-').or(ALPHANUM);
366 
367     /**
368      * BitSet for domainlabel.
369      * <p>
370      * <blockquote>
371      *
372      * <pre>
373      * domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
374      * </pre>
375      *
376      * </blockquote>
377      * </p>
378      */
379     static final FluentBitSet DOMAINLABEL = TOPLABEL;
380 
381     /**
382      * BitSet for hostname.
383      * <p>
384      * <blockquote>
385      *
386      * <pre>
387      * hostname      = *( domainlabel "." ) toplabel [ "." ]
388      * </pre>
389      *
390      * </blockquote>
391      * </p>
392      */
393     static final FluentBitSet HOSTNAME = bitSet('.').or(TOPLABEL);
394 
395     /**
396      * BitSet for host.
397      * <p>
398      * <blockquote>
399      *
400      * <pre>
401      * host = hostname | IPv4address | IPv6reference
402      * </pre>
403      *
404      * </blockquote>
405      * </p>
406      */
407     static final FluentBitSet HOST = bitSet().or(HOSTNAME, IPV6REFERENCE);
408 
409     // Static initializer for host
410 //    static {
411 //        HOST.or(HOSTNAME);
412 //        // host.or(IPv4address);
413 //        HOST.or(IPV6REFERENCE); // IPv4address
414 //    }
415     /**
416      * BitSet for hostport.
417      * <p>
418      * <blockquote>
419      *
420      * <pre>
421      * hostport      = host [ ":" port ]
422      * </pre>
423      *
424      * </blockquote>
425      * </p>
426      */
427     static final FluentBitSet HOSTPORT = bitSet(':').or(HOST, PORT);
428 
429     /**
430      * Bitset for userinfo.
431      * <p>
432      * <blockquote>
433      *
434      * <pre>
435      * userinfo      = *( unreserved | escaped |
436      *                    ";" | ":" | "&amp;" | "=" | "+" | "$" | "," )
437      * </pre>
438      *
439      * </blockquote>
440      * </p>
441      */
442     static final FluentBitSet USERINFO = bitSet(';', ':', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
443 
444     /**
445      * BitSet for within the userinfo component like user and password.
446      */
447     static final FluentBitSet WITHIN_USERRINFO = bitSet(';', ':', '@', '?', '/').or(USERINFO);
448 
449     /**
450      * Bitset for server.
451      * <p>
452      * <blockquote>
453      *
454      * <pre>
455      * server        = [ [ userinfo "@" ] hostport ]
456      * </pre>
457      *
458      * </blockquote>
459      * </p>
460      */
461     static final FluentBitSet SERVER = bitSet('@').or(USERINFO, HOSTPORT);
462 
463     /**
464      * BitSet for reg_name.
465      * <p>
466      * <blockquote>
467      *
468      * <pre>
469      * reg_name = 1 * (unreserved | escaped | "$" | "," | ";" | ":" | "@" | "&amp;" | "=" | "+")
470      * </pre>
471      *
472      * </blockquote>
473      * </p>
474      */
475     static final FluentBitSet REG_NAME = bitSet('$', ',', ';', ':', '@', '&', '=', '+').or(UNRESERVED, ESCAPED);
476 
477     /**
478      * BitSet for authority.
479      * <p>
480      * <blockquote>
481      *
482      * <pre>
483      * authority = server | reg_name
484      * </pre>
485      *
486      * </blockquote>
487      * </p>
488      */
489     static final FluentBitSet AUTHORITY = bitSet().or(SERVER, REG_NAME);
490 
491     /**
492      * BitSet for scheme.
493      * <p>
494      * <blockquote>
495      *
496      * <pre>
497      * scheme = alpha * (alpha | digit | "+" | "-" | ".")
498      * </pre>
499      *
500      * </blockquote>
501      * </p>
502      */
503     static final FluentBitSet SCHEME = bitSet('+', '-', '.').or(ALPHA, DIGIT);
504 
505     /**
506      * BitSet for rel_segment.
507      * <p>
508      * <blockquote>
509      *
510      * <pre>
511      * rel_segment = 1 * (unreserved | escaped | ";" | "@" | "&amp;" | "=" | "+" | "$" | ",")
512      * </pre>
513      *
514      * </blockquote>
515      * </p>
516      */
517     static final FluentBitSet REL_SEGMENT = bitSet(';', '@', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
518 
519     /**
520      * BitSet for rel_path.
521      * <p>
522      * <blockquote>
523      *
524      * <pre>
525      * rel_path = rel_segment[abs_path]
526      * </pre>
527      *
528      * </blockquote>
529      * </p>
530      */
531     static final FluentBitSet REL_PATH = bitSet().or(REL_SEGMENT, ABS_PATH);
532 
533     /**
534      * BitSet for net_path.
535      * <p>
536      * <blockquote>
537      *
538      * <pre>
539      * net_path      = "//" authority [ abs_path ]
540      * </pre>
541      *
542      * </blockquote>
543      * </p>
544      */
545     static final FluentBitSet NET_PATH = bitSet('/').or(AUTHORITY, ABS_PATH);
546 
547     /**
548      * BitSet for hier_part.
549      * <p>
550      * <blockquote>
551      *
552      * <pre>
553      * hier_part     = ( net_path | abs_path ) [ "?" query ]
554      * </pre>
555      *
556      * </blockquote>
557      * </p>
558      */
559     // hier_part.set('?'); already included
560     static final FluentBitSet HIER_PART = bitSet().or(NET_PATH, ABS_PATH, QUERY);
561 
562     /**
563      * BitSet for relativeURI.
564      * <p>
565      * <blockquote>
566      *
567      * <pre>
568      * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
569      * </pre>
570      *
571      * </blockquote>
572      * </p>
573      */
574     // relativeURI.set('?'); already included
575     static final FluentBitSet RELATIVEURI = bitSet().or(NET_PATH, ABS_PATH, REL_PATH, QUERY);
576 
577     /**
578      * BitSet for absoluteURI.
579      * <p>
580      * <blockquote>
581      *
582      * <pre>
583      * absoluteURI   = scheme ":" ( hier_part | opaque_part )
584      * </pre>
585      *
586      * </blockquote>
587      * </p>
588      */
589     static final FluentBitSet ABSOLUTEURI = bitSet(':').or(SCHEME, HIER_PART, OPAQUE_PART);
590 
591     /**
592      * BitSet for URI-reference.
593      * <p>
594      * <blockquote>
595      *
596      * <pre>
597      * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
598      * </pre>
599      *
600      * </blockquote>
601      * </p>
602      */
603     static final FluentBitSet URI_REFERENCE = bitSet('#').or(ABSOLUTEURI, RELATIVEURI, FRAGMENT);
604 
605     // Characters disallowed within the URI syntax
606     // Excluded US-ASCII Characters are like control, space, delims and unwise
607 
608     /**
609      * BitSet for space.
610      */
611     static final FluentBitSet SPACE = bitSet(0x20);
612 
613     /**
614      * BitSet for delims.
615      */
616     static final FluentBitSet DELIMS = bitSet('<', '>', '#', '%', '"');
617 
618     /**
619      * BitSet for unwise.
620      */
621     static final FluentBitSet UNWISE = bitSet('{', '}', '|', '\\', '^', '[', ']', '`');
622 
623     /**
624      * Disallowed rel_path before escaping.
625      */
626     static final FluentBitSet DISALLOWED_REL_PATH = bitSet().or(URIC).andNot(REL_PATH);
627 
628     /**
629      * Disallowed opaque_part before escaping.
630      */
631     static final FluentBitSet DISALLOWED_OPAQUE_PART = bitSet().or(URIC).andNot(OPAQUE_PART);
632 
633     /**
634      * Those characters that are allowed for the authority component.
635      */
636     static final FluentBitSet ALLOWED_AUTHORITY = bitSet().or(AUTHORITY).clear('%');
637 
638     // Characters allowed within and for each component
639 
640     /**
641      * Those characters that are allowed for the opaque_part.
642      */
643     static final FluentBitSet ALLOWED_OPAQUE_PART = bitSet().or(OPAQUE_PART).clear('%');
644 
645     /**
646      * Those characters that are allowed for the reg_name.
647      */
648     // allowed_reg_name.andNot(percent);
649     static final FluentBitSet ALLOWED_REG_NAME = bitSet().or(REG_NAME).clear('%');
650 
651     /**
652      * Those characters that are allowed for the userinfo component.
653      */
654     // allowed_userinfo.andNot(percent);
655     static final FluentBitSet ALLOWED_USER_INFO = bitSet().or(USERINFO).clear('%');
656 
657     /**
658      * Those characters that are allowed for within the userinfo component.
659      */
660     static final FluentBitSet ALLOWED_WITHIN_USERINFO = bitSet().or(WITHIN_USERRINFO).clear('%');
661 
662     /**
663      * Those characters that are allowed for the IPv6reference component. The characters '[', ']' in IPv6reference should be
664      * excluded.
665      */
666     // allowed_IPv6reference.andNot(unwise);
667     static final FluentBitSet ALLOWED_IPV6REFERENCE = bitSet().or(IPV6REFERENCE).clear('[', ']');
668 
669     /**
670      * Those characters that are allowed for the host component. The characters '[', ']' in IPv6reference should be
671      * excluded.
672      */
673     static final FluentBitSet ALLOWED_HOST = bitSet().or(HOSTNAME, ALLOWED_IPV6REFERENCE);
674 
675     /**
676      * Those characters that are allowed for the authority component.
677      */
678     static final FluentBitSet ALLOWED_WITHIN_AUTHORITY = bitSet().or(SERVER, REG_NAME).clear(';', ':', '@', '?', '/');
679 
680     /**
681      * Those characters that are allowed for the abs_path.
682      */
683     // allowed_abs_path.set('/'); // already included
684     static final FluentBitSet ALLOWED_ABS_PATH = bitSet().or(ABS_PATH).andNot(PERCENT).clear('+');
685 
686     /**
687      * Those characters that are allowed for the rel_path.
688      */
689     static final FluentBitSet ALLOWED_REL_PATH = bitSet().or(REL_PATH).clear('%', '+');
690 
691     /**
692      * Those characters that are allowed within the path.
693      */
694     static final FluentBitSet ALLOWED_WITHIN_PATH = bitSet().or(ABS_PATH).clear('/', ';', '=', '?');
695 
696     /**
697      * Those characters that are allowed for the query component.
698      */
699     static final FluentBitSet ALLOWED_QUERY = bitSet().or(URIC).clear('%');
700 
701     /**
702      * Those characters that are allowed within the query component.
703      */
704     // excluded 'reserved'
705     static final FluentBitSet ALLOWED_WITHIN_QUERY = bitSet().or(ALLOWED_QUERY).andNot(RESERVED);
706 
707     /**
708      * Those characters that are allowed for the fragment component.
709      */
710     static final FluentBitSet ALLOWED_FRAGMENT = bitSet().or(URIC).clear('%');
711 
712     /**
713      * BitSet for control.
714      */
715     private static final int CHARACTER_DEL = 0x7F;
716     private static final int CHARACTER_US = 0x1F;
717     static final FluentBitSet CONTROL = bitSet().setInclusive(0, CHARACTER_US).set(CHARACTER_DEL);
718 
719     private static final int NBITS = 256;
720 
721     static FluentBitSet bitSet() {
722         return new FluentBitSet(NBITS);
723     }
724 
725     private static FluentBitSet bitSet(final int... bitIndexArray) {
726         return bitSet().set(bitIndexArray);
727     }
728 
729     private URIBitSets() {
730     }
731 
732 }