1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.commons.vfs2.util;
18
19 import java.util.BitSet;
20
21 import org.apache.commons.lang3.util.FluentBitSet;
22 import org.apache.commons.vfs2.provider.GenericURLFileName;
23
24 /**
25 * Internal URI encoding {@link BitSet} definitions.
26 * <p>
27 * This was forked from the {@link BitSet}s in {@code org.apache.commons.httpclient.URI}, in order to not be dependent
28 * on HttpClient v3 API, when generating and handling {@link GenericURLFileName}s, but it should work with any different
29 * HTTP backend provider implementations.
30 * </p>
31 */
32 final class URIBitSets {
33
34 /**
35 * The percent "%" character always has the reserved purpose of being the escape indicator, it must be escaped as "%25"
36 * in order to be used as data within a URI.
37 */
38 static final FluentBitSet PERCENT = bitSet('%');
39
40 /**
41 * BitSet for digit.
42 * <p>
43 * <blockquote>
44 *
45 * <pre>
46 * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
47 * </pre>
48 *
49 * </blockquote>
50 * </p>
51 */
52 static final FluentBitSet DIGIT = bitSet().setInclusive('0', '9');
53
54 /**
55 * BitSet for alpha.
56 * <p>
57 * <blockquote>
58 *
59 * <pre>
60 * alpha = lowalpha | upalpha
61 * </pre>
62 *
63 * </blockquote>
64 * </p>
65 */
66 static final FluentBitSet ALPHA = bitSet().setInclusive('a', 'z').setInclusive('A', 'Z');
67
68 /**
69 * BitSet for alphanum (join of alpha & digit).
70 * <p>
71 * <blockquote>
72 *
73 * <pre>
74 * alphanum = alpha | digit
75 * </pre>
76 *
77 * </blockquote>
78 * </p>
79 */
80 static final FluentBitSet ALPHANUM = bitSet().or(ALPHA, DIGIT);
81
82 /**
83 * BitSet for hex.
84 * <p>
85 * <blockquote>
86 *
87 * <pre>
88 * hex = digit | "A" | "B" | "C" | "D" | "E" | "F" | "a" | "b" | "c" | "d" | "e" | "f"
89 * </pre>
90 *
91 * </blockquote>
92 * </p>
93 */
94 static final FluentBitSet HEX = bitSet().or(DIGIT).setInclusive('a', 'f').setInclusive('A', 'F');
95
96 /**
97 * BitSet for escaped.
98 * <p>
99 * <blockquote>
100 *
101 * <pre>
102 * escaped = "%" hex hex
103 * </pre>
104 *
105 * </blockquote>
106 * </p>
107 */
108 static final FluentBitSet ESCAPED = bitSet().or(PERCENT, HEX);
109
110 /**
111 * BitSet for mark.
112 * <p>
113 * <blockquote>
114 *
115 * <pre>
116 * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
117 * </pre>
118 *
119 * </blockquote>
120 * </p>
121 */
122 static final FluentBitSet MARK = bitSet('-', '_', '.', '!', '~', '*', '\'', '(', ')');
123
124 /**
125 * Data characters that are allowed in a URI but do not have a reserved purpose are called unreserved.
126 * <p>
127 * <blockquote>
128 *
129 * <pre>
130 * unreserved = alphanum | mark
131 * </pre>
132 *
133 * </blockquote>
134 * </p>
135 */
136 static final FluentBitSet UNRESERVED = bitSet().or(ALPHANUM, MARK);
137
138 /**
139 * BitSet for reserved.
140 * <p>
141 * <blockquote>
142 *
143 * <pre>
144 * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
145 * </pre>
146 *
147 * </blockquote>
148 * </p>
149 */
150 static final FluentBitSet RESERVED = bitSet(';', '/', '?', ':', '@', '&', '=', '+', '$', ',');
151
152 /**
153 * BitSet for uric.
154 * <p>
155 * <blockquote>
156 *
157 * <pre>
158 * uric = reserved | unreserved | escaped
159 * </pre>
160 *
161 * </blockquote>
162 * </p>
163 */
164 static final FluentBitSet URIC = bitSet().or(RESERVED, UNRESERVED, ESCAPED);
165
166 /**
167 * BitSet for fragment (alias for uric).
168 * <p>
169 * <blockquote>
170 *
171 * <pre>
172 * fragment = *uric
173 * </pre>
174 *
175 * </blockquote>
176 * </p>
177 */
178 static final FluentBitSet FRAGMENT = URIC;
179
180 /**
181 * BitSet for query (alias for uric).
182 * <p>
183 * <blockquote>
184 *
185 * <pre>
186 * query = *uric
187 * </pre>
188 *
189 * </blockquote>
190 * </p>
191 */
192 static final FluentBitSet QUERY = URIC;
193
194 /**
195 * BitSet for pchar.
196 * <p>
197 * <blockquote>
198 *
199 * <pre>
200 * pchar = unreserved | escaped | ":" | "@" | "&" | "=" | "+" | "$" | ","
201 * </pre>
202 *
203 * </blockquote>
204 * </p>
205 */
206 static final FluentBitSet PCHAR = bitSet(':', '@', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
207
208 /**
209 * BitSet for param (alias for pchar).
210 * <p>
211 * <blockquote>
212 *
213 * <pre>
214 * param = *pchar
215 * </pre>
216 *
217 * </blockquote>
218 * </p>
219 */
220 static final FluentBitSet PARAM = PCHAR;
221
222 /**
223 * BitSet for segment.
224 * <p>
225 * <blockquote>
226 *
227 * <pre>
228 * segment = *pchar *( ";" param )
229 * </pre>
230 *
231 * </blockquote>
232 * </p>
233 */
234 static final FluentBitSet SEGMENT = bitSet(';').or(PCHAR, PARAM);
235
236 /**
237 * BitSet for path segments.
238 * <p>
239 * <blockquote>
240 *
241 * <pre>
242 * path_segments = segment *( "/" segment )
243 * </pre>
244 *
245 * </blockquote>
246 * </p>
247 */
248 static final FluentBitSet PATH_SEGMENTS = bitSet('/').or(SEGMENT);
249
250 /**
251 * URI absolute path.
252 * <p>
253 * <blockquote>
254 *
255 * <pre>
256 * abs_path = "/" path_segments
257 * </pre>
258 *
259 * </blockquote>
260 * </p>
261 */
262 static final FluentBitSet ABS_PATH = bitSet('/').or(PATH_SEGMENTS);
263
264 /**
265 * URI bitset for encoding typical non-slash characters.
266 * <p>
267 * <blockquote>
268 *
269 * <pre>
270 * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | ","
271 * </pre>
272 *
273 * </blockquote>
274 * </p>
275 */
276 static final FluentBitSet URIC_NO_SLASH = bitSet(';', '?', ';', '@', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
277
278 /**
279 * URI bitset that combines uric_no_slash and uric.
280 * <p>
281 * <blockquote>
282 *
283 * <pre>
284 * opaque_part = uric_no_slash * uric
285 * </pre>
286 *
287 * </blockquote>
288 * </p>
289 */
290 static final FluentBitSet OPAQUE_PART = bitSet().or(URIC_NO_SLASH, URIC);
291
292 /**
293 * URI bitset that combines absolute path and opaque part.
294 * <p>
295 * <blockquote>
296 *
297 * <pre>
298 * path = [ abs_path | opaque_part ]
299 * </pre>
300 *
301 * </blockquote>
302 * </p>
303 */
304 static final FluentBitSet PATH = bitSet().or(ABS_PATH, OPAQUE_PART);
305
306 /**
307 * Port, a logical alias for digit.
308 */
309 static final FluentBitSet PORT = DIGIT;
310
311 /**
312 * Bitset that combines digit and dot fo IPv$address.
313 * <p>
314 * <blockquote>
315 *
316 * <pre>
317 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
318 * </pre>
319 *
320 * </blockquote>
321 * </p>
322 */
323 static final FluentBitSet IPV4ADDRESS = bitSet('.').or(DIGIT);
324
325 /**
326 * RFC 2373.
327 * <p>
328 * <blockquote>
329 *
330 * <pre>
331 * IPv6address = hexpart [ ":" IPv4address ]
332 * </pre>
333 *
334 * </blockquote>
335 * </p>
336 */
337 static final FluentBitSet IPV6ADDRESS = bitSet(':').or(HEX, IPV4ADDRESS);
338
339 /**
340 * RFC 2732, 2373.
341 * <p>
342 * <blockquote>
343 *
344 * <pre>
345 * IPv6reference = "[" IPv6address "]"
346 * </pre>
347 *
348 * </blockquote>
349 * </p>
350 */
351 static final FluentBitSet IPV6REFERENCE = bitSet('[', ']').or(IPV6ADDRESS);
352
353 /**
354 * BitSet for toplabel.
355 * <p>
356 * <blockquote>
357 *
358 * <pre>
359 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
360 * </pre>
361 *
362 * </blockquote>
363 * </p>
364 */
365 static final FluentBitSet TOPLABEL = bitSet('-').or(ALPHANUM);
366
367 /**
368 * BitSet for domainlabel.
369 * <p>
370 * <blockquote>
371 *
372 * <pre>
373 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
374 * </pre>
375 *
376 * </blockquote>
377 * </p>
378 */
379 static final FluentBitSet DOMAINLABEL = TOPLABEL;
380
381 /**
382 * BitSet for hostname.
383 * <p>
384 * <blockquote>
385 *
386 * <pre>
387 * hostname = *( domainlabel "." ) toplabel [ "." ]
388 * </pre>
389 *
390 * </blockquote>
391 * </p>
392 */
393 static final FluentBitSet HOSTNAME = bitSet('.').or(TOPLABEL);
394
395 /**
396 * BitSet for host.
397 * <p>
398 * <blockquote>
399 *
400 * <pre>
401 * host = hostname | IPv4address | IPv6reference
402 * </pre>
403 *
404 * </blockquote>
405 * </p>
406 */
407 static final FluentBitSet HOST = bitSet().or(HOSTNAME, IPV6REFERENCE);
408
409 // Static initializer for host
410 // static {
411 // HOST.or(HOSTNAME);
412 // // host.or(IPv4address);
413 // HOST.or(IPV6REFERENCE); // IPv4address
414 // }
415 /**
416 * BitSet for hostport.
417 * <p>
418 * <blockquote>
419 *
420 * <pre>
421 * hostport = host [ ":" port ]
422 * </pre>
423 *
424 * </blockquote>
425 * </p>
426 */
427 static final FluentBitSet HOSTPORT = bitSet(':').or(HOST, PORT);
428
429 /**
430 * Bitset for userinfo.
431 * <p>
432 * <blockquote>
433 *
434 * <pre>
435 * userinfo = *( unreserved | escaped |
436 * ";" | ":" | "&" | "=" | "+" | "$" | "," )
437 * </pre>
438 *
439 * </blockquote>
440 * </p>
441 */
442 static final FluentBitSet USERINFO = bitSet(';', ':', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
443
444 /**
445 * BitSet for within the userinfo component like user and password.
446 */
447 static final FluentBitSet WITHIN_USERRINFO = bitSet(';', ':', '@', '?', '/').or(USERINFO);
448
449 /**
450 * Bitset for server.
451 * <p>
452 * <blockquote>
453 *
454 * <pre>
455 * server = [ [ userinfo "@" ] hostport ]
456 * </pre>
457 *
458 * </blockquote>
459 * </p>
460 */
461 static final FluentBitSet SERVER = bitSet('@').or(USERINFO, HOSTPORT);
462
463 /**
464 * BitSet for reg_name.
465 * <p>
466 * <blockquote>
467 *
468 * <pre>
469 * reg_name = 1 * (unreserved | escaped | "$" | "," | ";" | ":" | "@" | "&" | "=" | "+")
470 * </pre>
471 *
472 * </blockquote>
473 * </p>
474 */
475 static final FluentBitSet REG_NAME = bitSet('$', ',', ';', ':', '@', '&', '=', '+').or(UNRESERVED, ESCAPED);
476
477 /**
478 * BitSet for authority.
479 * <p>
480 * <blockquote>
481 *
482 * <pre>
483 * authority = server | reg_name
484 * </pre>
485 *
486 * </blockquote>
487 * </p>
488 */
489 static final FluentBitSet AUTHORITY = bitSet().or(SERVER, REG_NAME);
490
491 /**
492 * BitSet for scheme.
493 * <p>
494 * <blockquote>
495 *
496 * <pre>
497 * scheme = alpha * (alpha | digit | "+" | "-" | ".")
498 * </pre>
499 *
500 * </blockquote>
501 * </p>
502 */
503 static final FluentBitSet SCHEME = bitSet('+', '-', '.').or(ALPHA, DIGIT);
504
505 /**
506 * BitSet for rel_segment.
507 * <p>
508 * <blockquote>
509 *
510 * <pre>
511 * rel_segment = 1 * (unreserved | escaped | ";" | "@" | "&" | "=" | "+" | "$" | ",")
512 * </pre>
513 *
514 * </blockquote>
515 * </p>
516 */
517 static final FluentBitSet REL_SEGMENT = bitSet(';', '@', '&', '=', '+', '$', ',').or(UNRESERVED, ESCAPED);
518
519 /**
520 * BitSet for rel_path.
521 * <p>
522 * <blockquote>
523 *
524 * <pre>
525 * rel_path = rel_segment[abs_path]
526 * </pre>
527 *
528 * </blockquote>
529 * </p>
530 */
531 static final FluentBitSet REL_PATH = bitSet().or(REL_SEGMENT, ABS_PATH);
532
533 /**
534 * BitSet for net_path.
535 * <p>
536 * <blockquote>
537 *
538 * <pre>
539 * net_path = "//" authority [ abs_path ]
540 * </pre>
541 *
542 * </blockquote>
543 * </p>
544 */
545 static final FluentBitSet NET_PATH = bitSet('/').or(AUTHORITY, ABS_PATH);
546
547 /**
548 * BitSet for hier_part.
549 * <p>
550 * <blockquote>
551 *
552 * <pre>
553 * hier_part = ( net_path | abs_path ) [ "?" query ]
554 * </pre>
555 *
556 * </blockquote>
557 * </p>
558 */
559 // hier_part.set('?'); already included
560 static final FluentBitSet HIER_PART = bitSet().or(NET_PATH, ABS_PATH, QUERY);
561
562 /**
563 * BitSet for relativeURI.
564 * <p>
565 * <blockquote>
566 *
567 * <pre>
568 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
569 * </pre>
570 *
571 * </blockquote>
572 * </p>
573 */
574 // relativeURI.set('?'); already included
575 static final FluentBitSet RELATIVEURI = bitSet().or(NET_PATH, ABS_PATH, REL_PATH, QUERY);
576
577 /**
578 * BitSet for absoluteURI.
579 * <p>
580 * <blockquote>
581 *
582 * <pre>
583 * absoluteURI = scheme ":" ( hier_part | opaque_part )
584 * </pre>
585 *
586 * </blockquote>
587 * </p>
588 */
589 static final FluentBitSet ABSOLUTEURI = bitSet(':').or(SCHEME, HIER_PART, OPAQUE_PART);
590
591 /**
592 * BitSet for URI-reference.
593 * <p>
594 * <blockquote>
595 *
596 * <pre>
597 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
598 * </pre>
599 *
600 * </blockquote>
601 * </p>
602 */
603 static final FluentBitSet URI_REFERENCE = bitSet('#').or(ABSOLUTEURI, RELATIVEURI, FRAGMENT);
604
605 // Characters disallowed within the URI syntax
606 // Excluded US-ASCII Characters are like control, space, delims and unwise
607
608 /**
609 * BitSet for space.
610 */
611 static final FluentBitSet SPACE = bitSet(0x20);
612
613 /**
614 * BitSet for delims.
615 */
616 static final FluentBitSet DELIMS = bitSet('<', '>', '#', '%', '"');
617
618 /**
619 * BitSet for unwise.
620 */
621 static final FluentBitSet UNWISE = bitSet('{', '}', '|', '\\', '^', '[', ']', '`');
622
623 /**
624 * Disallowed rel_path before escaping.
625 */
626 static final FluentBitSet DISALLOWED_REL_PATH = bitSet().or(URIC).andNot(REL_PATH);
627
628 /**
629 * Disallowed opaque_part before escaping.
630 */
631 static final FluentBitSet DISALLOWED_OPAQUE_PART = bitSet().or(URIC).andNot(OPAQUE_PART);
632
633 /**
634 * Those characters that are allowed for the authority component.
635 */
636 static final FluentBitSet ALLOWED_AUTHORITY = bitSet().or(AUTHORITY).clear('%');
637
638 // Characters allowed within and for each component
639
640 /**
641 * Those characters that are allowed for the opaque_part.
642 */
643 static final FluentBitSet ALLOWED_OPAQUE_PART = bitSet().or(OPAQUE_PART).clear('%');
644
645 /**
646 * Those characters that are allowed for the reg_name.
647 */
648 // allowed_reg_name.andNot(percent);
649 static final FluentBitSet ALLOWED_REG_NAME = bitSet().or(REG_NAME).clear('%');
650
651 /**
652 * Those characters that are allowed for the userinfo component.
653 */
654 // allowed_userinfo.andNot(percent);
655 static final FluentBitSet ALLOWED_USER_INFO = bitSet().or(USERINFO).clear('%');
656
657 /**
658 * Those characters that are allowed for within the userinfo component.
659 */
660 static final FluentBitSet ALLOWED_WITHIN_USERINFO = bitSet().or(WITHIN_USERRINFO).clear('%');
661
662 /**
663 * Those characters that are allowed for the IPv6reference component. The characters '[', ']' in IPv6reference should be
664 * excluded.
665 */
666 // allowed_IPv6reference.andNot(unwise);
667 static final FluentBitSet ALLOWED_IPV6REFERENCE = bitSet().or(IPV6REFERENCE).clear('[', ']');
668
669 /**
670 * Those characters that are allowed for the host component. The characters '[', ']' in IPv6reference should be
671 * excluded.
672 */
673 static final FluentBitSet ALLOWED_HOST = bitSet().or(HOSTNAME, ALLOWED_IPV6REFERENCE);
674
675 /**
676 * Those characters that are allowed for the authority component.
677 */
678 static final FluentBitSet ALLOWED_WITHIN_AUTHORITY = bitSet().or(SERVER, REG_NAME).clear(';', ':', '@', '?', '/');
679
680 /**
681 * Those characters that are allowed for the abs_path.
682 */
683 // allowed_abs_path.set('/'); // already included
684 static final FluentBitSet ALLOWED_ABS_PATH = bitSet().or(ABS_PATH).andNot(PERCENT).clear('+');
685
686 /**
687 * Those characters that are allowed for the rel_path.
688 */
689 static final FluentBitSet ALLOWED_REL_PATH = bitSet().or(REL_PATH).clear('%', '+');
690
691 /**
692 * Those characters that are allowed within the path.
693 */
694 static final FluentBitSet ALLOWED_WITHIN_PATH = bitSet().or(ABS_PATH).clear('/', ';', '=', '?');
695
696 /**
697 * Those characters that are allowed for the query component.
698 */
699 static final FluentBitSet ALLOWED_QUERY = bitSet().or(URIC).clear('%');
700
701 /**
702 * Those characters that are allowed within the query component.
703 */
704 // excluded 'reserved'
705 static final FluentBitSet ALLOWED_WITHIN_QUERY = bitSet().or(ALLOWED_QUERY).andNot(RESERVED);
706
707 /**
708 * Those characters that are allowed for the fragment component.
709 */
710 static final FluentBitSet ALLOWED_FRAGMENT = bitSet().or(URIC).clear('%');
711
712 /**
713 * BitSet for control.
714 */
715 private static final int CHARACTER_DEL = 0x7F;
716 private static final int CHARACTER_US = 0x1F;
717 static final FluentBitSet CONTROL = bitSet().setInclusive(0, CHARACTER_US).set(CHARACTER_DEL);
718
719 private static final int NBITS = 256;
720
721 static FluentBitSet bitSet() {
722 return new FluentBitSet(NBITS);
723 }
724
725 private static FluentBitSet bitSet(final int... bitIndexArray) {
726 return bitSet().set(bitIndexArray);
727 }
728
729 private URIBitSets() {
730 }
731
732 }