1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 package org.apache.commons.httpclient;
31
32 import java.io.IOException;
33 import java.io.ObjectInputStream;
34 import java.io.ObjectOutputStream;
35 import java.io.Serializable;
36 import java.util.Locale;
37 import java.util.BitSet;
38 import java.util.Hashtable;
39
40 import org.apache.commons.codec.DecoderException;
41 import org.apache.commons.codec.net.URLCodec;
42 import org.apache.commons.httpclient.util.EncodingUtil;
43
44 /***
45 * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396.
 * This class supports parsing a URI reference, so that it can be extended
 * for any specific protocol, together with the character encoding of the
 * protocol to be transported and the charset of the document.
49 * <p>
50 * A URI is always in an "escaped" form, since escaping or unescaping a
51 * completed URI might change its semantics.
52 * <p>
53 * Implementers should be careful not to escape or unescape the same string
54 * more than once, since unescaping an already unescaped string might lead to
55 * misinterpreting a percent data character as another escaped character,
56 * or vice versa in the case of escaping an already escaped string.
57 * <p>
 * In order to avoid these problems, data types are used as follows:
59 * <p><blockquote><pre>
60 * URI character sequence: char
61 * octet sequence: byte
62 * original character sequence: String
63 * </pre></blockquote><p>
64 *
65 * So, a URI is a sequence of characters as an array of a char type, which
66 * is not always represented as a sequence of octets as an array of byte.
67 * <p>
68 *
69 * URI Syntactic Components
70 * <p><blockquote><pre>
71 * - In general, written as follows:
 *   Absolute URI = &lt;scheme&gt;:&lt;scheme-specific-part&gt;
 *   Generic URI = &lt;scheme&gt;://&lt;authority&gt;&lt;path&gt;?&lt;query&gt;
74 *
75 * - Syntax
76 * absoluteURI = scheme ":" ( hier_part | opaque_part )
77 * hier_part = ( net_path | abs_path ) [ "?" query ]
78 * net_path = "//" authority [ abs_path ]
79 * abs_path = "/" path_segments
80 * </pre></blockquote><p>
81 *
82 * The following examples illustrate URI that are in common use.
83 * <pre>
84 * ftp://ftp.is.co.za/rfc/rfc1808.txt
85 * -- ftp scheme for File Transfer Protocol services
86 * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
87 * -- gopher scheme for Gopher and Gopher+ Protocol services
88 * http://www.math.uio.no/faq/compression-faq/part1.html
89 * -- http scheme for Hypertext Transfer Protocol services
90 * mailto:mduerst@ifi.unizh.ch
91 * -- mailto scheme for electronic mail addresses
92 * news:comp.infosystems.www.servers.unix
93 * -- news scheme for USENET news groups and articles
94 * telnet://melvyl.ucop.edu/
95 * -- telnet scheme for interactive services via the TELNET Protocol
96 * </pre>
97 * Please, notice that there are many modifications from URL(RFC 1738) and
98 * relative URL(RFC 1808).
99 * <p>
100 * <b>The expressions for a URI</b>
101 * <p><pre>
102 * For escaped URI forms
103 * - URI(char[]) // constructor
104 * - char[] getRawXxx() // method
105 * - String getEscapedXxx() // method
106 * - String toString() // method
107 * <p>
108 * For unescaped URI forms
109 * - URI(String) // constructor
110 * - String getXXX() // method
111 * </pre><p>
112 *
113 * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
114 * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
115 * @version $Revision: 179784 $ $Date: 2002/03/14 15:14:01
116 */
117 public class URI implements Cloneable, Comparable, Serializable {
118
119
120
121
122 /*** Create an instance as an internal use */
123 protected URI() {
124 }
125
126 /***
127 * Construct a URI from a string with the given charset. The input string can
128 * be either in escaped or unescaped form.
129 *
130 * @param s URI character sequence
131 * @param escaped <tt>true</tt> if URI character sequence is in escaped form.
132 * <tt>false</tt> otherwise.
133 * @param charset the charset string to do escape encoding, if required
134 *
135 * @throws URIException If the URI cannot be created.
136 * @throws NullPointerException if input string is <code>null</code>
137 *
138 * @see #getProtocolCharset
139 *
140 * @since 3.0
141 */
142 public URI(String s, boolean escaped, String charset)
143 throws URIException, NullPointerException {
144 protocolCharset = charset;
145 parseUriReference(s, escaped);
146 }
147
148 /***
149 * Construct a URI from a string with the given charset. The input string can
150 * be either in escaped or unescaped form.
151 *
152 * @param s URI character sequence
153 * @param escaped <tt>true</tt> if URI character sequence is in escaped form.
154 * <tt>false</tt> otherwise.
155 *
156 * @throws URIException If the URI cannot be created.
157 * @throws NullPointerException if input string is <code>null</code>
158 *
159 * @see #getProtocolCharset
160 *
161 * @since 3.0
162 */
163 public URI(String s, boolean escaped)
164 throws URIException, NullPointerException {
165 parseUriReference(s, escaped);
166 }
167
168 /***
169 * Construct a URI as an escaped form of a character array with the given
170 * charset.
171 *
172 * @param escaped the URI character sequence
173 * @param charset the charset string to do escape encoding
174 * @throws URIException If the URI cannot be created.
175 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
176 * @see #getProtocolCharset
177 *
178 * @deprecated Use #URI(String, boolean, String)
179 */
180 public URI(char[] escaped, String charset)
181 throws URIException, NullPointerException {
182 protocolCharset = charset;
183 parseUriReference(new String(escaped), true);
184 }
185
186
187 /***
188 * Construct a URI as an escaped form of a character array.
189 * An URI can be placed within double-quotes or angle brackets like
190 * "http://test.com/" and <http://test.com/>
191 *
192 * @param escaped the URI character sequence
193 * @throws URIException If the URI cannot be created.
194 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
195 * @see #getDefaultProtocolCharset
196 *
197 * @deprecated Use #URI(String, boolean)
198 */
199 public URI(char[] escaped)
200 throws URIException, NullPointerException {
201 parseUriReference(new String(escaped), true);
202 }
203
204
205 /***
206 * Construct a URI from the given string with the given charset.
207 *
208 * @param original the string to be represented to URI character sequence
209 * It is one of absoluteURI and relativeURI.
210 * @param charset the charset string to do escape encoding
211 * @throws URIException If the URI cannot be created.
212 * @see #getProtocolCharset
213 *
214 * @deprecated Use #URI(String, boolean, String)
215 */
216 public URI(String original, String charset) throws URIException {
217 protocolCharset = charset;
218 parseUriReference(original, false);
219 }
220
221
222 /***
223 * Construct a URI from the given string.
224 * <p><blockquote><pre>
225 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
226 * </pre></blockquote><p>
227 * An URI can be placed within double-quotes or angle brackets like
228 * "http://test.com/" and <http://test.com/>
229 *
230 * @param original the string to be represented to URI character sequence
231 * It is one of absoluteURI and relativeURI.
232 * @throws URIException If the URI cannot be created.
233 * @see #getDefaultProtocolCharset
234 *
235 * @deprecated Use #URI(String, boolean)
236 */
237 public URI(String original) throws URIException {
238 parseUriReference(original, false);
239 }
240
241
242 /***
243 * Construct a general URI from the given components.
244 * <p><blockquote><pre>
245 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
246 * absoluteURI = scheme ":" ( hier_part | opaque_part )
247 * opaque_part = uric_no_slash *uric
248 * </pre></blockquote><p>
249 * It's for absolute URI = <scheme>:<scheme-specific-part>#
250 * <fragment>.
251 *
252 * @param scheme the scheme string
253 * @param schemeSpecificPart scheme_specific_part
254 * @param fragment the fragment string
255 * @throws URIException If the URI cannot be created.
256 * @see #getDefaultProtocolCharset
257 */
258 public URI(String scheme, String schemeSpecificPart, String fragment)
259 throws URIException {
260
261
262 if (scheme == null) {
263 throw new URIException(URIException.PARSING, "scheme required");
264 }
265 char[] s = scheme.toLowerCase().toCharArray();
266 if (validate(s, URI.scheme)) {
267 _scheme = s;
268 } else {
269 throw new URIException(URIException.PARSING, "incorrect scheme");
270 }
271 _opaque = encode(schemeSpecificPart, allowed_opaque_part,
272 getProtocolCharset());
273
274 _is_opaque_part = true;
275 _fragment = fragment.toCharArray();
276
277 setURI();
278 }
279
280
281 /***
282 * Construct a general URI from the given components.
283 * <p><blockquote><pre>
284 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
285 * absoluteURI = scheme ":" ( hier_part | opaque_part )
286 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
287 * hier_part = ( net_path | abs_path ) [ "?" query ]
288 * </pre></blockquote><p>
289 * It's for absolute URI = <scheme>:<path>?<query>#<
290 * fragment> and relative URI = <path>?<query>#<fragment
291 * >.
292 *
293 * @param scheme the scheme string
294 * @param authority the authority string
295 * @param path the path string
296 * @param query the query string
297 * @param fragment the fragment string
298 * @throws URIException If the new URI cannot be created.
299 * @see #getDefaultProtocolCharset
300 */
301 public URI(String scheme, String authority, String path, String query,
302 String fragment) throws URIException {
303
304
305 StringBuffer buff = new StringBuffer();
306 if (scheme != null) {
307 buff.append(scheme);
308 buff.append(':');
309 }
310 if (authority != null) {
311 buff.append("//");
312 buff.append(authority);
313 }
314 if (path != null) {
315 if ((scheme != null || authority != null)
316 && !path.startsWith("/")) {
317 throw new URIException(URIException.PARSING,
318 "abs_path requested");
319 }
320 buff.append(path);
321 }
322 if (query != null) {
323 buff.append('?');
324 buff.append(query);
325 }
326 if (fragment != null) {
327 buff.append('#');
328 buff.append(fragment);
329 }
330 parseUriReference(buff.toString(), false);
331 }
332
333
    /**
     * Construct a general URI from the given components, without a path,
     * query or fragment. Delegates to the seven-argument constructor, passing
     * <code>null</code> for the three missing components.
     *
     * @param scheme the scheme string
     * @param userinfo the userinfo string
     * @param host the host string
     * @param port the port number
     * @throws URIException If the new URI cannot be created.
     * @see #getDefaultProtocolCharset
     */
    public URI(String scheme, String userinfo, String host, int port)
        throws URIException {

        this(scheme, userinfo, host, port, null, null, null);
    }
349
350
    /**
     * Construct a general URI from the given components, without a query or
     * fragment. Delegates to the seven-argument constructor, passing
     * <code>null</code> for the two missing components.
     *
     * @param scheme the scheme string
     * @param userinfo the userinfo string
     * @param host the host string
     * @param port the port number
     * @param path the path string
     * @throws URIException If the new URI cannot be created.
     * @see #getDefaultProtocolCharset
     */
    public URI(String scheme, String userinfo, String host, int port,
        String path) throws URIException {

        this(scheme, userinfo, host, port, path, null, null);
    }
367
368
    /**
     * Construct a general URI from the given components, without a fragment.
     * Delegates to the seven-argument constructor, passing <code>null</code>
     * as the fragment.
     *
     * @param scheme the scheme string
     * @param userinfo the userinfo string
     * @param host the host string
     * @param port the port number
     * @param path the path string
     * @param query the query string
     * @throws URIException If the new URI cannot be created.
     * @see #getDefaultProtocolCharset
     */
    public URI(String scheme, String userinfo, String host, int port,
        String path, String query) throws URIException {

        this(scheme, userinfo, host, port, path, query, null);
    }
386
387
388 /***
389 * Construct a general URI from the given components.
390 *
391 * @param scheme the scheme string
392 * @param userinfo the userinfo string
393 * @param host the host string
394 * @param port the port number
395 * @param path the path string
396 * @param query the query string
397 * @param fragment the fragment string
398 * @throws URIException If the new URI cannot be created.
399 * @see #getDefaultProtocolCharset
400 */
401 public URI(String scheme, String userinfo, String host, int port,
402 String path, String query, String fragment) throws URIException {
403
404 this(scheme, (host == null) ? null
405 : ((userinfo != null) ? userinfo + '@' : "") + host
406 + ((port != -1) ? ":" + port : ""), path, query, fragment);
407 }
408
409
    /**
     * Construct a general URI from the given components, without a query.
     * Delegates to the five-argument constructor, passing <code>host</code>
     * in the authority position and <code>null</code> as the query.
     *
     * @param scheme the scheme string
     * @param host the host string
     * @param path the path string
     * @param fragment the fragment string
     * @throws URIException If the new URI cannot be created.
     * @see #getDefaultProtocolCharset
     */
    public URI(String scheme, String host, String path, String fragment)
        throws URIException {

        this(scheme, host, path, null, fragment);
    }
425
426
    /**
     * Construct a general URI with the given relative URI string.
     * The string is first turned into a URI with the single-string
     * constructor and then resolved against <code>base</code>.
     *
     * @param base the base URI
     * @param relative the relative URI string
     * @throws URIException If the new URI cannot be created.
     *
     * @deprecated Use {@link #URI(URI, String, boolean)}
     */
    public URI(URI base, String relative) throws URIException {
        this(base, new URI(relative));
    }
439
440
    /**
     * Construct a general URI with the given relative URI string.
     * The string is first parsed into a URI, honouring the
     * <code>escaped</code> flag, and then resolved against <code>base</code>.
     *
     * @param base the base URI
     * @param relative the relative URI string
     * @param escaped <tt>true</tt> if URI character sequence is in escaped form.
     *                <tt>false</tt> otherwise.
     *
     * @throws URIException If the new URI cannot be created.
     *
     * @since 3.0
     */
    public URI(URI base, String relative, boolean escaped) throws URIException {
        this(base, new URI(relative, escaped));
    }
456
457
458 /***
459 * Construct a general URI with the given relative URI.
460 * <p><blockquote><pre>
461 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
462 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
463 * </pre></blockquote><p>
464 * Resolving Relative References to Absolute Form.
465 *
466 * <strong>Examples of Resolving Relative URI References</strong>
467 *
468 * Within an object with a well-defined base URI of
469 * <p><blockquote><pre>
470 * http://a/b/c/d;p?q
471 * </pre></blockquote><p>
472 * the relative URI would be resolved as follows:
473 *
474 * Normal Examples
475 *
476 * <p><blockquote><pre>
477 * g:h = g:h
478 * g = http://a/b/c/g
479 * ./g = http://a/b/c/g
480 * g/ = http://a/b/c/g/
481 * /g = http://a/g
482 * //g = http://g
483 * ?y = http://a/b/c/?y
484 * g?y = http://a/b/c/g?y
485 * #s = (current document)#s
486 * g#s = http://a/b/c/g#s
487 * g?y#s = http://a/b/c/g?y#s
488 * ;x = http://a/b/c/;x
489 * g;x = http://a/b/c/g;x
490 * g;x?y#s = http://a/b/c/g;x?y#s
491 * . = http://a/b/c/
492 * ./ = http://a/b/c/
493 * .. = http://a/b/
494 * ../ = http://a/b/
495 * ../g = http://a/b/g
496 * ../.. = http://a/
497 * ../../ = http://a/
498 * ../../g = http://a/g
499 * </pre></blockquote><p>
500 *
501 * Some URI schemes do not allow a hierarchical syntax matching the
502 * <hier_part> syntax, and thus cannot use relative references.
503 *
504 * @param base the base URI
505 * @param relative the relative URI
506 * @throws URIException If the new URI cannot be created.
507 */
508 public URI(URI base, URI relative) throws URIException {
509
510 if (base._scheme == null) {
511 throw new URIException(URIException.PARSING, "base URI required");
512 }
513 if (base._scheme != null) {
514 this._scheme = base._scheme;
515 this._authority = base._authority;
516 }
517 if (base._is_opaque_part || relative._is_opaque_part) {
518 this._scheme = base._scheme;
519 this._is_opaque_part = base._is_opaque_part
520 || relative._is_opaque_part;
521 this._opaque = relative._opaque;
522 this._fragment = relative._fragment;
523 this.setURI();
524 return;
525 }
526 if (relative._scheme != null) {
527 this._scheme = relative._scheme;
528 this._is_net_path = relative._is_net_path;
529 this._authority = relative._authority;
530 if (relative._is_server) {
531 this._is_server = relative._is_server;
532 this._userinfo = relative._userinfo;
533 this._host = relative._host;
534 this._port = relative._port;
535 } else if (relative._is_reg_name) {
536 this._is_reg_name = relative._is_reg_name;
537 }
538 this._is_abs_path = relative._is_abs_path;
539 this._is_rel_path = relative._is_rel_path;
540 this._path = relative._path;
541 } else if (base._authority != null && relative._scheme == null) {
542 this._is_net_path = base._is_net_path;
543 this._authority = base._authority;
544 if (base._is_server) {
545 this._is_server = base._is_server;
546 this._userinfo = base._userinfo;
547 this._host = base._host;
548 this._port = base._port;
549 } else if (base._is_reg_name) {
550 this._is_reg_name = base._is_reg_name;
551 }
552 }
553 if (relative._authority != null) {
554 this._is_net_path = relative._is_net_path;
555 this._authority = relative._authority;
556 if (relative._is_server) {
557 this._is_server = relative._is_server;
558 this._userinfo = relative._userinfo;
559 this._host = relative._host;
560 this._port = relative._port;
561 } else if (relative._is_reg_name) {
562 this._is_reg_name = relative._is_reg_name;
563 }
564 this._is_abs_path = relative._is_abs_path;
565 this._is_rel_path = relative._is_rel_path;
566 this._path = relative._path;
567 }
568
569 if (relative._scheme == null && relative._authority == null) {
570 if ((relative._path == null || relative._path.length == 0)
571 && relative._query == null) {
572
573
574 this._path = base._path;
575 this._query = base._query;
576 } else {
577 this._path = resolvePath(base._path, relative._path);
578 }
579 }
580
581 if (relative._query != null) {
582 this._query = relative._query;
583 }
584
585 if (relative._fragment != null) {
586 this._fragment = relative._fragment;
587 }
588 this.setURI();
589
590
591 parseUriReference(new String(_uri), true);
592 }
593
594
595
596 /*** Version ID for serialization */
597 static final long serialVersionUID = 604752400577948726L;
598
599
600 /***
601 * Cache the hash code for this URI.
602 */
603 protected int hash = 0;
604
605
606 /***
607 * This Uniform Resource Identifier (URI).
608 * The URI is always in an "escaped" form, since escaping or unescaping
609 * a completed URI might change its semantics.
610 */
611 protected char[] _uri = null;
612
613
614 /***
615 * The charset of the protocol used by this URI instance.
616 */
617 protected String protocolCharset = null;
618
619
620 /***
621 * The default charset of the protocol. RFC 2277, 2396
622 */
623 protected static String defaultProtocolCharset = "UTF-8";
624
625
626 /***
627 * The default charset of the document. RFC 2277, 2396
628 * The platform's charset is used for the document by default.
629 */
630 protected static String defaultDocumentCharset = null;
631 protected static String defaultDocumentCharsetByLocale = null;
632 protected static String defaultDocumentCharsetByPlatform = null;
633
634 static {
635 Locale locale = Locale.getDefault();
636
637 if (locale != null) {
638 defaultDocumentCharsetByLocale =
639 LocaleToCharsetMap.getCharset(locale);
640
641 defaultDocumentCharset = defaultDocumentCharsetByLocale;
642 }
643
644 try {
645 defaultDocumentCharsetByPlatform = System.getProperty("file.encoding");
646 } catch (SecurityException ignore) {
647 }
648 if (defaultDocumentCharset == null) {
649
650 defaultDocumentCharset = defaultDocumentCharsetByPlatform;
651 }
652 }
653
654
655 /***
656 * The scheme.
657 */
658 protected char[] _scheme = null;
659
660
661 /***
662 * The opaque.
663 */
664 protected char[] _opaque = null;
665
666
667 /***
668 * The authority.
669 */
670 protected char[] _authority = null;
671
672
673 /***
674 * The userinfo.
675 */
676 protected char[] _userinfo = null;
677
678
679 /***
680 * The host.
681 */
682 protected char[] _host = null;
683
684
685 /***
686 * The port.
687 */
688 protected int _port = -1;
689
690
691 /***
692 * The path.
693 */
694 protected char[] _path = null;
695
696
697 /***
698 * The query.
699 */
700 protected char[] _query = null;
701
702
703 /***
704 * The fragment.
705 */
706 protected char[] _fragment = null;
707
708
709 /***
710 * The root path.
711 */
712 protected static char[] rootPath = { '/' };
713
714
715
716 /***
717 * The percent "%" character always has the reserved purpose of being the
718 * escape indicator, it must be escaped as "%25" in order to be used as
719 * data within a URI.
720 */
721 protected static final BitSet percent = new BitSet(256);
722
723 static {
724 percent.set('%');
725 }
726
727
728 /***
729 * BitSet for digit.
730 * <p><blockquote><pre>
731 * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
732 * "8" | "9"
733 * </pre></blockquote><p>
734 */
735 protected static final BitSet digit = new BitSet(256);
736
737 static {
738 for (int i = '0'; i <= '9'; i++) {
739 digit.set(i);
740 }
741 }
742
743
744 /***
745 * BitSet for alpha.
746 * <p><blockquote><pre>
747 * alpha = lowalpha | upalpha
748 * </pre></blockquote><p>
749 */
750 protected static final BitSet alpha = new BitSet(256);
751
752 static {
753 for (int i = 'a'; i <= 'z'; i++) {
754 alpha.set(i);
755 }
756 for (int i = 'A'; i <= 'Z'; i++) {
757 alpha.set(i);
758 }
759 }
760
761
762 /***
763 * BitSet for alphanum (join of alpha & digit).
764 * <p><blockquote><pre>
765 * alphanum = alpha | digit
766 * </pre></blockquote><p>
767 */
768 protected static final BitSet alphanum = new BitSet(256);
769
770 static {
771 alphanum.or(alpha);
772 alphanum.or(digit);
773 }
774
775
776 /***
777 * BitSet for hex.
778 * <p><blockquote><pre>
779 * hex = digit | "A" | "B" | "C" | "D" | "E" | "F" |
780 * "a" | "b" | "c" | "d" | "e" | "f"
781 * </pre></blockquote><p>
782 */
783 protected static final BitSet hex = new BitSet(256);
784
785 static {
786 hex.or(digit);
787 for (int i = 'a'; i <= 'f'; i++) {
788 hex.set(i);
789 }
790 for (int i = 'A'; i <= 'F'; i++) {
791 hex.set(i);
792 }
793 }
794
795
796 /***
797 * BitSet for escaped.
798 * <p><blockquote><pre>
799 * escaped = "%" hex hex
800 * </pre></blockquote><p>
801 */
802 protected static final BitSet escaped = new BitSet(256);
803
804 static {
805 escaped.or(percent);
806 escaped.or(hex);
807 }
808
809
810 /***
811 * BitSet for mark.
812 * <p><blockquote><pre>
813 * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" |
814 * "(" | ")"
815 * </pre></blockquote><p>
816 */
817 protected static final BitSet mark = new BitSet(256);
818
819 static {
820 mark.set('-');
821 mark.set('_');
822 mark.set('.');
823 mark.set('!');
824 mark.set('~');
825 mark.set('*');
826 mark.set('\'');
827 mark.set('(');
828 mark.set(')');
829 }
830
831
832 /***
833 * Data characters that are allowed in a URI but do not have a reserved
834 * purpose are called unreserved.
835 * <p><blockquote><pre>
836 * unreserved = alphanum | mark
837 * </pre></blockquote><p>
838 */
839 protected static final BitSet unreserved = new BitSet(256);
840
841 static {
842 unreserved.or(alphanum);
843 unreserved.or(mark);
844 }
845
846
847 /***
848 * BitSet for reserved.
849 * <p><blockquote><pre>
850 * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
851 * "$" | ","
852 * </pre></blockquote><p>
853 */
854 protected static final BitSet reserved = new BitSet(256);
855
856 static {
857 reserved.set(';');
858 reserved.set('/');
859 reserved.set('?');
860 reserved.set(':');
861 reserved.set('@');
862 reserved.set('&');
863 reserved.set('=');
864 reserved.set('+');
865 reserved.set('$');
866 reserved.set(',');
867 }
868
869
870 /***
871 * BitSet for uric.
872 * <p><blockquote><pre>
873 * uric = reserved | unreserved | escaped
874 * </pre></blockquote><p>
875 */
876 protected static final BitSet uric = new BitSet(256);
877
878 static {
879 uric.or(reserved);
880 uric.or(unreserved);
881 uric.or(escaped);
882 }
883
884
885 /***
886 * BitSet for fragment (alias for uric).
887 * <p><blockquote><pre>
888 * fragment = *uric
889 * </pre></blockquote><p>
890 */
891 protected static final BitSet fragment = uric;
892
893
894 /***
895 * BitSet for query (alias for uric).
896 * <p><blockquote><pre>
897 * query = *uric
898 * </pre></blockquote><p>
899 */
900 protected static final BitSet query = uric;
901
902
903 /***
904 * BitSet for pchar.
905 * <p><blockquote><pre>
906 * pchar = unreserved | escaped |
907 * ":" | "@" | "&" | "=" | "+" | "$" | ","
908 * </pre></blockquote><p>
909 */
910 protected static final BitSet pchar = new BitSet(256);
911
912 static {
913 pchar.or(unreserved);
914 pchar.or(escaped);
915 pchar.set(':');
916 pchar.set('@');
917 pchar.set('&');
918 pchar.set('=');
919 pchar.set('+');
920 pchar.set('$');
921 pchar.set(',');
922 }
923
924
925 /***
926 * BitSet for param (alias for pchar).
927 * <p><blockquote><pre>
928 * param = *pchar
929 * </pre></blockquote><p>
930 */
931 protected static final BitSet param = pchar;
932
933
934 /***
935 * BitSet for segment.
936 * <p><blockquote><pre>
937 * segment = *pchar *( ";" param )
938 * </pre></blockquote><p>
939 */
940 protected static final BitSet segment = new BitSet(256);
941
942 static {
943 segment.or(pchar);
944 segment.set(';');
945 segment.or(param);
946 }
947
948
949 /***
950 * BitSet for path segments.
951 * <p><blockquote><pre>
952 * path_segments = segment *( "/" segment )
953 * </pre></blockquote><p>
954 */
955 protected static final BitSet path_segments = new BitSet(256);
956
957 static {
958 path_segments.set('/');
959 path_segments.or(segment);
960 }
961
962
963 /***
964 * URI absolute path.
965 * <p><blockquote><pre>
966 * abs_path = "/" path_segments
967 * </pre></blockquote><p>
968 */
969 protected static final BitSet abs_path = new BitSet(256);
970
971 static {
972 abs_path.set('/');
973 abs_path.or(path_segments);
974 }
975
976
977 /***
978 * URI bitset for encoding typical non-slash characters.
979 * <p><blockquote><pre>
980 * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
981 * "&" | "=" | "+" | "$" | ","
982 * </pre></blockquote><p>
983 */
984 protected static final BitSet uric_no_slash = new BitSet(256);
985
986 static {
987 uric_no_slash.or(unreserved);
988 uric_no_slash.or(escaped);
989 uric_no_slash.set(';');
990 uric_no_slash.set('?');
991 uric_no_slash.set(';');
992 uric_no_slash.set('@');
993 uric_no_slash.set('&');
994 uric_no_slash.set('=');
995 uric_no_slash.set('+');
996 uric_no_slash.set('$');
997 uric_no_slash.set(',');
998 }
999
1000
1001 /***
1002 * URI bitset that combines uric_no_slash and uric.
1003 * <p><blockquote><pre>
1004 * opaque_part = uric_no_slash *uric
1005 * </pre></blockquote><p>
1006 */
1007 protected static final BitSet opaque_part = new BitSet(256);
1008
1009 static {
1010
1011 opaque_part.or(uric_no_slash);
1012 opaque_part.or(uric);
1013 }
1014
1015
1016 /***
1017 * URI bitset that combines absolute path and opaque part.
1018 * <p><blockquote><pre>
1019 * path = [ abs_path | opaque_part ]
1020 * </pre></blockquote><p>
1021 */
1022 protected static final BitSet path = new BitSet(256);
1023
1024 static {
1025 path.or(abs_path);
1026 path.or(opaque_part);
1027 }
1028
1029
1030 /***
1031 * Port, a logical alias for digit.
1032 */
1033 protected static final BitSet port = digit;
1034
1035
1036 /***
1037 * Bitset that combines digit and dot fo IPv$address.
1038 * <p><blockquote><pre>
1039 * IPv4address = 1*digit "." 1*digit "." 1*digit "." 1*digit
1040 * </pre></blockquote><p>
1041 */
1042 protected static final BitSet IPv4address = new BitSet(256);
1043
1044 static {
1045 IPv4address.or(digit);
1046 IPv4address.set('.');
1047 }
1048
1049
1050 /***
1051 * RFC 2373.
1052 * <p><blockquote><pre>
1053 * IPv6address = hexpart [ ":" IPv4address ]
1054 * </pre></blockquote><p>
1055 */
1056 protected static final BitSet IPv6address = new BitSet(256);
1057
1058 static {
1059 IPv6address.or(hex);
1060 IPv6address.set(':');
1061 IPv6address.or(IPv4address);
1062 }
1063
1064
1065 /***
1066 * RFC 2732, 2373.
1067 * <p><blockquote><pre>
1068 * IPv6reference = "[" IPv6address "]"
1069 * </pre></blockquote><p>
1070 */
1071 protected static final BitSet IPv6reference = new BitSet(256);
1072
1073 static {
1074 IPv6reference.set('[');
1075 IPv6reference.or(IPv6address);
1076 IPv6reference.set(']');
1077 }
1078
1079
1080 /***
1081 * BitSet for toplabel.
1082 * <p><blockquote><pre>
1083 * toplabel = alpha | alpha *( alphanum | "-" ) alphanum
1084 * </pre></blockquote><p>
1085 */
1086 protected static final BitSet toplabel = new BitSet(256);
1087
1088 static {
1089 toplabel.or(alphanum);
1090 toplabel.set('-');
1091 }
1092
1093
1094 /***
1095 * BitSet for domainlabel.
1096 * <p><blockquote><pre>
1097 * domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
1098 * </pre></blockquote><p>
1099 */
1100 protected static final BitSet domainlabel = toplabel;
1101
1102
1103 /***
1104 * BitSet for hostname.
1105 * <p><blockquote><pre>
1106 * hostname = *( domainlabel "." ) toplabel [ "." ]
1107 * </pre></blockquote><p>
1108 */
1109 protected static final BitSet hostname = new BitSet(256);
1110
1111 static {
1112 hostname.or(toplabel);
1113
1114 hostname.set('.');
1115 }
1116
1117
1118 /***
1119 * BitSet for host.
1120 * <p><blockquote><pre>
1121 * host = hostname | IPv4address | IPv6reference
1122 * </pre></blockquote><p>
1123 */
1124 protected static final BitSet host = new BitSet(256);
1125
1126 static {
1127 host.or(hostname);
1128
1129 host.or(IPv6reference);
1130 }
1131
1132
1133 /***
1134 * BitSet for hostport.
1135 * <p><blockquote><pre>
1136 * hostport = host [ ":" port ]
1137 * </pre></blockquote><p>
1138 */
1139 protected static final BitSet hostport = new BitSet(256);
1140
1141 static {
1142 hostport.or(host);
1143 hostport.set(':');
1144 hostport.or(port);
1145 }
1146
1147
1148 /***
1149 * Bitset for userinfo.
1150 * <p><blockquote><pre>
1151 * userinfo = *( unreserved | escaped |
1152 * ";" | ":" | "&" | "=" | "+" | "$" | "," )
1153 * </pre></blockquote><p>
1154 */
1155 protected static final BitSet userinfo = new BitSet(256);
1156
1157 static {
1158 userinfo.or(unreserved);
1159 userinfo.or(escaped);
1160 userinfo.set(';');
1161 userinfo.set(':');
1162 userinfo.set('&');
1163 userinfo.set('=');
1164 userinfo.set('+');
1165 userinfo.set('$');
1166 userinfo.set(',');
1167 }
1168
1169
1170 /***
1171 * BitSet for within the userinfo component like user and password.
1172 */
1173 public static final BitSet within_userinfo = new BitSet(256);
1174
1175 static {
1176 within_userinfo.or(userinfo);
1177 within_userinfo.clear(';');
1178 within_userinfo.clear(':');
1179 within_userinfo.clear('@');
1180 within_userinfo.clear('?');
1181 within_userinfo.clear('/');
1182 }
1183
1184
1185 /***
1186 * Bitset for server.
1187 * <p><blockquote><pre>
1188 * server = [ [ userinfo "@" ] hostport ]
1189 * </pre></blockquote><p>
1190 */
1191 protected static final BitSet server = new BitSet(256);
1192
1193 static {
1194 server.or(userinfo);
1195 server.set('@');
1196 server.or(hostport);
1197 }
1198
1199
1200 /***
1201 * BitSet for reg_name.
1202 * <p><blockquote><pre>
1203 * reg_name = 1*( unreserved | escaped | "$" | "," |
1204 * ";" | ":" | "@" | "&" | "=" | "+" )
1205 * </pre></blockquote><p>
1206 */
1207 protected static final BitSet reg_name = new BitSet(256);
1208
1209 static {
1210 reg_name.or(unreserved);
1211 reg_name.or(escaped);
1212 reg_name.set('$');
1213 reg_name.set(',');
1214 reg_name.set(';');
1215 reg_name.set(':');
1216 reg_name.set('@');
1217 reg_name.set('&');
1218 reg_name.set('=');
1219 reg_name.set('+');
1220 }
1221
1222
/**
 * BitSet for authority.
 * <p><blockquote><pre>
 * authority     = server | reg_name
 * </pre></blockquote><p>
 */
protected static final BitSet authority = new BitSet(256);

static {
    // Plain union of the two alternatives.
    authority.or(reg_name);
    authority.or(server);
}
1235
1236
/**
 * BitSet for scheme.
 * <p><blockquote><pre>
 * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 * </pre></blockquote><p>
 */
protected static final BitSet scheme = new BitSet(256);

static {
    // Symbols accepted in addition to letters and digits.
    final String symbols = "+-.";
    for (int i = 0; i < symbols.length(); i++) {
        scheme.set(symbols.charAt(i));
    }
    scheme.or(digit);
    scheme.or(alpha);
}
1252
1253
/**
 * BitSet for rel_segment.
 * <p><blockquote><pre>
 * rel_segment   = 1*( unreserved | escaped |
 *                     ";" | "@" | "&" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet rel_segment = new BitSet(256);

static {
    // Punctuation accepted in addition to the unreserved and escaped sets.
    final String punctuation = ";@&=+$,";
    for (int i = 0; i < punctuation.length(); i++) {
        rel_segment.set(punctuation.charAt(i));
    }
    rel_segment.or(escaped);
    rel_segment.or(unreserved);
}
1274
1275
/**
 * BitSet for rel_path.
 * <p><blockquote><pre>
 * rel_path      = rel_segment [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet rel_path = new BitSet(256);

static {
    // Union of the abs_path and rel_segment character sets.
    rel_path.or(abs_path);
    rel_path.or(rel_segment);
}
1288
1289
/**
 * BitSet for net_path.
 * <p><blockquote><pre>
 * net_path      = "//" authority [ abs_path ]
 * </pre></blockquote><p>
 */
protected static final BitSet net_path = new BitSet(256);

static {
    // Union of the abs_path set, the authority set and the '/' character.
    net_path.or(abs_path);
    net_path.or(authority);
    net_path.set('/');
}
1303
1304
/**
 * BitSet for hier_part.
 * <p><blockquote><pre>
 * hier_part     = ( net_path | abs_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet hier_part = new BitSet(256);

static {
    // Only the three component sets are merged; '?' itself is not set here.
    hier_part.or(query);
    hier_part.or(abs_path);
    hier_part.or(net_path);
}
1319
1320
/**
 * BitSet for relativeURI.
 * <p><blockquote><pre>
 * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 * </pre></blockquote><p>
 */
protected static final BitSet relativeURI = new BitSet(256);

static {
    // Only the four component sets are merged; '?' itself is not set here.
    relativeURI.or(query);
    relativeURI.or(rel_path);
    relativeURI.or(abs_path);
    relativeURI.or(net_path);
}
1336
1337
/**
 * BitSet for absoluteURI.
 * <p><blockquote><pre>
 * absoluteURI   = scheme ":" ( hier_part | opaque_part )
 * </pre></blockquote><p>
 */
protected static final BitSet absoluteURI = new BitSet(256);

static {
    // Union of both alternatives, the ':' separator and the scheme set.
    absoluteURI.or(opaque_part);
    absoluteURI.or(hier_part);
    absoluteURI.set(':');
    absoluteURI.or(scheme);
}
1352
1353
/**
 * BitSet for URI-reference.
 * <p><blockquote><pre>
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 */
protected static final BitSet URI_reference = new BitSet(256);

static {
    // Union of the fragment set, the '#' separator and both URI forms.
    URI_reference.or(fragment);
    URI_reference.set('#');
    URI_reference.or(relativeURI);
    URI_reference.or(absoluteURI);
}
1368
1369
1370
1371
/**
 * BitSet for the control characters: the codes 0x00 through 0x1F
 * and 0x7F.
 */
public static final BitSet control = new BitSet(256);

static {
    control.set(0x7F);
    // Walk the range 0x1F..0x00 downwards; the resulting set is the same.
    int code = 0x1F;
    while (code >= 0) {
        control.set(code);
        code--;
    }
}
1383
/**
 * BitSet holding only the space character (0x20).
 */
public static final BitSet space = new BitSet(256);

static {
    space.set(' ');
}
1392
1393
/**
 * BitSet for the delimiter characters
 * '&lt;', '&gt;', '#', '%' and '"'.
 */
public static final BitSet delims = new BitSet(256);

static {
    final String delimiters = "<>#%\"";
    for (int i = 0; i < delimiters.length(); i++) {
        delims.set(delimiters.charAt(i));
    }
}
1406
1407
/**
 * BitSet for the "unwise" characters of RFC 2396.
 * <p><blockquote><pre>
 * unwise        = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
 * </pre></blockquote><p>
 */
public static final BitSet unwise = new BitSet(256);

static {
    unwise.set('{');
    unwise.set('}');
    unwise.set('|');
    // The backslash must be written as an escaped char literal.
    unwise.set('\\');
    unwise.set('^');
    unwise.set('[');
    unwise.set(']');
    unwise.set('`');
}
1423
1424
/**
 * Characters disallowed in a rel_path before escaping: every member of
 * the uric set that is not a member of the rel_path set.
 */
public static final BitSet disallowed_rel_path = new BitSet(256);

static {
    // Work on a copy so that the shared uric set is left untouched.
    final BitSet outsideRelPath = (BitSet) uric.clone();
    outsideRelPath.andNot(rel_path);
    disallowed_rel_path.or(outsideRelPath);
}
1434
1435
/**
 * Characters disallowed in an opaque_part before escaping: every member
 * of the uric set that is not a member of the opaque_part set.
 */
public static final BitSet disallowed_opaque_part = new BitSet(256);

static {
    // Work on a copy so that the shared uric set is left untouched.
    final BitSet outsideOpaquePart = (BitSet) uric.clone();
    outsideOpaquePart.andNot(opaque_part);
    disallowed_opaque_part.or(outsideOpaquePart);
}
1445
1446
1447
/**
 * Characters allowed for the authority component: the authority set
 * without the percent sign.
 */
public static final BitSet allowed_authority = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) authority.clone();
    withoutPercent.clear('%');
    allowed_authority.or(withoutPercent);
}
1457
1458
/**
 * Characters allowed for the opaque_part: the opaque_part set without
 * the percent sign.
 */
public static final BitSet allowed_opaque_part = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) opaque_part.clone();
    withoutPercent.clear('%');
    allowed_opaque_part.or(withoutPercent);
}
1468
1469
/**
 * Characters allowed for the reg_name: the reg_name set without the
 * percent sign.
 */
public static final BitSet allowed_reg_name = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) reg_name.clone();
    withoutPercent.clear('%');
    allowed_reg_name.or(withoutPercent);
}
1480
1481
/**
 * Characters allowed for the userinfo component: the userinfo set
 * without the percent sign.
 */
public static final BitSet allowed_userinfo = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) userinfo.clone();
    withoutPercent.clear('%');
    allowed_userinfo.or(withoutPercent);
}
1492
1493
/**
 * Characters allowed inside the parts of the userinfo component: the
 * within_userinfo set without the percent sign.
 */
public static final BitSet allowed_within_userinfo = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) within_userinfo.clone();
    withoutPercent.clear('%');
    allowed_within_userinfo.or(withoutPercent);
}
1503
1504
/**
 * Characters allowed for the IPv6reference component: the IPv6reference
 * set with the enclosing brackets '[' and ']' removed.
 */
public static final BitSet allowed_IPv6reference = new BitSet(256);

static {
    allowed_IPv6reference.or(IPv6reference);
    // Cleared after the union so that the brackets stay excluded.
    final String brackets = "[]";
    for (int i = 0; i < brackets.length(); i++) {
        allowed_IPv6reference.clear(brackets.charAt(i));
    }
}
1517
1518
/**
 * Characters allowed for the host component: the union of the hostname
 * set and the allowed_IPv6reference set (which carries no brackets).
 */
public static final BitSet allowed_host = new BitSet(256);

static {
    allowed_host.or(allowed_IPv6reference);
    allowed_host.or(hostname);
}
1529
1530
/**
 * Characters allowed within the authority component: the union of the
 * server and reg_name sets with ';', ':', '@', '?' and '/' removed.
 */
public static final BitSet allowed_within_authority = new BitSet(256);

static {
    allowed_within_authority.or(reg_name);
    allowed_within_authority.or(server);
    // Cleared after both unions so that these characters stay excluded.
    final String excluded = ";:@?/";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_authority.clear(excluded.charAt(i));
    }
}
1545
1546
/**
 * Characters allowed for the abs_path: the abs_path set minus every
 * member of the percent set.
 */
public static final BitSet allowed_abs_path = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) abs_path.clone();
    withoutPercent.andNot(percent);
    allowed_abs_path.or(withoutPercent);
}
1557
1558
/**
 * Characters allowed for the rel_path: the rel_path set without the
 * percent sign.
 */
public static final BitSet allowed_rel_path = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) rel_path.clone();
    withoutPercent.clear('%');
    allowed_rel_path.or(withoutPercent);
}
1568
1569
/**
 * Characters allowed within the path: the abs_path set with the
 * characters '/', ';', '=' and '?' removed.
 */
public static final BitSet allowed_within_path = new BitSet(256);

static {
    allowed_within_path.or(abs_path);
    // Cleared after the union so that these characters stay excluded.
    final String excluded = "/;=?";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_path.clear(excluded.charAt(i));
    }
}
1582
1583
/**
 * Characters allowed for the query component: the uric set without the
 * percent sign.
 */
public static final BitSet allowed_query = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) uric.clone();
    withoutPercent.clear('%');
    allowed_query.or(withoutPercent);
}
1593
1594
/**
 * Characters allowed within the query component: the allowed_query set
 * minus every member of the reserved set.
 */
public static final BitSet allowed_within_query = new BitSet(256);

static {
    final BitSet unreservedOnly = (BitSet) allowed_query.clone();
    unreservedOnly.andNot(reserved);
    allowed_within_query.or(unreservedOnly);
}
1604
1605
/**
 * Characters allowed for the fragment component: the uric set without
 * the percent sign.
 */
public static final BitSet allowed_fragment = new BitSet(256);

static {
    final BitSet withoutPercent = (BitSet) uric.clone();
    withoutPercent.clear('%');
    allowed_fragment.or(withoutPercent);
}
1615
1616
1617
1618
1619
1620
1621
1622 protected boolean _is_hier_part;
1623 protected boolean _is_opaque_part;
1624
1625
1626 protected boolean _is_net_path;
1627 protected boolean _is_abs_path;
1628 protected boolean _is_rel_path;
1629
1630
1631 protected boolean _is_reg_name;
1632 protected boolean _is_server;
1633
1634
1635 protected boolean _is_hostname;
1636 protected boolean _is_IPv4address;
1637 protected boolean _is_IPv6reference;
1638
1639
1640
1641 /***
1642 * Encodes URI string.
1643 *
1644 * This is a two mapping, one from original characters to octets, and
1645 * subsequently a second from octets to URI characters:
1646 * <p><blockquote><pre>
1647 * original character sequence->octet sequence->URI character sequence
1648 * </pre></blockquote><p>
1649 *
1650 * An escaped octet is encoded as a character triplet, consisting of the
1651 * percent character "%" followed by the two hexadecimal digits
1652 * representing the octet code. For example, "%20" is the escaped
1653 * encoding for the US-ASCII space character.
1654 * <p>
1655 * Conversion from the local filesystem character set to UTF-8 will
1656 * normally involve a two step process. First convert the local character
1657 * set to the UCS; then convert the UCS to UTF-8.
1658 * The first step in the process can be performed by maintaining a mapping
1659 * table that includes the local character set code and the corresponding
1660 * UCS code.
1661 * The next step is to convert the UCS character code to the UTF-8 encoding.
1662 * <p>
1663 * Mapping between vendor codepages can be done in a very similar manner
1664 * as described above.
1665 * <p>
1666 * The only time escape encodings can allowedly be made is when a URI is
1667 * being created from its component parts. The escape and validate methods
1668 * are internally performed within this method.
1669 *
1670 * @param original the original character sequence
1671 * @param allowed those characters that are allowed within a component
1672 * @param charset the protocol charset
1673 * @return URI character sequence
1674 * @throws URIException null component or unsupported character encoding
1675 */
1676
1677 protected static char[] encode(String original, BitSet allowed,
1678 String charset) throws URIException {
1679 if (original == null) {
1680 throw new IllegalArgumentException("Original string may not be null");
1681 }
1682 if (allowed == null) {
1683 throw new IllegalArgumentException("Allowed bitset may not be null");
1684 }
1685 byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset));
1686 return EncodingUtil.getAsciiString(rawdata).toCharArray();
1687 }
1688
1689 /***
1690 * Decodes URI encoded string.
1691 *
1692 * This is a two mapping, one from URI characters to octets, and
1693 * subsequently a second from octets to original characters:
1694 * <p><blockquote><pre>
1695 * URI character sequence->octet sequence->original character sequence
1696 * </pre></blockquote><p>
1697 *
1698 * A URI must be separated into its components before the escaped
1699 * characters within those components can be allowedly decoded.
1700 * <p>
1701 * Notice that there is a chance that URI characters that are non UTF-8
1702 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1703 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1704 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1705 * false reading.
1706 * <p>
1707 * The percent "%" character always has the reserved purpose of being
1708 * the escape indicator, it must be escaped as "%25" in order to be used
1709 * as data within a URI.
1710 * <p>
1711 * The unescape method is internally performed within this method.
1712 *
1713 * @param component the URI character sequence
1714 * @param charset the protocol charset
1715 * @return original character sequence
1716 * @throws URIException incomplete trailing escape pattern or unsupported
1717 * character encoding
1718 */
1719 protected static String decode(char[] component, String charset)
1720 throws URIException {
1721 if (component == null) {
1722 throw new IllegalArgumentException("Component array of chars may not be null");
1723 }
1724 return decode(new String(component), charset);
1725 }
1726
1727 /***
1728 * Decodes URI encoded string.
1729 *
1730 * This is a two mapping, one from URI characters to octets, and
1731 * subsequently a second from octets to original characters:
1732 * <p><blockquote><pre>
1733 * URI character sequence->octet sequence->original character sequence
1734 * </pre></blockquote><p>
1735 *
1736 * A URI must be separated into its components before the escaped
1737 * characters within those components can be allowedly decoded.
1738 * <p>
1739 * Notice that there is a chance that URI characters that are non UTF-8
1740 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1741 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1742 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1743 * false reading.
1744 * <p>
1745 * The percent "%" character always has the reserved purpose of being
1746 * the escape indicator, it must be escaped as "%25" in order to be used
1747 * as data within a URI.
1748 * <p>
1749 * The unescape method is internally performed within this method.
1750 *
1751 * @param component the URI character sequence
1752 * @param charset the protocol charset
1753 * @return original character sequence
1754 * @throws URIException incomplete trailing escape pattern or unsupported
1755 * character encoding
1756 *
1757 * @since 3.0
1758 */
1759 protected static String decode(String component, String charset)
1760 throws URIException {
1761 if (component == null) {
1762 throw new IllegalArgumentException("Component array of chars may not be null");
1763 }
1764 byte[] rawdata = null;
1765 try {
1766 rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component));
1767 } catch (DecoderException e) {
1768 throw new URIException(e.getMessage());
1769 }
1770 return EncodingUtil.getString(rawdata, charset);
1771 }
1772 /***
1773 * Pre-validate the unescaped URI string within a specific component.
1774 *
1775 * @param component the component string within the component
1776 * @param disallowed those characters disallowed within the component
1777 * @return if true, it doesn't have the disallowed characters
1778 * if false, the component is undefined or an incorrect one
1779 */
1780 protected boolean prevalidate(String component, BitSet disallowed) {
1781
1782 if (component == null) {
1783 return false;
1784 }
1785 char[] target = component.toCharArray();
1786 for (int i = 0; i < target.length; i++) {
1787 if (disallowed.get(target[i])) {
1788 return false;
1789 }
1790 }
1791 return true;
1792 }
1793
1794
1795 /***
1796 * Validate the URI characters within a specific component.
1797 * The component must be performed after escape encoding. Or it doesn't
1798 * include escaped characters.
1799 *
1800 * @param component the characters sequence within the component
1801 * @param generous those characters that are allowed within a component
1802 * @return if true, it's the correct URI character sequence
1803 */
1804 protected boolean validate(char[] component, BitSet generous) {
1805
1806 return validate(component, 0, -1, generous);
1807 }
1808
1809
1810 /***
1811 * Validate the URI characters within a specific component.
1812 * The component must be performed after escape encoding. Or it doesn't
1813 * include escaped characters.
1814 * <p>
1815 * It's not that much strict, generous. The strict validation might be
1816 * performed before being called this method.
1817 *
1818 * @param component the characters sequence within the component
1819 * @param soffset the starting offset of the given component
1820 * @param eoffset the ending offset of the given component
1821 * if -1, it means the length of the component
1822 * @param generous those characters that are allowed within a component
1823 * @return if true, it's the correct URI character sequence
1824 */
1825 protected boolean validate(char[] component, int soffset, int eoffset,
1826 BitSet generous) {
1827
1828 if (eoffset == -1) {
1829 eoffset = component.length - 1;
1830 }
1831 for (int i = soffset; i <= eoffset; i++) {
1832 if (!generous.get(component[i])) {
1833 return false;
1834 }
1835 }
1836 return true;
1837 }
1838
1839
1840 /***
1841 * In order to avoid any possilbity of conflict with non-ASCII characters,
1842 * Parse a URI reference as a <code>String</code> with the character
1843 * encoding of the local system or the document.
1844 * <p>
1845 * The following line is the regular expression for breaking-down a URI
1846 * reference into its components.
1847 * <p><blockquote><pre>
1848 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1849 * 12 3 4 5 6 7 8 9
1850 * </pre></blockquote><p>
1851 * For example, matching the above expression to
1852 * http://jakarta.apache.org/ietf/uri/#Related
1853 * results in the following subexpression matches:
1854 * <p><blockquote><pre>
1855 * $1 = http:
1856 * scheme = $2 = http
1857 * $3 = //jakarta.apache.org
1858 * authority = $4 = jakarta.apache.org
1859 * path = $5 = /ietf/uri/
1860 * $6 = <undefined>
1861 * query = $7 = <undefined>
1862 * $8 = #Related
1863 * fragment = $9 = Related
1864 * </pre></blockquote><p>
1865 *
1866 * @param original the original character sequence
1867 * @param escaped <code>true</code> if <code>original</code> is escaped
1868 * @throws URIException If an error occurs.
1869 */
1870 protected void parseUriReference(String original, boolean escaped)
1871 throws URIException {
1872
1873
1874 if (original == null) {
1875 throw new URIException("URI-Reference required");
1876 }
1877
1878
1879
1880
1881 String tmp = original.trim();
1882
1883
1884
1885
1886
1887 int length = tmp.length();
1888
1889
1890
1891
1892 if (length > 0) {
1893 char[] firstDelimiter = { tmp.charAt(0) };
1894 if (validate(firstDelimiter, delims)) {
1895 if (length >= 2) {
1896 char[] lastDelimiter = { tmp.charAt(length - 1) };
1897 if (validate(lastDelimiter, delims)) {
1898 tmp = tmp.substring(1, length - 1);
1899 length = length - 2;
1900 }
1901 }
1902 }
1903 }
1904
1905
1906
1907
1908 int from = 0;
1909
1910
1911
1912
1913 boolean isStartedFromPath = false;
1914 int atColon = tmp.indexOf(':');
1915 int atSlash = tmp.indexOf('/');
1916 if (atColon <= 0 || (atSlash >= 0 && atSlash < atColon)) {
1917 isStartedFromPath = true;
1918 }
1919
1920
1921
1922
1923
1924
1925
1926 int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
1927 if (at == -1) {
1928 at = 0;
1929 }
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939 if (at > 0 && at < length && tmp.charAt(at) == ':') {
1940 char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
1941 if (validate(target, scheme)) {
1942 _scheme = target;
1943 } else {
1944 throw new URIException("incorrect scheme");
1945 }
1946 from = ++at;
1947 }
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958 _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
1959 if (0 <= at && at < length && tmp.charAt(at) == '/') {
1960
1961 _is_hier_part = true;
1962 if (at + 2 < length && tmp.charAt(at + 1) == '/') {
1963
1964 int next = indexFirstOf(tmp, "/?#", at + 2);
1965 if (next == -1) {
1966 next = (tmp.substring(at + 2).length() == 0) ? at + 2
1967 : tmp.length();
1968 }
1969 parseAuthority(tmp.substring(at + 2, next), escaped);
1970 from = at = next;
1971
1972 _is_net_path = true;
1973 }
1974 if (from == at) {
1975
1976 _is_abs_path = true;
1977 }
1978 }
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988 if (from < length) {
1989
1990 int next = indexFirstOf(tmp, "?#", from);
1991 if (next == -1) {
1992 next = tmp.length();
1993 }
1994 if (!_is_abs_path) {
1995 if (!escaped
1996 && prevalidate(tmp.substring(from, next), disallowed_rel_path)
1997 || escaped
1998 && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
1999
2000 _is_rel_path = true;
2001 } else if (!escaped
2002 && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
2003 || escaped
2004 && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
2005
2006 _is_opaque_part = true;
2007 } else {
2008
2009 _path = null;
2010 }
2011 }
2012 if (escaped) {
2013 setRawPath(tmp.substring(from, next).toCharArray());
2014 } else {
2015 setPath(tmp.substring(from, next));
2016 }
2017 at = next;
2018 }
2019
2020
2021 String charset = getProtocolCharset();
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031 if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
2032 int next = tmp.indexOf('#', at + 1);
2033 if (next == -1) {
2034 next = tmp.length();
2035 }
2036 _query = (escaped) ? tmp.substring(at + 1, next).toCharArray()
2037 : encode(tmp.substring(at + 1, next), allowed_query, charset);
2038 at = next;
2039 }
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049 if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
2050 if (at + 1 == length) {
2051 _fragment = "".toCharArray();
2052 } else {
2053 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
2054 : encode(tmp.substring(at + 1), allowed_fragment, charset);
2055 }
2056 }
2057
2058
2059 setURI();
2060 }
2061
2062
2063 /***
2064 * Get the earlier index that to be searched for the first occurrance in
2065 * one of any of the given string.
2066 *
2067 * @param s the string to be indexed
2068 * @param delims the delimiters used to index
2069 * @return the earlier index if there are delimiters
2070 */
2071 protected int indexFirstOf(String s, String delims) {
2072 return indexFirstOf(s, delims, -1);
2073 }
2074
2075
2076 /***
2077 * Get the earlier index that to be searched for the first occurrance in
2078 * one of any of the given string.
2079 *
2080 * @param s the string to be indexed
2081 * @param delims the delimiters used to index
2082 * @param offset the from index
2083 * @return the earlier index if there are delimiters
2084 */
2085 protected int indexFirstOf(String s, String delims, int offset) {
2086 if (s == null || s.length() == 0) {
2087 return -1;
2088 }
2089 if (delims == null || delims.length() == 0) {
2090 return -1;
2091 }
2092
2093 if (offset < 0) {
2094 offset = 0;
2095 } else if (offset > s.length()) {
2096 return -1;
2097 }
2098
2099 int min = s.length();
2100 char[] delim = delims.toCharArray();
2101 for (int i = 0; i < delim.length; i++) {
2102 int at = s.indexOf(delim[i], offset);
2103 if (at >= 0 && at < min) {
2104 min = at;
2105 }
2106 }
2107 return (min == s.length()) ? -1 : min;
2108 }
2109
2110
2111 /***
2112 * Get the earlier index that to be searched for the first occurrance in
2113 * one of any of the given array.
2114 *
2115 * @param s the character array to be indexed
2116 * @param delim the delimiter used to index
2117 * @return the ealier index if there are a delimiter
2118 */
2119 protected int indexFirstOf(char[] s, char delim) {
2120 return indexFirstOf(s, delim, 0);
2121 }
2122
2123
2124 /***
2125 * Get the earlier index that to be searched for the first occurrance in
2126 * one of any of the given array.
2127 *
2128 * @param s the character array to be indexed
2129 * @param delim the delimiter used to index
2130 * @param offset The offset.
2131 * @return the ealier index if there is a delimiter
2132 */
2133 protected int indexFirstOf(char[] s, char delim, int offset) {
2134 if (s == null || s.length == 0) {
2135 return -1;
2136 }
2137
2138 if (offset < 0) {
2139 offset = 0;
2140 } else if (offset > s.length) {
2141 return -1;
2142 }
2143 for (int i = offset; i < s.length; i++) {
2144 if (s[i] == delim) {
2145 return i;
2146 }
2147 }
2148 return -1;
2149 }
2150
2151
2152 /***
2153 * Parse the authority component.
2154 *
2155 * @param original the original character sequence of authority component
2156 * @param escaped <code>true</code> if <code>original</code> is escaped
2157 * @throws URIException If an error occurs.
2158 */
2159 protected void parseAuthority(String original, boolean escaped)
2160 throws URIException {
2161
2162
2163 _is_reg_name = _is_server =
2164 _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2165
2166
2167 String charset = getProtocolCharset();
2168
2169 boolean hasPort = true;
2170 int from = 0;
2171 int next = original.indexOf('@');
2172 if (next != -1) {
2173
2174 _userinfo = (escaped) ? original.substring(0, next).toCharArray()
2175 : encode(original.substring(0, next), allowed_userinfo,
2176 charset);
2177 from = next + 1;
2178 }
2179 next = original.indexOf('[', from);
2180 if (next >= from) {
2181 next = original.indexOf(']', from);
2182 if (next == -1) {
2183 throw new URIException(URIException.PARSING, "IPv6reference");
2184 } else {
2185 next++;
2186 }
2187
2188 _host = (escaped) ? original.substring(from, next).toCharArray()
2189 : encode(original.substring(from, next), allowed_IPv6reference,
2190 charset);
2191
2192 _is_IPv6reference = true;
2193 } else {
2194 next = original.indexOf(':', from);
2195 if (next == -1) {
2196 next = original.length();
2197 hasPort = false;
2198 }
2199
2200 _host = original.substring(from, next).toCharArray();
2201 if (validate(_host, IPv4address)) {
2202
2203 _is_IPv4address = true;
2204 } else if (validate(_host, hostname)) {
2205
2206 _is_hostname = true;
2207 } else {
2208
2209 _is_reg_name = true;
2210 }
2211 }
2212 if (_is_reg_name) {
2213
2214 _is_server = _is_hostname = _is_IPv4address =
2215 _is_IPv6reference = false;
2216
2217 _authority = (escaped) ? original.toString().toCharArray()
2218 : encode(original.toString(), allowed_reg_name, charset);
2219 } else {
2220 if (original.length() - 1 > next && hasPort
2221 && original.charAt(next) == ':') {
2222 from = next + 1;
2223 try {
2224 _port = Integer.parseInt(original.substring(from));
2225 } catch (NumberFormatException error) {
2226 throw new URIException(URIException.PARSING,
2227 "invalid port number");
2228 }
2229 }
2230
2231 StringBuffer buf = new StringBuffer();
2232 if (_userinfo != null) {
2233 buf.append(_userinfo);
2234 buf.append('@');
2235 }
2236 if (_host != null) {
2237 buf.append(_host);
2238 if (_port != -1) {
2239 buf.append(':');
2240 buf.append(_port);
2241 }
2242 }
2243 _authority = buf.toString().toCharArray();
2244
2245 _is_server = true;
2246 }
2247 }
2248
2249
2250 /***
2251 * Once it's parsed successfully, set this URI.
2252 *
2253 * @see #getRawURI
2254 */
2255 protected void setURI() {
2256
2257 StringBuffer buf = new StringBuffer();
2258
2259 if (_scheme != null) {
2260 buf.append(_scheme);
2261 buf.append(':');
2262 }
2263 if (_is_net_path) {
2264 buf.append("//");
2265 if (_authority != null) {
2266 if (_userinfo != null) {
2267 if (_host != null) {
2268 buf.append(_host);
2269 if (_port != -1) {
2270 buf.append(':');
2271 buf.append(_port);
2272 }
2273 }
2274 } else {
2275 buf.append(_authority);
2276 }
2277 }
2278 }
2279 if (_opaque != null && _is_opaque_part) {
2280 buf.append(_opaque);
2281 } else if (_path != null) {
2282
2283 if (_path.length != 0) {
2284 buf.append(_path);
2285 }
2286 }
2287 if (_query != null) {
2288 buf.append('?');
2289 buf.append(_query);
2290 }
2291
2292 _uri = buf.toString().toCharArray();
2293 hash = 0;
2294 }
2295
2296
2297
2298
2299 /***
2300 * Tell whether or not this URI is absolute.
2301 *
2302 * @return true iif this URI is absoluteURI
2303 */
2304 public boolean isAbsoluteURI() {
2305 return (_scheme != null);
2306 }
2307
2308
2309 /***
2310 * Tell whether or not this URI is relative.
2311 *
2312 * @return true iif this URI is relativeURI
2313 */
2314 public boolean isRelativeURI() {
2315 return (_scheme == null);
2316 }
2317
2318
2319 /***
2320 * Tell whether or not the absoluteURI of this URI is hier_part.
2321 *
2322 * @return true iif the absoluteURI is hier_part
2323 */
2324 public boolean isHierPart() {
2325 return _is_hier_part;
2326 }
2327
2328
2329 /***
2330 * Tell whether or not the absoluteURI of this URI is opaque_part.
2331 *
2332 * @return true iif the absoluteURI is opaque_part
2333 */
2334 public boolean isOpaquePart() {
2335 return _is_opaque_part;
2336 }
2337
2338
2339 /***
2340 * Tell whether or not the relativeURI or heir_part of this URI is net_path.
2341 * It's the same function as the has_authority() method.
2342 *
2343 * @return true iif the relativeURI or heir_part is net_path
2344 * @see #hasAuthority
2345 */
2346 public boolean isNetPath() {
2347 return _is_net_path || (_authority != null);
2348 }
2349
2350
2351 /***
2352 * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
2353 *
2354 * @return true iif the relativeURI or hier_part is abs_path
2355 */
2356 public boolean isAbsPath() {
2357 return _is_abs_path;
2358 }
2359
2360
2361 /***
2362 * Tell whether or not the relativeURI of this URI is rel_path.
2363 *
2364 * @return true iif the relativeURI is rel_path
2365 */
2366 public boolean isRelPath() {
2367 return _is_rel_path;
2368 }
2369
2370
2371 /***
2372 * Tell whether or not this URI has authority.
2373 * It's the same function as the is_net_path() method.
2374 *
2375 * @return true iif this URI has authority
2376 * @see #isNetPath
2377 */
2378 public boolean hasAuthority() {
2379 return (_authority != null) || _is_net_path;
2380 }
2381
2382 /***
2383 * Tell whether or not the authority component of this URI is reg_name.
2384 *
2385 * @return true iif the authority component is reg_name
2386 */
2387 public boolean isRegName() {
2388 return _is_reg_name;
2389 }
2390
2391
2392 /***
2393 * Tell whether or not the authority component of this URI is server.
2394 *
2395 * @return true iif the authority component is server
2396 */
2397 public boolean isServer() {
2398 return _is_server;
2399 }
2400
2401
2402 /***
2403 * Tell whether or not this URI has userinfo.
2404 *
2405 * @return true iif this URI has userinfo
2406 */
2407 public boolean hasUserinfo() {
2408 return (_userinfo != null);
2409 }
2410
2411
2412 /***
2413 * Tell whether or not the host part of this URI is hostname.
2414 *
2415 * @return true iif the host part is hostname
2416 */
2417 public boolean isHostname() {
2418 return _is_hostname;
2419 }
2420
2421
2422 /***
2423 * Tell whether or not the host part of this URI is IPv4address.
2424 *
2425 * @return true iif the host part is IPv4address
2426 */
2427 public boolean isIPv4address() {
2428 return _is_IPv4address;
2429 }
2430
2431
2432 /***
2433 * Tell whether or not the host part of this URI is IPv6reference.
2434 *
2435 * @return true iif the host part is IPv6reference
2436 */
2437 public boolean isIPv6reference() {
2438 return _is_IPv6reference;
2439 }
2440
2441
2442 /***
2443 * Tell whether or not this URI has query.
2444 *
2445 * @return true iif this URI has query
2446 */
2447 public boolean hasQuery() {
2448 return (_query != null);
2449 }
2450
2451
2452 /***
2453 * Tell whether or not this URI has fragment.
2454 *
2455 * @return true iif this URI has fragment
2456 */
2457 public boolean hasFragment() {
2458 return (_fragment != null);
2459 }
2460
2461
2462
2463
2464
2465 /***
2466 * Set the default charset of the protocol.
2467 * <p>
2468 * The character set used to store files SHALL remain a local decision and
2469 * MAY depend on the capability of local operating systems. Prior to the
2470 * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
2471 * and UTF-8 encoded. This approach, while allowing international exchange
2472 * of URIs, will still allow backward compatibility with older systems
2473 * because the code set positions for ASCII characters are identical to the
2474 * one byte sequence in UTF-8.
2475 * <p>
2476 * An individual URI scheme may require a single charset, define a default
2477 * charset, or provide a way to indicate the charset used.
2478 *
2479 * <p>
2480 * Always all the time, the setter method is always succeeded and throws
2481 * <code>DefaultCharsetChanged</code> exception.
2482 *
2483 * So API programmer must follow the following way:
2484 * <code><pre>
2485 * import org.apache.util.URI$DefaultCharsetChanged;
2486 * .
2487 * .
2488 * .
2489 * try {
2490 * URI.setDefaultProtocolCharset("UTF-8");
2491 * } catch (DefaultCharsetChanged cc) {
2492 * // CASE 1: the exception could be ignored, when it is set by user
2493 * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
2494 * // CASE 2: let user know the default protocol charset changed
2495 * } else {
2496 * // CASE 2: let user know the default document charset changed
2497 * }
2498 * }
2499 * </pre></code>
2500 *
2501 * The API programmer is responsible to set the correct charset.
2502 * And each application should remember its own charset to support.
2503 *
2504 * @param charset the default charset for each protocol
2505 * @throws DefaultCharsetChanged default charset changed
2506 */
2507 public static void setDefaultProtocolCharset(String charset)
2508 throws DefaultCharsetChanged {
2509
2510 defaultProtocolCharset = charset;
2511 throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET,
2512 "the default protocol charset changed");
2513 }
2514
2515
2516 /***
2517 * Get the default charset of the protocol.
2518 * <p>
2519 * An individual URI scheme may require a single charset, define a default
2520 * charset, or provide a way to indicate the charset used.
2521 * <p>
2522 * To work globally either requires support of a number of character sets
2523 * and to be able to convert between them, or the use of a single preferred
2524 * character set.
2525 * For support of global compatibility it is STRONGLY RECOMMENDED that
2526 * clients and servers use UTF-8 encoding when exchanging URIs.
2527 *
2528 * @return the default charset string
2529 */
2530 public static String getDefaultProtocolCharset() {
2531 return defaultProtocolCharset;
2532 }
2533
2534
2535 /***
2536 * Get the protocol charset used by this current URI instance.
2537 * It was set by the constructor for this instance. If it was not set by
2538 * contructor, it will return the default protocol charset.
2539 *
2540 * @return the protocol charset string
2541 * @see #getDefaultProtocolCharset
2542 */
2543 public String getProtocolCharset() {
2544 return (protocolCharset != null)
2545 ? protocolCharset
2546 : defaultProtocolCharset;
2547 }
2548
2549
2550 /***
2551 * Set the default charset of the document.
2552 * <p>
2553 * Notice that it will be possible to contain mixed characters (e.g.
2554 * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
2555 * display of these character sets, the protocol charset could be simply
2556 * used again. Because it's not yet implemented that the insertion of BIDI
2557 * control characters at different points during composition is extracted.
2558 * <p>
2559 *
2560 * Always all the time, the setter method is always succeeded and throws
2561 * <code>DefaultCharsetChanged</code> exception.
2562 *
2563 * So API programmer must follow the following way:
2564 * <code><pre>
2565 * import org.apache.util.URI$DefaultCharsetChanged;
2566 * .
2567 * .
2568 * .
2569 * try {
2570 * URI.setDefaultDocumentCharset("EUC-KR");
2571 * } catch (DefaultCharsetChanged cc) {
2572 * // CASE 1: the exception could be ignored, when it is set by user
2573 * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
2574 * // CASE 2: let user know the default document charset changed
2575 * } else {
2576 * // CASE 2: let user know the default protocol charset changed
2577 * }
2578 * }
2579 * </pre></code>
2580 *
2581 * The API programmer is responsible to set the correct charset.
2582 * And each application should remember its own charset to support.
2583 *
2584 * @param charset the default charset for the document
2585 * @throws DefaultCharsetChanged default charset changed
2586 */
2587 public static void setDefaultDocumentCharset(String charset)
2588 throws DefaultCharsetChanged {
2589
2590 defaultDocumentCharset = charset;
2591 throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET,
2592 "the default document charset changed");
2593 }
2594
2595
2596 /***
2597 * Get the recommended default charset of the document.
2598 *
2599 * @return the default charset string
2600 */
2601 public static String getDefaultDocumentCharset() {
2602 return defaultDocumentCharset;
2603 }
2604
2605
2606 /***
2607 * Get the default charset of the document by locale.
2608 *
2609 * @return the default charset string by locale
2610 */
2611 public static String getDefaultDocumentCharsetByLocale() {
2612 return defaultDocumentCharsetByLocale;
2613 }
2614
2615
2616 /***
2617 * Get the default charset of the document by platform.
2618 *
2619 * @return the default charset string by platform
2620 */
2621 public static String getDefaultDocumentCharsetByPlatform() {
2622 return defaultDocumentCharsetByPlatform;
2623 }
2624
2625
2626
2627 /***
2628 * Get the scheme.
2629 *
2630 * @return the scheme
2631 */
2632 public char[] getRawScheme() {
2633 return _scheme;
2634 }
2635
2636
2637 /***
2638 * Get the scheme.
2639 *
2640 * @return the scheme
2641 * null if undefined scheme
2642 */
2643 public String getScheme() {
2644 return (_scheme == null) ? null : new String(_scheme);
2645 }
2646
2647
2648
2649 /***
2650 * Set the authority. It can be one type of server, hostport, hostname,
2651 * IPv4address, IPv6reference and reg_name.
2652 * <p><blockquote><pre>
2653 * authority = server | reg_name
2654 * </pre></blockquote><p>
2655 *
2656 * @param escapedAuthority the raw escaped authority
2657 * @throws URIException If {@link
2658 * #parseAuthority(java.lang.String,boolean)} fails
2659 * @throws NullPointerException null authority
2660 */
2661 public void setRawAuthority(char[] escapedAuthority)
2662 throws URIException, NullPointerException {
2663
2664 parseAuthority(new String(escapedAuthority), true);
2665 setURI();
2666 }
2667
2668
2669 /***
2670 * Set the authority. It can be one type of server, hostport, hostname,
2671 * IPv4address, IPv6reference and reg_name.
2672 * Note that there is no setAuthority method by the escape encoding reason.
2673 *
2674 * @param escapedAuthority the escaped authority string
2675 * @throws URIException If {@link
2676 * #parseAuthority(java.lang.String,boolean)} fails
2677 */
2678 public void setEscapedAuthority(String escapedAuthority)
2679 throws URIException {
2680
2681 parseAuthority(escapedAuthority, true);
2682 setURI();
2683 }
2684
2685
2686 /***
2687 * Get the raw-escaped authority.
2688 *
2689 * @return the raw-escaped authority
2690 */
2691 public char[] getRawAuthority() {
2692 return _authority;
2693 }
2694
2695
2696 /***
2697 * Get the escaped authority.
2698 *
2699 * @return the escaped authority
2700 */
2701 public String getEscapedAuthority() {
2702 return (_authority == null) ? null : new String(_authority);
2703 }
2704
2705
2706 /***
2707 * Get the authority.
2708 *
2709 * @return the authority
2710 * @throws URIException If {@link #decode} fails
2711 */
2712 public String getAuthority() throws URIException {
2713 return (_authority == null) ? null : decode(_authority,
2714 getProtocolCharset());
2715 }
2716
2717
2718
2719 /***
2720 * Get the raw-escaped userinfo.
2721 *
2722 * @return the raw-escaped userinfo
2723 * @see #getAuthority
2724 */
2725 public char[] getRawUserinfo() {
2726 return _userinfo;
2727 }
2728
2729
2730 /***
2731 * Get the escaped userinfo.
2732 *
2733 * @return the escaped userinfo
2734 * @see #getAuthority
2735 */
2736 public String getEscapedUserinfo() {
2737 return (_userinfo == null) ? null : new String(_userinfo);
2738 }
2739
2740
2741 /***
2742 * Get the userinfo.
2743 *
2744 * @return the userinfo
2745 * @throws URIException If {@link #decode} fails
2746 * @see #getAuthority
2747 */
2748 public String getUserinfo() throws URIException {
2749 return (_userinfo == null) ? null : decode(_userinfo,
2750 getProtocolCharset());
2751 }
2752
2753
2754
2755 /***
2756 * Get the host.
2757 * <p><blockquote><pre>
2758 * host = hostname | IPv4address | IPv6reference
2759 * </pre></blockquote><p>
2760 *
2761 * @return the host
2762 * @see #getAuthority
2763 */
2764 public char[] getRawHost() {
2765 return _host;
2766 }
2767
2768
2769 /***
2770 * Get the host.
2771 * <p><blockquote><pre>
2772 * host = hostname | IPv4address | IPv6reference
2773 * </pre></blockquote><p>
2774 *
2775 * @return the host
2776 * @throws URIException If {@link #decode} fails
2777 * @see #getAuthority
2778 */
2779 public String getHost() throws URIException {
2780 if (_host != null) {
2781 return decode(_host, getProtocolCharset());
2782 } else {
2783 return null;
2784 }
2785 }
2786
2787
2788
2789 /***
2790 * Get the port. In order to get the specfic default port, the specific
2791 * protocol-supported class extended from the URI class should be used.
2792 * It has the server-based naming authority.
2793 *
2794 * @return the port
2795 * if -1, it has the default port for the scheme or the server-based
2796 * naming authority is not supported in the specific URI.
2797 */
2798 public int getPort() {
2799 return _port;
2800 }
2801
2802
2803
2804 /***
2805 * Set the raw-escaped path.
2806 *
2807 * @param escapedPath the path character sequence
2808 * @throws URIException encoding error or not proper for initial instance
2809 * @see #encode
2810 */
2811 public void setRawPath(char[] escapedPath) throws URIException {
2812 if (escapedPath == null || escapedPath.length == 0) {
2813 _path = _opaque = escapedPath;
2814 setURI();
2815 return;
2816 }
2817
2818 escapedPath = removeFragmentIdentifier(escapedPath);
2819 if (_is_net_path || _is_abs_path) {
2820 if (escapedPath[0] != '/') {
2821 throw new URIException(URIException.PARSING,
2822 "not absolute path");
2823 }
2824 if (!validate(escapedPath, abs_path)) {
2825 throw new URIException(URIException.ESCAPING,
2826 "escaped absolute path not valid");
2827 }
2828 _path = escapedPath;
2829 } else if (_is_rel_path) {
2830 int at = indexFirstOf(escapedPath, '/');
2831 if (at == 0) {
2832 throw new URIException(URIException.PARSING, "incorrect path");
2833 }
2834 if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment)
2835 && !validate(escapedPath, at, -1, abs_path)
2836 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) {
2837
2838 throw new URIException(URIException.ESCAPING,
2839 "escaped relative path not valid");
2840 }
2841 _path = escapedPath;
2842 } else if (_is_opaque_part) {
2843 if (!uric_no_slash.get(escapedPath[0])
2844 && !validate(escapedPath, 1, -1, uric)) {
2845 throw new URIException(URIException.ESCAPING,
2846 "escaped opaque part not valid");
2847 }
2848 _opaque = escapedPath;
2849 } else {
2850 throw new URIException(URIException.PARSING, "incorrect path");
2851 }
2852 setURI();
2853 }
2854
2855
2856 /***
2857 * Set the escaped path.
2858 *
2859 * @param escapedPath the escaped path string
2860 * @throws URIException encoding error or not proper for initial instance
2861 * @see #encode
2862 */
2863 public void setEscapedPath(String escapedPath) throws URIException {
2864 if (escapedPath == null) {
2865 _path = _opaque = null;
2866 setURI();
2867 return;
2868 }
2869 setRawPath(escapedPath.toCharArray());
2870 }
2871
2872
2873 /***
2874 * Set the path.
2875 *
2876 * @param path the path string
2877 * @throws URIException set incorrectly or fragment only
2878 * @see #encode
2879 */
2880 public void setPath(String path) throws URIException {
2881
2882 if (path == null || path.length() == 0) {
2883 _path = _opaque = (path == null) ? null : path.toCharArray();
2884 setURI();
2885 return;
2886 }
2887
2888 String charset = getProtocolCharset();
2889
2890 if (_is_net_path || _is_abs_path) {
2891 _path = encode(path, allowed_abs_path, charset);
2892 } else if (_is_rel_path) {
2893 StringBuffer buff = new StringBuffer(path.length());
2894 int at = path.indexOf('/');
2895 if (at == 0) {
2896 throw new URIException(URIException.PARSING,
2897 "incorrect relative path");
2898 }
2899 if (at > 0) {
2900 buff.append(encode(path.substring(0, at), allowed_rel_path,
2901 charset));
2902 buff.append(encode(path.substring(at), allowed_abs_path,
2903 charset));
2904 } else {
2905 buff.append(encode(path, allowed_rel_path, charset));
2906 }
2907 _path = buff.toString().toCharArray();
2908 } else if (_is_opaque_part) {
2909 StringBuffer buf = new StringBuffer();
2910 buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset));
2911 buf.insert(1, encode(path.substring(1), uric, charset));
2912 _opaque = buf.toString().toCharArray();
2913 } else {
2914 throw new URIException(URIException.PARSING, "incorrect path");
2915 }
2916 setURI();
2917 }
2918
2919
2920 /***
2921 * Resolve the base and relative path.
2922 *
2923 * @param basePath a character array of the basePath
2924 * @param relPath a character array of the relPath
2925 * @return the resolved path
2926 * @throws URIException no more higher path level to be resolved
2927 */
2928 protected char[] resolvePath(char[] basePath, char[] relPath)
2929 throws URIException {
2930
2931
2932 String base = (basePath == null) ? "" : new String(basePath);
2933 int at = base.lastIndexOf('/');
2934 if (at != -1) {
2935 basePath = base.substring(0, at + 1).toCharArray();
2936 }
2937
2938 if (relPath == null || relPath.length == 0) {
2939 return normalize(basePath);
2940 } else if (relPath[0] == '/') {
2941 return normalize(relPath);
2942 } else {
2943 StringBuffer buff = new StringBuffer(base.length()
2944 + relPath.length);
2945 buff.append((at != -1) ? base.substring(0, at + 1) : "/");
2946 buff.append(relPath);
2947 return normalize(buff.toString().toCharArray());
2948 }
2949 }
2950
2951
2952 /***
2953 * Get the raw-escaped current hierarchy level in the given path.
2954 * If the last namespace is a collection, the slash mark ('/') should be
2955 * ended with at the last character of the path string.
2956 *
2957 * @param path the path
2958 * @return the current hierarchy level
2959 * @throws URIException no hierarchy level
2960 */
2961 protected char[] getRawCurrentHierPath(char[] path) throws URIException {
2962
2963 if (_is_opaque_part) {
2964 throw new URIException(URIException.PARSING, "no hierarchy level");
2965 }
2966 if (path == null) {
2967 throw new URIException(URIException.PARSING, "empty path");
2968 }
2969 String buff = new String(path);
2970 int first = buff.indexOf('/');
2971 int last = buff.lastIndexOf('/');
2972 if (last == 0) {
2973 return rootPath;
2974 } else if (first != last && last != -1) {
2975 return buff.substring(0, last).toCharArray();
2976 }
2977
2978 return path;
2979 }
2980
2981
2982 /***
2983 * Get the raw-escaped current hierarchy level.
2984 *
2985 * @return the raw-escaped current hierarchy level
2986 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2987 */
2988 public char[] getRawCurrentHierPath() throws URIException {
2989 return (_path == null) ? null : getRawCurrentHierPath(_path);
2990 }
2991
2992
2993 /***
2994 * Get the escaped current hierarchy level.
2995 *
2996 * @return the escaped current hierarchy level
2997 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2998 */
2999 public String getEscapedCurrentHierPath() throws URIException {
3000 char[] path = getRawCurrentHierPath();
3001 return (path == null) ? null : new String(path);
3002 }
3003
3004
3005 /***
3006 * Get the current hierarchy level.
3007 *
3008 * @return the current hierarchy level
3009 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3010 * @see #decode
3011 */
3012 public String getCurrentHierPath() throws URIException {
3013 char[] path = getRawCurrentHierPath();
3014 return (path == null) ? null : decode(path, getProtocolCharset());
3015 }
3016
3017
3018 /***
3019 * Get the level above the this hierarchy level.
3020 *
3021 * @return the raw above hierarchy level
3022 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3023 */
3024 public char[] getRawAboveHierPath() throws URIException {
3025 char[] path = getRawCurrentHierPath();
3026 return (path == null) ? null : getRawCurrentHierPath(path);
3027 }
3028
3029
3030 /***
3031 * Get the level above the this hierarchy level.
3032 *
3033 * @return the raw above hierarchy level
3034 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3035 */
3036 public String getEscapedAboveHierPath() throws URIException {
3037 char[] path = getRawAboveHierPath();
3038 return (path == null) ? null : new String(path);
3039 }
3040
3041
3042 /***
3043 * Get the level above the this hierarchy level.
3044 *
3045 * @return the above hierarchy level
3046 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3047 * @see #decode
3048 */
3049 public String getAboveHierPath() throws URIException {
3050 char[] path = getRawAboveHierPath();
3051 return (path == null) ? null : decode(path, getProtocolCharset());
3052 }
3053
3054
3055 /***
3056 * Get the raw-escaped path.
3057 * <p><blockquote><pre>
3058 * path = [ abs_path | opaque_part ]
3059 * </pre></blockquote><p>
3060 *
3061 * @return the raw-escaped path
3062 */
3063 public char[] getRawPath() {
3064 return _is_opaque_part ? _opaque : _path;
3065 }
3066
3067
3068 /***
3069 * Get the escaped path.
3070 * <p><blockquote><pre>
3071 * path = [ abs_path | opaque_part ]
3072 * abs_path = "/" path_segments
3073 * opaque_part = uric_no_slash *uric
3074 * </pre></blockquote><p>
3075 *
3076 * @return the escaped path string
3077 */
3078 public String getEscapedPath() {
3079 char[] path = getRawPath();
3080 return (path == null) ? null : new String(path);
3081 }
3082
3083
3084 /***
3085 * Get the path.
3086 * <p><blockquote><pre>
3087 * path = [ abs_path | opaque_part ]
3088 * </pre></blockquote><p>
3089 * @return the path string
3090 * @throws URIException If {@link #decode} fails.
3091 * @see #decode
3092 */
3093 public String getPath() throws URIException {
3094 char[] path = getRawPath();
3095 return (path == null) ? null : decode(path, getProtocolCharset());
3096 }
3097
3098
3099 /***
3100 * Get the raw-escaped basename of the path.
3101 *
3102 * @return the raw-escaped basename
3103 */
3104 public char[] getRawName() {
3105 if (_path == null) {
3106 return null;
3107 }
3108
3109 int at = 0;
3110 for (int i = _path.length - 1; i >= 0; i--) {
3111 if (_path[i] == '/') {
3112 at = i + 1;
3113 break;
3114 }
3115 }
3116 int len = _path.length - at;
3117 char[] basename = new char[len];
3118 System.arraycopy(_path, at, basename, 0, len);
3119 return basename;
3120 }
3121
3122
3123 /***
3124 * Get the escaped basename of the path.
3125 *
3126 * @return the escaped basename string
3127 */
3128 public String getEscapedName() {
3129 char[] basename = getRawName();
3130 return (basename == null) ? null : new String(basename);
3131 }
3132
3133
3134 /***
3135 * Get the basename of the path.
3136 *
3137 * @return the basename string
3138 * @throws URIException incomplete trailing escape pattern or unsupported
3139 * character encoding
3140 * @see #decode
3141 */
3142 public String getName() throws URIException {
3143 char[] basename = getRawName();
3144 return (basename == null) ? null : decode(getRawName(),
3145 getProtocolCharset());
3146 }
3147
3148
3149
3150 /***
3151 * Get the raw-escaped path and query.
3152 *
3153 * @return the raw-escaped path and query
3154 */
3155 public char[] getRawPathQuery() {
3156
3157 if (_path == null && _query == null) {
3158 return null;
3159 }
3160 StringBuffer buff = new StringBuffer();
3161 if (_path != null) {
3162 buff.append(_path);
3163 }
3164 if (_query != null) {
3165 buff.append('?');
3166 buff.append(_query);
3167 }
3168 return buff.toString().toCharArray();
3169 }
3170
3171
3172 /***
3173 * Get the escaped query.
3174 *
3175 * @return the escaped path and query string
3176 */
3177 public String getEscapedPathQuery() {
3178 char[] rawPathQuery = getRawPathQuery();
3179 return (rawPathQuery == null) ? null : new String(rawPathQuery);
3180 }
3181
3182
3183 /***
3184 * Get the path and query.
3185 *
3186 * @return the path and query string.
3187 * @throws URIException incomplete trailing escape pattern or unsupported
3188 * character encoding
3189 * @see #decode
3190 */
3191 public String getPathQuery() throws URIException {
3192 char[] rawPathQuery = getRawPathQuery();
3193 return (rawPathQuery == null) ? null : decode(rawPathQuery,
3194 getProtocolCharset());
3195 }
3196
3197
3198
3199 /***
3200 * Set the raw-escaped query.
3201 *
3202 * @param escapedQuery the raw-escaped query
3203 * @throws URIException escaped query not valid
3204 */
3205 public void setRawQuery(char[] escapedQuery) throws URIException {
3206 if (escapedQuery == null || escapedQuery.length == 0) {
3207 _query = escapedQuery;
3208 setURI();
3209 return;
3210 }
3211
3212 escapedQuery = removeFragmentIdentifier(escapedQuery);
3213 if (!validate(escapedQuery, query)) {
3214 throw new URIException(URIException.ESCAPING,
3215 "escaped query not valid");
3216 }
3217 _query = escapedQuery;
3218 setURI();
3219 }
3220
3221
3222 /***
3223 * Set the escaped query string.
3224 *
3225 * @param escapedQuery the escaped query string
3226 * @throws URIException escaped query not valid
3227 */
3228 public void setEscapedQuery(String escapedQuery) throws URIException {
3229 if (escapedQuery == null) {
3230 _query = null;
3231 setURI();
3232 return;
3233 }
3234 setRawQuery(escapedQuery.toCharArray());
3235 }
3236
3237
3238 /***
3239 * Set the query.
3240 * <p>
3241 * When a query string is not misunderstood the reserved special characters
3242 * ("&", "=", "+", ",", and "$") within a query component, it is
3243 * recommended to use in encoding the whole query with this method.
3244 * <p>
3245 * The additional APIs for the special purpose using by the reserved
3246 * special characters used in each protocol are implemented in each protocol
3247 * classes inherited from <code>URI</code>. So refer to the same-named APIs
3248 * implemented in each specific protocol instance.
3249 *
3250 * @param query the query string.
3251 * @throws URIException incomplete trailing escape pattern or unsupported
3252 * character encoding
3253 * @see #encode
3254 */
3255 public void setQuery(String query) throws URIException {
3256 if (query == null || query.length() == 0) {
3257 _query = (query == null) ? null : query.toCharArray();
3258 setURI();
3259 return;
3260 }
3261 setRawQuery(encode(query, allowed_query, getProtocolCharset()));
3262 }
3263
3264
3265 /***
3266 * Get the raw-escaped query.
3267 *
3268 * @return the raw-escaped query
3269 */
3270 public char[] getRawQuery() {
3271 return _query;
3272 }
3273
3274
3275 /***
3276 * Get the escaped query.
3277 *
3278 * @return the escaped query string
3279 */
3280 public String getEscapedQuery() {
3281 return (_query == null) ? null : new String(_query);
3282 }
3283
3284
3285 /***
3286 * Get the query.
3287 *
3288 * @return the query string.
3289 * @throws URIException incomplete trailing escape pattern or unsupported
3290 * character encoding
3291 * @see #decode
3292 */
3293 public String getQuery() throws URIException {
3294 return (_query == null) ? null : decode(_query, getProtocolCharset());
3295 }
3296
3297
3298
3299 /***
3300 * Set the raw-escaped fragment.
3301 *
3302 * @param escapedFragment the raw-escaped fragment
3303 * @throws URIException escaped fragment not valid
3304 */
3305 public void setRawFragment(char[] escapedFragment) throws URIException {
3306 if (escapedFragment == null || escapedFragment.length == 0) {
3307 _fragment = escapedFragment;
3308 hash = 0;
3309 return;
3310 }
3311 if (!validate(escapedFragment, fragment)) {
3312 throw new URIException(URIException.ESCAPING,
3313 "escaped fragment not valid");
3314 }
3315 _fragment = escapedFragment;
3316 hash = 0;
3317 }
3318
3319
3320 /***
3321 * Set the escaped fragment string.
3322 *
3323 * @param escapedFragment the escaped fragment string
3324 * @throws URIException escaped fragment not valid
3325 */
3326 public void setEscapedFragment(String escapedFragment) throws URIException {
3327 if (escapedFragment == null) {
3328 _fragment = null;
3329 hash = 0;
3330 return;
3331 }
3332 setRawFragment(escapedFragment.toCharArray());
3333 }
3334
3335
3336 /***
3337 * Set the fragment.
3338 *
3339 * @param fragment the fragment string.
3340 * @throws URIException If an error occurs.
3341 */
3342 public void setFragment(String fragment) throws URIException {
3343 if (fragment == null || fragment.length() == 0) {
3344 _fragment = (fragment == null) ? null : fragment.toCharArray();
3345 hash = 0;
3346 return;
3347 }
3348 _fragment = encode(fragment, allowed_fragment, getProtocolCharset());
3349 hash = 0;
3350 }
3351
3352
3353 /***
3354 * Get the raw-escaped fragment.
3355 * <p>
3356 * The optional fragment identifier is not part of a URI, but is often used
3357 * in conjunction with a URI.
3358 * <p>
3359 * The format and interpretation of fragment identifiers is dependent on
3360 * the media type [RFC2046] of the retrieval result.
3361 * <p>
3362 * A fragment identifier is only meaningful when a URI reference is
3363 * intended for retrieval and the result of that retrieval is a document
3364 * for which the identified fragment is consistently defined.
3365 *
3366 * @return the raw-escaped fragment
3367 */
3368 public char[] getRawFragment() {
3369 return _fragment;
3370 }
3371
3372
3373 /***
3374 * Get the escaped fragment.
3375 *
3376 * @return the escaped fragment string
3377 */
3378 public String getEscapedFragment() {
3379 return (_fragment == null) ? null : new String(_fragment);
3380 }
3381
3382
3383 /***
3384 * Get the fragment.
3385 *
3386 * @return the fragment string
3387 * @throws URIException incomplete trailing escape pattern or unsupported
3388 * character encoding
3389 * @see #decode
3390 */
3391 public String getFragment() throws URIException {
3392 return (_fragment == null) ? null : decode(_fragment,
3393 getProtocolCharset());
3394 }
3395
3396
3397
3398 /***
3399 * Remove the fragment identifier of the given component.
3400 *
3401 * @param component the component that a fragment may be included
3402 * @return the component that the fragment identifier is removed
3403 */
3404 protected char[] removeFragmentIdentifier(char[] component) {
3405 if (component == null) {
3406 return null;
3407 }
3408 int lastIndex = new String(component).indexOf('#');
3409 if (lastIndex != -1) {
3410 component = new String(component).substring(0,
3411 lastIndex).toCharArray();
3412 }
3413 return component;
3414 }
3415
3416
3417 /***
3418 * Normalize the given hier path part.
3419 *
3420 * <p>Algorithm taken from URI reference parser at
3421 * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
3422 *
3423 * @param path the path to normalize
3424 * @return the normalized path
3425 * @throws URIException no more higher path level to be normalized
3426 */
3427 protected char[] normalize(char[] path) throws URIException {
3428
3429 if (path == null) {
3430 return null;
3431 }
3432
3433 String normalized = new String(path);
3434
3435
3436 if (normalized.startsWith("./")) {
3437 normalized = normalized.substring(1);
3438 } else if (normalized.startsWith("../")) {
3439 normalized = normalized.substring(2);
3440 } else if (normalized.startsWith("..")) {
3441 normalized = normalized.substring(2);
3442 }
3443
3444
3445 int index = -1;
3446 while ((index = normalized.indexOf("/./")) != -1) {
3447 normalized = normalized.substring(0, index) + normalized.substring(index + 2);
3448 }
3449
3450
3451 if (normalized.endsWith("/.")) {
3452 normalized = normalized.substring(0, normalized.length() - 1);
3453 }
3454
3455 int startIndex = 0;
3456
3457
3458
3459
3460
3461
3462 while ((index = normalized.indexOf("/../", startIndex)) != -1) {
3463 int slashIndex = normalized.lastIndexOf('/', index - 1);
3464 if (slashIndex >= 0) {
3465 normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3);
3466 } else {
3467 startIndex = index + 3;
3468 }
3469 }
3470 if (normalized.endsWith("/..")) {
3471 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3472 if (slashIndex >= 0) {
3473 normalized = normalized.substring(0, slashIndex + 1);
3474 }
3475 }
3476
3477
3478
3479
3480
3481
3482 while ((index = normalized.indexOf("/../")) != -1) {
3483 int slashIndex = normalized.lastIndexOf('/', index - 1);
3484 if (slashIndex >= 0) {
3485 break;
3486 } else {
3487 normalized = normalized.substring(index + 3);
3488 }
3489 }
3490 if (normalized.endsWith("/..")) {
3491 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3492 if (slashIndex < 0) {
3493 normalized = "/";
3494 }
3495 }
3496
3497 return normalized.toCharArray();
3498 }
3499
3500
3501 /***
3502 * Normalizes the path part of this URI. Normalization is only meant to be performed on
3503 * URIs with an absolute path. Calling this method on a relative path URI will have no
3504 * effect.
3505 *
3506 * @throws URIException no more higher path level to be normalized
3507 *
3508 * @see #isAbsPath()
3509 */
3510 public void normalize() throws URIException {
3511 if (isAbsPath()) {
3512 _path = normalize(_path);
3513 setURI();
3514 }
3515 }
3516
3517
3518 /***
3519 * Test if the first array is equal to the second array.
3520 *
3521 * @param first the first character array
3522 * @param second the second character array
3523 * @return true if they're equal
3524 */
3525 protected boolean equals(char[] first, char[] second) {
3526
3527 if (first == null && second == null) {
3528 return true;
3529 }
3530 if (first == null || second == null) {
3531 return false;
3532 }
3533 if (first.length != second.length) {
3534 return false;
3535 }
3536 for (int i = 0; i < first.length; i++) {
3537 if (first[i] != second[i]) {
3538 return false;
3539 }
3540 }
3541 return true;
3542 }
3543
3544
3545 /***
3546 * Test an object if this URI is equal to another.
3547 *
3548 * @param obj an object to compare
3549 * @return true if two URI objects are equal
3550 */
3551 public boolean equals(Object obj) {
3552
3553
3554 if (obj == this) {
3555 return true;
3556 }
3557 if (!(obj instanceof URI)) {
3558 return false;
3559 }
3560 URI another = (URI) obj;
3561
3562 if (!equals(_scheme, another._scheme)) {
3563 return false;
3564 }
3565
3566 if (!equals(_opaque, another._opaque)) {
3567 return false;
3568 }
3569
3570
3571 if (!equals(_authority, another._authority)) {
3572 return false;
3573 }
3574
3575 if (!equals(_path, another._path)) {
3576 return false;
3577 }
3578
3579 if (!equals(_query, another._query)) {
3580 return false;
3581 }
3582
3583 if (!equals(_fragment, another._fragment)) {
3584 return false;
3585 }
3586 return true;
3587 }
3588
3589
3590
3591 /***
3592 * Write the content of this URI.
3593 *
3594 * @param oos the object-output stream
3595 * @throws IOException If an IO problem occurs.
3596 */
3597 protected void writeObject(ObjectOutputStream oos)
3598 throws IOException {
3599
3600 oos.defaultWriteObject();
3601 }
3602
3603
3604 /***
3605 * Read a URI.
3606 *
3607 * @param ois the object-input stream
3608 * @throws ClassNotFoundException If one of the classes specified in the
3609 * input stream cannot be found.
3610 * @throws IOException If an IO problem occurs.
3611 */
3612 protected void readObject(ObjectInputStream ois)
3613 throws ClassNotFoundException, IOException {
3614
3615 ois.defaultReadObject();
3616 }
3617
3618
3619
3620 /***
3621 * Return a hash code for this URI.
3622 *
3623 * @return a has code value for this URI
3624 */
3625 public int hashCode() {
3626 if (hash == 0) {
3627 char[] c = _uri;
3628 if (c != null) {
3629 for (int i = 0, len = c.length; i < len; i++) {
3630 hash = 31 * hash + c[i];
3631 }
3632 }
3633 c = _fragment;
3634 if (c != null) {
3635 for (int i = 0, len = c.length; i < len; i++) {
3636 hash = 31 * hash + c[i];
3637 }
3638 }
3639 }
3640 return hash;
3641 }
3642
3643
3644
3645 /***
3646 * Compare this URI to another object.
3647 *
3648 * @param obj the object to be compared.
3649 * @return 0, if it's same,
3650 * -1, if failed, first being compared with in the authority component
3651 * @throws ClassCastException not URI argument
3652 */
3653 public int compareTo(Object obj) throws ClassCastException {
3654
3655 URI another = (URI) obj;
3656 if (!equals(_authority, another.getRawAuthority())) {
3657 return -1;
3658 }
3659 return toString().compareTo(another.toString());
3660 }
3661
3662
3663
3664 /***
3665 * Create and return a copy of this object, the URI-reference containing
3666 * the userinfo component. Notice that the whole URI-reference including
3667 * the userinfo component counld not be gotten as a <code>String</code>.
3668 * <p>
3669 * To copy the identical <code>URI</code> object including the userinfo
3670 * component, it should be used.
3671 *
3672 * @return a clone of this instance
3673 */
3674 public synchronized Object clone() {
3675
3676 URI instance = new URI();
3677
3678 instance._uri = _uri;
3679 instance._scheme = _scheme;
3680 instance._opaque = _opaque;
3681 instance._authority = _authority;
3682 instance._userinfo = _userinfo;
3683 instance._host = _host;
3684 instance._port = _port;
3685 instance._path = _path;
3686 instance._query = _query;
3687 instance._fragment = _fragment;
3688
3689 instance.protocolCharset = protocolCharset;
3690
3691 instance._is_hier_part = _is_hier_part;
3692 instance._is_opaque_part = _is_opaque_part;
3693 instance._is_net_path = _is_net_path;
3694 instance._is_abs_path = _is_abs_path;
3695 instance._is_rel_path = _is_rel_path;
3696 instance._is_reg_name = _is_reg_name;
3697 instance._is_server = _is_server;
3698 instance._is_hostname = _is_hostname;
3699 instance._is_IPv4address = _is_IPv4address;
3700 instance._is_IPv6reference = _is_IPv6reference;
3701
3702 return instance;
3703 }
3704
3705
3706
3707 /***
3708 * It can be gotten the URI character sequence. It's raw-escaped.
3709 * For the purpose of the protocol to be transported, it will be useful.
3710 * <p>
3711 * It is clearly unwise to use a URL that contains a password which is
3712 * intended to be secret. In particular, the use of a password within
3713 * the 'userinfo' component of a URL is strongly disrecommended except
3714 * in those rare cases where the 'password' parameter is intended to be
3715 * public.
3716 * <p>
3717 * When you want to get each part of the userinfo, you need to use the
3718 * specific methods in the specific URL. It depends on the specific URL.
3719 *
3720 * @return the URI character sequence
3721 */
3722 public char[] getRawURI() {
3723 return _uri;
3724 }
3725
3726
3727 /***
3728 * It can be gotten the URI character sequence. It's escaped.
3729 * For the purpose of the protocol to be transported, it will be useful.
3730 *
3731 * @return the escaped URI string
3732 */
3733 public String getEscapedURI() {
3734 return (_uri == null) ? null : new String(_uri);
3735 }
3736
3737
3738 /***
3739 * It can be gotten the URI character sequence.
3740 *
3741 * @return the original URI string
3742 * @throws URIException incomplete trailing escape pattern or unsupported
3743 * character encoding
3744 * @see #decode
3745 */
3746 public String getURI() throws URIException {
3747 return (_uri == null) ? null : decode(_uri, getProtocolCharset());
3748 }
3749
3750
3751 /***
3752 * Get the URI reference character sequence.
3753 *
3754 * @return the URI reference character sequence
3755 */
3756 public char[] getRawURIReference() {
3757 if (_fragment == null) {
3758 return _uri;
3759 }
3760 if (_uri == null) {
3761 return _fragment;
3762 }
3763
3764 String uriReference = new String(_uri) + "#" + new String(_fragment);
3765 return uriReference.toCharArray();
3766 }
3767
3768
3769 /***
3770 * Get the escaped URI reference string.
3771 *
3772 * @return the escaped URI reference string
3773 */
3774 public String getEscapedURIReference() {
3775 char[] uriReference = getRawURIReference();
3776 return (uriReference == null) ? null : new String(uriReference);
3777 }
3778
3779
3780 /***
3781 * Get the original URI reference string.
3782 *
3783 * @return the original URI reference string
3784 * @throws URIException If {@link #decode} fails.
3785 */
3786 public String getURIReference() throws URIException {
3787 char[] uriReference = getRawURIReference();
3788 return (uriReference == null) ? null : decode(uriReference,
3789 getProtocolCharset());
3790 }
3791
3792
3793 /***
3794 * Get the escaped URI string.
3795 * <p>
3796 * On the document, the URI-reference form is only used without the userinfo
3797 * component like http://jakarta.apache.org/ by the security reason.
3798 * But the URI-reference form with the userinfo component could be parsed.
3799 * <p>
3800 * In other words, this URI and any its subclasses must not expose the
3801 * URI-reference expression with the userinfo component like
3802 * http://user:password@hostport/restricted_zone.<br>
3803 * It means that the API client programmer should extract each user and
3804 * password to access manually. Probably it will be supported in the each
3805 * subclass, however, not a whole URI-reference expression.
3806 *
3807 * @return the escaped URI string
3808 * @see #clone()
3809 */
3810 public String toString() {
3811 return getEscapedURI();
3812 }
3813
3814
3815
3816
3817 /***
3818 * The charset-changed normal operation to represent to be required to
3819 * alert to user the fact the default charset is changed.
3820 */
3821 public static class DefaultCharsetChanged extends RuntimeException {
3822
3823
3824
3825 /***
3826 * The constructor with a reason string and its code arguments.
3827 *
3828 * @param reasonCode the reason code
3829 * @param reason the reason
3830 */
3831 public DefaultCharsetChanged(int reasonCode, String reason) {
3832 super(reason);
3833 this.reason = reason;
3834 this.reasonCode = reasonCode;
3835 }
3836
3837
3838
3839 /*** No specified reason code. */
3840 public static final int UNKNOWN = 0;
3841
3842 /*** Protocol charset changed. */
3843 public static final int PROTOCOL_CHARSET = 1;
3844
3845 /*** Document charset changed. */
3846 public static final int DOCUMENT_CHARSET = 2;
3847
3848
3849
3850 /*** The reason code. */
3851 private int reasonCode;
3852
3853 /*** The reason message. */
3854 private String reason;
3855
3856
3857
3858 /***
3859 * Get the reason code.
3860 *
3861 * @return the reason code
3862 */
3863 public int getReasonCode() {
3864 return reasonCode;
3865 }
3866
3867 /***
3868 * Get the reason message.
3869 *
3870 * @return the reason message
3871 */
3872 public String getReason() {
3873 return reason;
3874 }
3875
3876 }
3877
3878
3879 /***
3880 * A mapping to determine the (somewhat arbitrarily) preferred charset for a
3881 * given locale. Supports all locales recognized in JDK 1.1.
3882 * <p>
3883 * The distribution of this class is Servlets.com. It was originally
3884 * written by Jason Hunter [jhunter at acm.org] and used by with permission.
3885 */
3886 public static class LocaleToCharsetMap {
3887
3888 /*** A mapping of language code to charset */
3889 private static final Hashtable LOCALE_TO_CHARSET_MAP;
3890 static {
3891 LOCALE_TO_CHARSET_MAP = new Hashtable();
3892 LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
3893 LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
3894 LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
3895 LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
3896 LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
3897 LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
3898 LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
3899 LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
3900 LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
3901 LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
3902 LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
3903 LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
3904 LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
3905 LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
3906 LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
3907 LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
3908 LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
3909 LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
3910 LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
3911 LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
3912 LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
3913 LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
3914 LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
3915 LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
3916 LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
3917 LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
3918 LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
3919 LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
3920 LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
3921 LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
3922 LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
3923 LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
3924 LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
3925 LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
3926 LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
3927 LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
3928 LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
3929 LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
3930 LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
3931 }
3932
3933 /***
3934 * Get the preferred charset for the given locale.
3935 *
3936 * @param locale the locale
3937 * @return the preferred charset or null if the locale is not
3938 * recognized.
3939 */
3940 public static String getCharset(Locale locale) {
3941
3942 String charset =
3943 (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
3944 if (charset != null) {
3945 return charset;
3946 }
3947
3948
3949 charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage());
3950 return charset;
3951 }
3952
3953 }
3954
3955 }
3956