001/* URI.java -- An URI class
002   Copyright (C) 2002, 2004, 2005, 2006, 2008  Free Software Foundation, Inc.
003
004This file is part of GNU Classpath.
005
006GNU Classpath is free software; you can redistribute it and/or modify
007it under the terms of the GNU General Public License as published by
008the Free Software Foundation; either version 2, or (at your option)
009any later version.
010
011GNU Classpath is distributed in the hope that it will be useful, but
012WITHOUT ANY WARRANTY; without even the implied warranty of
013MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014General Public License for more details.
015
016You should have received a copy of the GNU General Public License
017along with GNU Classpath; see the file COPYING.  If not, write to the
018Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
01902110-1301 USA.
020
021Linking this library statically or dynamically with other modules is
022making a combined work based on this library.  Thus, the terms and
023conditions of the GNU General Public License cover the whole
024combination.
025
026As a special exception, the copyright holders of this library give you
027permission to link this library with independent modules to produce an
028executable, regardless of the license terms of these independent
029modules, and to copy and distribute the resulting executable under
030terms of your choice, provided that you also meet, for each linked
031independent module, the terms and conditions of the license of that
032module.  An independent module is a module which is not derived from
033or based on this library.  If you modify this library, you may extend
034this exception to your version of the library, but you are not
035obligated to do so.  If you do not wish to do so, delete this
036exception statement from your version. */
037
038
039package java.net;
040
041import gnu.java.lang.CPStringBuilder;
042
043import java.io.IOException;
044import java.io.ObjectInputStream;
045import java.io.ObjectOutputStream;
046import java.io.Serializable;
047import java.util.regex.Matcher;
048import java.util.regex.Pattern;
049
050/**
051 * <p>
052 * A URI instance represents that defined by 
053 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>,
054 * with some deviations.
055 * </p>
056 * <p>
057 * At its highest level, a URI consists of:
058 * </p>
 * <p>
 * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em>
 * [<strong>#</strong><em>fragment</em>]</code>
061 * </p>
062 * <p>
063 * where <strong>#</strong> and <strong>:</strong> are literal characters,
064 * and those parts enclosed in square brackets are optional.
065 * </p>
066 * <p>
 * There are two main types of URI.  An <em>opaque</em> URI is one
 * which just consists of the above three parts, and is not further
 * defined.  An example of such a URI would be a <em>mailto:</em> URI.
 * In contrast, <em>hierarchical</em> URIs give further definition
 * to the scheme-specific part, so as to represent some part of a
 * hierarchical structure.
073 * </p>
074 * <p>
075 * <code>[<strong>//</strong><em>authority</em>][<em>path</em>]
076 * [<strong>?</strong><em>query</em>]</code>
077 * </p>
078 * <p>
079 * with <strong>/</strong> and <strong>?</strong> being literal characters.
080 * When server-based, the authority section is further subdivided into:
081 * </p>
082 * <p>
083 * <code>[<em>user-info</em><strong>@</strong>]<em>host</em>
084 * [<strong>:</strong><em>port</em>]</code>
085 * </p>
086 * <p>
087 * with <strong>@</strong> and <strong>:</strong> as literal characters.
088 * Authority sections that are not server-based are said to be registry-based.
089 * </p>
090 * <p>
 * Hierarchical URIs can be either relative or absolute.  Absolute URIs
 * always specify a scheme, while relative URIs don't.  Opaque URIs are
 * always absolute.
094 * </p>
095 * <p>
096 * Each part of the URI may have one of three states: undefined, empty
097 * or containing some content.  The former two of these are represented
098 * by <code>null</code> and the empty string in Java, respectively.
099 * The scheme-specific part may never be undefined.  It also follows from
100 * this that the path sub-part may also not be undefined, so as to ensure
101 * the former.
102 * </p>
103 * <h2>Character Escaping and Quoting</h2>
104 * <p>
105 * The characters that can be used within a valid URI are restricted.
106 * There are two main classes of characters which can't be used as is
107 * within the URI:
108 * </p>
109 * <ol>
110 * <li><strong>Characters outside the US-ASCII character set</strong>.
111 * These have to be <strong>escaped</strong> in order to create
112 * an RFC-compliant URI; this means replacing the character with the
113 * appropriate hexadecimal value, preceded by a `%'.</li>
114 * <li><strong>Illegal characters</strong> (e.g. space characters,
115 * control characters) are quoted, which results in them being encoded
116 * in the same way as non-US-ASCII characters.</li>
117 * </ol>
118 * <p>
119 * The set of valid characters differs depending on the section of the URI:
120 * </p>
121 * <ul>
122 * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' or '+'.</li>
123 * <li><strong>Authority</strong>:Composed of the username, host, port, `@'
124 * and `:'.</li>
125 * <li><strong>Username</strong>: Allows unreserved or percent-encoded
126 * characters, sub-delimiters and `:'.</li>
127 * <li><strong>Host</strong>: Allows unreserved or percent-encoded
128 * characters, sub-delimiters and square brackets (`[' and `]') for IPv6
129 * addresses.</li>
130 * <li><strong>Port</strong>: Digits only.</li>
 * <li><strong>Path</strong>: Allows the path characters and `/'.</li>
 * <li><strong>Query</strong>: Allows the path characters, `?' and '/'.</li>
 * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'.</li>
134 * </ul>
135 * <p>
136 * These definitions reference the following sets of characters:
137 * </p>
138 * <ul>
139 * <li><strong>Unreserved characters</strong>: The alphanumerics plus
140 * `-', `.', `_', and `~'.</li>
141 * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*',
142 * `+', `,', `;', `=' and the single-quote itself.</li>
143 * <li><strong>Path characters</strong>: Unreserved and percent-encoded
144 * characters and the sub-delimiters along with `@' and `:'.</li>
145 * </ul>
146 * <p>
147 * The constructors and accessor methods allow the use and retrieval of
148 * URI components which contain non-US-ASCII characters directly.
149 * They are only escaped when the <code>toASCIIString()</code> method
150 * is used.  In contrast, illegal characters are always quoted, with the
151 * exception of the return values of the non-raw accessors.
152 * </p>
153 *
154 * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp)
155 * @author Dalibor Topic (robilad@kaffe.org)
156 * @author Michael Koch (konqueror@gmx.de)
157 * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
158 * @since 1.4
159 */
160public final class URI 
161  implements Comparable<URI>, Serializable
162{
163  /**
   * For serialization compatibility.
165   */
166  static final long serialVersionUID = -6052424284110960213L;
167
168  /**
169   * Regular expression for parsing URIs.
170   *
171   * Taken from RFC 2396, Appendix B.
172   * This expression doesn't parse IPv6 addresses.
173   */
174  private static final String URI_REGEXP =
175    "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?";
176
177  /**
178   * Regular expression for parsing the authority segment.
179   */
180  private static final String AUTHORITY_REGEXP =
181    "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?";
182
183  /**
184   * Valid characters (taken from rfc2396/3986)
185   */
186  private static final String RFC2396_DIGIT = "0123456789";
187  private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz";
188  private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
189  private static final String RFC2396_ALPHA =
190    RFC2396_LOWALPHA + RFC2396_UPALPHA;
191  private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA;
192  private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~";
193  private static final String RFC3986_SUBDELIMS = "!$&'()*+,;=";
194  private static final String RFC3986_REG_NAME =
195    RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%";
196  private static final String RFC3986_PCHAR = RFC3986_UNRESERVED + 
197    RFC3986_SUBDELIMS + ":@%";
198  private static final String RFC3986_SEGMENT = RFC3986_PCHAR;
199  private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/";
200  private static final String RFC3986_SSP = RFC3986_PCHAR + "?/";
201  private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]";
202  private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":";
203
204  /**
205   * Index of scheme component in parsed URI.
206   */
207  private static final int SCHEME_GROUP = 2;
208
209  /**
210   * Index of scheme-specific-part in parsed URI.
211   */
212  private static final int SCHEME_SPEC_PART_GROUP = 3;
213
214  /**
215   * Index of authority component in parsed URI.
216   */
217  private static final int AUTHORITY_GROUP = 5;
218
219  /**
220   * Index of path component in parsed URI.
221   */
222  private static final int PATH_GROUP = 6;
223
224  /**
225   * Index of query component in parsed URI.
226   */
227  private static final int QUERY_GROUP = 8;
228
229  /**
230   * Index of fragment component in parsed URI.
231   */
232  private static final int FRAGMENT_GROUP = 10;
233  
234  /**
235   * Index of userinfo component in parsed authority section.
236   */
237  private static final int AUTHORITY_USERINFO_GROUP = 2;
238
239  /**
240   * Index of host component in parsed authority section.
241   */
242  private static final int AUTHORITY_HOST_GROUP = 3;
243
244  /**
245   * Index of port component in parsed authority section.
246   */
247  private static final int AUTHORITY_PORT_GROUP = 5;
248
249  /**
250   * The compiled version of the URI regular expression.
251   */
252  private static final Pattern URI_PATTERN;
253
254  /**
255   * The compiled version of the authority regular expression.
256   */
257  private static final Pattern AUTHORITY_PATTERN;
258
259  /**
260   * The set of valid hexadecimal characters.
261   */
262  private static final String HEX = "0123456789ABCDEF";
263
264  private transient String scheme;
265  private transient String rawSchemeSpecificPart;
266  private transient String schemeSpecificPart;
267  private transient String rawAuthority;
268  private transient String authority;
269  private transient String rawUserInfo;
270  private transient String userInfo;
271  private transient String rawHost;
272  private transient String host;
273  private transient int port = -1;
274  private transient String rawPath;
275  private transient String path;
276  private transient String rawQuery;
277  private transient String query;
278  private transient String rawFragment;
279  private transient String fragment;
280  private String string;
281
282  /**
283   * Static initializer to pre-compile the regular expressions.
284   */
285  static
286  {
287    URI_PATTERN = Pattern.compile(URI_REGEXP);
288    AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP);
289  }
290
291  private void readObject(ObjectInputStream is)
292    throws ClassNotFoundException, IOException
293  {
294    this.string = (String) is.readObject();
295    try
296      {
297        parseURI(this.string);
298      }
299    catch (URISyntaxException x)
300      {
301        // Should not happen.
302        throw new RuntimeException(x);
303      }
304  }
305
306  private void writeObject(ObjectOutputStream os) throws IOException
307  {
308    if (string == null)
309      string = toString(); 
310    os.writeObject(string);
311  }
312
313  /**
314   * <p>
315   * Returns the string content of the specified group of the supplied
316   * matcher.  The returned value is modified according to the following:
317   * </p>
318   * <ul>
319   * <li>If the resulting string has a length greater than 0, then
320   * that string is returned.</li>
321   * <li>If a string of zero length, is matched, then the content
322   * of the preceding group is considered.  If this is also an empty
323   * string, then <code>null</code> is returned to indicate an undefined
324   * value.  Otherwise, the value is truly the empty string and this is
325   * the returned value.</li>
326   * </ul>
327   * <p>
328   * This method is used for matching against all parts of the URI
329   * that may be either undefined or empty (i.e. all those but the
330   * scheme-specific part and the path).  In each case, the preceding
331   * group is the content of the original group, along with some
332   * additional distinguishing feature.  For example, the preceding
333   * group for the query includes the preceding question mark,
334   * while that of the fragment includes the hash symbol.  The presence
335   * of these features enables disambiguation between the two cases
336   * of a completely unspecified value and a simple non-existant value.
337   * The scheme differs in that it will never return an empty string;
338   * the delimiter follows the scheme rather than preceding it, so
339   * it becomes part of the following section.  The same is true
340   * of the user information.
341   * </p>
342   *
343   * @param match the matcher, which contains the results of the URI
344   *              matched against the URI regular expression.
345   * @return either the matched content, <code>null</code> for undefined
346   *         values, or an empty string for a URI part with empty content.
347   */
348  private static String getURIGroup(Matcher match, int group)
349  {
350    String matched = match.group(group);
351    if (matched == null || matched.length() == 0)
352      {
353        String prevMatched = match.group(group -1);
354        if (prevMatched == null || prevMatched.length() == 0)
355          return null;
356        else
357          return "";
358      }
359    return matched;
360  }
361
362  /**
363   * Sets fields of this URI by parsing the given string.
364   *
365   * @param str The string to parse
366   *
367   * @exception URISyntaxException If the given string violates RFC 2396
368   */
369  private void parseURI(String str) throws URISyntaxException
370  {
371    Matcher matcher = URI_PATTERN.matcher(str);
372    
373    if (matcher.matches())
374      {
375        scheme = getURIGroup(matcher, SCHEME_GROUP);
376        rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP);
377        schemeSpecificPart = unquote(rawSchemeSpecificPart);
378        if (!isOpaque())
379          {
380            rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP);
381            rawPath = matcher.group(PATH_GROUP);
382            rawQuery = getURIGroup(matcher, QUERY_GROUP);
383          }
384        rawFragment = getURIGroup(matcher, FRAGMENT_GROUP);
385      }
386    else
387      throw new URISyntaxException(str,
388                                   "doesn't match URI regular expression");
389    parseServerAuthority();
390
391    // We must eagerly unquote the parts, because this is the only time
392    // we may throw an exception.
393    authority = unquote(rawAuthority);
394    userInfo = unquote(rawUserInfo);
395    host = unquote(rawHost);
396    path = unquote(rawPath);
397    query = unquote(rawQuery);
398    fragment = unquote(rawFragment);
399  }
400
401  /**
402   * Unquote "%" + hex quotes characters
403   *
404   * @param str The string to unquote or null.
405   *
406   * @return The unquoted string or null if str was null.
407   *
408   * @exception URISyntaxException If the given string contains invalid
409   * escape sequences.
410   */
411  private static String unquote(String str) throws URISyntaxException
412  {
413    if (str == null)
414      return null;
415    byte[] buf = new byte[str.length()];
416    int pos = 0;
417    for (int i = 0; i < str.length(); i++)
418      {
419        char c = str.charAt(i);
420        if (c == '%')
421          {
422            if (i + 2 >= str.length())
423              throw new URISyntaxException(str, "Invalid quoted character");
424            int hi = Character.digit(str.charAt(++i), 16);
425            int lo = Character.digit(str.charAt(++i), 16);
426            if (lo < 0 || hi < 0)
427              throw new URISyntaxException(str, "Invalid quoted character");
428            buf[pos++] = (byte) (hi * 16 + lo);
429          }
430        else
431          buf[pos++] = (byte) c;
432      }
433    try
434      {
435        return new String(buf, 0, pos, "utf-8");
436      }
437    catch (java.io.UnsupportedEncodingException x2)
438      {
439        throw (Error) new InternalError().initCause(x2);
440      }
441  }
442
443  /**
444   * Quote characters illegal in URIs in given string.
445   *
446   * Replace illegal characters by encoding their UTF-8
447   * representation as "%" + hex code for each resulting
448   * UTF-8 character.
449   *
450   * @param str The string to quote
451   *
452   * @return The quoted string.
453   */
454  private static String quote(String str)
455  {
456    return quote(str, RFC3986_SSP);
457  }
458
459  /**
460   * Quote characters illegal in URI authorities in given string.
461   *
462   * Replace illegal characters by encoding their UTF-8
463   * representation as "%" + hex code for each resulting
464   * UTF-8 character.
465   *
466   * @param str The string to quote
467   *
468   * @return The quoted string.
469   */
470  private static String quoteAuthority(String str)
471  {
472    // Technically, we should be using RFC2396_AUTHORITY, but
473    // it contains no additional characters.
474    return quote(str, RFC3986_REG_NAME);
475  }
476
477  /**
478   * Quotes the characters in the supplied string that are not part of
479   * the specified set of legal characters.
480   *
481   * @param str the string to quote
482   * @param legalCharacters the set of legal characters
483   *
484   * @return the quoted string.
485   */
486  private static String quote(String str, String legalCharacters)
487  {
488    CPStringBuilder sb = new CPStringBuilder(str.length());
489    for (int i = 0; i < str.length(); i++)
490      {
491        char c = str.charAt(i);
492        if ((legalCharacters.indexOf(c) == -1)
493            && (c <= 127))
494          {
495            sb.append('%');
496            sb.append(HEX.charAt(c / 16));
497            sb.append(HEX.charAt(c % 16));
498          }
499        else
500          sb.append(c);
501      }
502    return sb.toString();
503  }
504
505  /**
506   * Quote characters illegal in URI hosts in given string.
507   *
508   * Replace illegal characters by encoding their UTF-8
509   * representation as "%" + hex code for each resulting
510   * UTF-8 character.
511   *
512   * @param str The string to quote
513   *
514   * @return The quoted string.
515   */
516  private static String quoteHost(String str)
517  {
518    return quote(str, RFC3986_HOST);
519  }
520
521  /**
522   * Quote characters illegal in URI paths in given string.
523   *
524   * Replace illegal characters by encoding their UTF-8
525   * representation as "%" + hex code for each resulting
526   * UTF-8 character.
527   *
528   * @param str The string to quote
529   *
530   * @return The quoted string.
531   */
532  private static String quotePath(String str)
533  {
534    // Technically, we should be using RFC2396_PATH, but
535    // it contains no additional characters.
536    return quote(str, RFC3986_PATH_SEGMENTS);
537  }
538
539  /**
540   * Quote characters illegal in URI user infos in given string.
541   *
542   * Replace illegal characters by encoding their UTF-8
543   * representation as "%" + hex code for each resulting
544   * UTF-8 character.
545   *
546   * @param str The string to quote
547   *
548   * @return The quoted string.
549   */
550  private static String quoteUserInfo(String str)
551  {
552    return quote(str, RFC3986_USERINFO);
553  }
554
555  /**
556   * Creates an URI from the given string
557   *
558   * @param str The string to create the URI from
559   *
560   * @exception URISyntaxException If the given string violates RFC 2396
561   * @exception NullPointerException If str is null
562   */
563  public URI(String str) throws URISyntaxException
564  {
565    this.string = str;
566    parseURI(str);
567  }
568
569  /**
570   * Create an URI from the given components
571   *
572   * @param scheme The scheme name
573   * @param userInfo The username and authorization info
574   * @param host The hostname
575   * @param port The port number
576   * @param path The path
577   * @param query The query
578   * @param fragment The fragment
579   *
580   * @exception URISyntaxException If the given string violates RFC 2396
581   */
582  public URI(String scheme, String userInfo, String host, int port,
583             String path, String query, String fragment)
584    throws URISyntaxException
585  {
586    this((scheme == null ? "" : scheme + ":")
587         + (userInfo == null && host == null && port == -1 ? "" : "//")
588         + (userInfo == null ? "" : quoteUserInfo(userInfo) + "@")
589         + (host == null ? "" : quoteHost(host))
590         + (port == -1 ? "" : ":" + String.valueOf(port))
591         + (path == null ? "" : quotePath(path))
592         + (query == null ? "" : "?" + quote(query))
593         + (fragment == null ? "" : "#" + quote(fragment)));
594  }
595
596  /**
597   * Create an URI from the given components
598   *
599   * @param scheme The scheme name
600   * @param authority The authority
601   * @param path The apth
602   * @param query The query
603   * @param fragment The fragment
604   *
605   * @exception URISyntaxException If the given string violates RFC 2396
606   */
607  public URI(String scheme, String authority, String path, String query,
608             String fragment) throws URISyntaxException
609  {
610    this((scheme == null ? "" : scheme + ":")
611         + (authority == null ? "" : "//" + quoteAuthority(authority))
612         + (path == null ? "" : quotePath(path))
613         + (query == null ? "" : "?" + quote(query))
614         + (fragment == null ? "" : "#" + quote(fragment)));
615  }
616
617  /**
618   * Create an URI from the given components
619   *
620   * @param scheme The scheme name
621   * @param host The hostname
622   * @param path The path
623   * @param fragment The fragment
624   *
625   * @exception URISyntaxException If the given string violates RFC 2396
626   */
627  public URI(String scheme, String host, String path, String fragment)
628    throws URISyntaxException
629  {
630    this(scheme, null, host, -1, path, null, fragment);
631  }
632
633  /**
634   * Create an URI from the given components
635   *
636   * @param scheme The scheme name
637   * @param ssp The scheme specific part
638   * @param fragment The fragment
639   *
640   * @exception URISyntaxException If the given string violates RFC 2396
641   */
642  public URI(String scheme, String ssp, String fragment)
643    throws URISyntaxException
644  {
645    this((scheme == null ? "" : scheme + ":")
646         + (ssp == null ? "" : quote(ssp))
647         + (fragment == null ? "" : "#" + quote(fragment)));
648  }
649
650  /**
651   * Create an URI from the given string
652   *
653   * @param str The string to create the URI from
654   *
655   * @exception IllegalArgumentException If the given string violates RFC 2396
656   * @exception NullPointerException If str is null
657   */
658  public static URI create(String str)
659  {
660    try
661      {
662        return new URI(str);
663      }
664    catch (URISyntaxException e)
665      {
666        throw (IllegalArgumentException) new IllegalArgumentException()
667              .initCause(e);
668      }
669  }
670
671  /**
672   * Attempts to parse this URI's authority component, if defined,
673   * into user-information, host, and port components.  The purpose
674   * of this method was to disambiguate between some authority sections,
675   * which form invalid server-based authories, but valid registry
676   * based authorities.  In the updated RFC 3986, the authority section
677   * is defined differently, with registry-based authorities part of
678   * the host section.  Thus, this method is now simply an explicit
679   * way of parsing any authority section.
680   *
681   * @return the URI, with the authority section parsed into user
682   *         information, host and port components.
683   * @throws URISyntaxException if the given string violates RFC 2396
684   */
685  public URI parseServerAuthority() throws URISyntaxException
686  {
687    if (rawAuthority != null)
688      {
689        Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority);
690
691        if (matcher.matches())
692          {
693            rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP);
694            rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP);
695            
696            String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP);
697            
698            if (portStr != null && ! portStr.isEmpty())
699              try
700                {
701                  port = Integer.parseInt(portStr);
702                }
703              catch (NumberFormatException e)
704                {
705                  URISyntaxException use =
706                    new URISyntaxException
707                      (string, "doesn't match URI regular expression");
708                  use.initCause(e);
709                  throw use;
710                }
711          }
712        else
713          throw new URISyntaxException(string,
714                                       "doesn't match URI regular expression");
715      }
716    return this;
717  }
718
719  /**
720   * <p>
721   * Returns a normalized version of the URI.  If the URI is opaque,
722   * or its path is already in normal form, then this URI is simply
723   * returned.  Otherwise, the following transformation of the path
724   * element takes place:
725   * </p>
726   * <ol>
727   * <li>All `.' segments are removed.</li>
728   * <li>Each `..' segment which can be paired with a prior non-`..' segment
729   * is removed along with the preceding segment.</li>
730   * <li>A `.' segment is added to the front if the first segment contains
731   * a colon (`:').  This is a deviation from the RFC, which prevents
732   * confusion between the path and the scheme.</li>
733   * </ol>
734   * <p>
735   * The resulting URI will be free of `.' and `..' segments, barring those
736   * that were prepended or which couldn't be paired, respectively.
737   * </p>
738   *
739   * @return the normalized URI.
740   */
741  public URI normalize()
742  {
743    if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
744      return this;
745    try
746      {
747        return new URI(scheme, authority, normalizePath(path), query,
748                       fragment);
749      }
750    catch (URISyntaxException e)
751      {
752        throw (Error) new InternalError("Normalized URI variant could not "+
753                                        "be constructed").initCause(e);
754      }
755  }
756
757  /**
758   * <p>
759   * Normalize the given path.  The following transformation takes place:
760   * </p>
761   * <ol>
762   * <li>All `.' segments are removed.</li>
763   * <li>Each `..' segment which can be paired with a prior non-`..' segment
764   * is removed along with the preceding segment.</li>
765   * <li>A `.' segment is added to the front if the first segment contains
766   * a colon (`:').  This is a deviation from the RFC, which prevents
767   * confusion between the path and the scheme.</li>
768   * </ol>
769   * <p>
770   * The resulting URI will be free of `.' and `..' segments, barring those
771   * that were prepended or which couldn't be paired, respectively.
772   * </p>
773   * 
774   * @param relativePath the relative path to be normalized.
775   * @return the normalized path.
776   */
777  private String normalizePath(String relativePath)
778  {
779    /* 
780       This follows the algorithm in section 5.2.4. of RFC3986,
781       but doesn't modify the input buffer.
782    */
783    CPStringBuilder input = new CPStringBuilder(relativePath);
784    CPStringBuilder output = new CPStringBuilder();
785    int start = 0;
786    while (start < input.length())
787      {
788        /* A */
789        if (input.indexOf("../",start) == start)
790          {
791            start += 3;
792            continue;
793          }
794        if (input.indexOf("./",start) == start)
795          {
796            start += 2;
797            continue;
798          }
799        /* B */
800        if (input.indexOf("/./",start) == start)
801          {
802            start += 2;
803            continue;
804          }
805        if (input.indexOf("/.",start) == start
806            && input.charAt(start + 2) != '.')
807          {
808            start += 1;
809            input.setCharAt(start,'/');
810            continue;
811          }
812        /* C */
813        if (input.indexOf("/../",start) == start)
814          {
815            start += 3;
816            removeLastSegment(output);
817            continue;
818          }
819        if (input.indexOf("/..",start) == start)
820          {
821            start += 2;
822            input.setCharAt(start,'/');
823            removeLastSegment(output);
824            continue;
825          }
826        /* D */
827        if (start == input.length() - 1 && input.indexOf(".",start) == start)
828          {
829            input.delete(0,1);
830            continue;
831          }
832        if (start == input.length() - 2 && input.indexOf("..",start) == start)
833          {
834            input.delete(0,2);
835            continue;
836          }
837        /* E */
838        int indexOfSlash = input.indexOf("/",start);
839        while (indexOfSlash == start)
840          {
841            output.append("/");
842            ++start;
843            indexOfSlash = input.indexOf("/",start);
844          }
845        if (indexOfSlash == -1)
846          indexOfSlash = input.length();
847        output.append(input.substring(start, indexOfSlash));
848        start = indexOfSlash;
849      }
850    return output.toString();
851  }
852
853  /**
854   * Removes the last segment of the path from the specified buffer.
855   *
856   * @param buffer the buffer containing the path.
857   */
858  private void removeLastSegment(CPStringBuilder buffer)
859  {
860    int lastSlash = buffer.lastIndexOf("/");
861    if (lastSlash == -1)
862      buffer.setLength(0);
863    else
864      buffer.setLength(lastSlash);
865  }
866
867  /**
868   * Resolves the given URI against this URI
869   *
870   * @param uri The URI to resolve against this URI
871   *
872   * @return The resulting URI, or null when it couldn't be resolved
873   * for some reason.
874   *
875   * @throws NullPointerException if uri is null
876   */
877  public URI resolve(URI uri)
878  {
879    if (uri.isAbsolute())
880      return uri;
881    if (uri.isOpaque())
882      return uri;
883
884    String scheme = uri.getScheme();
885    String schemeSpecificPart = uri.getSchemeSpecificPart();
886    String authority = uri.getAuthority();
887    String path = uri.getPath();
888    String query = uri.getQuery();
889    String fragment = uri.getFragment();
890
891    try
892      {
893        if (fragment != null && path != null && path.equals("")
894            && scheme == null && authority == null && query == null)
895          return new URI(this.scheme, this.schemeSpecificPart, fragment);
896
897        if (authority == null)
898          {
899            authority = this.authority;
900            if (path == null)
901              path = "";
902            if (! (path.startsWith("/")))
903              {
904                CPStringBuilder basepath = new CPStringBuilder(this.path);
905                int i = this.path.lastIndexOf('/');
906
907                if (i >= 0)
908                  basepath.delete(i + 1, basepath.length());
909
910                basepath.append(path);
911                path = normalizePath(basepath.toString());
912              }
913          }
914        return new URI(this.scheme, authority, path, query, fragment);
915      }
916    catch (URISyntaxException e)
917      {
918        throw (Error) new InternalError("Resolved URI variant could not "+
919                                        "be constructed").initCause(e);
920      }
921  }
922
923  /**
924   * Resolves the given URI string against this URI
925   *
926   * @param str The URI as string to resolve against this URI
927   *
928   * @return The resulting URI
929   *
930   * @throws IllegalArgumentException If the given URI string
931   * violates RFC 2396
932   * @throws NullPointerException If uri is null
933   */
934  public URI resolve(String str) throws IllegalArgumentException
935  {
936    return resolve(create(str));
937  }
938
939  /**
940   * <p>
941   * Relativizes the given URI against this URI.  The following
942   * algorithm is used:
943   * </p>
944   * <ul>
945   * <li>If either URI is opaque, the given URI is returned.</li>
946   * <li>If the schemes of the URIs differ, the given URI is returned.</li>
947   * <li>If the authority components of the URIs differ, then the given
948   * URI is returned.</li>
949   * <li>If the path of this URI is not a prefix of the supplied URI,
950   * then the given URI is returned.</li>
951   * <li>If all the above conditions hold, a new URI is created using the
952   * query and fragment components of the given URI, along with a path
953   * computed by removing the path of this URI from the start of the path
954   * of the supplied URI.</li>
955   * </ul>
956   *
957   * @param uri the URI to relativize agsint this URI
958   * @return the resulting URI
959   * @throws NullPointerException if the uri is null
960   */
961  public URI relativize(URI uri)
962  {
963    if (isOpaque() || uri.isOpaque())
964      return uri;
965    if (scheme == null && uri.getScheme() != null)
966      return uri;
967    if (scheme != null && !(scheme.equals(uri.getScheme())))
968      return uri;
969    if (rawAuthority == null && uri.getRawAuthority() != null)
970      return uri;
971    if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority())))
972      return uri;
973    String basePath = rawPath;
974    if (!(uri.getRawPath().equals(rawPath)))
975      {
976        if (!(basePath.endsWith("/")))
977          basePath = basePath.concat("/");
978        if (!(uri.getRawPath().startsWith(basePath)))
979          return uri;
980      }
981    try
982      {
983        return new URI(null, null, 
984                       uri.getRawPath().substring(basePath.length()),
985                       uri.getRawQuery(), uri.getRawFragment());
986      }
987    catch (URISyntaxException e)
988      {
989        throw (Error) new InternalError("Relativized URI variant could not "+
990                                        "be constructed").initCause(e);       
991      }
992  }
993
994  /**
995   * Creates an URL from an URI
996   *
997   * @throws MalformedURLException If a protocol handler for the URL could
998   * not be found, or if some other error occurred while constructing the URL
999   * @throws IllegalArgumentException If the URI is not absolute
1000   */
1001  public URL toURL() throws IllegalArgumentException, MalformedURLException
1002  {
1003    if (isAbsolute())
1004      return new URL(this.toString());
1005
1006    throw new IllegalArgumentException("not absolute");
1007  }
1008
1009  /**
1010   * Returns the scheme of the URI
1011   */
1012  public String getScheme()
1013  {
1014    return scheme;
1015  }
1016
1017  /**
1018   * Tells whether this URI is absolute or not
1019   */
1020  public boolean isAbsolute()
1021  {
1022    return scheme != null;
1023  }
1024
1025  /**
1026   * Tell whether this URI is opaque or not
1027   */
1028  public boolean isOpaque()
1029  {
1030    return ((scheme != null) && ! (schemeSpecificPart.startsWith("/")));
1031  }
1032
1033  /**
1034   * Returns the raw scheme specific part of this URI.
1035   * The scheme-specific part is never undefined, though it may be empty
1036   */
1037  public String getRawSchemeSpecificPart()
1038  {
1039    return rawSchemeSpecificPart;
1040  }
1041
1042  /**
1043   * Returns the decoded scheme specific part of this URI.
1044   */
1045  public String getSchemeSpecificPart()
1046  {
1047    return schemeSpecificPart;
1048  }
1049
1050  /**
1051   * Returns the raw authority part of this URI
1052   */
1053  public String getRawAuthority()
1054  {
1055    return rawAuthority;
1056  }
1057
1058  /**
1059   * Returns the decoded authority part of this URI
1060   */
1061  public String getAuthority()
1062  {
1063    return authority;
1064  }
1065
1066  /**
1067   * Returns the raw user info part of this URI
1068   */
1069  public String getRawUserInfo()
1070  {
1071    return rawUserInfo;
1072  }
1073
1074  /**
1075   * Returns the decoded user info part of this URI
1076   */
1077  public String getUserInfo()
1078  {
1079    return userInfo;
1080  }
1081
1082  /**
1083   * Returns the hostname of the URI
1084   */
1085  public String getHost()
1086  {
1087    return host;
1088  }
1089
1090  /**
1091   * Returns the port number of the URI
1092   */
1093  public int getPort()
1094  {
1095    return port;
1096  }
1097
1098  /**
1099   * Returns the raw path part of this URI
1100   */
1101  public String getRawPath()
1102  {
1103    return rawPath;
1104  }
1105
1106  /**
1107   * Returns the path of the URI
1108   */
1109  public String getPath()
1110  {
1111    return path;
1112  }
1113
1114  /**
1115   * Returns the raw query part of this URI
1116   */
1117  public String getRawQuery()
1118  {
1119    return rawQuery;
1120  }
1121
1122  /**
1123   * Returns the query of the URI
1124   */
1125  public String getQuery()
1126  {
1127    return query;
1128  }
1129
1130  /**
1131   * Return the raw fragment part of this URI
1132   */
1133  public String getRawFragment()
1134  {
1135    return rawFragment;
1136  }
1137
1138  /**
1139   * Returns the fragment of the URI
1140   */
1141  public String getFragment()
1142  {
1143    return fragment;
1144  }
1145
1146  /**
1147   * <p> 
1148   * Compares the URI with the given object for equality.  If the
1149   * object is not a <code>URI</code>, then the method returns false.
1150   * Otherwise, the following criteria are observed:
1151   * </p>
1152   * <ul>
1153   * <li>The scheme of the URIs must either be null (undefined) in both cases,
1154   * or equal, ignorant of case.</li>
1155   * <li>The raw fragment of the URIs must either be null (undefined) in both
1156   * cases, or equal, ignorant of case.</li>
1157   * <li>Both URIs must be of the same type (opaque or hierarchial)</li>
1158   * <li><strong>For opaque URIs:</strong></li>
1159   * <ul>
1160   * <li>The raw scheme-specific parts must be equal.</li>
1161   * </ul>
1162   * <li>For hierarchical URIs:</li>
1163   * <ul>
1164   * <li>The raw paths must be equal, ignorant of case.</li>
1165   * <li>The raw queries are either both undefined or both equal, ignorant
1166   * of case.</li>
1167   * <li>The raw authority sections are either both undefined or:</li>
1168   * <li><strong>For registry-based authorities:</strong></li>
1169   * <ul><li>they are equal.</li></ul>
1170   * <li><strong>For server-based authorities:</strong></li>
1171   * <ul>
1172   * <li>the hosts are equal, ignoring case</li>
1173   * <li>the ports are equal</li>
1174   * <li>the user information components are equal</li>
1175   * </ul>
1176   * </ul>
1177   * </ul>
1178   *
1179   * @param obj the obj to compare the URI with.
1180   * @return <code>true</code> if the objects are equal, according to
1181   *         the specification above.
1182   */
1183  public boolean equals(Object obj)
1184  {
1185    if (!(obj instanceof URI))
1186      return false;
1187    URI uriObj = (URI) obj;
1188    if (scheme == null)
1189      {
1190        if (uriObj.getScheme() != null)
1191          return false;
1192      }
1193    else
1194      if (!(scheme.equalsIgnoreCase(uriObj.getScheme())))
1195        return false;
1196    if (rawFragment == null)
1197      {
1198        if (uriObj.getRawFragment() != null)
1199          return false;
1200      }
1201    else
1202      if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment())))
1203        return false;
1204    boolean opaqueThis = isOpaque();
1205    boolean opaqueObj = uriObj.isOpaque();
1206    if (opaqueThis && opaqueObj)
1207      return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());
1208    else if (!opaqueThis && !opaqueObj)
1209      {
1210        boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath())
1211          && ((rawQuery == null && uriObj.getRawQuery() == null)
1212              || rawQuery.equalsIgnoreCase(uriObj.getRawQuery()));
1213        if (rawAuthority == null && uriObj.getRawAuthority() == null)
1214          return common;
1215        if (host == null)
1216          return common 
1217            && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority());
1218        return common 
1219          && host.equalsIgnoreCase(uriObj.getHost())
1220          && port == uriObj.getPort()
1221          && (rawUserInfo == null ?
1222              uriObj.getRawUserInfo() == null :
1223              rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo()));
1224      }
1225    else
1226      return false;
1227  }
1228
1229  /**
1230   * Computes the hashcode of the URI
1231   */
1232  public int hashCode()
1233  {
1234    return (getScheme() == null ? 0 : 13 * getScheme().hashCode())
1235      + 17 * getRawSchemeSpecificPart().hashCode()
1236      + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode());
1237  }
1238
1239  /**
1240   * Compare the URI with another URI.
1241   * Undefined components are taken to be less than any other component.
1242   * The following criteria are observed:
1243   * </p>
1244   * <ul>
1245   * <li>Two URIs with different schemes are compared according to their
1246   * scheme, regardless of case.</li>
1247   * <li>A hierarchical URI is less than an opaque URI with the same
1248   * scheme.</li>
1249   * <li><strong>For opaque URIs:</strong></li>
1250   * <ul>
1251   * <li>URIs with differing scheme-specific parts are ordered according
1252   * to the ordering of the scheme-specific part.</li>
1253   * <li>URIs with the same scheme-specific part are ordered by the
1254   * raw fragment.</li>
1255   * </ul>
1256   * <li>For hierarchical URIs:</li>
1257   * <ul>
1258   * <li>URIs are ordered according to their raw authority sections,
1259   * if they are unequal.</li>
1260   * <li><strong>For registry-based authorities:</strong></li>
1261   * <ul><li>they are ordered according to the ordering of the authority
1262   * component.</li></ul>
1263   * <li><strong>For server-based authorities:</strong></li>
1264   * <ul>
1265   * <li>URIs are ordered according to the raw user information.</li>
1266   * <li>URIs with the same user information are ordered by the host,
1267   * ignoring case.</li>
1268   * <lI>URIs with the same host are ordered by the port.</li>
1269   * </ul>
1270   * <li>URIs with the same authority section are ordered by the raw path.</li>
1271   * <li>URIs with the same path are ordered by their raw query.</li>
1272   * <li>URIs with the same query are ordered by their raw fragments.</li>
1273   * </ul>
1274   * </ul>
1275   *
1276   * @param uri The other URI to compare this URI with
1277   * @return a negative integer, zero or a positive integer depending
1278   *         on whether this URI is less than, equal to or greater
1279   *         than that supplied, respectively.
1280   */
1281  public int compareTo(URI uri) 
1282    throws ClassCastException
1283  {
1284    if (scheme == null && uri.getScheme() != null)
1285      return -1;
1286    if (scheme != null)
1287      {
1288        int sCompare = scheme.compareToIgnoreCase(uri.getScheme()); 
1289        if (sCompare != 0)
1290          return sCompare;
1291      }
1292    boolean opaqueThis = isOpaque();
1293    boolean opaqueObj = uri.isOpaque();
1294    if (opaqueThis && !opaqueObj)
1295      return 1;
1296    if (!opaqueThis && opaqueObj)
1297      return -1;
1298    if (opaqueThis)
1299      {
1300        int ssCompare = 
1301          rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart());
1302        if (ssCompare == 0)
1303          return compareFragments(uri);
1304        else
1305          return ssCompare;
1306      }
1307    if (rawAuthority == null && uri.getRawAuthority() != null)
1308      return -1;
1309    if (rawAuthority != null)
1310      {
1311        int aCompare = rawAuthority.compareTo(uri.getRawAuthority());
1312        if (aCompare != 0)
1313          {
1314            if (host == null)
1315              return aCompare;
1316            if (rawUserInfo == null && uri.getRawUserInfo() != null)
1317              return -1;
1318            int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo());
1319            if (uCompare != 0)
1320              return uCompare;
1321            if (host == null && uri.getHost() != null)
1322              return -1;
1323            int hCompare = host.compareTo(uri.getHost());
1324            if (hCompare != 0)
1325              return hCompare;
1326            int uriPort = uri.getPort();
1327            return (uriPort == port) ? 0 : (uriPort > port) ? -1 : 1;
1328          }
1329      }
1330    if (rawPath == null && uri.getRawPath() != null)
1331      return -1;
1332    if (rawPath != null)
1333      {
1334        int pCompare = rawPath.compareTo(uri.getRawPath()); 
1335        if (pCompare != 0)
1336          return pCompare;
1337      }
1338    if (rawQuery == null && uri.getRawQuery() != null)
1339      return -1;
1340    if (rawQuery != null)
1341      {
1342        int qCompare = rawQuery.compareTo(uri.getRawQuery());
1343        if (qCompare != 0)
1344          return qCompare;
1345      }
1346    return compareFragments(uri);
1347  }
1348
1349  /**
1350   * Compares the fragment of this URI with that of the supplied URI.
1351   *
1352   * @param uri the URI to compare with this one.
1353   * @return a negative integer, zero or a positive integer depending
1354   *         on whether this uri's fragment is less than, equal to
1355   *         or greater than the fragment of the uri supplied, respectively.
1356   */
1357  private int compareFragments(URI uri)
1358  {
1359    if (rawFragment == null && uri.getRawFragment() != null)
1360      return -1;
1361    else if (rawFragment == null)
1362      return 0;
1363    else
1364      return rawFragment.compareTo(uri.getRawFragment());
1365  }
1366
1367  /**
1368   * Returns the URI as a String.  If the URI was created using a constructor,
1369   * then this will be the same as the original input string.
1370   *
1371   * @return a string representation of the URI.
1372   */
1373  public String toString()
1374  {
1375    return (scheme == null ? "" : scheme + ":")
1376      + rawSchemeSpecificPart
1377      + (rawFragment == null ? "" : "#" + rawFragment);
1378  }
1379
1380  /**
1381   * Returns the URI as US-ASCII string.  This is the same as the result
1382   * from <code>toString()</code> for URIs that don't contain any non-US-ASCII
1383   * characters.  Otherwise, the non-US-ASCII characters are replaced
1384   * by their percent-encoded representations.
1385   *
1386   * @return a string representation of the URI, containing only US-ASCII
1387   *         characters.
1388   */
1389  public String toASCIIString()
1390  {
1391    String strRep = toString();
1392    boolean inNonAsciiBlock = false;
1393    CPStringBuilder buffer = new CPStringBuilder();
1394    CPStringBuilder encBuffer = null;
1395    for (int i = 0; i < strRep.length(); i++)
1396      {
1397        char c = strRep.charAt(i);
1398        if (c <= 127)
1399          {
1400            if (inNonAsciiBlock)
1401              {
1402                buffer.append(escapeCharacters(encBuffer.toString()));
1403                inNonAsciiBlock = false;
1404              }
1405            buffer.append(c);
1406          }
1407        else
1408          {
1409            if (!inNonAsciiBlock)
1410              {
1411                encBuffer = new CPStringBuilder();
1412                inNonAsciiBlock = true;
1413              }
1414            encBuffer.append(c);
1415          }
1416      }
1417    return buffer.toString();
1418  }
1419
1420  /**
1421   * Converts the non-ASCII characters in the supplied string
1422   * to their equivalent percent-encoded representations.
1423   * That is, they are replaced by "%" followed by their hexadecimal value.
1424   *
1425   * @param str a string including non-ASCII characters.
1426   * @return the string with the non-ASCII characters converted to their
1427   *         percent-encoded representations.
1428   */
1429  private static String escapeCharacters(String str)
1430  {
1431    try
1432      {
1433        CPStringBuilder sb = new CPStringBuilder(); 
1434        // this is far from optimal, but it works
1435        byte[] utf8 = str.getBytes("utf-8");
1436        for (int j = 0; j < utf8.length; j++)
1437          {
1438            sb.append('%');
1439            sb.append(HEX.charAt((utf8[j] & 0xff) / 16));
1440            sb.append(HEX.charAt((utf8[j] & 0xff) % 16));
1441          }
1442        return sb.toString();
1443      }
1444    catch (java.io.UnsupportedEncodingException x)
1445      {
1446        throw (Error) new InternalError("Escaping error").initCause(x);
1447      }
1448  }
1449
1450}