View Javadoc

1   /*
2    * $Header: /home/cvs/jakarta-commons/httpclient/src/java/org/apache/commons/httpclient/URI.java,v 1.36.2.5 2004/02/22 18:21:13 olegk Exp $
3    * $Revision: 1.36.2.5 $
4    * $Date: 2004/02/22 18:21:13 $
5    *
6    * ====================================================================
7    *
8    *  Copyright 2002-2004 The Apache Software Foundation
9    *
10   *  Licensed under the Apache License, Version 2.0 (the "License");
11   *  you may not use this file except in compliance with the License.
12   *  You may obtain a copy of the License at
13   *
14   *      http://www.apache.org/licenses/LICENSE-2.0
15   *
16   *  Unless required by applicable law or agreed to in writing, software
17   *  distributed under the License is distributed on an "AS IS" BASIS,
18   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19   *  See the License for the specific language governing permissions and
20   *  limitations under the License.
21   * ====================================================================
22   *
23   * This software consists of voluntary contributions made by many
24   * individuals on behalf of the Apache Software Foundation.  For more
25   * information on the Apache Software Foundation, please see
26   * <http://www.apache.org/>.
27   *
28   * [Additional notices, if required by prior licensing conditions]
29   *
30   */
31  
32  package org.apache.commons.httpclient;
33  
34  import java.io.IOException;
35  import java.io.ObjectInputStream;
36  import java.io.ObjectOutputStream;
37  import java.io.Serializable;
38  import java.io.UnsupportedEncodingException;
39  import java.util.Locale;
40  import java.util.BitSet;
41  import java.util.Hashtable;
42  import java.net.URL;
43  
44  /***
45   * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396.
46   * This class supports parsing a URI reference so that it can be extended
47   * for specific protocols, taking into account the character encoding of the
48   * protocol to be transported and the charset of the document.
49   * <p>
50   * A URI is always in an "escaped" form, since escaping or unescaping a
51   * completed URI might change its semantics.  
52   * <p>
53   * Implementers should be careful not to escape or unescape the same string
54   * more than once, since unescaping an already unescaped string might lead to
55   * misinterpreting a percent data character as another escaped character,
56   * or vice versa in the case of escaping an already escaped string.
57   * <p>
58   * In order to avoid these problems, the following data types are used:
59   * <p><blockquote><pre>
60   *   URI character sequence: char
61   *   octet sequence: byte
62   *   original character sequence: String
63   * </pre></blockquote><p>
64   *
65   * So, a URI is a sequence of characters as an array of a char type, which
66   * is not always represented as a sequence of octets as an array of byte.
67   * <p>
68   * 
69   * URI Syntactic Components
70   * <p><blockquote><pre>
71   * - In general, written as follows:
72   *   Absolute URI = &lt;scheme&gt;:&lt;scheme-specific-part&gt;
73   *   Generic URI = &lt;scheme&gt;://&lt;authority&gt;&lt;path&gt;?&lt;query&gt;
74   *
75   * - Syntax
76   *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
77   *   hier_part     = ( net_path | abs_path ) [ "?" query ]
78   *   net_path      = "//" authority [ abs_path ]
79   *   abs_path      = "/"  path_segments
80   * </pre></blockquote><p>
81   *
82   * The following examples illustrate URI that are in common use.
83   * <pre>
84   * ftp://ftp.is.co.za/rfc/rfc1808.txt
85   *    -- ftp scheme for File Transfer Protocol services
86   * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
87   *    -- gopher scheme for Gopher and Gopher+ Protocol services
88   * http://www.math.uio.no/faq/compression-faq/part1.html
89   *    -- http scheme for Hypertext Transfer Protocol services
90   * mailto:mduerst@ifi.unizh.ch
91   *    -- mailto scheme for electronic mail addresses
92   * news:comp.infosystems.www.servers.unix
93   *    -- news scheme for USENET news groups and articles
94   * telnet://melvyl.ucop.edu/
95   *    -- telnet scheme for interactive services via the TELNET Protocol
96   * </pre>
97   * Please, notice that there are many modifications from URL(RFC 1738) and
98   * relative URL(RFC 1808).
99   * <p>
100  * <b>The expressions for a URI</b>
101  * <p><pre>
102  * For escaped URI forms
103  *  - URI(char[]) // constructor
104  *  - char[] getRawXxx() // method
105  *  - String getEscapedXxx() // method
106  *  - String toString() // method
107  * <p>
108  * For unescaped URI forms
109  *  - URI(String) // constructor
110  *  - String getXXX() // method
111  * </pre><p>
112  *
113  * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
114  * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
115  * @version $Revision: 1.36.2.5 $ $Date: 2002/03/14 15:14:01 
116  */
117 public class URI implements Cloneable, Comparable, Serializable {
118 
119 
120     // ----------------------------------------------------------- Constructors
121 
122     /*** Create an instance as an internal use */
123     protected URI() {
124     }
125 
126 
127     /***
128      * Construct a URI as an escaped form of a character array with the given
129      * charset.
130      *
131      * @param escaped the URI character sequence
132      * @param charset the charset string to do escape encoding
133      * @throws URIException If the URI cannot be created.
134      * @throws NullPointerException if <code>escaped</code> is <code>null</code>
135      * @see #getProtocolCharset
136      */
137     public URI(char[] escaped, String charset) 
138         throws URIException, NullPointerException {
139         protocolCharset = charset;
140         parseUriReference(new String(escaped), true);
141     }
142 
143 
144     /***
145      * Construct a URI as an escaped form of a character array.
146      * An URI can be placed within double-quotes or angle brackets like 
147      * "http://test.com/" and &lt;http://test.com/&gt;
148      * 
149      * @param escaped the URI character sequence
150      * @throws URIException If the URI cannot be created.
151      * @throws NullPointerException if <code>escaped</code> is <code>null</code>
152      * @see #getDefaultProtocolCharset
153      */
154     public URI(char[] escaped) 
155         throws URIException, NullPointerException {
156         parseUriReference(new String(escaped), true);
157     }
158 
159 
160     /***
161      * Construct a URI from the given string with the given charset.
162      *
163      * @param original the string to be represented to URI character sequence
164      * It is one of absoluteURI and relativeURI.
165      * @param charset the charset string to do escape encoding
166      * @throws URIException If the URI cannot be created.
167      * @see #getProtocolCharset
168      */
169     public URI(String original, String charset) throws URIException {
170         protocolCharset = charset;
171         parseUriReference(original, false);
172     }
173 
174 
175     /***
176      * Construct a URI from the given string.
177      * <p><blockquote><pre>
178      *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
179      * </pre></blockquote><p>
180      * An URI can be placed within double-quotes or angle brackets like 
181      * "http://test.com/" and &lt;http://test.com/&gt;
182      *
183      * @param original the string to be represented to URI character sequence
184      * It is one of absoluteURI and relativeURI.
185      * @throws URIException If the URI cannot be created.
186      * @see #getDefaultProtocolCharset
187      */
188     public URI(String original) throws URIException {
189         parseUriReference(original, false);
190     }
191 
192 
193     /***
194      * Construct a URI from a URL.
195      *
196      * @param url a valid URL.
197      * @throws URIException If the URI cannot be created.
198      * @since 2.0 
199      * @deprecated currently somewhat wrong and diffrent with java.net.URL usage
200      */
201     public URI(URL url) throws URIException {
202         this(url.toString());
203     }
204 
205 
206     /***
207      * Construct a general URI from the given components.
208      * <p><blockquote><pre>
209      *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
210      *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
211      *   opaque_part   = uric_no_slash *uric
212      * </pre></blockquote><p>
213      * It's for absolute URI = &lt;scheme&gt;:&lt;scheme-specific-part&gt;#
214      * &lt;fragment&gt;.
215      *
216      * @param scheme the scheme string
217      * @param schemeSpecificPart scheme_specific_part
218      * @param fragment the fragment string
219      * @throws URIException If the URI cannot be created.
220      * @see #getDefaultProtocolCharset
221      */
222     public URI(String scheme, String schemeSpecificPart, String fragment)
223         throws URIException {
224 
225         // validate and contruct the URI character sequence
226         if (scheme == null) {
227            throw new URIException(URIException.PARSING, "scheme required");
228         }
229         char[] s = scheme.toLowerCase().toCharArray();
230         if (validate(s, URI.scheme)) {
231             _scheme = s; // is_absoluteURI
232         } else {
233             throw new URIException(URIException.PARSING, "incorrect scheme");
234         }
235         _opaque = encode(schemeSpecificPart, allowed_opaque_part,
236                 getProtocolCharset());
237         // Set flag
238         _is_opaque_part = true;
239         _fragment = fragment.toCharArray(); 
240 
241         setURI();
242     }
243 
244 
245     /***
246      * Construct a general URI from the given components.
247      * <p><blockquote><pre>
248      *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
249      *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
250      *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
251      *   hier_part     = ( net_path | abs_path ) [ "?" query ]
252      * </pre></blockquote><p>
253      * It's for absolute URI = &lt;scheme&gt;:&lt;path&gt;?&lt;query&gt;#&lt;
254      * fragment&gt; and relative URI = &lt;path&gt;?&lt;query&gt;#&lt;fragment
255      * &gt;.
256      *
257      * @param scheme the scheme string
258      * @param authority the authority string
259      * @param path the path string
260      * @param query the query string
261      * @param fragment the fragment string
262      * @throws URIException If the new URI cannot be created.
263      * @see #getDefaultProtocolCharset
264      */
265     public URI(String scheme, String authority, String path, String query,
266                String fragment) throws URIException {
267 
268         // validate and contruct the URI character sequence
269         StringBuffer buff = new StringBuffer();
270         if (scheme != null) {
271             buff.append(scheme);
272             buff.append(':');
273         }
274         if (authority != null) {
275             buff.append("//");
276             buff.append(authority);
277         }
278         if (path != null) {  // accept empty path
279             if ((scheme != null || authority != null)
280                     && !path.startsWith("/")) {
281                 throw new URIException(URIException.PARSING,
282                         "abs_path requested");
283             }
284             buff.append(path);
285         }
286         if (query != null) {
287             buff.append('?');
288             buff.append(query);
289         }
290         if (fragment != null) {
291             buff.append('#');
292             buff.append(fragment);
293         }
294         parseUriReference(buff.toString(), false);
295     }
296 
297 
298     /***
299      * Construct a general URI from the given components.
300      *
301      * @param scheme the scheme string
302      * @param userinfo the userinfo string
303      * @param host the host string
304      * @param port the port number
305      * @throws URIException If the new URI cannot be created.
306      * @see #getDefaultProtocolCharset
307      */
308     public URI(String scheme, String userinfo, String host, int port)
309         throws URIException {
310 
311         this(scheme, userinfo, host, port, null, null, null);
312     }
313 
314 
315     /***
316      * Construct a general URI from the given components.
317      *
318      * @param scheme the scheme string
319      * @param userinfo the userinfo string
320      * @param host the host string
321      * @param port the port number
322      * @param path the path string
323      * @throws URIException If the new URI cannot be created.
324      * @see #getDefaultProtocolCharset
325      */
326     public URI(String scheme, String userinfo, String host, int port,
327             String path) throws URIException {
328 
329         this(scheme, userinfo, host, port, path, null, null);
330     }
331 
332 
333     /***
334      * Construct a general URI from the given components.
335      *
336      * @param scheme the scheme string
337      * @param userinfo the userinfo string
338      * @param host the host string
339      * @param port the port number
340      * @param path the path string
341      * @param query the query string
342      * @throws URIException If the new URI cannot be created.
343      * @see #getDefaultProtocolCharset
344      */
345     public URI(String scheme, String userinfo, String host, int port,
346             String path, String query) throws URIException {
347 
348         this(scheme, userinfo, host, port, path, query, null);
349     }
350 
351 
352     /***
353      * Construct a general URI from the given components.
354      *
355      * @param scheme the scheme string
356      * @param userinfo the userinfo string
357      * @param host the host string
358      * @param port the port number
359      * @param path the path string
360      * @param query the query string
361      * @param fragment the fragment string
362      * @throws URIException If the new URI cannot be created.
363      * @see #getDefaultProtocolCharset
364      */
365     public URI(String scheme, String userinfo, String host, int port,
366             String path, String query, String fragment) throws URIException {
367 
368         this(scheme, (host == null) ? null 
369             : ((userinfo != null) ? userinfo + '@' : "") + host 
370                 + ((port != -1) ? ":" + port : ""), path, query, fragment);
371     }
372 
373 
374     /***
375      * Construct a general URI from the given components.
376      *
377      * @param scheme the scheme string
378      * @param host the host string
379      * @param path the path string
380      * @param fragment the fragment string
381      * @throws URIException If the new URI cannot be created.
382      * @see #getDefaultProtocolCharset
383      */
384     public URI(String scheme, String host, String path, String fragment)
385         throws URIException {
386 
387         this(scheme, host, path, null, fragment);
388     }
389 
390 
391     /***
392      * Construct a general URI with the given relative URI string.
393      *
394      * @param base the base URI
395      * @param relative the relative URI string
396      * @throws URIException If the new URI cannot be created.
397      */
398     public URI(URI base, String relative) throws URIException {
399         this(base, new URI(relative));
400     }
401 
402 
403     /***
404      * Construct a general URI with the given relative URI.
405      * <p><blockquote><pre>
406      *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
407      *   relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
408      * </pre></blockquote><p>
409      * Resolving Relative References to Absolute Form.
410      *
411      * <strong>Examples of Resolving Relative URI References</strong>
412      *
413      * Within an object with a well-defined base URI of
414      * <p><blockquote><pre>
415      *   http://a/b/c/d;p?q
416      * </pre></blockquote><p>
417      * the relative URI would be resolved as follows:
418      *
419      * Normal Examples
420      *
421      * <p><blockquote><pre>
422      *   g:h           =  g:h
423      *   g             =  http://a/b/c/g
424      *   ./g           =  http://a/b/c/g
425      *   g/            =  http://a/b/c/g/
426      *   /g            =  http://a/g
427      *   //g           =  http://g
428      *   ?y            =  http://a/b/c/?y
429      *   g?y           =  http://a/b/c/g?y
430      *   #s            =  (current document)#s
431      *   g#s           =  http://a/b/c/g#s
432      *   g?y#s         =  http://a/b/c/g?y#s
433      *   ;x            =  http://a/b/c/;x
434      *   g;x           =  http://a/b/c/g;x
435      *   g;x?y#s       =  http://a/b/c/g;x?y#s
436      *   .             =  http://a/b/c/
437      *   ./            =  http://a/b/c/
438      *   ..            =  http://a/b/
439      *   ../           =  http://a/b/
440      *   ../g          =  http://a/b/g
441      *   ../..         =  http://a/
442      *   ../../        =  http://a/ 
443      *   ../../g       =  http://a/g
444      * </pre></blockquote><p>
445      *
446      * Some URI schemes do not allow a hierarchical syntax matching the
447      * <hier_part> syntax, and thus cannot use relative references.
448      *
449      * @param base the base URI
450      * @param relative the relative URI
451      * @throws URIException If the new URI cannot be created.
452      */
453     public URI(URI base, URI relative) throws URIException {
454 
455         if (base._scheme == null) {
456             throw new URIException(URIException.PARSING, "base URI required");
457         }
458         if (base._scheme != null) {
459             this._scheme = base._scheme;
460             this._authority = base._authority;
461         }
462         if (base._is_opaque_part || relative._is_opaque_part) {
463             this._scheme = base._scheme;
464             this._is_opaque_part = base._is_opaque_part 
465                 || relative._is_opaque_part;
466             this._opaque = relative._opaque;
467             this._fragment = relative._fragment;
468             this.setURI();
469             return;
470         }
471         if (relative._scheme != null) {
472             this._scheme = relative._scheme;
473             this._is_net_path = relative._is_net_path;
474             this._authority = relative._authority;
475             if (relative._is_server) {
476                 this._is_server = relative._is_server;
477                 this._userinfo = relative._userinfo;
478                 this._host = relative._host;
479                 this._port = relative._port;
480             } else if (relative._is_reg_name) {
481                 this._is_reg_name = relative._is_reg_name;
482             }
483             this._is_abs_path = relative._is_abs_path;
484             this._is_rel_path = relative._is_rel_path;
485             this._path = relative._path;
486         } else if (base._authority != null && relative._scheme == null) {
487             this._is_net_path = base._is_net_path;
488             this._authority = base._authority;
489             if (base._is_server) {
490                 this._is_server = base._is_server;
491                 this._userinfo = base._userinfo;
492                 this._host = base._host;
493                 this._port = base._port;
494             } else if (base._is_reg_name) {
495                 this._is_reg_name = base._is_reg_name;
496             }
497         }
498         if (relative._authority != null) {
499             this._is_net_path = relative._is_net_path;
500             this._authority = relative._authority;
501             if (relative._is_server) {
502                 this._is_server = relative._is_server;
503                 this._userinfo = relative._userinfo;
504                 this._host = relative._host;
505                 this._port = relative._port;
506             } else if (relative._is_reg_name) {
507                 this._is_reg_name = relative._is_reg_name;
508             }
509             this._is_abs_path = relative._is_abs_path;
510             this._is_rel_path = relative._is_rel_path;
511             this._path = relative._path;
512         }
513         // resolve the path and query if necessary
514         if (relative._scheme == null && relative._authority == null) {
515             if ((relative._path == null || relative._path.length == 0)
516                 && relative._query == null) {
517                 // handle a reference to the current document, see RFC 2396 
518                 // section 5.2 step 2
519                 this._path = base._path;
520                 this._query = base._query;
521             } else {
522                 this._path = resolvePath(base._path, relative._path);
523             }
524         }
525         // base._query removed
526         if (relative._query != null) {
527             this._query = relative._query;
528         }
529         // base._fragment removed
530         if (relative._fragment != null) {
531             this._fragment = relative._fragment;
532         }
533         this.setURI();
534         // reparse the newly built URI, this will ensure that all flags are set correctly.
535         // TODO there must be a better way to do this
536         parseUriReference(new String(_uri), true);
537     }
538 
539     // --------------------------------------------------- Instance Variables
540 
541     /*** Version ID for serialization */
542     static final long serialVersionUID = 604752400577948726L;
543 
544 
545     /***
546      * Cache the hash code for this URI.
547      */
548     protected int hash = 0;
549 
550 
551     /***
552      * This Uniform Resource Identifier (URI).
553      * The URI is always in an "escaped" form, since escaping or unescaping
554      * a completed URI might change its semantics.  
555      */
556     protected char[] _uri = null;
557 
558 
559     /***
560      * The charset of the protocol used by this URI instance.
561      */
562     protected String protocolCharset = null;
563 
564 
565     /***
566      * The default charset of the protocol.  RFC 2277, 2396
567      */
568     protected static String defaultProtocolCharset = "UTF-8";
569 
570 
571     /***
572      * The default charset of the document.  RFC 2277, 2396
573      * The platform's charset is used for the document by default.
574      */
575     protected static String defaultDocumentCharset = null;
576     protected static String defaultDocumentCharsetByLocale = null;
577     protected static String defaultDocumentCharsetByPlatform = null;
578     // Static initializer for defaultDocumentCharset
579     static {
580         Locale locale = Locale.getDefault();
581         // in order to support backward compatiblity
582         if (locale != null) {
583             defaultDocumentCharsetByLocale =
584                 LocaleToCharsetMap.getCharset(locale);
585             // set the default document charset
586             defaultDocumentCharset = defaultDocumentCharsetByLocale;
587         }
588         // in order to support platform encoding
589         try {
590             defaultDocumentCharsetByPlatform = System.getProperty("file.encoding");
591         } catch(SecurityException ignore) {
592         }
593         if (defaultDocumentCharset == null) {
594             // set the default document charset
595             defaultDocumentCharset = defaultDocumentCharsetByPlatform;
596         }
597     }
598 
599 
600     /***
601      * The scheme.
602      */
603     protected char[] _scheme = null;
604 
605 
606     /***
607      * The opaque.
608      */
609     protected char[] _opaque = null;
610 
611 
612     /***
613      * The authority.
614      */
615     protected char[] _authority = null;
616 
617 
618     /***
619      * The userinfo.
620      */
621     protected char[] _userinfo = null;
622 
623 
624     /***
625      * The host.
626      */
627     protected char[] _host = null;
628 
629 
630     /***
631      * The port.
632      */
633     protected int _port = -1;
634 
635 
636     /***
637      * The path.
638      */
639     protected char[] _path = null;
640 
641 
642     /***
643      * The query.
644      */
645     protected char[] _query = null;
646 
647 
648     /***
649      * The fragment.
650      */
651     protected char[] _fragment = null;
652 
653 
654     /***
655      * The root path.
656      */
657     protected static char[] rootPath = { '/' };
658 
659     // ---------------------- Generous characters for each component validation
660 
661     /***
662      * The percent "%" character always has the reserved purpose of being the
663      * escape indicator, it must be escaped as "%25" in order to be used as
664      * data within a URI.
665      */
666     protected static final BitSet percent = new BitSet(256);
667     // Static initializer for percent
668     static {
669         percent.set('%');
670     }
671 
672 
673     /***
674      * BitSet for digit.
675      * <p><blockquote><pre>
676      * digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
677      *            "8" | "9"
678      * </pre></blockquote><p>
679      */
680     protected static final BitSet digit = new BitSet(256);
681     // Static initializer for digit
682     static {
683         for (int i = '0'; i <= '9'; i++) {
684             digit.set(i);
685         }
686     }
687 
688 
689     /***
690      * BitSet for alpha.
691      * <p><blockquote><pre>
692      * alpha         = lowalpha | upalpha
693      * </pre></blockquote><p>
694      */
695     protected static final BitSet alpha = new BitSet(256);
696     // Static initializer for alpha
697     static {
698         for (int i = 'a'; i <= 'z'; i++) {
699             alpha.set(i);
700         }
701         for (int i = 'A'; i <= 'Z'; i++) {
702             alpha.set(i);
703         }
704     }
705 
706 
707     /***
708      * BitSet for alphanum (join of alpha &amp; digit).
709      * <p><blockquote><pre>
710      *  alphanum      = alpha | digit
711      * </pre></blockquote><p>
712      */
713     protected static final BitSet alphanum = new BitSet(256);
714     // Static initializer for alphanum
715     static {
716         alphanum.or(alpha);
717         alphanum.or(digit);
718     }
719 
720 
721     /***
722      * BitSet for hex.
723      * <p><blockquote><pre>
724      * hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
725      *                         "a" | "b" | "c" | "d" | "e" | "f"
726      * </pre></blockquote><p>
727      */
728     protected static final BitSet hex = new BitSet(256);
729     // Static initializer for hex
730     static {
731         hex.or(digit);
732         for (int i = 'a'; i <= 'f'; i++) {
733             hex.set(i);
734         }
735         for (int i = 'A'; i <= 'F'; i++) {
736             hex.set(i);
737         }
738     }
739 
740 
741     /***
742      * BitSet for escaped.
743      * <p><blockquote><pre>
744      * escaped       = "%" hex hex
745      * </pre></blockquote><p>
746      */
747     protected static final BitSet escaped = new BitSet(256);
748     // Static initializer for escaped
749     static {
750         escaped.or(percent);
751         escaped.or(hex);
752     }
753 
754 
755     /***
756      * BitSet for mark.
757      * <p><blockquote><pre>
758      * mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
759      *                 "(" | ")"
760      * </pre></blockquote><p>
761      */
762     protected static final BitSet mark = new BitSet(256);
763     // Static initializer for mark
764     static {
765         mark.set('-');
766         mark.set('_');
767         mark.set('.');
768         mark.set('!');
769         mark.set('~');
770         mark.set('*');
771         mark.set('\'');
772         mark.set('(');
773         mark.set(')');
774     }
775 
776 
777     /***
778      * Data characters that are allowed in a URI but do not have a reserved
779      * purpose are called unreserved.
780      * <p><blockquote><pre>
781      * unreserved    = alphanum | mark
782      * </pre></blockquote><p>
783      */
784     protected static final BitSet unreserved = new BitSet(256);
785     // Static initializer for unreserved
786     static {
787         unreserved.or(alphanum);
788         unreserved.or(mark);
789     }
790 
791 
792     /***
793      * BitSet for reserved.
794      * <p><blockquote><pre>
795      * reserved      = ";" | "/" | "?" | ":" | "@" | "&amp;" | "=" | "+" |
796      *                 "$" | ","
797      * </pre></blockquote><p>
798      */
799     protected static final BitSet reserved = new BitSet(256);
800     // Static initializer for reserved
801     static {
802         reserved.set(';');
803         reserved.set('/');
804         reserved.set('?');
805         reserved.set(':');
806         reserved.set('@');
807         reserved.set('&');
808         reserved.set('=');
809         reserved.set('+');
810         reserved.set('$');
811         reserved.set(',');
812     }
813 
814 
815     /***
816      * BitSet for uric.
817      * <p><blockquote><pre>
818      * uric          = reserved | unreserved | escaped
819      * </pre></blockquote><p>
820      */
821     protected static final BitSet uric = new BitSet(256);
822     // Static initializer for uric
823     static {
824         uric.or(reserved);
825         uric.or(unreserved);
826         uric.or(escaped);
827     }
828 
829 
830     /***
831      * BitSet for fragment (alias for uric).
832      * <p><blockquote><pre>
833      * fragment      = *uric
834      * </pre></blockquote><p>
835      */
836     protected static final BitSet fragment = uric;
837 
838 
839     /***
840      * BitSet for query (alias for uric).
841      * <p><blockquote><pre>
842      * query         = *uric
843      * </pre></blockquote><p>
844      */
845     protected static final BitSet query = uric;
846 
847 
848     /***
849      * BitSet for pchar.
850      * <p><blockquote><pre>
851      * pchar         = unreserved | escaped |
852      *                 ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
853      * </pre></blockquote><p>
854      */
855     protected static final BitSet pchar = new BitSet(256);
856     // Static initializer for pchar
857     static {
858         pchar.or(unreserved);
859         pchar.or(escaped);
860         pchar.set(':');
861         pchar.set('@');
862         pchar.set('&');
863         pchar.set('=');
864         pchar.set('+');
865         pchar.set('$');
866         pchar.set(',');
867     }
868 
869 
870     /***
871      * BitSet for param (alias for pchar).
872      * <p><blockquote><pre>
873      * param         = *pchar
874      * </pre></blockquote><p>
875      */
876     protected static final BitSet param = pchar;
877 
878 
879     /***
880      * BitSet for segment.
881      * <p><blockquote><pre>
882      * segment       = *pchar *( ";" param )
883      * </pre></blockquote><p>
884      */
885     protected static final BitSet segment = new BitSet(256);
886     // Static initializer for segment
887     static {
888         segment.or(pchar);
889         segment.set(';');
890         segment.or(param);
891     }
892 
893 
894     /***
895      * BitSet for path segments.
896      * <p><blockquote><pre>
897      * path_segments = segment *( "/" segment )
898      * </pre></blockquote><p>
899      */
900     protected static final BitSet path_segments = new BitSet(256);
901     // Static initializer for path_segments
902     static {
903         path_segments.set('/');
904         path_segments.or(segment);
905     }
906 
907 
908     /***
909      * URI absolute path.
910      * <p><blockquote><pre>
911      * abs_path      = "/"  path_segments
912      * </pre></blockquote><p>
913      */
914     protected static final BitSet abs_path = new BitSet(256);
915     // Static initializer for abs_path
916     static {
917         abs_path.set('/');
918         abs_path.or(path_segments);
919     }
920 
921 
922     /***
923      * URI bitset for encoding typical non-slash characters.
924      * <p><blockquote><pre>
925      * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
926      *                 "&amp;" | "=" | "+" | "$" | ","
927      * </pre></blockquote><p>
928      */
929     protected static final BitSet uric_no_slash = new BitSet(256);
930     // Static initializer for uric_no_slash
931     static {
932         uric_no_slash.or(unreserved);
933         uric_no_slash.or(escaped);
934         uric_no_slash.set(';');
935         uric_no_slash.set('?');
936         uric_no_slash.set(';');
937         uric_no_slash.set('@');
938         uric_no_slash.set('&');
939         uric_no_slash.set('=');
940         uric_no_slash.set('+');
941         uric_no_slash.set('$');
942         uric_no_slash.set(',');
943     }
944     
945 
946     /***
947      * URI bitset that combines uric_no_slash and uric.
948      * <p><blockquote><pre>
949      * opaque_part   = uric_no_slash *uric
950      * </pre></blockquote><p>
951      */
952     protected static final BitSet opaque_part = new BitSet(256);
953     // Static initializer for opaque_part
954     static {
955         // it's generous. because first character must not include a slash
956         opaque_part.or(uric_no_slash);
957         opaque_part.or(uric);
958     }
959     
960 
961     /***
962      * URI bitset that combines absolute path and opaque part.
963      * <p><blockquote><pre>
964      * path          = [ abs_path | opaque_part ]
965      * </pre></blockquote><p>
966      */
967     protected static final BitSet path = new BitSet(256);
968     // Static initializer for path
969     static {
970         path.or(abs_path);
971         path.or(opaque_part);
972     }
973 
974 
    /**
     * Port, a logical alias for digit.
     * <p>
     * This field holds the same <code>BitSet</code> instance as
     * <code>digit</code>; no copy is made.
     */
    protected static final BitSet port = digit;
979 
980 
981     /***
982      * Bitset that combines digit and dot fo IPv$address.
983      * <p><blockquote><pre>
984      * IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
985      * </pre></blockquote><p>
986      */
987     protected static final BitSet IPv4address = new BitSet(256);
988     // Static initializer for IPv4address
989     static {
990         IPv4address.or(digit);
991         IPv4address.set('.');
992     }
993 
994 
995     /***
996      * RFC 2373.
997      * <p><blockquote><pre>
998      * IPv6address = hexpart [ ":" IPv4address ]
999      * </pre></blockquote><p>
1000      */
1001     protected static final BitSet IPv6address = new BitSet(256);
1002     // Static initializer for IPv6address reference
1003     static {
1004         IPv6address.or(hex); // hexpart
1005         IPv6address.set(':');
1006         IPv6address.or(IPv4address);
1007     }
1008 
1009 
1010     /***
1011      * RFC 2732, 2373.
1012      * <p><blockquote><pre>
1013      * IPv6reference   = "[" IPv6address "]"
1014      * </pre></blockquote><p>
1015      */
1016     protected static final BitSet IPv6reference = new BitSet(256);
1017     // Static initializer for IPv6reference
1018     static {
1019         IPv6reference.set('[');
1020         IPv6reference.or(IPv6address);
1021         IPv6reference.set(']');
1022     }
1023 
1024 
1025     /***
1026      * BitSet for toplabel.
1027      * <p><blockquote><pre>
1028      * toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
1029      * </pre></blockquote><p>
1030      */
1031     protected static final BitSet toplabel = new BitSet(256);
1032     // Static initializer for toplabel
1033     static {
1034         toplabel.or(alphanum);
1035         toplabel.set('-');
1036     }
1037 
1038 
    /**
     * BitSet for domainlabel.
     * <p><blockquote><pre>
     * domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
     * </pre></blockquote><p>
     * This field holds the same <code>BitSet</code> instance as
     * <code>toplabel</code>; no separate set is built for it.
     */
    protected static final BitSet domainlabel = toplabel;
1046 
1047 
1048     /***
1049      * BitSet for hostname.
1050      * <p><blockquote><pre>
1051      * hostname      = *( domainlabel "." ) toplabel [ "." ]
1052      * </pre></blockquote><p>
1053      */
1054     protected static final BitSet hostname = new BitSet(256);
1055     // Static initializer for hostname
1056     static {
1057         hostname.or(toplabel);
1058         // hostname.or(domainlabel);
1059         hostname.set('.');
1060     }
1061 
1062 
1063     /***
1064      * BitSet for host.
1065      * <p><blockquote><pre>
1066      * host          = hostname | IPv4address | IPv6reference
1067      * </pre></blockquote><p>
1068      */
1069     protected static final BitSet host = new BitSet(256);
1070     // Static initializer for host
1071     static {
1072         host.or(hostname);
1073         // host.or(IPv4address);
1074         host.or(IPv6reference); // IPv4address
1075     }
1076 
1077 
1078     /***
1079      * BitSet for hostport.
1080      * <p><blockquote><pre>
1081      * hostport      = host [ ":" port ]
1082      * </pre></blockquote><p>
1083      */
1084     protected static final BitSet hostport = new BitSet(256);
1085     // Static initializer for hostport
1086     static {
1087         hostport.or(host);
1088         hostport.set(':');
1089         hostport.or(port);
1090     }
1091 
1092 
1093     /***
1094      * Bitset for userinfo.
1095      * <p><blockquote><pre>
1096      * userinfo      = *( unreserved | escaped |
1097      *                    ";" | ":" | "&amp;" | "=" | "+" | "$" | "," )
1098      * </pre></blockquote><p>
1099      */
1100     protected static final BitSet userinfo = new BitSet(256);
1101     // Static initializer for userinfo
1102     static {
1103         userinfo.or(unreserved);
1104         userinfo.or(escaped);
1105         userinfo.set(';');
1106         userinfo.set(':');
1107         userinfo.set('&');
1108         userinfo.set('=');
1109         userinfo.set('+');
1110         userinfo.set('$');
1111         userinfo.set(',');
1112     }
1113 
1114 
1115     /***
1116      * BitSet for within the userinfo component like user and password.
1117      */
1118     public static final BitSet within_userinfo = new BitSet(256);
1119     // Static initializer for within_userinfo
1120     static {
1121         within_userinfo.or(userinfo);
1122         within_userinfo.clear(';'); // reserved within authority
1123         within_userinfo.clear(':');
1124         within_userinfo.clear('@');
1125         within_userinfo.clear('?');
1126         within_userinfo.clear('/');
1127     }
1128 
1129 
1130     /***
1131      * Bitset for server.
1132      * <p><blockquote><pre>
1133      * server        = [ [ userinfo "@" ] hostport ]
1134      * </pre></blockquote><p>
1135      */
1136     protected static final BitSet server = new BitSet(256);
1137     // Static initializer for server
1138     static {
1139         server.or(userinfo);
1140         server.set('@');
1141         server.or(hostport);
1142     }
1143 
1144 
1145     /***
1146      * BitSet for reg_name.
1147      * <p><blockquote><pre>
1148      * reg_name      = 1*( unreserved | escaped | "$" | "," |
1149      *                     ";" | ":" | "@" | "&amp;" | "=" | "+" )
1150      * </pre></blockquote><p>
1151      */
1152     protected static final BitSet reg_name = new BitSet(256);
1153     // Static initializer for reg_name
1154     static {
1155         reg_name.or(unreserved);
1156         reg_name.or(escaped);
1157         reg_name.set('$');
1158         reg_name.set(',');
1159         reg_name.set(';');
1160         reg_name.set(':');
1161         reg_name.set('@');
1162         reg_name.set('&');
1163         reg_name.set('=');
1164         reg_name.set('+');
1165     }
1166 
1167 
1168     /***
1169      * BitSet for authority.
1170      * <p><blockquote><pre>
1171      * authority     = server | reg_name
1172      * </pre></blockquote><p>
1173      */
1174     protected static final BitSet authority = new BitSet(256);
1175     // Static initializer for authority
1176     static {
1177         authority.or(server);
1178         authority.or(reg_name);
1179     }
1180 
1181 
1182     /***
1183      * BitSet for scheme.
1184      * <p><blockquote><pre>
1185      * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
1186      * </pre></blockquote><p>
1187      */
1188     protected static final BitSet scheme = new BitSet(256);
1189     // Static initializer for scheme
1190     static {
1191         scheme.or(alpha);
1192         scheme.or(digit);
1193         scheme.set('+');
1194         scheme.set('-');
1195         scheme.set('.');
1196     }
1197 
1198 
1199     /***
1200      * BitSet for rel_segment.
1201      * <p><blockquote><pre>
1202      * rel_segment   = 1*( unreserved | escaped |
1203      *                     ";" | "@" | "&amp;" | "=" | "+" | "$" | "," )
1204      * </pre></blockquote><p>
1205      */
1206     protected static final BitSet rel_segment = new BitSet(256);
1207     // Static initializer for rel_segment
1208     static {
1209         rel_segment.or(unreserved);
1210         rel_segment.or(escaped);
1211         rel_segment.set(';');
1212         rel_segment.set('@');
1213         rel_segment.set('&');
1214         rel_segment.set('=');
1215         rel_segment.set('+');
1216         rel_segment.set('$');
1217         rel_segment.set(',');
1218     }
1219 
1220 
1221     /***
1222      * BitSet for rel_path.
1223      * <p><blockquote><pre>
1224      * rel_path      = rel_segment [ abs_path ]
1225      * </pre></blockquote><p>
1226      */
1227     protected static final BitSet rel_path = new BitSet(256);
1228     // Static initializer for rel_path
1229     static {
1230         rel_path.or(rel_segment);
1231         rel_path.or(abs_path);
1232     }
1233 
1234 
1235     /***
1236      * BitSet for net_path.
1237      * <p><blockquote><pre>
1238      * net_path      = "//" authority [ abs_path ]
1239      * </pre></blockquote><p>
1240      */
1241     protected static final BitSet net_path = new BitSet(256);
1242     // Static initializer for net_path
1243     static {
1244         net_path.set('/');
1245         net_path.or(authority);
1246         net_path.or(abs_path);
1247     }
1248     
1249 
1250     /***
1251      * BitSet for hier_part.
1252      * <p><blockquote><pre>
1253      * hier_part     = ( net_path | abs_path ) [ "?" query ]
1254      * </pre></blockquote><p>
1255      */
1256     protected static final BitSet hier_part = new BitSet(256);
1257     // Static initializer for hier_part
1258     static {
1259         hier_part.or(net_path);
1260         hier_part.or(abs_path);
1261         // hier_part.set('?'); aleady included
1262         hier_part.or(query);
1263     }
1264 
1265 
1266     /***
1267      * BitSet for relativeURI.
1268      * <p><blockquote><pre>
1269      * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
1270      * </pre></blockquote><p>
1271      */
1272     protected static final BitSet relativeURI = new BitSet(256);
1273     // Static initializer for relativeURI
1274     static {
1275         relativeURI.or(net_path);
1276         relativeURI.or(abs_path);
1277         relativeURI.or(rel_path);
1278         // relativeURI.set('?'); aleady included
1279         relativeURI.or(query);
1280     }
1281 
1282 
1283     /***
1284      * BitSet for absoluteURI.
1285      * <p><blockquote><pre>
1286      * absoluteURI   = scheme ":" ( hier_part | opaque_part )
1287      * </pre></blockquote><p>
1288      */
1289     protected static final BitSet absoluteURI = new BitSet(256);
1290     // Static initializer for absoluteURI
1291     static {
1292         absoluteURI.or(scheme);
1293         absoluteURI.set(':');
1294         absoluteURI.or(hier_part);
1295         absoluteURI.or(opaque_part);
1296     }
1297 
1298 
1299     /***
1300      * BitSet for URI-reference.
1301      * <p><blockquote><pre>
1302      * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
1303      * </pre></blockquote><p>
1304      */
1305     protected static final BitSet URI_reference = new BitSet(256);
1306     // Static initializer for URI_reference
1307     static {
1308         URI_reference.or(absoluteURI);
1309         URI_reference.or(relativeURI);
1310         URI_reference.set('#');
1311         URI_reference.or(fragment);
1312     }
1313 
1314     // ---------------------------- Characters disallowed within the URI syntax
1315     // Excluded US-ASCII Characters are like control, space, delims and unwise
1316 
1317     /***
1318      * BitSet for control.
1319      */
1320     public static final BitSet control = new BitSet(256);
1321     // Static initializer for control
1322     static {
1323         for (int i = 0; i <= 0x1F; i++) {
1324             control.set(i);
1325         }
1326         control.set(0x7F);
1327     }
1328 
1329     /***
1330      * BitSet for space.
1331      */
1332     public static final BitSet space = new BitSet(256);
1333     // Static initializer for space
1334     static {
1335         space.set(0x20);
1336     }
1337 
1338 
1339     /***
1340      * BitSet for delims.
1341      */
1342     public static final BitSet delims = new BitSet(256);
1343     // Static initializer for delims
1344     static {
1345         delims.set('<');
1346         delims.set('>');
1347         delims.set('#');
1348         delims.set('%');
1349         delims.set('"');
1350     }
1351 
1352 
1353     /***
1354      * BitSet for unwise.
1355      */
1356     public static final BitSet unwise = new BitSet(256);
1357     // Static initializer for unwise
1358     static {
1359         unwise.set('{');
1360         unwise.set('}');
1361         unwise.set('|');
1362         unwise.set('//');
1363         unwise.set('^');
1364         unwise.set('[');
1365         unwise.set(']');
1366         unwise.set('`');
1367     }
1368 
1369 
1370     /***
1371      * Disallowed rel_path before escaping.
1372      */
1373     public static final BitSet disallowed_rel_path = new BitSet(256);
1374     // Static initializer for disallowed_rel_path
1375     static {
1376         disallowed_rel_path.or(uric);
1377         disallowed_rel_path.andNot(rel_path);
1378     }
1379 
1380 
1381     /***
1382      * Disallowed opaque_part before escaping.
1383      */
1384     public static final BitSet disallowed_opaque_part = new BitSet(256);
1385     // Static initializer for disallowed_opaque_part
1386     static {
1387         disallowed_opaque_part.or(uric);
1388         disallowed_opaque_part.andNot(opaque_part);
1389     }
1390 
1391     // ----------------------- Characters allowed within and for each component
1392 
1393     /***
1394      * Those characters that are allowed for the authority component.
1395      */
1396     public static final BitSet allowed_authority = new BitSet(256);
1397     // Static initializer for allowed_authority
1398     static {
1399         allowed_authority.or(authority);
1400         allowed_authority.clear('%');
1401     }
1402 
1403 
1404     /***
1405      * Those characters that are allowed for the opaque_part.
1406      */
1407     public static final BitSet allowed_opaque_part = new BitSet(256);
1408     // Static initializer for allowed_opaque_part 
1409     static {
1410         allowed_opaque_part.or(opaque_part);
1411         allowed_opaque_part.clear('%');
1412     }
1413 
1414 
1415     /***
1416      * Those characters that are allowed for the reg_name.
1417      */
1418     public static final BitSet allowed_reg_name = new BitSet(256);
1419     // Static initializer for allowed_reg_name 
1420     static {
1421         allowed_reg_name.or(reg_name);
1422         // allowed_reg_name.andNot(percent);
1423         allowed_reg_name.clear('%');
1424     }
1425 
1426 
1427     /***
1428      * Those characters that are allowed for the userinfo component.
1429      */
1430     public static final BitSet allowed_userinfo = new BitSet(256);
1431     // Static initializer for allowed_userinfo
1432     static {
1433         allowed_userinfo.or(userinfo);
1434         // allowed_userinfo.andNot(percent);
1435         allowed_userinfo.clear('%');
1436     }
1437 
1438 
1439     /***
1440      * Those characters that are allowed for within the userinfo component.
1441      */
1442     public static final BitSet allowed_within_userinfo = new BitSet(256);
1443     // Static initializer for allowed_within_userinfo
1444     static {
1445         allowed_within_userinfo.or(within_userinfo);
1446         allowed_within_userinfo.clear('%');
1447     }
1448 
1449 
1450     /***
1451      * Those characters that are allowed for the IPv6reference component.
1452      * The characters '[', ']' in IPv6reference should be excluded.
1453      */
1454     public static final BitSet allowed_IPv6reference = new BitSet(256);
1455     // Static initializer for allowed_IPv6reference
1456     static {
1457         allowed_IPv6reference.or(IPv6reference);
1458         // allowed_IPv6reference.andNot(unwise);
1459         allowed_IPv6reference.clear('[');
1460         allowed_IPv6reference.clear(']');
1461     }
1462 
1463 
1464     /***
1465      * Those characters that are allowed for the host component.
1466      * The characters '[', ']' in IPv6reference should be excluded.
1467      */
1468     public static final BitSet allowed_host = new BitSet(256);
1469     // Static initializer for allowed_host
1470     static {
1471         allowed_host.or(hostname);
1472         allowed_host.or(allowed_IPv6reference);
1473     }
1474 
1475 
1476     /***
1477      * Those characters that are allowed for the authority component.
1478      */
1479     public static final BitSet allowed_within_authority = new BitSet(256);
1480     // Static initializer for allowed_within_authority
1481     static {
1482         allowed_within_authority.or(server);
1483         allowed_within_authority.or(reg_name);
1484         allowed_within_authority.clear(';');
1485         allowed_within_authority.clear(':');
1486         allowed_within_authority.clear('@');
1487         allowed_within_authority.clear('?');
1488         allowed_within_authority.clear('/');
1489     }
1490 
1491 
1492     /***
1493      * Those characters that are allowed for the abs_path.
1494      */
1495     public static final BitSet allowed_abs_path = new BitSet(256);
1496     // Static initializer for allowed_abs_path
1497     static {
1498         allowed_abs_path.or(abs_path);
1499         // allowed_abs_path.set('/');  // aleady included
1500         allowed_abs_path.andNot(percent);
1501     }
1502 
1503 
1504     /***
1505      * Those characters that are allowed for the rel_path.
1506      */
1507     public static final BitSet allowed_rel_path = new BitSet(256);
1508     // Static initializer for allowed_rel_path
1509     static {
1510         allowed_rel_path.or(rel_path);
1511         allowed_rel_path.clear('%');
1512     }
1513 
1514 
1515     /***
1516      * Those characters that are allowed within the path.
1517      */
1518     public static final BitSet allowed_within_path = new BitSet(256);
1519     // Static initializer for allowed_within_path
1520     static {
1521         allowed_within_path.or(abs_path);
1522         allowed_within_path.clear('/');
1523         allowed_within_path.clear(';');
1524         allowed_within_path.clear('=');
1525         allowed_within_path.clear('?');
1526     }
1527 
1528 
1529     /***
1530      * Those characters that are allowed for the query component.
1531      */
1532     public static final BitSet allowed_query = new BitSet(256);
1533     // Static initializer for allowed_query
1534     static {
1535         allowed_query.or(uric);
1536         allowed_query.clear('%');
1537     }
1538 
1539 
1540     /***
1541      * Those characters that are allowed within the query component.
1542      */
1543     public static final BitSet allowed_within_query = new BitSet(256);
1544     // Static initializer for allowed_within_query
1545     static {
1546         allowed_within_query.or(allowed_query);
1547         allowed_within_query.andNot(reserved); // excluded 'reserved'
1548     }
1549 
1550 
1551     /***
1552      * Those characters that are allowed for the fragment component.
1553      */
1554     public static final BitSet allowed_fragment = new BitSet(256);
1555     // Static initializer for allowed_fragment
1556     static {
1557         allowed_fragment.or(uric);
1558         allowed_fragment.clear('%');
1559     }
1560 
1561     // ------------------------------------------- Flags for this URI-reference
1562 
    // TODO: Figure out what all these variables are for and provide javadoc
    // NOTE(review): the per-field notes below are inferred from the field
    // names and the grammar comments only; the code that sets these flags
    // is not part of this section - confirm against the parser.

    // URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
    // absoluteURI   = scheme ":" ( hier_part | opaque_part )
    /** Presumably true when this URI has the hier_part form - TODO confirm. */
    protected boolean _is_hier_part;
    /** Presumably true when this URI has the opaque_part form - TODO confirm. */
    protected boolean _is_opaque_part;
    // relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ] 
    // hier_part     = ( net_path | abs_path ) [ "?" query ]
    /** Presumably true when the path is a net_path - TODO confirm. */
    protected boolean _is_net_path;
    /** Presumably true when the path is an abs_path - TODO confirm. */
    protected boolean _is_abs_path;
    /** Presumably true when the path is a rel_path - TODO confirm. */
    protected boolean _is_rel_path;
    // net_path      = "//" authority [ abs_path ] 
    // authority     = server | reg_name
    /** Presumably true when the authority is a reg_name - TODO confirm. */
    protected boolean _is_reg_name;
    /** Presumably true when the authority is a server - TODO confirm. */
    protected boolean _is_server;  // = _has_server
    // server        = [ [ userinfo "@" ] hostport ]
    // host          = hostname | IPv4address | IPv6reference
    /** Presumably true when the host is a hostname - TODO confirm. */
    protected boolean _is_hostname;
    /** Presumably true when the host is an IPv4address - TODO confirm. */
    protected boolean _is_IPv4address;
    /** Presumably true when the host is an IPv6reference - TODO confirm. */
    protected boolean _is_IPv6reference;
1583 
1584     // ------------------------------------------ Character and escape encoding
1585     
1586     /***
1587      * Encodes URI string.
1588      *
1589      * This is a two mapping, one from original characters to octets, and
1590      * subsequently a second from octets to URI characters:
1591      * <p><blockquote><pre>
1592      *   original character sequence->octet sequence->URI character sequence
1593      * </pre></blockquote><p>
1594      *
1595      * An escaped octet is encoded as a character triplet, consisting of the
1596      * percent character "%" followed by the two hexadecimal digits
1597      * representing the octet code. For example, "%20" is the escaped
1598      * encoding for the US-ASCII space character.
1599      * <p>
1600      * Conversion from the local filesystem character set to UTF-8 will
1601      * normally involve a two step process. First convert the local character
1602      * set to the UCS; then convert the UCS to UTF-8.
1603      * The first step in the process can be performed by maintaining a mapping
1604      * table that includes the local character set code and the corresponding
1605      * UCS code.
1606      * The next step is to convert the UCS character code to the UTF-8 encoding.
1607      * <p>
1608      * Mapping between vendor codepages can be done in a very similar manner
1609      * as described above.
1610      * <p>
1611      * The only time escape encodings can allowedly be made is when a URI is
1612      * being created from its component parts.  The escape and validate methods
1613      * are internally performed within this method.
1614      *
1615      * @param original the original character sequence
1616      * @param allowed those characters that are allowed within a component
1617      * @param charset the protocol charset
1618      * @return URI character sequence
1619      * @throws URIException null component or unsupported character encoding
1620      */
1621     protected static char[] encode(String original, BitSet allowed,
1622             String charset) throws URIException {
1623 
1624         // encode original to uri characters.
1625         if (original == null) {
1626             throw new URIException(URIException.PARSING, "null");
1627         }
1628         // escape octet to uri characters.
1629         if (allowed == null) {
1630             throw new URIException(URIException.PARSING,
1631                     "null allowed characters");
1632         }
1633         byte[] octets;
1634         try {
1635             octets = original.getBytes(charset);
1636         } catch (UnsupportedEncodingException error) {
1637             throw new URIException(URIException.UNSUPPORTED_ENCODING, charset);
1638         }
1639         StringBuffer buf = new StringBuffer(octets.length);
1640         for (int i = 0; i < octets.length; i++) {
1641             char c = (char) octets[i];
1642             if (allowed.get(c)) {
1643                 buf.append(c);
1644             } else {
1645                 buf.append('%');
1646                 byte b = octets[i]; // use the original byte value
1647                 char hexadecimal = Character.forDigit((b >> 4) & 0xF, 16);
1648                 buf.append(Character.toUpperCase(hexadecimal)); // high
1649                 hexadecimal = Character.forDigit(b & 0xF, 16);
1650                 buf.append(Character.toUpperCase(hexadecimal)); // low
1651             }
1652         }
1653 
1654         return buf.toString().toCharArray();
1655     }
1656 
1657 
1658     /***
1659      * Decodes URI encoded string.
1660      *
1661      * This is a two mapping, one from URI characters to octets, and
1662      * subsequently a second from octets to original characters:
1663      * <p><blockquote><pre>
1664      *   URI character sequence->octet sequence->original character sequence
1665      * </pre></blockquote><p>
1666      *
1667      * A URI must be separated into its components before the escaped
1668      * characters within those components can be allowedly decoded.
1669      * <p>
1670      * Notice that there is a chance that URI characters that are non UTF-8
1671      * may be parsed as valid UTF-8.  A recent non-scientific analysis found
1672      * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1673      * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1674      * false reading.
1675      * <p>
1676      * The percent "%" character always has the reserved purpose of being
1677      * the escape indicator, it must be escaped as "%25" in order to be used
1678      * as data within a URI.
1679      * <p>
1680      * The unescape method is internally performed within this method.
1681      *
1682      * @param component the URI character sequence
1683      * @param charset the protocol charset
1684      * @return original character sequence
1685      * @throws URIException incomplete trailing escape pattern or unsupported
1686      * character encoding
1687      */
1688     protected static String decode(char[] component, String charset)
1689         throws URIException {
1690 
1691         // unescape uri characters to octets
1692         if (component == null) {  
1693             return null;
1694         }
1695 
1696         byte[] octets;
1697         try {
1698             octets = new String(component).getBytes(charset);
1699         } catch (UnsupportedEncodingException error) {
1700             throw new URIException(URIException.UNSUPPORTED_ENCODING,
1701                     "not supported " + charset + " encoding");
1702         }
1703         int length = octets.length;
1704         int oi = 0; // output index
1705         for (int ii = 0; ii < length; oi++) {
1706             byte aByte = (byte) octets[ii++];
1707             if (aByte == '%' && ii + 2 <= length)  {
1708                 byte high = (byte) Character.digit((char) octets[ii++], 16);
1709                 byte low = (byte) Character.digit((char) octets[ii++], 16);
1710                 if (high == -1 || low == -1) {
1711                     throw new URIException(URIException.ESCAPING,
1712                             "incomplete trailing escape pattern");
1713                             
1714                 }
1715                 aByte = (byte) ((high << 4) + low);
1716             }
1717             octets[oi] = (byte) aByte;
1718         }
1719 
1720         String result;
1721         try {
1722             result = new String(octets, 0, oi, charset);
1723         } catch (UnsupportedEncodingException error) {
1724             throw new URIException(URIException.UNSUPPORTED_ENCODING,
1725                     "not supported " + charset + " encoding");
1726         }
1727 
1728         return result;
1729     }
1730 
1731 
1732     /***
1733      * Pre-validate the unescaped URI string within a specific component.
1734      *
1735      * @param component the component string within the component
1736      * @param disallowed those characters disallowed within the component
1737      * @return if true, it doesn't have the disallowed characters
1738      * if false, the component is undefined or an incorrect one
1739      */
1740     protected boolean prevalidate(String component, BitSet disallowed) {
1741         // prevalidate the given component by disallowed characters
1742         if (component == null) {
1743             return false; // undefined
1744         }
1745         char[] target = component.toCharArray();
1746         for (int i = 0; i < target.length; i++) {
1747             if (disallowed.get(target[i])) {
1748                 return false;
1749             }
1750         }
1751         return true;
1752     }
1753 
1754 
1755     /***
1756      * Validate the URI characters within a specific component.
1757      * The component must be performed after escape encoding. Or it doesn't
1758      * include escaped characters.
1759      *
1760      * @param component the characters sequence within the component
1761      * @param generous those characters that are allowed within a component
1762      * @return if true, it's the correct URI character sequence
1763      */
1764     protected boolean validate(char[] component, BitSet generous) {
1765         // validate each component by generous characters
1766         return validate(component, 0, -1, generous);
1767     }
1768 
1769 
1770     /***
1771      * Validate the URI characters within a specific component.
1772      * The component must be performed after escape encoding. Or it doesn't
1773      * include escaped characters.
1774      * <p>
1775      * It's not that much strict, generous.  The strict validation might be 
1776      * performed before being called this method.
1777      *
1778      * @param component the characters sequence within the component
1779      * @param soffset the starting offset of the given component
1780      * @param eoffset the ending offset of the given component
1781      * if -1, it means the length of the component
1782      * @param generous those characters that are allowed within a component
1783      * @return if true, it's the correct URI character sequence
1784      */
1785     protected boolean validate(char[] component, int soffset, int eoffset,
1786             BitSet generous) {
1787         // validate each component by generous characters
1788         if (eoffset == -1) {
1789             eoffset = component.length - 1;
1790         }
1791         for (int i = soffset; i <= eoffset; i++) {
1792             if (!generous.get(component[i])) { 
1793                 return false;
1794             }
1795         }
1796         return true;
1797     }
1798 
1799 
1800     /***
1801      * In order to avoid any possilbity of conflict with non-ASCII characters,
1802      * Parse a URI reference as a <code>String</code> with the character
1803      * encoding of the local system or the document.
1804      * <p>
1805      * The following line is the regular expression for breaking-down a URI
1806      * reference into its components.
1807      * <p><blockquote><pre>
1808      *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1809      *    12            3  4          5       6  7        8 9
1810      * </pre></blockquote><p>
1811      * For example, matching the above expression to
1812      *   http://jakarta.apache.org/ietf/uri/#Related
1813      * results in the following subexpression matches:
1814      * <p><blockquote><pre>
1815      *               $1 = http:
1816      *  scheme    =  $2 = http
1817      *               $3 = //jakarta.apache.org
1818      *  authority =  $4 = jakarta.apache.org
1819      *  path      =  $5 = /ietf/uri/
1820      *               $6 = <undefined>
1821      *  query     =  $7 = <undefined>
1822      *               $8 = #Related
1823      *  fragment  =  $9 = Related
1824      * </pre></blockquote><p>
1825      *
1826      * @param original the original character sequence
1827      * @param escaped <code>true</code> if <code>original</code> is escaped
1828      * @throws URIException If an error occurs.
1829      */
1830     protected void parseUriReference(String original, boolean escaped)
1831         throws URIException {
1832 
1833         // validate and contruct the URI character sequence
1834         if (original == null) {
1835             throw new URIException("URI-Reference required");
1836         }
1837 
1838         /* @
1839          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1840          */
1841         String tmp = original.trim();
1842         
1843         /*
1844          * The length of the string sequence of characters.
1845          * It may not be equal to the length of the byte array.
1846          */
1847         int length = tmp.length();
1848 
1849         /*
1850          * Remove the delimiters like angle brackets around an URI.
1851          */
1852         if (length > 0) {
1853             char[] firstDelimiter = { tmp.charAt(0) };
1854             if (validate(firstDelimiter, delims)) {
1855                 if (length >= 2) {
1856                     char[] lastDelimiter = { tmp.charAt(length - 1) };
1857                     if (validate(lastDelimiter, delims)) {
1858                         tmp = tmp.substring(1, length - 1);
1859                         length = length - 2;
1860                     }
1861                 }
1862             }
1863         }
1864 
1865         /*
1866          * The starting index
1867          */
1868         int from = 0;
1869 
1870         /*
1871          * The test flag whether the URI is started from the path component.
1872          */
1873         boolean isStartedFromPath = false;
1874         int atColon = tmp.indexOf(':');
1875         int atSlash = tmp.indexOf('/');
1876         if (atColon < 0 || (atSlash >= 0 && atSlash < atColon)) {
1877             isStartedFromPath = true;
1878         }
1879 
1880         /*
1881          * <p><blockquote><pre>
1882          *     @@@@@@@@
1883          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1884          * </pre></blockquote><p>
1885          */
1886         int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
1887         if (at == -1) { 
1888             at = 0;
1889         }
1890 
1891         /*
1892          * Parse the scheme.
1893          * <p><blockquote><pre>
1894          *  scheme    =  $2 = http
1895          *              @
1896          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1897          * </pre></blockquote><p>
1898          */
1899         if (at < length && tmp.charAt(at) == ':') {
1900             char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
1901             if (validate(target, scheme)) {
1902                 _scheme = target;
1903             } else {
1904                 throw new URIException("incorrect scheme");
1905             }
1906             from = ++at;
1907         }
1908 
1909         /*
1910          * Parse the authority component.
1911          * <p><blockquote><pre>
1912          *  authority =  $4 = jakarta.apache.org
1913          *                  @@
1914          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1915          * </pre></blockquote><p>
1916          */
1917         // Reset flags
1918         _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
1919         if (0 <= at && at < length && tmp.charAt(at) == '/') {
1920             // Set flag
1921             _is_hier_part = true;
1922             if (at + 2 < length && tmp.charAt(at + 1) == '/') {
1923                 // the temporary index to start the search from
1924                 int next = indexFirstOf(tmp, "/?#", at + 2);
1925                 if (next == -1) {
1926                     next = (tmp.substring(at + 2).length() == 0) ? at + 2 
1927                         : tmp.length();
1928                 }
1929                 parseAuthority(tmp.substring(at + 2, next), escaped);
1930                 from = at = next;
1931                 // Set flag
1932                 _is_net_path = true;
1933             }
1934             if (from == at) {
1935                 // Set flag
1936                 _is_abs_path = true;
1937             }
1938         }
1939 
1940         /*
1941          * Parse the path component.
1942          * <p><blockquote><pre>
1943          *  path      =  $5 = /ietf/uri/
1944          *                                @@@@@@
1945          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1946          * </pre></blockquote><p>
1947          */
1948         if (from < length) {
1949             // rel_path = rel_segment [ abs_path ]
1950             int next = indexFirstOf(tmp, "?#", from);
1951             if (next == -1) {
1952                 next = tmp.length();
1953             }
1954             if (!_is_abs_path) {
1955                 if (!escaped 
1956                     && prevalidate(tmp.substring(from, next), disallowed_rel_path) 
1957                     || escaped 
1958                     && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
1959                     // Set flag
1960                     _is_rel_path = true;
1961                 } else if (!escaped 
1962                     && prevalidate(tmp.substring(from, next), disallowed_opaque_part) 
1963                     || escaped 
1964                     && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
1965                     // Set flag
1966                     _is_opaque_part = true;
1967                 } else {
1968                     // the path component may be empty
1969                     _path = null;
1970                 }
1971             }
1972             if (escaped) {
1973                 setRawPath(tmp.substring(from, next).toCharArray());
1974             } else {
1975                 setPath(tmp.substring(from, next));
1976             }
1977             at = next;
1978         }
1979 
1980         // set the charset to do escape encoding
1981         String charset = getProtocolCharset();
1982 
1983         /*
1984          * Parse the query component.
1985          * <p><blockquote><pre>
1986          *  query     =  $7 = <undefined>
1987          *                                        @@@@@@@@@
1988          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1989          * </pre></blockquote><p>
1990          */
1991         if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
1992             int next = tmp.indexOf('#', at + 1);
1993             if (next == -1) {
1994                 next = tmp.length();
1995             }
1996             _query = (escaped) ? tmp.substring(at + 1, next).toCharArray() 
1997                 : encode(tmp.substring(at + 1, next), allowed_query, charset);
1998             at = next;
1999         }
2000 
2001         /*
2002          * Parse the fragment component.
2003          * <p><blockquote><pre>
2004          *  fragment  =  $9 = Related
2005          *                                                   @@@@@@@@
2006          *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
2007          * </pre></blockquote><p>
2008          */
2009         if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
2010             if (at + 1 == length) { // empty fragment
2011                 _fragment = "".toCharArray();
2012             } else {
2013                 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() 
2014                     : encode(tmp.substring(at + 1), allowed_fragment, charset);
2015             }
2016         }
2017 
2018         // set this URI.
2019         setURI();
2020     }
2021 
2022 
2023     /***
2024      * Get the earlier index that to be searched for the first occurrance in
2025      * one of any of the given string.
2026      *
2027      * @param s the string to be indexed
2028      * @param delims the delimiters used to index
2029      * @return the earlier index if there are delimiters
2030      */
2031     protected int indexFirstOf(String s, String delims) {
2032         return indexFirstOf(s, delims, -1);
2033     }
2034 
2035 
2036     /***
2037      * Get the earlier index that to be searched for the first occurrance in
2038      * one of any of the given string.
2039      *
2040      * @param s the string to be indexed
2041      * @param delims the delimiters used to index
2042      * @param offset the from index
2043      * @return the earlier index if there are delimiters
2044      */
2045     protected int indexFirstOf(String s, String delims, int offset) {
2046         if (s == null || s.length() == 0) {
2047             return -1;
2048         }
2049         if (delims == null || delims.length() == 0) {
2050             return -1;
2051         }
2052         // check boundaries
2053         if (offset < 0) {
2054             offset = 0;
2055         } else if (offset > s.length()) {
2056             return -1;
2057         }
2058         // s is never null
2059         int min = s.length();
2060         char[] delim = delims.toCharArray();
2061         for (int i = 0; i < delim.length; i++) {
2062             int at = s.indexOf(delim[i], offset);
2063             if (at >= 0 && at < min) {
2064                 min = at;
2065             }
2066         }
2067         return (min == s.length()) ? -1 : min;
2068     }
2069 
2070 
2071     /***
2072      * Get the earlier index that to be searched for the first occurrance in
2073      * one of any of the given array.
2074      *
2075      * @param s the character array to be indexed
2076      * @param delim the delimiter used to index
2077      * @return the ealier index if there are a delimiter
2078      */
2079     protected int indexFirstOf(char[] s, char delim) {
2080         return indexFirstOf(s, delim, 0);
2081     }
2082 
2083 
2084     /***
2085      * Get the earlier index that to be searched for the first occurrance in
2086      * one of any of the given array.
2087      *
2088      * @param s the character array to be indexed
2089      * @param delim the delimiter used to index
2090      * @param offset The offset.
2091      * @return the ealier index if there is a delimiter
2092      */
2093     protected int indexFirstOf(char[] s, char delim, int offset) {
2094         if (s == null || s.length == 0) {
2095             return -1;
2096         }
2097         // check boundaries
2098         if (offset < 0) {
2099             offset = 0;
2100         } else if (offset > s.length) {
2101             return -1;
2102         }
2103         for (int i = offset; i < s.length; i++) {
2104             if (s[i] == delim) {
2105                 return i;
2106             }
2107         }
2108         return -1;
2109     }
2110 
2111 
2112     /***
2113      * Parse the authority component.
2114      *
2115      * @param original the original character sequence of authority component
2116      * @param escaped <code>true</code> if <code>original</code> is escaped
2117      * @throws URIException If an error occurs.
2118      */
2119     protected void parseAuthority(String original, boolean escaped)
2120         throws URIException {
2121 
2122         // Reset flags
2123         _is_reg_name = _is_server =
2124         _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2125 
2126         // set the charset to do escape encoding
2127         String charset = getProtocolCharset();
2128 
2129         boolean hasPort = true;
2130         int from = 0;
2131         int next = original.indexOf('@');
2132         if (next != -1) { // neither -1 and 0
2133             // each protocol extented from URI supports the specific userinfo
2134             _userinfo = (escaped) ? original.substring(0, next).toCharArray() 
2135                 : encode(original.substring(0, next), allowed_userinfo,
2136                         charset);
2137             from = next + 1;
2138         }
2139         next = original.indexOf('[', from);
2140         if (next >= from) {
2141             next = original.indexOf(']', from);
2142             if (next == -1) {
2143                 throw new URIException(URIException.PARSING, "IPv6reference");
2144             } else {
2145                 next++;
2146             }
2147             // In IPv6reference, '[', ']' should be excluded
2148             _host = (escaped) ? original.substring(from, next).toCharArray() 
2149                 : encode(original.substring(from, next), allowed_IPv6reference,
2150                         charset);
2151             // Set flag
2152             _is_IPv6reference = true;
2153         } else { // only for !_is_IPv6reference
2154             next = original.indexOf(':', from);
2155             if (next == -1) {
2156                 next = original.length();
2157                 hasPort = false;
2158             }
2159             // REMINDME: it doesn't need the pre-validation
2160             _host = original.substring(from, next).toCharArray();
2161             if (validate(_host, IPv4address)) {
2162                 // Set flag
2163                 _is_IPv4address = true;
2164             } else if (validate(_host, hostname)) {
2165                 // Set flag
2166                 _is_hostname = true;
2167             } else {
2168                 // Set flag
2169                 _is_reg_name = true;
2170             }
2171         }
2172         if (_is_reg_name) {
2173             // Reset flags for a server-based naming authority
2174             _is_server = _is_hostname = _is_IPv4address =
2175             _is_IPv6reference = false;
2176             // set a registry-based naming authority
2177             _authority = (escaped) ? original.toString().toCharArray() 
2178                 : encode(original.toString(), allowed_reg_name, charset);
2179         } else {
2180             if (original.length() - 1 > next && hasPort 
2181                 && original.charAt(next) == ':') { // not empty
2182                 from = next + 1;
2183                 try {
2184                     _port = Integer.parseInt(original.substring(from));
2185                 } catch (NumberFormatException error) {
2186                     throw new URIException(URIException.PARSING,
2187                             "invalid port number");
2188                 }
2189             }
2190             // set a server-based naming authority
2191             StringBuffer buf = new StringBuffer();
2192             if (_userinfo != null) { // has_userinfo
2193                 buf.append(_userinfo);
2194                 buf.append('@');
2195             }
2196             if (_host != null) {
2197                 buf.append(_host);
2198                 if (_port != -1) {
2199                     buf.append(':');
2200                     buf.append(_port);
2201                 }
2202             }
2203             _authority = buf.toString().toCharArray();
2204             // Set flag
2205             _is_server = true;
2206         }
2207     }
2208 
2209 
2210     /***
2211      * Once it's parsed successfully, set this URI.
2212      *
2213      * @see #getRawURI
2214      */
2215     protected void setURI() {
2216         // set _uri
2217         StringBuffer buf = new StringBuffer();
2218         // ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
2219         if (_scheme != null) {
2220             buf.append(_scheme);
2221             buf.append(':');
2222         }
2223         if (_is_net_path) {
2224             buf.append("//");
2225             if (_authority != null) { // has_authority
2226                 if (_userinfo != null) { // by default, remove userinfo part
2227                     if (_host != null) {
2228                         buf.append(_host);
2229                         if (_port != -1) {
2230                             buf.append(':');
2231                             buf.append(_port);
2232                         }
2233                     }
2234                 } else {
2235                     buf.append(_authority);
2236                 }
2237             }
2238         }
2239         if (_opaque != null && _is_opaque_part) {
2240             buf.append(_opaque);
2241         } else if (_path != null) {
2242             // _is_hier_part or _is_relativeURI
2243             if (_path.length != 0) {
2244                 buf.append(_path);
2245             }
2246         }
2247         if (_query != null) { // has_query
2248             buf.append('?');
2249             buf.append(_query);
2250         }
2251         // ignore the fragment identifier
2252         _uri = buf.toString().toCharArray();
2253         hash = 0;
2254     }
2255 
2256     // ----------------------------------------------------------- Test methods
2257   
2258 
2259     /***
2260      * Tell whether or not this URI is absolute.
2261      *
2262      * @return true iif this URI is absoluteURI
2263      */
2264     public boolean isAbsoluteURI() {
2265         return (_scheme != null);
2266     }
2267   
2268 
2269     /***
2270      * Tell whether or not this URI is relative.
2271      *
2272      * @return true iif this URI is relativeURI
2273      */
2274     public boolean isRelativeURI() {
2275         return (_scheme == null);
2276     }
2277 
2278 
2279     /***
2280      * Tell whether or not the absoluteURI of this URI is hier_part.
2281      *
2282      * @return true iif the absoluteURI is hier_part
2283      */
2284     public boolean isHierPart() {
2285         return _is_hier_part;
2286     }
2287 
2288 
2289     /***
2290      * Tell whether or not the absoluteURI of this URI is opaque_part.
2291      *
2292      * @return true iif the absoluteURI is opaque_part
2293      */
2294     public boolean isOpaquePart() {
2295         return _is_opaque_part;
2296     }
2297 
2298 
2299     /***
2300      * Tell whether or not the relativeURI or heir_part of this URI is net_path.
2301      * It's the same function as the has_authority() method.
2302      *
2303      * @return true iif the relativeURI or heir_part is net_path
2304      * @see #hasAuthority
2305      */
2306     public boolean isNetPath() {
2307         return _is_net_path || (_authority != null);
2308     }
2309 
2310 
2311     /***
2312      * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
2313      *
2314      * @return true iif the relativeURI or hier_part is abs_path
2315      */
2316     public boolean isAbsPath() {
2317         return _is_abs_path;
2318     }
2319 
2320 
2321     /***
2322      * Tell whether or not the relativeURI of this URI is rel_path.
2323      *
2324      * @return true iif the relativeURI is rel_path
2325      */
2326     public boolean isRelPath() {
2327         return _is_rel_path;
2328     }
2329 
2330 
2331     /***
2332      * Tell whether or not this URI has authority.
2333      * It's the same function as the is_net_path() method.
2334      *
2335      * @return true iif this URI has authority
2336      * @see #isNetPath
2337      */
2338     public boolean hasAuthority() {
2339         return (_authority != null) || _is_net_path;
2340     }
2341 
2342     /***
2343      * Tell whether or not the authority component of this URI is reg_name.
2344      *
2345      * @return true iif the authority component is reg_name
2346      */
2347     public boolean isRegName() {
2348         return _is_reg_name;
2349     }
2350   
2351 
2352     /***
2353      * Tell whether or not the authority component of this URI is server.
2354      *
2355      * @return true iif the authority component is server
2356      */
2357     public boolean isServer() {
2358         return _is_server;
2359     }
2360   
2361 
2362     /***
2363      * Tell whether or not this URI has userinfo.
2364      *
2365      * @return true iif this URI has userinfo
2366      */
2367     public boolean hasUserinfo() {
2368         return (_userinfo != null);
2369     }
2370   
2371 
2372     /***
2373      * Tell whether or not the host part of this URI is hostname.
2374      *
2375      * @return true iif the host part is hostname
2376      */
2377     public boolean isHostname() {
2378         return _is_hostname;
2379     }
2380 
2381 
2382     /***
2383      * Tell whether or not the host part of this URI is IPv4address.
2384      *
2385      * @return true iif the host part is IPv4address
2386      */
2387     public boolean isIPv4address() {
2388         return _is_IPv4address;
2389     }
2390 
2391 
2392     /***
2393      * Tell whether or not the host part of this URI is IPv6reference.
2394      *
2395      * @return true iif the host part is IPv6reference
2396      */
2397     public boolean isIPv6reference() {
2398         return _is_IPv6reference;
2399     }
2400 
2401 
2402     /***
2403      * Tell whether or not this URI has query.
2404      *
2405      * @return true iif this URI has query
2406      */
2407     public boolean hasQuery() {
2408         return (_query != null);
2409     }
2410    
2411 
2412     /***
2413      * Tell whether or not this URI has fragment.
2414      *
2415      * @return true iif this URI has fragment
2416      */
2417     public boolean hasFragment() {
2418         return (_fragment != null);
2419     }
2420    
2421    
2422     // ---------------------------------------------------------------- Charset
2423 
2424 
2425     /***
2426      * Set the default charset of the protocol.
2427      * <p>
2428      * The character set used to store files SHALL remain a local decision and
2429      * MAY depend on the capability of local operating systems. Prior to the
2430      * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
2431      * and UTF-8 encoded. This approach, while allowing international exchange
2432      * of URIs, will still allow backward compatibility with older systems
2433      * because the code set positions for ASCII characters are identical to the
2434      * one byte sequence in UTF-8.
2435      * <p>
2436      * An individual URI scheme may require a single charset, define a default
2437      * charset, or provide a way to indicate the charset used.
2438      *
2439      * <p>
2440      * Always all the time, the setter method is always succeeded and throws
2441      * <code>DefaultCharsetChanged</code> exception.
2442      *
2443      * So API programmer must follow the following way:
2444      * <code><pre>
2445      *  import org.apache.util.URI$DefaultCharsetChanged;
2446      *      .
2447      *      .
2448      *      .
2449      *  try {
2450      *      URI.setDefaultProtocolCharset("UTF-8");
2451      *  } catch (DefaultCharsetChanged cc) {
2452      *      // CASE 1: the exception could be ignored, when it is set by user
2453      *      if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
2454      *      // CASE 2: let user know the default protocol charset changed
2455      *      } else {
2456      *      // CASE 2: let user know the default document charset changed
2457      *      }
2458      *  }
2459      *  </pre></code>
2460      *
2461      * The API programmer is responsible to set the correct charset.
2462      * And each application should remember its own charset to support.
2463      *
2464      * @param charset the default charset for each protocol
2465      * @throws DefaultCharsetChanged default charset changed
2466      */
2467     public static void setDefaultProtocolCharset(String charset) 
2468         throws DefaultCharsetChanged {
2469             
2470         defaultProtocolCharset = charset;
2471         throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET,
2472                 "the default protocol charset changed");
2473     }
2474 
2475 
2476     /***
2477      * Get the default charset of the protocol.
2478      * <p>
2479      * An individual URI scheme may require a single charset, define a default
2480      * charset, or provide a way to indicate the charset used.
2481      * <p>
2482      * To work globally either requires support of a number of character sets
2483      * and to be able to convert between them, or the use of a single preferred
2484      * character set.
2485      * For support of global compatibility it is STRONGLY RECOMMENDED that
2486      * clients and servers use UTF-8 encoding when exchanging URIs.
2487      *
2488      * @return the default charset string
2489      */
2490     public static String getDefaultProtocolCharset() {
2491         return defaultProtocolCharset;
2492     }
2493 
2494 
2495     /***
2496      * Get the protocol charset used by this current URI instance.
2497      * It was set by the constructor for this instance. If it was not set by
2498      * contructor, it will return the default protocol charset.
2499      *
2500      * @return the protocol charset string
2501      * @see #getDefaultProtocolCharset
2502      */
2503     public String getProtocolCharset() {
2504         return (protocolCharset != null) 
2505             ? protocolCharset 
2506             : defaultProtocolCharset;
2507     }
2508 
2509 
2510     /***
2511      * Set the default charset of the document.
2512      * <p>
2513      * Notice that it will be possible to contain mixed characters (e.g.
2514      * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
2515      * display of these character sets, the protocol charset could be simply
2516      * used again. Because it's not yet implemented that the insertion of BIDI
2517      * control characters at different points during composition is extracted.
2518      * <p>
2519      *
2520      * Always all the time, the setter method is always succeeded and throws
2521      * <code>DefaultCharsetChanged</code> exception.
2522      *
2523      * So API programmer must follow the following way:
2524      * <code><pre>
2525      *  import org.apache.util.URI$DefaultCharsetChanged;
2526      *      .
2527      *      .
2528      *      .
2529      *  try {
2530      *      URI.setDefaultDocumentCharset("EUC-KR");
2531      *  } catch (DefaultCharsetChanged cc) {
2532      *      // CASE 1: the exception could be ignored, when it is set by user
2533      *      if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
2534      *      // CASE 2: let user know the default document charset changed
2535      *      } else {
2536      *      // CASE 2: let user know the default protocol charset changed
2537      *      }
2538      *  }
2539      *  </pre></code>
2540      *
2541      * The API programmer is responsible to set the correct charset.
2542      * And each application should remember its own charset to support.
2543      *
2544      * @param charset the default charset for the document
2545      * @throws DefaultCharsetChanged default charset changed
2546      */
2547     public static void setDefaultDocumentCharset(String charset) 
2548         throws DefaultCharsetChanged {
2549             
2550         defaultDocumentCharset = charset;
2551         throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET,
2552                 "the default document charset changed");
2553     }
2554 
2555 
2556     /***
2557      * Get the recommended default charset of the document.
2558      *
2559      * @return the default charset string
2560      */
2561     public static String getDefaultDocumentCharset() {
2562         return defaultDocumentCharset;
2563     }
2564 
2565 
2566     /***
2567      * Get the default charset of the document by locale.
2568      *
2569      * @return the default charset string by locale
2570      */
2571     public static String getDefaultDocumentCharsetByLocale() {
2572         return defaultDocumentCharsetByLocale;
2573     }
2574 
2575 
2576     /***
2577      * Get the default charset of the document by platform.
2578      *
2579      * @return the default charset string by platform
2580      */
2581     public static String getDefaultDocumentCharsetByPlatform() {
2582         return defaultDocumentCharsetByPlatform;
2583     }
2584 
2585     // ------------------------------------------------------------- The scheme
2586 
2587     /***
2588      * Get the scheme.
2589      *
2590      * @return the scheme
2591      */
2592     public char[] getRawScheme() {
2593         return _scheme;
2594     }
2595 
2596 
2597     /***
2598      * Get the scheme.
2599      *
2600      * @return the scheme
2601      * null if undefined scheme
2602      */
2603     public String getScheme() {
2604         return (_scheme == null) ? null : new String(_scheme);
2605     }
2606 
2607     // ---------------------------------------------------------- The authority
2608 
2609     /***
2610      * Set the authority.  It can be one type of server, hostport, hostname,
2611      * IPv4address, IPv6reference and reg_name.
2612      * <p><blockquote><pre>
2613      *   authority     = server | reg_name
2614      * </pre></blockquote><p>
2615      *
2616      * @param escapedAuthority the raw escaped authority
2617      * @throws URIException If {@link 
2618      * #parseAuthority(java.lang.String,boolean)} fails
2619      * @throws NullPointerException null authority
2620      */
2621     public void setRawAuthority(char[] escapedAuthority) 
2622         throws URIException, NullPointerException {
2623             
2624         parseAuthority(new String(escapedAuthority), true);
2625         setURI();
2626     }
2627 
2628 
2629     /***
2630      * Set the authority.  It can be one type of server, hostport, hostname,
2631      * IPv4address, IPv6reference and reg_name.
2632      * Note that there is no setAuthority method by the escape encoding reason.
2633      *
2634      * @param escapedAuthority the escaped authority string
2635      * @throws URIException If {@link 
2636      * #parseAuthority(java.lang.String,boolean)} fails
2637      */
2638     public void setEscapedAuthority(String escapedAuthority)
2639         throws URIException {
2640 
2641         parseAuthority(escapedAuthority, true);
2642         setURI();
2643     }
2644 
2645 
2646     /***
2647      * Get the raw-escaped authority.
2648      *
2649      * @return the raw-escaped authority
2650      */
2651     public char[] getRawAuthority() {
2652         return _authority;
2653     }
2654 
2655 
2656     /***
2657      * Get the escaped authority.
2658      *
2659      * @return the escaped authority
2660      */
2661     public String getEscapedAuthority() {
2662         return (_authority == null) ? null : new String(_authority);
2663     }
2664 
2665 
2666     /***
2667      * Get the authority.
2668      *
2669      * @return the authority
2670      * @throws URIException If {@link #decode} fails
2671      */
2672     public String getAuthority() throws URIException {
2673         return (_authority == null) ? null : decode(_authority,
2674                 getProtocolCharset());
2675     }
2676 
2677     // ----------------------------------------------------------- The userinfo
2678 
2679     /***
2680      * Get the raw-escaped userinfo.
2681      *
2682      * @return the raw-escaped userinfo
2683      * @see #getAuthority
2684      */
2685     public char[] getRawUserinfo() {
2686         return _userinfo;
2687     }
2688 
2689 
2690     /***
2691      * Get the escaped userinfo.
2692      *
2693      * @return the escaped userinfo
2694      * @see #getAuthority
2695      */
2696     public String getEscapedUserinfo() {
2697         return (_userinfo == null) ? null : new String(_userinfo);
2698     }
2699 
2700 
2701     /***
2702      * Get the userinfo.
2703      *
2704      * @return the userinfo
2705      * @throws URIException If {@link #decode} fails
2706      * @see #getAuthority
2707      */
2708     public String getUserinfo() throws URIException {
2709         return (_userinfo == null) ? null : decode(_userinfo,
2710                 getProtocolCharset());
2711     }
2712 
2713     // --------------------------------------------------------------- The host
2714 
2715     /***
2716      * Get the host.
2717      * <p><blockquote><pre>
2718      *   host          = hostname | IPv4address | IPv6reference
2719      * </pre></blockquote><p>
2720      *
2721      * @return the host
2722      * @see #getAuthority
2723      */
2724     public char[] getRawHost() {
2725         return _host;
2726     }
2727 
2728 
2729     /***
2730      * Get the host.
2731      * <p><blockquote><pre>
2732      *   host          = hostname | IPv4address | IPv6reference
2733      * </pre></blockquote><p>
2734      *
2735      * @return the host
2736      * @throws URIException If {@link #decode} fails
2737      * @see #getAuthority
2738      */
2739     public String getHost() throws URIException {
2740         return decode(_host, getProtocolCharset());
2741     }
2742 
2743     // --------------------------------------------------------------- The port
2744 
2745     /***
2746      * Get the port.  In order to get the specfic default port, the specific
2747      * protocol-supported class extended from the URI class should be used.
2748      * It has the server-based naming authority.
2749      *
2750      * @return the port
2751      * if -1, it has the default port for the scheme or the server-based
2752      * naming authority is not supported in the specific URI.
2753      */
2754     public int getPort() {
2755         return _port;
2756     }
2757 
2758     // --------------------------------------------------------------- The path
2759 
2760     /***
2761      * Set the raw-escaped path.
2762      *
2763      * @param escapedPath the path character sequence
2764      * @throws URIException encoding error or not proper for initial instance
2765      * @see #encode
2766      */
2767     public void setRawPath(char[] escapedPath) throws URIException {
2768         if (escapedPath == null || escapedPath.length == 0) {
2769             _path = _opaque = escapedPath;
2770             setURI();
2771             return;
2772         }
2773         // remove the fragment identifier
2774         escapedPath = removeFragmentIdentifier(escapedPath);
2775         if (_is_net_path || _is_abs_path) {
2776             if (escapedPath[0] != '/') {
2777                 throw new URIException(URIException.PARSING,
2778                         "not absolute path");
2779             }
2780             if (!validate(escapedPath, abs_path)) {
2781                 throw new URIException(URIException.ESCAPING,
2782                         "escaped absolute path not valid");
2783             }
2784             _path = escapedPath;
2785         } else if (_is_rel_path) {
2786             int at = indexFirstOf(escapedPath, '/');
2787             if (at == 0) {
2788                 throw new URIException(URIException.PARSING, "incorrect path");
2789             }
2790             if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment) 
2791                 && !validate(escapedPath, at, -1, abs_path) 
2792                 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) {
2793             
2794                 throw new URIException(URIException.ESCAPING,
2795                         "escaped relative path not valid");
2796             }
2797             _path = escapedPath;
2798         } else if (_is_opaque_part) {
2799             if (!uric_no_slash.get(escapedPath[0]) 
2800                 && !validate(escapedPath, 1, -1, uric)) {
2801                 throw new URIException(URIException.ESCAPING,
2802                     "escaped opaque part not valid");
2803             }
2804             _opaque = escapedPath;
2805         } else {
2806             throw new URIException(URIException.PARSING, "incorrect path");
2807         }
2808         setURI();
2809     }
2810 
2811 
2812     /***
2813      * Set the escaped path.
2814      *
2815      * @param escapedPath the escaped path string
2816      * @throws URIException encoding error or not proper for initial instance
2817      * @see #encode
2818      */
2819     public void setEscapedPath(String escapedPath) throws URIException {
2820         if (escapedPath == null) {
2821             _path = _opaque = null;
2822             setURI();
2823             return;
2824         }
2825         setRawPath(escapedPath.toCharArray());
2826     }
2827 
2828 
2829     /***
2830      * Set the path.
2831      *
2832      * @param path the path string
2833      * @throws URIException set incorrectly or fragment only
2834      * @see #encode
2835      */
2836     public void setPath(String path) throws URIException {
2837 
2838         if (path == null || path.length() == 0) {
2839             _path = _opaque = (path == null) ? null : path.toCharArray();
2840             setURI();
2841             return;
2842         }
2843         // set the charset to do escape encoding
2844         String charset = getProtocolCharset();
2845 
2846         if (_is_net_path || _is_abs_path) {
2847             _path = encode(path, allowed_abs_path, charset);
2848         } else if (_is_rel_path) {
2849             StringBuffer buff = new StringBuffer(path.length());
2850             int at = path.indexOf('/');
2851             if (at == 0) { // never 0
2852                 throw new URIException(URIException.PARSING,
2853                         "incorrect relative path");
2854             }
2855             if (at > 0) {
2856                 buff.append(encode(path.substring(0, at), allowed_rel_path,
2857                             charset));
2858                 buff.append(encode(path.substring(at), allowed_abs_path,
2859                             charset));
2860             } else {
2861                 buff.append(encode(path, allowed_rel_path, charset));
2862             }
2863             _path = buff.toString().toCharArray();
2864         } else if (_is_opaque_part) {
2865             StringBuffer buf = new StringBuffer();
2866             buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset));
2867             buf.insert(1, encode(path.substring(1), uric, charset));
2868             _opaque = buf.toString().toCharArray();
2869         } else {
2870             throw new URIException(URIException.PARSING, "incorrect path");
2871         }
2872         setURI();
2873     }
2874 
2875 
2876     /***
2877      * Resolve the base and relative path.
2878      *
2879      * @param basePath a character array of the basePath
2880      * @param relPath a character array of the relPath
2881      * @return the resolved path
2882      * @throws URIException no more higher path level to be resolved
2883      */
2884     protected char[] resolvePath(char[] basePath, char[] relPath)
2885         throws URIException {
2886 
2887         // REMINDME: paths are never null
2888         String base = (basePath == null) ? "" : new String(basePath);
2889         int at = base.lastIndexOf('/');
2890         if (at != -1) {
2891             basePath = base.substring(0, at + 1).toCharArray();
2892         }
2893         // _path could be empty
2894         if (relPath == null || relPath.length == 0) {
2895             return normalize(basePath);
2896         } else if (relPath[0] == '/') {
2897             return normalize(relPath);
2898         } else {
2899             StringBuffer buff = new StringBuffer(base.length() 
2900                 + relPath.length);
2901             buff.append((at != -1) ? base.substring(0, at + 1) : "/");
2902             buff.append(relPath);
2903             return normalize(buff.toString().toCharArray());
2904         }
2905     }
2906 
2907 
2908     /***
2909      * Get the raw-escaped current hierarchy level in the given path.
2910      * If the last namespace is a collection, the slash mark ('/') should be
2911      * ended with at the last character of the path string.
2912      *
2913      * @param path the path
2914      * @return the current hierarchy level
2915      * @throws URIException no hierarchy level
2916      */
2917     protected char[] getRawCurrentHierPath(char[] path) throws URIException {
2918 
2919         if (_is_opaque_part) {
2920             throw new URIException(URIException.PARSING, "no hierarchy level");
2921         }
2922         if (path == null) {
2923             throw new URIException(URIException.PARSING, "empty path");
2924         }
2925         String buff = new String(path);
2926         int first = buff.indexOf('/');
2927         int last = buff.lastIndexOf('/');
2928         if (last == 0) {
2929             return rootPath;
2930         } else if (first != last && last != -1) {
2931             return buff.substring(0, last).toCharArray();
2932         }
2933         // FIXME: it could be a document on the server side
2934         return path;
2935     }
2936 
2937 
2938     /***
2939      * Get the raw-escaped current hierarchy level.
2940      *
2941      * @return the raw-escaped current hierarchy level
2942      * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2943      */
2944     public char[] getRawCurrentHierPath() throws URIException {
2945         return (_path == null) ? null : getRawCurrentHierPath(_path);
2946     }
2947  
2948 
2949     /***
2950      * Get the escaped current hierarchy level.
2951      *
2952      * @return the escaped current hierarchy level
2953      * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2954      */
2955     public String getEscapedCurrentHierPath() throws URIException {
2956         char[] path = getRawCurrentHierPath();
2957         return (path == null) ? null : new String(path);
2958     }
2959  
2960 
2961     /***
2962      * Get the current hierarchy level.
2963      *
2964      * @return the current hierarchy level
2965      * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2966      * @see #decode
2967      */
2968     public String getCurrentHierPath() throws URIException {
2969         char[] path = getRawCurrentHierPath();
2970         return (path == null) ? null : decode(path, getProtocolCharset());
2971     }
2972 
2973 
2974     /***
2975      * Get the level above the this hierarchy level.
2976      *
2977      * @return the raw above hierarchy level
2978      * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2979      */
2980     public char[] getRawAboveHierPath() throws URIException {
2981         char[] path = getRawCurrentHierPath();
2982         return (path == null) ? null : getRawCurrentHierPath(path);
2983     }
2984 
2985 
2986     /***
2987      * Get the level above the this hierarchy level.
2988      *
2989      * @return the raw above hierarchy level
2990      * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2991      */
2992     public String getEscapedAboveHierPath() throws URIException {
2993         char[] path = getRawAboveHierPath();
2994         return (path == null) ? null : new String(path);
2995     }
2996 
2997 
2998     /***
2999      * Get the level above the this hierarchy level.
3000      *
3001      * @return the above hierarchy level
3002      * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3003      * @see #decode
3004      */
3005     public String getAboveHierPath() throws URIException {
3006         char[] path = getRawAboveHierPath();
3007         return (path == null) ? null : decode(path, getProtocolCharset());
3008     }
3009 
3010 
3011     /***
3012      * Get the raw-escaped path.
3013      * <p><blockquote><pre>
3014      *   path          = [ abs_path | opaque_part ]
3015      * </pre></blockquote><p>
3016      *
3017      * @return the raw-escaped path
3018      */
3019     public char[] getRawPath() {
3020         return _is_opaque_part ? _opaque : _path;
3021     }
3022 
3023 
3024     /***
3025      * Get the escaped path.
3026      * <p><blockquote><pre>
3027      *   path          = [ abs_path | opaque_part ]
3028      *   abs_path      = "/"  path_segments 
3029      *   opaque_part   = uric_no_slash *uric
3030      * </pre></blockquote><p>
3031      *
3032      * @return the escaped path string
3033      */
3034     public String getEscapedPath() {
3035         char[] path = getRawPath();
3036         return (path == null) ? null : new String(path);
3037     }
3038 
3039 
3040     /***
3041      * Get the path.
3042      * <p><blockquote><pre>
3043      *   path          = [ abs_path | opaque_part ]
3044      * </pre></blockquote><p>
3045      * @return the path string
3046      * @throws URIException If {@link #decode} fails.
3047      * @see #decode
3048      */
3049     public String getPath() throws URIException { 
3050         char[] path =  getRawPath();
3051         return (path == null) ? null : decode(path, getProtocolCharset());
3052     }
3053 
3054 
3055     /***
3056      * Get the raw-escaped basename of the path.
3057      *
3058      * @return the raw-escaped basename
3059      */
3060     public char[] getRawName() {
3061         if (_path == null) { 
3062             return null;
3063         }
3064 
3065         int at = 0;
3066         for (int i = _path.length - 1; i >= 0; i--) {
3067             if (_path[i] == '/') {
3068                 at = i + 1;
3069                 break;
3070             }
3071         }
3072         int len = _path.length - at;
3073         char[] basename =  new char[len];
3074         System.arraycopy(_path, at, basename, 0, len);
3075         return basename;
3076     }
3077 
3078 
3079     /***
3080      * Get the escaped basename of the path.
3081      *
3082      * @return the escaped basename string
3083      */
3084     public String getEscapedName() {
3085         char[] basename = getRawName();
3086         return (basename == null) ? null : new String(basename);
3087     }
3088 
3089 
3090     /***
3091      * Get the basename of the path.
3092      *
3093      * @return the basename string
3094      * @throws URIException incomplete trailing escape pattern or unsupported
3095      * character encoding
3096      * @see #decode
3097      */
3098     public String getName() throws URIException {
3099         char[] basename = getRawName();
3100         return (basename == null) ? null : decode(getRawName(),
3101                 getProtocolCharset());
3102     }
3103 
3104     // ----------------------------------------------------- The path and query 
3105 
3106     /***
3107      * Get the raw-escaped path and query.
3108      *
3109      * @return the raw-escaped path and query
3110      */
3111     public char[] getRawPathQuery() {
3112 
3113         if (_path == null && _query == null) {
3114             return null;
3115         }
3116         StringBuffer buff = new StringBuffer();
3117         if (_path != null) {
3118             buff.append(_path);
3119         }
3120         if (_query != null) {
3121             buff.append('?');
3122             buff.append(_query);
3123         }
3124         return buff.toString().toCharArray();
3125     }
3126 
3127 
3128     /***
3129      * Get the escaped query.
3130      *
3131      * @return the escaped path and query string
3132      */
3133     public String getEscapedPathQuery() {
3134         char[] rawPathQuery = getRawPathQuery();
3135         return (rawPathQuery == null) ? null : new String(rawPathQuery);
3136     }
3137 
3138 
3139     /***
3140      * Get the path and query.
3141      *
3142      * @return the path and query string.
3143      * @throws URIException incomplete trailing escape pattern or unsupported
3144      * character encoding
3145      * @see #decode
3146      */
3147     public String getPathQuery() throws URIException {
3148         char[] rawPathQuery = getRawPathQuery();
3149         return (rawPathQuery == null) ? null : decode(rawPathQuery,
3150                 getProtocolCharset());
3151     }
3152 
3153     // -------------------------------------------------------------- The query 
3154 
3155     /***
3156      * Set the raw-escaped query.
3157      *
3158      * @param escapedQuery the raw-escaped query
3159      * @throws URIException escaped query not valid
3160      */
3161     public void setRawQuery(char[] escapedQuery) throws URIException {
3162         if (escapedQuery == null || escapedQuery.length == 0) {
3163             _query = escapedQuery;
3164             setURI();
3165             return;
3166         }
3167         // remove the fragment identifier
3168         escapedQuery = removeFragmentIdentifier(escapedQuery);
3169         if (!validate(escapedQuery, query)) {
3170             throw new URIException(URIException.ESCAPING,
3171                     "escaped query not valid");
3172         }
3173         _query = escapedQuery;
3174         setURI();
3175     }
3176 
3177 
3178     /***
3179      * Set the escaped query string.
3180      *
3181      * @param escapedQuery the escaped query string
3182      * @throws URIException escaped query not valid
3183      */
3184     public void setEscapedQuery(String escapedQuery) throws URIException {
3185         if (escapedQuery == null) {
3186             _query = null;
3187             setURI();
3188             return;
3189         }
3190         setRawQuery(escapedQuery.toCharArray());
3191     }
3192 
3193 
3194     /***
3195      * Set the query.
3196      * <p>
3197      * When a query string is not misunderstood the reserved special characters
3198      * ("&amp;", "=", "+", ",", and "$") within a query component, it is
3199      * recommended to use in encoding the whole query with this method.
3200      * <p>
3201      * The additional APIs for the special purpose using by the reserved
3202      * special characters used in each protocol are implemented in each protocol
3203      * classes inherited from <code>URI</code>.  So refer to the same-named APIs
3204      * implemented in each specific protocol instance.
3205      *
3206      * @param query the query string.
3207      * @throws URIException incomplete trailing escape pattern or unsupported
3208      * character encoding
3209      * @see #encode
3210      */
3211     public void setQuery(String query) throws URIException {
3212         if (query == null || query.length() == 0) {
3213             _query = (query == null) ? null : query.toCharArray();
3214             setURI();
3215             return;
3216         }
3217         setRawQuery(encode(query, allowed_query, getProtocolCharset()));
3218     }
3219 
3220 
3221     /***
3222      * Get the raw-escaped query.
3223      *
3224      * @return the raw-escaped query
3225      */
3226     public char[] getRawQuery() {
3227         return _query;
3228     }
3229 
3230 
3231     /***
3232      * Get the escaped query.
3233      *
3234      * @return the escaped query string
3235      */
3236     public String getEscapedQuery() {
3237         return (_query == null) ? null : new String(_query);
3238     }
3239 
3240 
3241     /***
3242      * Get the query.
3243      *
3244      * @return the query string.
3245      * @throws URIException incomplete trailing escape pattern or unsupported
3246      * character encoding
3247      * @see #decode
3248      */
3249     public String getQuery() throws URIException {
3250         return (_query == null) ? null : decode(_query, getProtocolCharset());
3251     }
3252 
3253     // ----------------------------------------------------------- The fragment 
3254 
3255     /***
3256      * Set the raw-escaped fragment.
3257      *
3258      * @param escapedFragment the raw-escaped fragment
3259      * @throws URIException escaped fragment not valid
3260      */
3261     public void setRawFragment(char[] escapedFragment) throws URIException {
3262         if (escapedFragment == null || escapedFragment.length == 0) {
3263             _fragment = escapedFragment;
3264             hash = 0;
3265             return;
3266         }
3267         if (!validate(escapedFragment, fragment)) {
3268             throw new URIException(URIException.ESCAPING,
3269                     "escaped fragment not valid");
3270         }
3271         _fragment = escapedFragment;
3272         hash = 0;
3273     }
3274 
3275 
3276     /***
3277      * Set the escaped fragment string.
3278      *
3279      * @param escapedFragment the escaped fragment string
3280      * @throws URIException escaped fragment not valid
3281      */
3282     public void setEscapedFragment(String escapedFragment) throws URIException {
3283         if (escapedFragment == null) {
3284             _fragment = null;
3285             hash = 0;
3286             return;
3287         }
3288         setRawFragment(escapedFragment.toCharArray());
3289     }
3290 
3291 
3292     /***
3293      * Set the fragment.
3294      *
3295      * @param fragment the fragment string.
3296      * @throws URIException If an error occurs.
3297      */
3298     public void setFragment(String fragment) throws URIException {
3299         if (fragment == null || fragment.length() == 0) {
3300             _fragment = (fragment == null) ? null : fragment.toCharArray();
3301             hash = 0;
3302             return;
3303         }
3304         _fragment = encode(fragment, allowed_fragment, getProtocolCharset());
3305         hash = 0;
3306     }
3307 
3308 
3309     /***
3310      * Get the raw-escaped fragment.
3311      * <p>
3312      * The optional fragment identifier is not part of a URI, but is often used
3313      * in conjunction with a URI.
3314      * <p>
3315      * The format and interpretation of fragment identifiers is dependent on
3316      * the media type [RFC2046] of the retrieval result.
3317      * <p>
3318      * A fragment identifier is only meaningful when a URI reference is
3319      * intended for retrieval and the result of that retrieval is a document
3320      * for which the identified fragment is consistently defined.
3321      *
3322      * @return the raw-escaped fragment
3323      */
3324     public char[] getRawFragment() {
3325         return _fragment;
3326     }
3327 
3328 
3329     /***
3330      * Get the escaped fragment.
3331      *
3332      * @return the escaped fragment string
3333      */
3334     public String getEscapedFragment() {
3335         return (_fragment == null) ? null : new String(_fragment);
3336     }
3337 
3338 
3339     /***
3340      * Get the fragment.
3341      *
3342      * @return the fragment string
3343      * @throws URIException incomplete trailing escape pattern or unsupported
3344      * character encoding
3345      * @see #decode
3346      */
3347     public String getFragment() throws URIException {
3348         return (_fragment == null) ? null : decode(_fragment,
3349                 getProtocolCharset());
3350     }
3351 
3352     // ------------------------------------------------------------- Utilities 
3353 
3354     /***
3355      * Remove the fragment identifier of the given component.
3356      *
3357      * @param component the component that a fragment may be included
3358      * @return the component that the fragment identifier is removed
3359      */
3360     protected char[] removeFragmentIdentifier(char[] component) {
3361         if (component == null) { 
3362             return null;
3363         }
3364         int lastIndex = new String(component).indexOf('#');
3365         if (lastIndex != -1) {
3366             component = new String(component).substring(0,
3367                     lastIndex).toCharArray();
3368         }
3369         return component;
3370     }
3371 
3372 
3373     /***
3374      * Normalize the given hier path part.
3375      * 
3376      * <p>Algorithm taken from URI reference parser at 
3377      * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
3378      *
3379      * @param path the path to normalize
3380      * @return the normalized path
3381      * @throws URIException no more higher path level to be normalized
3382      */
3383     protected char[] normalize(char[] path) throws URIException {
3384 
3385         if (path == null) { 
3386             return null;
3387         }
3388 
3389         String normalized = new String(path);
3390 
3391         // If the buffer begins with "./" or "../", the "." or ".." is removed.
3392         if (normalized.startsWith("./")) {
3393             normalized = normalized.substring(1);
3394         } else if (normalized.startsWith("../")) {
3395             normalized = normalized.substring(2);
3396         } else if (normalized.startsWith("..")) {
3397             normalized = normalized.substring(2);
3398         }
3399 
3400         // All occurrences of "/./" in the buffer are replaced with "/"
3401         int index = -1;
3402         while ((index = normalized.indexOf("/./")) != -1) {
3403             normalized = normalized.substring(0, index) + normalized.substring(index + 2);
3404         }
3405 
3406         // If the buffer ends with "/.", the "." is removed.
3407         if (normalized.endsWith("/.")) {
3408             normalized = normalized.substring(0, normalized.length() - 1);
3409         }
3410 
3411         int startIndex = 0;
3412 
3413         // All occurrences of "/<segment>/../" in the buffer, where ".."
3414         // and <segment> are complete path segments, are iteratively replaced
3415         // with "/" in order from left to right until no matching pattern remains.
3416         // If the buffer ends with "/<segment>/..", that is also replaced
3417         // with "/".  Note that <segment> may be empty.
3418         while ((index = normalized.indexOf("/../", startIndex)) != -1) {
3419             int slashIndex = normalized.lastIndexOf('/', index - 1);
3420             if (slashIndex >= 0) {
3421                 normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3);
3422             } else {
3423                 startIndex = index + 3;   
3424             }
3425         }
3426         if (normalized.endsWith("/..")) {
3427             int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3428             if (slashIndex >= 0) {
3429                 normalized = normalized.substring(0, slashIndex + 1);
3430             }
3431         }
3432 
3433         // All prefixes of "<segment>/../" in the buffer, where ".."
3434         // and <segment> are complete path segments, are iteratively replaced
3435         // with "/" in order from left to right until no matching pattern remains.
3436         // If the buffer ends with "<segment>/..", that is also replaced
3437         // with "/".  Note that <segment> may be empty.
3438         while ((index = normalized.indexOf("/../")) != -1) {
3439             int slashIndex = normalized.lastIndexOf('/', index - 1);
3440             if (slashIndex >= 0) {
3441                 break;
3442             } else {
3443                 normalized = normalized.substring(index + 3);
3444             }
3445         }
3446         if (normalized.endsWith("/..")) {
3447             int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3448             if (slashIndex < 0) {
3449                 normalized = "/";
3450             }
3451         }
3452 
3453         return normalized.toCharArray();
3454     }
3455 
3456 
3457     /***
3458      * Normalizes the path part of this URI.  Normalization is only meant to be performed on 
3459      * URIs with an absolute path.  Calling this method on a relative path URI will have no
3460      * effect.
3461      *
3462      * @throws URIException no more higher path level to be normalized
3463      * 
3464      * @see #isAbsPath()
3465      */
3466     public void normalize() throws URIException {
3467         if (isAbsPath()) {
3468             _path = normalize(_path);
3469             setURI();
3470         }
3471     }
3472 
3473 
3474     /***
3475      * Test if the first array is equal to the second array.
3476      *
3477      * @param first the first character array
3478      * @param second the second character array
3479      * @return true if they're equal
3480      */
3481     protected boolean equals(char[] first, char[] second) {
3482 
3483         if (first == null && second == null) {
3484             return true;
3485         }
3486         if (first == null || second == null) {
3487             return false;
3488         }
3489         if (first.length != second.length) {
3490             return false;
3491         }
3492         for (int i = 0; i < first.length; i++) {
3493             if (first[i] != second[i]) {
3494                 return false;
3495             }
3496         }
3497         return true;
3498     }
3499 
3500 
3501     /***
3502      * Test an object if this URI is equal to another.
3503      *
3504      * @param obj an object to compare
3505      * @return true if two URI objects are equal
3506      */
3507     public boolean equals(Object obj) {
3508 
3509         // normalize and test each components
3510         if (obj == this) {
3511             return true;
3512         }
3513         if (!(obj instanceof URI)) {
3514             return false;
3515         }
3516         URI another = (URI) obj;
3517         // scheme
3518         if (!equals(_scheme, another._scheme)) {
3519             return false;
3520         }
3521         // is_opaque_part or is_hier_part?  and opaque
3522         if (!equals(_opaque, another._opaque)) {
3523             return false;
3524         }
3525         // is_hier_part
3526         // has_authority
3527         if (!equals(_authority, another._authority)) {
3528             return false;
3529         }
3530         // path
3531         if (!equals(_path, another._path)) {
3532             return false;
3533         }
3534         // has_query
3535         if (!equals(_query, another._query)) {
3536             return false;
3537         }
3538         // has_fragment?  should be careful of the only fragment case.
3539         if (!equals(_fragment, another._fragment)) {
3540             return false;
3541         }
3542         return true;
3543     }
3544 
3545     // ---------------------------------------------------------- Serialization
3546 
3547     /***
3548      * Write the content of this URI.
3549      *
3550      * @param oos the object-output stream
3551      * @throws IOException If an IO problem occurs.
3552      */
3553     protected void writeObject(ObjectOutputStream oos)
3554         throws IOException {
3555 
3556         oos.defaultWriteObject();
3557     }
3558 
3559 
3560     /***
3561      * Read a URI.
3562      *
3563      * @param ois the object-input stream
3564      * @throws ClassNotFoundException If one of the classes specified in the
3565      * input stream cannot be found.
3566      * @throws IOException If an IO problem occurs.
3567      */
3568     protected void readObject(ObjectInputStream ois)
3569         throws ClassNotFoundException, IOException {
3570 
3571         ois.defaultReadObject();
3572     }
3573 
3574     // -------------------------------------------------------------- Hash code
3575 
3576     /***
3577      * Return a hash code for this URI.
3578      *
3579      * @return a has code value for this URI
3580      */
3581     public int hashCode() {
3582         if (hash == 0) {
3583             char[] c = _uri;
3584             if (c != null) {
3585                 for (int i = 0, len = c.length; i < len; i++) {
3586                     hash = 31 * hash + c[i];
3587                 }
3588             }
3589             c = _fragment;
3590             if (c != null) {
3591                 for (int i = 0, len = c.length; i < len; i++) {
3592                     hash = 31 * hash + c[i];
3593                 }
3594             }
3595         }
3596         return hash;
3597     }
3598 
3599     // ------------------------------------------------------------- Comparison 
3600 
3601     /***
3602      * Compare this URI to another object. 
3603      *
3604      * @param obj the object to be compared.
3605      * @return 0, if it's same,
3606      * -1, if failed, first being compared with in the authority component
3607      * @throws ClassCastException not URI argument
3608      */
3609     public int compareTo(Object obj) throws ClassCastException {
3610 
3611         URI another = (URI) obj;
3612         if (!equals(_authority, another.getRawAuthority())) { 
3613             return -1;
3614         }
3615         return toString().compareTo(another.toString());
3616     }
3617 
3618     // ------------------------------------------------------------------ Clone
3619 
3620     /***
3621      * Create and return a copy of this object, the URI-reference containing
3622      * the userinfo component.  Notice that the whole URI-reference including
3623      * the userinfo component counld not be gotten as a <code>String</code>.
3624      * <p>
3625      * To copy the identical <code>URI</code> object including the userinfo
3626      * component, it should be used.
3627      *
3628      * @return a clone of this instance
3629      */
3630     public synchronized Object clone() {
3631 
3632         URI instance = new URI();
3633 
3634         instance._uri = _uri;
3635         instance._scheme = _scheme;
3636         instance._opaque = _opaque;
3637         instance._authority = _authority;
3638         instance._userinfo = _userinfo;
3639         instance._host = _host;
3640         instance._port = _port;
3641         instance._path = _path;
3642         instance._query = _query;
3643         instance._fragment = _fragment;
3644         // the charset to do escape encoding for this instance
3645         instance.protocolCharset = protocolCharset;
3646         // flags
3647         instance._is_hier_part = _is_hier_part;
3648         instance._is_opaque_part = _is_opaque_part;
3649         instance._is_net_path = _is_net_path;
3650         instance._is_abs_path = _is_abs_path;
3651         instance._is_rel_path = _is_rel_path;
3652         instance._is_reg_name = _is_reg_name;
3653         instance._is_server = _is_server;
3654         instance._is_hostname = _is_hostname;
3655         instance._is_IPv4address = _is_IPv4address;
3656         instance._is_IPv6reference = _is_IPv6reference;
3657 
3658         return instance;
3659     }
3660 
3661     // ------------------------------------------------------------ Get the URI
3662 
3663     /***
3664      * It can be gotten the URI character sequence. It's raw-escaped.
3665      * For the purpose of the protocol to be transported, it will be useful.
3666      * <p>
3667      * It is clearly unwise to use a URL that contains a password which is
3668      * intended to be secret. In particular, the use of a password within
3669      * the 'userinfo' component of a URL is strongly disrecommended except
3670      * in those rare cases where the 'password' parameter is intended to be
3671      * public.
3672      * <p>
3673      * When you want to get each part of the userinfo, you need to use the
3674      * specific methods in the specific URL. It depends on the specific URL.
3675      *
3676      * @return the URI character sequence
3677      */
3678     public char[] getRawURI() {
3679         return _uri;
3680     }
3681 
3682 
3683     /***
3684      * It can be gotten the URI character sequence. It's escaped.
3685      * For the purpose of the protocol to be transported, it will be useful.
3686      *
3687      * @return the escaped URI string
3688      */
3689     public String getEscapedURI() {
3690         return (_uri == null) ? null : new String(_uri);
3691     }
3692     
3693 
3694     /***
3695      * It can be gotten the URI character sequence.
3696      *
3697      * @return the original URI string
3698      * @throws URIException incomplete trailing escape pattern or unsupported
3699      * character encoding
3700      * @see #decode
3701      */
3702     public String getURI() throws URIException {
3703         return (_uri == null) ? null : decode(_uri, getProtocolCharset());
3704     }
3705 
3706 
3707     /***
3708      * Get the URI reference character sequence.
3709      *
3710      * @return the URI reference character sequence
3711      */
3712     public char[] getRawURIReference() {
3713         if (_fragment == null) { 
3714             return _uri;
3715         }
3716         if (_uri == null) { 
3717             return _fragment;
3718         }
3719         // if _uri != null &&  _fragment != null
3720         String uriReference = new String(_uri) + "#" + new String(_fragment);
3721         return uriReference.toCharArray();
3722     }
3723 
3724 
3725     /***
3726      * Get the escaped URI reference string.
3727      *
3728      * @return the escaped URI reference string
3729      */
3730     public String getEscapedURIReference() {
3731         char[] uriReference = getRawURIReference();
3732         return (uriReference == null) ? null : new String(uriReference);
3733     }
3734 
3735 
3736     /***
3737      * Get the original URI reference string.
3738      *
3739      * @return the original URI reference string
3740      * @throws URIException If {@link #decode} fails.
3741      */
3742     public String getURIReference() throws URIException {
3743         char[] uriReference = getRawURIReference();
3744         return (uriReference == null) ? null : decode(uriReference,
3745                 getProtocolCharset());
3746     }
3747 
3748 
3749     /***
3750      * Get the escaped URI string.
3751      * <p>
3752      * On the document, the URI-reference form is only used without the userinfo
3753      * component like http://jakarta.apache.org/ by the security reason.
3754      * But the URI-reference form with the userinfo component could be parsed.
3755      * <p>
3756      * In other words, this URI and any its subclasses must not expose the
3757      * URI-reference expression with the userinfo component like
3758      * http://user:password@hostport/restricted_zone.<br>
3759      * It means that the API client programmer should extract each user and
3760      * password to access manually.  Probably it will be supported in the each
3761      * subclass, however, not a whole URI-reference expression.
3762      *
3763      * @return the escaped URI string
3764      * @see #clone()
3765      */
3766     public String toString() {
3767         return getEscapedURI();
3768     }
3769 
3770 
3771     // ------------------------------------------------------------ Inner class
3772 
3773     /*** 
3774      * The charset-changed normal operation to represent to be required to
3775      * alert to user the fact the default charset is changed.
3776      */
3777     public static class DefaultCharsetChanged extends RuntimeException {
3778 
3779         // ------------------------------------------------------- constructors
3780 
3781         /***
3782          * The constructor with a reason string and its code arguments.
3783          *
3784          * @param reasonCode the reason code
3785          * @param reason the reason
3786          */
3787         public DefaultCharsetChanged(int reasonCode, String reason) {
3788             super(reason);
3789             this.reason = reason;
3790             this.reasonCode = reasonCode;
3791         }
3792 
3793         // ---------------------------------------------------------- constants
3794 
3795         /*** No specified reason code. */
3796         public static final int UNKNOWN = 0;
3797 
3798         /*** Protocol charset changed. */
3799         public static final int PROTOCOL_CHARSET = 1;
3800 
3801         /*** Document charset changed. */
3802         public static final int DOCUMENT_CHARSET = 2;
3803 
3804         // ------------------------------------------------- instance variables
3805 
3806         /*** The reason code. */
3807         private int reasonCode;
3808 
3809         /*** The reason message. */
3810         private String reason;
3811 
3812         // ------------------------------------------------------------ methods
3813 
3814         /***
3815          * Get the reason code.
3816          *
3817          * @return the reason code
3818          */
3819         public int getReasonCode() {
3820             return reasonCode;
3821         }
3822 
3823         /***
3824          * Get the reason message.
3825          *
3826          * @return the reason message
3827          */
3828         public String getReason() {
3829             return reason;
3830         }
3831 
3832     }
3833 
3834 
3835     /*** 
3836      * A mapping to determine the (somewhat arbitrarily) preferred charset for a
3837      * given locale.  Supports all locales recognized in JDK 1.1.
3838      * <p>
3839      * The distribution of this class is Servlets.com.    It was originally
3840      * written by Jason Hunter [jhunter at acm.org] and used by with permission.
3841      */
3842     public static class LocaleToCharsetMap {
3843 
3844         /*** A mapping of language code to charset */
3845         private static final Hashtable LOCALE_TO_CHARSET_MAP;
3846         static {
3847             LOCALE_TO_CHARSET_MAP = new Hashtable();
3848             LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
3849             LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
3850             LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
3851             LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
3852             LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
3853             LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
3854             LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
3855             LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
3856             LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
3857             LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
3858             LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
3859             LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
3860             LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
3861             LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
3862             LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
3863             LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
3864             LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
3865             LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
3866             LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
3867             LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
3868             LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
3869             LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
3870             LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
3871             LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
3872             LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
3873             LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
3874             LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
3875             LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
3876             LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
3877             LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
3878             LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
3879             LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
3880             LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
3881             LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
3882             LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
3883             LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
3884             LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
3885             LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
3886             LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
3887         }
3888        
3889         /***
3890          * Get the preferred charset for the given locale.
3891          *
3892          * @param locale the locale
3893          * @return the preferred charset or null if the locale is not
3894          * recognized.
3895          */
3896         public static String getCharset(Locale locale) {
3897             // try for an full name match (may include country)
3898             String charset =
3899                 (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
3900             if (charset != null) { 
3901                 return charset;
3902             }
3903            
3904             // if a full name didn't match, try just the language
3905             charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage());
3906             return charset;  // may be null
3907         }
3908 
3909     }
3910 
3911 }
3912