View Javadoc

1   /*
2    * $Header: /home/cvs/jakarta-commons/httpclient/src/java/org/apache/commons/httpclient/util/URIUtil.java,v 1.21.2.1 2004/02/22 18:21:16 olegk Exp $
3    * $Revision: 1.21.2.1 $
4    * $Date: 2004/02/22 18:21:16 $
5    *
6    * ====================================================================
7    *
8    *  Copyright 2002-2004 The Apache Software Foundation
9    *
10   *  Licensed under the Apache License, Version 2.0 (the "License");
11   *  you may not use this file except in compliance with the License.
12   *  You may obtain a copy of the License at
13   *
14   *      http://www.apache.org/licenses/LICENSE-2.0
15   *
16   *  Unless required by applicable law or agreed to in writing, software
17   *  distributed under the License is distributed on an "AS IS" BASIS,
18   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19   *  See the License for the specific language governing permissions and
20   *  limitations under the License.
21   * ====================================================================
22   *
23   * This software consists of voluntary contributions made by many
24   * individuals on behalf of the Apache Software Foundation.  For more
25   * information on the Apache Software Foundation, please see
26   * <http://www.apache.org/>.
27   *
28   * [Additional notices, if required by prior licensing conditions]
29   *
30   */
31  
32  package org.apache.commons.httpclient.util;
33  
34  import java.io.UnsupportedEncodingException;
35  import java.util.BitSet;
36  import org.apache.commons.httpclient.URI;
37  import org.apache.commons.httpclient.URIException;
38  
39  /***
40   * The URI escape and character encoding and decoding utility.
41   * It's compatible with {@link org.apache.commons.httpclient.HttpURL} rather
42   * than {@link org.apache.commons.httpclient.URI}.
43   *
44   * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
45   * @version $Revision: 1.21.2.1 $ $Date: 2002/03/14 15:14:01 
46   */
47  
48  public class URIUtil {
49  
    // -------------------------------------------------------- Class variables
51  
    /**
     * An allowed-character set with no bits set.  <code>encodeAll</code>
     * passes it to <code>encode</code> as the set of characters that may
     * stay unescaped.
     */
    protected static final BitSet empty = new BitSet(1);
53  
54      // ---------------------------------------------------------- URI utilities
55  
56      /***
57       * Get the basename of an URI.   It's possibly an empty string.
58       *
59       * @param uri a string regarded an URI
60       * @return the basename string; an empty string if the path ends with slash
61       */
62      public static String getName(String uri) {
63          if (uri == null || uri.length() == 0) { return uri; } 
64          String path = URIUtil.getPath(uri);
65          int at = path.lastIndexOf("/");
66          int to = path.length();
67          return (at >= 0) ? path.substring(at + 1, to) : path;
68      }
69  
70  
71      /***
72       * Get the query of an URI.
73       *
74       * @param uri a string regarded an URI
75       * @return the query string; <code>null</code> if empty or undefined
76       */
77      public static String getQuery(String uri) {
78          if (uri == null || uri.length() == 0) { return null; } 
79          // consider of net_path
80          int at = uri.indexOf("//");
81          int from = uri.indexOf(
82              "/", 
83              at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
84          );
85          // the authority part of URI ignored
86          int to = uri.length();
87          // reuse the at and from variables to consider the query
88          at = uri.indexOf("?", from);
89          if (at >= 0) {
90              from = at + 1;
91          } else {
92              return null;
93          }
94          // check the fragment
95          if (uri.lastIndexOf("#") > from) {
96              to = uri.lastIndexOf("#");
97          }
98          // get the path and query.
99          return (from < 0 || from == to) ? null : uri.substring(from, to);
100     }
101 
102 
103     /***
104      * Get the path of an URI.
105      *
106      * @param uri a string regarded an URI
107      * @return the path string
108      */
109     public static String getPath(String uri) {
110         if (uri == null) {
111             return null;
112         } 
113         // consider of net_path
114         int at = uri.indexOf("//");
115         int from = uri.indexOf(
116             "/", 
117             at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
118         );
119         // the authority part of URI ignored 
120         int to = uri.length();
121         // check the query
122         if (uri.indexOf('?', from) != -1) {
123             to = uri.indexOf('?', from);
124         }
125         // check the fragment
126         if (uri.lastIndexOf("#") > from && uri.lastIndexOf("#") < to) {
127             to = uri.lastIndexOf("#");
128         }
129         // get only the path.
130         return (from < 0) ? (at >= 0 ? "/" : uri) : uri.substring(from, to);
131     }
132 
133 
134     /***
135      * Get the path and query of an URI.
136      *
137      * @param uri a string regarded an URI
138      * @return the path and query string
139      */
140     public static String getPathQuery(String uri) {
141         if (uri == null) {
142             return null;
143         } 
144         // consider of net_path
145         int at = uri.indexOf("//");
146         int from = uri.indexOf(
147             "/", 
148             at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
149         );
150         // the authority part of URI ignored
151         int to = uri.length();
152         // Ignore the '?' mark so to ignore the query.
153         // check the fragment
154         if (uri.lastIndexOf("#") > from) {
155             to = uri.lastIndexOf("#");
156         }
157         // get the path and query.
158         return (from < 0) ? (at >= 0 ? "/" : uri) : uri.substring(from, to);
159     }
160 
161 
162     /***
163      * Get the path of an URI and its rest part.
164      *
165      * @param uri a string regarded an URI
166      * @return the string from the path part
167      */
168     public static String getFromPath(String uri) {
169         if (uri == null) {
170             return null;
171         } 
172         // consider of net_path
173         int at = uri.indexOf("//");
174         int from = uri.indexOf(
175             "/", 
176             at >= 0 ? (uri.lastIndexOf("/", at - 1) >= 0 ? 0 : at + 2) : 0
177         );
178         // get the path and its rest.
179         return (from < 0) ? (at >= 0 ? "/" : uri) : uri.substring(from);
180     }
181 
182     // ----------------------------------------------------- Encoding utilities
183 
184     /***
     * Get the all escaped and encoded string with the default protocol charset.
     * It's the same function to use <code>encode(String unescaped, BitSet
     * empty, URI.getDefaultProtocolCharset())</code>.
188      *
189      * @param unescaped an unescaped string
190      * @return the escaped string
191      * 
192      * @throws URIException if the default protocol charset is not supported
193      *
194      * @see URI#getDefaultProtocolCharset
195      * @see #encode
196      */
197     public static String encodeAll(String unescaped) throws URIException {
198         return encodeAll(unescaped, URI.getDefaultProtocolCharset());
199     }
200  
201 
202     /***
203      * Get the all escaped and encoded string with a given charset.
     * It's the same function to use <code>encode(String unescaped, BitSet
     * empty, String charset)</code>.
206      *
207      * @param unescaped an unescaped string
208      * @param charset the charset
209      * @return the escaped string
210      * 
211      * @throws URIException if the charset is not supported
212      * 
213      * @see #encode
214      */
215     public static String encodeAll(String unescaped, String charset)
216         throws URIException {
217 
218         return encode(unescaped, empty, charset);
219     }
220   
221 
222     /***
223      * Escape and encode a string regarded as within the authority component of
224      * an URI with the default protocol charset.
225      * Within the authority component, the characters ";", ":", "@", "?", and
226      * "/" are reserved.
227      *
228      * @param unescaped an unescaped string
229      * @return the escaped string
230      * 
231      * @throws URIException if the default protocol charset is not supported
232      * 
233      * @see URI#getDefaultProtocolCharset
234      * @see #encode
235      */
236     public static String encodeWithinAuthority(String unescaped)
237         throws URIException {
238 
239         return encodeWithinAuthority(unescaped, URI.getDefaultProtocolCharset());
240     }
241 
242 
243     /***
244      * Escape and encode a string regarded as within the authority component of
245      * an URI with a given charset.
246      * Within the authority component, the characters ";", ":", "@", "?", and
247      * "/" are reserved.
248      *
249      * @param unescaped an unescaped string
250      * @param charset the charset
251      * @return the escaped string
252      * 
253      * @throws URIException if the charset is not supported
254      * 
255      * @see #encode
256      */
257     public static String encodeWithinAuthority(String unescaped, String charset)
258         throws URIException {
259 
260         return encode(unescaped, URI.allowed_within_authority, charset);
261     }
262 
263 
264     /***
265      * Escape and encode a string regarded as the path and query components of
266      * an URI with the default protocol charset.
267      *
268      * @param unescaped an unescaped string
269      * @return the escaped string
270      * 
271      * @throws URIException if the default protocol charset is not supported
272      * 
273      * @see URI#getDefaultProtocolCharset
274      * @see #encode
275      */
276     public static String encodePathQuery(String unescaped) throws URIException {
277         return encodePathQuery(unescaped, URI.getDefaultProtocolCharset());
278     }
279 
280 
281     /***
282      * Escape and encode a string regarded as the path and query components of
283      * an URI with a given charset.
284      *
285      * @param unescaped an unescaped string
286      * @param charset the charset
287      * @return the escaped string
288      * 
289      * @throws URIException if the charset is not supported
290      * 
291      * @see #encode
292      */
293     public static String encodePathQuery(String unescaped, String charset)
294         throws URIException {
295 
296         int at = unescaped.indexOf('?');
297         if (at < 0) {
298             return encode(unescaped, URI.allowed_abs_path, charset);
299         }
300         // else
301         return  encode(unescaped.substring(0, at), URI.allowed_abs_path, charset)
302             + '?' + encode(unescaped.substring(at + 1), URI.allowed_query, charset);
303     }
304 
305 
306     /***
307      * Escape and encode a string regarded as within the path component of an
308      * URI with the default protocol charset.
309      * The path may consist of a sequence of path segments separated by a
310      * single slash "/" character.  Within a path segment, the characters
311      * "/", ";", "=", and "?" are reserved.
312      *
313      * @param unescaped an unescaped string
314      * @return the escaped string
315      * 
316      * @throws URIException if the default protocol charset is not supported
317      * 
318      * @see URI#getDefaultProtocolCharset
319      * @see #encode
320      */
321     public static String encodeWithinPath(String unescaped)
322         throws URIException {
323 
324         return encodeWithinPath(unescaped, URI.getDefaultProtocolCharset());
325     }
326 
327 
328     /***
329      * Escape and encode a string regarded as within the path component of an
330      * URI with a given charset.
331      * The path may consist of a sequence of path segments separated by a
332      * single slash "/" character.  Within a path segment, the characters
333      * "/", ";", "=", and "?" are reserved.
334      *
335      * @param unescaped an unescaped string
336      * @param charset the charset
337      * @return the escaped string
338      * 
339      * @throws URIException if the charset is not supported
340      * 
341      * @see #encode
342      */
343     public static String encodeWithinPath(String unescaped, String charset)
344         throws URIException {
345 
346         return encode(unescaped, URI.allowed_within_path, charset);
347     }
348 
349 
350     /***
351      * Escape and encode a string regarded as the path component of an URI with
352      * the default protocol charset.
353      *
354      * @param unescaped an unescaped string
355      * @return the escaped string
356      * 
357      * @throws URIException if the default protocol charset is not supported
358      * 
359      * @see URI#getDefaultProtocolCharset
360      * @see #encode
361      */
362     public static String encodePath(String unescaped) throws URIException {
363         return encodePath(unescaped, URI.getDefaultProtocolCharset());
364     }
365 
366 
367     /***
368      * Escape and encode a string regarded as the path component of an URI with
369      * a given charset.
370      *
371      * @param unescaped an unescaped string
372      * @param charset the charset
373      * @return the escaped string
374      * 
375      * @throws URIException if the charset is not supported
376      * 
377      * @see #encode
378      */
379     public static String encodePath(String unescaped, String charset)
380         throws URIException {
381 
382         return encode(unescaped, URI.allowed_abs_path, charset);
383     }
384 
385 
386     /***
387      * Escape and encode a string regarded as within the query component of an
388      * URI with the default protocol charset.
389      * When a query comprise the name and value pairs, it is used in order
390      * to encode each name and value string.  The reserved special characters
391      * within a query component are being included in encoding the query.
392      *
393      * @param unescaped an unescaped string
394      * @return the escaped string
395      * 
396      * @throws URIException if the default protocol charset is not supported
397      * 
398      * @see URI#getDefaultProtocolCharset
399      * @see #encode
400      */
401     public static String encodeWithinQuery(String unescaped)
402         throws URIException {
403 
404         return encodeWithinQuery(unescaped, URI.getDefaultProtocolCharset());
405     }
406 
407 
408     /***
409      * Escape and encode a string regarded as within the query component of an
410      * URI with a given charset.
411      * When a query comprise the name and value pairs, it is used in order
412      * to encode each name and value string.  The reserved special characters
413      * within a query component are being included in encoding the query.
414      *
415      * @param unescaped an unescaped string
416      * @param charset the charset
417      * @return the escaped string
418      * 
419      * @throws URIException if the charset is not supported
420      * 
421      * @see #encode
422      */
423     public static String encodeWithinQuery(String unescaped, String charset)
424         throws URIException {
425 
426         return encode(unescaped, URI.allowed_within_query, charset);
427     }
428 
429 
430     /***
431      * Escape and encode a string regarded as the query component of an URI with
432      * the default protocol charset.
433      * When a query string is not misunderstood the reserved special characters
434      * ("&amp;", "=", "+", ",", and "$") within a query component, this method
435      * is recommended to use in encoding the whole query.
436      *
437      * @param unescaped an unescaped string
438      * @return the escaped string
439      * 
440      * @throws URIException if the default protocol charset is not supported
441      * 
442      * @see URI#getDefaultProtocolCharset
443      * @see #encode
444      */
445     public static String encodeQuery(String unescaped) throws URIException {
446         return encodeQuery(unescaped, URI.getDefaultProtocolCharset());
447     }
448 
449 
450     /***
451      * Escape and encode a string regarded as the query component of an URI with
452      * a given charset.
453      * When a query string is not misunderstood the reserved special characters
454      * ("&amp;", "=", "+", ",", and "$") within a query component, this method
455      * is recommended to use in encoding the whole query.
456      *
457      * @param unescaped an unescaped string
458      * @param charset the charset
459      * @return the escaped string
460      * 
461      * @throws URIException if the charset is not supported
462      * 
463      * @see #encode
464      */
465     public static String encodeQuery(String unescaped, String charset)
466         throws URIException {
467 
468         return encode(unescaped, URI.allowed_query, charset);
469     }
470 
471 
472     /***
473      * Escape and encode a given string with allowed characters not to be
474      * escaped and the default protocol charset.
475      *
476      * @param unescaped a string
477      * @param allowed allowed characters not to be escaped
478      * @return the escaped string
479      * 
480      * @throws URIException if the default protocol charset is not supported
481      * 
482      * @see URI#getDefaultProtocolCharset
483      * @see Coder#encode
484      */
485     public static String encode(String unescaped, BitSet allowed)
486         throws URIException {
487 
488         return encode(unescaped, allowed, URI.getDefaultProtocolCharset());
489     }
490 
491 
492     /***
493      * Escape and encode a given string with allowed characters not to be
494      * escaped and a given charset.
495      *
496      * @param unescaped a string
497      * @param allowed allowed characters not to be escaped
498      * @param charset the charset
499      * @return the escaped string
500      * 
501      * @throws URIException if the charset is not supported
502      * 
503      * @see Coder#encode
504      */
505     public static String encode(String unescaped, BitSet allowed,
506             String charset) throws URIException {
507 
508         return new String(Coder.encode(unescaped, allowed, charset));
509     }
510 
511 
512     /***
513      * Unescape and decode a given string regarded as an escaped string with the
514      * default protocol charset.
515      *
516      * @param escaped a string
517      * @return the unescaped string
518      * 
519      * @throws URIException if the default protocol charset is not supported
520      * 
521      * @see URI#getDefaultProtocolCharset
522      * @see Coder#decode
523      */
524     public static String decode(String escaped) throws URIException {
525         return Coder.decode(escaped.toCharArray(), URI.getDefaultProtocolCharset());
526     }
527 
528 
529     /***
530      * Unescape and decode a given string regarded as an escaped string.
531      *
532      * @param escaped a string
533      * @param charset the charset
534      * @return the unescaped string
535      * 
536      * @throws URIException if the charset is not supported
537      * 
538      * @see Coder#decode
539      */
540     public static String decode(String escaped, String charset)
541         throws URIException {
542 
543         return Coder.decode(escaped.toCharArray(), charset);
544     }
545 
546     // --------------------------------- transforming a string between charsets
547 
548     /***
549      * Convert a target string to the specified character encoded string with
550      * the default protocol charset.
551      *
552      * @param target a target string
553      * @return the protocol character encoded string
554      * 
555      * @throws URIException if the default protocol charset is not supported
556      * 
557      * @see URI#getDefaultProtocolCharset
558      * 
559      * @deprecated Do not use. To be removed
560      */
561     public static String toProtocolCharset(String target) throws URIException {
562         return toUsingCharset(
563             target, 
564             URI.getDefaultDocumentCharset(), 
565             URI.getDefaultProtocolCharset());
566     }
567 
568 
569     /***
570      * Convert a target string to the specified character encoded string with
571      * a given protocol charset.
572      *
573      * @param target a target string
574      * @param charset the transformed protocol charset
575      * @return the protocol character encoded string
576      * 
577      * @throws URIException if the charset is not supported
578      * 
579      * @deprecated Do not use. To be removed
580      */
581     public static String toProtocolCharset(String target, String charset)
582         throws URIException {
583 
584         return toUsingCharset(target, URI.getDefaultDocumentCharset(), charset);
585     }
586 
587 
588     /***
589      * Convert a target string to the specified character encoded string with
590      * the default document charset.
591      *
592      * @param target a target string
593      * @return the document character encoded string
594      * 
595      * @throws URIException if the default protocol charset is not supported
596      * 
597      * @see URI#getDefaultDocumentCharset
598      * 
599      * @deprecated Do not use. To be removed
600      */
601     public static String toDocumentCharset(String target) throws URIException {
602         return toUsingCharset(target, URI.getDefaultProtocolCharset(),
603                 URI.getDefaultDocumentCharset());
604     }
605 
606 
607     /***
608      * Convert a target string to the specified character encoded string with
609      * a given document charset.
610      *
611      * @param target a target string
612      * @param charset the transformed document charset
613      * @return the document character encoded string
614      * 
615      * @throws URIException if the charset is not supported
616      * 
617      * @deprecated Do not use. To be removed
618      */
619     public static String toDocumentCharset(String target, String charset)
620         throws URIException {
621 
622         return toUsingCharset(target, URI.getDefaultProtocolCharset(), charset);
623     }
624 
625 
626     /***
627      * Convert a target string from the <code>fromCharset</code> charset to
628      * the <code>toCharset</code> charset.
629      * <p>
630      * What if the document charset is ISO-8859-1 and the protocol charset is
631      * UTF-8, when it's read from the document part and is used in the protocol
632      * part, the use of the method will be <code>toUsingCharset(the string,
633      * "ISO-8859-1", "UTF-8")</code>.
634      *
635      * @param target a target string
636      * @param fromCharset the previous charset
637      * @param toCharset the changing charset
638      * @return the document character encoded string
639      * 
640      * @throws URIException if either of the charsets are not supported
641      * 
642      * @deprecated Do not use. To be removed
643      */
644 
645     public static String toUsingCharset(String target, String fromCharset,
646             String toCharset) throws URIException {
647 
648         try {
649             return new String(target.getBytes(fromCharset), toCharset);
650         } catch (UnsupportedEncodingException error) {
651             throw new URIException(URIException.UNSUPPORTED_ENCODING,
652                     error.getMessage());
653         }
654     }
655 
656     // ---------------------------------------------------------- Inner classes
657 
658     /***
659      * The basic and internal utility for URI escape and character encoding and
660      * decoding.
661      */
662     protected static class Coder extends URI {
663 
664         /***
665          * Escape and encode a given string with allowed characters not to be
666          * escaped.
667          *
668          * @param unescapedComponent an unescaped component
669          * @param allowed allowed characters not to be escaped
670          * @param charset the charset to encode
671          * @return the escaped and encoded string
672          * 
673          * @throws URIException if the charset is not supported
674          */
675         public static char[] encode(String unescapedComponent, BitSet allowed, String charset) 
676             throws URIException {
677 
678             return URI.encode(unescapedComponent, allowed, charset);
679         }
680 
681 
682         /***
683          * Unescape and decode a given string.
684          *
685          * @param escapedComponent an being-unescaped component
686          * @param charset the charset to decode
687          * @return the escaped and encoded string
688          * 
689          * @throws URIException if the charset is not supported
690          */
691         public static String decode(char[] escapedComponent, String charset)
692             throws URIException {
693 
694             return URI.decode(escapedComponent, charset);
695         }
696 
697 
698         /***
699          * Verify whether a given string is escaped or not
700          *
701          * @param original given characters
702          * @return true if the given character array is 7 bit ASCII-compatible.
703          */
704         public static boolean verifyEscaped(char[] original) {
705             for (int i = 0; i < original.length; i++) {
706                 int c = original[i];
707                 if (c > 128) {
708                     return false;
709                 } else if (c == '%') {
710                     if (Character.digit(original[++i], 16) == -1 
711                         || Character.digit(original[++i], 16) == -1) {
712                         return false;
713                     }
714                 }
715             }
716             return true;
717         }
718 
719 
720         /***
721          * Replace from a given character to given character in an array order
722          * for a given string.
723          *
724          * @param original a given string
725          * @param from a replacing character array
726          * @param to a replaced character array
727          * @return the replaced string
728          */
729         public static String replace(String original, char[] from, char[] to) {
730             for (int i = from.length; i > 0; --i) {
731                 original = replace(original, from[i], to[i]);
732             }
733             return original.toString();
734         }
735 
736 
737         /***
738          * Replace from a given character to given character for a given string.
739          *
740          * @param original a given string
741          * @param from a replacing character array
742          * @param to a replaced character array
743          * @return the replaced string
744          */
745         public static String replace(String original, char from, char to) {
746             StringBuffer result = new StringBuffer(original.length());
747             int at, saved = 0;
748             do {
749                 at = original.indexOf(from);
750                 if (at >= 0) {
751                     result.append(original.substring(0, at));
752                     result.append(to);
753                 } else {
754                     result.append(original.substring(saved));
755                 }
756                 saved = at;
757             } while (at >= 0);
758             return result.toString();
759         }
760     }
761 
762 }
763