001: /*
002: * ParameterParser.java December 2002
003: *
004: * Copyright (C) 2002, Niall Gallagher <niallg@users.sf.net>
005: *
006: * This library is free software; you can redistribute it and/or
007: * modify it under the terms of the GNU Lesser General Public
008: * License as published by the Free Software Foundation.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General
016: * Public License along with this library; if not, write to the
017: * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018: * Boston, MA 02111-1307 USA
019: */
020:
021: package simple.util.parse;
022:
023: import simple.util.net.Parameters;
024: import java.util.Enumeration;
025: import java.util.Hashtable;
026: import java.net.URLEncoder;
027: import java.util.Set;
028:
029: /**
030: * The <code>ParameterParser</code> is used to parse data encoded in
031: * the <code>application/x-www-form-urlencoded</code> MIME type. It
032: * is also used to parse a query string from a HTTP URL, see RFC 2616.
033: * The parsed parameters are available through the various methods of
034: * the <code>simple.http.net.Parameters</code> interface. The syntax
035: * of the parsed parameters is described below in BNF.
036: * <pre>
037: *
038: * params = *(pair [ "&" params])
039: * pair = name "=" value
040: * name = *(text | escaped)
041: * value = *(text | escaped)
042: * escaped = % HEX HEX
043: *
044: * </pre>
045: * This will consume all data found as a name or value, if the data
046: * is a "+" character then it is replaced with a space character.
047: * This regards only "=", "&", and "%" as having special values.
048: * The "=" character delimits the name from the value and the "&"
049: * delimits the name value pair. The "%" character represents the
050: * start of an escaped sequence, which consists of two hex digits.
051: * All escaped sequences are converted to its character value.
052: *
053: * @author Niall Gallagher
054: */
055: public class ParameterParser extends MapParser implements Parameters {
056:
057: /**
058: * Used to accumulate the characters for the parameter name.
059: */
060: private Token name;
061:
062: /**
063: * Used to accumulate the characters for the parameter value.
064: */
065: private Token value;
066:
067: /**
068: * Constructor for the <code>ParameterParser</code>. This creates
069: * an instance that can be use to parse HTML form data and URL
070: * query strings encoded as application/x-www-form-urlencoded.
071: * The parsed parameters are made available through the interface
072: * <code>simple.util.net.Parameters</code>.
073: */
074: public ParameterParser() {
075: this .name = new Token();
076: this .value = new Token();
077: }
078:
079: /**
080: * Constructor for the <code>ParameterParser</code>. This creates
081: * an instance that can be use to parse HTML form data and URL
082: * query strings encoded as application/x-www-form-urlencoded.
083: * The parsed parameters are made available through the interface
084: * <code>simple.util.net.Parameters</code>.
085: *
086: * @param text this is the text to parse for the parameters
087: */
088: public ParameterParser(String text) {
089: this ();
090: parse(text);
091: }
092:
093: /**
094: * This enumerates the names of every parameter. This enables
095: * the parameter values to be extracted by providing the name
096: * to the <code>getParameter</code> method. The resulting
097: * <code>Enumeration</code> contains string objects.
098: *
099: * @return this returns an <code>Enumeration</code> of names
100: */
101: public Enumeration getParameterNames() {
102: return map.keys();
103: }
104:
105: /**
106: * This extracts a value for the given name. The name issued
107: * to this method must be from the <code>Enumeration</code>
108: * issued. If there is no parameter of this name this will
109: * return a null value. If there are multiple values this
110: * will return the first value.
111: *
112: * @param name the name of the parameter value to retrieve
113: *
114: * @return this returns the first value for the given name
115: */
116: public String getParameter(Object name) {
117: return token(name);
118: }
119:
120: /**
121: * This extracts an integer parameter for the named value.
122: * If the named parameter does not exist this will return
123: * a zero value. If however the parameter exists but is
124: * not in the format of a decimal integer value then this
125: * will throw a <code>NumberFormatException</code>.
126: *
127: * @param name the name of the parameter value to retrieve
128: *
129: * @return this returns the parameter value as an integer
130: *
131: * @throws NumberFormatException if the value is not valid
132: */
133: public int getInteger(Object name) {
134: String value = token(name);
135:
136: if (value != null) {
137: return Integer.parseInt(value);
138: }
139: return 0;
140: }
141:
142: /**
143: * This extracts a float parameter for the named value.
144: * If the named parameter does not exist this will return
145: * a zero value. If however the parameter exists but is
146: * not in the format of a floating point number then this
147: * will throw a <code>NumberFormatException</code>.
148: *
149: * @param name the name of the parameter value to retrieve
150: *
151: * @return this returns the parameter value as a float
152: *
153: * @throws NumberFormatException if the value is not valid
154: */
155: public float getFloat(Object name) {
156: String value = token(name);
157:
158: if (value != null) {
159: return Float.parseFloat(value);
160: }
161: return 0.0f;
162: }
163:
164: /**
165: * This extracts a boolean parameter for the named value.
166: * If the named parameter does not exist this will return
167: * false otherwize the value is evaluated. If it is either
168: * <code>true</code> or <code>false</code> then those
169: * boolean values are returned, otherwize it is false.
170: *
171: * @param name the name of the parameter value to retrieve
172: *
173: * @return this returns the parameter value as an float
174: */
175: public boolean getBoolean(Object name) {
176: Boolean flag = Boolean.FALSE;
177: String value = token(name);
178:
179: if (value != null) {
180: flag = Boolean.valueOf(value);
181: }
182: return flag.booleanValue();
183: }
184:
185: /**
186: * This initializes the parser so that it can be used several
187: * times. This clears any previous parameters extracted. This
188: * ensures that when the next <code>parse(String)</code> is
189: * invoked the status of the <code>Parameters</code> is empty.
190: */
191: protected void init() {
192: map.clear();
193: name.len = 0;
194: value.len = 0;
195: off = 0;
196: }
197:
198: /**
199: * This performs the actual parsing of the parameter text. The
200: * parameters parsed from this are taken as "name=value" pairs.
201: * Multiple pairs within the text are separated by an "&".
202: * This will parse and insert all parameters into a hashtable.
203: */
204: protected void parse() {
205: param();
206: while (skip("&")) {
207: param();
208: }
209: }
210:
211: /**
212: * This method adds the name and value to a map so that the next
213: * name and value can be collected. The name and value are added
214: * to the map as string objects. Once added to the map the
215: * <code>Token</code> objects are set to have zero length so they
216: * can be reused to collect further values. This will add the
217: * values to the map as an array of type string. This is done so
218: * that if there are multiple values that they can be stored.
219: */
220: private void insert() {
221: if (name.len > 0) {
222: insert(name, value);
223: }
224: name.len = 0;
225: value.len = 0;
226: }
227:
228: /**
229: * This will add the given name and value to the parameters map.
230: * If any previous value of the given name has been inserted
231: * into the map then this will overwrite that value. This is
232: * used to ensure that the string value is inserted to the map.
233: *
234: * @param name this is the name of the value to be inserted
235: * @param value this is the value of a that is to be inserted
236: */
237: private void insert(Token name, Token value) {
238: put(name.toString(), value.toString());
239: }
240:
241: /**
242: * This is an expression that is defined by RFC 2396 it is used
243: * in the definition of a segment expression. This is basically
244: * a list of chars with escaped sequences.
245: * <p>
246: * This method has to ensure that no escaped chars go unchecked.
247: * This ensures that the read offset does not go out of bounds
248: * and consequently throw an out of bounds exception.
249: */
250: private void param() {
251: name();
252: if (skip("=")) { /* in case of error*/
253: value();
254: }
255: insert();
256: }
257:
258: /**
259: * This extracts the name of the parameter from the character
260: * buffer. The name of a parameter is defined as a set of
261: * chars including escape sequences. This will extract the
262: * parameter name and buffer the chars. The name ends when a
263: * equals character, "=", is encountered.
264: */
265: private void name() {
266: int mark = off;
267: int pos = off;
268:
269: while (off < count) {
270: if (buf[off] == '%') { /* escaped */
271: escape();
272: } else if (buf[off] == '=') {
273: break;
274: } else if (buf[off] == '+') {
275: buf[off] = ' ';
276: }
277: buf[pos++] = buf[off++];
278: }
279: name.len = pos - mark;
280: name.off = mark;
281: }
282:
283: /**
284: * This extracts a parameter value from a path segment. The
285: * parameter value consists of a sequence of chars and some
286: * escape sequences. The parameter value is buffered so that
287: * the name and values can be paired. The end of the value
288: * is determined as the end of the buffer or an ampersand.
289: */
290: private void value() {
291: int mark = off;
292: int pos = off;
293:
294: while (off < count) {
295: if (buf[off] == '%') { /* escaped */
296: escape();
297: } else if (buf[off] == '+') {
298: buf[off] = ' ';
299: } else if (buf[off] == '&') {
300: break;
301: }
302: buf[pos++] = buf[off++];
303: }
304: value.len = pos - mark;
305: value.off = mark;
306: }
307:
308: /**
309: * This converts an encountered escaped sequence, that is all
310: * embedded hexidecimal characters into a native UCS character
311: * value. This does not take any characters from the stream it
312: * just prepares the buffer with the correct byte. The escaped
313: * sequence within the URI will be interpreded as UTF-8.
314: * <p>
315: * This will leave the next character to read from the buffer
316: * as the character encoded from the URI. If there is a fully
317: * valid escaped sequence, that is <code>"%" HEX HEX</code>.
318: * This decodes the escaped sequence using UTF-8 encoding, all
319: * encoded sequences should be in UCS-2 to fit in a Java char.
320: */
321: private void escape() {
322: int peek = peek(off);
323:
324: if (!unicode(peek)) {
325: binary(peek);
326: }
327: }
328:
329: /**
330: * This method determines, using a peek character, whether the
331: * sequence of escaped characters within the URI is binary data.
332: * If the data within the escaped sequence is binary then this
333: * will ensure that the next character read from the URI is the
334: * binary octet. This is used strictly for backward compatible
335: * parsing of URI strings, binary data should never appear.
336: *
337: * @param peek this is the first escaped character from the URI
338: *
339: * @return currently this implementation always returns true
340: */
341: private boolean binary(int peek) {
342: if (off + 2 < count) {
343: off += 2;
344: buf[off] = bits(peek);
345: }
346: return true;
347: }
348:
349: /**
350: * This method determines, using a peek character, whether the
351: * sequence of escaped characters within the URI is in UTF-8. If
352: * a UTF-8 character can be successfully decoded from the URI it
353: * will be the next character read from the buffer. This can
354: * check for both UCS-2 and UCS-4 characters. However, because
355: * the Java <code>char</code> can only hold UCS-2, the UCS-4
356: * characters will have only the low order octets stored.
357: * <p>
358: * The WWW Consortium provides a reference implementation of a
359: * UTF-8 decoding for Java, in this the low order octets in the
360: * UCS-4 sequence are used for the character. So, in the
361: * absence of a defined behaviour, the W3C behaviour is assumed.
362: *
363: * @param peek this is the first escaped character from the URI
364: *
365: * @return this returns true if a UTF-8 character is decoded
366: */
367: private boolean unicode(int peek) {
368: if ((peek & 0x80) == 0x00) {
369: return unicode(peek, 0);
370: }
371: if ((peek & 0xe0) == 0xc0) {
372: return unicode(peek & 0x1f, 1);
373: }
374: if ((peek & 0xf0) == 0xe0) {
375: return unicode(peek & 0x0f, 2);
376: }
377: if ((peek & 0xf8) == 0xf0) {
378: return unicode(peek & 0x07, 3);
379: }
380: if ((peek & 0xfc) == 0xf8) {
381: return unicode(peek & 0x03, 4);
382: }
383: if ((peek & 0xfe) == 0xfc) {
384: return unicode(peek & 0x01, 5);
385: }
386: return false;
387: }
388:
389: /**
390: * This method will decode the specified amount of escaped
391: * characters from the URI and convert them into a single Java
392: * UCS-2 character. If there are not enough characters within
393: * the URI then this will return false and leave the URI alone.
394: * <p>
395: * The number of characters left is determined from the first
396: * UTF-8 octet, as specified in RFC 2279, and because this is
397: * a URI there must that number of <code>"%" HEX HEX</code>
398: * sequences left. If successful the next character read is
399: * the UTF-8 sequence decoded into a native UCS-2 character.
400: *
401: * @param peek contains the bits read from the first UTF octet
402: * @param more this specifies the number of UTF octets left
403: *
404: * @return this returns true if a UTF-8 character is decoded
405: */
406: private boolean unicode(int peek, int more) {
407: if (off + more * 3 >= count) {
408: return false;
409: }
410: return unicode(peek, more, off);
411: }
412:
413: /**
414: * This will decode the specified amount of trailing UTF-8 bits
415: * from the URI. The trailing bits are those following the first
416: * UTF-8 octet, which specifies the length, in octets, of the
417: * sequence. The trailing octets are of the form 10xxxxxx, for
418: * each of these octets only the last six bits are valid UCS
419: * bits. So a conversion is basically an accumulation of these.
420: * <p>
421: * If at any point during the accumulation of the UTF-8 bits
422: * there is a parsing error, then parsing is aborted an false
423: * is returned, as a result the URI is left unchanged.
424: *
425: * @param peek bytes that have been accumulated fron the URI
426: * @param more this specifies the number of UTF octets left
427: * @param pos this specifies the position the parsing begins
428: *
429: * @return this returns true if a UTF-8 character is decoded
430: */
431: private boolean unicode(int peek, int more, int pos) {
432: while (more-- > 0) {
433: if (buf[pos] == '%') {
434: int next = pos + 3;
435: int hex = peek(next);
436:
437: if ((hex & 0xc0) == 0x80) {
438: peek = (peek << 6) | (hex & 0x3f);
439: pos = next;
440: continue;
441: }
442: }
443: return false;
444: }
445: if (pos + 2 < count) {
446: off = pos + 2;
447: buf[off] = bits(peek);
448: }
449: return true;
450: }
451:
452: /**
453: * Defines behaviour for UCS-2 versus UCS-4 conversion from four
454: * octets. The UTF-8 encoding scheme enables UCS-4 characters to
455: * be encoded and decodeded. However, Java supports the 16-bit
456: * UCS-2 character set, and so the 32-bit UCS-4 character set is
457: * not compatable. This basically decides what to do with UCS-4.
458: *
459: * @param data up to four octets to be converted to UCS-2 format
460: *
461: * @return this returns a native UCS-2 character from the int
462: */
463: private char bits(int data) {
464: return (char) data;
465: }
466:
467: /**
468: * This will return the escape expression specified from the URI
469: * as an integer value of the hexidecimal sequence. This does
470: * not make any changes to the buffer it simply checks to see if
471: * the characters at the position specified are an escaped set
472: * characters of the form <code>"%" HEX HEX</code>, if so, then
473: * it will convert that hexidecimal string in to an integer
474: * value, or -1 if the expression is not hexidecimal.
475: *
476: * @param pos this is the position the expression starts from
477: *
478: * @return the integer value of the hexidecimal expression
479: */
480: private int peek(int pos) {
481: if (buf[pos] == '%') {
482: if (count <= pos + 2) {
483: return -1;
484: }
485: char high = buf[pos + 1];
486: char low = buf[pos + 2];
487:
488: return convert(high, low);
489: }
490: return -1;
491: }
492:
493: /**
494: * This will convert the two hexidecimal characters to a real
495: * integer value, which is returned. This requires characters
496: * within the range of 'A' to 'F' and 'a' to 'f', and also
497: * the digits '0' to '9'. The characters encoded using the
498: * ISO-8859-1 encoding scheme, if the characters are not with
499: * in the range specified then this returns -1.
500: *
501: * @param high this is the high four bits within the integer
502: * @param low this is the low four bits within the integer
503: *
504: * @return this returns the indeger value of the conversion
505: */
506: private int convert(char high, char low) {
507: int hex = 0x00;
508:
509: if (hex(high) && hex(low)) {
510: if ('A' <= high && high <= 'F') {
511: high -= 'A' - 'a';
512: }
513: if (high >= 'a') {
514: hex ^= (high - 'a') + 10;
515: } else {
516: hex ^= high - '0';
517: }
518: hex <<= 4;
519:
520: if ('A' <= low && low <= 'F') {
521: low -= 'A' - 'a';
522: }
523: if (low >= 'a') {
524: hex ^= (low - 'a') + 10;
525: } else {
526: hex ^= low - '0';
527: }
528: return hex;
529: }
530: return -1;
531: }
532:
533: /**
534: * This is used to determine whether a char is a hexadecimal
535: * <code>char</code> or not. A hexadecimal character is considered
536: * to be a character within the range of <code>0 - 9</code> and
537: * between <code>a - f</code> and <code>A - F</code>. This will
538: * return <code>true</code> if the character is in this range.
539: *
540: * @param ch this is the character which is to be determined here
541: *
542: * @return true if the character given has a hexadecimal value
543: */
544: private boolean hex(char ch) {
545: if (ch >= '0' && ch <= '9') {
546: return true;
547: } else if (ch >= 'a' && ch <= 'f') {
548: return true;
549: } else if (ch >= 'A' && ch <= 'F') {
550: return true;
551: }
552: return false;
553: }
554:
555: /**
556: * This <code>encode</code> method will escape the text that
557: * is provided. This is used to that the parameter pairs can
558: * be encoded in such a way that it can be transferred over
559: * HTTP/1.1 using the ISO-8859-1 character set.
560: *
561: * @param text this is the text that is to be escaped
562: *
563: * @return the text with % HEX HEX UTF-8 escape sequences
564: */
565: private String encode(String text) {
566: try {
567: return URLEncoder.encode(text, "UTF-8");
568: } catch (Exception e) {
569: return text;
570: }
571: }
572:
573: /**
574: * This <code>encode</code> method will escape the name=value
575: * pair provided using the UTF-8 character set. This method
576: * will ensure that the parameters are encoded in such a way
577: * that they can be transferred via HTTP in ISO-8859-1.
578: *
579: * @param name this is the name of that is to be escaped
580: * @param value this is the value that is to be escaped
581: *
582: * @return the pair with % HEX HEX UTF-8 escape sequences
583: */
584: private String encode(String name, String value) {
585: return encode(name) + "=" + encode(value);
586: }
587:
588: /**
589: * This <code>toString</code> method is used to compose an string
590: * in the <code>application/x-www-form-urlencoded</code> MIME type.
591: * This will encode the tokens specified in the <code>Set</code>.
592: * Each name=value pair acquired is converted into a UTF-8 escape
593: * sequence so that the parameters can be sent in the IS0-8859-1
594: * format required via the HTTP/1.1 specification RFC 2616.
595: *
596: * @param set this is the set of parameters to be encoded
597: *
598: * @return returns a HTTP parameter encoding for the pairs
599: */
600: public String toString(Set set) {
601: Object[] list = set.toArray();
602: String text = "";
603:
604: for (int i = 0; i < list.length; i++) {
605: String name = list[i].toString();
606: String value = token(name);
607:
608: if (i > 0) {
609: text += "&";
610: }
611: text += encode(name, value);
612: }
613: return text;
614: }
615:
616: /**
617: * This <code>toString</code> method is used to compose an string
618: * in the <code>application/x-www-form-urlencoded</code> MIME type.
619: * This will iterate over all tokens that have been added to this
620: * object, either during parsing, or during use of the instance.
621: * Each name=value pair acquired is converted into a UTF-8 escape
622: * sequence so that the parameters can be sent in the IS0-8859-1
623: * format required via the HTTP/1.1 specification RFC 2616.
624: *
625: * @return returns a HTTP parameter encoding for the pairs
626: */
627: public String toString() {
628: Set set = map.keySet();
629:
630: if (map.size() > 0) {
631: return toString(set);
632: }
633: return "";
634: }
635:
636: /**
637: * This is used to mark regions within the buffer that reoresent
638: * a valid token for either the name of a parameter or its value.
639: * This is used as an alternative to the <code>ParseBuffer</code>
640: * which requires memory to be allocated for storing the data
641: * read from the buffer. This requires only two integer values.
642: */
643: private class Token {
644:
645: /**
646: * This represents the number of characters in the token.
647: */
648: public int len;
649:
650: /**
651: * This represents the start offset within the buffer.
652: */
653: public int off;
654:
655: /**
656: * In order to represent the <code>Token</code> as a value
657: * that can be used this converts it to a <code>String</code>.
658: * If the length of the token is less than or equal to zero
659: * this will return and empty string for the value.
660: *
661: * @return this returns a value representing the token
662: */
663: public String toString() {
664: if (len <= 0) {
665: return "";
666: }
667: return new String(buf, off, len);
668: }
669: }
670: }
|