001: /*
002: * CookieParser.java February 2001
003: *
004: * Copyright (C) 2001, Niall Gallagher <niallg@users.sf.net>
005: *
006: * This library is free software; you can redistribute it and/or
007: * modify it under the terms of the GNU Lesser General Public
008: * License as published by the Free Software Foundation.
009: *
010: * This library is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser General Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser General
016: * Public License along with this library; if not, write to the
017: * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018: * Boston, MA 02111-1307 USA
019: */
020:
021: package simple.util.parse;
022:
023: import simple.util.net.Cookie;
024: import simple.util.net.CookieCollection;
025: import java.io.ObjectOutputStream;
026: import java.io.ObjectInputStream;
027: import java.io.Serializable;
028: import java.io.IOException;
029:
030: /**
031: * CookieParser is used to parse the cookie header. The cookie header is
032: * one of the headers that is used by the HTTP state management mechinism.
033: * The Cookie header is the header that is sent from the client to the
034: * server in response to a Set-Cookie header. The syntax of the Cookie
035: * header as taken from RFC 2109, HTTP State Management Mechanism.
036: * <pre>
037: *
038: * cookie = "Cookie:" cookie-version
039: * 1*((";" | ",") cookie-value)
040: * cookie-value = NAME "=" VALUE [";" path] [";" domain]
041: * cookie-version = "$Version" "=" value
042: * NAME = attr
043: * VALUE = value
044: * path = "$Path" "=" value
045: * domain = "$Domain" "=" value
046: *
047: * </pre>
048: * The cookie header may consist of several cookies. Each cookie can be
049: * extracted from the header by examining the it syntax of the cookie
050: * header. The syntax of the cookie header is defined in RFC 2109.
051: * <p>
052: * Each cookie has a <code>$Version</code> attribute followed by multiple
053: * cookies. Each contains a name and a value, followed by an optional
054: * <code>$Path</code> and <code>$Domain</code> attribute. This will parse
055: * a given cookie header and return each cookie extracted as a
056: * <code>Cookie</code> object.
057: * <p>
058: * This implements the <code>CookieCollection</code> that is used to
059: * iterate amongst a collection of <code>Cookie</code> objects. The
060: * <code>CookieCollection</code> can be serialized.
061: *
062: * @author Niall Gallagher
063: */
064: public class CookieParser extends Parser implements CookieCollection {
065:
066: /**
067: * Determines when the <code>Parser</code> has finished.
068: */
069: private transient boolean finished;
070:
071: /**
072: * Used so the <code>Parser</code> does not parse twice.
073: */
074: private transient boolean parsed;
075:
076: /**
077: * Version of the <code>Cookie</code> being parsed.
078: */
079: private transient int version;
080:
081: /**
082: * Used to store the name of the <code>Cookie</code>.
083: */
084: private Token name;
085:
086: /**
087: * Used to store the value of the <code>Cookie</code>.
088: */
089: private Token value;
090:
091: /**
092: * Used to store the <code>$Path</code> values.
093: */
094: private Token path;
095:
096: /**
097: * Used to store the <code>$Domain</code> values.
098: */
099: private Token domain;
100:
101: /**
102: * Create a <code>CookieParser</code> that contains no cookies.
103: * the instance will return <code>false</code> for the
104: * <code>hasMore</code> method. cookies may be parsed using
105: * this instance by using the <code>parse</code> method.
106: */
107: public CookieParser() {
108: this .path = new Token();
109: this .domain = new Token();
110: this .name = new Token();
111: this .value = new Token();
112: this .finished = true;
113: }
114:
115: /**
116: * This is primarily a convineance constructor. This will parse the
117: * <code>String</code> given to extract the cookies. This could be
118: * achived by calling the default no-arg constructor and then using
119: * the instance to invoke the <code>parse</code> method on that
120: * <code>String</code>.
121: *
122: * @param header a <code>String</code> containing a cookie value
123: */
124: public CookieParser(String header) {
125: this ();
126: parse(header);
127: }
128:
129: /**
130: * Resets the cookie and the buffer variables for this
131: * <code>CookieParser</code>. It is used to set the
132: * state of the parser to start parsing a new cookie.
133: */
134: protected void init() {
135: finished = false;
136: parsed = false;
137: version = 0;
138: off = 0;
139: version();
140: }
141:
142: /**
143: * This will extract the next <code>Cookie</code> from the
144: * buffer. If all the characters in the buffer have already
145: * been examined then this method will simply do nothing.
146: * Otherwise this will parse the remainder of the buffer
147: * and (if it follows RFC 2109) produce a <code>Cookie</code>.
148: */
149: protected void parse() {
150: if (!finished) {
151: cookie();
152: parsed = true;
153: }
154: }
155:
156: /**
157: * This is used to skip an arbitrary <code>String</code> within the
158: * <code>char</code> buf. It checks the length of the <code>String</code>
159: * first to ensure that it will not go out of bounds. A comparison
160: * is then made with the buffers contents and the <code>String</code>
161: * if the reigon in the buffer matched the <code>String</code> then the
162: * offset within the buffer is increased by the <code>String</code>'s
163: * length so that it has effectively skipped it.
164: * <p>
165: * This <code>skip</code> method will ignore all of the whitespace text.
166: * This will also skip trailing spaces within the the input text and
167: * all spaces within the source text. For example if the input was
168: * the string "s omete xt" and the source was "some text to skip" then
169: * the result of a skip ignoring spaces would be "to skip" in the
170: * source string, as the trailing spaces are also eaten by this.
171: *
172: * @param text this is the <code>String</code> value to be skipped
173: *
174: * @return true if the <code>String</code> was skipped
175: */
176: protected boolean skip(String text) {
177: int size = text.length();
178: int seek = off;
179: int read = 0;
180:
181: if (off + size > count) {
182: return false;
183: }
184: while (read < size) {
185: char a = text.charAt(read);
186: char b = buf[seek];
187:
188: if (space(b)) {
189: if (++seek >= count) {
190: return false;
191: }
192: } else if (space(a)) {
193: if (++read >= size) {
194: continue;
195: }
196: } else {
197: if (toLower(a) != toLower(b)) {
198: return false;
199: }
200: read++;
201: seek++;
202: }
203: }
204: for (off = seek; off < count; off++) {
205: if (!space(buf[off]))
206: break;
207: }
208: return true;
209: }
210:
211: /**
212: * The <code>writeObject</code> method is used so that
213: * the <code>CookieCollection</code> can be serialized with
214: * minimal effort. To restore the <code>Parser</code> the
215: * <code>parse(String)</code> method can be reinvoked.
216: *
217: * @param out this is the <code>OutputStream</code> that
218: * this object is to be written to
219: *
220: * @exception IOException is thrown if ther is an I/O error
221: */
222: private void writeObject(ObjectOutputStream out) throws IOException {
223: out.defaultWriteObject();
224: out.writeObject(new String(buf, 0, count));
225: }
226:
227: /**
228: * This <code>readObject</code> method is used so that the
229: * <code>CookieCollection</code> can be deserialized with
230: * minimal effort. The <code>Parser</code> is reconstructed
231: * by reading the characters that form the original cookie
232: * header.
233: *
234: * @param in this is the <code>OutputStream</code> that this
235: * <code>CookieCollection</code> will be written to
236: *
237: * @exception IOException thrown if there is an I/O problem
238: * @exception ClassNotFoundException this is not likley
239: */
240: private void readObject(ObjectInputStream in) throws IOException,
241: ClassNotFoundException {
242: in.defaultReadObject();
243: parse((String) in.readObject());
244: }
245:
246: /**
247: * This is used so that the collection of <code>Cookies</code>
248: * can be reiterated. This allows the collection to be reused.
249: * The <code>reset</code> method will invoke the superclasses
250: * <code>init</code> method. This will reinitialize this
251: * <code>Parser</code> so the cookie will be reparsed.
252: */
253: public void reset() {
254: init();
255: parse();
256: }
257:
258: /**
259: * Extracts the next <code>Cookie</code> object from the string
260: * given. This will return <code>null</code> when there are no
261: * more cookies left in the <code>String</code> being parsed.
262: * <p>
263: * To find out when there are no more cookies left use the
264: * <code>hasMore</code> method. This will only set the name,
265: * value, path, domain name version of the <code>cookie</code>
266: * because as of RFC 2109 these are the only attributes a
267: * <code>Cookie</code> may have, the path and domain are
268: * optional.
269: *
270: * @return an initialized <code>Cookie</code> object
271: */
272: public Cookie next() {
273: if (!hasMore()) {
274: return null;
275: }
276: parsed = false;
277: return getCookie();
278: }
279:
280: /**
281: * Creates the <code>Cookie</code> from the token objects. It is
282: * assumed that the <code>Cookie</code> <code>String</code> has
283: * been parsed when this is called. This should only be used after
284: * the <code>parse</code> method has been called.
285: * <p>
286: * If there is no <code>$Domain</code> or <code>$Path</code>
287: * within the <code>Cookie</code> <code>String</code> then the
288: * <code>getDomain</code> and <code>getPath</code> are null.
289: *
290: * @return the <code>Cookie</code> that was just parsed
291: */
292: private Cookie getCookie() {
293: return getCookie(name.toString(), value.toString());
294: }
295:
296: /**
297: * Creates the <code>Cookie</code> from the token objects. It is
298: * assumed that the <code>Cookie</code> <code>String</code> has
299: * been parsed when this is called. This should only be used after
300: * the <code>parse</code> method has been called.
301: * <p>
302: * If there is no <code>$Domain</code> or <code>$Path</code>
303: * within the <code>Cookie</code> <code>String</code> then the
304: * <code>getDomain</code> and <code>getPath</code> are null.
305: *
306: * @param name the name that the <code>Cookie</code> contains
307: * @param value the value that the <code>Cookie</code> contains
308: *
309: * @return the <code>Cookie</code> that was just parsed
310: */
311: private Cookie getCookie(String name, String value) {
312: Cookie cookie = new Cookie(name, value);
313: if (domain.len > 0) {
314: cookie.setDomain(domain.toString());
315: }
316: if (path.len > 0) {
317: cookie.setPath(path.toString());
318: }
319: cookie.setVersion(version);
320: return cookie;
321: }
322:
323: /**
324: * Determine wheather or not there are any <code>Cookie</code>s
325: * left in the <code>String</code>. This will attempt to extract
326: * another <code>Cookie</code> from the <code>String</code> and
327: * cache the result so the <code>next</code> method will produce
328: * this <code>Cookie</code>. If another <code>Cookie</code> cannot
329: * be parsed from the remainer of the <code>String</code> then
330: * this will return <code>false</code> otherwise it will return
331: * <code>true</code>.
332: *
333: * @return true if there are more cookies false otherwise
334: */
335: public boolean hasMore() {
336: if (finished)
337: return false;
338: if (parsed)
339: return true;
340: parse();
341: if (name.len <= 0) {
342: finished = true;
343: return false;
344: }
345: return true;
346:
347: }
348:
349: /**
350: * This is used to parse a <code>Cookie</code> from the buffer
351: * that contains the <code>Cookie</code> values. This will first
352: * try to remove any trailing value after the version/prev
353: * <code>Cookie</code> once this is removed it will extract the
354: * name/value pair from the <code>Cookie</code>. The name and
355: * value of the <code>Cookie</code> will be saved by the name
356: * and value tokens.
357: */
358: private void cookie() {
359: if (!skip(",")) { /* ,|; */
360: skip(";");
361: }
362: name();
363: skip("="); /* = */
364: value();
365: }
366:
367: /**
368: * This initializes the name token and extracts the name of this
369: * <code>Cookie</code>. The offset and length of the name will be
370: * saved in the name token. This will read all <code>char</code>'s
371: * upto but excluding the first '=' <code>char</code> encountered
372: * from the <code>off</code> within the buffer.
373: */
374: private void name() {
375: name.off = off;
376: name.len = 0;
377: while (off < count) {
378: if (buf[off] == '=') {
379: break;
380: }
381: name.len++;
382: off++;
383: }
384: }
385:
386: /**
387: * Used to extract everything found after the <code>NAME '='</code>
388: * within a <code>Cookie</code>. This extracts the <code>Cookie</code>
389: * value the <code>$Path</code> and <code>$Domain</code> attributes
390: * if they exist (i.e. <code>$Path</code> and <code>$Domain</code>
391: * are optional in a cookie see RFC 2109).
392: * <p>
393: * The path method reads the terminal found before it as does the
394: * <code>domain</code> method that is ";$Path" is read as the first
395: * part of the path method. This is because if there is no path the
396: * parser should not read data it does not know belongs to a specific
397: * part of the <code>Cookie</code>.
398: */
399: private void value() {
400: data();
401: path();
402: domain();
403: }
404:
405: /**
406: * This initializes the value token and extracts the value of this
407: * <code>Cookie</code>. The offset and length of the value will be
408: * saved in the value token. This will read all <code>char</code>'s
409: * upto but excluding the first terminal char encountered from the
410: * off within the buffer, or if the value is a literal it will read
411: * a literal from the buffer (literal is any data between quotes
412: * except if the quote is prefixed with a backward slash character
413: * that is '\').
414: */
415: private void data() {
416: value.off = off;
417: value.len = 0;
418: if (off < count && buf[off] == '"') {
419: value.len++;
420: for (off++; off < count;) {
421: value.len++;
422: if (buf[off++] == '"')
423: if (buf[off - 2] != '\\') {
424: break;
425: }
426: }
427: value.len -= 2; /* remove " */
428: value.off++; /* remove " */
429: } else {
430: while (off < count) {
431: if (terminal(buf[off]))
432: break;
433: value.len++;
434: off++;
435: }
436: }
437: }
438:
439: /**
440: * This initializes the path token and extracts the <code>$Path</code>
441: * of this <code>Cookie</code>. The offset and length of the path will
442: * be saved in the path token. This will read all <code>char</code>'s
443: * up to but excluding the first terminal <code>char</code> encountered
444: * from the <code>off</code> within the buffer, or if the value is a
445: * literal it will read a literal from the buffer (literal is any data
446: * between quotes except if the quote is prefixed with a backward slash
447: * character, that is '\').
448: * <p>
449: * This reads the terminal before the <code>$Path</code> so that if
450: * there is no <code>$Path</code> for the <code>Cookie</code> then
451: * the character before it will not be read needlessly.
452: */
453: private void path() {
454: path.len = 0; /* reset */
455: if (skip(";$Path=")) {
456: path.off = off;
457: if (buf[off] == '"') {
458: path.len++;
459: for (off++; off < count;) {
460: path.len++;
461: if (buf[off++] == '"')
462: if (buf[off - 2] != '\\') {
463: break;
464: }
465: }
466: path.len -= 2; /* remove " */
467: path.off++; /* remove " */
468: } else {
469: while (off < count) {
470: if (terminal(buf[off]))
471: break;
472: path.len++;
473: off++;
474: }
475: }
476: }
477: }
478:
479: /**
480: * Initializes the domain token and extracts the <code>$Domain</code>
481: * of this <code>Cookie</code>. The offset and length of the domain
482: * will be saved in the path token. This will read all characters up
483: * to but excluding the first terminal <code>char</code> encountered
484: * from the off within the buffer, or if the value is a literal it
485: * will read a literal from the buffer (literal is any data between
486: * quotes except if the quote is prefixed with a backward slash
487: * character, that is '\').
488: * <p>
489: * This reads the terminal before the <code>$Domain</code> so that
490: * if there is no <code>$Domain</code> for the <code>Cookie</code>
491: * then the character before it will not be read needlessly.
492: */
493: private void domain() {
494: domain.len = 0; /* reset */
495: if (skip(";$Domain=")) {
496: domain.off = off;
497: if (buf[off] == '"') {
498: domain.len++;
499: for (off++; off < count;) {
500: domain.len++;
501: if (buf[off++] == '"')
502: if (buf[off - 2] != '\\') {
503: break;
504: }
505: }
506: domain.len -= 2; /* remove " */
507: domain.off++; /* remove " */
508: } else {
509: while (off < count) {
510: if (terminal(buf[off]))
511: break;
512: domain.len++;
513: off++;
514: }
515: }
516: }
517: }
518:
519: /**
520: * This extracts the <code>$Version</code> of this <code>Cookie</code>.
521: * The version is parsed and converted into a decimal int from the digit
522: * characters that make up a version.
523: * <p>
524: * This will read all digit <code>char</code>'s up to but excluding the
525: * first non digit <code>char</code> that it encounters from the offset
526: * within the buffer, or if the value is a literal it will read a literal
527: * from the buffer (literal is any data between quotes except if the quote
528: * is prefixed with a backward slash character i.e. '\').
529: */
530: private void version() {
531: if (skip("$Version=")) {
532: if (buf[off] == '"') {
533: off++;
534: }
535: while (off < count) {
536: if (!digit(buf[off])) {
537: break;
538: }
539: version *= 10;
540: version += buf[off];
541: version -= '0';
542: off++;
543: }
544: if (buf[off] == '"') {
545: off++;
546: }
547: } else {
548: version = 1;
549: }
550: }
551:
552: /**
553: * This is used to determine if a given iso8859-1 character is
554: * a terminal <code>char</code>. That is either the ';' or ','
555: * characters.
556: *
557: * @param c the character that is to be compared
558: *
559: * @return <code>true</code> if it is a ';' or a ','
560: */
561: private boolean terminal(char c) {
562: return c == ';' || c == ',';
563: }
564:
565: /**
566: * This is a token object that is used to store the offset and
567: * length of a region of chars in the <code>CookieParser.buf</code>
568: * array. The <code>toString</code> method of this token will
569: * produce the <code>String</code> value of the region it
570: * represents.
571: */
572: private class Token implements Serializable {
573:
574: /**
575: * The number of <code>char</code>'s that were consumed.
576: * Declaring these primitives transient means that they
577: * will be given a default value upon deserialization.
578: * The default value for an <code>int</code> is 0.
579: */
580: public transient int len;
581:
582: /**
583: * The offset withing the buffer the token lies.
584: * Declaring these primitives transient means that they
585: * will be given a default value upon deserialization.
586: * The default value for an <code>int</code> is 0.
587: */
588: public transient int off;
589:
590: /**
591: * This converts region within the buffer to a <code>String</code>.
592: * This converts the region only if there is a sufficent length.
593: *
594: * @return the <code>String</code> value of the region
595: */
596: public String toString() {
597: return new String(buf, off, len);
598: }
599: }
600: }
|