001: /*
002: * $HeadURL: https://svn.apache.org/repos/asf/httpcomponents/httpcore/tags/4.0-beta1/module-main/src/main/java/org/apache/http/message/BasicTokenIterator.java $
003: * $Revision: 602520 $
004: * $Date: 2007-12-08 18:42:26 +0100 (Sat, 08 Dec 2007) $
005: *
006: * ====================================================================
007: * Licensed to the Apache Software Foundation (ASF) under one
008: * or more contributor license agreements. See the NOTICE file
009: * distributed with this work for additional information
010: * regarding copyright ownership. The ASF licenses this file
011: * to you under the Apache License, Version 2.0 (the
012: * "License"); you may not use this file except in compliance
013: * with the License. You may obtain a copy of the License at
014: *
015: * http://www.apache.org/licenses/LICENSE-2.0
016: *
017: * Unless required by applicable law or agreed to in writing,
018: * software distributed under the License is distributed on an
019: * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
020: * KIND, either express or implied. See the License for the
021: * specific language governing permissions and limitations
022: * under the License.
023: * ====================================================================
024: *
025: * This software consists of voluntary contributions made by many
026: * individuals on behalf of the Apache Software Foundation. For more
027: * information on the Apache Software Foundation, please see
028: * <http://www.apache.org/>.
029: *
030: */
031:
032: package org.apache.http.message;
033:
034: import java.util.NoSuchElementException;
035:
036: import org.apache.http.HeaderIterator;
037: import org.apache.http.ParseException;
038: import org.apache.http.TokenIterator;
039:
040: /**
041: * Basic implementation of a {@link TokenIterator}.
042: * This implementation parses <tt>#token<tt> sequences as
043: * defined by RFC 2616, section 2.
044: * It extends that definition somewhat beyond US-ASCII.
045: *
046: * @version $Revision: 602520 $
047: */
048: public class BasicTokenIterator implements TokenIterator {
049:
050: /** The HTTP separator characters. Defined in RFC 2616, section 2.2. */
051: // the order of the characters here is adjusted to put the
052: // most likely candidates at the beginning of the collection
053: public final static String HTTP_SEPARATORS = " ,;=()<>@:\\\"/[]?{}\t";
054:
055: /** The iterator from which to obtain the next header. */
056: protected final HeaderIterator headerIt;
057:
058: /**
059: * The value of the current header.
060: * This is the header value that includes {@link #currentToken}.
061: * Undefined if the iteration is over.
062: */
063: protected String currentHeader;
064:
065: /**
066: * The token to be returned by the next call to {@link #currentToken}.
067: * <code>null</code> if the iteration is over.
068: */
069: protected String currentToken;
070:
071: /**
072: * The position after {@link #currentToken} in {@link #currentHeader}.
073: * Undefined if the iteration is over.
074: */
075: protected int searchPos;
076:
077: /**
078: * Creates a new instance of {@link BasicTokenIterator}.
079: *
080: * @param headerIterator the iterator for the headers to tokenize
081: */
082: public BasicTokenIterator(final HeaderIterator headerIterator) {
083: if (headerIterator == null) {
084: throw new IllegalArgumentException(
085: "Header iterator must not be null.");
086: }
087:
088: this .headerIt = headerIterator;
089: this .searchPos = findNext(-1);
090: }
091:
092: // non-javadoc, see interface TokenIterator
093: public boolean hasNext() {
094: return (this .currentToken != null);
095: }
096:
097: /**
098: * Obtains the next token from this iteration.
099: *
100: * @return the next token in this iteration
101: *
102: * @throws NoSuchElementException if the iteration is already over
103: * @throws ParseException if an invalid header value is encountered
104: */
105: public String nextToken() throws NoSuchElementException,
106: ParseException {
107:
108: if (this .currentToken == null) {
109: throw new NoSuchElementException(
110: "Iteration already finished.");
111: }
112:
113: final String result = this .currentToken;
114: // updates currentToken, may trigger ParseException:
115: this .searchPos = findNext(this .searchPos);
116:
117: return result;
118: }
119:
120: /**
121: * Returns the next token.
122: * Same as {@link #nextToken}, but with generic return type.
123: *
124: * @return the next token in this iteration
125: *
126: * @throws NoSuchElementException if there are no more tokens
127: * @throws ParseException if an invalid header value is encountered
128: */
129: public final Object next() throws NoSuchElementException,
130: ParseException {
131: return nextToken();
132: }
133:
134: /**
135: * Removing tokens is not supported.
136: *
137: * @throws UnsupportedOperationException always
138: */
139: public final void remove() throws UnsupportedOperationException {
140:
141: throw new UnsupportedOperationException(
142: "Removing tokens is not supported.");
143: }
144:
145: /**
146: * Determines the next token.
147: * If found, the token is stored in {@link #currentToken}.
148: * The return value indicates the position after the token
149: * in {@link #currentHeader}. If necessary, the next header
150: * will be obtained from {@link #headerIt}.
151: * If not found, {@link #currentToken} is set to <code>null</code>.
152: *
153: * @param from the position in the current header at which to
154: * start the search, -1 to search in the first header
155: *
156: * @return the position after the found token in the current header, or
157: * negative if there was no next token
158: *
159: * @throws ParseException if an invalid header value is encountered
160: */
161: protected int findNext(int from) throws ParseException {
162:
163: if (from < 0) {
164: // called from the constructor, initialize the first header
165: if (!this .headerIt.hasNext()) {
166: return -1;
167: }
168: this .currentHeader = this .headerIt.nextHeader().getValue();
169: from = 0;
170: } else {
171: // called after a token, make sure there is a separator
172: from = findTokenSeparator(from);
173: }
174:
175: int start = findTokenStart(from);
176: if (start < 0) {
177: this .currentToken = null;
178: return -1; // nothing found
179: }
180:
181: int end = findTokenEnd(start);
182: this .currentToken = createToken(this .currentHeader, start, end);
183: return end;
184: }
185:
186: /**
187: * Creates a new token to be returned.
188: * Called from {@link #findNext findNext} after the token is identified.
189: * The default implementation simply calls
190: * {@link java.lang.String#substring String.substring}.
191: * <br/>
192: * If header values are significantly longer than tokens, and some
193: * tokens are permanently referenced by the application, there can
194: * be problems with garbage collection. A substring will hold a
195: * reference to the full characters of the original string and
196: * therefore occupies more memory than might be expected.
197: * To avoid this, override this method and create a new string
198: * instead of a substring.
199: *
200: * @param value the full header value from which to create a token
201: * @param start the index of the first token character
202: * @param end the index after the last token character
203: *
204: * @return a string representing the token identified by the arguments
205: */
206: protected String createToken(String value, int start, int end) {
207: return value.substring(start, end);
208: }
209:
210: /**
211: * Determines the starting position of the next token.
212: * This method will iterate over headers if necessary.
213: *
214: * @param from the position in the current header at which to
215: * start the search
216: *
217: * @return the position of the token start in the current header,
218: * negative if no token start could be found
219: */
220: protected int findTokenStart(int from) {
221: if (from < 0) {
222: throw new IllegalArgumentException(
223: "Search position must not be negative: " + from);
224: }
225:
226: boolean found = false;
227: while (!found && (this .currentHeader != null)) {
228:
229: final int to = this .currentHeader.length();
230: while (!found && (from < to)) {
231:
232: final char ch = this .currentHeader.charAt(from);
233: if (isTokenSeparator(ch) || isWhitespace(ch)) {
234: // whitspace and token separators are skipped
235: from++;
236: } else if (isTokenChar(this .currentHeader.charAt(from))) {
237: // found the start of a token
238: found = true;
239: } else {
240: throw new ParseException(
241: "Invalid character before token (pos "
242: + from + "): " + this .currentHeader);
243: }
244: }
245: if (!found) {
246: if (this .headerIt.hasNext()) {
247: this .currentHeader = this .headerIt.nextHeader()
248: .getValue();
249: from = 0;
250: } else {
251: this .currentHeader = null;
252: }
253: }
254: } // while headers
255:
256: return found ? from : -1;
257: }
258:
259: /**
260: * Determines the position of the next token separator.
261: * Because of multi-header joining rules, the end of a
262: * header value is a token separator. This method does
263: * therefore not need to iterate over headers.
264: *
265: * @param from the position in the current header at which to
266: * start the search
267: *
268: * @return the position of a token separator in the current header,
269: * or at the end
270: *
271: * @throws ParseException
272: * if a new token is found before a token separator.
273: * RFC 2616, section 2.1 explicitly requires a comma between
274: * tokens for <tt>#</tt>.
275: */
276: protected int findTokenSeparator(int from) {
277: if (from < 0) {
278: throw new IllegalArgumentException(
279: "Search position must not be negative: " + from);
280: }
281:
282: boolean found = false;
283: final int to = this .currentHeader.length();
284: while (!found && (from < to)) {
285: final char ch = this .currentHeader.charAt(from);
286: if (isTokenSeparator(ch)) {
287: found = true;
288: } else if (isWhitespace(ch)) {
289: from++;
290: } else if (isTokenChar(ch)) {
291: throw new ParseException(
292: "Tokens without separator (pos " + from + "): "
293: + this .currentHeader);
294: } else {
295: throw new ParseException(
296: "Invalid character after token (pos " + from
297: + "): " + this .currentHeader);
298: }
299: }
300:
301: return from;
302: }
303:
304: /**
305: * Determines the ending position of the current token.
306: * This method will not leave the current header value,
307: * since the end of the header value is a token boundary.
308: *
309: * @param from the position of the first character of the token
310: *
311: * @return the position after the last character of the token.
312: * The behavior is undefined if <code>from</code> does not
313: * point to a token character in the current header value.
314: */
315: protected int findTokenEnd(int from) {
316: if (from < 0) {
317: throw new IllegalArgumentException(
318: "Token start position must not be negative: "
319: + from);
320: }
321:
322: final int to = this .currentHeader.length();
323: int end = from + 1;
324: while ((end < to)
325: && isTokenChar(this .currentHeader.charAt(end))) {
326: end++;
327: }
328:
329: return end;
330: }
331:
332: /**
333: * Checks whether a character is a token separator.
334: * RFC 2616, section 2.1 defines comma as the separator for
335: * <tt>#token</tt> sequences. The end of a header value will
336: * also separate tokens, but that is not a character check.
337: *
338: * @param ch the character to check
339: *
340: * @return <code>true</code> if the character is a token separator,
341: * <code>false</code> otherwise
342: */
343: protected boolean isTokenSeparator(char ch) {
344: return (ch == ',');
345: }
346:
347: /**
348: * Checks whether a character is a whitespace character.
349: * RFC 2616, section 2.2 defines space and horizontal tab as whitespace.
350: * The optional preceeding line break is irrelevant, since header
351: * continuation is handled transparently when parsing messages.
352: *
353: * @param ch the character to check
354: *
355: * @return <code>true</code> if the character is whitespace,
356: * <code>false</code> otherwise
357: */
358: protected boolean isWhitespace(char ch) {
359:
360: // we do not use Character.isWhitspace(ch) here, since that allows
361: // many control characters which are not whitespace as per RFC 2616
362: return ((ch == '\t') || Character.isSpaceChar(ch));
363: }
364:
365: /**
366: * Checks whether a character is a valid token character.
367: * Whitespace, control characters, and HTTP separators are not
368: * valid token characters. The HTTP specification (RFC 2616, section 2.2)
369: * defines tokens only for the US-ASCII character set, this
370: * method extends the definition to other character sets.
371: *
372: * @param ch the character to check
373: *
374: * @return <code>true</code> if the character is a valid token start,
375: * <code>false</code> otherwise
376: */
377: protected boolean isTokenChar(char ch) {
378:
379: // common sense extension of ALPHA + DIGIT
380: if (Character.isLetterOrDigit(ch))
381: return true;
382:
383: // common sense extension of CTL
384: if (Character.isISOControl(ch))
385: return false;
386:
387: // no common sense extension for this
388: if (isHttpSeparator(ch))
389: return false;
390:
391: // RFC 2616, section 2.2 defines a token character as
392: // "any CHAR except CTLs or separators". The controls
393: // and separators are included in the checks above.
394: // This will yield unexpected results for Unicode format characters.
395: // If that is a problem, overwrite isHttpSeparator(char) to filter
396: // out the false positives.
397: return true;
398: }
399:
400: /**
401: * Checks whether a character is an HTTP separator.
402: * The implementation in this class checks only for the HTTP separators
403: * defined in RFC 2616, section 2.2. If you need to detect other
404: * separators beyond the US-ASCII character set, override this method.
405: *
406: * @param ch the character to check
407: *
408: * @return <code>true</code> if the character is an HTTP separator
409: */
410: protected boolean isHttpSeparator(char ch) {
411: return (HTTP_SEPARATORS.indexOf(ch) >= 0);
412: }
413:
414: } // class BasicTokenIterator
|