001: /* The following code was generated by JFlex 1.4.1 on 12/18/07 9:22 PM */
002:
003: package org.apache.lucene.analysis.standard;
004:
005: /**
006: * Licensed to the Apache Software Foundation (ASF) under one or more
007: * contributor license agreements. See the NOTICE file distributed with
008: * this work for additional information regarding copyright ownership.
009: * The ASF licenses this file to You under the Apache License, Version 2.0
010: * (the "License"); you may not use this file except in compliance with
011: * the License. You may obtain a copy of the License at
012: *
013: * http://www.apache.org/licenses/LICENSE-2.0
014: *
015: * Unless required by applicable law or agreed to in writing, software
016: * distributed under the License is distributed on an "AS IS" BASIS,
017: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
018: * See the License for the specific language governing permissions and
019: * limitations under the License.
020: */
021:
022: import org.apache.lucene.analysis.Token;
023:
024: /**
025: * This class is a scanner generated by
026: * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
027: * on 12/18/07 9:22 PM from the specification file
028: * <tt>/Volumes/User/grantingersoll/projects/lucene/java/lucene-clean/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
029: */
030: class StandardTokenizerImpl {
031:
/** Value returned by getNextToken() when the end of the input has been reached. */
public static final int YYEOF = -1;

/** Initial size, in chars, of the lookahead buffer (see zzBuffer). */
private static final int ZZ_BUFFERSIZE = 16384;

/** Lexical states; YYINITIAL is the only one declared and the one the scanner starts in. */
public static final int YYINITIAL = 0;

/**
 * Packed form of the character-to-character-class table.
 * The string is a sequence of (count, class) char pairs that
 * zzUnpackCMap expands into ZZ_CMAP; it holds 78 pairs (156 chars).
 */
private static final String ZZ_CMAP_PACKED = "\11\0\1\0\1\16\1\0\1\0\1\15\22\0\1\0\5\0\1\3"
    + "\1\1\4\0\1\7\1\5\1\2\1\7\12\11\6\0\1\4\32\10"
    + "\4\0\1\6\1\0\32\10\105\0\27\10\1\0\37\10\1\0\u0568\10"
    + "\12\12\206\10\12\12\u026c\10\12\12\166\10\12\12\166\10\12\12\166\10"
    + "\12\12\166\10\12\12\167\10\11\12\166\10\12\12\166\10\12\12\166\10"
    + "\12\12\340\10\12\12\166\10\12\12\u0166\10\12\12\266\10\u0100\10\u0e00\10"
    + "\u1040\0\u0150\14\140\0\20\14\u0100\0\200\14\200\0\u19c0\14\100\0\u5200\14"
    + "\u0c00\0\u2bb0\13\u2150\0\u0200\14\u0465\0\73\14\75\10\43\0";

/**
 * Translates characters to character classes: ZZ_CMAP[c] is the class of
 * char c. getNextToken() adds this class to a state's row offset to index
 * the transition table.
 */
private static final char[] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
057:
058: /**
059: * Translates DFA states to action switch labels.
060: */
061: private static final int[] ZZ_ACTION = zzUnpackAction();
062:
063: private static final String ZZ_ACTION_PACKED_0 = "\1\0\1\1\4\2\1\3\1\1\6\0\2\2\6\0"
064: + "\1\4\4\5\2\6\2\0\1\7\1\0\1\7\3\5"
065: + "\6\7\3\5\1\10\1\0\1\11\2\0\1\10\1\11"
066: + "\1\0\2\11\2\10\2\5\1\12";
067:
068: private static int[] zzUnpackAction() {
069: int[] result = new int[61];
070: int offset = 0;
071: offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
072: return result;
073: }
074:
075: private static int zzUnpackAction(String packed, int offset,
076: int[] result) {
077: int i = 0; /* index in packed string */
078: int j = offset; /* index in unpacked array */
079: int l = packed.length();
080: while (i < l) {
081: int count = packed.charAt(i++);
082: int value = packed.charAt(i++);
083: do
084: result[j++] = value;
085: while (--count > 0);
086: }
087: return j;
088: }
089:
090: /**
091: * Translates a state to a row index in the transition table
092: */
093: private static final int[] ZZ_ROWMAP = zzUnpackRowMap();
094:
095: private static final String ZZ_ROWMAP_PACKED_0 = "\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"
096: + "\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"
097: + "\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"
098: + "\0\u0159\0\u0168\0\u0177\0\207\0\u0186\0\u0195\0\u01a4\0\u01b3"
099: + "\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"
100: + "\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"
101: + "\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\u02ee\0\u02fd\0\u012c\0\341"
102: + "\0\170\0\u011d\0\u030c\0\u031b\0\u032a";
103:
104: private static int[] zzUnpackRowMap() {
105: int[] result = new int[61];
106: int offset = 0;
107: offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
108: return result;
109: }
110:
111: private static int zzUnpackRowMap(String packed, int offset,
112: int[] result) {
113: int i = 0; /* index in packed string */
114: int j = offset; /* index in unpacked array */
115: int l = packed.length();
116: while (i < l) {
117: int high = packed.charAt(i++) << 16;
118: result[j++] = high | packed.charAt(i++);
119: }
120: return j;
121: }
122:
123: /**
124: * The transition table of the DFA
125: */
126: private static final int[] ZZ_TRANS = zzUnpackTrans();
127:
128: private static final String ZZ_TRANS_PACKED_0 = "\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"
129: + "\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\17"
130: + "\1\4\1\20\1\6\5\0\1\21\1\0\1\22\2\23"
131: + "\1\24\3\4\1\6\4\0\1\11\1\25\1\13\1\14"
132: + "\2\23\1\24\1\20\1\4\1\20\1\6\5\0\1\26"
133: + "\1\0\1\22\2\15\1\16\4\6\21\0\1\2\10\0"
134: + "\1\27\1\0\1\27\14\0\1\30\1\31\1\32\1\33"
135: + "\13\0\1\34\1\0\1\34\14\0\1\35\1\36\1\35"
136: + "\1\36\13\0\1\37\2\40\1\41\13\0\1\16\2\42"
137: + "\5\0\1\11\1\26\1\13\1\14\2\15\1\16\1\17"
138: + "\1\4\1\20\1\6\4\0\1\11\1\21\1\13\1\14"
139: + "\2\23\1\24\1\20\1\4\1\20\1\6\13\0\1\43"
140: + "\2\44\1\45\13\0\4\36\13\0\1\46\2\47\1\50"
141: + "\13\0\1\51\2\52\1\53\13\0\1\54\1\44\1\55"
142: + "\1\45\13\0\1\56\2\31\1\33\4\0\1\11\6\0"
143: + "\1\27\1\0\1\27\6\0\1\57\1\0\1\22\2\60"
144: + "\1\0\1\56\2\31\1\33\5\0\1\61\1\0\1\22"
145: + "\2\62\1\63\3\31\1\33\5\0\1\64\1\0\1\22"
146: + "\2\62\1\63\3\31\1\33\5\0\1\65\1\0\1\22"
147: + "\2\60\1\0\4\33\5\0\1\66\2\0\1\66\2\0"
148: + "\1\35\1\36\1\35\1\36\5\0\1\66\2\0\1\66"
149: + "\2\0\4\36\5\0\1\60\1\0\1\22\2\60\1\0"
150: + "\1\37\2\40\1\41\5\0\1\62\1\0\1\22\2\62"
151: + "\1\63\3\40\1\41\5\0\1\60\1\0\1\22\2\60"
152: + "\1\0\4\41\5\0\1\63\2\0\3\63\3\42\6\0"
153: + "\1\67\1\0\1\22\2\15\1\16\1\43\2\44\1\45"
154: + "\5\0\1\70\1\0\1\22\2\23\1\24\3\44\1\45"
155: + "\5\0\1\67\1\0\1\22\2\15\1\16\4\45\5\0"
156: + "\1\15\1\0\1\22\2\15\1\16\1\46\2\47\1\50"
157: + "\5\0\1\23\1\0\1\22\2\23\1\24\3\47\1\50"
158: + "\5\0\1\15\1\0\1\22\2\15\1\16\4\50\5\0"
159: + "\1\16\2\0\3\16\1\51\2\52\1\53\5\0\1\24"
160: + "\2\0\3\24\3\52\1\53\5\0\1\16\2\0\3\16"
161: + "\4\53\5\0\1\71\1\0\1\22\2\15\1\16\1\43"
162: + "\2\44\1\45\5\0\1\72\1\0\1\22\2\23\1\24"
163: + "\3\44\1\45\5\0\1\65\1\0\1\22\2\60\1\0"
164: + "\1\56\2\31\1\33\13\0\1\73\1\33\1\73\1\33"
165: + "\13\0\4\41\13\0\4\45\13\0\4\50\13\0\4\53"
166: + "\13\0\1\74\1\45\1\74\1\45\13\0\4\33\13\0"
167: + "\4\75\5\0\1\57\1\0\1\22\2\60\1\0\4\33"
168: + "\5\0\1\71\1\0\1\22\2\15\1\16\4\45\5\0"
169: + "\1\66\2\0\1\66\2\0\4\75\3\0";
170:
171: private static int[] zzUnpackTrans() {
172: int[] result = new int[825];
173: int offset = 0;
174: offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
175: return result;
176: }
177:
178: private static int zzUnpackTrans(String packed, int offset,
179: int[] result) {
180: int i = 0; /* index in packed string */
181: int j = offset; /* index in unpacked array */
182: int l = packed.length();
183: while (i < l) {
184: int count = packed.charAt(i++);
185: int value = packed.charAt(i++);
186: value--;
187: do
188: result[j++] = value;
189: while (--count > 0);
190: }
191: return j;
192: }
193:
/* error codes; each one is also the index of its message in ZZ_ERROR_MSG */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;

/*
 * error messages for the codes above, used by zzScanError.
 * NOTE(review): "Unkown" is misspelled, but the text is emitted at runtime
 * and is therefore left untouched here.
 */
private static final String ZZ_ERROR_MSG[] = {
    "Unkown internal scanner error",
    "Error: could not match input",
    "Error: pushback value was too large" };
204:
205: /**
206: * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
207: */
208: private static final int[] ZZ_ATTRIBUTE = zzUnpackAttribute();
209:
210: private static final String ZZ_ATTRIBUTE_PACKED_0 = "\1\0\1\11\4\1\1\11\1\1\6\0\2\1\6\0"
211: + "\7\1\2\0\1\1\1\0\16\1\1\0\1\1\2\0" + "\2\1\1\0\7\1";
212:
213: private static int[] zzUnpackAttribute() {
214: int[] result = new int[61];
215: int offset = 0;
216: offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset,
217: result);
218: return result;
219: }
220:
221: private static int zzUnpackAttribute(String packed, int offset,
222: int[] result) {
223: int i = 0; /* index in packed string */
224: int j = offset; /* index in unpacked array */
225: int l = packed.length();
226: while (i < l) {
227: int count = packed.charAt(i++);
228: int value = packed.charAt(i++);
229: do
230: result[j++] = value;
231: while (--count > 0);
232: }
233: return j;
234: }
235:
/** the input device; replaced by yyreset and closed by yyclose */
private java.io.Reader zzReader;

/** the current state of the DFA while getNextToken() is scanning */
private int zzState;

/** the current lexical state; the DFA state each scan starts from */
private int zzLexicalState = YYINITIAL;

/** this buffer contains the current text to be matched and is
 the source of the yytext() string; zzRefill replaces it with a larger
 array when it is full */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];

/** the text position at the last accepting state; one past the end of the
 current match */
private int zzMarkedPos;

/** the text position at the last state to be included in yytext; in this
 class it is only shifted by zzRefill and cleared by yyreset */
private int zzPushbackPos;

/** the current text position in the buffer */
private int zzCurrentPos;

/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;

/** endRead marks the end of the valid data in the buffer: one past the last
 character that has been read from input */
private int zzEndRead;

/** number of newlines encountered up to the start of the matched text;
 in this class it is only ever reset (by yyreset), never advanced */
private int yyline;

/** the number of characters up to the start of the matched text */
private int yychar;

/**
 * the number of characters from the last newline up to the start of the
 * matched text; in this class it is only ever reset (by yyreset)
 */
private int yycolumn;

/**
 * zzAtBOL == true <=> the scanner is currently at the beginning of a line
 */
private boolean zzAtBOL = true;

/** zzAtEOF == true <=> the scanner is at the EOF */
private boolean zzAtEOF;
284:
/* user code: */

/*
 * Token type codes returned by getNextToken(). Each value is also the
 * index of the type's display name in TOKEN_TYPES.
 */
public static final int ALPHANUM = 0;
public static final int APOSTROPHE = 1;
public static final int ACRONYM = 2;
public static final int COMPANY = 3;
public static final int EMAIL = 4;
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
/**
 * @deprecated this solves a bug where HOSTs that end with '.' are identified
 * as ACRONYMs. It is deprecated and will be removed in the next
 * release.
 */
public static final int ACRONYM_DEP = 8;

/** Display names of the token types, indexed by the type codes above. */
public static final String[] TOKEN_TYPES = new String[] {
    "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>",
    "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>" };
305:
306: public final int yychar() {
307: return yychar;
308: }
309:
310: /**
311: * Fills Lucene token with the current token text.
312: */
313: final void getText(Token t) {
314: t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos
315: - zzStartRead);
316: }
317:
/**
 * Creates a scanner that reads its input from a character stream.
 * A java.io.InputStream variant of this constructor also exists.
 *
 * @param in the java.io.Reader to read input from.
 */
StandardTokenizerImpl(java.io.Reader in) {
  zzReader = in;
}

/**
 * Creates a scanner that reads its input from a byte stream.
 * The stream is wrapped in a java.io.InputStreamReader created without an
 * explicit charset, so bytes are decoded with the platform default.
 * A java.io.Reader variant of this constructor also exists.
 *
 * @param in the java.io.InputStream to read input from.
 */
StandardTokenizerImpl(java.io.InputStream in) {
  this(new java.io.InputStreamReader(in));
}
337:
338: /**
339: * Unpacks the compressed character translation table.
340: *
341: * @param packed the packed character translation table
342: * @return the unpacked character translation table
343: */
344: private static char[] zzUnpackCMap(String packed) {
345: char[] map = new char[0x10000];
346: int i = 0; /* index in packed string */
347: int j = 0; /* index in unpacked array */
348: while (i < 156) {
349: int count = packed.charAt(i++);
350: char value = packed.charAt(i++);
351: do
352: map[j++] = value;
353: while (--count > 0);
354: }
355: return map;
356: }
357:
358: /**
359: * Refills the input buffer.
360: *
361: * @return <code>false</code>, iff there was new input.
362: *
363: * @exception java.io.IOException if any I/O-Error occurs
364: */
365: private boolean zzRefill() throws java.io.IOException {
366:
367: /* first: make room (if you can) */
368: if (zzStartRead > 0) {
369: System.arraycopy(zzBuffer, zzStartRead, zzBuffer, 0,
370: zzEndRead - zzStartRead);
371:
372: /* translate stored positions */
373: zzEndRead -= zzStartRead;
374: zzCurrentPos -= zzStartRead;
375: zzMarkedPos -= zzStartRead;
376: zzPushbackPos -= zzStartRead;
377: zzStartRead = 0;
378: }
379:
380: /* is the buffer big enough? */
381: if (zzCurrentPos >= zzBuffer.length) {
382: /* if not: blow it up */
383: char newBuffer[] = new char[zzCurrentPos * 2];
384: System
385: .arraycopy(zzBuffer, 0, newBuffer, 0,
386: zzBuffer.length);
387: zzBuffer = newBuffer;
388: }
389:
390: /* finally: fill the buffer with new input */
391: int numRead = zzReader.read(zzBuffer, zzEndRead,
392: zzBuffer.length - zzEndRead);
393:
394: if (numRead < 0) {
395: return true;
396: } else {
397: zzEndRead += numRead;
398: return false;
399: }
400: }
401:
402: /**
403: * Closes the input stream.
404: */
405: public final void yyclose() throws java.io.IOException {
406: zzAtEOF = true; /* indicate end of file */
407: zzEndRead = zzStartRead; /* invalidate buffer */
408:
409: if (zzReader != null)
410: zzReader.close();
411: }
412:
413: /**
414: * Resets the scanner to read from a new input stream.
415: * Does not close the old reader.
416: *
417: * All internal variables are reset, the old input stream
418: * <b>cannot</b> be reused (internal buffer is discarded and lost).
419: * Lexical state is set to <tt>ZZ_INITIAL</tt>.
420: *
421: * @param reader the new input stream
422: */
423: public final void yyreset(java.io.Reader reader) {
424: zzReader = reader;
425: zzAtBOL = true;
426: zzAtEOF = false;
427: zzEndRead = zzStartRead = 0;
428: zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
429: yyline = yychar = yycolumn = 0;
430: zzLexicalState = YYINITIAL;
431: }
432:
433: /**
434: * Returns the current lexical state.
435: */
436: public final int yystate() {
437: return zzLexicalState;
438: }
439:
440: /**
441: * Enters a new lexical state
442: *
443: * @param newState the new lexical state
444: */
445: public final void yybegin(int newState) {
446: zzLexicalState = newState;
447: }
448:
449: /**
450: * Returns the text matched by the current regular expression.
451: */
452: public final String yytext() {
453: return new String(zzBuffer, zzStartRead, zzMarkedPos
454: - zzStartRead);
455: }
456:
457: /**
458: * Returns the character at position <tt>pos</tt> from the
459: * matched text.
460: *
461: * It is equivalent to yytext().charAt(pos), but faster
462: *
463: * @param pos the position of the character to fetch.
464: * A value from 0 to yylength()-1.
465: *
466: * @return the character at position pos
467: */
468: public final char yycharat(int pos) {
469: return zzBuffer[zzStartRead + pos];
470: }
471:
472: /**
473: * Returns the length of the matched text region.
474: */
475: public final int yylength() {
476: return zzMarkedPos - zzStartRead;
477: }
478:
479: /**
480: * Reports an error that occured while scanning.
481: *
482: * In a wellformed scanner (no or only correct usage of
483: * yypushback(int) and a match-all fallback rule) this method
484: * will only be called with things that "Can't Possibly Happen".
485: * If this method is called, something is seriously wrong
486: * (e.g. a JFlex bug producing a faulty scanner etc.).
487: *
488: * Usual syntax/scanner level error handling should be done
489: * in error fallback rules.
490: *
491: * @param errorCode the code of the errormessage to display
492: */
493: private void zzScanError(int errorCode) {
494: String message;
495: try {
496: message = ZZ_ERROR_MSG[errorCode];
497: } catch (ArrayIndexOutOfBoundsException e) {
498: message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
499: }
500:
501: throw new Error(message);
502: }
503:
504: /**
505: * Pushes the specified amount of characters back into the input stream.
506: *
507: * They will be read again by then next call of the scanning method
508: *
509: * @param number the number of characters to be read again.
510: * This number must not be greater than yylength()!
511: */
512: public void yypushback(int number) {
513: if (number > yylength())
514: zzScanError(ZZ_PUSHBACK_2BIG);
515:
516: zzMarkedPos -= number;
517: }
518:
/**
 * Resumes scanning until the next regular expression is matched,
 * the end of input is encountered or an I/O-Error occurs.
 *
 * Each pass of the outer loop starts at the end of the previous match and
 * runs the DFA over the input, remembering the last state that carried an
 * action. Matches whose action is empty are skipped and scanning continues
 * with the following input.
 *
 * @return the type code of the next token (one of the token type constants
 *         of this class), or YYEOF at the end of input
 * @exception java.io.IOException if any I/O-Error occurs
 */
public int getNextToken() throws java.io.IOException {
  int zzInput;
  int zzAction;

  // cached fields:
  int zzCurrentPosL;
  int zzMarkedPosL;
  int zzEndReadL = zzEndRead;
  char[] zzBufferL = zzBuffer;
  char[] zzCMapL = ZZ_CMAP;

  int[] zzTransL = ZZ_TRANS;
  int[] zzRowMapL = ZZ_ROWMAP;
  int[] zzAttrL = ZZ_ATTRIBUTE;

  while (true) {
    zzMarkedPosL = zzMarkedPos;

    // account for the characters of the previous match
    yychar += zzMarkedPosL - zzStartRead;

    // -1: no state with an action has been seen yet in this scan
    zzAction = -1;

    // the new match starts where the previous one ended
    zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;

    zzState = zzLexicalState;

    zzForAction: {
      while (true) {

        // fetch the next input char, refilling the buffer when it is used up
        if (zzCurrentPosL < zzEndReadL)
          zzInput = zzBufferL[zzCurrentPosL++];
        else if (zzAtEOF) {
          zzInput = YYEOF;
          break zzForAction;
        } else {
          // store back cached positions
          zzCurrentPos = zzCurrentPosL;
          zzMarkedPos = zzMarkedPosL;
          boolean eof = zzRefill();
          // get translated positions and possibly new buffer
          zzCurrentPosL = zzCurrentPos;
          zzMarkedPosL = zzMarkedPos;
          zzBufferL = zzBuffer;
          zzEndReadL = zzEndRead;
          if (eof) {
            zzInput = YYEOF;
            break zzForAction;
          } else {
            zzInput = zzBufferL[zzCurrentPosL++];
          }
        }
        // transition lookup: row offset of the state + class of the char
        int zzNext = zzTransL[zzRowMapL[zzState]
            + zzCMapL[zzInput]];
        // -1 means there is no transition: the scan ends here
        if (zzNext == -1)
          break zzForAction;
        zzState = zzNext;

        int zzAttributes = zzAttrL[zzState];
        // bit 1: this state carries an action, so remember it together
        // with the position reached as the current best match
        if ((zzAttributes & 1) == 1) {
          zzAction = zzState;
          zzMarkedPosL = zzCurrentPosL;
          // bit 8: stop scanning right away with this match
          if ((zzAttributes & 8) == 8)
            break zzForAction;
        }

      }
    }

    // store back cached position
    zzMarkedPos = zzMarkedPosL;

    // dispatch on the action label of the remembered state; without a
    // remembered state the -1 selects the default branch
    switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
    case 5: {
      return HOST;
    }
    case 11:
      break;
    case 9: {
      return ACRONYM_DEP;
    }
    case 12:
      break;
    case 8: {
      return ACRONYM;
    }
    case 13:
      break;
    // empty action: falls through to the break below, so the outer loop
    // goes on to scan the next match without returning anything
    case 1: { /* ignore */
    }
    case 14:
      break;
    case 7: {
      return NUM;
    }
    case 15:
      break;
    case 3: {
      return CJ;
    }
    case 16:
      break;
    case 2: {
      return ALPHANUM;
    }
    case 17:
      break;
    case 6: {
      return COMPANY;
    }
    case 18:
      break;
    case 4: {
      return APOSTROPHE;
    }
    case 19:
      break;
    case 10: {
      return EMAIL;
    }
    case 20:
      break;
    default:
      // end of input with nothing consumed: report EOF; anything else
      // here is input that could not be matched
      if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
        zzAtEOF = true;
        return YYEOF;
      } else {
        zzScanError(ZZ_NO_MATCH);
      }
    }
  }
}
657:
658: }
|