0001: /* The following code was generated by JFlex 1.4.1 on 1/4/08 3:30 PM */
0002:
0003: package org.apache.lucene.wikipedia.analysis;
0004:
0005: /**
0006: * Licensed to the Apache Software Foundation (ASF) under one or more
0007: * contributor license agreements. See the NOTICE file distributed with
0008: * this work for additional information regarding copyright ownership.
0009: * The ASF licenses this file to You under the Apache License, Version 2.0
0010: * (the "License"); you may not use this file except in compliance with
0011: * the License. You may obtain a copy of the License at
0012: *
0013: * http://www.apache.org/licenses/LICENSE-2.0
0014: *
0015: * Unless required by applicable law or agreed to in writing, software
0016: * distributed under the License is distributed on an "AS IS" BASIS,
0017: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0018: * See the License for the specific language governing permissions and
0019: * limitations under the License.
0020: */
0021:
0022: import org.apache.lucene.analysis.Token;
0023:
0024: /**
0025: * This class is a scanner generated by
0026: * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
0027: * on 1/4/08 3:30 PM from the specification file
0028: * <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
0029: */
0030: class WikipediaTokenizerImpl {
0031:
0032: /** This character denotes the end of file */
0033: public static final int YYEOF = -1;
0034:
0035: /** initial size of the lookahead buffer */
0036: private static final int ZZ_BUFFERSIZE = 16384;
0037:
0038: /** lexical states */
0039: public static final int DOUBLE_BRACE_STATE = 7;
0040: public static final int INTERNAL_LINK_STATE = 2;
0041: public static final int TWO_SINGLE_QUOTES_STATE = 4;
0042: public static final int CATEGORY_STATE = 1;
0043: public static final int FIVE_SINGLE_QUOTES_STATE = 5;
0044: public static final int STRING = 8;
0045: public static final int YYINITIAL = 0;
0046: public static final int DOUBLE_EQUALS_STATE = 6;
0047: public static final int THREE_SINGLE_QUOTES_STATE = 5;
0048: public static final int EXTERNAL_LINK_STATE = 3;
0049:
0050: /**
0051: * Translates characters to character classes
0052: */
0053: private static final String ZZ_CMAP_PACKED = "\11\0\1\24\1\23\1\0\1\24\1\22\22\0\1\24\1\0\1\12"
0054: + "\1\53\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"
0055: + "\1\27\1\0\1\7\1\11\1\13\1\53\1\4\2\15\1\30\5\15"
0056: + "\1\41\21\15\1\25\1\0\1\26\1\0\1\6\1\0\1\31\1\43"
0057: + "\2\15\1\33\1\40\1\34\1\50\1\41\4\15\1\42\1\35\1\51"
0058: + "\1\15\1\36\1\52\1\32\3\15\1\44\1\37\1\15\1\45\1\47"
0059: + "\1\46\102\0\27\15\1\0\37\15\1\0\u0568\15\12\17\206\15\12\17"
0060: + "\u026c\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17"
0061: + "\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"
0062: + "\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"
0063: + "\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"
0064: + "\u0200\21\u0465\0\73\21\75\15\43\0";
0065:
0066: /**
0067: * Translates characters to character classes
0068: */
0069: private static final char[] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
0070:
0071: /**
0072: * Translates DFA states to action switch labels.
0073: */
0074: private static final int[] ZZ_ACTION = zzUnpackAction();
0075:
0076: private static final String ZZ_ACTION_PACKED_0 = "\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"
0077: + "\1\1\1\6\1\1\2\7\1\10\1\11\1\10\1\12"
0078: + "\1\13\1\7\1\14\1\15\1\16\1\17\1\7\1\20"
0079: + "\1\7\4\21\1\22\1\21\1\23\1\24\1\25\3\0"
0080: + "\1\26\14\0\1\27\1\30\1\31\1\32\1\10\1\0"
0081: + "\1\33\1\0\1\34\1\0\1\35\3\0\1\36\1\37"
0082: + "\2\40\1\37\2\41\2\0\1\40\1\0\14\40\1\37"
0083: + "\3\0\1\10\1\42\3\0\1\43\1\44\5\0\1\45"
0084: + "\4\0\1\45\2\0\2\45\2\0\1\10\5\0\1\30"
0085: + "\1\37\1\40\1\46\3\0\1\10\2\0\1\47\30\0"
0086: + "\1\50\2\0\1\51\1\52\1\53";
0087:
0088: private static int[] zzUnpackAction() {
0089: int[] result = new int[178];
0090: int offset = 0;
0091: offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
0092: return result;
0093: }
0094:
0095: private static int zzUnpackAction(String packed, int offset,
0096: int[] result) {
0097: int i = 0; /* index in packed string */
0098: int j = offset; /* index in unpacked array */
0099: int l = packed.length();
0100: while (i < l) {
0101: int count = packed.charAt(i++);
0102: int value = packed.charAt(i++);
0103: do
0104: result[j++] = value;
0105: while (--count > 0);
0106: }
0107: return j;
0108: }
0109:
0110: /**
0111: * Translates a state to a row index in the transition table
0112: */
0113: private static final int[] ZZ_ROWMAP = zzUnpackRowMap();
0114:
0115: private static final String ZZ_ROWMAP_PACKED_0 = "\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"
0116: + "\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"
0117: + "\0\u02c0\0\u018c\0\u02ec\0\u0318\0\u0344\0\u0370\0\u039c\0\u03c8"
0118: + "\0\u03f4\0\u0420\0\u018c\0\u0370\0\u044c\0\u018c\0\u0478\0\u04a4"
0119: + "\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8\0\u0604"
0120: + "\0\u0630\0\u018c\0\u065c\0\u0370\0\u0688\0\u06b4\0\u06e0\0\u070c"
0121: + "\0\u018c\0\u018c\0\u0738\0\u0764\0\u0790\0\u018c\0\u07bc\0\u07e8"
0122: + "\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c\0\u0948"
0123: + "\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u018c\0\u018c\0\u0a24\0\u0a50"
0124: + "\0\u0a7c\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c\0\u0b58\0\u0b84\0\u0bb0"
0125: + "\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0814\0\u0cb8\0\u0ce4"
0126: + "\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec\0\u0e18\0\u0e44"
0127: + "\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c\0\u0f78\0\u0fa4"
0128: + "\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u018c\0\u1080\0\u10ac\0\u10d8"
0129: + "\0\u1104\0\u018c\0\u1130\0\u115c\0\u1188\0\u11b4\0\u11e0\0\u120c"
0130: + "\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314\0\u1340\0\u07e8"
0131: + "\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0\0\u141c\0\u1448\0\u1474"
0132: + "\0\u14a0\0\u018c\0\u14cc\0\u14f8\0\u1524\0\u1550\0\u157c\0\u15a8"
0133: + "\0\u15d4\0\u1600\0\u162c\0\u018c\0\u1658\0\u1684\0\u16b0\0\u16dc"
0134: + "\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4\0\u1810\0\u183c"
0135: + "\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944\0\u1970\0\u199c"
0136: + "\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4\0\u1ad0\0\u018c"
0137: + "\0\u018c\0\u018c";
0138:
0139: private static int[] zzUnpackRowMap() {
0140: int[] result = new int[178];
0141: int offset = 0;
0142: offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
0143: return result;
0144: }
0145:
0146: private static int zzUnpackRowMap(String packed, int offset,
0147: int[] result) {
0148: int i = 0; /* index in packed string */
0149: int j = offset; /* index in unpacked array */
0150: int l = packed.length();
0151: while (i < l) {
0152: int high = packed.charAt(i++) << 16;
0153: result[j++] = high | packed.charAt(i++);
0154: }
0155: return j;
0156: }
0157:
0158: /**
0159: * The transition table of the DFA
0160: */
0161: private static final int[] ZZ_TRANS = zzUnpackTrans();
0162:
0163: private static final String ZZ_TRANS_PACKED_0 = "\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"
0164: + "\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"
0165: + "\15\16\1\25\2\12\3\16\10\12\1\26\5\12\4\27"
0166: + "\1\12\1\23\3\12\1\30\1\12\15\27\3\12\3\27"
0167: + "\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\32"
0168: + "\1\12\15\31\3\12\3\31\1\12\7\33\1\34\5\33"
0169: + "\4\35\1\33\1\23\2\12\1\33\1\36\1\33\15\35"
0170: + "\3\33\1\37\2\35\2\33\1\40\5\33\1\34\5\33"
0171: + "\4\41\1\33\1\42\2\33\1\43\2\33\15\41\3\33"
0172: + "\3\41\10\33\1\34\5\33\4\44\1\33\1\42\2\33"
0173: + "\1\43\2\33\15\44\3\33\3\44\10\33\1\34\1\33"
0174: + "\1\45\3\33\4\46\1\33\1\42\5\33\15\46\3\33"
0175: + "\3\46\10\33\1\47\5\33\4\50\1\33\1\42\5\33"
0176: + "\15\50\1\33\1\51\1\33\3\50\1\33\1\52\1\53"
0177: + "\5\52\1\54\1\52\1\55\3\52\4\56\1\52\1\57"
0178: + "\2\52\1\60\2\52\15\56\2\52\1\61\3\56\1\52"
0179: + "\55\0\1\62\62\0\1\63\4\0\4\64\7\0\6\64"
0180: + "\1\65\6\64\3\0\3\64\12\0\1\66\43\0\1\67"
0181: + "\1\70\1\71\1\72\2\73\1\0\1\74\3\0\1\74"
0182: + "\1\16\1\17\1\20\1\21\7\0\15\16\3\0\3\16"
0183: + "\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"
0184: + "\1\100\3\17\1\21\7\0\15\17\3\0\3\17\2\0"
0185: + "\1\67\1\101\1\71\1\72\2\77\1\0\1\100\3\0"
0186: + "\1\100\1\20\1\17\1\20\1\21\7\0\15\20\3\0"
0187: + "\3\20\3\0\1\102\1\0\1\76\2\73\1\0\1\74"
0188: + "\3\0\1\74\4\21\7\0\15\21\3\0\3\21\24\0"
0189: + "\1\12\55\0\1\103\73\0\1\104\16\0\1\63\4\0"
0190: + "\4\64\7\0\15\64\3\0\3\64\16\0\4\27\7\0"
0191: + "\15\27\3\0\3\27\27\0\1\105\42\0\4\31\7\0"
0192: + "\15\31\3\0\3\31\27\0\1\106\42\0\4\35\7\0"
0193: + "\15\35\3\0\3\35\16\0\4\35\7\0\2\35\1\107"
0194: + "\12\35\3\0\3\35\2\0\1\110\67\0\4\41\7\0"
0195: + "\15\41\3\0\3\41\24\0\1\33\55\0\1\111\43\0"
0196: + "\4\44\7\0\15\44\3\0\3\44\12\0\1\105\57\0"
0197: + "\4\46\7\0\15\46\3\0\3\46\11\0\1\112\4\0"
0198: + "\4\64\7\0\15\64\3\0\3\64\16\0\4\50\7\0"
0199: + "\15\50\3\0\3\50\47\0\1\105\6\0\1\113\63\0"
0200: + "\1\114\57\0\4\56\7\0\15\56\3\0\3\56\24\0"
0201: + "\1\52\55\0\1\115\43\0\4\64\7\0\15\64\3\0"
0202: + "\3\64\14\0\1\33\1\0\4\116\1\0\3\117\3\0"
0203: + "\15\116\3\0\3\116\14\0\1\33\1\0\4\116\1\0"
0204: + "\3\117\3\0\3\116\1\120\11\116\3\0\3\116\16\0"
0205: + "\1\121\1\0\1\121\10\0\15\121\3\0\3\121\16\0"
0206: + "\1\122\1\123\1\124\1\125\7\0\15\122\3\0\3\122"
0207: + "\16\0\1\126\1\0\1\126\10\0\15\126\3\0\3\126"
0208: + "\16\0\1\127\1\130\1\127\1\130\7\0\15\127\3\0"
0209: + "\3\127\16\0\1\131\2\132\1\133\7\0\15\131\3\0"
0210: + "\3\131\16\0\1\74\2\134\10\0\15\74\3\0\3\74"
0211: + "\16\0\1\135\2\136\1\137\7\0\15\135\3\0\3\135"
0212: + "\16\0\4\130\7\0\15\130\3\0\3\130\16\0\1\140"
0213: + "\2\141\1\142\7\0\15\140\3\0\3\140\16\0\1\143"
0214: + "\2\144\1\145\7\0\15\143\3\0\3\143\16\0\1\146"
0215: + "\1\136\1\147\1\137\7\0\15\146\3\0\3\146\16\0"
0216: + "\1\150\2\123\1\125\7\0\15\150\3\0\3\150\30\0"
0217: + "\1\151\1\152\64\0\1\153\27\0\4\35\7\0\2\35"
0218: + "\1\154\12\35\3\0\3\35\2\0\1\155\101\0\1\156"
0219: + "\1\157\40\0\4\64\7\0\6\64\1\160\6\64\3\0"
0220: + "\3\64\2\0\1\161\63\0\1\162\71\0\1\163\1\164"
0221: + "\34\0\1\165\1\0\1\33\1\0\4\116\1\0\3\117"
0222: + "\3\0\15\116\3\0\3\116\16\0\4\166\1\0\3\117"
0223: + "\3\0\15\166\3\0\3\166\12\0\1\165\1\0\1\33"
0224: + "\1\0\4\116\1\0\3\117\3\0\10\116\1\167\4\116"
0225: + "\3\0\3\116\2\0\1\67\13\0\1\121\1\0\1\121"
0226: + "\10\0\15\121\3\0\3\121\3\0\1\170\1\0\1\76"
0227: + "\2\171\6\0\1\122\1\123\1\124\1\125\7\0\15\122"
0228: + "\3\0\3\122\3\0\1\172\1\0\1\76\2\173\1\0"
0229: + "\1\174\3\0\1\174\3\123\1\125\7\0\15\123\3\0"
0230: + "\3\123\3\0\1\175\1\0\1\76\2\173\1\0\1\174"
0231: + "\3\0\1\174\1\124\1\123\1\124\1\125\7\0\15\124"
0232: + "\3\0\3\124\3\0\1\176\1\0\1\76\2\171\6\0"
0233: + "\4\125\7\0\15\125\3\0\3\125\3\0\1\177\2\0"
0234: + "\1\177\7\0\1\127\1\130\1\127\1\130\7\0\15\127"
0235: + "\3\0\3\127\3\0\1\177\2\0\1\177\7\0\4\130"
0236: + "\7\0\15\130\3\0\3\130\3\0\1\171\1\0\1\76"
0237: + "\2\171\6\0\1\131\2\132\1\133\7\0\15\131\3\0"
0238: + "\3\131\3\0\1\173\1\0\1\76\2\173\1\0\1\174"
0239: + "\3\0\1\174\3\132\1\133\7\0\15\132\3\0\3\132"
0240: + "\3\0\1\171\1\0\1\76\2\171\6\0\4\133\7\0"
0241: + "\15\133\3\0\3\133\3\0\1\174\2\0\2\174\1\0"
0242: + "\1\174\3\0\1\174\3\134\10\0\15\134\3\0\3\134"
0243: + "\3\0\1\102\1\0\1\76\2\73\1\0\1\74\3\0"
0244: + "\1\74\1\135\2\136\1\137\7\0\15\135\3\0\3\135"
0245: + "\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"
0246: + "\1\100\3\136\1\137\7\0\15\136\3\0\3\136\3\0"
0247: + "\1\102\1\0\1\76\2\73\1\0\1\74\3\0\1\74"
0248: + "\4\137\7\0\15\137\3\0\3\137\3\0\1\73\1\0"
0249: + "\1\76\2\73\1\0\1\74\3\0\1\74\1\140\2\141"
0250: + "\1\142\7\0\15\140\3\0\3\140\3\0\1\77\1\0"
0251: + "\1\76\2\77\1\0\1\100\3\0\1\100\3\141\1\142"
0252: + "\7\0\15\141\3\0\3\141\3\0\1\73\1\0\1\76"
0253: + "\2\73\1\0\1\74\3\0\1\74\4\142\7\0\15\142"
0254: + "\3\0\3\142\3\0\1\74\2\0\2\74\1\0\1\74"
0255: + "\3\0\1\74\1\143\2\144\1\145\7\0\15\143\3\0"
0256: + "\3\143\3\0\1\100\2\0\2\100\1\0\1\100\3\0"
0257: + "\1\100\3\144\1\145\7\0\15\144\3\0\3\144\3\0"
0258: + "\1\74\2\0\2\74\1\0\1\74\3\0\1\74\4\145"
0259: + "\7\0\15\145\3\0\3\145\3\0\1\200\1\0\1\76"
0260: + "\2\73\1\0\1\74\3\0\1\74\1\146\1\136\1\147"
0261: + "\1\137\7\0\15\146\3\0\3\146\3\0\1\201\1\0"
0262: + "\1\76\2\77\1\0\1\100\3\0\1\100\1\147\1\136"
0263: + "\1\147\1\137\7\0\15\147\3\0\3\147\3\0\1\176"
0264: + "\1\0\1\76\2\171\6\0\1\150\2\123\1\125\7\0"
0265: + "\15\150\3\0\3\150\31\0\1\152\54\0\1\202\64\0"
0266: + "\1\203\26\0\4\35\7\0\15\35\3\0\1\35\1\204"
0267: + "\1\35\31\0\1\157\54\0\1\205\35\0\1\33\1\0"
0268: + "\4\116\1\0\3\117\3\0\3\116\1\206\11\116\3\0"
0269: + "\3\116\2\0\1\207\102\0\1\164\54\0\1\210\34\0"
0270: + "\1\211\52\0\1\165\3\0\4\166\7\0\15\166\3\0"
0271: + "\3\166\12\0\1\165\1\0\1\212\1\0\4\116\1\0"
0272: + "\3\117\3\0\15\116\3\0\3\116\16\0\1\213\1\125"
0273: + "\1\213\1\125\7\0\15\213\3\0\3\213\16\0\4\133"
0274: + "\7\0\15\133\3\0\3\133\16\0\4\137\7\0\15\137"
0275: + "\3\0\3\137\16\0\4\142\7\0\15\142\3\0\3\142"
0276: + "\16\0\4\145\7\0\15\145\3\0\3\145\16\0\1\214"
0277: + "\1\137\1\214\1\137\7\0\15\214\3\0\3\214\16\0"
0278: + "\4\125\7\0\15\125\3\0\3\125\16\0\4\215\7\0"
0279: + "\15\215\3\0\3\215\33\0\1\216\61\0\1\217\30\0"
0280: + "\4\35\6\0\1\220\15\35\3\0\2\35\1\221\33\0"
0281: + "\1\222\32\0\1\165\1\0\1\33\1\0\4\116\1\0"
0282: + "\3\117\3\0\10\116\1\223\4\116\3\0\3\116\2\0"
0283: + "\1\224\104\0\1\225\36\0\4\226\7\0\15\226\3\0"
0284: + "\3\226\3\0\1\170\1\0\1\76\2\171\6\0\1\213"
0285: + "\1\125\1\213\1\125\7\0\15\213\3\0\3\213\3\0"
0286: + "\1\200\1\0\1\76\2\73\1\0\1\74\3\0\1\74"
0287: + "\1\214\1\137\1\214\1\137\7\0\15\214\3\0\3\214"
0288: + "\3\0\1\177\2\0\1\177\7\0\4\215\7\0\15\215"
0289: + "\3\0\3\215\34\0\1\227\55\0\1\230\26\0\1\231"
0290: + "\60\0\4\35\6\0\1\220\15\35\3\0\3\35\34\0"
0291: + "\1\232\31\0\1\165\1\0\1\105\1\0\4\116\1\0"
0292: + "\3\117\3\0\15\116\3\0\3\116\34\0\1\233\32\0"
0293: + "\1\234\2\0\4\226\7\0\15\226\3\0\3\226\35\0"
0294: + "\1\235\62\0\1\236\20\0\1\237\77\0\1\240\53\0"
0295: + "\1\241\32\0\1\33\1\0\4\166\1\0\3\117\3\0"
0296: + "\15\166\3\0\3\166\36\0\1\242\53\0\1\243\33\0"
0297: + "\4\244\7\0\15\244\3\0\3\244\36\0\1\245\53\0"
0298: + "\1\246\54\0\1\247\61\0\1\250\11\0\1\251\12\0"
0299: + "\4\244\7\0\15\244\3\0\3\244\37\0\1\252\53\0"
0300: + "\1\253\54\0\1\254\22\0\1\12\62\0\4\255\7\0"
0301: + "\15\255\3\0\3\255\40\0\1\256\53\0\1\257\43\0"
0302: + "\1\260\26\0\2\255\1\0\2\255\1\0\2\255\2\0"
0303: + "\5\255\7\0\15\255\3\0\4\255\27\0\1\261\53\0"
0304: + "\1\262\24\0";
0305:
0306: private static int[] zzUnpackTrans() {
0307: int[] result = new int[6908];
0308: int offset = 0;
0309: offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
0310: return result;
0311: }
0312:
0313: private static int zzUnpackTrans(String packed, int offset,
0314: int[] result) {
0315: int i = 0; /* index in packed string */
0316: int j = offset; /* index in unpacked array */
0317: int l = packed.length();
0318: while (i < l) {
0319: int count = packed.charAt(i++);
0320: int value = packed.charAt(i++);
0321: value--;
0322: do
0323: result[j++] = value;
0324: while (--count > 0);
0325: }
0326: return j;
0327: }
0328:
0329: /* error codes */
0330: private static final int ZZ_UNKNOWN_ERROR = 0;
0331: private static final int ZZ_NO_MATCH = 1;
0332: private static final int ZZ_PUSHBACK_2BIG = 2;
0333:
0334: /* error messages for the codes above */
0335: private static final String ZZ_ERROR_MSG[] = {
0336: "Unkown internal scanner error",
0337: "Error: could not match input",
0338: "Error: pushback value was too large" };
0339:
0340: /**
0341: * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
0342: */
0343: private static final int[] ZZ_ATTRIBUTE = zzUnpackAttribute();
0344:
0345: private static final String ZZ_ATTRIBUTE_PACKED_0 = "\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"
0346: + "\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"
0347: + "\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"
0348: + "\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"
0349: + "\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"
0350: + "\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"
0351: + "\1\1\2\0\1\11\30\0\1\1\2\0\3\11";
0352:
0353: private static int[] zzUnpackAttribute() {
0354: int[] result = new int[178];
0355: int offset = 0;
0356: offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset,
0357: result);
0358: return result;
0359: }
0360:
0361: private static int zzUnpackAttribute(String packed, int offset,
0362: int[] result) {
0363: int i = 0; /* index in packed string */
0364: int j = offset; /* index in unpacked array */
0365: int l = packed.length();
0366: while (i < l) {
0367: int count = packed.charAt(i++);
0368: int value = packed.charAt(i++);
0369: do
0370: result[j++] = value;
0371: while (--count > 0);
0372: }
0373: return j;
0374: }
0375:
0376: /** the input device */
0377: private java.io.Reader zzReader;
0378:
0379: /** the current state of the DFA */
0380: private int zzState;
0381:
0382: /** the current lexical state */
0383: private int zzLexicalState = YYINITIAL;
0384:
0385: /** this buffer contains the current text to be matched and is
0386: the source of the yytext() string */
0387: private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
0388:
0389: /** the textposition at the last accepting state */
0390: private int zzMarkedPos;
0391:
0392: /** the textposition at the last state to be included in yytext */
0393: private int zzPushbackPos;
0394:
0395: /** the current text position in the buffer */
0396: private int zzCurrentPos;
0397:
0398: /** startRead marks the beginning of the yytext() string in the buffer */
0399: private int zzStartRead;
0400:
0401: /** endRead marks the last character in the buffer, that has been read
0402: from input */
0403: private int zzEndRead;
0404:
0405: /** number of newlines encountered up to the start of the matched text */
0406: private int yyline;
0407:
0408: /** the number of characters up to the start of the matched text */
0409: private int yychar;
0410:
0411: /**
0412: * the number of characters from the last newline up to the start of the
0413: * matched text
0414: */
0415: private int yycolumn;
0416:
0417: /**
0418: * zzAtBOL == true <=> the scanner is currently at the beginning of a line
0419: */
0420: private boolean zzAtBOL = true;
0421:
0422: /** zzAtEOF == true <=> the scanner is at the EOF */
0423: private boolean zzAtEOF;
0424:
0425: /* user code: */
0426:
0427: public static final int ALPHANUM = 0;
0428: public static final int APOSTROPHE = 1;
0429: public static final int ACRONYM = 2;
0430: public static final int COMPANY = 3;
0431: public static final int EMAIL = 4;
0432: public static final int HOST = 5;
0433: public static final int NUM = 6;
0434: public static final int CJ = 7;
0435: public static final int INTERNAL_LINK = 8;
0436: public static final int EXTERNAL_LINK = 9;
0437: public static final int CITATION = 10;
0438: public static final int CATEGORY = 11;
0439: public static final int BOLD = 12;
0440: public static final int ITALICS = 13;
0441: public static final int BOLD_ITALICS = 14;
0442: public static final int HEADING = 15;
0443: public static final int SUB_HEADING = 16;
0444: public static final int EXTERNAL_LINK_URL = 17;
0445:
0446: private int currentTokType;
0447: private int numBalanced = 0;
0448: private int positionInc = 1;
0449: private int numLinkToks = 0;
0450:
0451: public static final String[] TOKEN_TYPES = new String[] {
0452: "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>",
0453: "<EMAIL>", "<HOST>", "<NUM>", "<CJ>",
0454: WikipediaTokenizer.INTERNAL_LINK,
0455: WikipediaTokenizer.EXTERNAL_LINK,
0456: WikipediaTokenizer.CITATION, WikipediaTokenizer.CATEGORY,
0457: WikipediaTokenizer.BOLD, WikipediaTokenizer.ITALICS,
0458: WikipediaTokenizer.BOLD_ITALICS,
0459: WikipediaTokenizer.HEADING, WikipediaTokenizer.SUB_HEADING,
0460: WikipediaTokenizer.EXTERNAL_LINK_URL };
0461:
0462: public final int yychar() {
0463: return yychar;
0464: }
0465:
0466: public final int getPositionIncrement() {
0467: return positionInc;
0468: }
0469:
0470: /**
0471: * Fills Lucene token with the current token text.
0472: */
0473: final void getText(Token t, int tokType) {
0474: t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos
0475: - zzStartRead);
0476: }
0477:
/**
 * Creates a new scanner that reads from a character stream.
 * A java.io.InputStream variant of this constructor exists as well.
 *
 * @param in the java.io.Reader to read input from
 */
WikipediaTokenizerImpl(java.io.Reader in) {
    zzReader = in;
}
0487:
/**
 * Creates a new scanner that reads from a byte stream.
 * The stream is wrapped in a java.io.InputStreamReader built without an
 * explicit charset, so the bytes are decoded with the default charset.
 * A java.io.Reader variant of this constructor exists as well.
 *
 * @param in the java.io.InputStream to read input from
 */
WikipediaTokenizerImpl(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
}
0497:
0498: /**
0499: * Unpacks the compressed character translation table.
0500: *
0501: * @param packed the packed character translation table
0502: * @return the unpacked character translation table
0503: */
0504: private static char[] zzUnpackCMap(String packed) {
0505: char[] map = new char[0x10000];
0506: int i = 0; /* index in packed string */
0507: int j = 0; /* index in unpacked array */
0508: while (i < 230) {
0509: int count = packed.charAt(i++);
0510: char value = packed.charAt(i++);
0511: do
0512: map[j++] = value;
0513: while (--count > 0);
0514: }
0515: return map;
0516: }
0517:
0518: /**
0519: * Refills the input buffer.
0520: *
0521: * @return <code>false</code>, iff there was new input.
0522: *
0523: * @exception java.io.IOException if any I/O-Error occurs
0524: */
0525: private boolean zzRefill() throws java.io.IOException {
0526:
0527: /* first: make room (if you can) */
0528: if (zzStartRead > 0) {
0529: System.arraycopy(zzBuffer, zzStartRead, zzBuffer, 0,
0530: zzEndRead - zzStartRead);
0531:
0532: /* translate stored positions */
0533: zzEndRead -= zzStartRead;
0534: zzCurrentPos -= zzStartRead;
0535: zzMarkedPos -= zzStartRead;
0536: zzPushbackPos -= zzStartRead;
0537: zzStartRead = 0;
0538: }
0539:
0540: /* is the buffer big enough? */
0541: if (zzCurrentPos >= zzBuffer.length) {
0542: /* if not: blow it up */
0543: char newBuffer[] = new char[zzCurrentPos * 2];
0544: System
0545: .arraycopy(zzBuffer, 0, newBuffer, 0,
0546: zzBuffer.length);
0547: zzBuffer = newBuffer;
0548: }
0549:
0550: /* finally: fill the buffer with new input */
0551: int numRead = zzReader.read(zzBuffer, zzEndRead,
0552: zzBuffer.length - zzEndRead);
0553:
0554: if (numRead < 0) {
0555: return true;
0556: } else {
0557: zzEndRead += numRead;
0558: return false;
0559: }
0560: }
0561:
0562: /**
0563: * Closes the input stream.
0564: */
0565: public final void yyclose() throws java.io.IOException {
0566: zzAtEOF = true; /* indicate end of file */
0567: zzEndRead = zzStartRead; /* invalidate buffer */
0568:
0569: if (zzReader != null)
0570: zzReader.close();
0571: }
0572:
0573: /**
0574: * Resets the scanner to read from a new input stream.
0575: * Does not close the old reader.
0576: *
0577: * All internal variables are reset, the old input stream
0578: * <b>cannot</b> be reused (internal buffer is discarded and lost).
0579: * Lexical state is set to <tt>ZZ_INITIAL</tt>.
0580: *
0581: * @param reader the new input stream
0582: */
0583: public final void yyreset(java.io.Reader reader) {
0584: zzReader = reader;
0585: zzAtBOL = true;
0586: zzAtEOF = false;
0587: zzEndRead = zzStartRead = 0;
0588: zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
0589: yyline = yychar = yycolumn = 0;
0590: zzLexicalState = YYINITIAL;
0591: }
0592:
0593: /**
0594: * Returns the current lexical state.
0595: */
0596: public final int yystate() {
0597: return zzLexicalState;
0598: }
0599:
0600: /**
0601: * Enters a new lexical state
0602: *
0603: * @param newState the new lexical state
0604: */
0605: public final void yybegin(int newState) {
0606: zzLexicalState = newState;
0607: }
0608:
0609: /**
0610: * Returns the text matched by the current regular expression.
0611: */
0612: public final String yytext() {
0613: return new String(zzBuffer, zzStartRead, zzMarkedPos
0614: - zzStartRead);
0615: }
0616:
0617: /**
0618: * Returns the character at position <tt>pos</tt> from the
0619: * matched text.
0620: *
0621: * It is equivalent to yytext().charAt(pos), but faster
0622: *
0623: * @param pos the position of the character to fetch.
0624: * A value from 0 to yylength()-1.
0625: *
0626: * @return the character at position pos
0627: */
0628: public final char yycharat(int pos) {
0629: return zzBuffer[zzStartRead + pos];
0630: }
0631:
0632: /**
0633: * Returns the length of the matched text region.
0634: */
0635: public final int yylength() {
0636: return zzMarkedPos - zzStartRead;
0637: }
0638:
0639: /**
0640: * Reports an error that occured while scanning.
0641: *
0642: * In a wellformed scanner (no or only correct usage of
0643: * yypushback(int) and a match-all fallback rule) this method
0644: * will only be called with things that "Can't Possibly Happen".
0645: * If this method is called, something is seriously wrong
0646: * (e.g. a JFlex bug producing a faulty scanner etc.).
0647: *
0648: * Usual syntax/scanner level error handling should be done
0649: * in error fallback rules.
0650: *
0651: * @param errorCode the code of the errormessage to display
0652: */
0653: private void zzScanError(int errorCode) {
0654: String message;
0655: try {
0656: message = ZZ_ERROR_MSG[errorCode];
0657: } catch (ArrayIndexOutOfBoundsException e) {
0658: message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
0659: }
0660:
0661: throw new Error(message);
0662: }
0663:
0664: /**
0665: * Pushes the specified amount of characters back into the input stream.
0666: *
0667: * They will be read again by then next call of the scanning method
0668: *
0669: * @param number the number of characters to be read again.
0670: * This number must not be greater than yylength()!
0671: */
0672: public void yypushback(int number) {
0673: if (number > yylength())
0674: zzScanError(ZZ_PUSHBACK_2BIG);
0675:
0676: zzMarkedPos -= number;
0677: }
0678:
0679: /**
0680: * Resumes scanning until the next regular expression is matched,
0681: * the end of input is encountered or an I/O-Error occurs.
0682: *
0683: * @return the next token
0684: * @exception java.io.IOException if any I/O-Error occurs
0685: */
0686: public int getNextToken() throws java.io.IOException {
0687: int zzInput;
0688: int zzAction;
0689:
0690: // cached fields:
0691: int zzCurrentPosL;
0692: int zzMarkedPosL;
0693: int zzEndReadL = zzEndRead;
0694: char[] zzBufferL = zzBuffer;
0695: char[] zzCMapL = ZZ_CMAP;
0696:
0697: int[] zzTransL = ZZ_TRANS;
0698: int[] zzRowMapL = ZZ_ROWMAP;
0699: int[] zzAttrL = ZZ_ATTRIBUTE;
0700:
0701: while (true) {
0702: zzMarkedPosL = zzMarkedPos;
0703:
0704: yychar += zzMarkedPosL - zzStartRead;
0705:
0706: zzAction = -1;
0707:
0708: zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
0709:
0710: zzState = zzLexicalState;
0711:
0712: zzForAction: {
0713: while (true) {
0714:
0715: if (zzCurrentPosL < zzEndReadL)
0716: zzInput = zzBufferL[zzCurrentPosL++];
0717: else if (zzAtEOF) {
0718: zzInput = YYEOF;
0719: break zzForAction;
0720: } else {
0721: // store back cached positions
0722: zzCurrentPos = zzCurrentPosL;
0723: zzMarkedPos = zzMarkedPosL;
0724: boolean eof = zzRefill();
0725: // get translated positions and possibly new buffer
0726: zzCurrentPosL = zzCurrentPos;
0727: zzMarkedPosL = zzMarkedPos;
0728: zzBufferL = zzBuffer;
0729: zzEndReadL = zzEndRead;
0730: if (eof) {
0731: zzInput = YYEOF;
0732: break zzForAction;
0733: } else {
0734: zzInput = zzBufferL[zzCurrentPosL++];
0735: }
0736: }
0737: int zzNext = zzTransL[zzRowMapL[zzState]
0738: + zzCMapL[zzInput]];
0739: if (zzNext == -1)
0740: break zzForAction;
0741: zzState = zzNext;
0742:
0743: int zzAttributes = zzAttrL[zzState];
0744: if ((zzAttributes & 1) == 1) {
0745: zzAction = zzState;
0746: zzMarkedPosL = zzCurrentPosL;
0747: if ((zzAttributes & 8) == 8)
0748: break zzForAction;
0749: }
0750:
0751: }
0752: }
0753:
0754: // store back cached position
0755: zzMarkedPos = zzMarkedPosL;
0756:
0757: switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
0758: case 7: { /* ignore */
0759: }
0760: case 44:
0761: break;
0762: case 3: {
0763: positionInc = 1;
0764: return CJ;
0765: }
0766: case 45:
0767: break;
0768: case 28: {
0769: numBalanced = 0;
0770: currentTokType = ALPHANUM;
0771: yybegin(YYINITIAL);/*end italics*/
0772: }
0773: case 46:
0774: break;
0775: case 9: {
0776: numLinkToks = 0;
0777: positionInc = 0;
0778: yybegin(YYINITIAL);
0779: }
0780: case 47:
0781: break;
0782: case 4: {
0783: positionInc = 1;
0784: currentTokType = EXTERNAL_LINK_URL;
0785: yybegin(EXTERNAL_LINK_STATE);
0786: }
0787: case 48:
0788: break;
0789: case 39: {
0790: numBalanced = 0;
0791: currentTokType = ALPHANUM;
0792: yybegin(YYINITIAL);/*end bold italics*/
0793: }
0794: case 49:
0795: break;
0796: case 11: {
0797: currentTokType = ITALICS;
0798: yybegin(STRING);
0799: return currentTokType;/*italics*/
0800: }
0801: case 50:
0802: break;
0803: case 23: {
0804: positionInc = 1;
0805: currentTokType = INTERNAL_LINK;
0806: yybegin(INTERNAL_LINK_STATE);
0807: }
0808: case 51:
0809: break;
0810: case 5: {
0811: yybegin(CATEGORY_STATE);
0812: return currentTokType;
0813: }
0814: case 52:
0815: break;
0816: case 36: {
0817: numBalanced = 0;
0818: currentTokType = ALPHANUM;
0819: yybegin(YYINITIAL);/*end sub header*/
0820: }
0821: case 53:
0822: break;
0823: case 8: {
0824: if (numLinkToks == 0) {
0825: positionInc = 0;
0826: } else {
0827: positionInc = 1;
0828: }
0829: currentTokType = EXTERNAL_LINK;
0830: yybegin(EXTERNAL_LINK_STATE);
0831: numLinkToks++;
0832: return currentTokType;
0833: }
0834: case 54:
0835: break;
0836: case 24: {
0837: positionInc = 1;
0838: currentTokType = CITATION;
0839: yybegin(DOUBLE_BRACE_STATE);
0840: }
0841: case 55:
0842: break;
0843: case 22: {
0844: positionInc = 1;
0845: yybegin(DOUBLE_EQUALS_STATE);
0846: }
0847: case 56:
0848: break;
0849: case 41: {
0850: positionInc = 1;
0851: currentTokType = CATEGORY;
0852: yybegin(CATEGORY_STATE);
0853: }
0854: case 57:
0855: break;
0856: case 18: {
0857: yybegin(STRING);
0858: return currentTokType;/* STRING ALPHANUM*/
0859: }
0860: case 58:
0861: break;
0862: case 21: {
0863: positionInc = 1;
0864: if (numBalanced == 0) {
0865: numBalanced++;
0866: yybegin(TWO_SINGLE_QUOTES_STATE);
0867: } else {
0868: numBalanced = 0;
0869: }
0870: }
0871: case 59:
0872: break;
0873: case 1: {
0874: positionInc = 1;
0875: }
0876: case 60:
0877: break;
0878: case 43: {
0879: numBalanced = 0;
0880: currentTokType = CATEGORY;
0881: yybegin(CATEGORY_STATE);
0882: }
0883: case 61:
0884: break;
0885: case 25: {
0886: yybegin(YYINITIAL);
0887: }
0888: case 62:
0889: break;
0890: case 40: {
0891: positionInc = 1;
0892: yybegin(EXTERNAL_LINK_STATE);
0893: return currentTokType;
0894: }
0895: case 63:
0896: break;
0897: case 19: {
0898: numBalanced = 0;
0899: currentTokType = EXTERNAL_LINK;
0900: yybegin(EXTERNAL_LINK_STATE);
0901: }
0902: case 64:
0903: break;
0904: case 13: {
0905: yybegin(STRING);
0906: return currentTokType;
0907: }
0908: case 65:
0909: break;
0910: case 38: {
0911: positionInc = 1;
0912: return EMAIL;
0913: }
0914: case 66:
0915: break;
0916: case 37: {
0917: positionInc = 1;
0918: return ACRONYM;
0919: }
0920: case 67:
0921: break;
0922: case 17: { /* ignore STRING */
0923: }
0924: case 68:
0925: break;
0926: case 42: {
0927: currentTokType = CATEGORY;
0928: yybegin(CATEGORY_STATE);
0929: }
0930: case 69:
0931: break;
0932: case 20: {
0933: yybegin(STRING);
0934: return currentTokType;/*pipe*/
0935: }
0936: case 70:
0937: break;
0938: case 12: {
0939: currentTokType = EXTERNAL_LINK;
0940: yybegin(EXTERNAL_LINK_STATE);
0941: }
0942: case 71:
0943: break;
0944: case 29: {
0945: numBalanced = 0;
0946: currentTokType = INTERNAL_LINK;
0947: yybegin(INTERNAL_LINK_STATE);
0948: }
0949: case 72:
0950: break;
0951: case 35: {
0952: numBalanced = 0;
0953: currentTokType = ALPHANUM;
0954: yybegin(YYINITIAL);/*end bold*/
0955: }
0956: case 73:
0957: break;
0958: case 16: {
0959: yybegin(DOUBLE_BRACE_STATE);
0960: return currentTokType;
0961: }
0962: case 74:
0963: break;
0964: case 31: {
0965: positionInc = 1;
0966: return HOST;
0967: }
0968: case 75:
0969: break;
0970: case 34: {
0971: currentTokType = BOLD_ITALICS;
0972: yybegin(FIVE_SINGLE_QUOTES_STATE);
0973: }
0974: case 76:
0975: break;
0976: case 27: {
0977: currentTokType = INTERNAL_LINK;
0978: yybegin(INTERNAL_LINK_STATE);
0979: }
0980: case 77:
0981: break;
0982: case 14: {
0983: currentTokType = SUB_HEADING;
0984: yybegin(STRING);
0985: }
0986: case 78:
0987: break;
0988: case 30: {
0989: positionInc = 1;
0990: return APOSTROPHE;
0991: }
0992: case 79:
0993: break;
0994: case 32: {
0995: positionInc = 1;
0996: return NUM;
0997: }
0998: case 80:
0999: break;
1000: case 15: {
1001: currentTokType = HEADING;
1002: yybegin(DOUBLE_EQUALS_STATE);
1003: return currentTokType;
1004: }
1005: case 81:
1006: break;
1007: case 6: {
1008: yybegin(INTERNAL_LINK_STATE);
1009: return currentTokType;
1010: }
1011: case 82:
1012: break;
1013: case 2: {
1014: positionInc = 1;
1015: return ALPHANUM;
1016: }
1017: case 83:
1018: break;
1019: case 33: {
1020: positionInc = 1;
1021: return COMPANY;
1022: }
1023: case 84:
1024: break;
1025: case 10: {
1026: currentTokType = BOLD;
1027: yybegin(THREE_SINGLE_QUOTES_STATE);
1028: }
1029: case 85:
1030: break;
1031: case 26: {
1032: numLinkToks = 0;
1033: yybegin(YYINITIAL);
1034: }
1035: case 86:
1036: break;
1037: default:
1038: if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
1039: zzAtEOF = true;
1040: return YYEOF;
1041: } else {
1042: zzScanError(ZZ_NO_MATCH);
1043: }
1044: }
1045: }
1046: }
1047:
1048: }
|