01: /**
02: * Licensed to the Apache Software Foundation (ASF) under one or more
03: * contributor license agreements. See the NOTICE file distributed with
04: * this work for additional information regarding copyright ownership.
05: * The ASF licenses this file to You under the Apache License, Version 2.0
06: * (the "License"); you may not use this file except in compliance with
07: * the License. You may obtain a copy of the License at
08: *
09: * http://www.apache.org/licenses/LICENSE-2.0
10: *
11: * Unless required by applicable law or agreed to in writing, software
12: * distributed under the License is distributed on an "AS IS" BASIS,
13: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14: * See the License for the specific language governing permissions and
15: * limitations under the License.
 */
package org.apache.lucene.wikipedia.analysis;
17:
18: import org.apache.lucene.analysis.Token;
19: import org.apache.lucene.analysis.Tokenizer;
20:
21: import java.io.Reader;
22: import java.io.IOException;
23:
24: /**
25: * Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based off of the
26: * Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
27: *<p/>
28: * EXPERIMENTAL !!!!!!!!!
29: * NOTE: This Tokenizer is considered experimental and the grammar is subject to change in the trunk and in follow up releases.
30: **/
31: public class WikipediaTokenizer extends Tokenizer {
32: public static final String INTERNAL_LINK = "il";
33: public static final String EXTERNAL_LINK = "el";
34: //The URL part of the link, i.e. the first token
35: public static final String EXTERNAL_LINK_URL = "elu";
36: public static final String CITATION = "ci";
37: public static final String CATEGORY = "c";
38: public static final String BOLD = "b";
39: public static final String ITALICS = "i";
40: public static final String BOLD_ITALICS = "bi";
41: public static final String HEADING = "h";
42: public static final String SUB_HEADING = "sh";
43: /**
44: * A private instance of the JFlex-constructed scanner
45: */
46: private final WikipediaTokenizerImpl scanner;
47:
48: void setInput(Reader reader) {
49: this .input = reader;
50: }
51:
52: /**
53: * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
54: * <code>input</code> to a newly created JFlex scanner.
55: * @param input The Input Reader
56: */
57: public WikipediaTokenizer(Reader input) {
58: this .input = input;
59: this .scanner = new WikipediaTokenizerImpl(input);
60: }
61:
62: /*
63: * (non-Javadoc)
64: *
65: * @see org.apache.lucene.analysis.TokenStream#next()
66: */
67: public Token next(Token result) throws IOException {
68: int tokenType = scanner.getNextToken();
69:
70: if (tokenType == WikipediaTokenizerImpl.YYEOF) {
71: return null;
72: }
73:
74: scanner.getText(result, tokenType);
75: final int start = scanner.yychar();
76: result.setStartOffset(start);
77: result.setEndOffset(start + result.termLength());
78: result.setPositionIncrement(scanner.getPositionIncrement());
79: result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
80: return result;
81: }
82:
83: /*
84: * (non-Javadoc)
85: *
86: * @see org.apache.lucene.analysis.TokenStream#reset()
87: */
88: public void reset() throws IOException {
89: super .reset();
90: scanner.yyreset(input);
91: }
92:
93: public void reset(Reader reader) throws IOException {
94: input = reader;
95: reset();
96: }
97:
98: }
|