01: package org.apache.lucene.analysis.sinks;
02:
03: /**
04: * Licensed to the Apache Software Foundation (ASF) under one or more
05: * contributor license agreements. See the NOTICE file distributed with
06: * this work for additional information regarding copyright ownership.
07: * The ASF licenses this file to You under the Apache License, Version 2.0
08: * (the "License"); you may not use this file except in compliance with
09: * the License. You may obtain a copy of the License at
10: *
11: * http://www.apache.org/licenses/LICENSE-2.0
12: *
13: * Unless required by applicable law or agreed to in writing, software
14: * distributed under the License is distributed on an "AS IS" BASIS,
15: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16: * See the License for the specific language governing permissions and
17: * limitations under the License.
18: */
19:
20: import org.apache.lucene.analysis.SinkTokenizer;
21: import org.apache.lucene.analysis.Token;
22:
23: import java.text.DateFormat;
24: import java.text.SimpleDateFormat;
25: import java.text.ParseException;
26: import java.util.List;
27: import java.util.Date;
28:
29: /**
30: * Attempts to parse the {@link org.apache.lucene.analysis.Token#termBuffer()} as a Date using a {@link java.text.DateFormat}.
31: * If the value is a Date, it will add it to the sink.
32: * <p/>
33: * Also marks the sink token with {@link org.apache.lucene.analysis.Token#type()} equal to {@link #DATE_TYPE}
34: *
35: *
36: **/
37: public class DateRecognizerSinkTokenizer extends SinkTokenizer {
38: public static final String DATE_TYPE = "date";
39:
40: protected DateFormat dateFormat;
41:
42: /**
43: * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
44: */
45: public DateRecognizerSinkTokenizer() {
46: this (null, SimpleDateFormat.getDateInstance());
47: }
48:
49: public DateRecognizerSinkTokenizer(DateFormat dateFormat) {
50: this (null, dateFormat);
51: }
52:
53: /**
54: * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
55: * @param input The input list of Tokens that are already Dates. They should be marked as type {@link #DATE_TYPE} for completeness
56: */
57: public DateRecognizerSinkTokenizer(List/*<Token>*/input) {
58: this (input, SimpleDateFormat.getDateInstance());
59: }
60:
61: /**
62: *
63: * @param input
64: * @param dateFormat The date format to use to try and parse the date. Note, this SinkTokenizer makes no attempt to synchronize the DateFormat object
65: */
66: public DateRecognizerSinkTokenizer(List/*<Token>*/input,
67: DateFormat dateFormat) {
68: super (input);
69: this .dateFormat = dateFormat;
70: }
71:
72: public void add(Token t) {
73: //Check to see if this token is a date
74: if (t != null) {
75: try {
76: Date date = dateFormat.parse(new String(t.termBuffer(),
77: 0, t.termLength()));//We don't care about the date, just that we can parse it as a date
78: if (date != null) {
79: t.setType(DATE_TYPE);
80: lst.add(t.clone());
81: }
82: } catch (ParseException e) {
83:
84: }
85: }
86:
87: }
88: }
|