001: package org.apache.lucene.analysis.el;
002:
003: /**
004: * Copyright 2005 The Apache Software Foundation
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
/**
 * GreekCharsets contains encoding schemes (charsets) and a toLowerCase() method implementation
 * for Greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
 * Each encoding scheme lists the lowercase characters first (positions 0-35), followed by the
 * uppercase characters (positions 36-68), including accented ones. Other encoding schemes
 * (see RFC 1947) can be supported by adding the definition of a new charset as well as the
 * required logic in the toLowerCase() method.
 *
 * @author Panagiotis Astithas, past@ebs.gr
 */
public class GreekCharsets {
    /**
     * Greek letters as Unicode code points: 36 lowercase entries (indices 0-35)
     * followed by 33 uppercase entries (indices 36-68), accented forms included.
     */
    public static char[] UnicodeGreek = {
        // lower case
        '\u0390', '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0',
        '\u03B1', '\u03B2', '\u03B3', '\u03B4', '\u03B5', '\u03B6',
        '\u03B7', '\u03B8', '\u03B9', '\u03BA', '\u03BB', '\u03BC',
        '\u03BD', '\u03BE', '\u03BF', '\u03C0', '\u03C1', '\u03C2',
        '\u03C3', '\u03C4', '\u03C5', '\u03C6', '\u03C7', '\u03C8',
        '\u03C9', '\u03CA',
        '\u03CB',
        '\u03CC',
        '\u03CD',
        '\u03CE',
        // upper case
        '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E',
        '\u038F', '\u0391', '\u0392', '\u0393', '\u0394', '\u0395',
        '\u0396', '\u0397', '\u0398', '\u0399', '\u039A', '\u039B',
        '\u039C', '\u039D', '\u039E', '\u039F', '\u03A0', '\u03A1',
        '\u03A3', '\u03A4', '\u03A5', '\u03A6', '\u03A7', '\u03A8',
        '\u03A9', '\u03AA', '\u03AB' };

    /**
     * Greek letters as ISO-8859-7 (ELOT-928) byte values, in the same
     * lowercase-then-uppercase order as {@link #UnicodeGreek}.
     */
    public static char[] ISO = {
        // lower case
        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4,
        0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee,
        0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
        0xf9, 0xfa, 0xfb, 0xfc,
        0xfd,
        0xfe,
        // upper case
        0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3,
        0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
        0xce, 0xcf, 0xd0, 0xd1, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,
        0xd9, 0xda, 0xdb };

    /**
     * Greek letters as Windows CP1253 byte values. Identical to {@link #ISO}
     * except that capital alpha with acute is 0xa2 instead of 0xb6.
     */
    public static char[] CP1253 = {
        // lower case
        0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4,
        0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee,
        0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
        0xf9, 0xfa, 0xfb, 0xfc,
        0xfd,
        0xfe,
        // upper case
        0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe, 0xbf, 0xc1, 0xc2, 0xc3,
        0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
        0xce, 0xcf, 0xd0, 0xd1, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,
        0xd9, 0xda, 0xdb };

    /**
     * Folds a Greek letter to its plain lowercase form within the given charset:
     * upper case becomes lower case, accents and diaereses are dropped, and the
     * small final sigma is replaced by the regular small sigma. The result is
     * always expressed in the same encoding as the input.
     *
     * @param letter  the character to fold, encoded according to <code>charset</code>
     * @param charset one of {@link #UnicodeGreek}, {@link #ISO} or {@link #CP1253};
     *                the charset is recognised by array identity, not by content
     * @return the folded character; characters that are not Greek letters of the
     *         charset, and any character passed with an unrecognised charset, are
     *         returned as <code>Character.toLowerCase(letter)</code>
     */
    public static char toLowerCase(char letter, char[] charset) {
        if (charset == UnicodeGreek) {
            return unicodeToLowerCase(letter);
        }
        if (charset == ISO) {
            return singleByteToLowerCase(letter, (char) 0xb6);
        }
        if (charset == CP1253) {
            return singleByteToLowerCase(letter, (char) 0xa2);
        }
        return Character.toLowerCase(letter);
    }

    /** Folds a Unicode Greek letter to its unaccented lowercase code point. */
    private static char unicodeToLowerCase(char letter) {
        // Lower case, not accented: unchanged, except final sigma -> sigma.
        if (letter >= '\u03B1' && letter <= '\u03C9') {
            return letter == '\u03C2' ? '\u03C3' : letter;
        }
        // Upper case, not accented: the lowercase block sits 32 positions higher.
        if (letter >= '\u0391' && letter <= '\u03A9') {
            return (char) (letter + 32);
        }
        // Accented letters, lower and upper case, map to the bare vowel.
        switch (letter) {
            case '\u03AC': // small alpha with acute
            case '\u0386': // capital alpha with acute
                return '\u03B1';
            case '\u03AD': // small epsilon with acute
            case '\u0388': // capital epsilon with acute
                return '\u03B5';
            case '\u03AE': // small eta with acute
            case '\u0389': // capital eta with acute
                return '\u03B7';
            case '\u03AF': // small iota with acute
            case '\u03CA': // small iota with diaeresis
            case '\u0390': // small iota with acute and diaeresis
            case '\u038A': // capital iota with acute
            case '\u03AA': // capital iota with diaeresis
                return '\u03B9';
            case '\u03CD': // small upsilon with acute
            case '\u03CB': // small upsilon with diaeresis
            case '\u03B0': // small upsilon with acute and diaeresis
            case '\u038E': // capital upsilon with acute
            case '\u03AB': // capital upsilon with diaeresis
                return '\u03C5';
            case '\u03CC': // small omicron with acute
            case '\u038C': // capital omicron with acute
                return '\u03BF';
            case '\u03CE': // small omega with acute
            case '\u038F': // capital omega with acute
                return '\u03C9';
            default:
                return Character.toLowerCase(letter);
        }
    }

    /**
     * Folds an ISO-8859-7 or CP1253 Greek letter to its unaccented lowercase
     * byte value. The two code pages agree on every Greek letter handled here
     * except capital alpha with acute, whose position is passed in by the caller.
     */
    private static char singleByteToLowerCase(char letter, char capitalAlphaWithAcute) {
        // Lower case, not accented: unchanged, except final sigma -> sigma.
        if (letter >= 0xe1 && letter <= 0xf9) {
            return letter == 0xf2 ? (char) 0xf3 : letter;
        }
        // Upper case, not accented: the lowercase block sits 32 positions higher.
        if (letter >= 0xc1 && letter <= 0xd9) {
            return (char) (letter + 32);
        }
        if (letter == capitalAlphaWithAcute) {
            return 0xe1;
        }
        // Remaining accented letters, lower and upper case, map to the bare vowel.
        switch (letter) {
            case 0xdc: // small alpha with acute
                return 0xe1;
            case 0xdd: // small epsilon with acute
            case 0xb8: // capital epsilon with acute
                return 0xe5;
            case 0xde: // small eta with acute
            case 0xb9: // capital eta with acute
                return 0xe7;
            case 0xdf: // small iota with acute
            case 0xfa: // small iota with diaeresis
            case 0xc0: // small iota with acute and diaeresis
            case 0xba: // capital iota with acute
            case 0xda: // capital iota with diaeresis
                // Must stay in the single-byte encoding (0xe9), never the
                // Unicode code point for iota.
                return 0xe9;
            case 0xfd: // small upsilon with acute
            case 0xfb: // small upsilon with diaeresis
            case 0xe0: // small upsilon with acute and diaeresis
            case 0xbe: // capital upsilon with acute
            case 0xdb: // capital upsilon with diaeresis
                return 0xf5;
            case 0xfc: // small omicron with acute
            case 0xbc: // capital omicron with acute
                return 0xef;
            case 0xfe: // small omega with acute
            case 0xbf: // capital omega with acute
                return 0xf9;
            default:
                return Character.toLowerCase(letter);
        }
    }
}
|