001: package org.apache.lucene.analysis.ru;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
/**
 * RussianCharsets contains encoding schemes (charsets) and a toLowerCase() implementation
 * for Russian characters in Unicode, KOI8 and CP1251.
 * Each encoding scheme contains lowercase (positions 0-31) and uppercase (positions 32-63) characters.
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to the toLowerCase() method for that charset.
 * <p>
 * The charset tables are also used as identity tokens: {@link #toLowerCase(char, char[])} selects
 * its mapping by comparing the supplied array reference against the tables declared here. The
 * fields are therefore final; callers must not modify the array contents either.
 *
 * @author Boris Okner, b.okner@rogers.com
 * @version $Id: RussianCharsets.java 472959 2006-11-09 16:21:50Z yonik $
 */
public class RussianCharsets {
  /**
   * Distance between the uppercase and the lowercase code of the same letter. It is the same
   * magnitude in all three tables; only the direction differs (see toLowerCase).
   */
  private static final int CASE_OFFSET = 32;

  /** Unicode Russian charset: lowercase letters U+0430..U+044F followed by uppercase U+0410..U+042F. */
  public static final char[] UnicodeRussian = {
    // lower case
    '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', '\u0435', '\u0436', '\u0437',
    '\u0438', '\u0439', '\u043A', '\u043B', '\u043C', '\u043D', '\u043E', '\u043F',
    '\u0440', '\u0441', '\u0442', '\u0443', '\u0444', '\u0445', '\u0446', '\u0447',
    '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', '\u044D', '\u044E', '\u044F',
    // upper case
    '\u0410', '\u0411', '\u0412', '\u0413', '\u0414', '\u0415', '\u0416', '\u0417',
    '\u0418', '\u0419', '\u041A', '\u041B', '\u041C', '\u041D', '\u041E', '\u041F',
    '\u0420', '\u0421', '\u0422', '\u0423', '\u0424', '\u0425', '\u0426', '\u0427',
    '\u0428', '\u0429', '\u042A', '\u042B', '\u042C', '\u042D', '\u042E', '\u042F'
  };

  /**
   * KOI8 charset: lowercase codes (0xC0..0xDF) followed by uppercase codes (0xE0..0xFF).
   * Entries are listed in the same letter order as {@link #UnicodeRussian}, which is why the
   * code values are not ascending.
   */
  public static final char[] KOI8 = {
    // lower case
    0xc1, 0xc2, 0xd7, 0xc7, 0xc4, 0xc5, 0xd6, 0xda,
    0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
    0xd2, 0xd3, 0xd4, 0xd5, 0xc6, 0xc8, 0xc3, 0xde,
    0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1,
    // upper case
    0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
    0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
    0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
    0xfb, 0xfd, 0xff, 0xf9, 0xf8, 0xfc, 0xe0, 0xf1
  };

  /** CP1251 charset: lowercase codes (0xE0..0xFF) followed by uppercase codes (0xC0..0xDF). */
  public static final char[] CP1251 = {
    // lower case
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    // upper case
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF
  };

  /**
   * Converts a character to lower case according to the given charset.
   * <p>
   * The charset is recognised by reference identity, so one of {@link #UnicodeRussian},
   * {@link #KOI8} or {@link #CP1251} must be passed as-is; a copy of a table, any other array,
   * or {@code null} is treated as an unknown charset.
   *
   * @param letter  the character (or single-byte code stored in a char) to convert
   * @param charset the charset table that {@code letter} is encoded in
   * @return the lowercase form when {@code letter} is an uppercase Russian letter of the given
   *         charset; {@code letter} itself when it is already a lowercase Russian letter of that
   *         charset; otherwise the result of {@link Character#toLowerCase(char)}
   */
  public static char toLowerCase(char letter, char[] charset) {
    if (charset == UnicodeRussian) {
      if (letter >= '\u0410' && letter <= '\u042F') {
        // Unicode places the uppercase block below the lowercase one.
        return (char) (letter + CASE_OFFSET);
      }
      if (letter >= '\u0430' && letter <= '\u044F') {
        return letter;
      }
    } else if (charset == KOI8) {
      if (letter >= 0xe0 && letter <= 0xff) {
        // KOI8 is the odd one out: uppercase codes sit above lowercase codes.
        return (char) (letter - CASE_OFFSET);
      }
      if (letter >= 0xc0 && letter <= 0xdf) {
        return letter;
      }
    } else if (charset == CP1251) {
      if (letter >= 0xC0 && letter <= 0xDF) {
        return (char) (letter + CASE_OFFSET);
      }
      if (letter >= 0xE0 && letter <= 0xFF) {
        return letter;
      }
    }

    // Not a Russian letter of the selected charset (or unknown charset): use the generic mapping.
    return Character.toLowerCase(letter);
  }
}
|