001: /*
002: *
003: * @(#)CollationRules.java 1.29 06/10/03
004: *
005: * Portions Copyright 2000-2006 Sun Microsystems, Inc. All Rights
006: * Reserved. Use is subject to license terms.
007: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
008: *
009: * This program is free software; you can redistribute it and/or
010: * modify it under the terms of the GNU General Public License version
011: * 2 only, as published by the Free Software Foundation.
012: *
013: * This program is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of
015: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
016: * General Public License version 2 for more details (a copy is
017: * included at /legal/license.txt).
018: *
019: * You should have received a copy of the GNU General Public License
020: * version 2 along with this work; if not, write to the Free Software
021: * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
022: * 02110-1301 USA
023: *
024: * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
025: * Clara, CA 95054 or visit www.sun.com if you need additional
026: * information or have any questions.
027: */
028:
029: /*
030: * (C) Copyright Taligent, Inc. 1996,1997 - All Rights Reserved
031: * (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved
032: *
033: * The original version of this source code and documentation is copyrighted
034: * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
035: * materials are provided under terms of a License Agreement between Taligent
036: * and Sun. This technology is protected by multiple US and International
037: * patents. This notice and attribution to Taligent may not be removed.
038: * Taligent is a registered trademark of Taligent, Inc.
039: *
040: */
041:
042: package java.text;
043:
/**
 * CollationRules contains the default en_US collation rules as a base
 * for building other collation tables.
 * <p>Note that decompositions are done before these rules are used,
 * so they do not have to contain accented characters, such as A-grave.
 * <p>The rule text uses the syntax documented on {@link RuleBasedCollator}:
 * {@code =} (identical), {@code ;} (secondary difference), {@code ,}
 * (tertiary difference), {@code <} (primary difference) and {@code &}
 * (reset to a previously defined position). Characters that have a special
 * meaning in that syntax, and whitespace, are wrapped in single quotes.
 *
 * @see RuleBasedCollator
 * @see LocaleElements
 * @version 1.22, 01/19/00
 * @author Helena Shih, Mark Davis
 */
final class CollationRules {
    /**
     * The default rule string, laid out in this order:
     * <ol>
     * <li>completely ignorable format and control characters (every entry
     *     before the first {@code <});</li>
     * <li>spaces, non-spacing accents and dashes, which are ignored except
     *     for secondary/tertiary differences;</li>
     * <li>punctuation and symbols, including currency signs;</li>
     * <li>digits and the vulgar fractions;</li>
     * <li>the Latin letters a-z (lower case before upper case), with eth,
     *     sharp s, thorn and the AE/OE ligatures attached to their
     *     expansions.</li>
     * </ol>
     * <p>This is a plain concatenation of string literals, i.e. a
     * compile-time constant: no extra {@code String} copy is allocated when
     * the class is initialized. The pieces must stay in exactly this order,
     * because the position of an entry in the text is what defines its
     * collation order.
     */
    static final String DEFAULTRULES =
        "" // no FRENCH accent order by default, add in French Delta
        // IGNORABLES (up to first < character)
        // COMPLETELY IGNORE format characters
        + "='\u200B'=\u200C=\u200D=\u200E=\u200F"
        // Control Characters
        + "=\u0000 =\u0001 =\u0002 =\u0003 =\u0004" // null, .. eot
        + "=\u0005 =\u0006 =\u0007 =\u0008 ='\u0009'" // enq, ...
        + "='\u000b' =\u000e" // vt, so
        + "=\u000f ='\u0010' =\u0011 =\u0012 =\u0013" // si, dle, dc1, dc2, dc3
        + "=\u0014 =\u0015 =\u0016 =\u0017 =\u0018" // dc4, nak, syn, etb, can
        + "=\u0019 =\u001a =\u001b =\u001c =\u001d" // em, sub, esc, fs, gs
        + "=\u001e =\u001f =\u007f" // rs, us, del
        // ....then the C1 Latin 1 reserved control codes
        + "=\u0080 =\u0081 =\u0082 =\u0083 =\u0084 =\u0085"
        + "=\u0086 =\u0087 =\u0088 =\u0089 =\u008a =\u008b"
        + "=\u008c =\u008d =\u008e =\u008f =\u0090 =\u0091"
        + "=\u0092 =\u0093 =\u0094 =\u0095 =\u0096 =\u0097"
        + "=\u0098 =\u0099 =\u009a =\u009b =\u009c =\u009d"
        + "=\u009e =\u009f"

        // IGNORE except for secondary, tertiary difference
        // Spaces
        + ";'\u0020';'\u00A0'" // spaces
        + ";'\u2000';'\u2001';'\u2002';'\u2003';'\u2004'" // spaces
        + ";'\u2005';'\u2006';'\u2007';'\u2008';'\u2009'" // spaces
        + ";'\u200A';'\u3000';'\uFEFF'" // spaces
        // NOTE(review): horizontal tab and vertical tab are also listed among
        // the completely ignorable control characters above; confirm which of
        // the two entries is meant to take effect.
        + ";'\r' ;'\t' ;'\n';'\f';'\u000b'" // whitespace

        // Non-spacing accents

        + ";\u0301" // non-spacing acute accent
        + ";\u0300" // non-spacing grave accent
        + ";\u0306" // non-spacing breve accent
        + ";\u0302" // non-spacing circumflex accent
        + ";\u030c" // non-spacing caron/hacek accent
        + ";\u030a" // non-spacing ring above accent
        + ";\u030d" // non-spacing vertical line above
        + ";\u0308" // non-spacing diaeresis accent
        + ";\u030b" // non-spacing double acute accent
        + ";\u0303" // non-spacing tilde accent
        + ";\u0307" // non-spacing dot above/overdot accent
        + ";\u0304" // non-spacing macron accent
        + ";\u0337" // non-spacing short slash overlay (overstruck diacritic)
        + ";\u0327" // non-spacing cedilla accent
        + ";\u0328" // non-spacing ogonek accent
        + ";\u0323" // non-spacing dot-below/underdot accent
        + ";\u0332" // non-spacing underscore/underline accent
        // with the rest of the general diacritical marks in binary order
        + ";\u0305" // non-spacing overscore/overline
        + ";\u0309" // non-spacing hook above
        + ";\u030e" // non-spacing double vertical line above
        + ";\u030f" // non-spacing double grave
        + ";\u0310" // non-spacing chandrabindu
        + ";\u0311" // non-spacing inverted breve
        + ";\u0312" // non-spacing turned comma above/cedilla above
        + ";\u0313" // non-spacing comma above
        + ";\u0314" // non-spacing reversed comma above
        + ";\u0315" // non-spacing comma above right
        + ";\u0316" // non-spacing grave below
        + ";\u0317" // non-spacing acute below
        + ";\u0318" // non-spacing left tack below
        + ";\u0319" // non-spacing tack below
        + ";\u031a" // non-spacing left angle above
        + ";\u031b" // non-spacing horn
        + ";\u031c" // non-spacing left half ring below
        + ";\u031d" // non-spacing up tack below
        + ";\u031e" // non-spacing down tack below
        + ";\u031f" // non-spacing plus sign below
        + ";\u0320" // non-spacing minus sign below
        + ";\u0321" // non-spacing palatalized hook below
        + ";\u0322" // non-spacing retroflex hook below
        + ";\u0324" // non-spacing double dot below
        + ";\u0325" // non-spacing ring below
        + ";\u0326" // non-spacing comma below
        + ";\u0329" // non-spacing vertical line below
        + ";\u032a" // non-spacing bridge below
        + ";\u032b" // non-spacing inverted double arch below
        + ";\u032c" // non-spacing hacek below
        + ";\u032d" // non-spacing circumflex below
        + ";\u032e" // non-spacing breve below
        + ";\u032f" // non-spacing inverted breve below
        + ";\u0330" // non-spacing tilde below
        + ";\u0331" // non-spacing macron below
        + ";\u0333" // non-spacing double underscore
        + ";\u0334" // non-spacing tilde overlay
        + ";\u0335" // non-spacing short bar overlay
        + ";\u0336" // non-spacing long bar overlay
        + ";\u0338" // non-spacing long slash overlay
        + ";\u0339" // non-spacing right half ring below
        + ";\u033a" // non-spacing inverted bridge below
        + ";\u033b" // non-spacing square below
        + ";\u033c" // non-spacing seagull below
        + ";\u033d" // non-spacing x above
        + ";\u033e" // non-spacing vertical tilde
        + ";\u033f" // non-spacing double overscore
        // U+0340 (grave tone mark) and U+0341 (acute tone mark) are
        // deliberately left out: they are the same as U+0300 and U+0301.
        // The trailing ';' of the next literal introduces U+0344 below.
        + ";\u0342;"
        // U+0343 is deliberately left out: it is the same as U+0313.
        + "\u0344;\u0345;\u0360;\u0361" // newer
        + ";\u0483;\u0484;\u0485;\u0486" // Cyrillic accents

        + ";\u20D0;\u20D1;\u20D2" // symbol accents
        + ";\u20D3;\u20D4;\u20D5" // symbol accents
        + ";\u20D6;\u20D7;\u20D8" // symbol accents
        + ";\u20D9;\u20DA;\u20DB" // symbol accents
        + ";\u20DC;\u20DD;\u20DE" // symbol accents
        + ";\u20DF;\u20E0;\u20E1" // symbol accents

        + ",'\u002D';\u00AD" // dashes
        + ";\u2010;\u2011;\u2012" // dashes
        + ";\u2013;\u2014;\u2015" // dashes
        + ";\u2212" // dashes

        // other punctuation

        + "<'\u005f'" // underline/underscore (spacing)
        + "<\u00af" // overline or macron (spacing)
        + "<'\u002c'" // comma (spacing)
        + "<'\u003b'" // semicolon
        + "<'\u003a'" // colon
        + "<'\u0021'" // exclamation point
        + "<\u00a1" // inverted exclamation point
        + "<'\u003f'" // question mark
        + "<\u00bf" // inverted question mark
        + "<'\u002f'" // slash
        + "<'\u002e'" // period/full stop
        + "<\u00b4" // acute accent (spacing)
        + "<'\u0060'" // grave accent (spacing)
        + "<'\u005e'" // circumflex accent (spacing)
        + "<\u00a8" // diaeresis/umlaut accent (spacing)
        + "<'\u007e'" // tilde accent (spacing)
        + "<\u00b7" // middle dot (spacing)
        + "<\u00b8" // cedilla accent (spacing)
        + "<'\u0027'" // apostrophe
        + "<'\"'" // quotation marks
        + "<\u00ab" // left angle quotes
        + "<\u00bb" // right angle quotes
        + "<'\u0028'" // left parenthesis
        + "<'\u0029'" // right parenthesis
        + "<'\u005b'" // left bracket
        + "<'\u005d'" // right bracket
        + "<'\u007b'" // left brace
        + "<'\u007d'" // right brace
        + "<\u00a7" // section symbol
        + "<\u00b6" // paragraph symbol
        + "<\u00a9" // copyright symbol
        + "<\u00ae" // registered trademark symbol
        + "<'\u0040'" // at sign
        + "<\u00a4" // international currency symbol
        + "<\u0e3f" // baht sign
        + "<\u00a2" // cent sign
        + "<\u20a1" // colon sign
        + "<\u20a2" // cruzeiro sign
        + "<'\u0024'" // dollar sign
        + "<\u20ab" // dong sign
        + "<\u20ac" // euro sign
        + "<\u20a3" // franc sign
        + "<\u20a4" // lira sign
        + "<\u20a5" // mill sign
        + "<\u20a6" // naira sign
        + "<\u20a7" // peseta sign
        + "<\u00a3" // pound-sterling sign
        + "<\u20a8" // rupee sign
        + "<\u20aa" // new shekel sign
        + "<\u20a9" // won sign
        + "<\u00a5" // yen sign
        + "<'\u002a'" // asterisk
        + "<'\\'" // backslash
        + "<'\u0026'" // ampersand
        + "<'\u0023'" // number sign
        + "<'\u0025'" // percent sign
        + "<'\u002b'" // plus sign
        + "<\u00b1" // plus-or-minus sign
        + "<\u00f7" // divide sign
        + "<\u00d7" // multiply sign
        + "<'\u003c'" // less-than sign
        + "<'\u003d'" // equal sign
        + "<'\u003e'" // greater-than sign
        + "<\u00ac" // end of line symbol/logical NOT symbol
        + "<'\u007c'" // vertical line/logical OR symbol
        + "<\u00a6" // broken vertical line
        + "<\u00b0" // degree symbol
        + "<\u00b5" // micro symbol

        // NUMERICS

        + "<0<1<2<3<4<5<6<7<8<9"
        + "<\u00bc<\u00bd<\u00be" // 1/4,1/2,3/4 fractions

        // NON-IGNORABLES
        + "<a,A"
        + "<b,B"
        + "<c,C"
        + "<d,D"
        + "<\u00F0,\u00D0" // eth
        + "<e,E"
        + "<f,F"
        + "<g,G"
        + "<h,H"
        + "<i,I"
        + "<j,J"
        + "<k,K"
        + "<l,L"
        + "<m,M"
        + "<n,N"
        + "<o,O"
        + "<p,P"
        + "<q,Q"
        + "<r,R"
        + "<s, S & SS,\u00DF" // s-zet
        + "<t,T"
        + "& TH, \u00DE &TH, \u00FE " // thorn
        + "<u,U"
        + "<v,V"
        + "<w,W"
        + "<x,X"
        + "<y,Y"
        + "<z,Z"
        + "&AE,\u00C6" // ae & AE ligature
        + "&AE,\u00E6"
        + "&OE,\u0152" // oe & OE ligature
        + "&OE,\u0153";
}
|