Source Code Cross Referenced for UCharacterPropertyReader.java in » 6.0-JDK-Modules-sun » text » sun » text » normalizer » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » 6.0 JDK Modules sun » text » sun.text.normalizer

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /*
002:         * Portions Copyright 2005 Sun Microsystems, Inc.  All Rights Reserved.
003:         * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
004:         *
005:         * This code is free software; you can redistribute it and/or modify it
006:         * under the terms of the GNU General Public License version 2 only, as
007:         * published by the Free Software Foundation.  Sun designates this
008:         * particular file as subject to the "Classpath" exception as provided
009:         * by Sun in the LICENSE file that accompanied this code.
010:         *
011:         * This code is distributed in the hope that it will be useful, but WITHOUT
012:         * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
013:         * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
014:         * version 2 for more details (a copy is included in the LICENSE file that
015:         * accompanied this code).
016:         *
017:         * You should have received a copy of the GNU General Public License version
018:         * 2 along with this work; if not, write to the Free Software Foundation,
019:         * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
020:         *
021:         * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
022:         * CA 95054 USA or visit www.sun.com if you need additional information or
023:         * have any questions.
024:         */
025:
026:        /*
027:         *******************************************************************************
028:         * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
029:         *                                                                             *
030:         * The original version of this source code and documentation is copyrighted   *
031:         * and owned by IBM, These materials are provided under terms of a License     *
032:         * Agreement between IBM and Sun. This technology is protected by multiple     *
033:         * US and International patents. This notice and attribution to IBM may not    *
034:         * to removed.                                                                 *
035:         *******************************************************************************
036:         */
037:
038:        package sun.text.normalizer;
039:
040:        import java.io.InputStream;
041:        import java.io.DataInputStream;
042:        import java.io.IOException;
043:
044:        /**
045:         * <p>Internal reader class for ICU data file uprops.icu containing 
046:         * Unicode codepoint data.</p> 
047:         * <p>This class simply reads uprops.icu, authenticates that it is a valid
048:         * ICU data file and splits its contents up into blocks of data for use in
049:         * <a href="UCharacterProperty.html">com.ibm.icu.impl.UCharacterProperty</a>.
050:         * </p> 
051:         * <p>uprops.icu, which is in big-endian format, is packaged in the same jar
052:         * as this package.</p>
053:         * @author Syn Wee Quek
054:         * @since release 2.1, February 1st 2002
055:         * @draft 2.1
056:         */
057:        /* Unicode character properties file format ------------------------------------
058:
059:         The file format prepared and written here contains several data
060:         structures that store indexes or data.
061:
062:
063:
064:         The following is a description of format version 3 .
065:
066:         Data contents:
067:
068:         The contents is a parsed, binary form of several Unicode character
069:         database files, most prominently UnicodeData.txt.
070:
071:         Any Unicode code point from 0 to 0x10ffff can be looked up to get
072:         the properties, if any, for that code point. This means that the input
073:         to the lookup are 21-bit unsigned integers, with not all of the
074:         21-bit range used.
075:
076:         It is assumed that client code keeps a uint32_t pointer
077:         to the beginning of the data:
078:
079:         const uint32_t *p32;
080:
081:         Formally, the file contains the following structures:
082:
083:         const int32_t indexes[16] with values i0..i15:
084:
085:         i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
086:         i1 exceptionsIndex;  -- 32-bit unit index to the table of 32-bit exception words
087:         i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
088:
089:         i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
090:         i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
091:         i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
092:
093:         i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
094:         i7..i9 reservedIndexes; -- reserved values; 0 for now
095:
096:         i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+)
097:         i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2)
098:         i12..i15 reservedIndexes; -- reserved values; 0 for now
099:
100:         PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
101:
102:         P  const uint32_t props32[i1-i0];
103:         E  const uint32_t exceptions[i2-i1];
104:         U  const UChar uchars[2*(i3-i2)];
105:
106:         AT serialized trie for additional properties (byte size: 4*(i4-i3))
107:         PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
108:
109:         Trie lookup and properties:
110:
111:         In order to condense the data for the 21-bit code space, several properties of
112:         the Unicode code assignment are exploited:
113:         - The code space is sparse.
114:         - There are several runs of tens of thousands of consecutive codes with the same properties.
115:         - Characters and scripts are allocated in groups of 16 code points.
116:         - Inside blocks for scripts the properties are often repetitive.
117:         - The 21-bit space is not fully used for Unicode.
118:
119:         The lookup of properties for a given code point is done with a trie lookup,
120:         using the UTrie implementation.
121:         The trie lookup result is a 16-bit index in the props32[] table where the
122:         actual 32-bit properties word is stored. This is done to save space.
123:
124:         (There are thousands of 16-bit entries in the trie data table, but
125:         only a few hundred unique 32-bit properties words.
126:         If the trie data table contained 32-bit words directly, then that would be
127:         larger because the length of the table would be the same as now but the
128:         width would be 32 bits instead of 16. This saves more than 10kB.)
129:
130:         With a given Unicode code point
131:
132:         UChar32 c;
133:
134:         and 0<=c<0x110000, the lookup is done like this:
135:
136:         uint16_t i;
137:         UTRIE_GET16(c, i);
138:         uint32_t props=p32[i];
139:
140:         For some characters, not all of the properties can be efficiently encoded
141:         using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
142:         array:
143:
144:         if(props&EXCEPTION_BIT) {
145:         uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
146:         ...
147:         }
148:
149:         The exception values are a variable number of uint32_t starting at
150:
151:         const uint32_t *pe=p32+exceptionsIndex+e;
152:
153:         The first uint32_t there contains flags about what values actually follow it.
154:         Some of the exception values are UChar32 code points for the case mappings,
155:         others are numeric values etc.
156:
157:         32-bit properties sets:
158:
159:         Each 32-bit properties word contains:
160:
161:         0.. 4  general category
162:         5      has exception values
163:         6..10  BiDi category
164:         11      is mirrored
165:         12..14  numericType:
166:         0 no numeric value
167:         1 decimal digit value
168:         2 digit value
169:         3 numeric value
170:         ### TODO: type 4 for Han digits & numbers?!
171:         15..19  reserved
172:         20..31  value according to bits 0..5:
173:         if(has exception) {
174:         exception index;
175:         } else switch(general category) {
176:         case Ll: delta to uppercase; -- same as titlecase
177:         case Lu: -delta to lowercase; -- titlecase is same as c
178:         case Lt: -delta to lowercase; -- uppercase is same as c
179:         default:
180:         if(is mirrored) {
181:         delta to mirror;
182:         } else if(numericType!=0) {
183:         numericValue;
184:         } else {
185:         0;
186:         };
187:         }
188:
189:         Exception values:
190:
191:         In the first uint32_t exception word for a code point,
192:         bits
193:         31..16  reserved
194:         15..0   flags that indicate which values follow:
195:
196:         bit
197:         0      has uppercase mapping
198:         1      has lowercase mapping
199:         2      has titlecase mapping
200:         3      unused
201:         4      has numeric value (numerator)
202:         if numericValue=0x7fffff00+x then numericValue=10^x
203:         5      has denominator value
204:         6      has a mirror-image Unicode code point
205:         7      has SpecialCasing.txt entries
206:         8      has CaseFolding.txt entries
207:
208:         According to the flags in this word, one or more uint32_t words follow it
209:         in the sequence of the bit flags in the flags word; if a flag is not set,
210:         then the value is missing or 0:
211:
212:         For the case mappings and the mirror-image Unicode code point,
213:         one uint32_t or UChar32 each is the code point.
214:         If the titlecase mapping is missing, then it is the same as the uppercase mapping.
215:
216:         For the digit values, bits 31..16 contain the decimal digit value, and
217:         bits 15..0 contain the digit value. A value of -1 indicates that
218:         this value is missing.
219:
220:         For the numeric/numerator value, an int32_t word contains the value directly,
221:         except for when there is no numerator but a denominator, then the numerator
222:         is implicitly 1. This means:
223:         numerator denominator result
224:         none      none        none
225:         x         none        x
226:         none      y           1/y
227:         x         y           x/y
228:
229:         If the numerator value is 0x7fffff00+x then it is replaced with 10^x.
230:
231:         For the denominator value, a uint32_t word contains the value directly.
232:
233:         For special casing mappings, the 32-bit exception word contains:
234:         31      if set, this character has complex, conditional mappings
235:         that are not stored;
236:         otherwise, the mappings are stored according to the following bits
237:         30..24  number of UChars used for mappings
238:         23..16  reserved
239:         15.. 0  UChar offset from the beginning of the UChars array where the
240:         UChars for the special case mappings are stored in the following format:
241:
242:         Format of special casing UChars:
243:         One UChar value with lengths as follows:
244:         14..10  number of UChars for titlecase mapping
245:         9.. 5  number of UChars for uppercase mapping
246:         4.. 0  number of UChars for lowercase mapping
247:
248:         Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
249:
250:         For case folding mappings, the 32-bit exception word contains:
251:         31..24  number of UChars used for the full mapping
252:         23..16  reserved
253:         15.. 0  UChar offset from the beginning of the UChars array where the
254:         UChars for the special case mappings are stored in the following format:
255:
256:         Format of case folding UChars:
257:         Two UChars contain the simple mapping as follows:
258:         0,  0   no simple mapping
259:         BMP,0   a simple mapping to a BMP code point
260:         s1, s2  a simple mapping to a supplementary code point stored as two surrogates
261:         This is followed by the UChars for the full case folding mappings.
262:
263:         Example:
264:         U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
265:         mapping and a numeric value.
266:         Its exception values would be stored as 3 uint32_t words:
267:
268:         - flags=0x0a (see above) with combining class 0
269:         - lowercase mapping 0x2170
270:         - numeric value=1
271:
272:         --- Additional properties (new in format version 2.1) ---
273:
274:         The second trie for additional properties (AT) is also a UTrie with 16-bit data.
275:         The data words consist of 32-bit unit indexes (not row indexes!) into the
276:         table of unique properties vectors (PV).
277:         Each vector contains a set of properties.
278:         The width of a vector (number of uint32_t per row) may change
279:         with the formatVersion, it is stored in i5.
280:
281:         Current properties: see icu/source/common/uprops.h
282:
283:         --- Changes in format version 3.1 ---
284:
285:         See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
286:
287:         --- Changes in format version 3.2 ---
288:
289:         - The tries use linear Latin-1 ranges.
290:         - The additional properties bits store full properties XYZ instead
291:         of partial Other_XYZ, so that changes in the derivation formulas
292:         need not be tracked in runtime library code.
293:         - Joining Type and Line Break are also stored completely, so that uprops.c
294:         needs no runtime formulas for enumerated properties either.
295:         - Store the case-sensitive flag in the main properties word.
296:         - i10 also contains U_LB_COUNT and U_EA_COUNT.
297:         - i11 contains maxValues2 for vector word 2.
298:
299:         ----------------------------------------------------------------------------- */
300:
301:        final class UCharacterPropertyReader implements  ICUBinary.Authenticate {
302:            // public methods ----------------------------------------------------
303:
304:            public boolean isDataVersionAcceptable(byte version[]) {
305:                return version[0] == DATA_FORMAT_VERSION_[0]
306:                        && version[2] == DATA_FORMAT_VERSION_[2]
307:                        && version[3] == DATA_FORMAT_VERSION_[3];
308:            }
309:
310:            // protected constructor ---------------------------------------------
311:
312:            /**
313:             * <p>Protected constructor.</p>
314:             * @param inputStream ICU uprop.dat file input stream
315:             * @exception IOException throw if data file fails authentication 
316:             * @draft 2.1
317:             */
318:            protected UCharacterPropertyReader(InputStream inputStream)
319:                    throws IOException {
320:                m_unicodeVersion_ = ICUBinary.readHeader(inputStream,
321:                        DATA_FORMAT_ID_, this );
322:                m_dataInputStream_ = new DataInputStream(inputStream);
323:            }
324:
325:            // protected methods -------------------------------------------------
326:
327:            /**
328:             * <p>Reads uprops.icu, parse it into blocks of data to be stored in
329:             * UCharacterProperty.</P
330:             * @param ucharppty UCharacterProperty instance
331:             * @exception thrown when data reading fails
332:             * @draft 2.1
333:             */
334:            protected void read(UCharacterProperty ucharppty)
335:                    throws IOException {
336:                // read the indexes
337:                int count = INDEX_SIZE_;
338:                m_propertyOffset_ = m_dataInputStream_.readInt();
339:                count--;
340:                m_exceptionOffset_ = m_dataInputStream_.readInt();
341:                count--;
342:                m_caseOffset_ = m_dataInputStream_.readInt();
343:                count--;
344:                m_additionalOffset_ = m_dataInputStream_.readInt();
345:                count--;
346:                m_additionalVectorsOffset_ = m_dataInputStream_.readInt();
347:                count--;
348:                m_additionalColumnsCount_ = m_dataInputStream_.readInt();
349:                count--;
350:                m_reservedOffset_ = m_dataInputStream_.readInt();
351:                count--;
352:                m_dataInputStream_.skipBytes(3 << 2);
353:                count -= 3;
354:                ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt();
355:                count--; // 10
356:                ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt();
357:                count--; // 11
358:                m_dataInputStream_.skipBytes(count << 2);
359:
360:                // read the trie index block
361:                // m_props_index_ in terms of ints
362:                ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, ucharppty);
363:
364:                // reads the 32 bit properties block
365:                int size = m_exceptionOffset_ - m_propertyOffset_;
366:                ucharppty.m_property_ = new int[size];
367:                for (int i = 0; i < size; i++) {
368:                    ucharppty.m_property_[i] = m_dataInputStream_.readInt();
369:                }
370:
371:                // reads the 32 bit exceptions block
372:                size = m_caseOffset_ - m_exceptionOffset_;
373:                ucharppty.m_exception_ = new int[size];
374:                for (int i = 0; i < size; i++) {
375:                    ucharppty.m_exception_[i] = m_dataInputStream_.readInt();
376:                }
377:
378:                // reads the 32 bit case block
379:                size = (m_additionalOffset_ - m_caseOffset_) << 1;
380:                ucharppty.m_case_ = new char[size];
381:                for (int i = 0; i < size; i++) {
382:                    ucharppty.m_case_[i] = m_dataInputStream_.readChar();
383:                }
384:
385:                // reads the additional property block
386:                ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_,
387:                        ucharppty);
388:
389:                // additional properties
390:                size = m_reservedOffset_ - m_additionalVectorsOffset_;
391:                ucharppty.m_additionalVectors_ = new int[size];
392:                for (int i = 0; i < size; i++) {
393:                    ucharppty.m_additionalVectors_[i] = m_dataInputStream_
394:                            .readInt();
395:                }
396:
397:                m_dataInputStream_.close();
398:                ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_;
399:                ucharppty.m_unicodeVersion_ = VersionInfo.getInstance(
400:                        (int) m_unicodeVersion_[0], (int) m_unicodeVersion_[1],
401:                        (int) m_unicodeVersion_[2], (int) m_unicodeVersion_[3]);
402:            }
403:
404:            // private variables -------------------------------------------------
405:
406:            /**
407:             * Index size
408:             */
409:            private static final int INDEX_SIZE_ = 16;
410:
411:            /**
412:             * ICU data file input stream
413:             */
414:            private DataInputStream m_dataInputStream_;
415:
416:            /**
417:             * Offset information in the indexes.
418:             */
419:            private int m_propertyOffset_;
420:            private int m_exceptionOffset_;
421:            private int m_caseOffset_;
422:            private int m_additionalOffset_;
423:            private int m_additionalVectorsOffset_;
424:            private int m_additionalColumnsCount_;
425:            private int m_reservedOffset_;
426:            private byte m_unicodeVersion_[];
427:
428:            /**
429:             * File format version that this class understands.
430:             * No guarantees are made if a older version is used
431:             */
432:            private static final byte DATA_FORMAT_ID_[] = { (byte) 0x55,
433:                    (byte) 0x50, (byte) 0x72, (byte) 0x6F };
434:            private static final byte DATA_FORMAT_VERSION_[] = { (byte) 0x3,
435:                    (byte) 0x1, (byte) Trie.INDEX_STAGE_1_SHIFT_,
436:                    (byte) Trie.INDEX_STAGE_2_SHIFT_ };
437:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.