001: /**
002: *******************************************************************************
003: * Copyright (C) 2002-2005, International Business Machines Corporation and *
004: * others. All Rights Reserved. *
005: *******************************************************************************
006: */package com.ibm.icu.dev.tool.layout;
007:
008: import com.ibm.icu.lang.UCharacter;
009: import com.ibm.icu.lang.UScript;
010: import com.ibm.icu.text.UnicodeSet;
011: import com.ibm.icu.text.UTF16;
012:
013: /**
014: * @author Eric Mader
015: *
016: * Notes:
017: *
018: * The property \p{Decomposition_Type=Canonical} will match all characters with a canonical
019: * decomposition.
020: *
021: * So "[[\\p{Latin}\\p{Greek}\\p{Cyrillic}] & [\\p{Decomposition_Type=Canonical}]]"
022: * will match all Latin, Greek and Cyrillic characters with a canonical decomposition.
023: *
024: * Are these three scripts enough? Do we want to collect them all at once and distribute by script,
025: * or process them one script at a time. It's probably a good idea to build a single table for
026: * however many scripts there are.
027: *
028: * It might be better to collect all the characters that have a canonical decomposition and just
029: * sort them into however many scripts there are... unless we'll get characters in COMMON???
030: */
031: public class CanonGSUBBuilder {
032: static public String convertArabicString(int type, int ligature,
033: String decomp, ClassTable isolClassTable) {
034: int leftType = ArabicShaping.VALUE_NONE;
035: int rightType = ArabicShaping.VALUE_NONE;
036:
037: switch (type) {
038: case UCharacter.DecompositionType.ISOLATED:
039: break;
040:
041: case UCharacter.DecompositionType.FINAL:
042: rightType = ArabicShaping.VALUE_LEFT;
043: break;
044:
045: case UCharacter.DecompositionType.INITIAL:
046: leftType = ArabicShaping.VALUE_RIGHT;
047: break;
048:
049: case UCharacter.DecompositionType.MEDIAL:
050: rightType = ArabicShaping.VALUE_LEFT;
051: leftType = ArabicShaping.VALUE_RIGHT;
052: break;
053:
054: default:
055: return decomp + UCharacter.toString(ligature);
056: }
057:
058: char[] chars = decomp.toCharArray();
059:
060: ArabicShaping.shape(chars, leftType, rightType, isolClassTable);
061:
062: return new String(chars) + UCharacter.toString(ligature);
063: }
064:
065: static void buildArabicContextualForms(ArabicCharacterData data,
066: ClassTable initClassTable, ClassTable mediClassTable,
067: ClassTable finaClassTable, ClassTable isolClassTable) {
068: System.out.print("Finding Arabic contextual forms... ");
069:
070: for (int i = 0; i < data.countRecords(); i += 1) {
071: ArabicCharacterData.Record record = data.getRecord(i);
072: String decomposition = record.getDecomposition();
073:
074: if (decomposition != null && decomposition.length() == 1) {
075: int contextual = record.getCodePoint();
076: int isolated = UTF16.charAt(record.getDecomposition(),
077: 0);
078:
079: switch (record.getDecompositionType()) {
080: case UCharacter.DecompositionType.INITIAL:
081: initClassTable.addMapping(isolated, contextual);
082: break;
083:
084: case UCharacter.DecompositionType.MEDIAL:
085: mediClassTable.addMapping(isolated, contextual);
086: break;
087:
088: case UCharacter.DecompositionType.FINAL:
089: finaClassTable.addMapping(isolated, contextual);
090: break;
091:
092: case UCharacter.DecompositionType.ISOLATED:
093: isolClassTable.addMapping(isolated, contextual);
094: break;
095:
096: default:
097: // issue some error message?
098: break;
099: }
100: }
101: }
102:
103: System.out.println("Done.");
104: }
105:
106: static LigatureTree buildArabicLigatureTree(
107: ArabicCharacterData data, ClassTable isolClassTable) {
108: LigatureTree contextualTree = new LigatureTree();
109: int ligatureCount = 0;
110:
111: System.out.print("Building Arabic ligature tree... ");
112:
113: for (int i = 0; i < data.countRecords(); i += 1) {
114: ArabicCharacterData.Record record = data.getRecord(i);
115: String decomposition = record.getDecomposition();
116:
117: if (decomposition != null && decomposition.length() > 1) {
118: int ligature = record.getCodePoint();
119: int decompType = record.getDecompositionType();
120:
121: switch (decompType) {
122: case UCharacter.DecompositionType.FINAL:
123: case UCharacter.DecompositionType.INITIAL:
124: case UCharacter.DecompositionType.MEDIAL:
125: case UCharacter.DecompositionType.ISOLATED:
126: contextualTree.insert(convertArabicString(
127: decompType, ligature, decomposition,
128: isolClassTable));
129: ligatureCount += 1;
130: break;
131:
132: case UCharacter.DecompositionType.CANONICAL:
133: //cannonicalTree.insert(decomposition + UCharacter.toString(ligature));
134: break;
135: }
136: }
137: }
138:
139: System.out.println(ligatureCount + " ligatures.");
140:
141: return contextualTree;
142: }
143:
144: static final int SIMPLE_GLYPH = 1;
145: static final int LIGATURE_GLYPH = 2;
146: static final int MARK_GLYPH = 3;
147: static final int COMPONENT_GLYPH = 4;
148:
149: static final int categoryClassMap[] = { 0, // UNASSIGNED
150: SIMPLE_GLYPH, // UPPERCASE_LETTER
151: SIMPLE_GLYPH, // LOWERCASE_LETTER
152: SIMPLE_GLYPH, // TITLECASE_LETTER
153: SIMPLE_GLYPH, // MODIFIER_LETTER
154: SIMPLE_GLYPH, // OTHER_LETTER
155: MARK_GLYPH, // NON_SPACING_MARK
156: MARK_GLYPH, // ENCLOSING_MARK ??
157: MARK_GLYPH, // COMBINING_SPACING_MARK ??
158: SIMPLE_GLYPH, // DECIMAL_NUMBER
159: SIMPLE_GLYPH, // LETTER_NUMBER
160: SIMPLE_GLYPH, // OTHER_NUMBER;
161: 0, // SPACE_SEPARATOR
162: 0, // LINE_SEPARATOR
163: 0, // PARAGRAPH_SEPARATOR
164: 0, // CONTROL
165: 0, // FORMAT
166: 0, // PRIVATE_USE
167: 0, // SURROGATE
168: SIMPLE_GLYPH, // DASH_PUNCTUATION
169: SIMPLE_GLYPH, // START_PUNCTUATION
170: SIMPLE_GLYPH, // END_PUNCTUATION
171: SIMPLE_GLYPH, // CONNECTOR_PUNCTUATION
172: SIMPLE_GLYPH, // OTHER_PUNCTUATION
173: SIMPLE_GLYPH, // MATH_SYMBOL;
174: SIMPLE_GLYPH, // CURRENCY_SYMBOL
175: SIMPLE_GLYPH, // MODIFIER_SYMBOL
176: SIMPLE_GLYPH, // OTHER_SYMBOL
177: SIMPLE_GLYPH, // INITIAL_PUNCTUATION
178: SIMPLE_GLYPH // FINAL_PUNCTUATION
179: };
180:
181: static int getGlyphClass(ArabicCharacterData.Record record) {
182: String decomp = record.getDecomposition();
183:
184: if (decomp != null && decomp.length() > 1) {
185: return LIGATURE_GLYPH;
186: }
187:
188: return categoryClassMap[record.getGeneralCategory()];
189: }
190:
191: static void addArabicGlyphClasses(ArabicCharacterData data,
192: ClassTable classTable) {
193: System.out.print("Adding Arabic glyph classes... ");
194:
195: for (int i = 0; i < data.countRecords(); i += 1) {
196: ArabicCharacterData.Record record = data.getRecord(i);
197: classTable.addMapping(record.getCodePoint(),
198: getGlyphClass(record));
199: }
200:
201: System.out.println("Done.");
202: }
203:
204: private static void buildArabicTables(ScriptList scriptList,
205: FeatureList featureList, LookupList lookupList,
206: ClassTable classTable) {
207: // TODO: Might want to have the ligature table builder explicitly check for ligatures
208: // which start with space and tatweel rather than pulling them out here...
209: UnicodeSet arabicBlock = new UnicodeSet(
210: "[[\\p{block=Arabic}] & [[:Cf:][:Po:][:So:][:Mn:][:Nd:][:Lm:]]]");
211: UnicodeSet oddLigatures = new UnicodeSet(
212: "[\\uFC5E-\\uFC63\\uFCF2-\\uFCF4\\uFE70-\\uFE7F]");
213: UnicodeSet arabicLetters = new UnicodeSet("[\\p{Arabic}]");
214: ArabicCharacterData arabicData = ArabicCharacterData
215: .factory(arabicLetters.addAll(arabicBlock).removeAll(
216: oddLigatures));
217:
218: addArabicGlyphClasses(arabicData, classTable);
219:
220: ClassTable initClassTable = new ClassTable();
221: ClassTable mediClassTable = new ClassTable();
222: ClassTable finaClassTable = new ClassTable();
223: ClassTable isolClassTable = new ClassTable();
224:
225: buildArabicContextualForms(arabicData, initClassTable,
226: mediClassTable, finaClassTable, isolClassTable);
227: isolClassTable.snapshot();
228: LigatureTree ligaTree = buildArabicLigatureTree(arabicData,
229: isolClassTable);
230:
231: LigatureTreeWalker ligaWalker = new LigatureTreeWalker();
232:
233: ligaTree.walk(ligaWalker);
234:
235: Lookup initLookup, mediLookup, finaLookup, ligaLookup;
236:
237: initLookup = new Lookup(Lookup.GSST_Single, 0);
238: initLookup.addSubtable(initClassTable);
239:
240: mediLookup = new Lookup(Lookup.GSST_Single, 0);
241: mediLookup.addSubtable(mediClassTable);
242:
243: finaLookup = new Lookup(Lookup.GSST_Single, 0);
244: finaLookup.addSubtable(finaClassTable);
245:
246: ligaLookup = new Lookup(Lookup.GSST_Ligature,
247: Lookup.LF_IgnoreMarks);
248: ligaLookup.addSubtable(ligaWalker);
249:
250: Feature init = new Feature("init");
251: Feature medi = new Feature("medi");
252: Feature fina = new Feature("fina");
253: Feature liga = new Feature("liga");
254:
255: init.addLookup(lookupList.addLookup(initLookup));
256: medi.addLookup(lookupList.addLookup(mediLookup));
257: fina.addLookup(lookupList.addLookup(finaLookup));
258: liga.addLookup(lookupList.addLookup(ligaLookup));
259:
260: featureList.addFeature(init);
261: featureList.addFeature(medi);
262: featureList.addFeature(fina);
263: featureList.addFeature(liga);
264:
265: scriptList.addFeature("arab", "(default)", init);
266: scriptList.addFeature("arab", "(default)", medi);
267: scriptList.addFeature("arab", "(default)", fina);
268: scriptList.addFeature("arab", "(default)", liga);
269:
270: System.out.println();
271: }
272:
273: public static void buildLigatureTree(CanonicalCharacterData data,
274: int script, LigatureTree ligatureTree) {
275: int ligatureCount = 0;
276:
277: System.out.print("building composition ligature tree for "
278: + UScript.getName(script) + "... ");
279:
280: for (int i = 0; i < data.countRecords(script); i += 1) {
281: CanonicalCharacterData.Record record = data.getRecord(
282: script, i);
283: String composed = UCharacter.toString(record
284: .getComposedCharacter());
285:
286: for (int e = 0; e < record.countEquivalents(); e += 1) {
287: String equivalent = record.getEquivalent(e);
288:
289: ligatureTree.insert(equivalent + composed);
290: ligatureCount += 1;
291: }
292: }
293:
294: System.out.println(ligatureCount + " ligatures.");
295: }
296:
297: public static DecompTable[] buildDecompTables(
298: CanonicalCharacterData data, int script) {
299: int maxDecompCount = data.getMaxEquivalents(script);
300: DecompTable[] decompTables = new DecompTable[maxDecompCount];
301:
302: System.out.print("Building decompositon tables for "
303: + UScript.getName(script)
304: + "... total decompositions: "
305: + data.countRecords(script) + ", max: "
306: + maxDecompCount + "...");
307:
308: for (int i = 0; i < maxDecompCount; i += 1) {
309: DecompTable table = new DecompTable();
310:
311: for (int r = 0; r < data.countRecords(script); r += 1) {
312: CanonicalCharacterData.Record record = data.getRecord(
313: script, r);
314:
315: if (record.countEquivalents() > i) {
316: table.add(record.getComposedCharacter(), record
317: .getEquivalent(i));
318: }
319: }
320:
321: decompTables[i] = table;
322: }
323:
324: System.out.println(" Done.");
325:
326: return decompTables;
327: }
328:
329: public static int[] buildLookups(CanonicalCharacterData data,
330: LookupList lookupList, int script) {
331: int[] lookups = new int[2];
332:
333: DecompTable[] decompTables = buildDecompTables(data, script);
334:
335: LigatureTree compTree = new LigatureTree();
336:
337: buildLigatureTree(data, script, compTree);
338:
339: System.out.println();
340:
341: LigatureTreeWalker compWalker = new LigatureTreeWalker();
342:
343: compTree.walk(compWalker);
344:
345: Lookup compLookup, dcmpLookup;
346: //int compLookupIndex, dcmpLookupIndex;
347:
348: compLookup = new Lookup(Lookup.GSST_Ligature, 0);
349: compLookup.addSubtable(compWalker);
350:
351: dcmpLookup = new Lookup(Lookup.GSST_Multiple, 0);
352: for (int i = 0; i < decompTables.length; i += 1) {
353: dcmpLookup.addSubtable(decompTables[i]);
354: }
355:
356: lookups[0] = lookupList.addLookup(compLookup);
357: lookups[1] = lookupList.addLookup(dcmpLookup);
358:
359: return lookups;
360: }
361:
362: public static void addLookups(Feature feature, int[] lookups) {
363: for (int i = 0; i < lookups.length; i += 1) {
364: feature.addLookup(lookups[i]);
365: }
366: }
367:
368: /*
369: * Hebrew mark order taken from the SBL Hebrew Font manual
370: * Arabic mark order per Thomas Milo: hamza < shadda < combining_alef < sukun, vowel_marks < madda < qur'anic_marks
371: */
372: public static ClassTable buildCombiningClassTable() {
373: UnicodeSet markSet = new UnicodeSet(
374: "[\\P{CanonicalCombiningClass=0}]");
375: ClassTable exceptions = new ClassTable();
376: ClassTable combiningClasses = new ClassTable();
377: int markCount = markSet.size();
378:
379: exceptions.addMapping(0x05C1, 10); // Point Shin Dot
380: exceptions.addMapping(0x05C2, 11); // Point Sin Dot
381: exceptions.addMapping(0x05BC, 21); // Point Dagesh or Mapiq
382: exceptions.addMapping(0x05BF, 23); // Point Rafe
383: exceptions.addMapping(0x05B9, 27); // Point Holam
384: exceptions.addMapping(0x0323, 220); // Comb. Dot Below (low punctum)
385: exceptions.addMapping(0x0591, 220); // Accent Etnahta
386: exceptions.addMapping(0x0596, 220); // Accent Tipeha
387: exceptions.addMapping(0x059B, 220); // Accent Tevir
388: exceptions.addMapping(0x05A3, 220); // Accent Munah
389: exceptions.addMapping(0x05A4, 220); // Accent Mahapakh
390: exceptions.addMapping(0x05A5, 220); // Accent Merkha
391: exceptions.addMapping(0x05A6, 220); // Accent Merkha Kefula
392: exceptions.addMapping(0x05A7, 220); // Accent Darga
393: exceptions.addMapping(0x05AA, 220); // Accent Yerah Ben Yomo
394: exceptions.addMapping(0x05B0, 220); // Point Sheva
395: exceptions.addMapping(0x05B1, 220); // Point Hataf Segol
396: exceptions.addMapping(0x05B2, 220); // Point Hataf Patah
397: exceptions.addMapping(0x05B3, 220); // Point Hataf Qamats
398: exceptions.addMapping(0x05B4, 220); // Point Hiriq
399: exceptions.addMapping(0x05B5, 220); // Point Tsere
400: exceptions.addMapping(0x05B6, 220); // Point Segol
401: exceptions.addMapping(0x05B7, 220); // Point Patah
402: exceptions.addMapping(0x05B8, 220); // Point Qamats
403: exceptions.addMapping(0x05BB, 220); // Point Qubuts
404: exceptions.addMapping(0x05BD, 220); // Point Meteg
405: exceptions.addMapping(0x059A, 222); // Accent Yetiv
406: exceptions.addMapping(0x05AD, 222); // Accent Dehi
407: exceptions.addMapping(0x05C4, 230); // Mark Upper Dot (high punctum)
408: exceptions.addMapping(0x0593, 230); // Accent Shalshelet
409: exceptions.addMapping(0x0594, 230); // Accent Zaqef Qatan
410: exceptions.addMapping(0x0595, 230); // Accent Zaqef Gadol
411: exceptions.addMapping(0x0597, 230); // Accent Revia
412: exceptions.addMapping(0x0598, 230); // Accent Zarqa
413: exceptions.addMapping(0x059F, 230); // Accent Qarney Para
414: exceptions.addMapping(0x059E, 230); // Accent Gershayim
415: exceptions.addMapping(0x059D, 230); // Accent Geresh Muqdam
416: exceptions.addMapping(0x059C, 230); // Accent Geresh
417: exceptions.addMapping(0x0592, 230); // Accent Segolta
418: exceptions.addMapping(0x05A0, 230); // Accent Telisha Gedola
419: exceptions.addMapping(0x05AC, 230); // Accent Iluy
420: exceptions.addMapping(0x05A8, 230); // Accent Qadma
421: exceptions.addMapping(0x05AB, 230); // Accent Ole
422: exceptions.addMapping(0x05AF, 230); // Mark Masora Circle
423: exceptions.addMapping(0x05A1, 230); // Accent Pazer
424: //exceptions.addMapping(0x0307, 230); // Mark Number/Masora Dot
425: exceptions.addMapping(0x05AE, 232); // Accent Zinor
426: exceptions.addMapping(0x05A9, 232); // Accent Telisha Qetana
427: exceptions.addMapping(0x0599, 232); // Accent Pashta
428:
429: exceptions.addMapping(0x0655, 27); // ARABIC HAMZA BELOW
430: exceptions.addMapping(0x0654, 27); // ARABIC HAMZA ABOVE
431:
432: exceptions.addMapping(0x0651, 28); // ARABIC SHADDA
433:
434: exceptions.addMapping(0x0656, 29); // ARABIC SUBSCRIPT ALEF
435: exceptions.addMapping(0x0670, 29); // ARABIC LETTER SUPERSCRIPT ALEF
436:
437: exceptions.addMapping(0x064D, 30); // ARABIC KASRATAN
438: exceptions.addMapping(0x0650, 30); // ARABIC KASRA
439:
440: exceptions.addMapping(0x0652, 31); // ARABIC SUKUN
441: exceptions.addMapping(0x06E1, 31); // ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
442:
443: exceptions.addMapping(0x064B, 31); // ARABIC FATHATAN
444: exceptions.addMapping(0x064C, 31); // ARABIC DAMMATAN
445: exceptions.addMapping(0x064E, 31); // ARABIC FATHA
446: exceptions.addMapping(0x064F, 31); // ARABIC DAMMA
447: exceptions.addMapping(0x0657, 31); // ARABIC INVERTED DAMMA
448: exceptions.addMapping(0x0658, 31); // ARABIC MARK NOON GHUNNA
449:
450: exceptions.addMapping(0x0653, 32); // ARABIC MADDAH ABOVE
451:
452: exceptions.snapshot();
453:
454: for (int i = 0; i < markCount; i += 1) {
455: int mark = markSet.charAt(i);
456: int markClass = exceptions.getGlyphClassID(mark);
457:
458: if (markClass == 0) {
459: markClass = UCharacter.getCombiningClass(mark);
460: }
461:
462: combiningClasses.addMapping(mark, markClass);
463: }
464:
465: combiningClasses.snapshot();
466: return combiningClasses;
467: }
468:
469: public static void buildDecompTables(String fileName) {
470: // F900 - FAFF are compatibility ideographs. They all decompose to a single other character, and can be ignored.
471: UnicodeSet decompSet = new UnicodeSet(
472: "[[[\\P{Hangul}] & [\\p{DecompositionType=Canonical}]] - [\uF900-\uFAFF]]");
473: CanonicalCharacterData data = CanonicalCharacterData
474: .factory(decompSet);
475: ClassTable classTable = new ClassTable();
476:
477: LookupList lookupList = new LookupList();
478: FeatureList featureList = new FeatureList();
479: ScriptList scriptList = new ScriptList();
480:
481: // build common, inherited lookups...
482: // int[] commonLookups = buildLookups(data, lookupList, UScript.COMMON);
483: // int[] inheritedLookups = buildLookups(data, lookupList, UScript.INHERITED);
484:
485: for (int script = 0; script < UScript.CODE_LIMIT; script += 1) {
486:
487: // This is a bit lame, but it's the only way I can think of
488: // to make this work w/o knowing the values of COMMON and INHERITED...
489: if (script == UScript.COMMON || script == UScript.INHERITED
490: || data.getMaxEquivalents(script) == 0) {
491: continue;
492: }
493:
494: int[] lookups = buildLookups(data, lookupList, script);
495:
496: Feature ccmp = new Feature("ccmp");
497:
498: addLookups(ccmp, lookups);
499: // addLookups(ccmp, commonLookups);
500: // addLookups(ccmp, inheritedLookups);
501:
502: featureList.addFeature(ccmp);
503:
504: String scriptTag = TagUtilities.tagLabel(UScript
505: .getShortName(script));
506:
507: scriptList.addFeature(scriptTag, "(default)", ccmp);
508:
509: if (script == UScript.ARABIC) {
510: buildArabicTables(scriptList, featureList, lookupList,
511: classTable);
512: }
513: }
514:
515: featureList.finalizeFeatureList();
516:
517: ClassTable markClassTable = buildCombiningClassTable();
518:
519: GSUBWriter gsubWriter = new GSUBWriter("Canon", scriptList,
520: featureList, lookupList);
521: GDEFWriter gdefWriter = new GDEFWriter("Canon", classTable,
522: markClassTable);
523: String[] includeFiles = { "LETypes.h", "CanonShaping.h" };
524:
525: LigatureModuleWriter writer = new LigatureModuleWriter();
526:
527: writer.openFile(fileName);
528: writer.writeHeader(null, includeFiles);
529: writer.writeTable(gsubWriter);
530: writer.writeTable(gdefWriter);
531: writer.writeTrailer();
532: writer.closeFile();
533: }
534:
535: public static void main(String[] args) {
536: buildDecompTables(args[0]);
537: }
538: }
|