001: package snow.sortabletable;
002:
003: import snow.utils.gui.*;
004: import snow.texteditor.EditDistance;
005: import java.text.Collator;
006: import java.awt.EventQueue;
007: import java.awt.event.ActionEvent;
008: import java.awt.event.ActionListener;
009: import javax.swing.border.EmptyBorder;
010: import java.awt.BorderLayout;
011: import javax.swing.*;
012: import java.util.regex.*;
013: import java.io.*;
014: import java.util.*;
015:
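/**
 * Swing frame for exploring a first-name dictionary file.
 * <p>
 * The file is read in three parts: a table of special-character codes, a
 * list of countries, and the name rows themselves. The user enters a search
 * text, picks a match type ("exact equals", "approx equals", "contains" or
 * "tolerant approx") and optionally a country; matching names are listed in
 * a sortable table together with the countries where they are frequent,
 * medium or rare.
 */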
018: public final class NamesExplorer extends JFrame {
019: Map<Integer, String> countries = new HashMap<Integer, String>();
020: Map<String, Integer> chars = new HashMap<String, Integer>();
021: List<Name> names = new ArrayList<Name>();
022: JTextField searchField = new JTextField(12);
023: final Pattern pc = Pattern.compile("<.*?>"); // must be reluctant "?"
024: final Matcher mc = pc.matcher("");
025: TM tm = new TM();
026:
027: JTextField unknownChars = new JTextField(20);
028: Collator collator = Collator.getInstance(Locale.GERMAN);
029: List<Integer> countriesIndices = new ArrayList<Integer>();
030: Vector<String> countriesNames = new Vector<String>();
031:
032: JComboBox type = new JComboBox(new String[] { "exact equals",
033: "approx equals", "contains", "tolerant approx" });
034: JComboBox countriesCB;
035:
036: boolean debug = false;
037:
038: File f = null;
039:
040: public NamesExplorer(File f) throws Exception {
041: super ("Names explorer");
042:
043: this .f = f;
044:
045: setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
046: setSize(800, 600);
047:
048: JPanel np = new JPanel(new BorderLayout());
049:
050: JPanel sp = new JPanel();
051: sp.setBorder(new EmptyBorder(5, 5, 5, 5));
052: np.add(sp, BorderLayout.CENTER);
053: GridLayout3 settingsLayout = new GridLayout3(2, sp);
054: /*settingsLayout.addExplanationArea(
055: "Results hit types: 0: exact, 1: contains, 2: collated, 3: approx ");*/
056:
057: settingsLayout.add("Search:");
058:
059: settingsLayout.add(searchField);
060:
061: settingsLayout.add("type:");
062: settingsLayout.add(type);
063: type.setSelectedIndex(1);
064:
065: if (debug) {
066: settingsLayout.add("unknown chars:");
067: settingsLayout.add(unknownChars);
068: }
069:
070: try {
071: search(-1, 0, "", null, true);
072: } catch (Exception e) {
073: JOptionPane.showMessageDialog(null, "Error: "
074: + e.getMessage(), "Error",
075: JOptionPane.ERROR_MESSAGE);
076: }
077:
078: countriesIndices.addAll(countries.keySet());
079: Collections.sort(countriesIndices);
080:
081: for (int cii : countriesIndices) {
082: countriesNames.add(countries.get(cii));
083: }
084:
085: countriesIndices.add(0, -1); // all
086: countriesNames.add(0, "All");
087:
088: countriesCB = new JComboBox(countriesNames);
089: countriesCB.setMaximumRowCount(30);
090: settingsLayout.add("Country:");
091: settingsLayout.add(countriesCB);
092:
093: settingsLayout.addSeparator();
094:
095: SortableTableModel stm = new SortableTableModel(tm);
096: JTable table = new JTable(stm);
097: stm.installGUI(table);
098: add(new JScrollPane(table), BorderLayout.CENTER);
099: MultiSearchPanel msp = new MultiSearchPanel("Filter: ", null,
100: stm);
101: add(np, BorderLayout.NORTH);
102: np.add(msp, BorderLayout.SOUTH);
103: setLocationRelativeTo(null);
104: setVisible(true);
105:
106: ActionListener sac = new ActionListener() {
107: public void actionPerformed(ActionEvent ae) {
108: tm.clear();
109: final ProgressModalDialog pmd = new ProgressModalDialog(
110: NamesExplorer.this , "Search progress", false);
111: pmd.setProgressBounds(45000); // approx.
112: pmd.setProgressValue(0, "");
113: pmd.start();
114: Thread t = new Thread() {
115: public void run() {
116: try {
117: int c = countriesIndices.get(countriesCB
118: .getSelectedIndex());
119: search(c, 0, searchField.getText(), pmd,
120: false);
121: } catch (Exception e) {
122: JOptionPane.showMessageDialog(null,
123: "Error: " + e.getMessage(),
124: "Error", JOptionPane.ERROR_MESSAGE);
125: } finally {
126: pmd.closeDialog();
127: EventQueue.invokeLater(new Runnable() {
128: public void run() {
129: tm.update();
130: }
131: });
132:
133: if (unknown.length() > 0) {
134: unknownChars.setText("" + unknown);
135: for (char ci : unknown.toString()
136: .toCharArray()) {
137: int co = (int) ci;
138: System.out.print("\\u0"
139: + Integer.toString(co, 16)
140: + "");
141: }
142:
143: }
144: }
145: }
146: };
147: t.start();
148: }
149: };
150:
151: searchField.addActionListener(sac);
152:
153: EventQueue.invokeLater(new Runnable() {
154: public void run() {
155: searchField.requestFocus();
156: }
157: });
158:
159: }
160:
161: void search(int countryFilter, int occFilter, String nameFilter,
162: ProgressModalDialog pmd, boolean onlyReadCountries)
163: throws Exception {
164: collator.setStrength(Collator.PRIMARY);
165: BufferedReader fr = new BufferedReader(new FileReader(f));
166: // preparation
167: String line;
168:
169: while ((line = fr.readLine()) != null) {
170: if (line.indexOf("Non-iso") > 0)
171: break;
172: }
173:
174: // chars
175: while ((line = fr.readLine()) != null) {
176: line = line.substring(1, line.length() - 1); // ignore # and $
177: String tr = line.trim();
178: if (tr.length() == 0)
179: break;
180: int pos = tr.indexOf('=');
181: if (pos < 0)
182: break;
183:
184: int code = Integer.parseInt(tr.substring(0, pos).trim());
185: String charsc = tr.substring(pos + 1).trim();
186:
187: if (charsc.indexOf(" or ") > 0) {
188: //
189: for (String ci : charsc.split("\\sor\\s")) {
190: //System.out.println(""+ci);
191: chars.put(ci, code);
192: }
193: } else {
194: chars.put(charsc, code);
195: }
196:
197: //System.out.println("code="+tr.substring(0,pos).trim());
198: //System.out.println(" char="+tr.substring(pos+1).trim());
199: }
200:
201: //System.out.println(""+chars);
202:
203: while ((line = fr.readLine()) != null) {
204: if (line.indexOf("list of countries") > 0)
205: break;
206: }
207:
208: // read the countries
209: String previous = null;
210: while ((line = fr.readLine()) != null) {
211: line = line.substring(1, line.length() - 1); // ignore # and $
212: String tr = line.trim();
213: if (tr.startsWith("|")) {
214: countries.put(line.indexOf('|') - 28, previous);
215: } else {
216: previous = tr;
217: }
218:
219: if (line.indexOf("begin of name list") >= 00)
220: break;
221:
222: //System.out.println(""+line);
223: }
224:
225: //System.out.println(""+countries.size()+" countries:\n"+countries);
226: if (onlyReadCountries)
227: return;
228:
229: // and now the names.
230: int max = 100000;
231: int read = 0;
232: int lines = 0;
233: List<String> equivalences = new ArrayList<String>();
234:
235: String soundSearch = null; // approx
236: if (nameFilter != null) {
237:
238: if (type.getSelectedIndex() == 3) // approx
239: {
240: soundSearch = nameFilter.toUpperCase();
241: } else if (type.getSelectedIndex() != 0) // not exact
242: {
243: nameFilter = normalizeCH(nameFilter);
244: }
245: }
246:
247: long lastUpd = -1;
248:
249: nl: while ((line = fr.readLine()) != null) {
250:
251: if (pmd.getWasCancelled())
252: throw new Exception("cancelled");
253: pmd.incrementProgress(1);
254:
255: if (line.length() == 0)
256: continue;
257: if (line.charAt(0) == '#')
258: continue; // comment
259: lines++;
260:
261: boolean isEqu = line.charAt(0) == '=';
262:
263: if (isEqu) // name format: "A B" means A is a variant for B
264: {
265: // treat later...
266: equivalences.add(line);
267: // TODO.
268: continue nl;
269: }
270:
271: //System.out.println(""+line.substring(29));
272:
273: //if(lines==10) return;
274:
275: if (line.charAt(29) == '+')
276: continue; // ignore since just a sort order change (copy)
277:
278: String gend = line.substring(0, 3).trim();
279: String nam = line.substring(3);
280:
281: String name = nam.substring(0, 26).trim();
282: name = decode(name);
283:
284: int hitQuality = 3; // 0: exact, 1: contains, 2: collated, 3: approx
285: if (nameFilter != null) {
286: if (soundSearch != null) {
287: if (EditDistance.editexDistanceEnglish(soundSearch,
288: name, 2) > 1)
289: continue;
290: hitQuality = 3;
291: } else {
292: if (type.getSelectedIndex() == 0) // exact
293: {
294: if (!name.equals(nameFilter))
295: continue;
296: hitQuality = 0;
297: } else {
298: String sn = normalizeCH(name);
299:
300: if (type.getSelectedIndex() == 1) // approx equals ( ign case and accents but NOT ~contains)
301: {
302: if (!collator.equals(sn, nameFilter))
303: continue;
304: hitQuality = 2;
305: }
306:
307: if (type.getSelectedIndex() == 2) // contains
308: {
309: if (!sn.contains(nameFilter))
310: continue;
311: hitQuality = 1;
312: }
313:
314: if (sn.equals(nameFilter))
315: hitQuality = 0;
316: else if (sn.contains(nameFilter))
317: hitQuality = 1;
318: else if (collator.equals(sn, nameFilter))
319: hitQuality = 2;
320: else {
321: continue;
322: }
323: }
324: }
325: }
326:
327: String occs = nam.substring(27, nam.length() - 1); // without $
328:
329: Map<Integer, Integer> couOc = new HashMap<Integer, Integer>();
330:
331: int maxOcc = -1;
332:
333: for (int i = 0; i < occs.length(); i++) {
334: char ci = occs.charAt(i);
335: if (Character.isSpaceChar(ci))
336: continue;
337: int oci = 0;
338: if (Character.isDigit(ci))
339: oci = Character.getNumericValue(ci);
340: else if (ci == 'A')
341: oci = 10;
342: else if (ci == 'B')
343: oci = 11;
344: else if (ci == 'C')
345: oci = 12;
346: else if (ci == 'D')
347: oci = 13;
348: else if (ci == 'E')
349: oci = 14;
350:
351: couOc.put(i + 1, oci);
352: if (oci > maxOcc)
353: maxOcc = oci;
354: //System.out.println(" "+oci+": "+countries.get(i+1));
355: }
356:
357: if (occFilter != 0) {
358: if (occFilter > 0) {
359: if (maxOcc < occFilter)
360: continue nl;
361: } else if (maxOcc > -occFilter)
362: continue nl;
363: }
364:
365: if (countryFilter >= 0 && !couOc.containsKey(countryFilter))
366: continue nl;
367: //System.out.println(""+couOc);
368:
369: Name ni = new Name(name, gend, couOc, hitQuality);
370: names.add(ni);
371:
372: if (System.currentTimeMillis() - lastUpd > 2000) {
373: EventQueue.invokeLater(new Runnable() {
374: public void run() {
375: tm.update();
376: }
377: });
378: //pmd.set
379: lastUpd = System.currentTimeMillis();
380: }
381: //System.out.println(""+ni);
382:
383: //System.out.println(""+occs);
384: //line = line.substring(0, line.length()-1); // ignore # and $
385: read++;
386: if (read > max) {
387: System.out.println("To much hits: breaking");
388: break;
389: }
390:
391: //System.out.println(""+line);
392: }
393:
394: System.out.println("" + names.size() + " names, " + lines
395: + " lines, " + equivalences.size() + " equival");
396:
397: }
398:
399: // special chars
400: String decode(String s) {
401: if (s.indexOf('<') < 0)
402: return s;
403:
404: mc.reset(s);
405: int start = 0;
406: StringBuffer ret = new StringBuffer();
407: while (mc.find()) // NEVER SET start, this reinitialises the replace...
408: {
409: int st = mc.start();
410: int en = mc.end();
411:
412: String code = s.substring(st, en);
413:
414: //System.out.println(""+code);
415: String ci = new String(Character.toChars(chars.get(code)));
416: //System.out.println(""+code+" :: "+ci);
417:
418: mc.appendReplacement(ret, ci);
419:
420: start = en;
421: }
422: mc.appendTail(ret);
423: return ret.toString();
424: }
425:
426: class Name {
427: int hitQuality = 0; // 0: exact, 1: contains, 2: collated, 3: approx
428: String name;
429: int max;
430: String gender;
431: int[][] occs; // {{n1, c1}, ...}
432:
433: String freqLow = ""; // 0,1,2,3
434: String freqMed = ""; // 4,5,6,7
435: String freqHigh = ""; // 8..D
436:
437: public Name(String name, String gender,
438: Map<Integer, Integer> couOc, int hitQuality) {
439: this .hitQuality = hitQuality;
440: this .name = name;
441: this .gender = gender;
442:
443: if (this .name.indexOf(' ') > 0) {
444: // ignore equivalence name "A B"
445: this .name = name.substring(0, name.indexOf(' '));
446: }
447:
448: occs = new int[couOc.size()][2];
449: int i = 0;
450: for (int ci : couOc.keySet()) // country
451: {
452: int ni = couOc.get(ci); // frequ (log-normed)
453: max = Math.max(ni, max);
454: occs[i][0] = ni;
455: occs[i][1] = ci;
456: i++;
457:
458: if (ni > 7) {
459: freqHigh += ", " + countries.get(ci);
460: } else if (ni > 3) {
461: freqMed += ", " + countries.get(ci);
462: } else {
463: freqLow += ", " + countries.get(ci);
464: }
465: }
466:
467: if (freqHigh.startsWith(", "))
468: freqHigh = freqHigh.substring(2);
469: if (freqMed.startsWith(", "))
470: freqMed = freqMed.substring(2);
471: if (freqLow.startsWith(", "))
472: freqLow = freqLow.substring(2);
473: }
474:
475: @Override
476: public final String toString() {
477: StringBuilder sb = new StringBuilder();
478: sb.append(name + " " + gender + " " + max);
479: if (freqHigh.length() > 0)
480: sb.append(": " + freqHigh);
481: return sb.toString();
482: }
483: }
484:
485: // quick De and Fr feeling
486: // Use only for compare and search !
487: public static String normalizeCH(String name) {
488: //name = removeAccents(name);
489: name = name.toUpperCase(Locale.ENGLISH); // same as DE for us
490:
491: name = name.replace("OE", "E"); // or Ö => OE ???
492: name = name.replace("OU", "U");
493: name = name.replace("OI", "I");
494: name = name.replace("OA", "A");
495:
496: //name = removeAccents(name);
497:
498: name = name.replace("SCH", "1");
499: name = name.replace("CH", "K"); // Achermann
500: name = name.replace("AU", "O");
501: //name = name.replace("NN", "N");
502: name = name.replace("PH", "F");
503: //name = name.replace("SS", "S");
504: name = name.replace("Y", "I");
505: name = name.replace("Z", "S");
506: name = name.replace("B", "P");
507: name = name.replace("J", "I");
508: name = name.replace("G", "I");
509: name = name.replace("W", "V");
510: name = name.replace("X", "K");
511: name = name.replace("Q", "K");
512: name = name.replace("C", "K");
513: name = name.replace("D", "T");
514: //name = name.replace("B", "P");
515:
516: name = name.replace("H", "");
517: //name = name.replace("H", "");
518:
519: return removeDoubleLetters(name);
520: }
521:
522: public static String removeDoubleLetters(String s) {
523: if (s.length() == 0)
524: return s;
525: char prev = s.charAt(s.length() - 1);
526: StringBuilder sb = null;
527: for (int i = s.length() - 2; i >= 0; i--) {
528: if (s.charAt(i) == prev) {
529: if (sb == null)
530: sb = new StringBuilder(s);
531: sb.replace(i, i + 1, "");
532: }
533:
534: prev = s.charAt(i);
535: }
536: if (sb != null)
537: return sb.toString();
538: return s;
539: }
540:
541: static String acc = "éèêëóòôöúùûüíìîïáàâäãñõÿýÉÈÊËÓÒÔÖÚÙÛÜÍÌÎÏÁÀÂÄÃÑÕÐÆŠÅŽÝçÇØÞðšøæåß"
542: + "\u0117\u0100\u0101\u015f\u0131\u012b\u0113\u017e\u013c\u011b"
543: + "\u016b\u0163\u0111\u0159\u0146\u011f\u00fe\u0142\u0107\u010d"
544: + "\u017c\u0103\u0106\u010c\u0110\u0151\u0105\u0112\u0122\u0130"
545: + "\u013e\u0137\u0123\u0119\u013b\u013d\u0141\u010f\u0145\u016f"
546: + "\u015e\u0165\u017b\u0148"
547: + "\u0116\u012a\u011a\u016a\u0162\u0158\u011e\u0102\u0150\u0104\u0136\u0118\u010e\u016e\u0164\u0147";
548:
549: static String deacc = "eeeeoooouuuuiiiiaaaaanoyyEEEEOOOOUUUUIIIIAAAAANODÆSAZYcCOposoaas"
550: + "eAasllezle"
551: + "utdrngplcc"
552: + "zaCCDoaEGI"
553: + "lkgeLLLdNu"
554: + "StZn" + "EIEUTRGAOAKEDUTN";
555:
556: static StringBuilder unknown = new StringBuilder();
557:
558: // deep (slow)
559: // Use only for compare and search !
560: public static String removeAccents(String str) {
561: boolean has = false;
562: for (char ci : str.toCharArray()) {
563: if (ci > 'z') {
564: has = true;
565: break;
566: }
567: }
568: if (!has)
569: return str;
570: StringBuilder sb = new StringBuilder(str.length());
571: for (char ci : str.toCharArray()) {
572: if (ci > 'z') {
573: int pos = acc.indexOf(ci);
574: if (pos < 0) {
575: System.out.println("not found: " + ((int) ci));
576: sb.append(ci);
577: if (unknown.indexOf("" + ci) < 0) {
578: unknown.append(ci);
579: }
580: } else {
581: sb.append(deacc.charAt(pos));
582: }
583: } else {
584: sb.append(ci);
585: }
586: }
587:
588: return sb.toString();
589: }
590:
591: public static void main(String[] args) throws Exception {
592: new NamesExplorer(new File("c:/projects/nam_dict.txt")); //, -1, 0, "step");
593: }
594:
595: class TM extends FineGrainTableModel {
596: public Object getValueAt(int row, int col) {
597: Name ni = names.get(row);
598: if (col == 0)
599: return ni.hitQuality;
600: if (col == 1)
601: return ni.gender;
602: if (col == 2)
603: return ni.name;
604: if (col == 3)
605: return ni.max;
606: if (col == 4)
607: return ni.freqHigh;
608: if (col == 5)
609: return ni.freqMed;
610: if (col == 6)
611: return ni.freqLow;
612: return "??";
613: }
614:
615: @Override
616: public String getColumnName(int col) {
617: if (col == 0)
618: return "Hit quality";
619: if (col == 1)
620: return "Gender";
621: if (col == 2)
622: return "Name";
623: if (col == 3)
624: return "Max Log";
625: if (col == 4)
626: return "Frequent";
627: if (col == 5)
628: return "Medium";
629: if (col == 6)
630: return "Rare";
631: return "";
632: }
633:
634: public void clear() {
635: names.clear();
636: fireTableDataChanged();
637: }
638:
639: public void update() {
640: fireTableDataChanged();
641: }
642:
643: int[] COLUMN_PREFERED_SIZES = new int[] { 1, 1, 10, 1, 14, 14,
644: 14 };
645:
646: @Override
647: public int getPreferredColumnWidth(int column) {
648: if (column >= 0 && column < COLUMN_PREFERED_SIZES.length)
649: return COLUMN_PREFERED_SIZES[column];
650: return -1;
651: }
652:
653: public int getColumnCount() {
654: return 7;
655: }
656:
657: public int getRowCount() {
658: return names.size();
659: }
660:
661: }
662:
663: }
|