001: package jcommontk.inflector;
002:
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.regex.Pattern;

import jcommontk.utils.StringUtils;
010:
011: /**
012: * A class for plurilization of nouns.
013: *
014: * <p>Based on the work:
015: * <pre>
016: * An Algorithmic Approach to English Pluralization
017: *
018: * Damian Conway
019: *
020: * School of Computer Science and Software Engineering
021: * Monash University
022: * Clayton 3168, Australia
023: *
024: * mailto:damian@csse.monash.edu.au
025: * http://www.csse.monash.edu.au/~damian
026: * </pre>
027: */
028: @SuppressWarnings("unchecked")
029: // working to complete a Java 1.5 version
030: public class SimpleInflector {
031: static String[] pluralSuffixes = new String[] { ".*fish", ".*ois",
032: ".*sheep", ".*deer", ".*pox", ".*[a-z].*ese", ".*itis" };
033:
034: static Set ignoreInflection = arrayToSet(new String[] { "bison",
035: "flounder", "pliers", "bream", "gallows", "proceedings",
036: "breeches", "graffiti", "rabies", "britches",
037: "headquarters", "salmon", "carp", "herpes", "scissors",
038: "chassis", "highjinks", "seabass", "clippers", "homework",
039: "series", "cod", "innings", "shears", "contretemps",
040: "jackanapes", "species", "corps", "mackerel", "swine",
041: "debris", "measles", "trout", "diabetes", "mews", "tuna",
042: "djinn", "mumps", "whiting", "eland", "news", "wildebeest",
043: "elk", "pincers" });
044:
045: static Map irregularNouns = arrayToMap(new Object[][] {
046: { "beef", new String[] { "beefs", "beeves" } },
047: { "brother", new String[] { "brothers", "brethren" } },
048: { "child", new String[] { "children" } },
049: { "cow", new String[] { "cows", "kine" } },
050: { "ephemeris", new String[] { "ephemerides" } },
051: { "genie", new String[] { "genies", "genii" } },
052: { "money", new String[] { "moneys", "monies" } },
053: { "mongoose", new String[] { "mongooses" } },
054: { "mythos", new String[] { "mythoi" } },
055: { "octopus", new String[] { "octopuses", "octopodes" } },
056: { "ox", new String[] { "oxen" } },
057: { "soliloquy", new String[] { "soliloquies" } },
058: { "trilby", new String[] { "trilbys" } },
059: { "loaf", new String[] { "loaves" } },
060: { "hoof", new String[] { "hoofs" } },
061: { "graffito", new String[] { "graffiti" } },
062: { "ganglion", new String[] { "ganglions" } },
063: { "turf", new String[] { "turfs" } },
064: { "numen", new String[] { "numina" } },
065: { "atman", new String[] { "atmas" } },
066: { "genus", new String[] { "genera" } },
067: { "occiput", new String[] { "occiputs" } } });
068:
069: static String[][] inflectionSuffixes = new String[][] {
070: { ".*man", "man", "men" }, { ".*[lm]ouse", "ouse", "ice" },
071: { ".*tooth", "tooth", "teeth" },
072: { ".*goose", "goose", "geese" },
073: { ".*foot", "foot", "feet" }, { ".*zoon", "zoon", "zoa" },
074: { ".*[csx]is", "is", "es" },
075: { ".*trix", "trix", "trices" }, { ".*eau", "eau", "eaux" },
076: { ".*ieu", "ieu", "ieux" },
077: { ".*..[iay]nx", "nx", "nges" }, { ".*[cs]h", "h", "hes" },
078: { ".*ss", "ss", "sses" }, { ".*[aeo]lf", "f", "ves" },
079: { ".*[^d]eaf", "f", "ves" }, { ".*arf", "f", "ves" },
080: { ".*[nlw]ife", "fe", "ves" }, { ".*[aeiou]y", "y", "ys" },
081: { ".*[A-Z].*y", "y", "ys" }, { ".*y", "y", "ies" },
082: { ".*[aeiou]o", "o", "os" }, { ".*o", "o", "oes" } };
083:
084: static Object[][] inflectionCategories = new Object[][] {
085: {
086: arrayToSet(new String[] { "acropolis", "chaos",
087: "lens", "aegis", "cosmos", "mantis",
088: "alias", "dais", "marquis", "asbestos",
089: "digitalis", "metropolis", "corpus",
090: "opus", "penis", "testis", "atlas",
091: "epidermis", "pathos", "bathos", "ethos",
092: "pelvis", "bias", "gas", "polis", "caddis",
093: "glottis", "rhinoceros", "cannabis",
094: "glottis", "sassafras", "canvas", "ibis",
095: "trellis" }), "s", "es" },
096: {
097: arrayToSet(new String[] { "alumna", "alga",
098: "vertebra" }), "a", "ae" },
099: {
100: arrayToSet(new String[] { "abscissa", "formula",
101: "medusa", "amoeba", "hydra", "nebula",
102: "antenna", "hyperbola", "nova", "aurora",
103: "lacuna", "parabola" }), "a", "as", "ae" },
104: {
105: arrayToSet(new String[] { "anathema", "enema",
106: "oedema", "bema", "enigma", "sarcoma",
107: "carcinoma", "gumma", "schema", "charisma",
108: "lemma", "soma", "diploma", "lymphoma",
109: "stigma", "dogma", "magma", "stoma",
110: "drama", "melisma", "trauma", "edema",
111: "miasma" }), "a", "as", "ata" },
112: {
113: arrayToSet(new String[] { "stamen", "foramen",
114: "lumen" }), "en", "ens", "ina" },
115: { arrayToSet(new String[] { "codex", "murex", "silex" }),
116: "ex", "ices" },
117: {
118: arrayToSet(new String[] { "apex", "latex",
119: "vertex", "cortex", "pontifex", "vortex",
120: "index", "simplex" }), "ex", "exes", "ices" },
121: { arrayToSet(new String[] { "iris", "clitoris" }), "is",
122: "ises", "ides" },
123: {
124: arrayToSet(new String[] { "albino",
125: "generalissimo", "manifesto",
126: "archipelago", "ghetto", "medico",
127: "armadillo", "guano", "octavo", "commando",
128: "inferno", "photo", "ditto", "jumbo",
129: "pro", "dynamo", "lingo", "quarto",
130: "embryo", "lumbago", "rhino", "fiasco",
131: "magneto", "stylo" }), "o", "os" },
132: {
133: arrayToSet(new String[] { "alto", "contralto",
134: "soprano", "basso", "crescendo", "tempo",
135: "canto", "solo" }), "o", "os", "i" },
136: {
137: arrayToSet(new String[] { "aphelion", "hyperbaton",
138: "perihelion", "asyndeton", "noumenon",
139: "phenomenon", "criterion", "organon",
140: "prolegomenon", }), "on", "a" },
141: {
142: arrayToSet(new String[] { "agendum", "datum",
143: "extremum", "bacterium", "desideratum",
144: "stratum", "candelabrum", "erratum", "ovum" }),
145: "um", "a" },
146: {
147: arrayToSet(new String[] { "aquarium",
148: "interregnum", "quantum", "compendium",
149: "lustrum", "rostrum", "consortium",
150: "maximum", "spectrum", "cranium", "medium",
151: "speculum", "curriculum", "memorandum",
152: "stadium", "dictum", "millenium",
153: "trapezium", "emporium", "minimum",
154: "ultimatum", "enconium", "momentum",
155: "vacuum", "gymnasium", "optimum", "velum",
156: "honorarium", "phylum" }), "um", "ums", "a" },
157: {
158: arrayToSet(new String[] { "focus", "nimbus",
159: "succubus", "fungus", "nucleolus", "torus",
160: "genius", "radius", "umbilicus", "incubus",
161: "stylus", "uterus" }), "us", "uses", "i" },
162: {
163: arrayToSet(new String[] { "apparatus", "impetus",
164: "prospectus", "cantus", "nexus", "sinus",
165: "coitus", "plexus", "status", "hiatus" }),
166: "us", "uses", "us" },
167: { arrayToSet(new String[] { "afreet", "afrit", "efreet" }),
168: "i" },
169: { arrayToSet(new String[] { "cherub", "goy", "seraph" }),
170: "im" } };
171:
172: /**
173: * This method pluralizes words (nouns only). In the event of a multiple segment
174: * word (camel case/underline) only the last segment will be pluralized.
175: *
176: * @param word the word to pluralize
177: * @return returns an array of one or more pluralizations (modern followed by classic)
178: */
179: public static String[] pluralize(String word) {
180: String[] words = parseWord(word), pluralizedWords = null;
181:
182: word = words[words.length - 1];
183:
184: if (doNotInflectPlural(word) || ignoreInflection.contains(word))
185: pluralizedWords = new String[] { word };
186:
187: if (pluralizedWords == null) {
188: if (irregularNouns.containsKey(word))
189: pluralizedWords = (String[]) irregularNouns.get(word);
190:
191: if (pluralizedWords == null) {
192: Object[] category = getCategory(word);
193:
194: if (category != null)
195: pluralizedWords = getCategoryPlurals(category, word);
196:
197: if (pluralizedWords == null) {
198: for (int i = 0; pluralizedWords == null
199: && i < inflectionSuffixes.length; i++)
200: if (word.matches(inflectionSuffixes[i][0]))
201: pluralizedWords = new String[] { getInflection(
202: word, inflectionSuffixes[i][1],
203: inflectionSuffixes[i][2]) };
204:
205: if (pluralizedWords == null)
206: pluralizedWords = new String[] { word + "s" };
207: }
208: }
209: }
210:
211: return repair(words, pluralizedWords);
212: }
213:
214: static String[] parseWord(String word) {
215: Vector words = new Vector();
216:
217: word = StringUtils.camelCaseToLowerCaseUnderline(word);
218: StringTokenizer strtok = new StringTokenizer(word, "_");
219:
220: while (strtok.hasMoreTokens())
221: words.add(strtok.nextToken());
222:
223: return (String[]) words.toArray(new String[words.size()]);
224: }
225:
226: static String[] repair(String[] segments, String[] pluralizedWords) {
227: Vector pluralized = new Vector();
228: String combined = "";
229:
230: for (int i = 0; i < segments.length - 1; i++)
231: combined += (combined.length() > 0 ? "_" : "")
232: + segments[i];
233:
234: if (combined.length() > 0)
235: combined += "_";
236:
237: for (int i = 0; i < pluralizedWords.length; i++)
238: pluralized.add(StringUtils
239: .lowerCaseUnderlineToCamelCase(combined
240: + pluralizedWords[i]));
241:
242: return (String[]) pluralized.toArray(new String[pluralized
243: .size()]);
244: }
245:
246: static boolean doNotInflectPlural(String word) {
247: for (int i = 0; i < pluralSuffixes.length; i++)
248: if (word.matches(pluralSuffixes[i]))
249: return true;
250:
251: return false;
252: }
253:
254: static Object[] getCategory(String word) {
255: for (int i = 0; i < inflectionCategories.length; i++)
256: if (((Set) inflectionCategories[i][0]).contains(word))
257: return inflectionCategories[i];
258:
259: return null;
260: }
261:
262: static String[] getCategoryPlurals(Object[] category, String word) {
263: String suffix = (String) category[1];
264: Vector plurals = new Vector();
265:
266: if (category.length == 2)
267: plurals.add(word + (String) category[1]);
268: else {
269: for (int i = 2; i < category.length; i++)
270: plurals.add(getInflection(word, suffix,
271: (String) category[i]));
272: }
273:
274: return (String[]) plurals.toArray(new String[plurals.size()]);
275: }
276:
277: static String getInflection(String word, String suffix,
278: String newSuffix) {
279: return word.substring(0, word.length() - suffix.length())
280: + newSuffix;
281: }
282:
283: static Set arrayToSet(Object[] array) {
284: Set set = new HashSet();
285:
286: for (int i = 0; i < array.length; i++)
287: set.add(array[i]);
288:
289: return set;
290: }
291:
292: static Map arrayToMap(Object[][] array) {
293: Map map = new HashMap();
294:
295: for (int i = 0; i < array.length; i++)
296: map.put(array[i][0], array[i][1]);
297:
298: return map;
299: }
300: }
|