001: /**
002: * Copyright (c) 2004-2006 Regents of the University of California.
003: * See "license-prefuse.txt" for licensing terms.
004: */package prefuse.util;
005:
006: import java.util.Arrays;
007: import java.util.Comparator;
008: import java.util.HashMap;
009: import java.util.HashSet;
010: import java.util.Iterator;
011: import java.util.Map;
012:
013: import prefuse.data.Table;
014: import prefuse.data.Tuple;
015: import prefuse.data.column.ColumnMetadata;
016: import prefuse.data.tuple.TupleSet;
017: import prefuse.util.collections.DefaultLiteralComparator;
018:
019: /**
020: * Functions for processing an iterator of tuples, including the creation
021: * of arrays of particular tuple data values and summary
022: * statistics (min, max, median, mean, standard deviation).
023: *
024: * @author <a href="http://jheer.org">jeffrey heer</a>
025: */
026: public class DataLib {
027:
028: /**
029: * Get an array containing all data values for a given tuple iteration
030: * and field.
031: * @param tuples an iterator over tuples
032: * @param field the column / data field name
033: * @return an array containing the data values
034: */
035: public static Object[] toArray(Iterator tuples, String field) {
036: Object[] array = new Object[100];
037: int i = 0;
038: for (; tuples.hasNext(); ++i) {
039: if (i >= array.length)
040: array = ArrayLib.resize(array, 3 * array.length / 2);
041: array[i] = ((Tuple) tuples.next()).get(field);
042: }
043: return ArrayLib.trim(array, i);
044: }
045:
046: /**
047: * Get an array of doubles containing all column values for a given table
048: * and field. The {@link Table#canGetDouble(String)} method must return
049: * true for the given column name, otherwise an exception will be thrown.
050: * @param tuples an iterator over tuples
051: * @param field the column / data field name
052: * @return an array of doubles containing the column values
053: */
054: public static double[] toDoubleArray(Iterator tuples, String field) {
055: double[] array = new double[100];
056: int i = 0;
057: for (; tuples.hasNext(); ++i) {
058: if (i >= array.length)
059: array = ArrayLib.resize(array, 3 * array.length / 2);
060: array[i] = ((Tuple) tuples.next()).getDouble(field);
061: }
062: return ArrayLib.trim(array, i);
063: }
064:
065: // ------------------------------------------------------------------------
066:
067: /**
068: * Get a sorted array containing all column values for a given tuple
069: * iterator and field.
070: * @param tuples an iterator over tuples
071: * @param field the column / data field name
072: * @return an array containing the column values sorted
073: */
074: public static Object[] ordinalArray(Iterator tuples, String field) {
075: return DataLib.ordinalArray(tuples, field,
076: DefaultLiteralComparator.getInstance());
077: }
078:
079: /**
080: * Get a sorted array containing all column values for a given table and
081: * field.
082: * @param tuples an iterator over tuples
083: * @param field the column / data field name
084: * @param cmp a comparator for sorting the column contents
085: * @return an array containing the column values sorted
086: */
087: public static Object[] ordinalArray(Iterator tuples, String field,
088: Comparator cmp) {
089: // get set of all unique values
090: HashSet set = new HashSet();
091: while (tuples.hasNext())
092: set.add(((Tuple) tuples.next()).get(field));
093:
094: // sort the unique values
095: Object[] o = set.toArray();
096: Arrays.sort(o, cmp);
097: return o;
098: }
099:
100: /**
101: * Get a sorted array containing all column values for a given tuple
102: * iterator and field.
103: * @param tuples a TupleSet
104: * @param field the column / data field name
105: * @return an array containing the column values sorted
106: */
107: public static Object[] ordinalArray(TupleSet tuples, String field) {
108: return ordinalArray(tuples, field, DefaultLiteralComparator
109: .getInstance());
110: }
111:
112: /**
113: * Get a sorted array containing all column values for a given table and
114: * field.
115: * @param tuples a TupleSet
116: * @param field the column / data field name
117: * @param cmp a comparator for sorting the column contents
118: * @return an array containing the column values sorted
119: */
120: public static Object[] ordinalArray(TupleSet tuples, String field,
121: Comparator cmp) {
122: if (tuples instanceof Table) {
123: ColumnMetadata md = ((Table) tuples).getMetadata(field);
124: return md.getOrdinalArray();
125: } else {
126: return ordinalArray(tuples.tuples(), field, cmp);
127: }
128: }
129:
130: // ------------------------------------------------------------------------
131:
132: /**
133: * Get map mapping from column values (as Object instances) to their
134: * ordinal index in a sorted array.
135: * @param tuples an iterator over tuples
136: * @param field the column / data field name
137: * @return a map mapping column values to their position in a sorted
138: * order of values
139: */
140: public static Map ordinalMap(Iterator tuples, String field) {
141: return ordinalMap(tuples, field, DefaultLiteralComparator
142: .getInstance());
143: }
144:
145: /**
146: * Get map mapping from column values (as Object instances) to their
147: * ordinal index in a sorted array.
148: * @param tuples an iterator over tuples
149: * @param field the column / data field name
150: * @param cmp a comparator for sorting the column contents
151: * @return a map mapping column values to their position in a sorted
152: * order of values
153: */
154: public static Map ordinalMap(Iterator tuples, String field,
155: Comparator cmp) {
156: Object[] o = ordinalArray(tuples, field, cmp);
157:
158: // map the values to the non-negative numbers
159: HashMap map = new HashMap();
160: for (int i = 0; i < o.length; ++i)
161: map.put(o[i], new Integer(i));
162: return map;
163: }
164:
165: /**
166: * Get map mapping from column values (as Object instances) to their
167: * ordinal index in a sorted array.
168: * @param tuples a TupleSet
169: * @param field the column / data field name
170: * @return a map mapping column values to their position in a sorted
171: * order of values
172: */
173: public static Map ordinalMap(TupleSet tuples, String field) {
174: return ordinalMap(tuples, field, DefaultLiteralComparator
175: .getInstance());
176: }
177:
178: /**
179: * Get map mapping from column values (as Object instances) to their
180: * ordinal index in a sorted array.
181: * @param tuples a TupleSet
182: * @param field the column / data field name
183: * @param cmp a comparator for sorting the column contents
184: * @return a map mapping column values to their position in a sorted
185: * order of values
186: */
187: public static Map ordinalMap(TupleSet tuples, String field,
188: Comparator cmp) {
189: if (tuples instanceof Table) {
190: ColumnMetadata md = ((Table) tuples).getMetadata(field);
191: return md.getOrdinalMap();
192: } else {
193: return ordinalMap(tuples.tuples(), field, cmp);
194: }
195: }
196:
197: // ------------------------------------------------------------------------
198:
199: /**
200: * Get the number of values in a data column. Duplicates will be counted.
201: * @param tuples an iterator over tuples
202: * @param field the column / data field name
203: * @return the number of values
204: */
205: public static int count(Iterator tuples, String field) {
206: int i = 0;
207: for (; tuples.hasNext(); ++i, tuples.next())
208: ;
209: return i;
210: }
211:
212: /**
213: * Get the number of distinct values in a data column.
214: * @param tuples an iterator over tuples
215: * @param field the column / data field name
216: * @return the number of distinct values
217: */
218: public static int uniqueCount(Iterator tuples, String field) {
219: HashSet set = new HashSet();
220: while (tuples.hasNext())
221: set.add(((Tuple) tuples.next()).get(field));
222: return set.size();
223: }
224:
225: // ------------------------------------------------------------------------
226:
227: /**
228: * Get the Tuple with the minimum data field value.
229: * @param tuples an iterator over tuples
230: * @param field the column / data field name
231: * @return the Tuple with the minimum data field value
232: */
233: public static Tuple min(Iterator tuples, String field) {
234: return min(tuples, field, DefaultLiteralComparator
235: .getInstance());
236: }
237:
238: /**
239: * Get the Tuple with the minimum data field value.
240: * @param tuples an iterator over tuples
241: * @param field the column / data field name
242: * @param cmp a comparator for sorting the column contents
243: * @return the Tuple with the minimum data field value
244: */
245: public static Tuple min(Iterator tuples, String field,
246: Comparator cmp) {
247: Tuple t = null, tmp;
248: Object min = null;
249: if (tuples.hasNext()) {
250: t = (Tuple) tuples.next();
251: min = t.get(field);
252: }
253: while (tuples.hasNext()) {
254: tmp = (Tuple) tuples.next();
255: Object obj = tmp.get(field);
256: if (cmp.compare(obj, min) < 0) {
257: t = tmp;
258: min = obj;
259: }
260: }
261: return t;
262: }
263:
264: /**
265: * Get the Tuple with the minimum data field value.
266: * @param tuples a TupleSet
267: * @param field the column / data field name
268: * @return the Tuple with the minimum data field value
269: */
270: public static Tuple min(TupleSet tuples, String field,
271: Comparator cmp) {
272: if (tuples instanceof Table) {
273: Table table = (Table) tuples;
274: ColumnMetadata md = table.getMetadata(field);
275: return table.getTuple(md.getMinimumRow());
276: } else {
277: return min(tuples.tuples(), field, cmp);
278: }
279: }
280:
281: /**
282: * Get the Tuple with the minimum data field value.
283: * @param tuples a TupleSet
284: * @param field the column / data field name
285: * @return the Tuple with the minimum data field value
286: */
287: public static Tuple min(TupleSet tuples, String field) {
288: return min(tuples, field, DefaultLiteralComparator
289: .getInstance());
290: }
291:
292: // ------------------------------------------------------------------------
293:
294: /**
295: * Get the Tuple with the maximum data field value.
296: * @param tuples an iterator over tuples
297: * @param field the column / data field name
298: * @return the Tuple with the maximum data field value
299: */
300: public static Tuple max(Iterator tuples, String field) {
301: return max(tuples, field, DefaultLiteralComparator
302: .getInstance());
303: }
304:
305: /**
306: * Get the Tuple with the maximum data field value.
307: * @param tuples an iterator over tuples
308: * @param field the column / data field name
309: * @param cmp a comparator for sorting the column contents
310: * @return the Tuple with the maximum data field value
311: */
312: public static Tuple max(Iterator tuples, String field,
313: Comparator cmp) {
314: Tuple t = null, tmp;
315: Object min = null;
316: if (tuples.hasNext()) {
317: t = (Tuple) tuples.next();
318: min = t.get(field);
319: }
320: while (tuples.hasNext()) {
321: tmp = (Tuple) tuples.next();
322: Object obj = tmp.get(field);
323: if (cmp.compare(obj, min) > 0) {
324: t = tmp;
325: min = obj;
326: }
327: }
328: return t;
329: }
330:
331: /**
332: * Get the Tuple with the maximum data field value.
333: * @param tuples a TupleSet
334: * @param field the column / data field name
335: * @return the Tuple with the maximum data field value
336: */
337: public static Tuple max(TupleSet tuples, String field,
338: Comparator cmp) {
339: if (tuples instanceof Table) {
340: Table table = (Table) tuples;
341: ColumnMetadata md = table.getMetadata(field);
342: return table.getTuple(md.getMaximumRow());
343: } else {
344: return max(tuples.tuples(), field, cmp);
345: }
346: }
347:
348: /**
349: * Get the Tuple with the maximum data field value.
350: * @param tuples a TupleSet
351: * @param field the column / data field name
352: * @return the Tuple with the maximum data field value
353: */
354: public static Tuple max(TupleSet tuples, String field) {
355: return max(tuples, field, DefaultLiteralComparator
356: .getInstance());
357: }
358:
359: // ------------------------------------------------------------------------
360:
361: /**
362: * Get the Tuple with the median data field value.
363: * @param tuples an iterator over tuples
364: * @param field the column / data field name
365: * @return the Tuple with the median data field value
366: */
367: public static Tuple median(Iterator tuples, String field) {
368: return median(tuples, field, DefaultLiteralComparator
369: .getInstance());
370: }
371:
372: /**
373: * Get the Tuple with the median data field value.
374: * @param tuples an iterator over tuples
375: * @param field the column / data field name
376: * @param cmp a comparator for sorting the column contents
377: * @return the Tuple with the median data field value
378: */
379: public static Tuple median(Iterator tuples, String field,
380: Comparator cmp) {
381: Object[] t = new Tuple[100];
382: int i = 0;
383: for (; tuples.hasNext(); ++i) {
384: if (i >= t.length)
385: t = ArrayLib.resize(t, 3 * t.length / 2);
386: t[i] = (Tuple) tuples.next();
387: }
388: ArrayLib.trim(t, i);
389:
390: Object[] v = new Object[t.length];
391: int[] idx = new int[t.length];
392: for (i = 0; i < t.length; ++i) {
393: idx[i] = i;
394: v[i] = ((Tuple) t[i]).get(field);
395: }
396:
397: ArrayLib.sort(v, idx, cmp);
398: return (Tuple) t[idx[idx.length / 2]];
399: }
400:
401: /**
402: * Get the Tuple with the median data field value.
403: * @param tuples a TupleSet
404: * @param field the column / data field name
405: * @return the Tuple with the median data field value
406: */
407: public static Tuple median(TupleSet tuples, String field,
408: Comparator cmp) {
409: if (tuples instanceof Table) {
410: Table table = (Table) tuples;
411: ColumnMetadata md = table.getMetadata(field);
412: return table.getTuple(md.getMedianRow());
413: } else {
414: return median(tuples.tuples(), field, cmp);
415: }
416: }
417:
418: /**
419: * Get the Tuple with the median data field value.
420: * @param tuples a TupleSet
421: * @param field the column / data field name
422: * @return the Tuple with the median data field value
423: */
424: public static Tuple median(TupleSet tuples, String field) {
425: return median(tuples, field, DefaultLiteralComparator
426: .getInstance());
427: }
428:
429: // ------------------------------------------------------------------------
430:
431: /**
432: * Get the mean value of a tuple data value. If any tuple does not have the
433: * named field or the field is not a numeric data type, NaN will be returned.
434: * @param tuples an iterator over tuples
435: * @param field the column / data field name
436: * @return the mean value, or NaN if a non-numeric data type is encountered
437: */
438: public static double mean(Iterator tuples, String field) {
439: try {
440: int count = 0;
441: double sum = 0;
442:
443: while (tuples.hasNext()) {
444: sum += ((Tuple) tuples.next()).getDouble(field);
445: ++count;
446: }
447: return sum / count;
448: } catch (Exception e) {
449: return Double.NaN;
450: }
451: }
452:
453: /**
454: * Get the standard deviation of a tuple data value. If any tuple does not
455: * have the named field or the field is not a numeric data type, NaN will be
456: * returned.
457: * @param tuples an iterator over tuples
458: * @param field the column / data field name
459: * @return the standard deviation value, or NaN if a non-numeric data type
460: * is encountered
461: */
462: public static double deviation(Iterator tuples, String field) {
463: return deviation(tuples, field, DataLib.mean(tuples, field));
464: }
465:
466: /**
467: * Get the standard deviation of a tuple data value. If any tuple does not
468: * have the named field or the field is not a numeric data type, NaN will be
469: * returned.
470: * @param tuples an iterator over tuples
471: * @param field the column / data field name
472: * @param mean the mean of the column, used to speed up accurate
473: * deviation calculation
474: * @return the standard deviation value, or NaN if a non-numeric data type
475: * is encountered
476: */
477: public static double deviation(Iterator tuples, String field,
478: double mean) {
479: try {
480: int count = 0;
481: double sumsq = 0;
482: double x;
483:
484: while (tuples.hasNext()) {
485: x = ((Tuple) tuples.next()).getDouble(field) - mean;
486: sumsq += x * x;
487: ++count;
488: }
489: return Math.sqrt(sumsq / count);
490: } catch (Exception e) {
491: return Double.NaN;
492: }
493: }
494:
495: /**
496: * Get the sum of a tuple data value. If any tuple does not have the named
497: * field or the field is not a numeric data type, NaN will be returned.
498: * @param tuples an iterator over tuples
499: * @param field the column / data field name
500: * @return the sum, or NaN if a non-numeric data type is encountered
501: */
502: public static double sum(Iterator tuples, String field) {
503: try {
504: double sum = 0;
505:
506: while (tuples.hasNext()) {
507: sum += ((Tuple) tuples.next()).getDouble(field);
508: }
509: return sum;
510: } catch (Exception e) {
511: return Double.NaN;
512: }
513: }
514:
515: // ------------------------------------------------------------------------
516:
517: /**
518: * Infer the data field type across all tuples in a TupleSet.
519: * @param tuples the TupleSet to analyze
520: * @param field the data field to type check
521: * @return the inferred data type
522: * @throws IllegalArgumentException if incompatible types are used
523: */
524: public static Class inferType(TupleSet tuples, String field) {
525: if (tuples instanceof Table) {
526: return ((Table) tuples).getColumnType(field);
527: } else {
528: Class type = null, type2 = null;
529: Iterator iter = tuples.tuples();
530: while (iter.hasNext()) {
531: Tuple t = (Tuple) iter.next();
532: if (type == null) {
533: type = t.getColumnType(field);
534: } else if (!type.equals(type2 = t.getColumnType(field))) {
535: if (type2.isAssignableFrom(type)) {
536: type = type2;
537: } else if (!type.isAssignableFrom(type2)) {
538: throw new IllegalArgumentException(
539: "The data field ["
540: + field
541: + "] does not have "
542: + "a consistent type across provided Tuples");
543: }
544: }
545: }
546: return type;
547: }
548: }
549:
550: } // end of class DataLib
|