001: // Copyright (c) 2001, 2003 Per M.A. Bothner and Brainfood Inc.
002: // This is free software; for terms and warranty disclaimer see ./COPYING.
003:
004: package gnu.xquery.util;
005:
006: import gnu.lists.*;
007: import gnu.math.*;
008: import gnu.mapping.*;
009: import gnu.xml.TextUtils;
010: import gnu.kawa.xml.KNode;
011: import gnu.kawa.xml.UntypedAtomic; /* #ifdef use:java.util.regex */
012: import java.util.regex.Pattern;
013: import java.util.regex.Matcher; /* #endif */
014: import gnu.text.*;
015:
016: /* #ifdef use:java.text.Normalizer */
017: // import java.text.Normalizer;
018: /* #endif */
019:
020: public class StringUtils {
021: private static String ERROR_VALUE = "<error>";
022:
023: static String coerceToString(Object arg, String functionName,
024: int iarg, String onEmpty) {
025: if (arg instanceof KNode)
026: arg = KNode.atomicValue(arg);
027: if ((arg == Values.empty || arg == null)
028: && onEmpty != ERROR_VALUE)
029: return onEmpty;
030: if (arg instanceof UntypedAtomic
031: /* #ifdef use:java.lang.CharSequence */
032: || arg instanceof CharSequence
033: /* #else */
034: // || arg instanceof String
035: /* #endif */
036: /* #ifdef use:java.net.URI */
037: || arg instanceof java.net.URI
038: /* #endif */
039: || arg instanceof Path)
040: return arg.toString();
041: throw new WrongType(functionName, iarg, arg,
042: onEmpty == ERROR_VALUE ? "xs:string" : "xs:string?");
043: }
044:
045: public static Object lowerCase(Object node) {
046: return coerceToString(node, "lower-case", 1, "").toLowerCase();
047: }
048:
049: public static Object upperCase(Object node) {
050: return coerceToString(node, "upper-case", 1, "").toUpperCase();
051: }
052:
053: static double asDouble(Object value) {
054: if (!(value instanceof Number))
055: value = NumberValue.numberValue(value);
056: return ((Number) value).doubleValue();
057: }
058:
059: public static Object substring(Object str, Object start) {
060: double d1 = asDouble(start);
061: if (Double.isNaN(d1))
062: return "";
063: int i = (int) (d1 - 0.5);
064: if (i < 0)
065: i = 0;
066: String s = coerceToString(str, "substring", 1, "");
067: int len = s.length();
068: int offset = 0;
069: while (--i >= 0) {
070: if (offset >= len)
071: return "";
072: char ch = s.charAt(offset++);
073: if (ch >= 0xD800 && ch < 0xDC00 && offset < len)
074: offset++;
075: }
076: return s.substring(offset);
077: }
078:
079: public static Object substring(Object str, Object start,
080: Object length) {
081: String s = coerceToString(str, "substring", 1, "");
082: int len = s.length();
083: // Don't use Math.round because it returns 0 given NaN!
084: // We pre-subtract 1 before rounding.
085: double d1 = Math.floor(asDouble(start) - 0.5);
086: double d2 = d1 + Math.floor(asDouble(length) + 0.5);
087: if (d1 <= 0)
088: d1 = 0;
089: if (d2 > len)
090: d2 = len;
091: if (d2 <= d1) // Including the case where either is NaN.
092: return "";
093: int i1 = (int) d1;
094: int i2 = (int) d2 - i1;
095: int offset = 0;
096: while (--i1 >= 0) {
097: if (offset >= len)
098: return "";
099: char ch = s.charAt(offset++);
100: if (ch >= 0xD800 && ch < 0xDC00 && offset < len)
101: offset++;
102: }
103: i1 = offset;
104: while (--i2 >= 0) {
105: if (offset >= len)
106: return "";
107: char ch = s.charAt(offset++);
108: if (ch >= 0xD800 && ch < 0xDC00 && offset < len)
109: offset++;
110: }
111: i2 = offset;
112: return s.substring(i1, i2);
113: }
114:
115: public static Object stringLength(Object str) {
116: String s = coerceToString(str, "string-length", 1, "");
117: int slen = s.length();
118: int len = 0;
119: for (int i = 0; i < slen;) {
120: char ch = s.charAt(i++);
121: if (ch >= 0xD800 && ch < 0xDC00 && i < slen)
122: i++;
123: len++;
124: }
125: return IntNum.make(len);
126: }
127:
128: public static Object substringBefore(Object str, Object find) {
129: String s = coerceToString(str, "substring-before", 1, "");
130: String f = coerceToString(find, "substring-before", 2, "");
131: int flen = f.length();
132:
133: if (flen == 0)
134: return "";
135: int start = s.indexOf(f);
136: return start >= 0 ? s.substring(0, start) : "";
137: }
138:
139: public static Object substringAfter(Object str, Object find) {
140: String s = coerceToString(str, "substring-after", 1, "");
141: String f = coerceToString(find, "substring-after", 2, "");
142: int flen = f.length();
143:
144: if (flen == 0)
145: return s;
146:
147: int start = s.indexOf(f);
148: return start >= 0 ? s.substring(start + flen) : "";
149: }
150:
151: public static Object translate(Object str, Object map, Object trans) {
152: String sv = coerceToString(str, "translate", 1, "");
153: map = KNode.atomicValue(map);
154: if (!(map instanceof UntypedAtomic || map instanceof String))
155: throw new WrongType("translate", 2, str, "xs:string");
156: String m = map.toString();
157: int mlen = m.length();
158:
159: trans = KNode.atomicValue(trans);
160: if (!(trans instanceof UntypedAtomic || trans instanceof String))
161: throw new WrongType("translate", 3, str, "xs:string");
162: String t = trans.toString();
163:
164: if (mlen == 0)
165: return sv;
166:
167: int slen = sv.length();
168: StringBuffer s = new StringBuffer(slen);
169: int tlen = t.length();
170:
171: mainLoop: for (int i = 0; i < slen;) {
172: char c1 = sv.charAt(i++);
173: char c2 = 0;
174: if (c1 >= 0xD800 && c1 < 0xDC00 && i < slen)
175: c2 = sv.charAt(i++);
176: int j = 0;
177: for (int mi = 0; mi < mlen;) {
178: char m1 = m.charAt(mi++);
179: char m2 = 0;
180: if (m1 >= 0xD800 && m1 < 0xDC00 && mi < mlen)
181: m2 = m.charAt(mi++);
182: if (m1 == c1 && m2 == c2) {
183: for (int ti = 0;; j--) {
184: if (ti >= tlen)
185: continue mainLoop;
186: char t1 = t.charAt(ti++);
187: char t2 = 0;
188: if (t1 >= 0xD800 && t1 < 0xDC00 && ti < tlen)
189: t2 = t.charAt(ti++);
190: if (j == 0) {
191: c1 = t1;
192: c2 = t2;
193: break;
194: }
195: }
196: break;
197: }
198: j++;
199: }
200: s.append(c1);
201: if (c2 != 0)
202: s.append(c2);
203: }
204:
205: return s.toString();
206: }
207:
208: public static Object stringPad(Object str, Object padcount) {
209: int count = ((Number) NumberValue.numberValue(padcount))
210: .intValue();
211: if (count <= 0) {
212: if (count == 0)
213: return "";
214: throw new IndexOutOfBoundsException(
215: "Invalid string-pad count");
216: }
217:
218: String sv = coerceToString(str, "string-pad", 1, "");
219: int slen = sv.length();
220: StringBuffer s = new StringBuffer(count * slen);
221: for (int i = 0; i < count; i++)
222: s.append(sv);
223:
224: return s.toString();
225: }
226:
227: public static Object contains(Object str, Object contain) {
228: String s = coerceToString(str, "contains", 1, "");
229: String c = coerceToString(contain, "contains", 2, "");
230:
231: return s.indexOf(c) < 0 ? Boolean.FALSE : Boolean.TRUE;
232: }
233:
234: public static Object startsWith(Object str, Object with) {
235: String s = coerceToString(str, "starts-with", 1, "");
236: String w = coerceToString(with, "starts-with", 2, "");
237:
238: return s.startsWith(w) ? Boolean.TRUE : Boolean.FALSE;
239: }
240:
241: public static Object endsWith(Object str, Object with) {
242: String s = coerceToString(str, "ends-with", 1, "");
243: String w = coerceToString(with, "ends-with", 2, "");
244: return s.endsWith(w) ? Boolean.TRUE : Boolean.FALSE;
245: }
246:
247: public static Object stringJoin(Object strseq, Object join) {
248: StringBuffer s = new StringBuffer();
249: String glue = coerceToString(join, "string-join", 2,
250: ERROR_VALUE);
251: int glen = glue.length();
252: int index = 0;
253: boolean started = false;
254:
255: while ((index = Values.nextIndex(strseq, index)) >= 0) {
256: Object obj = Values.nextValue(strseq, index - 1);
257: if (obj == Values.empty)
258: continue;
259:
260: if (started && glen > 0)
261: s.append(glue);
262: s.append(TextUtils.stringValue(obj));
263: started = true;
264: }
265:
266: return s.toString();
267: }
268:
269: public static String concat$V(Object arg1, Object arg2,
270: Object[] args) {
271: arg1 = SequenceUtils.coerceToZeroOrOne(arg1, "concat", 1);
272: String str1 = TextUtils.stringValue(arg1);
273: arg2 = SequenceUtils.coerceToZeroOrOne(arg2, "concat", 2);
274: String str2 = TextUtils.stringValue(arg2);
275: /* #ifdef JAVA5 */
276: // StringBuilder result = new StringBuilder(str1);
277: /* #else */
278: StringBuffer result = new StringBuffer(str1);
279: /* #endif */
280: result.append(str2);
281: int count = args.length;
282: for (int i = 0; i < count; i++) {
283: Object arg = SequenceUtils.coerceToZeroOrOne(args[i],
284: "concat", i + 2);
285: result.append(TextUtils.stringValue(arg));
286: }
287: return result.toString();
288: }
289:
290: /** This implements the XQuery <code>fn:compare</code> function. */
291: public static Object compare(Object val1, Object val2,
292: NamedCollator coll) {
293: if (val1 == Values.empty || val1 == null
294: || val2 == Values.empty || val2 == null)
295: return Values.empty;
296: if (coll == null)
297: coll = NamedCollator.codepointCollation;
298: int ret = coll.compare(val1.toString(), val2.toString());
299: return ret < 0 ? IntNum.minusOne() : ret > 0 ? IntNum.one()
300: : IntNum.zero();
301: }
302:
303: public static void stringToCodepoints$X(Object arg, CallContext ctx) {
304: String str = coerceToString(arg, "string-to-codepoints", 1, "");
305: int len = str.length();
306: Consumer out = ctx.consumer;
307: for (int i = 0; i < len;) {
308: int ch = str.charAt(i++);
309: if (ch >= 0xD800 && ch < 0xDC00 && i < len)
310: ch = (ch - 0xD800) * 0x400 + (str.charAt(i++) - 0xDC00)
311: + 0x10000;
312: out.writeInt(ch);
313: }
314: }
315:
316: private static void appendCodepoint(Object code, StringBuffer sbuf) {
317: IntNum I = (IntNum) gnu.kawa.xml.XIntegerType.integerType
318: .cast(code);
319: int i = I.intValue();
320: if (i <= 0
321: || (i > 0xD7FF && (i < 0xE000
322: || (i > 0xFFFD && i < 0x10000) || i > 0x10FFFF)))
323: throw new IllegalArgumentException("codepoints-to-string: "
324: + i + " is not a valid XML character [FOCH0001]");
325: if (i >= 0x10000) {
326: sbuf.append((char) (((i - 0x10000) >> 10) + 0xD800));
327: i = (i & 0x3FF) + 0xDC00;
328: }
329: sbuf.append((char) i);
330: }
331:
332: public static String codepointsToString(Object arg) {
333: if (arg == null)
334: return "";
335: StringBuffer sbuf = new StringBuffer();
336: if (arg instanceof Values) {
337: Values vals = (Values) arg;
338: int ipos = vals.startPos();
339: while ((ipos = vals.nextPos(ipos)) != 0)
340: appendCodepoint(vals.getPosPrevious(ipos), sbuf);
341: } else
342: appendCodepoint(arg, sbuf);
343: return sbuf.toString();
344: }
345:
346: public static String encodeForUri(Object arg) {
347: return encodeForUri(arg, 'U');
348: }
349:
350: public static String iriToUri(Object arg) {
351: return encodeForUri(arg, 'I');
352: }
353:
354: public static String escapeHtmlUri(Object arg) {
355: return encodeForUri(arg, 'H');
356: }
357:
358: static String encodeForUri(Object arg, char mode) {
359: String str;
360: if (arg instanceof String || arg instanceof UntypedAtomic)
361: str = arg.toString();
362: else if (arg == null || arg == Values.empty)
363: str = "";
364: else
365: throw new ClassCastException();
366: return URIPath.encodeForUri(str, mode);
367: }
368:
369: public static String normalizeSpace(Object arg) {
370: String str = coerceToString(arg, "normalize-space", 1, "");
371: int len = str.length();
372: StringBuffer sbuf = null;
373: int skipped = 0;
374: for (int i = 0; i < len; i++) {
375: char ch = str.charAt(i);
376: if (Character.isWhitespace(ch)) {
377: if (sbuf == null && skipped == 0 && i > 0)
378: sbuf = new StringBuffer(str.substring(0, i));
379: skipped++;
380: } else {
381: if (skipped > 0) {
382: if (sbuf != null)
383: sbuf.append(' ');
384: else if (skipped > 1 || i == 1
385: || str.charAt(i - 1) != ' ')
386: sbuf = new StringBuffer();
387: skipped = 0;
388: }
389: if (sbuf != null)
390: sbuf.append(ch);
391: }
392: }
393: return sbuf != null ? sbuf.toString() : skipped > 0 ? "" : str;
394: }
395:
396: /* #ifdef use:java.util.regex */
397: public static Pattern makePattern(String pattern, String flags) {
398: int fl = 0;
399: for (int i = flags.length(); --i >= 0;) {
400: char ch = flags.charAt(i);
401: switch (ch) {
402: case 'i':
403: fl |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
404: break;
405: case 's':
406: fl |= Pattern.DOTALL;
407: break;
408: case 'x':
409: StringBuffer sbuf = new StringBuffer();
410: int plen = pattern.length();
411: for (int j = 0; j < plen; j++) {
412: char pch = pattern.charAt(j);
413: if (!Character.isWhitespace(pch))
414: sbuf.append(pch);
415: }
416: pattern = sbuf.toString();
417: break;
418: case 'm':
419: fl |= Pattern.MULTILINE;
420: break;
421: default:
422: throw new IllegalArgumentException(
423: "unknown 'replace' flag");
424: }
425: }
426: return Pattern.compile(pattern, fl);
427: }
428:
429: /* #endif */
430:
431: public static boolean matches(Object input, String pattern) {
432: return matches(input, pattern, "");
433: }
434:
435: public static boolean matches(Object arg, String pattern,
436: String flags) {
437: /* #ifdef use:java.util.regex */
438: String str;
439: if (arg instanceof String || arg instanceof UntypedAtomic)
440: str = arg.toString();
441: else if (arg == null || arg == Values.empty)
442: str = "";
443: else
444: throw new ClassCastException();
445: return makePattern(pattern, flags).matcher(str).find();
446: /* #else */
447: // throw new Error("fn:matches requires java.util.regex (JDK 1.4 or equivalent)");
448: /* #endif */
449: }
450:
451: public static String replace(Object input, String pattern,
452: String replacement) {
453: return replace(input, pattern, replacement, "");
454: }
455:
456: public static String replace(Object arg, String pattern,
457: String replacement, String flags) {
458: /* #ifdef use:java.util.regex */
459: String str;
460: if (arg instanceof String || arg instanceof UntypedAtomic)
461: str = arg.toString();
462: else if (arg == null || arg == Values.empty)
463: str = "";
464: else
465: throw new ClassCastException();
466: return makePattern(pattern, flags).matcher(str).replaceAll(
467: replacement);
468: /* #else */
469: // throw new Error("fn:replace requires java.util.regex (JDK 1.4 or equivalent)");
470: /* #endif */
471: }
472:
473: public static void tokenize$X(Object arg, String pattern,
474: CallContext ctx) {
475: tokenize$X(arg, pattern, "", ctx);
476: }
477:
478: public static void tokenize$X(Object arg, String pattern,
479: String flags, CallContext ctx) {
480: /* #ifdef use:java.util.regex */
481: String str;
482: if (arg instanceof String || arg instanceof UntypedAtomic)
483: str = arg.toString();
484: else if (arg == null || arg == Values.empty)
485: str = "";
486: else
487: throw new ClassCastException();
488: Consumer out = ctx.consumer;
489: Matcher matcher = makePattern(pattern, flags).matcher(str);
490: int len = str.length();
491: if (len == 0)
492: return;
493: int start = 0;
494: for (;;) {
495: boolean matched = matcher.find();
496: if (!matched) {
497: out.writeObject(str.substring(start));
498: break;
499: }
500: int end = matcher.start();
501: out.writeObject(str.substring(start, end));
502: start = matcher.end();
503: if (start == end)
504: throw new IllegalArgumentException(
505: "pattern matches empty string");
506: }
507: /* #else */
508: // throw new Error("fn:tokenize requires java.util.regex (JDK 1.4 or equivalent)");
509: /* #endif */
510: }
511:
512: public static Object codepointEqual(Object arg1, Object arg2) {
513: String str1 = coerceToString(arg1, "codepoint-equal", 1, null);
514: String str2 = coerceToString(arg2, "codepoint-equal", 2, null);
515: if (str1 == null || str2 == null)
516: return Values.empty;
517: return str1.equals(str2) ? Boolean.TRUE : Boolean.FALSE;
518: }
519:
520: public static Object normalizeUnicode(Object arg) {
521: return normalizeUnicode(arg, "NFC");
522: }
523:
524: public static Object normalizeUnicode(Object arg, String form) {
525: String str = coerceToString(arg, "normalize-unicode", 1, "");
526: form = form.trim().toUpperCase();
527: if ("".equals(form))
528: return str;
529: /* #ifdef use:java.text.Normalizer */
530: // Normalizer.Form nform;
531: // if ("NFC".equals(form))
532: // nform = Normalizer.Form.NFC;
533: // else if ("NFD".equals(form))
534: // nform = Normalizer.Form.NFD;
535: // else if ("NFKC".equals(form))
536: // nform = Normalizer.Form.NFKC;
537: // else if ("NFKD".equals(form))
538: // nform = Normalizer.Form.NFKD;
539: // else
540: // throw new RuntimeException("normalize-unicode: unknown normalization form '"+form+'\'');
541: // return Normalizer.normalize(str, nform);
542: /* #else */
543: throw AbstractSequence
544: .unsupportedException("normalize-unicode form " + form);
545: /* #endif */
546: }
547: }
|