001: /*
002: * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
003: * PROPRIETARY/CONFIDENTIAL. Use of this product is subject to license terms.
004: */
005:
006: package com.sun.portal.search.soif;
007:
008: import java.util.*;
009: import java.io.*;
010:
011: /**
012: * SOIFInputStream class. Reads serialized SOIF objects from an input stream.
013: */
014: public class SOIFInputStream {
015: /**
016: * Raw SOIF string.
017: */
018: private DataInputStream sis;
019:
020: /**
021: * End of stream reached
022: */
023: private boolean EOS;
024:
025: /**
026: * Encoding for conversion to/from byte stream
027: */
028: private String encoding = SOIF.defaultEncoding;
029: private int enc_ = SOIF.ENC_UTF8;
030:
031: /**
032: * True if input is ISO8859-n (n != 1), requiring a charset conversion.
033: * This is inefficient and is only being done for bwd compatibility.
034: */
035: private boolean convert8bit = false;
036:
037: /**
038: * Allowed attributes - all others are ignored
039: */
040: Set allowed; // XXX Replace with schema enforcement at SOIF level?
041:
042: /**
043: * debug
044: */
045: public final boolean debug = false;
046:
047: //public final boolean debug = true;
048:
049: /**
050: * Creates a SOIFInputStream reading from the named file.
051: * @param filename the SOIF source file
052: */
053: public SOIFInputStream(String filename)
054: throws FileNotFoundException {
055: this (new DataInputStream(new BufferedInputStream(
056: new FileInputStream(filename))));
057: }
058:
059: /**
060: * Creates a SOIFInputStream reading from the named file.
061: * @param filename the SOIF source file
062: * @param encoding the character encoding of the input SOIF
063: */
064: public SOIFInputStream(String filename, String encoding)
065: throws FileNotFoundException, UnsupportedEncodingException {
066: this (new DataInputStream(new BufferedInputStream(
067: new FileInputStream(filename))), encoding);
068: }
069:
070: /**
071: * Creates a SOIFInputStream reading from a byte array.
072: * @param data the SOIF source byte array
073: * @param encoding the character encoding of the input SOIF
074: */
075: public SOIFInputStream(byte[] data, String encoding)
076: throws UnsupportedEncodingException {
077: this (new DataInputStream(new ByteArrayInputStream(data)));
078: }
079:
080: /**
081: * Creates a SOIFInputStream reading from a byte array.
082: * @param data the SOIF source byte array
083: */
084: public SOIFInputStream(byte[] data) {
085: this (new DataInputStream(new ByteArrayInputStream(data)));
086: }
087:
088: /**
089: * Creates a SOIFInputStream reading from a SOIFBuffer.
090: * @param data the SOIF source SOIFBuffer
091: */
092: public SOIFInputStream(SOIFBuffer sb) {
093: this (sb.toByteArray());
094: }
095:
096: /**
097: * Creates a SOIFInputStream reading from an InputStream.
098: * @param filename the SOIF source InoutStream
099: */
100: public SOIFInputStream(InputStream is) {
101: if (!(is instanceof DataInputStream))
102: sis = new DataInputStream(is);
103: else
104: sis = (DataInputStream) is;
105: }
106:
107: /**
108: * Creates a SOIFInputStream reading from an InputStream.
109: * @param filename the SOIF source InoutStream
110: * @param encoding the character encoding of the input SOIF
111: */
112: public SOIFInputStream(InputStream is, String encoding)
113: throws UnsupportedEncodingException {
114: this (is);
115: if (encoding.equalsIgnoreCase("UTF-8"))
116: enc_ = SOIF.ENC_UTF8;
117: else if (encoding.equalsIgnoreCase("UTF-16")) {
118: encoding = "UTF-16BE";
119: enc_ = SOIF.ENC_UTF16;
120: } else {
121: // We can support any encoding which doesn't
122: // use @, {, }, <space> and \n in escaped/encoded chars.
123: // This includes all unescaped, unshifted 8-bit encodings,
124: // all ISO8859-N, and some others.
125: // (too difficult/inefficient/unnecessary to support all
126: // encodings at this point)
127: enc_ = SOIF.ENC_8BIT;
128: if (!encoding.equalsIgnoreCase("ISO8859-1"))
129: convert8bit = true;
130: }
131: //else
132: //throw new UnsupportedEncodingException(encoding);
133: this .encoding = encoding;
134: }
135:
136: /**
137: * Sets a SOIF attribute filter. Only attributes in the allowed set
138: * will be read from the input stream.
139: * @param attrs the set of allowed attributes (use lower case)
140: */
141: public void setAllowed(Set attrs) {
142: allowed = attrs;
143: }
144:
145: /**
146: * Char encoder for use with SOIF.
147: * Similar to default Java UTF-8 encoder but altered to
148: * allow all characters so to handle non-UTF8 data in non-strict mode.
149: * This only works for ascii-based charsets, eg, ISO8859-*, UTF-8, UTF-16
150: */
151: char getChar(InputStream is) throws Exception, IOException {
152: int c, char2, char3;
153: boolean strict = false;
154: if (enc_ == SOIF.ENC_UTF8) {
155: /*
156: * This is much faster than Java utf-8 String conversion.
157: * We can also control the handling of illegal utf-8 sequences.
158: */
159: if ((c = is.read()) == -1)
160: throw new EOFException();
161: c &= 0xff;
162: switch (c >> 4) {
163: case 0:
164: case 1:
165: case 2:
166: case 3:
167: case 4:
168: case 5:
169: case 6:
170: case 7:
171: /* 0xxxxxxx*/
172: return (char) c;
173: case 12:
174: case 13:
175: /* 110x xxxx 10xx xxxx*/
176: if (!strict)
177: is.mark(8);
178: if ((char2 = is.read()) == -1)
179: if (strict)
180: throw new EOFException();
181: else
182: return (char) c; // XXX return '?'...
183: if ((char2 & 0xC0) != 0x80) {
184: if (strict)
185: throw new EOFException();
186: else {
187: is.reset();
188: return (char) c;
189: }
190: }
191: return (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
192: case 14:
193: // 1110 xxxx 10xx xxxx 10xx xxxx
194: is.mark(8);
195: if ((char2 = is.read()) == -1) {
196: if (strict)
197: throw new EOFException();
198: else
199: return (char) c;
200: }
201: if ((char3 = is.read()) == -1) {
202: if (strict)
203: throw new EOFException();
204: else {
205: is.reset();
206: return (char) c;
207: }
208: }
209: if (((char2 & 0xC0) != 0x80)
210: || ((char3 & 0xC0) != 0x80)) {
211: if (strict)
212: throw new Exception("UTF-8 Data Format Error");
213: else {
214: is.reset();
215: return (char) c;
216: }
217: }
218: return (char) (((c & 0x0F) << 12)
219: | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0));
220: default:
221: // 10xx xxxx, 1111 xxxx
222: // UTF format error
223: if (strict)
224: throw new Exception("UTF-8 Data Format Error");
225: else {
226: return (char) c;
227: }
228: }
229: } else if (enc_ == SOIF.ENC_UTF16) {
230: if ((c = is.read()) == -1)
231: throw new EOFException();
232: if ((char2 = is.read()) == -1)
233: throw new EOFException();
234: return (char) (c << 8 | char2 & 0xff);
235: } else {
236: // assume 8 bit - non ISO8859-1 will require further conversion (convert8bit)
237: if ((c = is.read()) == -1)
238: throw new EOFException();
239: return (char) c;
240: }
241: }
242:
243: /** This is a bit closer to the mark than StreamTokenizer */
244: char getToken(String skip, String stop, String ok, String bad,
245: StringBuffer sb) throws Exception {
246: char c = getChar(sis);
247: if (skip != null) {
248: while (skip.indexOf(c) != -1) {
249: c = getChar(sis);
250: }
251: }
252: while (true) {
253: if (stop != null && stop.indexOf(c) != -1)
254: break;
255: if (ok != null && ok.indexOf(c) == -1)
256: throw new Exception();
257: if (bad != null && bad.indexOf(c) != -1)
258: throw new Exception();
259: sb.append(c);
260: c = getChar(sis);
261: }
262: if (convert8bit) {
263: try {
264: // convert ISO8859-1 to ISO8859-n
265: byte[] b = sb.toString().getBytes();
266: sb.setLength(0);
267: sb.append(new String(b, encoding));
268: } catch (UnsupportedEncodingException e) {
269: throw new SOIFException("Unsupported encoding: "
270: + e.getMessage());
271: }
272: }
273: return c;
274: }
275:
276: /**
277: * Reads a single SOIF from the underlying input stream.
278: * @return the next SOIF object from the input stream
279: * @return null when end of stream is reached for the first time
280: * @throws SOIFException if error in SOIF
281: * @throws IOException if an I/O error occurs
282: * @throws EOFException if read past end of stream
283: */
284: public SOIF readSOIF() throws IOException {
285: String attrname;
286: StringBuffer sb = new StringBuffer(64);
287: SOIF nSOIF = new SOIF();
288: nSOIF.encoding = encoding;
289: char c, c1;
290: int i;
291: //debug = true;
292:
293: if (EOS)
294: throw new EOFException();
295:
296: if (debug)
297: System.out.println("\nSOIFInputStream...");
298:
299: try {
300:
301: // "@SCHEMA { url\n" (skip over leading junk)
302: c = '\n';
303: do {
304: c1 = c;
305: c = getChar(sis);
306: } while (!(c == '@' && (c1 == '\n' || c1 == '\r')));
307:
308: c = getToken(null, " ", null, "\r\n", sb);
309: nSOIF.setSchemaName(new String(sb));
310:
311: if (debug)
312: System.out.println("schemaName = ["
313: + nSOIF.getSchemaName() + "]");
314:
315: // " { url\n"
316: c = getToken(" ", "{", " ", "\r\n", sb);
317:
318: sb.setLength(0);
319: c = getToken(" ", "\r\n", null, null, sb);
320: nSOIF.setURL(new String(sb));
321:
322: if (debug)
323: System.out.println("URL = [" + nSOIF.getURL() + "]");
324:
325: // parse the attributes
326: while (true) {
327:
328: // "attr{n}:\tval" OR "}"
329:
330: sb.setLength(0);
331: c = getToken("\r\n", "{}", null, "\r\n", sb);
332:
333: if (c == '}') {
334: if (debug)
335: System.out.println("End of SOIF");
336: return nSOIF;
337: }
338:
339: attrname = new String(sb);
340:
341: if (debug)
342: System.out.println(attrname);
343:
344: // "nn}:\t..."
345:
346: // this is a bit slow ...
347: //sb.setLength(0);
348: //c = getToken(null, "}", "0123456789", null, sb);
349: //int nn = Integer.parseInt(new String(sb));
350:
351: // ... try this instead
352: int nn = -1;
353: while (true) {
354: c = getChar(sis);
355: if ('0' <= c && c <= '9') {
356: if (nn == -1)
357: nn = 0;
358: nn = nn * 10 + c - '0';
359: } else
360: break;
361: }
362:
363: if (c != '}' || nn == -1)
364: throw new Exception();
365:
366: int attrlen = nn;
367: if (debug)
368: System.out.println("" + attrlen);
369:
370: // ":\t..."
371: if (getChar(sis) != ':' || getChar(sis) != '\t')
372: throw new Exception();
373:
374: /** parse attribute-NNN for multi value attributes */
375: int mvIndex = 0;
376: int p = attrname.lastIndexOf('-');
377: if (p != -1) {
378: int len = attrname.length(), index = 0;
379: for (i = p + 1; i < len; ++i) {
380: c = attrname.charAt(i);
381: if (c < '0' || c > '9')
382: break;
383: index = index * 10 + c - '0';
384: }
385: if (i == len) {
386: mvIndex = index > 0 ? (index - 1) : 0;
387: attrname = attrname.substring(0, p);
388: }
389: }
390:
391: if (allowed == null
392: || allowed.contains(attrname.toLowerCase())) {
393: byte ba[] = new byte[attrlen];
394: sis.readFully(ba);
395: if (debug)
396: System.out.println("AV Pair: " + attrname
397: + " : " + new String(ba, encoding));
398: nSOIF.insert(attrname, ba, mvIndex);
399: } else {
400: if (debug)
401: System.out
402: .println("Skipped disallowed attribute: "
403: + attrname);
404: sis.skip(nn);
405: }
406: }
407: } catch (Exception e) {
408: if (e instanceof EOFException) {
409: if (debug)
410: System.out.println("End of stream reached");
411: EOS = true;
412: sis.close();
413: return null;
414: }
415: if (debug)
416: System.out.println("Error in SOIF: " + e);
417: String msg = SOIF.INVALIDSOIF;
418: if (e.getMessage() != null)
419: msg += ": " + e.getMessage();
420: if (nSOIF != null && nSOIF.getURL() != null)
421: msg += ": " + nSOIF.getURL();
422: throw new SOIFException(msg);
423: }
424: }
425:
426: /**
427: * @return true if stream is at EOS
428: */
429: public boolean isEOS() {
430: return EOS;
431: }
432:
433: /**
434: * Closes the stream.
435: * @return false on I/O error
436: */
437: public boolean close() {
438: // XXX should be void
439: try {
440: sis.close();
441: } catch (Exception e) {
442: return false;
443: }
444: return true;
445: }
446:
447: }
|