001: package net.sf.saxon.query;
002:
003: import net.sf.saxon.Err;
004: import net.sf.saxon.expr.StaticProperty;
005: import net.sf.saxon.om.*;
006: import net.sf.saxon.pattern.CombinedNodeTest;
007: import net.sf.saxon.pattern.ContentTypeTest;
008: import net.sf.saxon.pattern.NodeTest;
009: import net.sf.saxon.trans.StaticError;
010: import net.sf.saxon.type.AnyItemType;
011: import net.sf.saxon.type.AtomicType;
012: import net.sf.saxon.type.ItemType;
013: import net.sf.saxon.type.SchemaType;
014: import net.sf.saxon.value.SequenceType;
015:
016: import javax.xml.transform.stream.StreamSource;
017: import java.io.*;
018: import java.util.Iterator;
019:
020: /**
021: * This class contains static methods used to read a query as a byte stream, infer the encoding if
022: * necessary, and return the text of the query as a string; also methods to import functions and variables
023: * from one module into another, and check their consistency.
024: */
025: public class QueryReader {
026:
027: /**
028: * The class is never instantiated
029: */
030: private QueryReader() {
031: }
032:
033: /**
034: * Read a query module given a StreamSource
035: * @param ss the supplied StreamSource. This must contain a non-null systemID which defines the base
036: * URI of the query module, and either an InputStream or a Reader containing the query text. In the
037: * case of an InputStream the method attempts to infer the encoding; in the case of a Reader, this has
038: * already been done, and the encoding specified within the query itself is ignored.
039: */
040:
041: public static String readSourceQuery(StreamSource ss,
042: NameChecker nameChecker) throws StaticError {
043: CharSequence queryText;
044: if (ss.getInputStream() != null) {
045: InputStream is = ss.getInputStream();
046: if (!is.markSupported()) {
047: is = new BufferedInputStream(is);
048: }
049: String encoding = readEncoding(is);
050: queryText = readInputStream(is, encoding, nameChecker);
051: } else if (ss.getReader() != null) {
052: queryText = readQueryFromReader(ss.getReader(), nameChecker);
053: } else {
054: throw new StaticError(
055: "Module URI Resolver must supply either an InputSource or a Reader");
056: }
057: return queryText.toString();
058: }
059:
060: /**
061: * Read an input stream non-destructively to determine the encoding from the Query Prolog
062: * @param is the input stream: this must satisfy the precondition is.markSupported() = true.
063: * @return the encoding to be used: defaults to UTF-8 if no encoding was specified explicitly
064: * in the query prolog
065: * @throws StaticError if the input stream cannot be read
066: */
067:
068: public static String readEncoding(InputStream is)
069: throws StaticError {
070: try {
071: if (!is.markSupported()) {
072: throw new IllegalArgumentException(
073: "InputStream must have markSupported() = true");
074: }
075: is.mark(100);
076: byte[] start = new byte[100];
077: int read = is.read(start, 0, 100);
078: is.reset();
079: return inferEncoding(start, read);
080: } catch (IOException e) {
081: StaticError se = new StaticError(
082: "Failed to read query source file", e);
083: throw se;
084: }
085: }
086:
087: /**
088: * Read a query from an InputStream. The method checks that all characters are valid XML
089: * characters, and also performs normalization of line endings.
090: * @param is the input stream
091: * @param encoding the encoding, or null if the encoding is unknown
092: * @param nameChecker the nameChecker to be used for checking characters
093: * @return the content of the InputStream as a string
094: */
095:
096: public static String readInputStream(InputStream is,
097: String encoding, NameChecker nameChecker)
098: throws StaticError {
099: if (encoding == null) {
100: if (!is.markSupported()) {
101: is = new BufferedInputStream(is);
102: }
103: encoding = readEncoding(is);
104: }
105: try {
106: Reader reader = new BufferedReader(new InputStreamReader(
107: is, encoding));
108: return readQueryFromReader(reader, nameChecker);
109: } catch (UnsupportedEncodingException encErr) {
110: StaticError e = new StaticError("Unknown encoding "
111: + Err.wrap(encoding), encErr);
112: throw e;
113: }
114: }
115:
116: /**
117: * Read a query from a Reader. The method checks that all characters are valid XML
118: * characters.
119: * @param reader The Reader supplying the input
120: * @param nameChecker the NameChecker to be used
121: * @return the text of the query module, as a string
122: * @throws StaticError if the file cannot be read or contains illegal characters
123: */
124:
125: private static String readQueryFromReader(Reader reader,
126: NameChecker nameChecker) throws StaticError {
127: try {
128: FastStringBuffer sb = new FastStringBuffer(2048);
129: char[] buffer = new char[2048];
130: boolean first = true;
131: int actual;
132: int line = 1; // track line/column position for reporting bad characters
133: int column = 1;
134: while (true) {
135: actual = reader.read(buffer, 0, 2048);
136: if (actual < 0) {
137: break;
138: }
139: for (int c = 0; c < actual;) {
140: int ch32 = buffer[c++];
141: if (ch32 == '\n') {
142: line++;
143: column = 0;
144: }
145: column++;
146: if (XMLChar.isHighSurrogate(ch32)) {
147: char low = buffer[c++];
148: ch32 = XMLChar.supplemental((char) ch32, low);
149: }
150: if (!nameChecker.isValidChar(ch32)) {
151: StaticError err = new StaticError(
152: "The query file contains a character illegal in XML "
153: + nameChecker.getXMLVersion()
154: + " (line=" + line + " column="
155: + column + " value=x"
156: + Integer.toHexString(ch32)
157: + ')');
158: err.setErrorCode("XPST0003");
159: throw err;
160: }
161: }
162: if (first) {
163: first = false;
164: if (buffer[0] == '\ufeff') {
165: sb.append(buffer, 1, actual - 1);
166: } else {
167: sb.append(buffer, 0, actual);
168: }
169: } else {
170: sb.append(buffer, 0, actual);
171: }
172: }
173: return sb.condense().toString();
174: } catch (IOException ioErr) {
175: StaticError e = new StaticError(
176: "Failed to read input file", ioErr);
177: throw e;
178: }
179: }
180:
181: /**
182: * Attempt to infer the encoding of a file by reading its byte order mark and if necessary
183: * the encoding declaration in the query prolog
184: * @param start the bytes appearing at the start of the file
185: * @param read the number of bytes supplied
186: * @return the inferred encoding
187: * @throws StaticError
188: */
189:
190: private static String inferEncoding(byte[] start, int read)
191: throws StaticError {
192: // Debugging code
193: // StringBuffer sb = new StringBuffer(read*5);
194: // for (int i=0; i<read; i++) sb.append(Integer.toHexString(start[i]&255) + ", ");
195: // System.err.println(sb);
196: // End of debugging code
197:
198: if (read >= 2) {
199: if (ch(start[0]) == 0xFE && ch(start[1]) == 0xFF) {
200: return "UTF-16";
201: } else if (ch(start[0]) == 0xFF && ch(start[1]) == 0xFE) {
202: return "UTF-16LE";
203: }
204: }
205: if (read >= 3) {
206: if (ch(start[0]) == 0xEF && ch(start[1]) == 0xBB
207: && ch(start[2]) == 0xBF) {
208: return "UTF-8";
209: }
210: }
211:
212: // Try to handle a UTF-16 file with no BOM
213: if (read >= 8 && start[0] == 0 && start[2] == 0
214: && start[4] == 0 && start[6] == 0) {
215: return "UTF-16";
216: }
217: if (read >= 8 && start[1] == 0 && start[3] == 0
218: && start[5] == 0 && start[7] == 0) {
219: return "UTF-16LE";
220: }
221:
222: // In all other cases, we assume an encoding that has ISO646 as a subset
223:
224: // Note, we don't care about syntax errors here: they'll be reported later. We just need to
225: // establish the encoding.
226: int i = 0;
227: String tok = readToken(start, i, read);
228: if (tok.trim().equals("xquery")) {
229: i += tok.length();
230: } else {
231: return "UTF-8";
232: }
233: tok = readToken(start, i, read);
234: if (tok.trim().equals("version")) {
235: i += tok.length();
236: } else {
237: return "UTF-8";
238: }
239: tok = readToken(start, i, read);
240: if (tok == null) {
241: return "UTF-8";
242: }
243: i += tok.length();
244: tok = readToken(start, i, read);
245: if (tok.trim().equals("encoding")) {
246: i += tok.length();
247: } else {
248: return "UTF-8";
249: }
250: tok = readToken(start, i, read).trim();
251: if (tok.startsWith("\"") && tok.endsWith("\"")
252: && tok.length() > 2) {
253: return tok.substring(1, tok.length() - 1);
254: } else if (tok.startsWith("'") && tok.endsWith("'")
255: && tok.length() > 2) {
256: return tok.substring(1, tok.length() - 1);
257: } else {
258: throw new StaticError("Unrecognized encoding "
259: + Err.wrap(tok) + " in query prolog");
260: }
261:
262: }
263:
264: /**
265: * Simple tokenizer for use when reading the encoding declaration in the query prolog. A token
266: * is a sequence of characters delimited either by whitespace, or by single or double quotes; the
267: * quotes if present are returned as part of the token.
268: * @param in the character buffer
269: * @param i offset where to start reading
270: * @param len the length of buffer
271: * @return the next token
272: */
273:
274: private static String readToken(byte[] in, int i, int len) {
275: int p = i;
276: while (p < len && " \n\r\t".indexOf(ch(in[p])) >= 0) {
277: p++;
278: }
279: if (ch(in[p]) == '"') {
280: p++;
281: while (p < len && ch(in[p]) != '"') {
282: p++;
283: }
284: } else if (ch(in[p]) == '\'') {
285: p++;
286: while (p < len && ch(in[p]) != '\'') {
287: p++;
288: }
289: } else {
290: while (p < len && " \n\r\t".indexOf(ch(in[p])) < 0) {
291: p++;
292: }
293: }
294: if (p >= len) {
295: return new String(in, i, len - i);
296: }
297: FastStringBuffer sb = new FastStringBuffer(p - i + 1);
298: for (int c = i; c <= p; c++) {
299: sb.append((char) ch(in[c]));
300: }
301: return sb.toString();
302: }
303:
304: /**
305: * Convert a byte containing an ASCII character to that character
306: * @param b the input byte
307: * @return the ASCII character
308: */
309:
310: private static int ch(byte b) {
311: return ((int) b) & 0xff;
312: }
313:
314: /**
315: * Check the types used in the functions and variables in an imported module
316: * @param importedModule the imported module
317: * @param thisModule thus module
318: * @throws net.sf.saxon.trans.StaticError
319: */
320:
321: public static void importModuleContents(
322: StaticQueryContext importedModule,
323: StaticQueryContext this Module) throws StaticError {
324: short ns = importedModule.getModuleNamespaceCode();
325: NamePool pool = this Module.getNamePool();
326: // Note: see W3C Public Bugzilla 2546, which proposes changing the rule so that types must be declared
327: // in the importing module only for those variables and functions actually used by the importing module.
328: Iterator it = importedModule.getLocalFunctionLibrary()
329: .getFunctionDefinitions();
330: while (it.hasNext()) {
331: XQueryFunction def = (XQueryFunction) it.next();
332: // don't import functions transitively
333: if (pool.getURICode(def.getFunctionFingerprint()) == ns
334: && def.getSystemId() == importedModule
335: .getLocationURI()) {
336: //thisModule.declareFunction(def);
337: checkImportedType(this Module, def.getResultType(), def);
338: for (int i = 0; i < def.getNumberOfArguments(); i++) {
339: SequenceType argType = def.getArgumentTypes()[i];
340: checkImportedType(this Module, argType, def);
341: }
342: }
343: }
344: it = importedModule.getModuleVariables();
345: while (it.hasNext()) {
346: GlobalVariableDefinition def = (GlobalVariableDefinition) it
347: .next();
348: // don't import variables transitively
349: if (!(def instanceof UndeclaredVariable)
350: && pool.getURICode(def.getNameCode()) == ns
351: && !def.getSystemId().equals(
352: this Module.getLocationURI())) {
353: //thisModule.declareVariable(def);
354: checkImportedType(this Module, def.getRequiredType(),
355: def);
356: }
357: }
358: }
359:
360: /**
361: * Check that a SequenceType used in the definition of an imported variable or function
362: * is available in the importing module
363: */
364:
365: private static void checkImportedType(StaticQueryContext env,
366: SequenceType importedType, Declaration declaration)
367: throws StaticError {
368: ItemType type = importedType.getPrimaryType();
369: if (type instanceof AnyItemType) {
370: return;
371: }
372: if (type instanceof AtomicType) {
373: int f = ((AtomicType) type).getFingerprint();
374: checkSchemaNamespaceImported(env, f, declaration);
375: } else if (type instanceof ContentTypeTest) {
376: SchemaType annotation = ((ContentTypeTest) type)
377: .getSchemaType();
378: int f = annotation.getFingerprint();
379: checkSchemaNamespaceImported(env, f, declaration);
380: } else if (type instanceof CombinedNodeTest) {
381: NodeTest[] tests = ((CombinedNodeTest) type)
382: .getComponentNodeTests();
383: for (int i = 0; i < tests.length; i++) {
384: SequenceType st = SequenceType.makeSequenceType(
385: tests[1], StaticProperty.EXACTLY_ONE);
386: checkImportedType(env, st, declaration);
387: }
388: }
389: }
390:
391: /**
392: * Check that the namespace of a given name is the namespace of an imported schema
393: */
394:
395: static void checkSchemaNamespaceImported(StaticQueryContext env,
396: int fingerprint, Declaration declaration)
397: throws StaticError {
398: String uri = env.getNamePool().getURI(fingerprint);
399: if (uri.equals(NamespaceConstant.SCHEMA)) {
400: return;
401: } else if (NamespaceConstant.isXDTNamespace(uri)) {
402: return;
403: }
404: if (env.isImportedSchema(uri)) {
405: return; // schema namespace is imported in this module
406: } else {
407: String msg = "Schema component "
408: + env.getNamePool().getDisplayName(fingerprint)
409: + " used in ";
410: if (declaration instanceof GlobalVariableDefinition) {
411: msg += "declaration of imported variable "
412: + env
413: .getNamePool()
414: .getDisplayName(
415: ((GlobalVariableDefinition) declaration)
416: .getNameCode());
417: } else {
418: msg += "signature of imported function "
419: + env.getNamePool().getDisplayName(
420: ((XQueryFunction) declaration)
421: .getNameCode());
422: }
423: msg += " is not declared in any schema imported by ";
424: String module = env.getModuleNamespace();
425: if (module == null) {
426: msg += "the main query module";
427: } else {
428: msg += "query module " + module;
429: }
430: StaticError err = new StaticError(msg);
431: err.setErrorCode("XQST0036");
432: err.setLocator(declaration);
433: throw err;
434: }
435: }
436: }
437:
438: //
439: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
440: // you may not use this file except in compliance with the License. You may obtain a copy of the
441: // License at http://www.mozilla.org/MPL/
442: //
443: // Software distributed under the License is distributed on an "AS IS" basis,
444: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
445: // See the License for the specific language governing rights and limitations under the License.
446: //
447: // The Original Code is: all this file.
448: //
449: // The Initial Developer of the Original Code is Michael H. Kay.
450: //
451: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
452: //
453: // Contributor(s): none.
454: //
//