001: package net.sf.saxon.functions;
002:
003: import net.sf.saxon.Err;
004: import net.sf.saxon.expr.Expression;
005: import net.sf.saxon.expr.StaticContext;
006: import net.sf.saxon.expr.XPathContext;
007: import net.sf.saxon.om.Item;
008: import net.sf.saxon.om.FastStringBuffer;
009: import net.sf.saxon.om.XMLChar;
010: import net.sf.saxon.om.NameChecker;
011: import net.sf.saxon.trans.DynamicError;
012: import net.sf.saxon.trans.XPathException;
013: import net.sf.saxon.value.StringValue;
014: import net.sf.saxon.value.BooleanValue;
015:
016: import java.io.*;
017: import java.net.MalformedURLException;
018: import java.net.URL;
019: import java.net.URLConnection;
020:
021: public class UnparsedText extends SystemFunction implements
022: XSLTFunction {
023:
024: // TODO: Add some kind of uri resolver mechanism
025:
026: String expressionBaseURI = null;
027:
028: public static final int UNPARSED_TEXT = 0;
029: public static final int UNPARSED_TEXT_AVAILABLE = 1;
030:
031: public void checkArguments(StaticContext env) throws XPathException {
032: if (expressionBaseURI == null) {
033: super .checkArguments(env);
034: expressionBaseURI = env.getBaseURI();
035: }
036: }
037:
038: /**
039: * preEvaluate: this method suppresses compile-time evaluation by doing nothing
040: */
041:
042: public Expression preEvaluate(StaticContext env) {
043: return this ;
044: // in principle we could pre-evaluate any call of unparsed-text() with
045: // constant arguments. But we don't, because the file contents might
046: // change before the stylesheet executes.
047: }
048:
049: /**
050: * evaluateItem() handles evaluation of the function:
051: * it returns a String
052: */
053:
054: public Item evaluateItem(XPathContext context)
055: throws XPathException {
056: StringValue result;
057: try {
058: StringValue hrefVal = (StringValue) argument[0]
059: .evaluateItem(context);
060: if (hrefVal == null) {
061: return null;
062: }
063: String href = hrefVal.getStringValue();
064:
065: String encoding = null;
066: if (getNumberOfArguments() == 2) {
067: encoding = argument[1].evaluateItem(context)
068: .getStringValue();
069: }
070:
071: result = new StringValue(readFile(href, expressionBaseURI,
072: encoding, context.getConfiguration()
073: .getNameChecker()));
074: } catch (XPathException err) {
075: if (operation == UNPARSED_TEXT_AVAILABLE) {
076: return BooleanValue.FALSE;
077: } else {
078: throw err;
079: }
080: }
081: if (operation == UNPARSED_TEXT_AVAILABLE) {
082: return BooleanValue.TRUE;
083: } else {
084: return result;
085: }
086: }
087:
088: /**
089: * Supporting routine to load one external file given a URI (href) and a baseURI
090: */
091:
092: private CharSequence readFile(String href, String baseURI,
093: String encoding, NameChecker checker) throws XPathException {
094:
095: // Resolve relative URI
096:
097: URL absoluteURL;
098: if (baseURI == null) { // no base URI available
099: try {
100: // the href might be an absolute URL
101: absoluteURL = new URL(href);
102: } catch (MalformedURLException err) {
103: // it isn't
104: DynamicError e = new DynamicError(
105: "Cannot resolve absolute URI", err);
106: e.setErrorCode("XTDE1170");
107: throw e;
108: }
109: } else {
110: try {
111: absoluteURL = new URL(new URL(baseURI), href);
112: } catch (MalformedURLException err) {
113: DynamicError e = new DynamicError(
114: "Cannot resolve relative URI", err);
115: e.setErrorCode("XTDE1170");
116: throw e;
117: }
118: }
119: try {
120: InputStream is;
121: if (encoding != null) {
122: is = absoluteURL.openStream();
123: } else {
124: URLConnection connection = absoluteURL.openConnection();
125: connection.connect();
126: is = connection.getInputStream();
127:
128: try {
129:
130: if (!is.markSupported()) {
131: is = new BufferedInputStream(is);
132: }
133:
134: // Get any external (HTTP) encoding label.
135: String contentType;
136:
137: // The file:// URL scheme gives no useful information...
138: if (!"file".equals(connection.getURL()
139: .getProtocol())) {
140:
141: // Use the contentType from the HTTP header if available
142: contentType = connection.getContentType();
143:
144: if (contentType != null) {
145: int pos = contentType.indexOf("charset");
146: if (pos >= 0) {
147: pos = contentType.indexOf('=', pos + 7);
148: if (pos >= 0) {
149: contentType = contentType
150: .substring(pos + 1);
151: }
152: if ((pos = contentType.indexOf(';')) > 0) {
153: contentType = contentType
154: .substring(0, pos);
155: }
156:
157: // attributes can have comment fields (RFC 822)
158: if ((pos = contentType.indexOf('(')) > 0) {
159: contentType = contentType
160: .substring(0, pos);
161: }
162: // ... and values may be quoted
163: if ((pos = contentType.indexOf('"')) > 0) {
164: contentType = contentType
165: .substring(
166: pos + 1,
167: contentType
168: .indexOf(
169: '"',
170: pos + 2));
171: }
172: encoding = contentType.trim();
173: }
174: }
175: }
176:
177: if (encoding == null) {
178: // Try to detect the encoding from the start of the content
179: is.mark(100);
180: byte[] start = new byte[100];
181: int read = is.read(start, 0, 100);
182: is.reset();
183: encoding = inferEncoding(start, read);
184: }
185:
186: } catch (IOException e) {
187: encoding = "UTF-8";
188: }
189:
190: }
191: Reader reader = new BufferedReader(new InputStreamReader(
192: is, encoding));
193: FastStringBuffer sb = new FastStringBuffer(2048);
194: char[] buffer = new char[2048];
195: boolean first = true;
196: int actual;
197: int line = 1;
198: int column = 1;
199: while (true) {
200: actual = reader.read(buffer, 0, 2048);
201: if (actual < 0) {
202: break;
203: }
204: for (int c = 0; c < actual;) {
205: int ch32 = buffer[c++];
206: if (ch32 == '\n') {
207: line++;
208: column = 0;
209: }
210: column++;
211: if (XMLChar.isHighSurrogate(ch32)) {
212: char low = buffer[c++];
213: ch32 = XMLChar.supplemental((char) ch32, low);
214: }
215: if (!checker.isValidChar(ch32)) {
216: DynamicError err = new DynamicError(
217: "The unparsed-text file contains a character illegal in XML (line="
218: + line + " column=" + column
219: + " value=hex "
220: + Integer.toHexString(ch32)
221: + ')');
222: err.setErrorCode("XTDE1190");
223: throw err;
224: }
225: }
226: if (first) {
227: first = false;
228: if (buffer[0] == '\ufeff') {
229: // don't include the BOM in the result
230: sb.append(buffer, 1, actual - 1);
231: } else {
232: sb.append(buffer, 0, actual);
233: }
234: } else {
235: sb.append(buffer, 0, actual);
236: }
237: }
238: reader.close();
239: return sb.condense();
240: } catch (java.io.UnsupportedEncodingException encErr) {
241: DynamicError e = new DynamicError("Unknown encoding "
242: + Err.wrap(encoding), encErr);
243: e.setErrorCode("XTDE1190");
244: throw e;
245: } catch (java.io.IOException ioErr) {
246: DynamicError e = new DynamicError(
247: "Failed to read input file", ioErr);
248: e.setErrorCode("XTDE1170");
249: e.setLocator(this );
250: throw e;
251: }
252: }
253:
254: private String inferEncoding(byte[] start, int read) {
255: if (read >= 2) {
256: if (ch(start[0]) == 0xFE && ch(start[1]) == 0xFF) {
257: return "UTF-16";
258: } else if (ch(start[0]) == 0xFF && ch(start[1]) == 0xFE) {
259: return "UTF-16LE";
260: }
261: }
262: if (read >= 3) {
263: if (ch(start[0]) == 0xEF && ch(start[1]) == 0xBB
264: && ch(start[2]) == 0xBF) {
265: return "UTF-8";
266: }
267: }
268: if (read >= 4) {
269: if (ch(start[0]) == '<' && ch(start[1]) == '?'
270: && ch(start[2]) == 'x' && ch(start[3]) == 'm'
271: && ch(start[4]) == 'l') {
272: FastStringBuffer sb = new FastStringBuffer(read);
273: for (int b = 0; b < read; b++) {
274: sb.append((char) start[b]);
275: }
276: String p = sb.toString();
277: int v = p.indexOf("encoding");
278: if (v >= 0) {
279: v += 8;
280: while (v < p.length()
281: && " \n\r\t=\"'".indexOf(p.charAt(v)) >= 0) {
282: v++;
283: }
284: sb.setLength(0);
285: while (v < p.length() && p.charAt(v) != '"'
286: && p.charAt(v) != '\'') {
287: sb.append(p.charAt(v++));
288: }
289: return sb.toString();
290: }
291: }
292: } else if (read > 0 && start[0] == 0 && start[2] == 0
293: && start[4] == 0 && start[6] == 0) {
294: return "UTF-16";
295: } else if (read > 1 && start[1] == 0 && start[3] == 0
296: && start[5] == 0 && start[7] == 0) {
297: return "UTF-16LE";
298: }
299: // If all else fails, assume UTF-8
300: return "UTF-8";
301: }
302:
303: private int ch(byte b) {
304: return ((int) b) & 0xff;
305: }
306:
307: }
308:
309: //
310: // The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License");
311: // you may not use this file except in compliance with the License. You may obtain a copy of the
312: // License at http://www.mozilla.org/MPL/
313: //
314: // Software distributed under the License is distributed on an "AS IS" basis,
315: // WITHOUT WARRANTY OF ANY KIND, either express or implied.
316: // See the License for the specific language governing rights and limitations under the License.
317: //
318: // The Original Code is: all this file.
319: //
320: // The Initial Developer of the Original Code is Michael H. Kay. The detectEncoding() method includes
321: // code fragments taken from the AElfred XML Parser developed by David Megginson.
322: //
323: // Portions created by (your name) are Copyright (C) (your legal entity). All Rights Reserved.
324: //
325: // Contributor(s): none.
326: //
|