0001: /* Copyright 2002-2005 Elliotte Rusty Harold
0002:
0003: This library is free software; you can redistribute it and/or modify
0004: it under the terms of version 2.1 of the GNU Lesser General Public
0005: License as published by the Free Software Foundation.
0006:
0007: This library is distributed in the hope that it will be useful,
0008: but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: GNU Lesser General Public License for more details.
0011:
0012: You should have received a copy of the GNU Lesser General Public
0013: License along with this library; if not, write to the
0014: Free Software Foundation, Inc., 59 Temple Place, Suite 330,
0015: Boston, MA 02111-1307 USA
0016:
0017: You can contact Elliotte Rusty Harold by sending e-mail to
0018: elharo@metalab.unc.edu. Please include the word "XOM" in the
0019: subject line. The XOM home page is located at http://www.xom.nu/
0020: */
0021:
0022: package nu.xom;
0023:
0024: import java.io.DataInputStream;
0025: import java.io.IOException;
0026: import java.io.InputStream;
0027: import java.io.Reader;
0028: import java.io.StringReader;
0029: import java.util.StringTokenizer;
0030:
0031: import org.xml.sax.EntityResolver;
0032: import org.xml.sax.InputSource;
0033: import org.xml.sax.SAXException;
0034: import org.xml.sax.XMLReader;
0035:
0036: /**
0037: * <p>
0038: * <code>Verifier</code> checks names and data for
0039: * compliance with XML 1.0 and Namespaces in XML rules.
0040: * </p>
0041: *
0042: * @author Elliotte Rusty Harold
0043: * @version 1.1b4
0044: *
0045: */
0046: final class Verifier {
0047:
// Private no-argument constructor: code outside this class
// cannot create Verifier instances.
private Verifier() {}
0050:
// Bit masks for the per-character flag bytes in the lookup table.
private final static byte XML_CHARACTER = 1;
private final static byte NAME_CHARACTER = 2;
private final static byte NAME_START_CHARACTER = 4;
private final static byte NCNAME_CHARACTER = 8;

// Flag table indexed by char value; populated by loadFlags().
private static byte[] flags = null;

static {

    // First choice: the class loader that loaded this class.
    ClassLoader loader = Verifier.class.getClassLoader();
    if (loader != null) {
        try {
            loadFlags(loader);
        } catch (RuntimeException ex) {
            // loadFlags() reports failure by throwing. Swallow that
            // here, and discard anything it may have stored, so the
            // fallback loader below actually gets a chance to run.
            flags = null;
        }
    }

    // If that didn't work, try a different ClassLoader
    if (flags == null) {
        loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            // Nothing left to try; fail with the usual diagnostic
            // instead of a bare NullPointerException.
            throw new RuntimeException("Broken XOM installation: "
                + "could not load nu/xom/characters.dat");
        }
        loadFlags(loader);
    }

}
0071:
0072: private static void loadFlags(ClassLoader loader) {
0073:
0074: DataInputStream in = null;
0075: try {
0076: InputStream raw = loader
0077: .getResourceAsStream("nu/xom/characters.dat");
0078: if (raw == null) {
0079: throw new RuntimeException("Broken XOM installation: "
0080: + "could not load nu/xom/characters.dat");
0081: }
0082: // buffer this????
0083: in = new DataInputStream(raw);
0084: flags = new byte[65536];
0085: in.readFully(flags);
0086: } catch (IOException ex) {
0087: throw new RuntimeException("Broken XOM installation: "
0088: + "could not load nu/xom/characters.dat");
0089: } finally {
0090: try {
0091: if (in != null)
0092: in.close();
0093: } catch (IOException ex) {
0094: // no big deal
0095: }
0096: }
0097:
0098: }
0099:
0100: /**
0101: * <p>
0102: * Check whether <code>name</code> is
0103: * a non-colonized name as defined in
0104: * <cite>Namespaces in XML</cite>.
0105: * </p>
0106: *
0107: * @param name <code>String</code> name to check
0108: *
0109: * @throws IllegalNameException if <code>name</code> is not a
0110: * non-colonized name
0111: */
0112: static void checkNCName(String name) {
0113:
0114: if (name == null) {
0115: throwIllegalNameException(name, "NCNames cannot be null");
0116: }
0117:
0118: int length = name.length();
0119: if (length == 0) {
0120: throwIllegalNameException(name, "NCNames cannot be empty");
0121: }
0122:
0123: char first = name.charAt(0);
0124: if ((flags[first] & NAME_START_CHARACTER) == 0) {
0125: throwIllegalNameException(name, "NCNames cannot start "
0126: + "with the character "
0127: + Integer.toHexString(first));
0128: }
0129:
0130: for (int i = 1; i < length; i++) {
0131: char c = name.charAt(i);
0132: if ((flags[c] & NCNAME_CHARACTER) == 0) {
0133: if (c == ':') {
0134: throwIllegalNameException(name,
0135: "NCNames cannot contain colons");
0136: } else {
0137: throwIllegalNameException(name, "0x"
0138: + Integer.toHexString(c)
0139: + " is not a legal NCName character");
0140: }
0141: }
0142: }
0143:
0144: }
0145:
0146: private static void throwIllegalNameException(String name,
0147: String message) {
0148: IllegalNameException ex = new IllegalNameException(message);
0149: ex.setData(name);
0150: throw ex;
0151: }
0152:
0153: private static void throwIllegalCharacterDataException(String data,
0154: String message) {
0155: IllegalDataException ex = new IllegalCharacterDataException(
0156: message);
0157: ex.setData(data);
0158: throw ex;
0159: }
0160:
0161: private static void throwMalformedURIException(String uri,
0162: String message) {
0163: MalformedURIException ex = new MalformedURIException(message);
0164: ex.setData(uri);
0165: throw ex;
0166: }
0167:
0168: /**
0169: * <p>
0170: * This methods checks whether a string contains only
0171: * characters allowed by the XML 1.0 specification.
0172: * </p>
0173: *
0174: * @param text <code>String</code> value to verify
0175: *
0176: * @throws IllegalCharacterDataException if <code>text</code> is
0177: * not legal PCDATA
0178: */
0179: static void checkPCDATA(String text) {
0180:
0181: if (text == null)
0182: throwIllegalCharacterDataException(text, "Null text");
0183:
0184: char[] data = text.toCharArray();
0185: for (int i = 0, len = data.length; i < len; i++) {
0186: int result = data[i];
0187: if (result >= 0xD800 && result <= 0xDBFF) {
0188: try {
0189: int low = data[i + 1];
0190: if (low < 0xDC00 || low > 0xDFFF) {
0191: IllegalCharacterDataException ex = new IllegalCharacterDataException(
0192: "Bad surrogate pair");
0193: ex.setData(text);
0194: throw ex;
0195: }
0196: i++; // increment past low surrogate
0197: } catch (ArrayIndexOutOfBoundsException ex) {
0198: IllegalCharacterDataException ide = new IllegalCharacterDataException(
0199: "Bad Surrogate Pair", ex);
0200: ide.setData(text);
0201: throw ide;
0202: }
0203: // all properly matched surrogate pairs are legal in PCDATA
0204: } // end if
0205: else if ((flags[result] & XML_CHARACTER) == 0) {
0206: throwIllegalCharacterDataException(text, "0x"
0207: + Integer.toHexString(result)
0208: + " is not allowed in XML content");
0209: }
0210:
0211: }
0212:
0213: }
0214:
0215: /**
0216: * <p>
0217: * Checks a string to see if it is a syntactically correct
0218: * RFC 3986 URI reference. Both absolute and relative
0219: * URIs are supported, as are URIs with fragment identifiers.
0220: * </p>
0221: *
0222: * @param uri <code>String</code> containing the potential URI
0223: *
0224: * @throws MalformedURIException if this is not a
0225: * legal URI reference
0226: */
0227: static void checkURIReference(String uri) {
0228:
0229: if ((uri == null) || uri.length() == 0)
0230: return;
0231:
0232: URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri);
0233: try {
0234: if (parsed.scheme != null)
0235: checkScheme(parsed.scheme);
0236: if (parsed.authority != null)
0237: checkAuthority(parsed.authority);
0238: checkPath(parsed.path);
0239: if (parsed.fragment != null)
0240: checkFragment(parsed.fragment);
0241: if (parsed.query != null)
0242: checkQuery(parsed.query);
0243: } catch (MalformedURIException ex) {
0244: ex.setData(uri);
0245: throw ex;
0246: }
0247:
0248: }
0249:
0250: private static void checkQuery(final String query) {
0251:
0252: int length = query.length();
0253: for (int i = 0; i < length; i++) {
0254: char c = query.charAt(i);
0255: if (c == '%') {
0256: try {
0257: if (!isHexDigit(query.charAt(i + 1))
0258: || !isHexDigit(query.charAt(i + 2))) {
0259: throwMalformedURIException(query,
0260: "Bad percent escape sequence");
0261: }
0262: } catch (StringIndexOutOfBoundsException ex) {
0263: throwMalformedURIException(query,
0264: "Bad percent escape sequence");
0265: }
0266: i += 2;
0267: } else if (!isQueryCharacter(c)) {
0268: throw new MalformedURIException(
0269: "Illegal query character " + c);
0270: }
0271: }
0272:
0273: }
0274:
0275: // same for fragment ID
0276: private static boolean isQueryCharacter(char c) {
0277:
0278: switch (c) {
0279: case '!':
0280: return true;
0281: case '"':
0282: return false;
0283: case '#':
0284: return false;
0285: case '$':
0286: return true;
0287: case '%':
0288: return false; // tested in checkQuery
0289: case '&':
0290: return true;
0291: case '\'':
0292: return true;
0293: case '(':
0294: return true;
0295: case ')':
0296: return true;
0297: case '*':
0298: return true;
0299: case '+':
0300: return true;
0301: case ',':
0302: return true;
0303: case '-':
0304: return true;
0305: case '.':
0306: return true;
0307: case '/':
0308: return true;
0309: case '0':
0310: return true;
0311: case '1':
0312: return true;
0313: case '2':
0314: return true;
0315: case '3':
0316: return true;
0317: case '4':
0318: return true;
0319: case '5':
0320: return true;
0321: case '6':
0322: return true;
0323: case '7':
0324: return true;
0325: case '8':
0326: return true;
0327: case '9':
0328: return true;
0329: case ':':
0330: return true;
0331: case ';':
0332: return true;
0333: case '<':
0334: return false;
0335: case '=':
0336: return true;
0337: case '>':
0338: return false;
0339: case '?':
0340: return true;
0341: case '@':
0342: return true;
0343: case 'A':
0344: return true;
0345: case 'B':
0346: return true;
0347: case 'C':
0348: return true;
0349: case 'D':
0350: return true;
0351: case 'E':
0352: return true;
0353: case 'F':
0354: return true;
0355: case 'G':
0356: return true;
0357: case 'H':
0358: return true;
0359: case 'I':
0360: return true;
0361: case 'J':
0362: return true;
0363: case 'K':
0364: return true;
0365: case 'L':
0366: return true;
0367: case 'M':
0368: return true;
0369: case 'N':
0370: return true;
0371: case 'O':
0372: return true;
0373: case 'P':
0374: return true;
0375: case 'Q':
0376: return true;
0377: case 'R':
0378: return true;
0379: case 'S':
0380: return true;
0381: case 'T':
0382: return true;
0383: case 'U':
0384: return true;
0385: case 'V':
0386: return true;
0387: case 'W':
0388: return true;
0389: case 'X':
0390: return true;
0391: case 'Y':
0392: return true;
0393: case 'Z':
0394: return true;
0395: case '[':
0396: return false;
0397: case '\\':
0398: return false;
0399: case ']':
0400: return false;
0401: case '^':
0402: return false;
0403: case '_':
0404: return true;
0405: case '`':
0406: return false;
0407: case 'a':
0408: return true;
0409: case 'b':
0410: return true;
0411: case 'c':
0412: return true;
0413: case 'd':
0414: return true;
0415: case 'e':
0416: return true;
0417: case 'f':
0418: return true;
0419: case 'g':
0420: return true;
0421: case 'h':
0422: return true;
0423: case 'i':
0424: return true;
0425: case 'j':
0426: return true;
0427: case 'k':
0428: return true;
0429: case 'l':
0430: return true;
0431: case 'm':
0432: return true;
0433: case 'n':
0434: return true;
0435: case 'o':
0436: return true;
0437: case 'p':
0438: return true;
0439: case 'q':
0440: return true;
0441: case 'r':
0442: return true;
0443: case 's':
0444: return true;
0445: case 't':
0446: return true;
0447: case 'u':
0448: return true;
0449: case 'v':
0450: return true;
0451: case 'w':
0452: return true;
0453: case 'x':
0454: return true;
0455: case 'y':
0456: return true;
0457: case 'z':
0458: return true;
0459: case '{':
0460: return false;
0461: case '|':
0462: return false;
0463: case '}':
0464: return false;
0465: case '~':
0466: return true;
0467: }
0468: return false;
0469:
0470: }
0471:
0472: private static void checkFragment(String fragment) {
0473: // The BNF for fragments is the same as for query strings
0474: checkQuery(fragment);
0475: }
0476:
0477: // Besides the legal characters issues, a path must
0478: // not contain two consecutive forward slashes
0479: private static void checkPath(final String path) {
0480:
0481: int length = path.length();
0482: char[] text = path.toCharArray();
0483: for (int i = 0; i < length; i++) {
0484: char c = text[i];
0485: if (c == '/') {
0486: if (i < length - 1) {
0487: if (text[i + 1] == '/') {
0488: throwMalformedURIException(path,
0489: "Double slash (//) in path");
0490: }
0491: }
0492: } else if (c == '%') {
0493: try {
0494: if (!isHexDigit(text[i + 1])
0495: || !isHexDigit(text[i + 2])) {
0496: throwMalformedURIException(path,
0497: "Bad percent escape sequence");
0498: }
0499: } catch (ArrayIndexOutOfBoundsException ex) {
0500: throwMalformedURIException(path,
0501: "Bad percent escape sequence");
0502: }
0503: i += 2;
0504: } else if (!isPathCharacter(c)) {
0505: throwMalformedURIException(path,
0506: "Illegal path character " + c);
0507: }
0508: }
0509:
0510: }
0511:
0512: private static void checkAuthority(String authority) {
0513:
0514: String userInfo = null;
0515: String host = null;
0516: String port = null;
0517:
0518: int atSign = authority.indexOf('@');
0519: if (atSign != -1) {
0520: userInfo = authority.substring(0, atSign);
0521: authority = authority.substring(atSign + 1);
0522: }
0523:
0524: int colon;
0525: if (authority.startsWith("[")) {
0526: colon = authority.indexOf("]:");
0527: if (colon != -1)
0528: colon = colon + 1;
0529: } else
0530: colon = authority.indexOf(':');
0531:
0532: if (colon != -1) {
0533: host = authority.substring(0, colon);
0534: port = authority.substring(colon + 1);
0535: } else {
0536: host = authority;
0537: }
0538:
0539: if (userInfo != null)
0540: checkUserInfo(userInfo);
0541: if (port != null)
0542: checkPort(port);
0543: checkHost(host);
0544:
0545: }
0546:
0547: private static void checkHost(final String host) {
0548:
0549: int length = host.length();
0550: if (length == 0)
0551: return; // file URI
0552:
0553: char[] text = host.toCharArray();
0554: if (text[0] == '[') {
0555: if (text[length - 1] != ']') {
0556: throw new MalformedURIException("Missing closing ]");
0557: }
0558: // trim [ and ] from ends of host
0559: checkIP6Address(host.substring(1, length - 1));
0560: } else {
0561: if (length > 255) {
0562: throw new MalformedURIException("Host name too long: "
0563: + host);
0564: }
0565:
0566: for (int i = 0; i < length; i++) {
0567: char c = text[i];
0568: if (c == '%') {
0569: try {
0570: if (!isHexDigit(text[i + 1])
0571: || !isHexDigit(text[i + 2])) {
0572: throwMalformedURIException(host,
0573: "Bad percent escape sequence");
0574: }
0575: } catch (ArrayIndexOutOfBoundsException ex) {
0576: throwMalformedURIException(host,
0577: "Bad percent escape sequence");
0578: }
0579: i += 2;
0580: } else if (!isRegNameCharacter(c)) {
0581: throwMalformedURIException(host,
0582: "Illegal host character " + c);
0583: }
0584: }
0585: }
0586: }
0587:
0588: private static boolean isRegNameCharacter(char c) {
0589:
0590: switch (c) {
0591: case '!':
0592: return true;
0593: case '"':
0594: return false;
0595: case '#':
0596: return false;
0597: case '$':
0598: return true;
0599: case '%':
0600: return false; // checked separately
0601: case '&':
0602: return true;
0603: case '\'':
0604: return true;
0605: case '(':
0606: return true;
0607: case ')':
0608: return true;
0609: case '*':
0610: return true;
0611: case '+':
0612: return true;
0613: case ',':
0614: return true;
0615: case '-':
0616: return true;
0617: case '.':
0618: return true;
0619: case '/':
0620: return false;
0621: case '0':
0622: return true;
0623: case '1':
0624: return true;
0625: case '2':
0626: return true;
0627: case '3':
0628: return true;
0629: case '4':
0630: return true;
0631: case '5':
0632: return true;
0633: case '6':
0634: return true;
0635: case '7':
0636: return true;
0637: case '8':
0638: return true;
0639: case '9':
0640: return true;
0641: case ':':
0642: return false;
0643: case ';':
0644: return true;
0645: case '<':
0646: return false;
0647: case '=':
0648: return true;
0649: case '>':
0650: return false;
0651: case '?':
0652: return false;
0653: case '@':
0654: return false;
0655: case 'A':
0656: return true;
0657: case 'B':
0658: return true;
0659: case 'C':
0660: return true;
0661: case 'D':
0662: return true;
0663: case 'E':
0664: return true;
0665: case 'F':
0666: return true;
0667: case 'G':
0668: return true;
0669: case 'H':
0670: return true;
0671: case 'I':
0672: return true;
0673: case 'J':
0674: return true;
0675: case 'K':
0676: return true;
0677: case 'L':
0678: return true;
0679: case 'M':
0680: return true;
0681: case 'N':
0682: return true;
0683: case 'O':
0684: return true;
0685: case 'P':
0686: return true;
0687: case 'Q':
0688: return true;
0689: case 'R':
0690: return true;
0691: case 'S':
0692: return true;
0693: case 'T':
0694: return true;
0695: case 'U':
0696: return true;
0697: case 'V':
0698: return true;
0699: case 'W':
0700: return true;
0701: case 'X':
0702: return true;
0703: case 'Y':
0704: return true;
0705: case 'Z':
0706: return true;
0707: case '[':
0708: return false;
0709: case '\\':
0710: return false;
0711: case ']':
0712: return false;
0713: case '^':
0714: return false;
0715: case '_':
0716: return true;
0717: case '`':
0718: return false;
0719: case 'a':
0720: return true;
0721: case 'b':
0722: return true;
0723: case 'c':
0724: return true;
0725: case 'd':
0726: return true;
0727: case 'e':
0728: return true;
0729: case 'f':
0730: return true;
0731: case 'g':
0732: return true;
0733: case 'h':
0734: return true;
0735: case 'i':
0736: return true;
0737: case 'j':
0738: return true;
0739: case 'k':
0740: return true;
0741: case 'l':
0742: return true;
0743: case 'm':
0744: return true;
0745: case 'n':
0746: return true;
0747: case 'o':
0748: return true;
0749: case 'p':
0750: return true;
0751: case 'q':
0752: return true;
0753: case 'r':
0754: return true;
0755: case 's':
0756: return true;
0757: case 't':
0758: return true;
0759: case 'u':
0760: return true;
0761: case 'v':
0762: return true;
0763: case 'w':
0764: return true;
0765: case 'x':
0766: return true;
0767: case 'y':
0768: return true;
0769: case 'z':
0770: return true;
0771: case '{':
0772: return false;
0773: case '|':
0774: return false;
0775: case '}':
0776: return false;
0777: case '~':
0778: return true;
0779: }
0780: return false;
0781:
0782: }
0783:
0784: private static void checkPort(String port) {
0785:
0786: for (int i = port.length() - 1; i >= 0; i--) {
0787: char c = port.charAt(i);
0788: if (c < '0' || c > '9') {
0789: throw new MalformedURIException("Bad port: " + port);
0790: }
0791: }
0792:
0793: }
0794:
0795: private static void checkUserInfo(String userInfo) {
0796:
0797: int length = userInfo.length();
0798: for (int i = 0; i < length; i++) {
0799: char c = userInfo.charAt(i);
0800: if (c == '%') {
0801: try {
0802: if (!isHexDigit(userInfo.charAt(i + 1))
0803: || !isHexDigit(userInfo.charAt(i + 2))) {
0804: throwMalformedURIException(userInfo,
0805: "Bad percent escape sequence");
0806: }
0807: } catch (StringIndexOutOfBoundsException ex) {
0808: throwMalformedURIException(userInfo,
0809: "Bad percent escape sequence");
0810: }
0811: i += 2;
0812: } else if (!isUserInfoCharacter(c)) {
0813: throw new MalformedURIException("Bad user info: "
0814: + userInfo);
0815: }
0816: }
0817:
0818: }
0819:
0820: private static void checkScheme(String scheme) {
0821:
0822: // http is probably 99% of cases so check it first
0823: if ("http".equals(scheme))
0824: return;
0825:
0826: if (scheme.length() == 0) {
0827: throw new MalformedURIException(
0828: "URIs cannot begin with a colon");
0829: }
0830: char c = scheme.charAt(0);
0831: if (!isAlpha(c)) {
0832: throw new MalformedURIException(
0833: "Illegal initial scheme character " + c);
0834: }
0835:
0836: for (int i = scheme.length() - 1; i >= 1; i--) {
0837: c = scheme.charAt(i);
0838: if (!isSchemeCharacter(c)) {
0839: throw new MalformedURIException(
0840: "Illegal scheme character " + c);
0841: }
0842: }
0843:
0844: }
0845:
0846: private static void checkIP6Address(String ip6Address) {
0847:
0848: StringTokenizer st = new StringTokenizer(ip6Address, ":", true);
0849: int numTokens = st.countTokens();
0850: if (numTokens > 15 || numTokens < 2) {
0851: throw new MalformedURIException(
0852: "Illegal IP6 host address: " + ip6Address);
0853: }
0854: for (int i = 0; i < numTokens; i++) {
0855: String hexPart = st.nextToken();
0856: if (":".equals(hexPart))
0857: continue;
0858: try {
0859: int part = Integer.parseInt(hexPart, 16);
0860: if (part < 0) {
0861: throw new MalformedURIException(
0862: "Illegal IP6 host address: " + ip6Address);
0863: }
0864: } catch (NumberFormatException ex) {
0865: if (i == numTokens - 1) {
0866: checkIP4Address(hexPart, ip6Address);
0867: } else {
0868: throwMalformedURIException(ip6Address,
0869: "Illegal IP6 host address: " + ip6Address);
0870: }
0871: }
0872: }
0873:
0874: if (ip6Address.indexOf("::") != ip6Address.lastIndexOf("::")) {
0875: throw new MalformedURIException(
0876: "Illegal IP6 host address: " + ip6Address);
0877: }
0878:
0879: }
0880:
0881: private static void checkIP4Address(String address,
0882: String ip6Address) {
0883:
0884: StringTokenizer st = new StringTokenizer(address, ".");
0885: int numTokens = st.countTokens();
0886: if (numTokens != 4) {
0887: throw new MalformedURIException(
0888: "Illegal IP6 host address: " + ip6Address);
0889: }
0890: for (int i = 0; i < 4; i++) {
0891: String decPart = st.nextToken();
0892: try {
0893: int dec = Integer.parseInt(decPart);
0894: if (dec > 255 || dec < 0) {
0895: throw new MalformedURIException(
0896: "Illegal IP6 host address: " + ip6Address);
0897: }
0898: } catch (NumberFormatException ex) {
0899: throw new MalformedURIException(
0900: "Illegal IP6 host address: " + ip6Address);
0901: }
0902: }
0903:
0904: }
0905:
0906: static void checkXMLName(String name) {
0907:
0908: if (name == null) {
0909: throwIllegalNameException(name, "XML names cannot be null");
0910: }
0911:
0912: int length = name.length();
0913: if (length == 0) {
0914: throwIllegalNameException(name, "XML names cannot be empty");
0915: }
0916:
0917: char first = name.charAt(0);
0918: if ((flags[first] & NAME_START_CHARACTER) == 0) {
0919: throwIllegalNameException(name, "XML names cannot start "
0920: + "with the character "
0921: + Integer.toHexString(first));
0922: }
0923:
0924: for (int i = 1; i < length; i++) {
0925: char c = name.charAt(i);
0926: if ((flags[c] & NAME_CHARACTER) == 0) {
0927: throwIllegalNameException(name, "0x"
0928: + Integer.toHexString(c)
0929: + " is not a legal name character");
0930: }
0931: }
0932:
0933: }
0934:
0935: private static boolean[] C0Table = new boolean[0x21];
0936: static {
0937: C0Table['\n'] = true;
0938: C0Table['\r'] = true;
0939: C0Table['\t'] = true;
0940: C0Table[' '] = true;
0941: }
0942:
0943: static boolean isXMLSpaceCharacter(char c) {
0944: if (c > ' ')
0945: return false;
0946: return C0Table[c];
0947: }
0948:
0949: private static boolean isHexDigit(char c) {
0950:
0951: switch (c) {
0952: case '0':
0953: return true;
0954: case '1':
0955: return true;
0956: case '2':
0957: return true;
0958: case '3':
0959: return true;
0960: case '4':
0961: return true;
0962: case '5':
0963: return true;
0964: case '6':
0965: return true;
0966: case '7':
0967: return true;
0968: case '8':
0969: return true;
0970: case '9':
0971: return true;
0972: case ':':
0973: return false;
0974: case ';':
0975: return false;
0976: case '<':
0977: return false;
0978: case '=':
0979: return false;
0980: case '>':
0981: return false;
0982: case '?':
0983: return false;
0984: case '@':
0985: return false;
0986: case 'A':
0987: return true;
0988: case 'B':
0989: return true;
0990: case 'C':
0991: return true;
0992: case 'D':
0993: return true;
0994: case 'E':
0995: return true;
0996: case 'F':
0997: return true;
0998: case 'G':
0999: return false;
1000: case 'H':
1001: return false;
1002: case 'I':
1003: return false;
1004: case 'J':
1005: return false;
1006: case 'K':
1007: return false;
1008: case 'L':
1009: return false;
1010: case 'M':
1011: return false;
1012: case 'N':
1013: return false;
1014: case 'O':
1015: return false;
1016: case 'P':
1017: return false;
1018: case 'Q':
1019: return false;
1020: case 'R':
1021: return false;
1022: case 'S':
1023: return false;
1024: case 'T':
1025: return false;
1026: case 'U':
1027: return false;
1028: case 'V':
1029: return false;
1030: case 'W':
1031: return false;
1032: case 'X':
1033: return false;
1034: case 'Y':
1035: return false;
1036: case 'Z':
1037: return false;
1038: case '[':
1039: return false;
1040: case '\\':
1041: return false;
1042: case ']':
1043: return false;
1044: case '^':
1045: return false;
1046: case '_':
1047: return false;
1048: case '`':
1049: return false;
1050: case 'a':
1051: return true;
1052: case 'b':
1053: return true;
1054: case 'c':
1055: return true;
1056: case 'd':
1057: return true;
1058: case 'e':
1059: return true;
1060: case 'f':
1061: return true;
1062: }
1063: return false;
1064: }
1065:
1066: // Since namespace URIs are commonly repeated, we can save a lot
1067: // of redundant code by storing the ones we've seen before.
1068: private static URICache cache = new URICache();
1069:
1070: private final static class URICache {
1071:
1072: private final static int LOAD = 6;
1073: private String[] cache = new String[LOAD];
1074: private int position = 0;
1075:
1076: synchronized boolean contains(String s) {
1077:
1078: for (int i = 0; i < LOAD; i++) {
1079: // Here I'm assuming the namespace URIs are interned.
1080: // This is commonly but not always true. This won't
1081: // break if they haven't been. Using equals() instead
1082: // of == is faster when the namespace URIs haven't been
1083: // interned but slower if they have.
1084: if (s == cache[i]) {
1085: return true;
1086: }
1087: }
1088: return false;
1089:
1090: }
1091:
1092: synchronized void put(String s) {
1093: cache[position] = s;
1094: position++;
1095: if (position == LOAD)
1096: position = 0;
1097: }
1098:
1099: }
1100:
1101: /**
1102: * <p>
1103: * Checks a string to see if it is an RFC 3986 absolute
1104: * URI reference. URI references can contain fragment identifiers.
1105: * Absolute URI references must have a scheme.
1106: * </p>
1107: *
1108: * @param uri <code>String</code> to check
1109: *
1110: * @throws MalformedURIException if this is not a legal
1111: * URI reference
1112: */
1113: static void checkAbsoluteURIReference(String uri) {
1114:
1115: if (cache.contains(uri)) {
1116: return;
1117: }
1118: URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri);
1119: try {
1120: if (parsed.scheme == null) {
1121: throwMalformedURIException(uri,
1122: "Missing scheme in absolute URI reference");
1123: }
1124: checkScheme(parsed.scheme);
1125: if (parsed.authority != null)
1126: checkAuthority(parsed.authority);
1127: checkPath(parsed.path);
1128: if (parsed.fragment != null)
1129: checkFragment(parsed.fragment);
1130: if (parsed.query != null)
1131: checkQuery(parsed.query);
1132: cache.put(uri);
1133: } catch (MalformedURIException ex) {
1134: ex.setData(uri);
1135: throw ex;
1136: }
1137:
1138: }
1139:
1140: static boolean isAlpha(char c) {
1141:
1142: switch (c) {
1143: case 'A':
1144: return true;
1145: case 'B':
1146: return true;
1147: case 'C':
1148: return true;
1149: case 'D':
1150: return true;
1151: case 'E':
1152: return true;
1153: case 'F':
1154: return true;
1155: case 'G':
1156: return true;
1157: case 'H':
1158: return true;
1159: case 'I':
1160: return true;
1161: case 'J':
1162: return true;
1163: case 'K':
1164: return true;
1165: case 'L':
1166: return true;
1167: case 'M':
1168: return true;
1169: case 'N':
1170: return true;
1171: case 'O':
1172: return true;
1173: case 'P':
1174: return true;
1175: case 'Q':
1176: return true;
1177: case 'R':
1178: return true;
1179: case 'S':
1180: return true;
1181: case 'T':
1182: return true;
1183: case 'U':
1184: return true;
1185: case 'V':
1186: return true;
1187: case 'W':
1188: return true;
1189: case 'X':
1190: return true;
1191: case 'Y':
1192: return true;
1193: case 'Z':
1194: return true;
1195: case '[':
1196: return false;
1197: case '\\':
1198: return false;
1199: case ']':
1200: return false;
1201: case '^':
1202: return false;
1203: case '_':
1204: return false;
1205: case '`':
1206: return false;
1207: case 'a':
1208: return true;
1209: case 'b':
1210: return true;
1211: case 'c':
1212: return true;
1213: case 'd':
1214: return true;
1215: case 'e':
1216: return true;
1217: case 'f':
1218: return true;
1219: case 'g':
1220: return true;
1221: case 'h':
1222: return true;
1223: case 'i':
1224: return true;
1225: case 'j':
1226: return true;
1227: case 'k':
1228: return true;
1229: case 'l':
1230: return true;
1231: case 'm':
1232: return true;
1233: case 'n':
1234: return true;
1235: case 'o':
1236: return true;
1237: case 'p':
1238: return true;
1239: case 'q':
1240: return true;
1241: case 'r':
1242: return true;
1243: case 's':
1244: return true;
1245: case 't':
1246: return true;
1247: case 'u':
1248: return true;
1249: case 'v':
1250: return true;
1251: case 'w':
1252: return true;
1253: case 'x':
1254: return true;
1255: case 'y':
1256: return true;
1257: case 'z':
1258: return true;
1259: }
1260:
1261: return false;
1262:
1263: }
1264:
1265: static boolean isSchemeCharacter(char c) {
1266:
1267: /* The : and the ? cannot be reached here because they'll
1268: * have been parsed out separately before this method is
1269: * called. They're included here strictly for alignment
1270: * so the compiler will generate a table lookup.
1271: */
1272: switch (c) {
1273: case '+':
1274: return true;
1275: case ',':
1276: return false;
1277: case '-':
1278: return true;
1279: case '.':
1280: return true;
1281: case '/':
1282: return false;
1283: case '0':
1284: return true;
1285: case '1':
1286: return true;
1287: case '2':
1288: return true;
1289: case '3':
1290: return true;
1291: case '4':
1292: return true;
1293: case '5':
1294: return true;
1295: case '6':
1296: return true;
1297: case '7':
1298: return true;
1299: case '8':
1300: return true;
1301: case '9':
1302: return true;
1303: case ':':
1304: return false; // unreachable
1305: case ';':
1306: return false;
1307: case '<':
1308: return false;
1309: case '=':
1310: return false;
1311: case '>':
1312: return false;
1313: case '?':
1314: return false; // unreachable
1315: case '@':
1316: return false;
1317: case 'A':
1318: return true;
1319: case 'B':
1320: return true;
1321: case 'C':
1322: return true;
1323: case 'D':
1324: return true;
1325: case 'E':
1326: return true;
1327: case 'F':
1328: return true;
1329: case 'G':
1330: return true;
1331: case 'H':
1332: return true;
1333: case 'I':
1334: return true;
1335: case 'J':
1336: return true;
1337: case 'K':
1338: return true;
1339: case 'L':
1340: return true;
1341: case 'M':
1342: return true;
1343: case 'N':
1344: return true;
1345: case 'O':
1346: return true;
1347: case 'P':
1348: return true;
1349: case 'Q':
1350: return true;
1351: case 'R':
1352: return true;
1353: case 'S':
1354: return true;
1355: case 'T':
1356: return true;
1357: case 'U':
1358: return true;
1359: case 'V':
1360: return true;
1361: case 'W':
1362: return true;
1363: case 'X':
1364: return true;
1365: case 'Y':
1366: return true;
1367: case 'Z':
1368: return true;
1369: case '[':
1370: return false;
1371: case '\\':
1372: return false;
1373: case ']':
1374: return false;
1375: case '^':
1376: return false;
1377: case '_':
1378: return false;
1379: case '`':
1380: return false;
1381: case 'a':
1382: return true;
1383: case 'b':
1384: return true;
1385: case 'c':
1386: return true;
1387: case 'd':
1388: return true;
1389: case 'e':
1390: return true;
1391: case 'f':
1392: return true;
1393: case 'g':
1394: return true;
1395: case 'h':
1396: return true;
1397: case 'i':
1398: return true;
1399: case 'j':
1400: return true;
1401: case 'k':
1402: return true;
1403: case 'l':
1404: return true;
1405: case 'm':
1406: return true;
1407: case 'n':
1408: return true;
1409: case 'o':
1410: return true;
1411: case 'p':
1412: return true;
1413: case 'q':
1414: return true;
1415: case 'r':
1416: return true;
1417: case 's':
1418: return true;
1419: case 't':
1420: return true;
1421: case 'u':
1422: return true;
1423: case 'v':
1424: return true;
1425: case 'w':
1426: return true;
1427: case 'x':
1428: return true;
1429: case 'y':
1430: return true;
1431: case 'z':
1432: return true;
1433: }
1434:
1435: return false;
1436:
1437: }
1438:
1439: private static boolean isPathCharacter(char c) {
1440:
1441: switch (c) {
1442: case '!':
1443: return true;
1444: case '"':
1445: return false;
1446: case '#':
1447: return false;
1448: case '$':
1449: return true;
1450: case '%':
1451: return false; // checked separately
1452: case '&':
1453: return true;
1454: case '\'':
1455: return true;
1456: case '(':
1457: return true;
1458: case ')':
1459: return true;
1460: case '*':
1461: return true;
1462: case '+':
1463: return true;
1464: case ',':
1465: return true;
1466: case '-':
1467: return true;
1468: case '.':
1469: return true;
1470: case '/':
1471: return false; // handled separately
1472: case '0':
1473: return true;
1474: case '1':
1475: return true;
1476: case '2':
1477: return true;
1478: case '3':
1479: return true;
1480: case '4':
1481: return true;
1482: case '5':
1483: return true;
1484: case '6':
1485: return true;
1486: case '7':
1487: return true;
1488: case '8':
1489: return true;
1490: case '9':
1491: return true;
1492: case ':':
1493: return true;
1494: case ';':
1495: return true;
1496: case '<':
1497: return false;
1498: case '=':
1499: return true;
1500: case '>':
1501: return false;
1502: case '?':
1503: return false;
1504: case '@':
1505: return true;
1506: case 'A':
1507: return true;
1508: case 'B':
1509: return true;
1510: case 'C':
1511: return true;
1512: case 'D':
1513: return true;
1514: case 'E':
1515: return true;
1516: case 'F':
1517: return true;
1518: case 'G':
1519: return true;
1520: case 'H':
1521: return true;
1522: case 'I':
1523: return true;
1524: case 'J':
1525: return true;
1526: case 'K':
1527: return true;
1528: case 'L':
1529: return true;
1530: case 'M':
1531: return true;
1532: case 'N':
1533: return true;
1534: case 'O':
1535: return true;
1536: case 'P':
1537: return true;
1538: case 'Q':
1539: return true;
1540: case 'R':
1541: return true;
1542: case 'S':
1543: return true;
1544: case 'T':
1545: return true;
1546: case 'U':
1547: return true;
1548: case 'V':
1549: return true;
1550: case 'W':
1551: return true;
1552: case 'X':
1553: return true;
1554: case 'Y':
1555: return true;
1556: case 'Z':
1557: return true;
1558: case '[':
1559: return false;
1560: case '\\':
1561: return false;
1562: case ']':
1563: return false;
1564: case '^':
1565: return false;
1566: case '_':
1567: return true;
1568: case '`':
1569: return false;
1570: case 'a':
1571: return true;
1572: case 'b':
1573: return true;
1574: case 'c':
1575: return true;
1576: case 'd':
1577: return true;
1578: case 'e':
1579: return true;
1580: case 'f':
1581: return true;
1582: case 'g':
1583: return true;
1584: case 'h':
1585: return true;
1586: case 'i':
1587: return true;
1588: case 'j':
1589: return true;
1590: case 'k':
1591: return true;
1592: case 'l':
1593: return true;
1594: case 'm':
1595: return true;
1596: case 'n':
1597: return true;
1598: case 'o':
1599: return true;
1600: case 'p':
1601: return true;
1602: case 'q':
1603: return true;
1604: case 'r':
1605: return true;
1606: case 's':
1607: return true;
1608: case 't':
1609: return true;
1610: case 'u':
1611: return true;
1612: case 'v':
1613: return true;
1614: case 'w':
1615: return true;
1616: case 'x':
1617: return true;
1618: case 'y':
1619: return true;
1620: case 'z':
1621: return true;
1622: case '{':
1623: return false;
1624: case '|':
1625: return false;
1626: case '}':
1627: return false;
1628: case '~':
1629: return true;
1630: }
1631:
1632: return false;
1633:
1634: }
1635:
1636: private static boolean isUserInfoCharacter(char c) {
1637:
1638: switch (c) {
1639: case '!':
1640: return true;
1641: case '"':
1642: return false;
1643: case '#':
1644: return false;
1645: case '$':
1646: return true;
1647: case '%':
1648: return false; // checked separately
1649: case '&':
1650: return true;
1651: case '\'':
1652: return true;
1653: case '(':
1654: return true;
1655: case ')':
1656: return true;
1657: case '*':
1658: return true;
1659: case '+':
1660: return true;
1661: case ',':
1662: return true;
1663: case '-':
1664: return true;
1665: case '.':
1666: return true;
1667: case '/':
1668: return true;
1669: case '0':
1670: return true;
1671: case '1':
1672: return true;
1673: case '2':
1674: return true;
1675: case '3':
1676: return true;
1677: case '4':
1678: return true;
1679: case '5':
1680: return true;
1681: case '6':
1682: return true;
1683: case '7':
1684: return true;
1685: case '8':
1686: return true;
1687: case '9':
1688: return true;
1689: case ':':
1690: return true;
1691: case ';':
1692: return true;
1693: case '<':
1694: return false;
1695: case '=':
1696: return true;
1697: case '>':
1698: return false;
1699: case '?':
1700: return false;
1701: case '@':
1702: return false;
1703: case 'A':
1704: return true;
1705: case 'B':
1706: return true;
1707: case 'C':
1708: return true;
1709: case 'D':
1710: return true;
1711: case 'E':
1712: return true;
1713: case 'F':
1714: return true;
1715: case 'G':
1716: return true;
1717: case 'H':
1718: return true;
1719: case 'I':
1720: return true;
1721: case 'J':
1722: return true;
1723: case 'K':
1724: return true;
1725: case 'L':
1726: return true;
1727: case 'M':
1728: return true;
1729: case 'N':
1730: return true;
1731: case 'O':
1732: return true;
1733: case 'P':
1734: return true;
1735: case 'Q':
1736: return true;
1737: case 'R':
1738: return true;
1739: case 'S':
1740: return true;
1741: case 'T':
1742: return true;
1743: case 'U':
1744: return true;
1745: case 'V':
1746: return true;
1747: case 'W':
1748: return true;
1749: case 'X':
1750: return true;
1751: case 'Y':
1752: return true;
1753: case 'Z':
1754: return true;
1755: case '[':
1756: return false;
1757: case '\\':
1758: return false;
1759: case ']':
1760: return false;
1761: case '^':
1762: return false;
1763: case '_':
1764: return true;
1765: case '`':
1766: return false;
1767: case 'a':
1768: return true;
1769: case 'b':
1770: return true;
1771: case 'c':
1772: return true;
1773: case 'd':
1774: return true;
1775: case 'e':
1776: return true;
1777: case 'f':
1778: return true;
1779: case 'g':
1780: return true;
1781: case 'h':
1782: return true;
1783: case 'i':
1784: return true;
1785: case 'j':
1786: return true;
1787: case 'k':
1788: return true;
1789: case 'l':
1790: return true;
1791: case 'm':
1792: return true;
1793: case 'n':
1794: return true;
1795: case 'o':
1796: return true;
1797: case 'p':
1798: return true;
1799: case 'q':
1800: return true;
1801: case 'r':
1802: return true;
1803: case 's':
1804: return true;
1805: case 't':
1806: return true;
1807: case 'u':
1808: return true;
1809: case 'v':
1810: return true;
1811: case 'w':
1812: return true;
1813: case 'x':
1814: return true;
1815: case 'y':
1816: return true;
1817: case 'z':
1818: return true;
1819: case '{':
1820: return false;
1821: case '|':
1822: return false;
1823: case '}':
1824: return false;
1825: case '~':
1826: return true;
1827: }
1828:
1829: return false;
1830:
1831: }
1832:
1833: /**
1834: * Check to see that this string is an absolute URI,
1835: * neither a relative URI nor a URI reference.
1836: *
1837: */
1838: static void checkAbsoluteURI(String uri) {
1839:
1840: URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri);
1841: try {
1842: if (parsed.scheme == null) {
1843: throwMalformedURIException(uri,
1844: "Missing scheme in absolute URI");
1845: }
1846: checkScheme(parsed.scheme);
1847: if (parsed.authority != null)
1848: checkAuthority(parsed.authority);
1849: checkPath(parsed.path);
1850: if (parsed.fragment != null) {
1851: throwMalformedURIException(uri,
1852: "URIs cannot have fragment identifiers");
1853: }
1854: if (parsed.query != null)
1855: checkQuery(parsed.query);
1856: } catch (MalformedURIException ex) {
1857: ex.setData(uri);
1858: throw ex;
1859: }
1860:
1861: }
1862:
1863: // For use in checking internal DTD subsets
1864: private static XMLReader parser;
1865:
1866: static synchronized void checkInternalDTDSubset(String subset) {
1867:
1868: if (parser == null) {
1869: final InputSource empty = new InputSource(new EmptyReader());
1870: parser = Builder.findParser(false);
1871: // parser = new org.apache.crimson.parser.XMLReaderImpl();
1872: // Now let's stop this parser from loading any external
1873: // entities the subset references
1874: parser.setEntityResolver(new EntityResolver() {
1875:
1876: public InputSource resolveEntity(String publicID,
1877: String systemID) {
1878: return empty;
1879: }
1880:
1881: });
1882: }
1883:
1884: String doc = "<!DOCTYPE a [" + subset + "]><a/>";
1885: try {
1886: InputSource source = new InputSource(new StringReader(doc));
1887: // just to make sure relative URLs can be resolved; don't
1888: // actually need to connect to this; the EntityResolver
1889: // prevents that
1890: source.setSystemId("http://www.example.org/");
1891: parser.parse(source);
1892: } catch (SAXException ex) {
1893: IllegalDataException idex = new IllegalDataException(
1894: "Malformed internal DTD subset: " + ex.getMessage(),
1895: ex);
1896: idex.setData(subset);
1897: throw idex;
1898: } catch (IOException ex) {
1899: throw new RuntimeException(
1900: "BUG: I don't think this can happen");
1901: }
1902:
1903: }
1904:
1905: // A reader that immediately returns end of stream. This is a great
1906: // big hack to avoid reading anything when setting the internal
1907: // DTD subset. I could use the
1908: // http://xml.org/sax/features/external-parameter-entities SAX
1909: // feature, but many parsers don't reliably implement that so
1910: // instead we simply pretend that all URLs point to empty files.
1911: private static class EmptyReader extends Reader {
1912:
1913: public int read(char[] text, int start, int length)
1914: throws IOException {
1915: return -1;
1916: }
1917:
1918: public void close() {
1919: }
1920:
1921: }
1922:
1923: }
|