0001: /*
0002: * tclpro/tclparser/tclParser.c -> TclParser.java
0003: *
0004: * This is a Tcl language parser as a Tcl dynamically loadable
0005: * extension.
0006: *
0007: * Copyright (c) 1996 by Sun Microsystems, Inc.
0008: * Copyright (c) 2000 Ajuba Solutions
0009: *
0010: * See the file "license.terms" for information on usage and redistribution
0011: * of this file, and for a DISCLAIMER OF ALL WARRANTIES.
0012: *
0013: * RCS: @(#) $Id: TclParser.java,v 1.5 2005/11/22 22:10:02 mdejong Exp $
0014: */
0015:
0016: package tcl.lang;
0017:
0018: public class TclParser implements Command {
0019:
0020: static final private String[] options = { "command", "expr",
0021: "varname", "list", "getrange", "getstring", "charindex",
0022: "charlength", "countnewline" };
0023:
0024: static final private int PARSE_COMMAND = 0;
0025: static final private int PARSE_EXPR = 1;
0026: static final private int PARSE_VARNAME = 2;
0027: static final private int PARSE_LIST = 3;
0028: static final private int PARSE_GET_RANGE = 4;
0029: static final private int PARSE_GET_STR = 5;
0030: static final private int PARSE_CHAR_INDEX = 6;
0031: static final private int PARSE_CHAR_LEN = 7;
0032: static final private int PARSE_COUNT_NWLNE = 8;
0033:
0034: /*
0035: *----------------------------------------------------------------------
0036: *
0037: * ParseObjCmd -> cmdProc
0038: *
0039: * This function implements the Tcl "parse" command.
0040: *
0041: * Results:
0042: * A standard Tcl result.
0043: *
0044: * Side effects:
0045: * None.
0046: *
0047: *----------------------------------------------------------------------
0048: */
0049:
0050: public void cmdProc(Interp interp, // Current interpreter.
0051: TclObject[] objv) // Arguments to command
0052: throws TclException {
0053: int option, index, length, scriptLength;
0054:
0055: if (objv.length < 3) {
0056: throw new TclNumArgsException(interp, 1, objv,
0057: "option arg ?arg ...?");
0058: }
0059: option = TclIndex.get(interp, objv[1], options, "option", 0);
0060:
0061: // If the script argument holds a cached UTF8CharPointer internal rep
0062: // then grab it and use it. Otherwise, create a new UTF8CharPointer
0063: // and set it as the internal rep.
0064:
0065: TclObject tobj = objv[2];
0066: UTF8CharPointer script;
0067: InternalRep irep = tobj.getInternalRep();
0068: if (irep instanceof UTF8CharPointer) {
0069: script = (UTF8CharPointer) irep;
0070: } else {
0071: script = new UTF8CharPointer(tobj.toString());
0072: tobj.setInternalRep(script);
0073: }
0074: if (script == null) {
0075: System.out.println(script); // For debugging only
0076: }
0077: scriptLength = script.getByteLength();
0078:
0079: // Check the number arguments passed to the command and
0080: // extract information (script, index, length) depending
0081: // upon the option selected.
0082:
0083: switch (option) {
0084: case PARSE_GET_RANGE: {
0085: if (objv.length == 3) {
0086: index = 0;
0087: length = scriptLength;
0088: } else if (objv.length == 5) {
0089: index = TclInteger.get(interp, objv[3]);
0090: length = TclInteger.get(interp, objv[4]);
0091:
0092: if (index < 0) {
0093: index = 0;
0094: } else if (index >= scriptLength) {
0095: index = scriptLength - 1;
0096: }
0097: if (length < 0) {
0098: length = 0;
0099: } else if (length > (scriptLength - index)) {
0100: length = scriptLength - index;
0101: }
0102: } else {
0103: throw new TclNumArgsException(interp, 2, objv,
0104: "string ?index length?");
0105: }
0106: interp.setResult(ParseMakeRange(script, index, length));
0107: return;
0108: }
0109: case PARSE_COMMAND:
0110: case PARSE_EXPR:
0111: case PARSE_VARNAME:
0112: case PARSE_LIST:
0113: case PARSE_GET_STR:
0114: case PARSE_CHAR_INDEX:
0115: case PARSE_CHAR_LEN: {
0116: if (objv.length != 4) {
0117: throw new TclNumArgsException(interp, 2, objv,
0118: "string range");
0119: }
0120: ParseGetIndexAndLengthResult result = new ParseGetIndexAndLengthResult();
0121: ParseGetIndexAndLength(interp, objv[3], scriptLength,
0122: result);
0123: index = result.indexPtr;
0124: length = result.lengthPtr;
0125:
0126: switch (option) {
0127: case PARSE_COMMAND:
0128: ParseCommand(interp, script, index, length);
0129: return;
0130: case PARSE_EXPR:
0131: ParseExpr(interp, script, index, length);
0132: return;
0133: case PARSE_VARNAME:
0134: ParseVarName(interp, script, index, length);
0135: return;
0136: case PARSE_LIST:
0137: ParseList(interp, script, index, length);
0138: return;
0139: case PARSE_GET_STR:
0140: ParseGetString(interp, script, index, length);
0141: return;
0142: case PARSE_CHAR_INDEX:
0143: ParseCharIndex(interp, script, index, length);
0144: return;
0145: case PARSE_CHAR_LEN:
0146: ParseCharLength(interp, script, index, length);
0147: return;
0148: case PARSE_GET_RANGE:
0149: case PARSE_COUNT_NWLNE:
0150: // No Op - This will suppress compiler warnings
0151: break;
0152: }
0153: break;
0154: }
0155: case PARSE_COUNT_NWLNE: {
0156: TclObject range2;
0157: if (objv.length == 5) {
0158: range2 = objv[4];
0159: } else if (objv.length == 4) {
0160: range2 = null;
0161: } else {
0162: throw new TclNumArgsException(interp, 2, objv,
0163: "string range ?range?");
0164: }
0165: ParseCountNewline(interp, script, scriptLength, objv[3],
0166: range2);
0167: return;
0168: }
0169: }
0170: throw new TclException(interp, "unmatched option");
0171: }
0172:
0173: /*
0174: *----------------------------------------------------------------------
0175: *
0176: * ParseCommand --
0177: *
0178: * This function parses a script into Tcl commands by calling the
0179: * Tcl_ParseCommand function. This routine returns a list of the
0180: * following form: <commentRange> <commandRange> <restRange> <parseTree>
0181: * The first range refers to any leading comments before the command.
0182: * The second range refers to the command itself. The third range
0183: * contains the remainder of the original range that appears after
0184: * the command range. The parseTree is a list representation
0185: * of the parse tree where each node is a list in the form:
0186: * <type> <range> <subTree>.
0187: *
0188: * Results:
0189: * A standard Tcl result.
0190: *
0191: * Side effects:
0192: * None.
0193: *
0194: *----------------------------------------------------------------------
0195: */
0196:
0197: static void ParseCommand(Interp interp, // Current interpreter.
0198: UTF8CharPointer script, // Script to parse.
0199: int index, // Index to the starting point of the
0200: // script, in bytes.
0201: int length) // Length of script be parsed, in bytes.
0202: throws TclException {
0203: TclObject resultPtr, listPtr, tokenPtr;
0204: TclParse parse;
0205: int i;
0206: int endCharIndex;
0207: int endByteIndex;
0208:
0209: // Convert byte index and range into char index and range
0210: int charIndex = script.getCharIndex(index);
0211: int charLength = script.getCharRange(index, length);
0212:
0213: parse = Parser.parseCommand(interp, script.array, charIndex,
0214: charLength, null, -1, false);
0215:
0216: if (parse.result != TCL.OK) {
0217: ParseSetErrorCode(interp, script, parse);
0218: }
0219:
0220: resultPtr = TclList.newInstance();
0221: if (parse.commentStart != -1) {
0222: TclList.append(interp, resultPtr, ParseMakeByteRange(
0223: script, parse.commentStart, parse.commentSize));
0224: } else {
0225: TclList.append(interp, resultPtr, ParseMakeRange(script,
0226: script.index, 0));
0227: }
0228: TclList.append(interp, resultPtr, ParseMakeByteRange(script,
0229: parse.commandStart, parse.commandSize));
0230: endCharIndex = parse.commandStart + parse.commandSize;
0231: TclList.append(interp, resultPtr,
0232: ParseMakeByteRange(script, endCharIndex,
0233: (charLength - (endCharIndex - charIndex))));
0234:
0235: listPtr = TclList.newInstance();
0236: ParseMakeTokenListResult result = new ParseMakeTokenListResult();
0237: i = 0;
0238: while (i < parse.numTokens) {
0239: i = ParseMakeTokenList(script, parse, i, result);
0240: tokenPtr = result.newList;
0241: TclList.append(null, listPtr, tokenPtr);
0242: }
0243: TclList.append(interp, resultPtr, listPtr);
0244: interp.setResult(resultPtr);
0245: return;
0246: }
0247:
0248: /*
0249: *----------------------------------------------------------------------
0250: *
0251: * ParseExpr --
0252: *
0253: * This function parses a Tcl expression into a tree representation.
0254: *
0255: * Results:
0256: * A standard Tcl result.
0257: *
0258: * Side effects:
0259: * None.
0260: *
0261: *----------------------------------------------------------------------
0262: */
0263:
0264: static void ParseExpr(Interp interp, // Current interpreter.
0265: UTF8CharPointer script, // Script to parse.
0266: int index, // Index to the starting point of the
0267: // script, in bytes.
0268: int length) // Length of script be parsed, in bytes.
0269: throws TclException {
0270: TclParse parse;
0271:
0272: int charIndex = script.getCharIndex(index);
0273: int charLength = script.getCharRange(index, length);
0274:
0275: parse = ParseExpr.parseExpr(interp, script.array, charIndex,
0276: charLength);
0277:
0278: if (parse.result != TCL.OK) {
0279: ParseSetErrorCode(interp, script, parse);
0280: }
0281:
0282: // There is only one top level token, so just return it.
0283:
0284: ParseMakeTokenListResult lresult = new ParseMakeTokenListResult();
0285: ParseMakeTokenList(script, parse, 0, lresult);
0286: interp.setResult(lresult.newList);
0287: }
0288:
0289: /*
0290: *----------------------------------------------------------------------
0291: *
0292: * ParseList --
0293: *
0294: * This function parses a Tcl list into a list of ranges.
0295: *
0296: * Results:
0297: * A standard Tcl result.
0298: *
0299: * Side effects:
0300: * None.
0301: *
0302: *----------------------------------------------------------------------
0303: */
0304:
0305: static void ParseList(Interp interp, // Current interpreter.
0306: UTF8CharPointer script, // Script to parse.
0307: int index, // Index to the starting point of the
0308: // script, in bytes.
0309: int length) // Length of script be parsed, in bytes.
0310: throws TclException {
0311: TclObject resultPtr;
0312: int size;
0313: char c;
0314: String list;
0315: int elementIndex;
0316: int listIndex, prevListIndex, lastListIndex;
0317: FindElemResult fer = new FindElemResult();
0318: int charIndex, charLength, charListOffset;
0319: boolean found;
0320:
0321: charIndex = script.getCharIndex(index);
0322: charListOffset = (charIndex - script.index);
0323:
0324: resultPtr = TclList.newInstance();
0325: list = script.getByteRangeAsString(index, length);
0326: charLength = list.length();
0327:
0328: lastListIndex = charLength;
0329: listIndex = 0;
0330:
0331: for (;;) {
0332: prevListIndex = listIndex;
0333:
0334: try {
0335: found = Util.findElement(interp, list, listIndex,
0336: charLength, fer);
0337: } catch (TclException te) {
0338: TclObject errorCode = TclList.newInstance();
0339: TclList.append(interp, errorCode, TclString
0340: .newInstance("PARSE"));
0341: TclList.append(interp, errorCode, TclString
0342: .newInstance("list"));
0343: // Convert to byte range
0344: int byteRange = script.getByteRange(script.index,
0345: charListOffset + listIndex);
0346: TclList.append(interp, errorCode, TclInteger
0347: .newInstance(byteRange));
0348: TclList.append(interp, errorCode, interp.getResult());
0349: interp.setErrorCode(errorCode);
0350: throw te;
0351: }
0352: if (!found) {
0353: break;
0354: }
0355: listIndex = fer.elemEnd;
0356: //charLength -= (listIndex - prevListIndex);
0357: elementIndex = fer.elemStart;
0358: size = fer.size;
0359:
0360: // Check to see if this element was in quotes or braces.
0361: // If it is, ensure that the range includes the quotes/braces
0362: // so the parser can make decisions based on this fact.
0363:
0364: if (elementIndex > 0) {
0365: c = list.charAt(elementIndex - 1);
0366: } else {
0367: c = '\0';
0368: }
0369: if (c == '{' || c == '\"') {
0370: elementIndex--;
0371: size += 2;
0372: }
0373: TclList.append(interp, resultPtr, ParseMakeByteRange(
0374: script, charListOffset + elementIndex, size));
0375: }
0376:
0377: interp.setResult(resultPtr);
0378: }
0379:
0380: /*
0381: *----------------------------------------------------------------------
0382: *
0383: * ParseVarName --
0384: *
0385: * This function parses a Tcl variable reference into a tree representation.
0386: *
0387: * Results:
0388: * A standard Tcl result.
0389: *
0390: * Side effects:
0391: * None.
0392: *
0393: *----------------------------------------------------------------------
0394: */
0395:
0396: static void ParseVarName(Interp interp, // Current interpreter.
0397: UTF8CharPointer script, // Script to parse.
0398: int index, // Index to the starting point of the
0399: // script, in bytes.
0400: int length) // Length of script be parsed, in bytes.
0401: throws TclException {
0402: TclParse parse;
0403:
0404: // Convert byte index and range into char index and range
0405: int charIndex = script.getCharIndex(index);
0406: int charLength = script.getCharRange(index, length);
0407:
0408: parse = Parser.parseVarName(interp, script.array, charIndex,
0409: charLength, null, false);
0410: if (parse.result != TCL.OK) {
0411: ParseSetErrorCode(interp, script, parse);
0412: }
0413:
0414: // There is only one top level token, so just return it.
0415:
0416: ParseMakeTokenListResult lresult = new ParseMakeTokenListResult();
0417: ParseMakeTokenList(script, parse, 0, lresult);
0418: interp.setResult(lresult.newList);
0419: }
0420:
0421: /*
0422: *----------------------------------------------------------------------
0423: *
0424: * ParseSetErrorCode --
0425: *
0426: * Set the errorCode variable to the standard parser error form
0427: * and raise a TclException. This method is invoked after something
0428: * goes wrong in a parse operation.
0429: *
0430: * Results:
0431: * None.
0432: *
0433: * Side effects:
0434: * Sets the interp's errorCode; always raises a TclException.
0435: *
0436: *----------------------------------------------------------------------
0437: */
0438:
0439: static void ParseSetErrorCode(Interp interp, // Current interpreter.
0440: UTF8CharPointer script, // Script to parse.
0441: TclParse parse) // Parse state.
0442: throws TclException {
0443: TclObject tlist;
0444: String type;
0445:
0446: switch (parse.errorType) {
0447: case Parser.TCL_PARSE_QUOTE_EXTRA:
0448: type = "quoteExtra";
0449: break;
0450: case Parser.TCL_PARSE_BRACE_EXTRA:
0451: type = "braceExtra";
0452: break;
0453: case Parser.TCL_PARSE_MISSING_BRACE:
0454: type = "missingBrace";
0455: break;
0456: case Parser.TCL_PARSE_MISSING_BRACKET:
0457: type = "missingBracket";
0458: break;
0459: case Parser.TCL_PARSE_MISSING_PAREN:
0460: type = "missingParen";
0461: break;
0462: case Parser.TCL_PARSE_MISSING_QUOTE:
0463: type = "missingQuote";
0464: break;
0465: case Parser.TCL_PARSE_MISSING_VAR_BRACE:
0466: type = "missingVarBrace";
0467: break;
0468: case Parser.TCL_PARSE_SYNTAX:
0469: type = "syntax";
0470: break;
0471: case Parser.TCL_PARSE_BAD_NUMBER:
0472: type = "badNumber";
0473: break;
0474: default:
0475: throw new TclException(interp,
0476: "unexpected error type from Tcl_ParseCommand");
0477: }
0478: tlist = TclList.newInstance();
0479: TclList.append(interp, tlist, TclString.newInstance("PARSE"));
0480: TclList.append(interp, tlist, TclString.newInstance(type));
0481: if (parse.termIndex > 0) {
0482: // Convert to byte range
0483: int byteRange = script.getByteRange(script.index,
0484: parse.termIndex);
0485: TclList.append(interp, tlist, TclInteger
0486: .newInstance(byteRange));
0487: } else {
0488: TclList.append(interp, tlist, TclInteger.newInstance(0));
0489: }
0490: TclList.append(interp, tlist, interp.getResult());
0491: interp.setErrorCode(tlist);
0492: throw new TclException(interp, interp.getResult().toString());
0493: }
0494:
0495: /*
0496: *----------------------------------------------------------------------
0497: *
0498: * ParseMakeTokenList --
0499: *
0500: * Make the list representation of a token. Each token is represented
0501: * as a list where the first element is a token type, the second
0502: * element is a range, and the third element is a list of
0503: * subtokens.
0504: *
0505: * Results:
0506: * Returns the next token offset and stores a newly allocated
0507: * list object in the location referred to by result.
0508: *
0509: * Side effects:
0510: * None.
0511: *
0512: *----------------------------------------------------------------------
0513: */
0514:
0515: static class ParseMakeTokenListResult {
0516: TclObject newList;
0517: }
0518:
0519: static int ParseMakeTokenList(UTF8CharPointer script, // Pointer to start of script being parsed.
0520: TclParse parse, // Parse information.
0521: int index, // Index of token to append.
0522: ParseMakeTokenListResult result)
0523: // Location where resulting list
0524: // object is to be stored.
0525: throws TclException {
0526: TclToken token = parse.tokenList[index];
0527: TclObject resultList, resultIndexList;
0528: int start;
0529: String type;
0530:
0531: switch (token.type) {
0532: case Parser.TCL_TOKEN_WORD:
0533: type = "word";
0534: break;
0535: case Parser.TCL_TOKEN_SIMPLE_WORD:
0536: type = "simple";
0537: break;
0538: case Parser.TCL_TOKEN_TEXT:
0539: type = "text";
0540: break;
0541: case Parser.TCL_TOKEN_BS:
0542: type = "backslash";
0543: break;
0544: case Parser.TCL_TOKEN_COMMAND:
0545: type = "command";
0546: break;
0547: case Parser.TCL_TOKEN_VARIABLE:
0548: type = "variable";
0549: break;
0550: case Parser.TCL_TOKEN_SUB_EXPR:
0551: type = "subexpr";
0552: break;
0553: case Parser.TCL_TOKEN_OPERATOR:
0554: type = "operator";
0555: break;
0556: default:
0557: type = "unknown";
0558: break;
0559: }
0560: resultList = TclList.newInstance();
0561: TclList.append(null, resultList, TclString.newInstance(type));
0562: TclList.append(null, resultList, ParseMakeByteRange(script,
0563: token.script_index, token.size));
0564: resultIndexList = TclList.newInstance();
0565: TclList.append(null, resultList, resultIndexList);
0566: start = index;
0567: index++;
0568: ParseMakeTokenListResult lresult = new ParseMakeTokenListResult();
0569: while (index <= start + token.numComponents) {
0570: index = ParseMakeTokenList(script, parse, index, lresult);
0571: TclList.append(null, resultIndexList, lresult.newList);
0572: }
0573:
0574: result.newList = resultList;
0575: return index;
0576: }
0577:
0578: /*
0579: *----------------------------------------------------------------------
0580: *
0581: * ParseMakeRange --
0582: *
0583: * Construct a new range object. This method depends on the
0584: * script.index being set to the starting index of the
0585: * entire script.
0586: *
0587: * Results:
0588: * Returns a newly allocated Tcl object.
0589: *
0590: * Side effects:
0591: * None.
0592: *
0593: *----------------------------------------------------------------------
0594: */
0595:
0596: static TclObject ParseMakeRange(UTF8CharPointer script, // Pointer to the start of whole script.
0597: int start, // Index of start of the range, in bytes.
0598: int length) // The length of the range, in bytes.
0599: throws TclException {
0600: int scriptByteIndex = script.getByteIndex(script.index);
0601:
0602: TclObject tlist = TclList.newInstance();
0603: TclList.append(null, tlist, TclInteger.newInstance(start
0604: - scriptByteIndex));
0605: TclList.append(null, tlist, TclInteger.newInstance(length));
0606: return tlist;
0607: }
0608:
0609: /*
0610: *----------------------------------------------------------------------
0611: *
0612: * ParseMakeByteRange --
0613: *
0614: * Construct a new range object containing a byte range given
0615: * a start and length in characters.
0616: *
0617: * Results:
0618: * Returns a newly allocated Tcl object.
0619: *
0620: * Side effects:
0621: * None.
0622: *
0623: *----------------------------------------------------------------------
0624: */
0625:
0626: static TclObject ParseMakeByteRange(UTF8CharPointer script, // Pointer to the start of whole script.
0627: int start, // Index of start of the range, in chars.
0628: int length) // The length of the range, in chars.
0629: throws TclException {
0630: if (start < 0) {
0631: throw new TclRuntimeError("char index can't be < 0, was "
0632: + start);
0633: }
0634: if (length < 0) {
0635: throw new TclRuntimeError("char length can't be < 0, was "
0636: + length);
0637: }
0638: int byteStart = script.getByteIndex(start);
0639: int byteLength = script.getByteRange(start, length);
0640: return ParseMakeRange(script, byteStart, byteLength);
0641: }
0642:
0643: /*
0644: *----------------------------------------------------------------------
0645: *
0646: * ParseGetString --
0647: *
0648: * Extract the string from the script within the boundaries of
0649: * byte oriented index and length.
0650: *
0651: * Results:
0652: * A standard Tcl result.
0653: *
0654: * Side effects:
0655: * The interp's result is set.
0656: *
0657: *----------------------------------------------------------------------
0658: */
0659:
0660: static void ParseGetString(Interp interp, // Current interpreter.
0661: UTF8CharPointer script, // Script to parse.
0662: int index, // Index to the starting point of the
0663: // script, in bytes
0664: int length) // Length of script in bytes.
0665: throws TclException {
0666: String str = script.getByteRangeAsString(index, length);
0667: interp.setResult(str);
0668: }
0669:
0670: /*
0671: *----------------------------------------------------------------------
0672: *
0673: * ParseCharIndex --
0674: *
0675: * Converts byte oriented index values into character oriented
0676: * index values.
0677: *
0678: * Results:
0679: * A standard Tcl result.
0680: *
0681: * Side effects:
0682: * The interp's result is set.
0683: *
0684: *----------------------------------------------------------------------
0685: */
0686:
0687: static void ParseCharIndex(Interp interp, // Current interpreter.
0688: UTF8CharPointer script, // Script to parse.
0689: int index, // Index to the starting point of the
0690: // script, in bytes.
0691: int length) // Length of script be parsed, in bytes.
0692: throws TclException {
0693: // Count number of characters from the start of the
0694: // script to the given byte index.
0695:
0696: int charIndex = script.getCharIndex(index);
0697: interp.setResult(charIndex - script.index);
0698: }
0699:
0700: /*
0701: *----------------------------------------------------------------------
0702: *
0703: * ParseCharLength --
0704: *
0705: * Converts the given byte length into a character count.
0706: *
0707: * Results:
0708: * A standard Tcl result.
0709: *
0710: * Side effects:
0711: * The interp's result is set.
0712: *
0713: *----------------------------------------------------------------------
0714: */
0715:
0716: static void ParseCharLength(Interp interp, // Current interpreter.
0717: UTF8CharPointer script, // Script to parse.
0718: int index, // Index to the starting point of the
0719: // script, in bytes.
0720: int length) // Length of script be parsed, in bytes.
0721: throws TclException {
0722: // Count number of characters from the byte index
0723: // to the byte length.
0724:
0725: int charLength = script.getCharRange(index, length);
0726: interp.setResult(charLength);
0727: }
0728:
0729: /*
0730: *----------------------------------------------------------------------
0731: *
0732: * ParseCountNewline --
0733: *
0734: * Count the number of newlines between a range of bytes
0735: * in a script. If two ranges are passed to this function,
0736: * calculate the number of newlines from the beginning index of
0737: * the first range up to, but not including, the beginning of
0738: * the second range. If one range is passed in, count the
0739: * number of newlines from the beginning of the first range
0740: * through the last character in the range.
0741: *
0742: * It is assumed that the indices and lengths are within the
0743: * boundaries of the script. No error checking is done to
0744: * verify this. Use ParseGetIndexAndLength to validate
0745: * the data.
0746: *
0747: * Results:
0748: * A standard Tcl result.
0749: *
0750: * Side effects:
0751: * The interp's result is set to the number of newlines counted.
0752: *
0753: *----------------------------------------------------------------------
0754: */
0755:
0756: static void ParseCountNewline(Interp interp, // Current interpreter.
0757: UTF8CharPointer script, // Script to parse.
0758: int scriptLength, // Lengths of script in bytes.
0759: TclObject rangePtr1, // Begin counting newlines with this range.
0760: TclObject rangePtr2) // Possibly null, otherwise used to terminate
0761: // newline counting
0762: throws TclException {
0763: int subStrIndex, endStrIndex;
0764: int offset, index1, index2 = 0;
0765: int length, length1, length2;
0766: int listLen1, listLen2;
0767: int numNewline;
0768:
0769: listLen1 = TclList.getLength(interp, rangePtr1);
0770: ParseGetIndexAndLengthResult result = new ParseGetIndexAndLengthResult();
0771: ParseGetIndexAndLength(interp, rangePtr1, scriptLength, result);
0772: index1 = result.indexPtr;
0773: length1 = result.lengthPtr;
0774:
0775: if (rangePtr2 != null) {
0776: listLen2 = TclList.getLength(interp, rangePtr2);
0777: ParseGetIndexAndLength(interp, rangePtr2, scriptLength,
0778: result);
0779: index2 = result.indexPtr;
0780: length2 = result.lengthPtr;
0781: } else {
0782: listLen2 = 0;
0783: }
0784:
0785: if ((listLen1 == 0) && (listLen2 == 2)) {
0786: // Counting from the beginning of the file to
0787: // the beginning of the second range.
0788: //
0789: // example: parse count script {} r2
0790:
0791: offset = 0;
0792: length = index2;
0793: } else if ((listLen1 == 2) && (listLen2 == 2)) {
0794: // Counting from the beginning of the first
0795: // range to the beginning of the second range.
0796: //
0797: // example: parse count script r1 r2
0798:
0799: offset = index1;
0800: length = (index2 - offset);
0801: } else {
0802: // Counting from the beginning of the first
0803: // range to the end of the first range. If
0804: // the arg passed was an empty string it
0805: // will count the whole script.
0806: //
0807: // example: parse count script {}
0808: // parse count script r1
0809:
0810: offset = index1;
0811: length = length1;
0812: }
0813:
0814: subStrIndex = offset;
0815: endStrIndex = subStrIndex + length;
0816: numNewline = 0;
0817:
0818: // Get byte range as a String and count the number of
0819: // newlines found in that range.
0820:
0821: String range = script.getByteRangeAsString(subStrIndex, length);
0822: final int range_length = range.length();
0823: for (int i = 0; i < range_length; i++) {
0824: if (range.charAt(i) == '\n') {
0825: numNewline++;
0826: }
0827: }
0828:
0829: interp.setResult(numNewline);
0830: }
0831:
0832: /*
0833: *----------------------------------------------------------------------
0834: *
0835: * ParseGetIndexAndLength --
0836: *
0837: * Extract the index and length from a Tcl Object. If the
0838: * Tcl Object does not contain data, return the beginning
0839: * of the script as the index and the length of the script
0840: * for the length. If the data in the script is out of the
0841: * scripts range (e.g. < 0 or > scriptLength,) and scriptLen
0842: * is >= 0, set the value to the closest point. Note that
0843: * indexes and ranges are in terms of bytes.
0844: *
0845: * Results:
0846: * A standard Tcl result.
0847: *
0848: * Side effects:
0849: * The values are written to the result argument.
0850: * If scriptLen is >= 0, the values will be normalized based
0851: * on the length of the script.
0852: *
0853: *----------------------------------------------------------------------
0854: */
0855:
0856: static class ParseGetIndexAndLengthResult {
0857: int indexPtr; // Index to the starting point of the
0858: // script.
0859: int lengthPtr; // Byte length of script be parsed.
0860: }
0861:
0862: static void ParseGetIndexAndLength(Interp interp, // Current interpreter.
0863: TclObject rangePtr, int scriptLen, // Length of script in bytes. If >= 0, then try
0864: // to normalize index and length based
0865: // on the length of the script.
0866: ParseGetIndexAndLengthResult result) throws TclException {
0867: TclObject itemPtr;
0868: int listLen;
0869:
0870: listLen = TclList.getLength(interp, rangePtr);
0871: if ((listLen != 0) && (listLen != 2)) {
0872: throw new TclException(interp,
0873: "invalid range input: incorrect list size");
0874: }
0875: if ((listLen == 0) && (scriptLen < 0)) {
0876: throw new TclException(interp,
0877: "empty range: no index or length values");
0878: }
0879:
0880: // If the range argument is null, then set 'index' to zero
0881: // and 'length' to the string length of the script. Otherwise
0882: // extract 'index' and 'length' from the list. If index or length
0883: // is < 0 then set it to 0, if index or length is > then the scripts
0884: // length, set it to the end of the script.
0885:
0886: if (listLen == 0) {
0887: result.indexPtr = 0;
0888: result.lengthPtr = scriptLen;
0889: } else {
0890: int len;
0891: String bytes;
0892: itemPtr = TclList.index(interp, rangePtr, 0);
0893: result.indexPtr = TclInteger.get(interp, itemPtr);
0894: itemPtr = TclList.index(interp, rangePtr, 1);
0895: bytes = itemPtr.toString();
0896: len = bytes.length();
0897:
0898: if (bytes.equals("end")) {
0899: result.lengthPtr = scriptLen;
0900: } else {
0901: result.lengthPtr = TclInteger.get(interp, itemPtr);
0902: }
0903: if (scriptLen >= 0) {
0904: if (result.indexPtr < 0) {
0905: result.indexPtr = 0;
0906: }
0907: if (result.lengthPtr < 0) {
0908: result.lengthPtr = 0;
0909: }
0910: if (result.indexPtr >= scriptLen) {
0911: result.indexPtr = scriptLen;
0912: }
0913: if (result.indexPtr + result.lengthPtr >= scriptLen) {
0914: result.lengthPtr = scriptLen - result.indexPtr;
0915: }
0916: }
0917: }
0918: return;
0919: }
0920:
0921: } // end class TclParser
0922:
0923: // This class is used to map UTF8 oriented byte indexes used in
0924: // the Tcl API for the parser extension into character oriented
0925: // index used within Jacl.
0926:
0927: // String "Foo\u00c7bar"
0928: // Chars:  0123     456
0929:
0930: // Bytes:       charToByteIndex   byteToCharIndex
0931: // [0] = 'F'    [0] = 0           [0] = 0
0932: // [1] = 'o'    [1] = 1           [1] = 1
0933: // [2] = 'o'    [2] = 2           [2] = 2
0934: // [3] = '?'    [3] = 3           [3] = 3
0935: // [4] = '?'                      [4] = 3
0936: // [5] = 'b'    [4] = 5           [5] = 4
0937: // [6] = 'a'    [5] = 6           [6] = 5
0938: // [7] = 'r'    [6] = 7           [7] = 6
0939:
0940: class UTF8CharPointer extends CharPointer implements InternalRep {
0941: int[] charToByteIndex; // Map char index to byte index
0942: int[] byteToCharIndex; // Map byte index to char index
0943: byte[] bytes;
0944: String orig;
0945:
// Wraps the string s. The original string is kept in orig (it is used to
// extract substrings when every char is a single byte) and the byte/char
// index maps are computed immediately by getByteInfo().
UTF8CharPointer(String s) {
    super(s);
    this.orig = s;
    getByteInfo();
}
0951:
0952: void getByteInfo() {
0953: int charIndex, byteIndex, bytesThisChar, bytesTotal;
0954:
0955: try {
0956: // First, loop over the characters to see if each of the characters
0957: // can be represented as a single UTF8 byte. In this special
0958: // case there is no need to worry about mapping bytes to charaters
0959: // or vice versa.
0960:
0961: char c;
0962: boolean singleBytes = true;
0963:
0964: for (int i = 0; i < array.length; i++) {
0965: c = array[i];
0966: if (c == '\0') {
0967: // Ignore encoding issues related to null byte in Java vs UTF8
0968: bytesThisChar = 1;
0969: } else {
0970: bytesThisChar = StringCmd.Utf8Count(c);
0971: }
0972: if (bytesThisChar != 1) {
0973: singleBytes = false;
0974: break;
0975: }
0976: }
0977:
0978: // When each character maps to a single byte, bytes is null
0979:
0980: if (singleBytes) {
0981: bytes = null;
0982: return;
0983: }
0984:
0985: // When multiple byte UTF8 characters are found, convert to
0986: // a byte array and save mapping info.
0987:
0988: String chars = new String(array); // Get string including trailing null
0989: bytes = chars.getBytes("UTF8");
0990:
0991: if (chars == null) { // For debugging only
0992: System.out.println("chars is \"" + chars + "\" len = "
0993: + chars.length());
0994: String bstr = new String(bytes, 0, bytes.length, "UTF8");
0995: System.out.println("bytes is \"" + bstr + "\" len = "
0996: + bytes.length);
0997: }
0998:
0999: // Count UTF8 bytes for each character, map char to byte index
1000:
1001: charToByteIndex = new int[array.length];
1002:
1003: for (charIndex = 0, byteIndex = 0; charIndex < charToByteIndex.length; charIndex++) {
1004: charToByteIndex[charIndex] = byteIndex;
1005:
1006: c = array[charIndex];
1007: if (c == '\0') {
1008: // Ignore encoding issues related to null byte in Java vs UTF8
1009: bytesThisChar = 1;
1010: } else {
1011: bytesThisChar = StringCmd.Utf8Count(c);
1012: }
1013: byteIndex += bytesThisChar;
1014: }
1015:
1016: // Double check that the number of expected bytes
1017: // was generated.
1018: bytesTotal = byteIndex;
1019:
1020: if (bytes.length != bytesTotal) {
1021: throw new TclRuntimeError("generated " + bytes.length
1022: + " but expected to generate " + bytesTotal
1023: + " bytes");
1024: }
1025:
1026: // Count Utf8 bytes for each character, map byte to char index
1027:
1028: byteToCharIndex = new int[bytes.length];
1029: for (charIndex = 0, byteIndex = 0, bytesThisChar = 0; byteIndex < byteToCharIndex.length; byteIndex++, bytesThisChar--) {
1030: if (byteIndex > 0 && bytesThisChar == 0) {
1031: charIndex++;
1032: }
1033: byteToCharIndex[byteIndex] = charIndex;
1034:
1035: c = array[charIndex];
1036: if (bytesThisChar == 0) {
1037: if (c == '\0') {
1038: // Ignore encoding issues related to null byte in Java vs UTF8
1039: bytesThisChar = 1;
1040: } else {
1041: bytesThisChar = StringCmd.Utf8Count(c);
1042: }
1043: }
1044: }
1045: } catch (java.io.UnsupportedEncodingException ex) {
1046: throw new TclRuntimeError("UTF8 encoding not supported");
1047: }
1048: }
1049:
1050: // Return bytes in the given byte range as a String
1051:
1052: String getByteRangeAsString(int byteIndex, int byteLength) {
1053: if (bytes == null) {
1054: // One byte for each character
1055: return orig.substring(byteIndex, byteIndex + byteLength);
1056: }
1057:
1058: try {
1059: return new String(bytes, byteIndex, byteLength, "UTF8");
1060: } catch (java.io.UnsupportedEncodingException ex) {
1061: throw new TclRuntimeError("UTF8 encoding not supported");
1062: }
1063: }
1064:
1065: // Convert char index into a byte index.
1066:
1067: int getByteIndex(int charIndex) {
1068: if (bytes == null) {
1069: // One byte for each character
1070: return charIndex;
1071: }
1072:
1073: return charToByteIndex[charIndex];
1074: }
1075:
1076: // Given a char index and range, return the number of
1077: // bytes in the range.
1078:
1079: int getByteRange(int charIndex, int charRange) {
1080: if (bytes == null) {
1081: // One byte for each character
1082: return charRange;
1083: }
1084:
1085: return charToByteIndex[charIndex + charRange]
1086: - charToByteIndex[charIndex];
1087: }
1088:
1089: // Get number of bytes for the given char index
1090:
1091: int getBytesAtIndex(int charIndex) {
1092: if (bytes == null) {
1093: // One byte for each character
1094: return 1;
1095: }
1096:
1097: return charToByteIndex[charIndex + 1]
1098: - charToByteIndex[charIndex];
1099: }
1100:
1101: // Return length of script in bytes
1102:
1103: int getByteLength() {
1104: if (bytes == null) {
1105: // One byte for each character
1106: return orig.length();
1107: }
1108:
1109: return bytes.length - 1;
1110: }
1111:
1112: // Given a byte index, return the char index.
1113:
1114: int getCharIndex(int byteIndex) {
1115: if (bytes == null) {
1116: // One byte for each character
1117: return byteIndex;
1118: }
1119:
1120: return byteToCharIndex[byteIndex];
1121: }
1122:
1123: // Given a byte index and range, return the number of
1124: // chars in the range.
1125:
1126: int getCharRange(int byteIndex, int byteRange) {
1127: if (bytes == null) {
1128: // One byte for each character
1129: return byteRange;
1130: }
1131:
1132: return byteToCharIndex[byteIndex + byteRange]
1133: - byteToCharIndex[byteIndex];
1134: }
1135:
1136: // This API is used for debugging, it would never be invoked as part
1137: // of the InternalRep interface since a TclObject would always have
1138: // a string rep when the UTF8CharPointer is created and it should
1139: // never be invalidated.
1140:
1141: public String toString() {
1142: if (bytes == null) {
1143: // One byte for each character
1144: return "1 byte for each character with length "
1145: + orig.length();
1146: }
1147:
1148: StringBuffer sb = new StringBuffer();
1149:
1150: int max_char = array.length - 1;
1151: int max_byte = bytes.length - 1;
1152: int max = max_char;
1153: if (max_byte > max) {
1154: max = max_byte;
1155: }
1156: sb.append("index char/byte array: (sizes = " + max_char + " "
1157: + max_byte + ")\n");
1158:
1159: for (int i = 0; i < max; i++) {
1160: String char_ind = " ", byte_ind = " ";
1161: if (i < max_char) {
1162: char_ind = "'" + array[i] + "'";
1163: }
1164: if (i < max_byte) {
1165: byte_ind = "'" + ((char) bytes[i]) + "'";
1166: }
1167:
1168: sb.append("[" + i + "] = " + char_ind + " " + byte_ind
1169: + "\n");
1170: }
1171: sb.append("\n");
1172:
1173: sb.append("charToByteIndex array:\n");
1174: for (int i = 0; i < charToByteIndex.length - 1; i++) {
1175: sb.append("[" + i + "] = " + charToByteIndex[i] + "\n");
1176: }
1177: sb.append("\n");
1178:
1179: sb.append("byteToCharIndex array:\n");
1180: for (int i = 0; i < byteToCharIndex.length - 1; i++) {
1181: sb.append("[" + i + "] = " + byteToCharIndex[i] + "\n");
1182: }
1183: sb.append("\n");
1184:
1185: return sb.toString();
1186: }
1187:
1188: // InternalRep interfaces
1189:
1190: // Called to free any storage for the type's internal rep.
1191:
1192: public void dispose() {
1193: }
1194:
1195: // duplicate
1196:
1197: public InternalRep duplicate() {
1198: // A UTF8CharPointer is read-only, so just dup the ref
1199: return this;
1200: }
1201:
1202: }
|