0001: /*
0002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
0003: *
0004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
0005: *
0006: * The contents of this file are subject to the terms of either the GNU
0007: * General Public License Version 2 only ("GPL") or the Common
0008: * Development and Distribution License("CDDL") (collectively, the
0009: * "License"). You may not use this file except in compliance with the
0010: * License. You can obtain a copy of the License at
0011: * http://www.netbeans.org/cddl-gplv2.html
0012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
0013: * specific language governing permissions and limitations under the
0014: * License. When distributing the software, include this License Header
0015: * Notice in each file and include the License file at
0016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
0017: * particular file as subject to the "Classpath" exception as provided
0018: * by Sun in the GPL Version 2 section of the License file that
0019: * accompanied this code. If applicable, add the following below the
0020: * License Header, with the fields enclosed by brackets [] replaced by
0021: * your own identifying information:
0022: * "Portions Copyrighted [year] [name of copyright owner]"
0023: *
0024: * Contributor(s):
0025: *
0026: * The Original Software is NetBeans. The Initial Developer of the Original
0027: * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
0028: * Microsystems, Inc. All Rights Reserved.
0029: *
0030: * If you wish your version of this file to be governed by only the CDDL
0031: * or only the GPL Version 2, indicate your decision by adding
0032: * "[Contributor] elects to include this software in this distribution
0033: * under the [CDDL or GPL Version 2] license." If you do not indicate a
0034: * single choice of license, a recipient has the option to distribute
0035: * your version of this file under either the CDDL, the GPL Version 2 or
0036: * to extend the choice of license to its licensees as provided above.
0037: * However, if you add GPL Version 2 code and therefore, elected the GPL
0038: * Version 2 license, then the option applies only if the new code is
0039: * made subject to such option by the copyright holder.
0040: */
0041:
0042: package org.netbeans.lib.jsp.lexer;
0043:
0044: import java.util.logging.Level;
0045: import java.util.logging.Logger;
0046: import org.netbeans.api.jsp.lexer.JspTokenId;
0047: import org.netbeans.api.lexer.InputAttributes;
0048: import org.netbeans.api.lexer.LanguagePath;
0049: import org.netbeans.api.lexer.PartType;
0050: import org.netbeans.api.lexer.Token;
0051: import org.netbeans.spi.jsp.lexer.JspParseData;
0052: import org.netbeans.spi.lexer.Lexer;
0053: import org.netbeans.spi.lexer.LexerInput;
0054: import org.netbeans.spi.lexer.LexerRestartInfo;
0055: import org.netbeans.spi.lexer.TokenFactory;
0056: import org.netbeans.spi.lexer.TokenPropertyProvider;
0057:
0058: /**
0059: * Syntax class for JSP tags.
0060: *
0061: * @author Petr Jiricka
0062: * @author Marek Fukala
0063: *
0064: * @version 1.00
0065: */
0066:
0067: public class JspLexer implements Lexer<JspTokenId> {
0068:
0069: private static final Logger LOGGER = Logger
0070: .getLogger(JspLexer.class.getName());
0071: private static final boolean LOG = Boolean
0072: .getBoolean("j2ee_lexer_debug"); //NOI18N
0073:
0074: private static final int EOF = LexerInput.EOF;
0075:
0076: private static final String JSP_STANDART_TAG_PREFIX = "jsp:";
0077:
0078: private final LexerInput input;
0079:
0080: private final InputAttributes inputAttributes;
0081: private final JspParseData jspParseData;
0082:
0083: private final TokenFactory<JspTokenId> tokenFactory;
0084:
0085: public Object state() {
0086: return lexerState + lexerStateBeforeEL * 1000
0087: + lexerStateJspScriptlet * 1000000;
0088: }
0089:
0090: //main internal lexer state
0091: private int lexerState = INIT;
0092:
0093: //secondary internal state for EL expressions in JSP
0094: //it is used to eliminate a number of lexer states when EL is found -
0095: //there are 8 states just for attribute values, so the EL
0096: //recognition code would otherwise have to be copied eight times.
0097: private int lexerStateBeforeEL = INIT;
0098:
0099: //internal state signalling whether the lexer is in <jsp:scriptlet> tag
0100: private int lexerStateJspScriptlet = INIT;
0101:
0102: // Internal analyzer states
0103: // general
0104: private static final int INIT = 0; // initial lexer state = content language
0105: private static final int ISI_ERROR = 1; // when the fragment does not start with <
0106: private static final int ISA_LT = 2; // after '<' char
0107: // tags and directives
0108: private static final int ISI_TAGNAME = 3; // inside JSP tag name
0109: private static final int ISI_DIRNAME = 4; // inside JSP directive name
0110: private static final int ISP_TAG = 5; // after JSP tag name
0111: private static final int ISP_DIR = 6; // after JSP directive name
0112: private static final int ISI_TAG_I_WS = 7; // inside JSP tag after whitespace
0113: private static final int ISI_DIR_I_WS = 8; // inside JSP directive after whitespace
0114: private static final int ISI_ENDTAG = 9; // inside end JSP tag
0115: private static final int ISI_TAG_ATTR = 10; // inside tag attribute
0116: private static final int ISI_DIR_ATTR = 11; // inside directive attribute
0117: private static final int ISP_TAG_EQ = 12; // just after '=' in tag
0118: private static final int ISP_DIR_EQ = 13; // just after '=' in directive
0119: private static final int ISI_TAG_STRING = 14; // inside string (value - "") in tag
0120: private static final int ISI_DIR_STRING = 15; // inside string (value - "") in directive
0121: private static final int ISI_TAG_STRING_B = 16; // inside string (value - "") after backslash in tag
0122: private static final int ISI_DIR_STRING_B = 17; // inside string (value - "") after backslash in directive
0123: private static final int ISI_TAG_STRING2 = 18; // inside string (value - '') in tag
0124: private static final int ISI_DIR_STRING2 = 19; // inside string (value - '') in directive
0125: private static final int ISI_TAG_STRING2_B = 20; // inside string (value - '') after backslash in tag
0126: private static final int ISI_DIR_STRING2_B = 21; // inside string (value - '') after backslash in directive
0127: private static final int ISA_ENDSLASH = 22; // after ending '/' in JSP tag
0128: private static final int ISA_ENDPC = 23; // after ending '%' in JSP directive
0129: // comments (+directives)
0130: private static final int ISA_LT_PC = 24; // after '<%' - comment or directive or scriptlet
0131: private static final int ISI_JSP_COMMENT = 25; // after <%-
0132:
0133: private static final int ISI_JSP_COMMENT_M = 26; // inside JSP comment after -
0134: private static final int ISI_JSP_COMMENT_MM = 27; // inside JSP comment after --
0135: private static final int ISI_JSP_COMMENT_MMP = 28; // inside JSP comment after --%
0136: // end state
0137: // static final int ISA_END_JSP = 29; // JSP fragment has finished and control
0138: // should be returned to master syntax
0139: // more errors
0140: private static final int ISI_TAG_ERROR = 30; // error in tag, can be cleared by > or \n
0141: private static final int ISI_DIR_ERROR = 31; // error in directive, can be cleared by %>, \n, \t or space
0142: private static final int ISI_DIR_ERROR_P = 32; // error in directive after %, can be cleared by > or \n
0143:
0144: private static final int ISA_LT_PC_AT = 33; // after '<%@' (directive)
0145: private static final int ISA_LT_SLASH = 34; // after '</' sequence
0146: private static final int ISA_LT_PC_DASH = 35; // after <%- ;not comment yet
0147:
0148: private static final int ISI_SCRIPTLET = 36; // inside java scriptlet/declaration/expression
0149: private static final int ISP_SCRIPTLET_PC = 37; // just after % in scriptlet
0150:
0151: //expression language
0152:
0153: //EL in content language
0154: private static final int ISA_EL_DELIM = 38; //after $ or # in content language
0155: private static final int ISI_EL = 39; //expression language in content (after ${ or #{ )
0156:
0157: private static final int ISA_BS = 40; //after backslash in text - needed to disable EL by scaping # or $
0158:
0159: //scriptlet substate states
0160: //in standard syntax jsp
0161: private static final int JAVA_SCRITPLET = 1; //java scriptlet
0162: private static final int JAVA_DECLARATION = 2; //java declaration
0163: private static final int JAVA_EXPRESSION = 3; //java expression
0164: //in xml syntax jsp (jsp document)
0165: private static final int JAVA_SCRITPLET_DOCUMENT = 4; //java scriptlet in JSP document
0166: private static final int JAVA_DECLARATION_DOCUMENT = 5; //java declaration in JSP document
0167: private static final int JAVA_EXPRESSION_DOCUMENT = 6; //java expression in JSP document
0168:
0169: public JspLexer(LexerRestartInfo<JspTokenId> info) {
0170: this .input = info.input();
0171: this .inputAttributes = info.inputAttributes();
0172: this .tokenFactory = info.tokenFactory();
0173: if (info.state() == null) {
0174: lexerState = INIT;
0175: lexerStateBeforeEL = INIT;
0176: lexerStateJspScriptlet = INIT;
0177: } else {
0178: int encoded = ((Integer) info.state()).intValue();
0179: lexerStateJspScriptlet = encoded / 1000000;
0180: int reminder = encoded % 1000000;
0181: lexerStateBeforeEL = reminder / 1000;
0182: lexerState = encoded % 1000;
0183: }
0184: if (inputAttributes != null) {
0185: jspParseData = (JspParseData) inputAttributes.getValue(
0186: LanguagePath.get(JspTokenId.language()),
0187: JspParseData.class);
0188: } else {
0189: jspParseData = null;
0190: }
0191: }
0192:
0193: public boolean isIdentifierPart(char character) {
0194: return Character.isJavaIdentifierPart(character);
0195: }
0196:
0197: /** Determines whether a given string is a JSP tag. */
0198: private boolean isJspTag(CharSequence tagName) {
0199: if (startsWith(tagName, JSP_STANDART_TAG_PREFIX)) { // NOI18N
0200: return true;
0201: }
0202:
0203: //TODO handle custom tags from JSP parser here
0204: if (jspParseData != null) {
0205: int colonIndex = indexOf(tagName, ':');//NOI18N
0206: if (colonIndex != -1) {
0207: CharSequence prefix = tagName
0208: .subSequence(0, colonIndex);
0209: return jspParseData.isTagLibRegistered(prefix
0210: .toString());
0211: }
0212: }
0213:
0214: return false;
0215: }
0216:
0217: private boolean startsWith(CharSequence text, CharSequence prefix) {
0218: if (text.length() < prefix.length()) {
0219: return false;
0220: }
0221:
0222: for (int i = 0; i < prefix.length(); i++) {
0223: if (text.charAt(i) != prefix.charAt(i)) {
0224: return false;
0225: }
0226: }
0227:
0228: return true;
0229: }
0230:
0231: private int indexOf(CharSequence text, char ch) {
0232: for (int i = 0; i < text.length(); i++) {
0233: if (text.charAt(i) == ch) {
0234: return i;
0235: }
0236: }
0237: return -1;
0238: }
0239:
0240: private boolean isELIgnored() {
0241: return jspParseData == null ? false : jspParseData
0242: .isELIgnored();
0243: }
0244:
0245: private boolean isXMLSyntax() {
0246: return jspParseData == null ? false : jspParseData
0247: .isXMLSyntax();
0248: }
0249:
0250: private CharSequence getPossibleTagName() {
0251: int actChar;
0252: int prev_read = input.readLength(); //remember the size of the read sequence
0253: int read = 0;
0254: while (true) {
0255: actChar = input.read();
0256: read++;
0257: if (!(Character.isLetter(actChar)
0258: || Character.isDigit(actChar) || (actChar == '_')
0259: || (actChar == '-') || (actChar == ':')
0260: || (actChar == '.') || (actChar == '/'))
0261: || (actChar == EOF)) { // EOL or not alpha
0262: //end of tagname
0263: CharSequence tagName = input.readText().subSequence(
0264: prev_read, prev_read + read - 1);
0265: input.backup(read); //put the lookahead text back to the buffer
0266: return tagName;
0267: }
0268: }
0269: }
0270:
0271: /** Looks ahead into the character buffer and checks if a jsp tag name follows. */
0272: private boolean followsJspTag() {
0273: return isJspTag(getPossibleTagName());
0274: }
0275:
0276: public Token<JspTokenId> nextToken() {
0277: int actChar;
0278: while (true) {
0279: actChar = input.read();
0280:
0281: if (actChar == EOF) {
0282: if (input.readLengthEOF() == 1) {
0283: return null; //just EOL is read
0284: } else {
0285: //there is something else in the buffer except EOL
0286: //we will return last token now
0287: input.backup(1); //backup the EOL, we will return null in next nextToken() call
0288: break;
0289: }
0290: }
0291:
0292: switch (lexerState) {
0293: case INIT:
0294: switch (actChar) {
0295: // case '\n':
0296: // return token(JspTokenId.EOL);
0297: case '<':
0298: lexerState = ISA_LT;
0299: break;
0300: // default:
0301: // state = ISI_ERROR;
0302: // break;
0303: case '\\':
0304: lexerState = ISA_BS;
0305: break;
0306: case '$':
0307: case '#': //maybe expression language
0308: lexerStateBeforeEL = lexerState; //remember main state
0309: lexerState = ISA_EL_DELIM;
0310: break;
0311: }
0312: break;
0313:
0314: case ISA_BS:
0315: if (actChar != '\\') {
0316: lexerState = INIT; //prevent scaped EL in text being recognized
0317: }
0318: break;
0319:
0320: case ISA_EL_DELIM:
0321: if (isELIgnored()) {
0322: //reset to previous state - do not recognize EL
0323: lexerState = lexerStateBeforeEL;
0324: lexerStateBeforeEL = INIT;
0325: } else {
0326: switch (actChar) {
0327: case '{':
0328: if (input.readLength() > 2) {
0329: //we have something read except the '${' or '#{' => it's content language
0330: input.backup(2); //backup the '$/#{'
0331: lexerState = lexerStateBeforeEL; //we will read the '$/#{' again
0332: lexerStateBeforeEL = INIT;
0333: return token(JspTokenId.TEXT); //return the content language token
0334: }
0335: lexerState = ISI_EL;
0336: break;
0337: default:
0338: input.backup(1); //put the read char back
0339: lexerState = lexerStateBeforeEL;
0340: lexerStateBeforeEL = INIT;
0341: }
0342: }
0343: break;
0344:
0345: case ISI_EL:
0346: if (actChar == '}') {
0347: //return EL token
0348: lexerState = lexerStateBeforeEL;
0349: lexerStateBeforeEL = INIT;
0350: return token(JspTokenId.EL);
0351: }
0352: //stay in EL
0353: break;
0354:
0355: case ISA_LT:
0356: if (Character.isLetter(actChar) || (actChar == '_')) { // possible tag begining
0357: input.backup(1); //backup the read letter
0358: CharSequence tagName = getPossibleTagName();
0359: if (isJspTag(tagName)) { //test if a jsp tag follows
0360: if (input.readLength() > 1) {
0361: //we have something read except the '<' => it's content language
0362: input.backup(1); //backup the '<'
0363: lexerState = INIT; //we will read the '<' again
0364: return token(JspTokenId.TEXT); //return the content language token
0365: }
0366: //possibly switch to scriptlet when <jsp:scriptlet> found
0367:
0368: if ("jsp:scriptlet".equals(tagName)) { //NOI18N
0369: lexerStateJspScriptlet = JAVA_SCRITPLET_DOCUMENT;
0370: } else if ("jsp:declaration".equals(tagName)) { //NOI18N
0371: lexerStateJspScriptlet = JAVA_DECLARATION_DOCUMENT;
0372: } else if ("jsp:expression".equals(tagName)) { //NOI18N
0373: lexerStateJspScriptlet = JAVA_EXPRESSION_DOCUMENT;
0374: }
0375:
0376: lexerState = ISI_TAGNAME;
0377: return token(JspTokenId.SYMBOL); //return the read '<' symbol first
0378: } else {
0379: //just a content language
0380: lexerState = INIT;
0381: break;
0382: }
0383: }
0384:
0385: switch (actChar) {
0386: case '/':
0387: lexerState = ISA_LT_SLASH;
0388: break;
0389: // case '\n':
0390: // state = ISI_TAG_ERROR;
0391: // input.backup(1);
0392: // return token(JspTokenId.SYMBOL);
0393: case '%':
0394: lexerState = ISA_LT_PC;
0395: break;
0396: default:
0397: lexerState = INIT; //just content
0398: // state = ISI_TAG_ERROR;
0399: // break;
0400: }
0401: break;
0402:
0403: case ISA_LT_SLASH:
0404: if (Character.isLetter(actChar) || (actChar == '_')) {
0405: //possible end tag beginning
0406: input.backup(1); //backup the first letter
0407: if (followsJspTag()) {
0408: if (input.readLength() > 2) {
0409: //we have something read except the '</' symbol
0410: input.backup(2);
0411: lexerState = INIT;
0412: return token(JspTokenId.TEXT);
0413: } else {
0414: lexerState = ISI_ENDTAG;
0415: return token(JspTokenId.SYMBOL); //return the read '</' symbol first
0416: }
0417: //break;
0418: } else {
0419: //just a content language
0420: lexerState = INIT;
0421: break;
0422: }
0423: }
0424:
0425: //not jsp end tag -> just content -> switch to init state
0426: lexerState = INIT;
0427: break;
0428:
0429: case ISI_TAGNAME:
0430: case ISI_DIRNAME:
0431:
0432: if (!(Character.isLetter(actChar)
0433: || Character.isDigit(actChar)
0434: || (actChar == '_') || (actChar == '-')
0435: || (actChar == ':') || (actChar == '.'))) { // not alpha
0436: switch (actChar) {
0437: case '<':
0438: lexerState = INIT;
0439: input.backup(1);
0440: break;
0441: case '/':
0442: input.backup(1);
0443: lexerState = ((lexerState == ISI_TAGNAME) ? ISP_TAG
0444: : ISP_DIR);
0445: break;
0446: case '>':
0447: if (lexerStateJspScriptlet != INIT) {
0448: //switch to java scriptlet
0449: lexerState = ISI_SCRIPTLET;
0450: } else {
0451: input.backup(1); //backup the '<' char
0452: lexerState = ((lexerState == ISI_TAGNAME) ? ISP_TAG
0453: : ISP_DIR);
0454: }
0455: break;
0456: case ' ':
0457: input.backup(1);
0458: lexerState = ((lexerState == ISI_TAGNAME) ? ISP_TAG
0459: : ISP_DIR);
0460: break;
0461: default:
0462: lexerState = ((lexerState == ISI_TAGNAME) ? ISP_TAG
0463: : ISP_DIR);
0464: }
0465: return token(JspTokenId.TAG);
0466: }
0467: break;
0468:
0469: case ISP_TAG:
0470: case ISP_DIR:
0471: if (Character.isLetter(actChar) || (actChar == '_')) {
0472: lexerState = ((lexerState == ISP_TAG) ? ISI_TAG_ATTR
0473: : ISI_DIR_ATTR);
0474: break;
0475: }
0476: switch (actChar) {
0477: case '\n':
0478: // if (input.readLength() == 1) { // no char
0479: return token(JspTokenId.EOL);
0480: // } else { // return string first
0481: // input.backup(1);
0482: // return decide_jsp_tag_token();
0483: // }
0484: case '>': // for tags
0485: if (lexerState == ISP_TAG) {
0486: // if (input.readLength() == 1) { // no char
0487: // state = ISA_END_JSP;
0488: lexerState = INIT;
0489: return token(JspTokenId.SYMBOL);
0490: // } else { // return string first
0491: // input.backup(1);
0492: // return decide_jsp_tag_token();
0493: // }
0494: } else { // directive
0495: lexerState = ISI_DIR_ERROR;
0496: break;
0497: }
0498: case '/': // for tags
0499: if (lexerState == ISP_TAG) {
0500: // if (input.readLength() == 1) { // no char
0501: lexerState = ISA_ENDSLASH;
0502: break;
0503: // } else { // return string first
0504: // input.backup(1);
0505: // return decide_jsp_tag_token();
0506: // }
0507: } else { // directive
0508: lexerState = ISI_DIR_ERROR;
0509: break;
0510: }
0511: case '%': // for directives
0512: if (lexerState == ISP_DIR) {
0513: // if (input.readLength() == 1) { // no char
0514: lexerState = ISA_ENDPC;
0515: break;
0516: // } else { // return string first
0517: // input.backup(1);
0518: // return decide_jsp_tag_token();
0519: // }
0520: } else { // tag
0521: lexerState = ISI_TAG_ERROR;
0522: break;
0523: }
0524: case '=':
0525: lexerState = ((lexerState == ISP_TAG) ? ISP_TAG_EQ
0526: : ISP_DIR_EQ);
0527: return token(JspTokenId.SYMBOL);
0528: case ' ':
0529: case '\t':
0530: lexerState = ((lexerState == ISP_TAG) ? ISI_TAG_I_WS
0531: : ISI_DIR_I_WS);
0532: break;
0533: case '<':
0534: // assume that this is the start of the next tag
0535: //we shouldn't have anything else than then the < char in buffer
0536: assert input.readLength() == 1 : "There is something more than '<' char in the read text: '"
0537: + input.readText() + "'"; //NOI18N
0538: input.backup(1);
0539: lexerState = INIT;
0540: default: //numbers or illegal symbols
0541: lexerState = ((lexerState == ISP_TAG) ? ISI_TAG_ERROR
0542: : ISI_DIR_ERROR);
0543: break;
0544: }
0545: break;
0546:
0547: case ISI_TAG_I_WS:
0548: case ISI_DIR_I_WS:
0549: switch (actChar) {
0550: case ' ':
0551: case '\t':
0552: break;
0553: case '<': //start of the next tag
0554: // state = ISA_END_JSP;
0555: lexerState = INIT;
0556: input.backup(1);
0557: return token(JspTokenId.TAG);
0558: default:
0559: lexerState = ((lexerState == ISI_TAG_I_WS) ? ISP_TAG
0560: : ISP_DIR);
0561: input.backup(1);
0562: return token(JspTokenId.WHITESPACE);
0563: }
0564: break;
0565:
0566: case ISI_ENDTAG:
0567: if (!(Character.isLetter(actChar)
0568: || Character.isDigit(actChar)
0569: || (actChar == '_') || (actChar == '-') || (actChar == ':'))) { // not alpha
0570: lexerState = ISP_TAG;
0571: input.backup(1);
0572: return token(JspTokenId.ENDTAG);
0573: }
0574: break;
0575:
0576: case ISI_TAG_ATTR:
0577: case ISI_DIR_ATTR:
0578: if (!(Character.isLetter(actChar)
0579: || Character.isDigit(actChar)
0580: || (actChar == '_') || (actChar == ':') || (actChar == '-'))) { // not alpha or '-' (http-equiv)
0581: lexerState = ((lexerState == ISI_TAG_ATTR) ? ISP_TAG
0582: : ISP_DIR);
0583: input.backup(1);
0584: return token(JspTokenId.ATTRIBUTE);
0585: }
0586: break;
0587:
0588: case ISP_TAG_EQ:
0589: case ISP_DIR_EQ:
0590: switch (actChar) {
0591: case '\n':
0592: // if (input.readLength() == 1) { // no char
0593: return token(JspTokenId.EOL);
0594: // } else { // return string first
0595: // input.backup(1);
0596: // return token(JspTokenId.ATTR_VALUE);
0597: // }
0598: case '"':
0599: lexerState = ((lexerState == ISP_TAG_EQ) ? ISI_TAG_STRING
0600: : ISI_DIR_STRING);
0601: break;
0602: case '\'':
0603: lexerState = ((lexerState == ISP_TAG_EQ) ? ISI_TAG_STRING2
0604: : ISI_DIR_STRING2);
0605: break;
0606: case ' ':
0607: case '\t':
0608: // don't change the state
0609: break;
0610: default:
0611: //invalid value - lets backup it and swith to tag content
0612: lexerState = ((lexerState == ISP_TAG_EQ) ? ISP_TAG
0613: : ISP_DIR);
0614: input.backup(input.readLength());
0615: break;
0616: }
0617: break;
0618:
0619: case ISI_TAG_STRING:
0620: case ISI_DIR_STRING:
0621: case ISI_TAG_STRING2:
0622: case ISI_DIR_STRING2:
0623: if ((actChar == '"')
0624: && ((lexerState == ISI_TAG_STRING) || (lexerState == ISI_DIR_STRING))) {
0625: lexerState = ((lexerState == ISI_TAG_STRING) ? ISP_TAG
0626: : ISP_DIR);
0627: return token(JspTokenId.ATTR_VALUE);
0628: }
0629:
0630: if ((actChar == '\'')
0631: && ((lexerState == ISI_TAG_STRING2) || (lexerState == ISI_DIR_STRING2))) {
0632: lexerState = ((lexerState == ISI_TAG_STRING2) ? ISP_TAG
0633: : ISP_DIR);
0634: return token(JspTokenId.ATTR_VALUE);
0635: }
0636:
0637: switch (actChar) {
0638: case '\\':
0639: switch (lexerState) {
0640: case ISI_TAG_STRING:
0641: lexerState = ISI_TAG_STRING_B;
0642: break;
0643: case ISI_DIR_STRING:
0644: lexerState = ISI_DIR_STRING_B;
0645: break;
0646: case ISI_TAG_STRING2:
0647: lexerState = ISI_TAG_STRING2_B;
0648: break;
0649: case ISI_DIR_STRING2:
0650: lexerState = ISI_DIR_STRING2_B;
0651: break;
0652: }
0653: break;
0654: case '\n':
0655: if (input.readLength() == 1) { // no char
0656: return token(JspTokenId.EOL);
0657: } else { // return string first
0658: input.backup(1);
0659: return token(JspTokenId.ATTR_VALUE);
0660: }
0661: case '$':
0662: case '#':
0663: if (input.readLength() > 1) {
0664: //return part of the attribute value before EL
0665: input.backup(1); //backup $ or #
0666: return token(JspTokenId.ATTR_VALUE);
0667: } else {
0668: lexerStateBeforeEL = lexerState; //remember main state
0669: lexerState = ISA_EL_DELIM;
0670: }
0671: break;
0672:
0673: default:
0674: break;//stay in ISI_TAG_STRING/2;
0675:
0676: }
0677: break;
0678:
0679: case ISI_TAG_STRING_B:
0680: case ISI_DIR_STRING_B:
0681: case ISI_TAG_STRING2_B:
0682: case ISI_DIR_STRING2_B:
0683: switch (actChar) {
0684: case '"':
0685: case '\'':
0686: case '\\':
0687: case '$':
0688: case '#':
0689: break;
0690: default:
0691: input.backup(1);
0692: break;
0693: }
0694: switch (lexerState) {
0695: case ISI_TAG_STRING_B:
0696: lexerState = ISI_TAG_STRING;
0697: break;
0698: case ISI_DIR_STRING_B:
0699: lexerState = ISI_DIR_STRING;
0700: break;
0701: case ISI_TAG_STRING2_B:
0702: lexerState = ISI_TAG_STRING2;
0703: break;
0704: case ISI_DIR_STRING2_B:
0705: lexerState = ISI_DIR_STRING2;
0706: break;
0707: }
0708: break;
0709:
0710: case ISA_ENDSLASH:
0711: switch (actChar) {
0712: case '>':
0713: // state = ISA_END_JSP;
0714: lexerState = INIT;
0715: return token(JspTokenId.SYMBOL);
0716: case '\n':
0717: lexerState = ISI_TAG_ERROR;
0718: input.backup(1);
0719: return token(JspTokenId.SYMBOL);
0720: default:
0721: lexerState = ISP_TAG;
0722: input.backup(1);
0723: return token(JspTokenId.SYMBOL);
0724: }
0725: //break; not reached
0726:
0727: case ISA_ENDPC:
0728: switch (actChar) {
0729: case '>':
0730: // state = ISA_END_JSP;
0731: lexerState = INIT;
0732: return token(JspTokenId.SYMBOL);
0733: case '\n':
0734: lexerState = ISI_DIR_ERROR;
0735: input.backup(1);
0736: return token(JspTokenId.SYMBOL);
0737: default:
0738: lexerState = ISP_DIR;
0739: input.backup(1);
0740: return token(JspTokenId.SYMBOL);
0741: }
0742: //break; not reached
0743:
0744: case ISA_LT_PC:
0745: switch (actChar) {
0746: case '@':
0747: if (input.readLength() == 3) {
0748: // just <%@ read
0749: lexerState = ISA_LT_PC_AT;
0750: return token(JspTokenId.SYMBOL);
0751: } else {
0752: //jsp symbol, but we also have content language in the buffer
0753: input.backup(3); //backup <%@
0754: lexerState = INIT;
0755: return token(JspTokenId.TEXT); //return CL token
0756: }
0757: case '-': //may be JSP comment
0758: lexerState = ISA_LT_PC_DASH;
0759: break;
0760: case '!': // java declaration
0761: case '=': // java expression
0762: if (input.readLength() == 3) {
0763: // just <%! or <%= read
0764: lexerStateJspScriptlet = actChar == '!' ? JAVA_DECLARATION
0765: : JAVA_EXPRESSION;
0766: lexerState = ISI_SCRIPTLET;
0767: return token(JspTokenId.SYMBOL2);
0768: } else {
0769: //jsp symbol, but we also have content language in the buffer
0770: input.backup(3); //backup <%! or <%=
0771: lexerState = INIT;
0772: return token(JspTokenId.TEXT); //return CL token
0773: }
0774: default: //java scriptlet delimiter '<%'
0775: if (input.readLength() == 3) {
0776: // just <% + something != [-,!,=,@] read
0777: lexerStateJspScriptlet = JAVA_SCRITPLET;
0778: lexerState = ISI_SCRIPTLET;
0779: input.backup(1); //backup the third character, it is a part of the java scriptlet
0780: return token(JspTokenId.SYMBOL2);
0781: } else {
0782: //jsp symbol, but we also have content language in the buffer
0783: input.backup(3); //backup <%@
0784: lexerState = INIT;
0785: return token(JspTokenId.TEXT); //return CL token
0786: }
0787: }
0788: break;
0789:
0790: case ISI_SCRIPTLET:
0791: switch (actChar) {
0792: case '%':
0793: lexerState = ISP_SCRIPTLET_PC;
0794: break;
0795: case '<':
0796: //may be end of scriptlet section in JSP document
0797: CharSequence tagName = getPossibleTagName();
0798: if ("/jsp:scriptlet".equals(tagName) || //NOI18N
0799: "/jsp:declaration".equals(tagName) || //NOI18N
0800: "/jsp:expression".equals(tagName)) { //NOI18N
0801: if (input.readLength() == 1) {
0802: //just the '<' symbol read
0803: input.backup(1);
0804: lexerState = INIT;
0805: } else {
0806: //return the scriptlet content
0807: input.backup(1); // backup '<' we will read it again
0808: int lxs = lexerStateJspScriptlet;
0809: lexerStateJspScriptlet = INIT;
0810: return scriptletToken(JspTokenId.SCRIPTLET,
0811: lxs);
0812: }
0813: }
0814: }
0815: break;
0816:
0817: case ISP_SCRIPTLET_PC:
0818: switch (actChar) {
0819: case '>':
0820: if (input.readLength() == 2) {
0821: //just the '%>' symbol read
0822: lexerState = INIT;
0823: lexerStateJspScriptlet = INIT;
0824: return token(JspTokenId.SYMBOL2);
0825: } else {
0826: //return the scriptlet content
0827: input.backup(2); // backup '%>' we will read JUST them again
0828: lexerState = ISI_SCRIPTLET;
0829: int lxs = lexerStateJspScriptlet;
0830: lexerStateJspScriptlet = INIT;
0831: return scriptletToken(JspTokenId.SCRIPTLET, lxs);
0832: }
0833: default:
0834: lexerState = ISI_SCRIPTLET;
0835: break;
0836: }
0837: break;
0838:
0839: case ISA_LT_PC_DASH:
0840: switch (actChar) {
0841: case '-':
0842: if (input.readLength() == 4) {
0843: //just the '<%--' symbol read
0844: lexerState = ISI_JSP_COMMENT;
0845: } else {
0846: //return the scriptlet content
0847: input.backup(4); // backup '<%--', we will read it again
0848: lexerState = INIT;
0849: return token(JspTokenId.TEXT);
0850: }
0851: break;
0852: default:
0853: // state = ISA_END_JSP;
0854: lexerState = INIT; //XXX how to handle content language?
0855: return token(JspTokenId.TEXT); //marek: should I token here????
0856: }
0857:
0858: // JSP states
0859: case ISI_JSP_COMMENT:
0860: switch (actChar) {
0861: case '\n':
0862: if (input.readLength() == 1) { // no char
0863: return token(JspTokenId.EOL);
0864: } else { // return block comment first
0865: input.backup(1);
0866: return token(JspTokenId.COMMENT);
0867: }
0868: case '-':
0869: lexerState = ISI_JSP_COMMENT_M;
0870: break;
0871: }
0872: break;
0873:
0874: case ISI_JSP_COMMENT_M:
0875: switch (actChar) {
0876: case '\n':
0877: lexerState = ISI_JSP_COMMENT;
0878: if (input.readLength() == 1) { // no char
0879: return token(JspTokenId.EOL);
0880: } else { // return block comment first
0881: input.backup(1);
0882: return token(JspTokenId.COMMENT);
0883: }
0884: case '-':
0885: lexerState = ISI_JSP_COMMENT_MM;
0886: break;
0887: default:
0888: lexerState = ISI_JSP_COMMENT;
0889: break;
0890: }
0891: break;
0892:
0893: case ISI_JSP_COMMENT_MM:
0894: switch (actChar) {
0895: case '\n':
0896: lexerState = ISI_JSP_COMMENT;
0897: if (input.readLength() == 1) { // no char
0898: return token(JspTokenId.EOL);
0899: } else { // return block comment first
0900: input.backup(1);
0901: return token(JspTokenId.COMMENT);
0902: }
0903: case '%':
0904: lexerState = ISI_JSP_COMMENT_MMP;
0905: break;
0906: case '-':
0907: lexerState = ISI_JSP_COMMENT_MM;
0908: break;
0909: default:
0910: lexerState = ISI_JSP_COMMENT;
0911: break;
0912: }
0913: break;
0914:
0915: case ISI_JSP_COMMENT_MMP:
0916: switch (actChar) {
0917: case '\n':
0918: lexerState = ISI_JSP_COMMENT;
0919: if (input.readLength() == 1) { // no char
0920: return token(JspTokenId.EOL);
0921: } else { // return block comment first
0922: input.backup(1);
0923: return token(JspTokenId.COMMENT);
0924: }
0925: case '>':
0926: // state = ISA_END_JSP;
0927: lexerState = INIT;
0928: return token(JspTokenId.COMMENT);
0929: default:
0930: lexerState = ISI_JSP_COMMENT;
0931: break;
0932: }
0933: break;
0934:
0935: case ISI_ERROR:
0936: switch (actChar) {
0937: case '\n':
0938: lexerState = INIT;
0939: input.backup(1);
0940: return token(JspTokenId.ERROR);
0941: case '<':
0942: lexerState = ISA_LT;
0943: input.backup(1);
0944: return token(JspTokenId.ERROR);
0945: }
0946: break;
0947:
0948: case ISI_TAG_ERROR:
0949: switch (actChar) {
0950: case '\n':
0951: if (input.readLength() == 1) { // no char
0952: lexerState = ISP_TAG;
0953: return token(JspTokenId.EOL);
0954: } else { // return error first
0955: input.backup(1);
0956: return token(JspTokenId.ERROR);
0957: }
0958: case '>':
0959: case ' ':
0960: case '\t':
0961: lexerState = ISP_TAG;
0962: input.backup(1);
0963: return token(JspTokenId.ERROR);
0964: default:
0965: break;
0966: }
0967: break;
0968:
0969: case ISI_DIR_ERROR:
0970: switch (actChar) {
0971: case '\n':
0972: if (input.readLength() == 1) { // no char
0973: lexerState = ISP_DIR;
0974: return token(JspTokenId.EOL);
0975: } else { // return error first
0976: input.backup(1);
0977: return token(JspTokenId.ERROR);
0978: }
0979: // case '%':
0980: case '\t':
0981: case ' ':
0982: lexerState = ISP_DIR;
0983: if (input.readLength() > 1) {
0984: input.backup(1);
0985: return token(JspTokenId.ERROR);
0986: }
0987: default:
0988: break;
0989: }
0990: break;
0991:
0992: case ISI_DIR_ERROR_P:
0993: switch (actChar) {
0994: case '\n':
0995: if (input.readLength() == 1) { // no char
0996: lexerState = ISI_DIR_I_WS;
0997: return token(JspTokenId.EOL);
0998: } else { // return error first
0999: input.backup(1);
1000: return token(JspTokenId.ERROR);
1001: }
1002: case '>':
1003: input.backup(2);
1004: lexerState = ISI_DIR_I_WS;
1005: return token(JspTokenId.ERROR);
1006: default:
1007: break;
1008: }
1009: break;
1010:
1011: // case ISA_END_JSP:
1012: // if (input.readLength() == 1) {
1013: // offset++;
1014: // return JspTokenId.AFTER_UNEXPECTED_LT;
1015: // }
1016: // else {
1017: // return JspTokenId.TEXT;
1018: // }
1019: // //break;
1020:
1021: // added states
1022: case ISA_LT_PC_AT:
1023: if (Character.isLetter(actChar) || (actChar == '_')) {
1024: // the directive starts
1025: lexerState = ISI_DIRNAME;
1026: break;
1027: }
1028:
1029: switch (actChar) {
1030: case '\n':
1031: if (input.readLength() == 1) { // no char
1032: return token(JspTokenId.EOL);
1033: } else {
1034: input.backup(1);
1035: return token(JspTokenId.SYMBOL);
1036: }
1037: case ' ':
1038: case '\t':
1039: break;
1040: case '%':
1041: lexerState = ISA_ENDPC;
1042: break;
1043: default:
1044: //error
1045: lexerState = ISI_DIR_ERROR;
1046: if (input.readLength() > 1) {
1047: input.backup(1); //backup the error char if there is something more in the buffer
1048: return token(JspTokenId.SYMBOL);
1049: }
1050: break;
1051: }
1052: break;
1053:
1054: }
1055:
1056: }
1057:
1058: // At this stage there's no more text in the scanned buffer.
1059: // Scanner first checks whether this is completely the last
1060: // available buffer.
1061:
1062: switch (lexerState) {
1063: case INIT:
1064: case ISA_BS:
1065: case ISA_LT:
1066: case ISA_LT_SLASH:
1067: if (input.readLength() == 0) {
1068: return null;
1069: } else {
1070: return token(JspTokenId.TEXT);
1071: }
1072: case ISI_ERROR:
1073: case ISI_TAG_ERROR:
1074: lexerState = INIT;
1075: return token(JspTokenId.ERROR);
1076: case ISI_DIR_ERROR:
1077: case ISI_DIR_ERROR_P:
1078: lexerState = INIT;
1079: return token(JspTokenId.ERROR);
1080: case ISA_ENDSLASH:
1081: case ISP_TAG_EQ:
1082: lexerState = INIT;
1083: return token(JspTokenId.SYMBOL);
1084: case ISA_LT_PC:
1085: case ISA_LT_PC_DASH:
1086: case ISA_ENDPC:
1087: case ISP_DIR_EQ:
1088: lexerState = INIT;
1089: return token(JspTokenId.SYMBOL);
1090: case ISI_TAGNAME:
1091: case ISI_ENDTAG:
1092: lexerState = INIT;
1093: return token(JspTokenId.TAG);
1094: case ISI_DIRNAME:
1095: lexerState = INIT;
1096: return token(JspTokenId.TAG);
1097: case ISP_TAG:
1098: case ISI_TAG_I_WS:
1099: lexerState = INIT;
1100: return token(JspTokenId.TAG);
1101: case ISP_DIR:
1102: case ISI_DIR_I_WS:
1103: case ISA_LT_PC_AT:
1104: lexerState = INIT;
1105: return token(JspTokenId.TAG);
1106: case ISI_TAG_ATTR:
1107: lexerState = INIT;
1108: return token(JspTokenId.ATTRIBUTE);
1109: case ISI_DIR_ATTR:
1110: lexerState = INIT;
1111: return token(JspTokenId.ATTRIBUTE);
1112: case ISI_TAG_STRING:
1113: case ISI_TAG_STRING_B:
1114: case ISI_TAG_STRING2:
1115: case ISI_TAG_STRING2_B:
1116: lexerState = INIT;
1117: return token(JspTokenId.ATTR_VALUE);
1118: case ISI_DIR_STRING:
1119: case ISI_DIR_STRING_B:
1120: case ISI_DIR_STRING2:
1121: case ISI_DIR_STRING2_B:
1122: lexerState = INIT;
1123: return token(JspTokenId.ATTR_VALUE);
1124: case ISI_JSP_COMMENT:
1125: case ISI_JSP_COMMENT_M:
1126: case ISI_JSP_COMMENT_MM:
1127: case ISI_JSP_COMMENT_MMP:
1128: lexerState = INIT;
1129: return token(JspTokenId.COMMENT);
1130: case ISA_EL_DELIM:
1131: lexerState = INIT;
1132: return token(JspTokenId.TEXT);
1133: case ISI_EL:
1134: lexerState = INIT;
1135: return token(JspTokenId.EL);
1136: case ISI_SCRIPTLET:
1137: case ISP_SCRIPTLET_PC:
1138: lexerState = INIT;
1139: return scriptletToken(JspTokenId.SCRIPTLET,
1140: lexerStateJspScriptlet);
1141: default:
1142: break;
1143: }
1144:
1145: return null;
1146:
1147: }
1148:
1149: private Token<JspTokenId> token(JspTokenId tokenId) {
1150: if (LOG) {
1151: checkToken(tokenId);
1152: }
1153: return tokenFactory.createToken(tokenId);
1154: }
1155:
1156: private Token<JspTokenId> scriptletToken(JspTokenId tokenId,
1157: int javaCodeType) {
1158: if (LOG) {
1159: checkToken(tokenId);
1160: }
1161: JspTokenId.JavaCodeType scriptletType;
1162: switch (javaCodeType) {
1163: case JAVA_SCRITPLET:
1164: case JAVA_SCRITPLET_DOCUMENT:
1165: scriptletType = JspTokenId.JavaCodeType.SCRIPTLET;
1166: break;
1167: case JAVA_DECLARATION:
1168: case JAVA_DECLARATION_DOCUMENT:
1169: scriptletType = JspTokenId.JavaCodeType.DECLARATION;
1170: break;
1171: case JAVA_EXPRESSION:
1172: case JAVA_EXPRESSION_DOCUMENT:
1173: scriptletType = JspTokenId.JavaCodeType.EXPRESSION;
1174: break;
1175: default:
1176: throw new IllegalStateException(
1177: "Unsupported scriptlet type "
1178: + lexerStateJspScriptlet);
1179: }
1180:
1181: return tokenFactory.createPropertyToken(tokenId, input
1182: .readLength(), new JspTokenPropertyProvider(
1183: scriptletType), PartType.COMPLETE);
1184: }
1185:
1186: private void checkToken(JspTokenId tokenId) {
1187: if (input.readLength() == 0) {
1188: LOGGER.log(Level.INFO, "Found zero length token: ");
1189: }
1190: LOGGER.log(Level.INFO, "[" + this .getClass().getSimpleName()
1191: + "] token ('" + input.readText().toString() + "'; id="
1192: + tokenId + "; state=" + state() + ")\n");
1193: }
1194:
1195: public void release() {
1196: }
1197:
1198: private static class JspTokenPropertyProvider implements
1199: TokenPropertyProvider {
1200:
1201: private final JspTokenId.JavaCodeType scriptletType;
1202:
1203: JspTokenPropertyProvider(JspTokenId.JavaCodeType scriptletType) {
1204: this .scriptletType = scriptletType;
1205: }
1206:
1207: public Object getValue(Token token, Object key) {
1208: if (JspTokenId.SCRIPTLET_TOKEN_TYPE_PROPERTY.equals(key))
1209: return scriptletType;
1210: return null;
1211: }
1212:
1213: }
1214:
1215: }
|