0001: /*
0002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
0003: *
0004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
0005: *
0006: * The contents of this file are subject to the terms of either the GNU
0007: * General Public License Version 2 only ("GPL") or the Common
0008: * Development and Distribution License("CDDL") (collectively, the
0009: * "License"). You may not use this file except in compliance with the
0010: * License. You can obtain a copy of the License at
0011: * http://www.netbeans.org/cddl-gplv2.html
0012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
0013: * specific language governing permissions and limitations under the
0014: * License. When distributing the software, include this License Header
0015: * Notice in each file and include the License file at
0016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
0017: * particular file as subject to the "Classpath" exception as provided
0018: * by Sun in the GPL Version 2 section of the License file that
0019: * accompanied this code. If applicable, add the following below the
0020: * License Header, with the fields enclosed by brackets [] replaced by
0021: * your own identifying information:
0022: * "Portions Copyrighted [year] [name of copyright owner]"
0023: *
0024: * Contributor(s):
0025: *
0026: * The Original Software is NetBeans. The Initial Developer of the Original
0027: * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
0028: * Microsystems, Inc. All Rights Reserved.
0029: *
0030: * If you wish your version of this file to be governed by only the CDDL
0031: * or only the GPL Version 2, indicate your decision by adding
0032: * "[Contributor] elects to include this software in this distribution
0033: * under the [CDDL or GPL Version 2] license." If you do not indicate a
0034: * single choice of license, a recipient has the option to distribute
0035: * your version of this file under either the CDDL, the GPL Version 2 or
0036: * to extend the choice of license to its licensees as provided above.
0037: * However, if you add GPL Version 2 code and therefore, elected the GPL
0038: * Version 2 license, then the option applies only if the new code is
0039: * made subject to such option by the copyright holder.
0040: */
0041:
0042: package org.netbeans.lib.html.lexer;
0043:
0044: import java.util.HashSet;
0045: import java.util.Set;
0046: import java.util.logging.Level;
0047: import java.util.logging.Logger;
0048: import org.netbeans.api.html.lexer.HTMLTokenId;
0049: import org.netbeans.api.lexer.Token;
0050: import org.netbeans.spi.lexer.Lexer;
0051: import org.netbeans.spi.lexer.LexerInput;
0052: import org.netbeans.spi.lexer.LexerRestartInfo;
0053: import org.netbeans.spi.lexer.TokenFactory;
0054:
0055: /**
0056: * Lexical analyzer for HTML. Based on original HTML lexer from html/editor module.
0057: *
0058: * @author Petr Nejedly
0059: * @author Miloslav Metelka
0060: * @author Jan Lahoda
0061: * @author Marek Fukala
0062: * @version 1.00
0063: */
0064:
0065: public final class HTMLLexer implements Lexer<HTMLTokenId> {
0066:
/** Logger that receives the optional per-token trace written by token(). */
private static final Logger LOGGER = Logger
        .getLogger(HTMLLexer.class.getName());
/** Enables the per-token trace; read once from the "j2ee_lexer_debug" system property. */
private static final boolean LOG = Boolean
        .getBoolean("j2ee_lexer_debug"); //NOI18N

/** Local alias for the end-of-input marker that input.read() is compared against. */
private static final int EOF = LexerInput.EOF;

/** Character source; obtained from the restart info in the constructor. */
private final LexerInput input;

/** Factory used by token() to create every returned token; obtained from the restart info. */
private final TokenFactory<HTMLTokenId> tokenFactory;
0077:
0078: class CompoundState {
0079: private int lexerState;
0080: private int lexerSubState;
0081: private int lexerEmbeddingState;
0082: private String attributeName;
0083:
0084: public CompoundState(int lexerState, int lexerSubState,
0085: int lexerEmbeddingState, String attributeName) {
0086: this .lexerState = lexerState;
0087: this .lexerSubState = lexerSubState;
0088: this .lexerEmbeddingState = lexerEmbeddingState;
0089: this .attributeName = attributeName;
0090: }
0091:
0092: @Override
0093: public String toString() {
0094: // return "state=" + (lexerSubState * 1000000 + lexerState * 1000 + lexerEmbeddingState) + "," + attributeName.toString();
0095: int state = lexerSubState * 1000000 + lexerState * 1000
0096: + lexerEmbeddingState;
0097: return Integer.toString(state) + ","
0098: + attributeName.toString();
0099: }
0100: }
0101:
0102: public Object state() {
0103: if (attributeName != null) {
0104: return new CompoundState(lexerState, lexerSubState,
0105: lexerEmbeddingState, attributeName);
0106: } else {
0107: return lexerSubState * 1000000 + lexerState * 1000
0108: + lexerEmbeddingState;
0109: }
0110: }
0111:
// Tag names whose content is lexed as embedded script / style (compared case-insensitively)
private static final String SCRIPT = "script";
private static final String STYLE = "style";

/** Internal state of the lexical analyzer before entering subanalyzer of
 * character references. It is initially set to INIT, but before first usage,
 * this will be overwritten with state, which originated transition to
 * charref subanalyzer.
 */
private int lexerSubState = INIT;
/** Current state of the main state machine; one of the "Internal states" constants below. */
private int lexerState = INIT;
/**
 * Name of the most recently lexed attribute, or null. Set when an ARGUMENT
 * token is produced and cleared when the attribute's value token is produced.
 */
private String attributeName;

/** Indicates whether we are in a script or style element: INIT, ISI_SCRIPT or ISI_STYLE. */
private int lexerEmbeddingState = INIT;

// Values of lexerEmbeddingState. It is set to one of these when the name of an
// opening <script> / <style> tag has been lexed, and reset to INIT when the
// matching close tag (or an empty-tag "/>") is found.
private static final int ISI_SCRIPT = 1;
private static final int ISI_STYLE = 2;

// Internal states
private static final int INIT = 0;
private static final int ISI_TEXT = 1; // Plain text between tags
private static final int ISI_ERROR = 2; // Syntax error in HTML syntax
private static final int ISA_LT = 3; // After start of tag delimiter - "<"
private static final int ISA_SLASH = 4; // After ETAGO - "</"
private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
private static final int ISP_TAG_X = 9; // X-switch after TAG's name
private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
private static final int ISI_ARG = 11; // Inside tag's argument - "<A h_r_...>"
private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
private static final int ISP_EQ_WS = 15; // In WS after '='
private static final int ISI_VAL = 16; // Non-quoted value
private static final int ISI_VAL_QUOT = 17; // Single-quoted value - may contain " chars
private static final int ISI_VAL_DQUOT = 18; // Double-quoted value - may contain ' chars
private static final int ISA_SGML_ESCAPE = 19; // After "<!"
private static final int ISA_SGML_DASH = 20; // After "<!-"
private static final int ISI_HTML_COMMENT = 21; // Somewhere after "<!--"
private static final int ISA_HTML_COMMENT_DASH = 22; // Dash in comment - maybe end of comment
private static final int ISI_HTML_COMMENT_WS = 23; // After end of comment, awaiting end of comment declaration
private static final int ISI_SGML_DECL = 24; // Inside an SGML declaration - after "<!" followed by a letter
private static final int ISA_SGML_DECL_DASH = 25; // After a single '-' inside an SGML declaration
private static final int ISI_SGML_COMMENT = 26; // Inside a "--" comment within an SGML declaration
private static final int ISA_SGML_COMMENT_DASH = 27; // Dash in SGML comment - maybe end of the comment
private static final int ISA_REF = 28; // After '&' - possible start of a character reference
private static final int ISI_REF_NAME = 29; // if the reference is symbolic - by predefined name
private static final int ISA_REF_HASH = 30; // for numeric references - after &#
private static final int ISI_REF_DEC = 31; // decimal character reference, e.g. &#345;
private static final int ISA_REF_X = 32; // after "&#x" or "&#X", awaiting the first hex digit
private static final int ISI_REF_HEX = 33; // hexadecimal reference, e.g. &#xa.. or &#X9..
private static final int ISI_TAG_SLASH = 34; //after slash in html tag

private static final int ISI_SCRIPT_CONTENT = 35; //after <script> tags closing symbol '>' - the tag content
private static final int ISI_SCRIPT_CONTENT_AFTER_LT = 36; //after < in script content
private static final int ISI_SCRIPT_CONTENT_ENDTAG = 37; //after </ in script content (never assigned in this class)

private static final int ISI_STYLE_CONTENT = 38; //after <style> tags closing symbol '>' - the tag content
private static final int ISI_STYLE_CONTENT_AFTER_LT = 39; //after < in style content
private static final int ISI_STYLE_CONTENT_ENDTAG = 40; //after </ in style content (never assigned in this class)

private static final int ISI_SGML_DECL_WS = 41; //after whitespace in SGML declaration
0179:
0180: static Set<String> EVENT_HANDLER_NAMES = new HashSet<String>();
0181: static {
0182: // See http://www.w3.org/TR/html401/interact/scripts.html
0183: EVENT_HANDLER_NAMES.add("onload"); // NOI18N
0184: EVENT_HANDLER_NAMES.add("onunload"); // NOI18N
0185: EVENT_HANDLER_NAMES.add("onclick"); // NOI18N
0186: EVENT_HANDLER_NAMES.add("ondblclick"); // NOI18N
0187: EVENT_HANDLER_NAMES.add("onmousedown"); // NOI18N
0188: EVENT_HANDLER_NAMES.add("onmouseup"); // NOI18N
0189: EVENT_HANDLER_NAMES.add("onmouseover"); // NOI18N
0190: EVENT_HANDLER_NAMES.add("onmousemove"); // NOI18N
0191: EVENT_HANDLER_NAMES.add("onmouseout"); // NOI18N
0192: EVENT_HANDLER_NAMES.add("onfocus"); // NOI18N
0193: EVENT_HANDLER_NAMES.add("onblur"); // NOI18N
0194: EVENT_HANDLER_NAMES.add("onkeypress"); // NOI18N
0195: EVENT_HANDLER_NAMES.add("onkeydown"); // NOI18N
0196: EVENT_HANDLER_NAMES.add("onkeyup"); // NOI18N
0197: EVENT_HANDLER_NAMES.add("onsubmit"); // NOI18N
0198: EVENT_HANDLER_NAMES.add("onreset"); // NOI18N
0199: EVENT_HANDLER_NAMES.add("onselect"); // NOI18N
0200: EVENT_HANDLER_NAMES.add("onchange"); // NOI18N
0201:
0202: // IMPORTANT - if you add any that DON'T start with "o" here,
0203: // make sure you update the optimized firstchar look in isJavaScriptArgument
0204: }
0205:
0206: public HTMLLexer(LexerRestartInfo<HTMLTokenId> info) {
0207: this .input = info.input();
0208: this .tokenFactory = info.tokenFactory();
0209: if (info.state() == null) {
0210: this .lexerSubState = INIT;
0211: this .lexerState = INIT;
0212: this .lexerEmbeddingState = INIT;
0213: } else {
0214: Object state = info.state();
0215: if (state instanceof CompoundState) {
0216: CompoundState cs = (CompoundState) state;
0217: lexerState = cs.lexerState;
0218: lexerSubState = cs.lexerSubState;
0219: lexerEmbeddingState = cs.lexerEmbeddingState;
0220: attributeName = cs.attributeName;
0221: } else {
0222: int encoded = ((Integer) info.state()).intValue();
0223: this .lexerSubState = encoded / 1000000;
0224: int remainder = encoded % 1000000;
0225: this .lexerState = remainder / 1000;
0226: this .lexerEmbeddingState = remainder % 1000;
0227: }
0228: }
0229: }
0230:
0231: private final boolean isAZ(int character) {
0232: return ((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z'));
0233: }
0234:
0235: private final boolean isName(int character) {
0236: return Character.isLetterOrDigit(character) || character == '-'
0237: || character == '_' || character == '.'
0238: || character == ':';
0239: // return( (ch >= 'a' && ch <= 'z') ||
0240: // (ch >= 'A' && ch <= 'Z') ||
0241: // (ch >= '0' && ch <= '9') ||
0242: // ch == '-' || ch == '_' || ch == '.' || ch == ':' );
0243:
0244: }
0245:
0246: /**
0247: * Resolves if given char is whitespace in terms of HTML4.0 specs
0248: * According to specs, following characters are treated as whitespace:
0249: * Space - <CODE>'\u0020'</CODE>, Tab - <CODE>'\u0009'</CODE>,
0250: * Formfeed - <CODE>'\u000C'</CODE>,Zero-width space - <CODE>'\u200B'</CODE>,
0251: * Carriage return - <CODE>'
0252: '</CODE> and Line feed - <CODE>'
0253: '</CODE>
0254: * CR's are included for completenes only, they should never appear in document
0255: */
0256:
0257: private final boolean isWS(int character) {
0258: return Character.isWhitespace(character);
0259: // return ( ch == '\u0020' || ch == '\u0009' || ch == '\u000c'
0260: // || ch == '\u200b' || ch == '\n' || ch == '\r' );
0261: }
0262:
0263: private boolean isJavaScriptArgument(LexerInput input) {
0264: CharSequence name = input.readText();
0265: if (name.charAt(0) == 'o') {
0266: if (EVENT_HANDLER_NAMES.contains(name.toString())) {
0267: return true;
0268: }
0269: }
0270: return false;
0271: }
0272:
0273: private boolean followsCloseTag(String closeTagName) {
0274: int actChar;
0275: int prev_read = input.readLength(); //remember the size of the read sequence //substract the first read character
0276: int read = 0;
0277: while (true) {
0278: actChar = input.read();
0279: read++;
0280: if (!(Character.isLetter(actChar)
0281: || Character.isDigit(actChar) || (actChar == '_')
0282: || (actChar == '-') || (actChar == ':')
0283: || (actChar == '.') || (actChar == '/'))
0284: || (actChar == EOF)) { // EOL or not alpha
0285: //end of tagname
0286: CharSequence tagName = input.readText().subSequence(
0287: prev_read, prev_read + read - 1);
0288:
0289: input.backup(read); //put the lookahead text back to the buffer
0290:
0291: if (closeTagName.equalsIgnoreCase(tagName.toString())) {
0292: if (actChar == '>') {
0293: return true;
0294: }
0295: }
0296:
0297: return false;
0298: }
0299: }
0300: }
0301:
0302: public Token<HTMLTokenId> nextToken() {
0303: int actChar;
0304:
0305: while (true) {
0306: actChar = input.read();
0307:
0308: if (actChar == EOF) {
0309: if (input.readLengthEOF() == 1) {
0310: return null; //just EOL is read
0311: } else {
0312: //there is something else in the buffer except EOL
0313: //we will return last token now
0314: input.backup(1); //backup the EOL, we will return null in next nextToken() call
0315: break;
0316: }
0317: }
0318:
0319: //System.out.println("HTMLSyntax: parseToken tokenOffset=" + tokenOffset + ", actChar='" + actChar + "', offset=" + offset + ", state=" + getStateName(state) +
0320: // ", stopOffset=" + stopOffset + ", lastBuffer=" + lastBuffer);
0321: switch (lexerState) {
0322: case INIT: // DONE
0323: switch (actChar) {
0324: case '<':
0325: lexerState = ISA_LT;
0326: break;
0327: case '&':
0328: lexerState = ISA_REF;
0329: lexerSubState = ISI_TEXT;
0330: break;
0331: default:
0332: lexerState = ISI_TEXT;
0333: break;
0334: }
0335: break;
0336:
0337: case ISI_TEXT: // DONE
0338: switch (actChar) {
0339: case '<':
0340: case '&':
0341: lexerState = INIT;
0342: input.backup(1);
0343: if (input.readLength() > 0) { //is there any text before & or < ???
0344: return token(HTMLTokenId.TEXT);
0345: }
0346: break;
0347: }
0348: break;
0349:
0350: case ISI_ERROR: // DONE
0351: lexerState = INIT;
0352: return token(HTMLTokenId.ERROR);
0353:
0354: case ISA_LT: // PENDING other transitions - e.g '<?'
0355: if (isAZ(actChar)) { // <'a..Z'
0356: lexerState = ISI_TAG;
0357: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0358: input.backup(1);
0359: return token(HTMLTokenId.TAG_OPEN_SYMBOL);
0360: }
0361: break;
0362: }
0363: switch (actChar) {
0364: case '/': // ETAGO - </
0365: lexerState = ISA_SLASH;
0366: return token(HTMLTokenId.TAG_OPEN_SYMBOL);
0367: case '>': // Empty start tag <>, RELAXED
0368: lexerState = INIT;
0369: return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0370: case '!':
0371: lexerState = ISA_SGML_ESCAPE;
0372: break;
0373: default: // Part of text, RELAXED
0374: lexerState = ISI_TEXT;
0375: break;
0376: }
0377: break;
0378:
0379: case ISA_SLASH: // DONE
0380: if (isAZ(actChar)) { // </'a..Z'
0381: lexerState = ISI_ENDTAG;
0382: break;
0383: }
0384: switch (actChar) {
0385: case '>': // Empty end tag </>, RELAXED
0386: lexerState = INIT;
0387: return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0388: default: // Part of text, e.g. </3, </'\n', RELAXED
0389: lexerState = ISI_TEXT;
0390: input.backup(1);
0391: break;
0392: }
0393: break;
0394:
0395: case ISI_ENDTAG: // DONE
0396: if (isName(actChar))
0397: break; // Still in endtag identifier, eat next char
0398: lexerState = ISP_ENDTAG_X;
0399: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0400: input.backup(1);
0401: return token(HTMLTokenId.TAG_CLOSE);
0402: }
0403: break;
0404:
0405: case ISP_ENDTAG_X: // DONE
0406: if (isWS(actChar)) {
0407: lexerState = ISP_ENDTAG_WS;
0408: break;
0409: }
0410: switch (actChar) {
0411: case '>': // Closing of endtag, e.g. </H6 _>_
0412: lexerState = INIT;
0413: return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0414: case '<': // next tag, e.g. </H6 _<_, RELAXED
0415: lexerState = INIT;
0416: input.backup(1);
0417: break;
0418: default:
0419: lexerState = ISI_ERROR;
0420: input.backup(1);
0421: break;
0422: }
0423: break;
0424:
0425: case ISP_ENDTAG_WS: // DONE
0426: if (isWS(actChar))
0427: break; // eat all WS
0428: lexerState = ISP_ENDTAG_X;
0429: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0430: input.backup(1);
0431: return token(HTMLTokenId.WS);
0432: }
0433: break;
0434:
0435: case ISI_TAG: // DONE
0436: if (isName(actChar))
0437: break; // Still in tag identifier, eat next char
0438: lexerState = ISP_TAG_X;
0439: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0440: input.backup(1);
0441: //test if the tagname is SCRIPT
0442: if (SCRIPT.equalsIgnoreCase(input.readText()
0443: .toString())) { //NOI18N
0444: lexerEmbeddingState = ISI_SCRIPT;
0445: }
0446: if (STYLE.equalsIgnoreCase(input.readText()
0447: .toString())) { //NOI18N
0448: lexerEmbeddingState = ISI_STYLE;
0449: }
0450: return token(HTMLTokenId.TAG_OPEN);
0451: }
0452: break;
0453:
0454: case ISP_TAG_X: // DONE
0455: if (isWS(actChar)) {
0456: lexerState = ISP_TAG_WS;
0457: break;
0458: }
0459: if (isAZ(actChar)) {
0460: lexerState = ISI_ARG;
0461: break;
0462: }
0463: switch (actChar) {
0464: case '/':
0465: lexerState = ISI_TAG_SLASH;
0466: break;
0467: case '>':
0468: switch (lexerEmbeddingState) {
0469: case INIT:
0470: lexerState = INIT;
0471: break;
0472: case ISI_SCRIPT:
0473: lexerState = ISI_SCRIPT_CONTENT;
0474: break;
0475: case ISI_STYLE:
0476: lexerState = ISI_STYLE_CONTENT;
0477: break;
0478: }
0479:
0480: return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0481: case '<':
0482: lexerState = INIT;
0483: input.backup(1);
0484: break;
0485: default:
0486: lexerState = ISI_ERROR;
0487: input.backup(1);
0488: break;
0489: }
0490: break;
0491:
0492: case ISP_TAG_WS: // DONE
0493: if (isWS(actChar))
0494: break; // eat all WS
0495: lexerState = ISP_TAG_X;
0496: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0497: input.backup(1);
0498: return token(HTMLTokenId.WS);
0499: }
0500:
0501: case ISI_TAG_SLASH:
0502: switch (actChar) {
0503: case '>':
0504: lexerEmbeddingState = INIT; //possibly cancel 'in script' if empty tag found
0505: lexerState = INIT;
0506: return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0507: default:
0508: lexerState = ISI_ERROR;
0509: input.backup(1);
0510: break;
0511: }
0512: break;
0513:
0514: case ISI_SCRIPT_CONTENT:
0515: switch (actChar) {
0516: case '<':
0517: lexerState = ISI_SCRIPT_CONTENT_AFTER_LT;
0518: break;
0519: default:
0520: break;
0521: }
0522: break;
0523:
0524: case ISI_SCRIPT_CONTENT_AFTER_LT:
0525: if (actChar == '/') {
0526: if (followsCloseTag(SCRIPT)) {
0527: //end of script section found
0528: lexerEmbeddingState = INIT;
0529: lexerState = INIT;
0530: input.backup(input.readLength() > 2 ? 2 : input
0531: .readLength()); //backup the '</', we will read it again
0532: if (input.readLength() > 0) {
0533: //the script has a body
0534: return token(HTMLTokenId.SCRIPT);
0535: } else {
0536: break;
0537: }
0538: }
0539: }
0540: lexerState = ISI_SCRIPT_CONTENT;
0541: break;
0542:
0543: case ISI_STYLE_CONTENT:
0544: switch (actChar) {
0545: case '<':
0546: lexerState = ISI_STYLE_CONTENT_AFTER_LT;
0547: break;
0548: default:
0549: break;
0550: }
0551: break;
0552:
0553: case ISI_STYLE_CONTENT_AFTER_LT:
0554: if (actChar == '/') {
0555: if (followsCloseTag(STYLE)) {
0556: //end of script section found
0557: lexerEmbeddingState = INIT;
0558: lexerState = INIT;
0559: input.backup(input.readLength() > 2 ? 2 : input
0560: .readLength()); //backup the '</', we will read it again
0561: if (input.readLength() > 0) {
0562: //the script has a body
0563: return token(HTMLTokenId.STYLE);
0564: } else {
0565: break;
0566: }
0567: }
0568: }
0569: lexerState = ISI_STYLE_CONTENT;
0570: break;
0571:
0572: case ISI_ARG: // DONE
0573: if (isName(actChar))
0574: break; // eat next char
0575: lexerState = ISP_ARG_X;
0576: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0577: input.backup(1);
0578: attributeName = input.readText().toString();
0579: return token(HTMLTokenId.ARGUMENT);
0580: }
0581: break;
0582:
0583: case ISP_ARG_X:
0584: if (isWS(actChar)) {
0585: lexerState = ISP_ARG_WS;
0586: break;
0587: }
0588: if (isAZ(actChar)) {
0589: lexerState = ISI_ARG;
0590: break;
0591: }
0592: switch (actChar) {
0593: case '/':
0594: case '>':
0595: input.backup(1);
0596: lexerState = ISP_TAG_X;
0597: break;
0598: case '<':
0599: lexerState = INIT;
0600: input.backup(1);
0601: break;
0602: case '=':
0603: lexerState = ISP_EQ;
0604: return token(HTMLTokenId.OPERATOR);
0605: default:
0606: lexerState = ISI_ERROR;
0607: input.backup(1);
0608: break;
0609: }
0610: break;
0611:
0612: case ISP_ARG_WS:
0613: if (isWS(actChar))
0614: break; // Eat all WhiteSpace
0615: lexerState = ISP_ARG_X;
0616: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0617: input.backup(1);
0618: return token(HTMLTokenId.WS);
0619: }
0620: break;
0621:
0622: case ISP_EQ:
0623: if (isWS(actChar)) {
0624: lexerState = ISP_EQ_WS;
0625: break;
0626: }
0627: switch (actChar) {
0628: case '\'':
0629: lexerState = ISI_VAL_QUOT;
0630: break;
0631: case '"':
0632: lexerState = ISI_VAL_DQUOT;
0633: break;
0634: case '/':
0635: case '>':
0636: input.backup(1);
0637: lexerState = ISP_TAG_X;
0638: break;
0639: default:
0640: lexerState = ISI_VAL; //everything else if attribute value
0641: break;
0642: }
0643: break;
0644:
0645: case ISP_EQ_WS:
0646: if (isWS(actChar))
0647: break; // Consume all WS
0648: lexerState = ISP_EQ;
0649: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0650: input.backup(1);
0651: return token(HTMLTokenId.WS);
0652: }
0653: break;
0654:
0655: case ISI_VAL:
0656: if (!isWS(actChar)
0657: && !(actChar == '/' || actChar == '>' || actChar == '<'))
0658: break; // Consume whole value
0659: lexerState = ISP_TAG_X;
0660: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0661: input.backup(1);
0662: if (attributeName != null
0663: && EVENT_HANDLER_NAMES
0664: .contains(attributeName)) {
0665: attributeName = null;
0666: return token(HTMLTokenId.VALUE_JAVASCRIPT);
0667: }
0668: attributeName = null;
0669: return token(HTMLTokenId.VALUE);
0670: }
0671: break;
0672:
0673: case ISI_VAL_QUOT:
0674: switch (actChar) {
0675: case '\'':
0676: lexerState = ISP_TAG_X;
0677: if (attributeName != null
0678: && EVENT_HANDLER_NAMES
0679: .contains(attributeName)) {
0680: attributeName = null;
0681: return token(HTMLTokenId.VALUE_JAVASCRIPT);
0682: }
0683: attributeName = null;
0684: return token(HTMLTokenId.VALUE);
0685:
0686: // Workaround for [Issue 117450] Provide unified LexerInput across multiple joined embedded sections
0687: // The problem is described in detail in issue [Issue 118892] Allow Schlieman lexer to continuously lex embedded language over more tokens of its parent language
0688: // Should be removed once the issue is fixed.
0689: //
0690: // case '&':
0691: // if( input.readLength() == 1 ) {
0692: // lexerSubState = lexerState;
0693: // lexerState = ISA_REF;
0694: // break;
0695: // } else {
0696: // if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0697: // input.backup(1);
0698: // return token(HTMLTokenId.VALUE);
0699: // }
0700: // }
0701: }
0702: break; // else simply consume next char of VALUE
0703:
0704: case ISI_VAL_DQUOT:
0705: switch (actChar) {
0706: case '"':
0707: lexerState = ISP_TAG_X;
0708: if (attributeName != null
0709: && EVENT_HANDLER_NAMES
0710: .contains(attributeName)) {
0711: attributeName = null;
0712: return token(HTMLTokenId.VALUE_JAVASCRIPT);
0713: }
0714: attributeName = null;
0715: return token(HTMLTokenId.VALUE);
0716:
0717: // Workaround for [Issue 117450] Provide unified LexerInput across multiple joined embedded sections
0718: // The problem is described in detail in issue [Issue 118892] Allow Schlieman lexer to continuously lex embedded language over more tokens of its parent language
0719: // Should be removed once the issue is fixed.
0720: //
0721: // case '&':
0722: // if( input.readLength() == 1 ) {
0723: // lexerSubState = lexerState;
0724: // lexerState = ISA_REF;
0725: // break;
0726: // } else {
0727: // if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0728: // input.backup(1);
0729: // return token(HTMLTokenId.VALUE);
0730: // }
0731: // }
0732: }
0733: break; // else simply consume next char of VALUE
0734:
0735: case ISA_SGML_ESCAPE: // DONE
0736: if (isAZ(actChar)) {
0737: lexerState = ISI_SGML_DECL;
0738: break;
0739: }
0740: switch (actChar) {
0741: case '-':
0742: lexerState = ISA_SGML_DASH;
0743: break;
0744: default:
0745: lexerState = ISI_TEXT;
0746: input.backup(1);
0747: continue;
0748: }
0749: break;
0750:
0751: case ISA_SGML_DASH: // DONE
0752: switch (actChar) {
0753: case '-':
0754: lexerState = ISI_HTML_COMMENT;
0755: break;
0756: default:
0757: lexerState = ISI_TEXT;
0758: input.backup(1);
0759: continue;
0760: }
0761: break;
0762:
0763: case ISI_HTML_COMMENT: // DONE
0764: switch (actChar) {
0765: case '-':
0766: lexerState = ISA_HTML_COMMENT_DASH;
0767: break;
0768: //create an HTML comment token for each line of the comment - a performance fix for #43532
0769: case '\n':
0770: //leave the some state - we are still in an HTML comment,
0771: //we just need to create a token for each line.
0772: return token(HTMLTokenId.BLOCK_COMMENT);
0773: }
0774: break;
0775:
0776: case ISA_HTML_COMMENT_DASH:
0777: switch (actChar) {
0778: case '-':
0779: lexerState = ISI_HTML_COMMENT_WS;
0780: break;
0781: default:
0782: lexerState = ISI_HTML_COMMENT;
0783: continue;
0784: }
0785: break;
0786:
0787: case ISI_HTML_COMMENT_WS: // DONE
0788: if (isWS(actChar))
0789: break; // Consume all WS
0790: switch (actChar) {
0791: case '>':
0792: lexerState = INIT;
0793: return token(HTMLTokenId.BLOCK_COMMENT);
0794: default:
0795: lexerState = ISI_HTML_COMMENT;
0796: input.backup(1);
0797: break;
0798: }
0799: break;
0800:
0801: case ISI_SGML_DECL:
0802: if (Character.isWhitespace(actChar)) {
0803: lexerState = ISI_SGML_DECL_WS;
0804: if (input.readLength() > 1) {
0805: input.backup(1); //backup the whitespace
0806: return token(HTMLTokenId.DECLARATION);
0807: }
0808: break;
0809: }
0810: switch (actChar) {
0811: case '>':
0812: if (input.readLength() > 1) {
0813: input.backup(1); //backup the '<' char
0814: return token(HTMLTokenId.DECLARATION);
0815: } else {
0816: //just the symbol read - return it as a part of declaration
0817: lexerState = INIT;
0818: return token(HTMLTokenId.DECLARATION);
0819: }
0820: case '-':
0821: if (input.readLength() == 1) {
0822: lexerState = ISA_SGML_DECL_DASH;
0823: break;
0824: } else {
0825: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0826: input.backup(1);
0827: return token(HTMLTokenId.DECLARATION);
0828: }
0829: }
0830: }
0831: break;
0832:
0833: case ISI_SGML_DECL_WS:
0834: if (!Character.isWhitespace(actChar)) {
0835: lexerState = ISI_SGML_DECL;
0836: input.backup(1);
0837: return token(HTMLTokenId.WS);
0838: }
0839: break;
0840:
0841: case ISA_SGML_DECL_DASH:
0842: if (actChar == '-') {
0843: lexerState = ISI_SGML_COMMENT;
0844: break;
0845: } else {
0846: lexerState = ISI_SGML_DECL;
0847: input.backup(1);
0848: continue;
0849: }
0850:
0851: case ISI_SGML_COMMENT:
0852: switch (actChar) {
0853: case '-':
0854: lexerState = ISA_SGML_COMMENT_DASH;
0855: break;
0856: }
0857: break;
0858:
0859: case ISA_SGML_COMMENT_DASH:
0860: if (actChar == '-') {
0861: lexerState = ISI_SGML_DECL;
0862: return token(HTMLTokenId.SGML_COMMENT);
0863: } else {
0864: lexerState = ISI_SGML_COMMENT;
0865: input.backup(1);
0866: continue;
0867: }
0868:
0869: case ISA_REF:
0870: if (isAZ(actChar)) {
0871: lexerState = ISI_REF_NAME;
0872: break;
0873: }
0874: if (actChar == '#') {
0875: lexerState = ISA_REF_HASH;
0876: break;
0877: }
0878: lexerState = lexerSubState;
0879: input.backup(1);
0880: continue;
0881:
0882: case ISI_REF_NAME:
0883: if (isName(actChar))
0884: break;
0885: lexerState = lexerSubState;
0886: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0887: if (actChar != ';') {
0888: input.backup(1);
0889: }
0890: return token(HTMLTokenId.CHARACTER);
0891: }
0892: break;
0893:
0894: case ISA_REF_HASH:
0895: if (actChar >= '0' && actChar <= '9') {
0896: lexerState = ISI_REF_DEC;
0897: break;
0898: }
0899: if (actChar == 'x' || actChar == 'X') {
0900: lexerState = ISA_REF_X;
0901: break;
0902: }
0903: if (isAZ(actChar)) {
0904: lexerState = lexerSubState;
0905: return token(HTMLTokenId.ERROR);
0906: }
0907: lexerState = lexerSubState;
0908: input.backup(1);
0909: continue;
0910:
0911: case ISI_REF_DEC:
0912: if (actChar >= '0' && actChar <= '9')
0913: break;
0914: lexerState = lexerSubState;
0915: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0916: if (actChar != ';')
0917: input.backup(1);
0918: return token(HTMLTokenId.CHARACTER);
0919: }
0920: break;
0921:
0922: case ISA_REF_X:
0923: if ((actChar >= '0' && actChar <= '9')
0924: || (actChar >= 'a' && actChar <= 'f')
0925: || (actChar >= 'A' && actChar <= 'F')) {
0926: lexerState = ISI_REF_HEX;
0927: break;
0928: }
0929: lexerState = lexerSubState;
0930: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0931: input.backup(1);
0932: return token(HTMLTokenId.ERROR); // error on previous "&#x" sequence
0933: }
0934: break;
0935:
0936: case ISI_REF_HEX:
0937: if ((actChar >= '0' && actChar <= '9')
0938: || (actChar >= 'a' && actChar <= 'f')
0939: || (actChar >= 'A' && actChar <= 'F'))
0940: break;
0941: lexerState = lexerSubState;
0942: if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0943: if (actChar != ';')
0944: input.backup(1);
0945: return token(HTMLTokenId.CHARACTER);
0946: }
0947: break;
0948: }
0949: } // end of while(offset...)
0950:
0951: /** At this stage there's no more text in the scanned buffer.
0952: * Scanner first checks whether this is completely the last
0953: * available buffer.
0954: */
0955: switch (lexerState) {
0956: case INIT:
0957: if (input.readLength() == 0) {
0958: return null;
0959: }
0960: break;
0961: case ISI_TEXT:
0962: case ISA_LT:
0963: case ISA_SLASH:
0964: case ISA_SGML_ESCAPE:
0965: case ISA_SGML_DASH:
0966: case ISI_TAG_SLASH:
0967: return token(HTMLTokenId.TEXT);
0968:
0969: case ISA_REF:
0970: case ISA_REF_HASH:
0971: if (lexerSubState == ISI_TEXT)
0972: return token(HTMLTokenId.TEXT);
0973: else
0974: return token(HTMLTokenId.VALUE);
0975:
0976: case ISI_HTML_COMMENT:
0977: case ISA_HTML_COMMENT_DASH:
0978: case ISI_HTML_COMMENT_WS:
0979: return token(HTMLTokenId.BLOCK_COMMENT);
0980:
0981: case ISI_TAG:
0982: return token(HTMLTokenId.TAG_OPEN);
0983: case ISI_ENDTAG:
0984: return token(HTMLTokenId.TAG_CLOSE);
0985:
0986: case ISI_ARG:
0987: return token(HTMLTokenId.ARGUMENT);
0988:
0989: case ISI_ERROR:
0990: return token(HTMLTokenId.ERROR);
0991:
0992: case ISP_ARG_WS:
0993: case ISP_TAG_WS:
0994: case ISP_ENDTAG_WS:
0995: case ISP_EQ_WS:
0996: return token(HTMLTokenId.WS);
0997:
0998: case ISP_ARG_X:
0999: case ISP_TAG_X:
1000: case ISP_ENDTAG_X:
1001: case ISP_EQ:
1002: return token(HTMLTokenId.WS);
1003:
1004: case ISI_VAL:
1005: case ISI_VAL_QUOT:
1006: case ISI_VAL_DQUOT:
1007: return token(HTMLTokenId.VALUE);
1008:
1009: case ISI_SGML_DECL:
1010: case ISA_SGML_DECL_DASH:
1011: return token(HTMLTokenId.DECLARATION);
1012:
1013: case ISI_SGML_COMMENT:
1014: case ISA_SGML_COMMENT_DASH:
1015: return token(HTMLTokenId.SGML_COMMENT);
1016:
1017: case ISI_REF_NAME:
1018: case ISI_REF_DEC:
1019: case ISA_REF_X:
1020: case ISI_REF_HEX:
1021: return token(HTMLTokenId.CHARACTER);
1022: case ISI_SCRIPT_CONTENT:
1023: case ISI_SCRIPT_CONTENT_ENDTAG:
1024: case ISI_SCRIPT_CONTENT_AFTER_LT:
1025: return token(HTMLTokenId.SCRIPT);
1026: case ISI_STYLE_CONTENT:
1027: case ISI_STYLE_CONTENT_ENDTAG:
1028: case ISI_STYLE_CONTENT_AFTER_LT:
1029: return token(HTMLTokenId.STYLE);
1030:
1031: }
1032:
1033: return null;
1034: }
1035:
1036: private Token<HTMLTokenId> token(HTMLTokenId tokenId) {
1037: if (LOG) {
1038: if (input.readLength() == 0) {
1039: LOGGER.log(Level.INFO, "Found zero length token: ");
1040: }
1041: LOGGER.log(Level.INFO, "["
1042: + this .getClass().getSimpleName() + "] token ('"
1043: + input.readText().toString() + "'; id=" + tokenId
1044: + "; state=" + state() + ")\n");
1045: }
1046: return tokenFactory.createToken(tokenId);
1047: }
1048:
/**
 * Releases this lexer. This implementation does nothing.
 */
public void release() {
}
1051:
1052: }
|