Source Code Cross Referenced for HTMLLexer.java in » IDE-Netbeans » html » org » netbeans » lib » html » lexer » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » IDE Netbeans » html » org.netbeans.lib.html.lexer
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        /*
0002:         * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
0003:         *
0004:         * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
0005:         *
0006:         * The contents of this file are subject to the terms of either the GNU
0007:         * General Public License Version 2 only ("GPL") or the Common
0008:         * Development and Distribution License("CDDL") (collectively, the
0009:         * "License"). You may not use this file except in compliance with the
0010:         * License. You can obtain a copy of the License at
0011:         * http://www.netbeans.org/cddl-gplv2.html
0012:         * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
0013:         * specific language governing permissions and limitations under the
0014:         * License.  When distributing the software, include this License Header
0015:         * Notice in each file and include the License file at
0016:         * nbbuild/licenses/CDDL-GPL-2-CP.  Sun designates this
0017:         * particular file as subject to the "Classpath" exception as provided
0018:         * by Sun in the GPL Version 2 section of the License file that
0019:         * accompanied this code. If applicable, add the following below the
0020:         * License Header, with the fields enclosed by brackets [] replaced by
0021:         * your own identifying information:
0022:         * "Portions Copyrighted [year] [name of copyright owner]"
0023:         *
0024:         * Contributor(s):
0025:         *
0026:         * The Original Software is NetBeans. The Initial Developer of the Original
0027:         * Software is Sun Microsystems, Inc. Portions Copyright 1997-2006 Sun
0028:         * Microsystems, Inc. All Rights Reserved.
0029:         *
0030:         * If you wish your version of this file to be governed by only the CDDL
0031:         * or only the GPL Version 2, indicate your decision by adding
0032:         * "[Contributor] elects to include this software in this distribution
0033:         * under the [CDDL or GPL Version 2] license." If you do not indicate a
0034:         * single choice of license, a recipient has the option to distribute
0035:         * your version of this file under either the CDDL, the GPL Version 2 or
0036:         * to extend the choice of license to its licensees as provided above.
0037:         * However, if you add GPL Version 2 code and therefore, elected the GPL
0038:         * Version 2 license, then the option applies only if the new code is
0039:         * made subject to such option by the copyright holder.
0040:         */
0041:
0042:        package org.netbeans.lib.html.lexer;
0043:
0044:        import java.util.HashSet;
0045:        import java.util.Set;
0046:        import java.util.logging.Level;
0047:        import java.util.logging.Logger;
0048:        import org.netbeans.api.html.lexer.HTMLTokenId;
0049:        import org.netbeans.api.lexer.Token;
0050:        import org.netbeans.spi.lexer.Lexer;
0051:        import org.netbeans.spi.lexer.LexerInput;
0052:        import org.netbeans.spi.lexer.LexerRestartInfo;
0053:        import org.netbeans.spi.lexer.TokenFactory;
0054:
0055:        /**
0056:         * Lexical analyzer for HTML. Based on original HTML lexer from html/editor module.
0057:         *
0058:         * @author Petr Nejedly
0059:         * @author Miloslav Metelka
0060:         * @author Jan Lahoda
0061:         * @author Marek Fukala
0062:         * @version 1.00
0063:         */
0064:
0065:        public final class HTMLLexer implements  Lexer<HTMLTokenId> {
0066:
0067:            private static final Logger LOGGER = Logger
0068:                    .getLogger(HTMLLexer.class.getName());
0069:            private static final boolean LOG = Boolean
0070:                    .getBoolean("j2ee_lexer_debug"); //NOI18N
0071:
0072:            private static final int EOF = LexerInput.EOF;
0073:
0074:            private final LexerInput input;
0075:
0076:            private final TokenFactory<HTMLTokenId> tokenFactory;
0077:
0078:            class CompoundState {
0079:                private int lexerState;
0080:                private int lexerSubState;
0081:                private int lexerEmbeddingState;
0082:                private String attributeName;
0083:
0084:                public CompoundState(int lexerState, int lexerSubState,
0085:                        int lexerEmbeddingState, String attributeName) {
0086:                    this .lexerState = lexerState;
0087:                    this .lexerSubState = lexerSubState;
0088:                    this .lexerEmbeddingState = lexerEmbeddingState;
0089:                    this .attributeName = attributeName;
0090:                }
0091:
0092:                @Override
0093:                public String toString() {
0094:                    // return "state=" + (lexerSubState * 1000000 + lexerState * 1000 + lexerEmbeddingState) + "," + attributeName.toString();
0095:                    int state = lexerSubState * 1000000 + lexerState * 1000
0096:                            + lexerEmbeddingState;
0097:                    return Integer.toString(state) + ","
0098:                            + attributeName.toString();
0099:                }
0100:            }
0101:
0102:            public Object state() {
0103:                if (attributeName != null) {
0104:                    return new CompoundState(lexerState, lexerSubState,
0105:                            lexerEmbeddingState, attributeName);
0106:                } else {
0107:                    return lexerSubState * 1000000 + lexerState * 1000
0108:                            + lexerEmbeddingState;
0109:                }
0110:            }
0111:
0112:            //script and style tag names
0113:            private static final String SCRIPT = "script";
0114:            private static final String STYLE = "style";
0115:
0116:            /** Internal state of the lexical analyzer before entering subanalyzer of
0117:             * character references. It is initially set to INIT, but before first usage,
0118:             * this will be overwritten with state, which originated transition to
0119:             * charref subanalyzer.
0120:             */
0121:            private int lexerSubState = INIT;
0122:            private int lexerState = INIT;
0123:            private String attributeName;
0124:
0125:            /** Indicates whether we are inside a script or style element (INIT, ISI_SCRIPT or ISI_STYLE). */
0126:            private int lexerEmbeddingState = INIT;
0127:
0128:            // Values of lexerEmbeddingState: it is set to one of these when the
0129:            // analyzer recognizes a <script> or <style> open tag name
0130:            private static final int ISI_SCRIPT = 1;
0131:            private static final int ISI_STYLE = 2;
0132:
0133:            // Internal states
0134:            private static final int INIT = 0;
0135:            private static final int ISI_TEXT = 1; // Plain text between tags
0136:            private static final int ISI_ERROR = 2; // Syntax error in HTML syntax
0137:            private static final int ISA_LT = 3; // After start of tag delimiter - "<"
0138:            private static final int ISA_SLASH = 4; // After ETAGO - "</"
0139:            private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
0140:            private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
0141:            private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
0142:            private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
0143:            private static final int ISP_TAG_X = 9; // X-switch after TAG's name
0144:            private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
0145:            private static final int ISI_ARG = 11; // Inside tag's argument - "<A h_r_...>"
0146:            private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
0147:            private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
0148:            private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
0149:            private static final int ISP_EQ_WS = 15; // In WS after '='
0150:            private static final int ISI_VAL = 16; // Non-quoted value
0151:            private static final int ISI_VAL_QUOT = 17; // Single-quoted value - may contain " chars
0152:            private static final int ISI_VAL_DQUOT = 18; // Double-quoted value - may contain ' chars
0153:            private static final int ISA_SGML_ESCAPE = 19; // After "<!"
0154:            private static final int ISA_SGML_DASH = 20; // After "<!-"
0155:            private static final int ISI_HTML_COMMENT = 21; // Somewhere after "<!--"
0156:            private static final int ISA_HTML_COMMENT_DASH = 22; // Dash in comment - maybe end of comment
0157:            private static final int ISI_HTML_COMMENT_WS = 23; // After end of comment, awaiting end of comment declaration
0158:            private static final int ISI_SGML_DECL = 24;
0159:            private static final int ISA_SGML_DECL_DASH = 25;
0160:            private static final int ISI_SGML_COMMENT = 26;
0161:            private static final int ISA_SGML_COMMENT_DASH = 27;
0162:            private static final int ISA_REF = 28; // when comes to character reference, e.g. &amp;, after &
0163:            private static final int ISI_REF_NAME = 29; // if the reference is symbolic - by predefined name
0164:            private static final int ISA_REF_HASH = 30; // for numeric references - after &#
0165:            private static final int ISI_REF_DEC = 31; // decimal character reference, e.g. &#345;
0166:            private static final int ISA_REF_X = 32; //
0167:            private static final int ISI_REF_HEX = 33; // hexadecimal reference, in &#xa.. of &#X9..
0168:            private static final int ISI_TAG_SLASH = 34; //after slash in html tag
0169:
0170:            private static final int ISI_SCRIPT_CONTENT = 35; //after <script> tags closing symbol '>' - the tag content
0171:            private static final int ISI_SCRIPT_CONTENT_AFTER_LT = 36; //after < in script content
0172:            private static final int ISI_SCRIPT_CONTENT_ENDTAG = 37; //after </ in script content
0173:
0174:            private static final int ISI_STYLE_CONTENT = 38; //after <style> tags closing symbol '>' - the tag content
0175:            private static final int ISI_STYLE_CONTENT_AFTER_LT = 39; //after < in style content
0176:            private static final int ISI_STYLE_CONTENT_ENDTAG = 40; //after </ in style content
0177:
0178:            private static final int ISI_SGML_DECL_WS = 41; //after whitespace in SGML declaration
0179:
0180:            static Set<String> EVENT_HANDLER_NAMES = new HashSet<String>();
0181:            static {
0182:                // See http://www.w3.org/TR/html401/interact/scripts.html
0183:                EVENT_HANDLER_NAMES.add("onload"); // NOI18N
0184:                EVENT_HANDLER_NAMES.add("onunload"); // NOI18N
0185:                EVENT_HANDLER_NAMES.add("onclick"); // NOI18N
0186:                EVENT_HANDLER_NAMES.add("ondblclick"); // NOI18N
0187:                EVENT_HANDLER_NAMES.add("onmousedown"); // NOI18N
0188:                EVENT_HANDLER_NAMES.add("onmouseup"); // NOI18N
0189:                EVENT_HANDLER_NAMES.add("onmouseover"); // NOI18N
0190:                EVENT_HANDLER_NAMES.add("onmousemove"); // NOI18N
0191:                EVENT_HANDLER_NAMES.add("onmouseout"); // NOI18N
0192:                EVENT_HANDLER_NAMES.add("onfocus"); // NOI18N
0193:                EVENT_HANDLER_NAMES.add("onblur"); // NOI18N
0194:                EVENT_HANDLER_NAMES.add("onkeypress"); // NOI18N
0195:                EVENT_HANDLER_NAMES.add("onkeydown"); // NOI18N
0196:                EVENT_HANDLER_NAMES.add("onkeyup"); // NOI18N
0197:                EVENT_HANDLER_NAMES.add("onsubmit"); // NOI18N
0198:                EVENT_HANDLER_NAMES.add("onreset"); // NOI18N
0199:                EVENT_HANDLER_NAMES.add("onselect"); // NOI18N
0200:                EVENT_HANDLER_NAMES.add("onchange"); // NOI18N
0201:
0202:                // IMPORTANT - if you add any that DON'T start with "o" here,
0203:                // make sure you update the optimized firstchar look in isJavaScriptArgument
0204:            }
0205:
0206:            public HTMLLexer(LexerRestartInfo<HTMLTokenId> info) {
0207:                this .input = info.input();
0208:                this .tokenFactory = info.tokenFactory();
0209:                if (info.state() == null) {
0210:                    this .lexerSubState = INIT;
0211:                    this .lexerState = INIT;
0212:                    this .lexerEmbeddingState = INIT;
0213:                } else {
0214:                    Object state = info.state();
0215:                    if (state instanceof  CompoundState) {
0216:                        CompoundState cs = (CompoundState) state;
0217:                        lexerState = cs.lexerState;
0218:                        lexerSubState = cs.lexerSubState;
0219:                        lexerEmbeddingState = cs.lexerEmbeddingState;
0220:                        attributeName = cs.attributeName;
0221:                    } else {
0222:                        int encoded = ((Integer) info.state()).intValue();
0223:                        this .lexerSubState = encoded / 1000000;
0224:                        int remainder = encoded % 1000000;
0225:                        this .lexerState = remainder / 1000;
0226:                        this .lexerEmbeddingState = remainder % 1000;
0227:                    }
0228:                }
0229:            }
0230:
0231:            private final boolean isAZ(int character) {
0232:                return ((character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z'));
0233:            }
0234:
0235:            private final boolean isName(int character) {
0236:                return Character.isLetterOrDigit(character) || character == '-'
0237:                        || character == '_' || character == '.'
0238:                        || character == ':';
0239:                //        return( (ch >= 'a' && ch <= 'z') ||
0240:                //                (ch >= 'A' && ch <= 'Z') ||
0241:                //                (ch >= '0' && ch <= '9') ||
0242:                //                ch == '-' || ch == '_' || ch == '.' || ch == ':' );
0243:
0244:            }
0245:
0246:            /**
0247:             * Resolves if given char is whitespace in terms of HTML4.0 specs
0248:             * According to specs, following characters are treated as whitespace:
0249:             * Space - <CODE>'\u0020'</CODE>, Tab - <CODE>'\u0009'</CODE>,
0250:             * Formfeed - <CODE>'\u000C'</CODE>,Zero-width space - <CODE>'\u200B'</CODE>,
0251:             * Carriage return - <CODE>'
0252:            '</CODE> and Line feed - <CODE>'
0253:            '</CODE>
0254:             * CR's are included for completenes only, they should never appear in document
0255:             */
0256:
0257:            private final boolean isWS(int character) {
0258:                return Character.isWhitespace(character);
0259:                //        return ( ch == '\u0020' || ch == '\u0009' || ch == '\u000c'
0260:                //              || ch == '\u200b' || ch == '\n' || ch == '\r' );
0261:            }
0262:
0263:            private boolean isJavaScriptArgument(LexerInput input) {
0264:                CharSequence name = input.readText();
0265:                if (name.charAt(0) == 'o') {
0266:                    if (EVENT_HANDLER_NAMES.contains(name.toString())) {
0267:                        return true;
0268:                    }
0269:                }
0270:                return false;
0271:            }
0272:
0273:            private boolean followsCloseTag(String closeTagName) {
0274:                int actChar;
0275:                int prev_read = input.readLength(); //remember the size of the read sequence //substract the first read character
0276:                int read = 0;
0277:                while (true) {
0278:                    actChar = input.read();
0279:                    read++;
0280:                    if (!(Character.isLetter(actChar)
0281:                            || Character.isDigit(actChar) || (actChar == '_')
0282:                            || (actChar == '-') || (actChar == ':')
0283:                            || (actChar == '.') || (actChar == '/'))
0284:                            || (actChar == EOF)) { // EOL or not alpha
0285:                        //end of tagname
0286:                        CharSequence tagName = input.readText().subSequence(
0287:                                prev_read, prev_read + read - 1);
0288:
0289:                        input.backup(read); //put the lookahead text back to the buffer
0290:
0291:                        if (closeTagName.equalsIgnoreCase(tagName.toString())) {
0292:                            if (actChar == '>') {
0293:                                return true;
0294:                            }
0295:                        }
0296:
0297:                        return false;
0298:                    }
0299:                }
0300:            }
0301:
0302:            public Token<HTMLTokenId> nextToken() {
0303:                int actChar;
0304:
0305:                while (true) {
0306:                    actChar = input.read();
0307:
0308:                    if (actChar == EOF) {
0309:                        if (input.readLengthEOF() == 1) {
0310:                            return null; //just EOL is read
0311:                        } else {
0312:                            //there is something else in the buffer except EOL
0313:                            //we will return last token now
0314:                            input.backup(1); //backup the EOL, we will return null in next nextToken() call
0315:                            break;
0316:                        }
0317:                    }
0318:
0319:                    //System.out.println("HTMLSyntax: parseToken tokenOffset=" + tokenOffset + ", actChar='" + actChar + "', offset=" + offset + ", state=" + getStateName(state) +
0320:                    //      ", stopOffset=" + stopOffset + ", lastBuffer=" + lastBuffer);
0321:                    switch (lexerState) {
0322:                    case INIT: // DONE
0323:                        switch (actChar) {
0324:                        case '<':
0325:                            lexerState = ISA_LT;
0326:                            break;
0327:                        case '&':
0328:                            lexerState = ISA_REF;
0329:                            lexerSubState = ISI_TEXT;
0330:                            break;
0331:                        default:
0332:                            lexerState = ISI_TEXT;
0333:                            break;
0334:                        }
0335:                        break;
0336:
0337:                    case ISI_TEXT: // DONE
0338:                        switch (actChar) {
0339:                        case '<':
0340:                        case '&':
0341:                            lexerState = INIT;
0342:                            input.backup(1);
0343:                            if (input.readLength() > 0) { //is there any text before & or < ???
0344:                                return token(HTMLTokenId.TEXT);
0345:                            }
0346:                            break;
0347:                        }
0348:                        break;
0349:
0350:                    case ISI_ERROR: // DONE
0351:                        lexerState = INIT;
0352:                        return token(HTMLTokenId.ERROR);
0353:
0354:                    case ISA_LT: // PENDING other transitions - e.g '<?'
0355:                        if (isAZ(actChar)) { // <'a..Z'
0356:                            lexerState = ISI_TAG;
0357:                            if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0358:                                input.backup(1);
0359:                                return token(HTMLTokenId.TAG_OPEN_SYMBOL);
0360:                            }
0361:                            break;
0362:                        }
0363:                        switch (actChar) {
0364:                        case '/': // ETAGO - </
0365:                            lexerState = ISA_SLASH;
0366:                            return token(HTMLTokenId.TAG_OPEN_SYMBOL);
0367:                        case '>': // Empty start tag <>, RELAXED
0368:                            lexerState = INIT;
0369:                            return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0370:                        case '!':
0371:                            lexerState = ISA_SGML_ESCAPE;
0372:                            break;
0373:                        default: // Part of text, RELAXED
0374:                            lexerState = ISI_TEXT;
0375:                            break;
0376:                        }
0377:                        break;
0378:
0379:                    case ISA_SLASH: // DONE
0380:                        if (isAZ(actChar)) { // </'a..Z'
0381:                            lexerState = ISI_ENDTAG;
0382:                            break;
0383:                        }
0384:                        switch (actChar) {
0385:                        case '>': // Empty end tag </>, RELAXED
0386:                            lexerState = INIT;
0387:                            return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0388:                        default: // Part of text, e.g. </3, </'\n', RELAXED
0389:                            lexerState = ISI_TEXT;
0390:                            input.backup(1);
0391:                            break;
0392:                        }
0393:                        break;
0394:
0395:                    case ISI_ENDTAG: // DONE
0396:                        if (isName(actChar))
0397:                            break; // Still in endtag identifier, eat next char
0398:                        lexerState = ISP_ENDTAG_X;
0399:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0400:                            input.backup(1);
0401:                            return token(HTMLTokenId.TAG_CLOSE);
0402:                        }
0403:                        break;
0404:
0405:                    case ISP_ENDTAG_X: // DONE
0406:                        if (isWS(actChar)) {
0407:                            lexerState = ISP_ENDTAG_WS;
0408:                            break;
0409:                        }
0410:                        switch (actChar) {
0411:                        case '>': // Closing of endtag, e.g. </H6 _>_
0412:                            lexerState = INIT;
0413:                            return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0414:                        case '<': // next tag, e.g. </H6 _<_, RELAXED
0415:                            lexerState = INIT;
0416:                            input.backup(1);
0417:                            break;
0418:                        default:
0419:                            lexerState = ISI_ERROR;
0420:                            input.backup(1);
0421:                            break;
0422:                        }
0423:                        break;
0424:
0425:                    case ISP_ENDTAG_WS: // DONE
0426:                        if (isWS(actChar))
0427:                            break; // eat all WS
0428:                        lexerState = ISP_ENDTAG_X;
0429:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0430:                            input.backup(1);
0431:                            return token(HTMLTokenId.WS);
0432:                        }
0433:                        break;
0434:
0435:                    case ISI_TAG: // DONE
0436:                        if (isName(actChar))
0437:                            break; // Still in tag identifier, eat next char
0438:                        lexerState = ISP_TAG_X;
0439:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0440:                            input.backup(1);
0441:                            //test if the tagname is SCRIPT
0442:                            if (SCRIPT.equalsIgnoreCase(input.readText()
0443:                                    .toString())) { //NOI18N
0444:                                lexerEmbeddingState = ISI_SCRIPT;
0445:                            }
0446:                            if (STYLE.equalsIgnoreCase(input.readText()
0447:                                    .toString())) { //NOI18N
0448:                                lexerEmbeddingState = ISI_STYLE;
0449:                            }
0450:                            return token(HTMLTokenId.TAG_OPEN);
0451:                        }
0452:                        break;
0453:
0454:                    case ISP_TAG_X: // DONE
0455:                        if (isWS(actChar)) {
0456:                            lexerState = ISP_TAG_WS;
0457:                            break;
0458:                        }
0459:                        if (isAZ(actChar)) {
0460:                            lexerState = ISI_ARG;
0461:                            break;
0462:                        }
0463:                        switch (actChar) {
0464:                        case '/':
0465:                            lexerState = ISI_TAG_SLASH;
0466:                            break;
0467:                        case '>':
0468:                            switch (lexerEmbeddingState) {
0469:                            case INIT:
0470:                                lexerState = INIT;
0471:                                break;
0472:                            case ISI_SCRIPT:
0473:                                lexerState = ISI_SCRIPT_CONTENT;
0474:                                break;
0475:                            case ISI_STYLE:
0476:                                lexerState = ISI_STYLE_CONTENT;
0477:                                break;
0478:                            }
0479:
0480:                            return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0481:                        case '<':
0482:                            lexerState = INIT;
0483:                            input.backup(1);
0484:                            break;
0485:                        default:
0486:                            lexerState = ISI_ERROR;
0487:                            input.backup(1);
0488:                            break;
0489:                        }
0490:                        break;
0491:
0492:                    case ISP_TAG_WS: // DONE
0493:                        if (isWS(actChar))
0494:                            break; // eat all WS
0495:                        lexerState = ISP_TAG_X;
0496:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0497:                            input.backup(1);
0498:                            return token(HTMLTokenId.WS);
0499:                        }
0500:
0501:                    case ISI_TAG_SLASH:
0502:                        switch (actChar) {
0503:                        case '>':
0504:                            lexerEmbeddingState = INIT; //possibly cancel 'in script' if empty tag found
0505:                            lexerState = INIT;
0506:                            return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
0507:                        default:
0508:                            lexerState = ISI_ERROR;
0509:                            input.backup(1);
0510:                            break;
0511:                        }
0512:                        break;
0513:
0514:                    case ISI_SCRIPT_CONTENT:
0515:                        switch (actChar) {
0516:                        case '<':
0517:                            lexerState = ISI_SCRIPT_CONTENT_AFTER_LT;
0518:                            break;
0519:                        default:
0520:                            break;
0521:                        }
0522:                        break;
0523:
0524:                    case ISI_SCRIPT_CONTENT_AFTER_LT:
0525:                        if (actChar == '/') {
0526:                            if (followsCloseTag(SCRIPT)) {
0527:                                //end of script section found
0528:                                lexerEmbeddingState = INIT;
0529:                                lexerState = INIT;
0530:                                input.backup(input.readLength() > 2 ? 2 : input
0531:                                        .readLength()); //backup the '</', we will read it again
0532:                                if (input.readLength() > 0) {
0533:                                    //the script has a body
0534:                                    return token(HTMLTokenId.SCRIPT);
0535:                                } else {
0536:                                    break;
0537:                                }
0538:                            }
0539:                        }
0540:                        lexerState = ISI_SCRIPT_CONTENT;
0541:                        break;
0542:
0543:                    case ISI_STYLE_CONTENT:
0544:                        switch (actChar) {
0545:                        case '<':
0546:                            lexerState = ISI_STYLE_CONTENT_AFTER_LT;
0547:                            break;
0548:                        default:
0549:                            break;
0550:                        }
0551:                        break;
0552:
0553:                    case ISI_STYLE_CONTENT_AFTER_LT:
0554:                        if (actChar == '/') {
0555:                            if (followsCloseTag(STYLE)) {
0556:                                //end of style section found
0557:                                lexerEmbeddingState = INIT;
0558:                                lexerState = INIT;
0559:                                input.backup(input.readLength() > 2 ? 2 : input
0560:                                        .readLength()); //backup the '</', we will read it again
0561:                                if (input.readLength() > 0) {
0562:                                    //the style has a body
0563:                                    return token(HTMLTokenId.STYLE);
0564:                                } else {
0565:                                    break;
0566:                                }
0567:                            }
0568:                        }
0569:                        lexerState = ISI_STYLE_CONTENT;
0570:                        break;
0571:
0572:                    case ISI_ARG: // DONE
0573:                        if (isName(actChar))
0574:                            break; // eat next char
0575:                        lexerState = ISP_ARG_X;
0576:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0577:                            input.backup(1);
0578:                            attributeName = input.readText().toString();
0579:                            return token(HTMLTokenId.ARGUMENT);
0580:                        }
0581:                        break;
0582:
0583:                    case ISP_ARG_X:
0584:                        if (isWS(actChar)) {
0585:                            lexerState = ISP_ARG_WS;
0586:                            break;
0587:                        }
0588:                        if (isAZ(actChar)) {
0589:                            lexerState = ISI_ARG;
0590:                            break;
0591:                        }
0592:                        switch (actChar) {
0593:                        case '/':
0594:                        case '>':
0595:                            input.backup(1);
0596:                            lexerState = ISP_TAG_X;
0597:                            break;
0598:                        case '<':
0599:                            lexerState = INIT;
0600:                            input.backup(1);
0601:                            break;
0602:                        case '=':
0603:                            lexerState = ISP_EQ;
0604:                            return token(HTMLTokenId.OPERATOR);
0605:                        default:
0606:                            lexerState = ISI_ERROR;
0607:                            input.backup(1);
0608:                            break;
0609:                        }
0610:                        break;
0611:
0612:                    case ISP_ARG_WS:
0613:                        if (isWS(actChar))
0614:                            break; // Eat all WhiteSpace
0615:                        lexerState = ISP_ARG_X;
0616:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0617:                            input.backup(1);
0618:                            return token(HTMLTokenId.WS);
0619:                        }
0620:                        break;
0621:
0622:                    case ISP_EQ:
0623:                        if (isWS(actChar)) {
0624:                            lexerState = ISP_EQ_WS;
0625:                            break;
0626:                        }
0627:                        switch (actChar) {
0628:                        case '\'':
0629:                            lexerState = ISI_VAL_QUOT;
0630:                            break;
0631:                        case '"':
0632:                            lexerState = ISI_VAL_DQUOT;
0633:                            break;
0634:                        case '/':
0635:                        case '>':
0636:                            input.backup(1);
0637:                            lexerState = ISP_TAG_X;
0638:                            break;
0639:                        default:
0640:                            lexerState = ISI_VAL; //everything else if attribute value
0641:                            break;
0642:                        }
0643:                        break;
0644:
0645:                    case ISP_EQ_WS:
0646:                        if (isWS(actChar))
0647:                            break; // Consume all WS
0648:                        lexerState = ISP_EQ;
0649:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0650:                            input.backup(1);
0651:                            return token(HTMLTokenId.WS);
0652:                        }
0653:                        break;
0654:
0655:                    case ISI_VAL:
0656:                        if (!isWS(actChar)
0657:                                && !(actChar == '/' || actChar == '>' || actChar == '<'))
0658:                            break; // Consume whole value
0659:                        lexerState = ISP_TAG_X;
0660:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0661:                            input.backup(1);
0662:                            if (attributeName != null
0663:                                    && EVENT_HANDLER_NAMES
0664:                                            .contains(attributeName)) {
0665:                                attributeName = null;
0666:                                return token(HTMLTokenId.VALUE_JAVASCRIPT);
0667:                            }
0668:                            attributeName = null;
0669:                            return token(HTMLTokenId.VALUE);
0670:                        }
0671:                        break;
0672:
0673:                    case ISI_VAL_QUOT:
0674:                        switch (actChar) {
0675:                        case '\'':
0676:                            lexerState = ISP_TAG_X;
0677:                            if (attributeName != null
0678:                                    && EVENT_HANDLER_NAMES
0679:                                            .contains(attributeName)) {
0680:                                attributeName = null;
0681:                                return token(HTMLTokenId.VALUE_JAVASCRIPT);
0682:                            }
0683:                            attributeName = null;
0684:                            return token(HTMLTokenId.VALUE);
0685:
0686:                            //                        Workaround for [Issue 117450]  Provide unified LexerInput across multiple joined embedded sections
0687:                            //                        The problem is described in detail in issue [Issue 118892]  Allow Schlieman lexer to continuously lex embedded language over  more tokens of its parent language
0688:                            //                        Should be removed once the issue is fixed.
0689:                            //                            
0690:                            //                        case '&':
0691:                            //                            if( input.readLength() == 1 ) {
0692:                            //                                lexerSubState = lexerState;
0693:                            //                                lexerState = ISA_REF;
0694:                            //                                break;
0695:                            //                            } else {
0696:                            //                                if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0697:                            //                                    input.backup(1);
0698:                            //                                    return token(HTMLTokenId.VALUE);
0699:                            //                                }
0700:                            //                            }
0701:                        }
0702:                        break; // else simply consume next char of VALUE
0703:
0704:                    case ISI_VAL_DQUOT:
0705:                        switch (actChar) {
0706:                        case '"':
0707:                            lexerState = ISP_TAG_X;
0708:                            if (attributeName != null
0709:                                    && EVENT_HANDLER_NAMES
0710:                                            .contains(attributeName)) {
0711:                                attributeName = null;
0712:                                return token(HTMLTokenId.VALUE_JAVASCRIPT);
0713:                            }
0714:                            attributeName = null;
0715:                            return token(HTMLTokenId.VALUE);
0716:
0717:                            //                        Workaround for [Issue 117450]  Provide unified LexerInput across multiple joined embedded sections
0718:                            //                        The problem is described in detail in issue [Issue 118892]  Allow Schlieman lexer to continuously lex embedded language over  more tokens of its parent language
0719:                            //                        Should be removed once the issue is fixed.
0720:                            //                            
0721:                            //                        case '&':
0722:                            //                            if( input.readLength() == 1 ) {
0723:                            //                                lexerSubState = lexerState;
0724:                            //                                lexerState = ISA_REF;
0725:                            //                                break;
0726:                            //                            } else {
0727:                            //                                if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0728:                            //                                    input.backup(1);
0729:                            //                                    return token(HTMLTokenId.VALUE);
0730:                            //                                }
0731:                            //                            }
0732:                        }
0733:                        break; // else simply consume next char of VALUE
0734:
0735:                    case ISA_SGML_ESCAPE: // DONE
0736:                        if (isAZ(actChar)) {
0737:                            lexerState = ISI_SGML_DECL;
0738:                            break;
0739:                        }
0740:                        switch (actChar) {
0741:                        case '-':
0742:                            lexerState = ISA_SGML_DASH;
0743:                            break;
0744:                        default:
0745:                            lexerState = ISI_TEXT;
0746:                            input.backup(1);
0747:                            continue;
0748:                        }
0749:                        break;
0750:
0751:                    case ISA_SGML_DASH: // DONE
0752:                        switch (actChar) {
0753:                        case '-':
0754:                            lexerState = ISI_HTML_COMMENT;
0755:                            break;
0756:                        default:
0757:                            lexerState = ISI_TEXT;
0758:                            input.backup(1);
0759:                            continue;
0760:                        }
0761:                        break;
0762:
0763:                    case ISI_HTML_COMMENT: // DONE
0764:                        switch (actChar) {
0765:                        case '-':
0766:                            lexerState = ISA_HTML_COMMENT_DASH;
0767:                            break;
0768:                        //create an HTML comment token for each line of the comment - a performance fix for #43532
0769:                        case '\n':
0770:                            //leave the same state - we are still in an HTML comment,
0771:                            //we just need to create a token for each line.
0772:                            return token(HTMLTokenId.BLOCK_COMMENT);
0773:                        }
0774:                        break;
0775:
0776:                    case ISA_HTML_COMMENT_DASH:
0777:                        switch (actChar) {
0778:                        case '-':
0779:                            lexerState = ISI_HTML_COMMENT_WS;
0780:                            break;
0781:                        default:
0782:                            lexerState = ISI_HTML_COMMENT;
0783:                            continue;
0784:                        }
0785:                        break;
0786:
0787:                    case ISI_HTML_COMMENT_WS: // DONE
0788:                        if (isWS(actChar))
0789:                            break; // Consume all WS
0790:                        switch (actChar) {
0791:                        case '>':
0792:                            lexerState = INIT;
0793:                            return token(HTMLTokenId.BLOCK_COMMENT);
0794:                        default:
0795:                            lexerState = ISI_HTML_COMMENT;
0796:                            input.backup(1);
0797:                            break;
0798:                        }
0799:                        break;
0800:
0801:                    case ISI_SGML_DECL:
0802:                        if (Character.isWhitespace(actChar)) {
0803:                            lexerState = ISI_SGML_DECL_WS;
0804:                            if (input.readLength() > 1) {
0805:                                input.backup(1); //backup the whitespace
0806:                                return token(HTMLTokenId.DECLARATION);
0807:                            }
0808:                            break;
0809:                        }
0810:                        switch (actChar) {
0811:                        case '>':
0812:                            if (input.readLength() > 1) {
0813:                                input.backup(1); //backup the '<' char
0814:                                return token(HTMLTokenId.DECLARATION);
0815:                            } else {
0816:                                //just the symbol read - return it as a part of declaration
0817:                                lexerState = INIT;
0818:                                return token(HTMLTokenId.DECLARATION);
0819:                            }
0820:                        case '-':
0821:                            if (input.readLength() == 1) {
0822:                                lexerState = ISA_SGML_DECL_DASH;
0823:                                break;
0824:                            } else {
0825:                                if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0826:                                    input.backup(1);
0827:                                    return token(HTMLTokenId.DECLARATION);
0828:                                }
0829:                            }
0830:                        }
0831:                        break;
0832:
0833:                    case ISI_SGML_DECL_WS:
0834:                        if (!Character.isWhitespace(actChar)) {
0835:                            lexerState = ISI_SGML_DECL;
0836:                            input.backup(1);
0837:                            return token(HTMLTokenId.WS);
0838:                        }
0839:                        break;
0840:
0841:                    case ISA_SGML_DECL_DASH:
0842:                        if (actChar == '-') {
0843:                            lexerState = ISI_SGML_COMMENT;
0844:                            break;
0845:                        } else {
0846:                            lexerState = ISI_SGML_DECL;
0847:                            input.backup(1);
0848:                            continue;
0849:                        }
0850:
0851:                    case ISI_SGML_COMMENT:
0852:                        switch (actChar) {
0853:                        case '-':
0854:                            lexerState = ISA_SGML_COMMENT_DASH;
0855:                            break;
0856:                        }
0857:                        break;
0858:
0859:                    case ISA_SGML_COMMENT_DASH:
0860:                        if (actChar == '-') {
0861:                            lexerState = ISI_SGML_DECL;
0862:                            return token(HTMLTokenId.SGML_COMMENT);
0863:                        } else {
0864:                            lexerState = ISI_SGML_COMMENT;
0865:                            input.backup(1);
0866:                            continue;
0867:                        }
0868:
0869:                    case ISA_REF:
0870:                        if (isAZ(actChar)) {
0871:                            lexerState = ISI_REF_NAME;
0872:                            break;
0873:                        }
0874:                        if (actChar == '#') {
0875:                            lexerState = ISA_REF_HASH;
0876:                            break;
0877:                        }
0878:                        lexerState = lexerSubState;
0879:                        input.backup(1);
0880:                        continue;
0881:
0882:                    case ISI_REF_NAME:
0883:                        if (isName(actChar))
0884:                            break;
0885:                        lexerState = lexerSubState;
0886:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0887:                            if (actChar != ';') {
0888:                                input.backup(1);
0889:                            }
0890:                            return token(HTMLTokenId.CHARACTER);
0891:                        }
0892:                        break;
0893:
0894:                    case ISA_REF_HASH:
0895:                        if (actChar >= '0' && actChar <= '9') {
0896:                            lexerState = ISI_REF_DEC;
0897:                            break;
0898:                        }
0899:                        if (actChar == 'x' || actChar == 'X') {
0900:                            lexerState = ISA_REF_X;
0901:                            break;
0902:                        }
0903:                        if (isAZ(actChar)) {
0904:                            lexerState = lexerSubState;
0905:                            return token(HTMLTokenId.ERROR);
0906:                        }
0907:                        lexerState = lexerSubState;
0908:                        input.backup(1);
0909:                        continue;
0910:
0911:                    case ISI_REF_DEC:
0912:                        if (actChar >= '0' && actChar <= '9')
0913:                            break;
0914:                        lexerState = lexerSubState;
0915:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0916:                            if (actChar != ';')
0917:                                input.backup(1);
0918:                            return token(HTMLTokenId.CHARACTER);
0919:                        }
0920:                        break;
0921:
0922:                    case ISA_REF_X:
0923:                        if ((actChar >= '0' && actChar <= '9')
0924:                                || (actChar >= 'a' && actChar <= 'f')
0925:                                || (actChar >= 'A' && actChar <= 'F')) {
0926:                            lexerState = ISI_REF_HEX;
0927:                            break;
0928:                        }
0929:                        lexerState = lexerSubState;
0930:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0931:                            input.backup(1);
0932:                            return token(HTMLTokenId.ERROR); // error on previous "&#x" sequence
0933:                        }
0934:                        break;
0935:
0936:                    case ISI_REF_HEX:
0937:                        if ((actChar >= '0' && actChar <= '9')
0938:                                || (actChar >= 'a' && actChar <= 'f')
0939:                                || (actChar >= 'A' && actChar <= 'F'))
0940:                            break;
0941:                        lexerState = lexerSubState;
0942:                        if (input.readLength() > 1) { //lexer restart check, token already returned before last EOF
0943:                            if (actChar != ';')
0944:                                input.backup(1);
0945:                            return token(HTMLTokenId.CHARACTER);
0946:                        }
0947:                        break;
0948:                    }
0949:                } // end of while(offset...)
0950:
0951:                /** At this stage there's no more text in the scanned buffer.
0952:                 * Scanner first checks whether this is completely the last
0953:                 * available buffer.
0954:                 */
0955:                switch (lexerState) {
0956:                case INIT:
0957:                    if (input.readLength() == 0) {
0958:                        return null;
0959:                    }
0960:                    break;
0961:                case ISI_TEXT:
0962:                case ISA_LT:
0963:                case ISA_SLASH:
0964:                case ISA_SGML_ESCAPE:
0965:                case ISA_SGML_DASH:
0966:                case ISI_TAG_SLASH:
0967:                    return token(HTMLTokenId.TEXT);
0968:
0969:                case ISA_REF:
0970:                case ISA_REF_HASH:
0971:                    if (lexerSubState == ISI_TEXT)
0972:                        return token(HTMLTokenId.TEXT);
0973:                    else
0974:                        return token(HTMLTokenId.VALUE);
0975:
0976:                case ISI_HTML_COMMENT:
0977:                case ISA_HTML_COMMENT_DASH:
0978:                case ISI_HTML_COMMENT_WS:
0979:                    return token(HTMLTokenId.BLOCK_COMMENT);
0980:
0981:                case ISI_TAG:
0982:                    return token(HTMLTokenId.TAG_OPEN);
0983:                case ISI_ENDTAG:
0984:                    return token(HTMLTokenId.TAG_CLOSE);
0985:
0986:                case ISI_ARG:
0987:                    return token(HTMLTokenId.ARGUMENT);
0988:
0989:                case ISI_ERROR:
0990:                    return token(HTMLTokenId.ERROR);
0991:
0992:                case ISP_ARG_WS:
0993:                case ISP_TAG_WS:
0994:                case ISP_ENDTAG_WS:
0995:                case ISP_EQ_WS:
0996:                    return token(HTMLTokenId.WS);
0997:
0998:                case ISP_ARG_X:
0999:                case ISP_TAG_X:
1000:                case ISP_ENDTAG_X:
1001:                case ISP_EQ:
1002:                    return token(HTMLTokenId.WS);
1003:
1004:                case ISI_VAL:
1005:                case ISI_VAL_QUOT:
1006:                case ISI_VAL_DQUOT:
1007:                    return token(HTMLTokenId.VALUE);
1008:
1009:                case ISI_SGML_DECL:
1010:                case ISA_SGML_DECL_DASH:
1011:                    return token(HTMLTokenId.DECLARATION);
1012:
1013:                case ISI_SGML_COMMENT:
1014:                case ISA_SGML_COMMENT_DASH:
1015:                    return token(HTMLTokenId.SGML_COMMENT);
1016:
1017:                case ISI_REF_NAME:
1018:                case ISI_REF_DEC:
1019:                case ISA_REF_X:
1020:                case ISI_REF_HEX:
1021:                    return token(HTMLTokenId.CHARACTER);
1022:                case ISI_SCRIPT_CONTENT:
1023:                case ISI_SCRIPT_CONTENT_ENDTAG:
1024:                case ISI_SCRIPT_CONTENT_AFTER_LT:
1025:                    return token(HTMLTokenId.SCRIPT);
1026:                case ISI_STYLE_CONTENT:
1027:                case ISI_STYLE_CONTENT_ENDTAG:
1028:                case ISI_STYLE_CONTENT_AFTER_LT:
1029:                    return token(HTMLTokenId.STYLE);
1030:
1031:                }
1032:
1033:                return null;
1034:            }
1035:
1036:            private Token<HTMLTokenId> token(HTMLTokenId tokenId) {
1037:                // Returns tokenFactory.createToken(tokenId); when LOG is set, the token is traced at INFO level first.
1038:                if (!LOG) {
1039:                    return tokenFactory.createToken(tokenId); // tracing is off: nothing to log
1040:                }
1041:                if (0 == input.readLength()) {
1042:                    LOGGER.log(Level.INFO, "Found zero length token: ");
1043:                }
1044:                LOGGER.log(Level.INFO, "[" + this.getClass().getSimpleName() + "] token ('" + input.readText().toString()
1045:                        + "'; id=" + tokenId + "; state=" + state() + ")\n");
1046:                return tokenFactory.createToken(tokenId);
1047:            }
1048:
1049:            public void release() { // no-op: the body is empty, so a call has no effect (presumably nothing here needs releasing - confirm)
1050:            }
1051:
1052:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.