001: /*
002: * Sun Public License Notice
003: *
004: * The contents of this file are subject to the Sun Public License
005: * Version 1.0 (the "License"). You may not use this file except in
006: * compliance with the License. A copy of the License is available at
007: * http://www.sun.com/
008: *
009: * The Original Code is NetBeans. The Initial Developer of the Original
010: * Code is Sun Microsystems, Inc. Portions Copyright 1997-2000 Sun
011: * Microsystems, Inc. All Rights Reserved.
012: */
013:
014: package org.netbeans.editor.ext.html;
015:
016: import org.netbeans.editor.Syntax;
017: import org.netbeans.editor.TokenID;
018:
019: /**
020: * Lexical anlyzer for HTML source files.
021: *
022: * @author Petr Nejedly
023: * @author Miloslav Metelka
024: * @version 1.00
025: */
026:
027: public class HTMLSyntax extends Syntax {
028:
029: /**
030: * Internal state of the lexical analyzer before entering subanalyzer of
031: * character references. It is initially set to INIT, but before first
032: * usage, this will be overwritten with state, which originated transition
033: * to charref subanalyzer.
034: */
035: protected int subState = INIT;
036:
037: // Internal states
038: private static final int ISI_TEXT = 1; // Plain text between tags
039: private static final int ISI_ERROR = 2; // Syntax error in HTML syntax
040: private static final int ISA_LT = 3; // After start of tag delimiter -
041: // "<"
042: private static final int ISA_SLASH = 4; // After ETAGO - "</"
043: private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
044: private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
045: private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
046: private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
047: private static final int ISP_TAG_X = 9; // X-switch after TAG's name
048: private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
049: private static final int ISI_ARG = 11; // Inside tag's argument - "<A
050: // h_r_...>"
051: private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's
052: // name
053: private static final int ISP_ARG_WS = 13; // Inside WS after argument
054: // awaiting '='
055: private static final int ISP_EQ = 14; // X-switch after '=' in TAG's
056: // ARGUMENT
057: private static final int ISP_EQ_WS = 15; // In WS after '='
058: private static final int ISI_VAL = 16; // Non-quoted value
059: private static final int ISI_VAL_QUOT = 17; // Single-quoted value - may
060: // contain " chars
061: private static final int ISI_VAL_DQUOT = 18; // Double-quoted value - may
062: // contain ' chars
063: private static final int ISA_SGML_ESCAPE = 19; // After "<!"
064: private static final int ISA_SGML_DASH = 20; // After "<!-"
065: private static final int ISI_HTML_COMMENT = 21; // Somewhere after "<!--"
066: private static final int ISA_HTML_COMMENT_DASH = 22; // Dash in comment -
067: // maybe end of
068: // comment
069: private static final int ISI_HTML_COMMENT_WS = 23; // After end of comment,
070: // awaiting end of
071: // comment declaration
072: private static final int ISI_SGML_DECL = 24;
073: private static final int ISA_SGML_DECL_DASH = 25;
074: private static final int ISI_SGML_COMMENT = 26;
075: private static final int ISA_SGML_COMMENT_DASH = 27;
076: private static final int ISA_REF = 28; // when comes to character
077: // reference, e.g. &, after &
078: private static final int ISI_REF_NAME = 29; // if the reference is symbolic
079: // - by predefined name
080: private static final int ISA_REF_HASH = 30; // for numeric references -
081: // after &#
082: private static final int ISI_REF_DEC = 31; // decimal character reference,
083: // e.g. ř
084: private static final int ISA_REF_X = 32; //
085: private static final int ISI_REF_HEX = 33; // hexadecimal reference, in
086:
087: // 
.. of 	..
088:
089: public HTMLSyntax() {
090: tokenContextPath = HTMLTokenContext.contextPath;
091: }
092:
093: private final boolean isAZ(char ch) {
094: return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'));
095: }
096:
097: private final boolean isName(char ch) {
098: return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
099: || (ch >= '0' && ch <= '9') || ch == '-' || ch == '_'
100: || ch == '.' || ch == ':');
101:
102: }
103:
104: /**
105: * Resolves if given char is whitespace in terms of HTML4.0 specs According
106: * to specs, following characters are treated as whitespace: Space - <CODE>'\u0020'</CODE>,
107: * Tab - <CODE>'\u0009'</CODE>, Formfeed - <CODE>'\u000C'</CODE>,Zero-width
108: * space - <CODE>'\u200B'</CODE>, Carriage return - <CODE>' '</CODE>
109: * and Line feed - <CODE>' '</CODE> CR's are included for completenes
110: * only, they should never appear in document
111: */
112:
113: private final boolean isWS(char ch) {
114: return (ch == '\u0020' || ch == '\u0009' || ch == '\u000c'
115: || ch == '\u200b' || ch == '\n' || ch == '\r');
116: }
117:
118: protected TokenID parseToken() {
119: char actChar;
120:
121: while (offset < stopOffset) {
122: actChar = buffer[offset];
123:
124: switch (state) {
125: case INIT: // DONE
126: switch (actChar) {
127: case '<':
128: state = ISA_LT;
129: break;
130: case '&':
131: state = ISA_REF;
132: subState = ISI_TEXT;
133: break;
134: default:
135: state = ISI_TEXT;
136: break;
137: }
138: break;
139:
140: case ISI_TEXT: // DONE
141: switch (actChar) {
142: case '<':
143: case '&':
144: state = INIT;
145: return HTMLTokenContext.TEXT;
146: }
147: break;
148:
149: case ISI_ERROR: // DONE
150: offset++;
151: state = INIT;
152: return HTMLTokenContext.ERROR;
153:
154: case ISA_LT: // PENDING other transitions - e.g '<?'
155: if (isAZ(actChar)) { // <'a..Z'
156: state = ISI_TAG;
157: break;
158: }
159: switch (actChar) {
160: case '/': // ETAGO - </
161: state = ISA_SLASH;
162: break;
163: case '>': // Empty start tag <>, RELAXED
164: offset++;
165: state = INIT;
166: return HTMLTokenContext.TAG;
167: case '!':
168: state = ISA_SGML_ESCAPE;
169: break;
170: default: // Part of text, RELAXED
171: state = ISI_TEXT;
172: continue; // don't eat the char, maybe its '&'
173: }
174: break;
175:
176: case ISA_SLASH: // DONE
177: if (isAZ(actChar)) { // </'a..Z'
178: state = ISI_ENDTAG;
179: break;
180: }
181: switch (actChar) {
182: case '>': // Empty end tag </>, RELAXED
183: offset++;
184: state = INIT;
185: return HTMLTokenContext.TAG;
186: default: // Part of text, e.g. </3, </'\n', RELAXED
187: state = ISI_TEXT;
188: continue; // don'e eat the char
189: }
190: // break;
191:
192: case ISI_ENDTAG: // DONE
193: if (isName(actChar))
194: break; // Still in endtag identifier, eat next char
195: state = ISP_ENDTAG_X;
196: return HTMLTokenContext.TAG;
197:
198: case ISP_ENDTAG_X: // DONE
199: if (isWS(actChar)) {
200: state = ISP_ENDTAG_WS;
201: break;
202: }
203: switch (actChar) {
204: case '>': // Closing of endtag, e.g. </H6 _>_
205: offset++;
206: state = INIT;
207: return HTMLTokenContext.TAG;
208: case '<': // next tag, e.g. </H6 _<_, RELAXED
209: state = INIT;
210: continue;
211: default:
212: state = ISI_ERROR;
213: continue; // don't eat
214: }
215: // break;
216:
217: case ISP_ENDTAG_WS: // DONE
218: if (isWS(actChar))
219: break; // eat all WS
220: state = ISP_ENDTAG_X;
221: return HTMLTokenContext.WS;
222:
223: case ISI_TAG: // DONE
224: if (isName(actChar))
225: break; // Still in tag identifier, eat next char
226: state = ISP_TAG_X;
227: return HTMLTokenContext.TAG;
228:
229: case ISP_TAG_X: // DONE
230: if (isWS(actChar)) {
231: state = ISP_TAG_WS;
232: break;
233: }
234: if (isAZ(actChar)) {
235: state = ISI_ARG;
236: break;
237: }
238: switch (actChar) {
239: case '/':
240: case '>':
241: offset++;
242: state = INIT;
243: return HTMLTokenContext.TAG;
244: case '<':
245: state = INIT;
246: continue; // don't eat it!!!
247: default:
248: state = ISI_ERROR;
249: continue;
250: }
251: // break;
252:
253: case ISP_TAG_WS: // DONE
254: if (isWS(actChar))
255: break; // eat all WS
256: state = ISP_TAG_X;
257: return HTMLTokenContext.WS;
258:
259: case ISI_ARG: // DONE
260: if (isName(actChar))
261: break; // eat next char
262: state = ISP_ARG_X;
263: return HTMLTokenContext.ARGUMENT;
264:
265: case ISP_ARG_X:
266: if (isWS(actChar)) {
267: state = ISP_ARG_WS;
268: break;
269: }
270: if (isAZ(actChar)) {
271: state = ISI_ARG;
272: break;
273: }
274: switch (actChar) {
275: case '/':
276: case '>':
277: offset++;
278: state = INIT;
279: return HTMLTokenContext.TAG;
280: case '<':
281: state = INIT;
282: continue; // don't eat !!!
283: case '=':
284: offset++;
285: state = ISP_EQ;
286: return HTMLTokenContext.OPERATOR;
287: default:
288: state = ISI_ERROR;
289: continue;
290: }
291: // break;
292:
293: case ISP_ARG_WS:
294: if (isWS(actChar))
295: break; // Eat all WhiteSpace
296: state = ISP_ARG_X;
297: return HTMLTokenContext.WS;
298:
299: case ISP_EQ:
300: if (isWS(actChar)) {
301: state = ISP_EQ_WS;
302: break;
303: }
304: if (isName(actChar)) {
305: state = ISI_VAL;
306: break;
307: }
308: switch (actChar) {
309: case '\'':
310: state = ISI_VAL_QUOT;
311: break;
312: case '"':
313: state = ISI_VAL_DQUOT;
314: break;
315: default:
316: state = ISI_ERROR;
317: continue;
318: }
319: break;
320:
321: case ISP_EQ_WS:
322: if (isWS(actChar))
323: break; // Consume all WS
324: state = ISP_EQ;
325: return HTMLTokenContext.WS;
326:
327: case ISI_VAL:
328: if (isName(actChar))
329: break; // Consume whole value
330: state = ISP_TAG_X;
331: return HTMLTokenContext.VALUE;
332:
333: case ISI_VAL_QUOT:
334: switch (actChar) {
335: case '\'':
336: offset++;
337: state = ISP_TAG_X;
338: return HTMLTokenContext.VALUE;
339: case '&':
340: if (offset == tokenOffset) {
341: subState = state;
342: state = ISA_REF;
343: break;
344: } else {
345: return HTMLTokenContext.VALUE;
346: }
347: }
348: break; // else simply consume next char of VALUE
349:
350: case ISI_VAL_DQUOT:
351: switch (actChar) {
352: case '"':
353: offset++;
354: state = ISP_TAG_X;
355: return HTMLTokenContext.VALUE;
356: case '&':
357: if (offset == tokenOffset) {
358: subState = state;
359: state = ISA_REF;
360: break;
361: } else {
362: return HTMLTokenContext.VALUE;
363: }
364: }
365: break; // else simply consume next char of VALUE
366:
367: case ISA_SGML_ESCAPE: // DONE
368: if (isAZ(actChar)) {
369: state = ISI_SGML_DECL;
370: break;
371: }
372: switch (actChar) {
373: case '-':
374: state = ISA_SGML_DASH;
375: break;
376: default:
377: state = ISI_TEXT;
378: continue;
379: }
380: break;
381:
382: case ISA_SGML_DASH: // DONE
383: switch (actChar) {
384: case '-':
385: state = ISI_HTML_COMMENT;
386: break;
387: default:
388: state = ISI_TEXT;
389: continue;
390: }
391: break;
392:
393: case ISI_HTML_COMMENT: // DONE
394: switch (actChar) {
395: case '-':
396: state = ISA_HTML_COMMENT_DASH;
397: break;
398: }
399: break;
400:
401: case ISA_HTML_COMMENT_DASH:
402: switch (actChar) {
403: case '-':
404: state = ISI_HTML_COMMENT_WS;
405: break;
406: default:
407: state = ISI_HTML_COMMENT;
408: continue;
409: }
410: break;
411:
412: case ISI_HTML_COMMENT_WS: // DONE
413: if (isWS(actChar))
414: break; // Consume all WS
415: switch (actChar) {
416: case '>':
417: offset++;
418: state = INIT;
419: return HTMLTokenContext.BLOCK_COMMENT;
420: default:
421: state = ISI_ERROR;
422: return HTMLTokenContext.BLOCK_COMMENT;
423: }
424: // break;
425:
426: case ISI_SGML_DECL:
427: switch (actChar) {
428: case '>':
429: offset++;
430: state = INIT;
431: return HTMLTokenContext.DECLARATION;
432: case '-':
433: if (offset == tokenOffset) {
434: state = ISA_SGML_DECL_DASH;
435: break;
436: } else {
437: return HTMLTokenContext.DECLARATION;
438: }
439: }
440: break;
441:
442: case ISA_SGML_DECL_DASH:
443: if (actChar == '-') {
444: state = ISI_SGML_COMMENT;
445: break;
446: } else {
447: state = ISI_SGML_DECL;
448: continue;
449: }
450:
451: case ISI_SGML_COMMENT:
452: switch (actChar) {
453: case '-':
454: state = ISA_SGML_COMMENT_DASH;
455: break;
456: }
457: break;
458:
459: case ISA_SGML_COMMENT_DASH:
460: if (actChar == '-') {
461: offset++;
462: state = ISI_SGML_DECL;
463: return HTMLTokenContext.SGML_COMMENT;
464: } else {
465: state = ISI_SGML_COMMENT;
466: continue;
467: }
468:
469: case ISA_REF:
470: if (isAZ(actChar)) {
471: state = ISI_REF_NAME;
472: break;
473: }
474: if (actChar == '#') {
475: state = ISA_REF_HASH;
476: break;
477: }
478: state = subState;
479: continue;
480:
481: case ISI_REF_NAME:
482: if (isName(actChar))
483: break;
484: if (actChar == ';')
485: offset++;
486: state = subState;
487: return HTMLTokenContext.CHARACTER;
488:
489: case ISA_REF_HASH:
490: if (actChar >= '0' && actChar <= '9') {
491: state = ISI_REF_DEC;
492: break;
493: }
494: if (actChar == 'x' || actChar == 'X') {
495: state = ISA_REF_X;
496: break;
497: }
498: if (isAZ(actChar)) {
499: offset++;
500: state = subState;
501: return HTMLTokenContext.ERROR;
502: }
503: state = subState;
504: continue;
505:
506: case ISI_REF_DEC:
507: if (actChar >= '0' && actChar <= '9')
508: break;
509: if (actChar == ';')
510: offset++;
511: state = subState;
512: return HTMLTokenContext.CHARACTER;
513:
514: case ISA_REF_X:
515: if ((actChar >= '0' && actChar <= '9')
516: || (actChar >= 'a' && actChar <= 'f')
517: || (actChar >= 'A' && actChar <= 'F')) {
518: state = ISI_REF_HEX;
519: break;
520: }
521: state = subState;
522: return HTMLTokenContext.ERROR; // error on previous "&#x"
523: // sequence
524:
525: case ISI_REF_HEX:
526: if ((actChar >= '0' && actChar <= '9')
527: || (actChar >= 'a' && actChar <= 'f')
528: || (actChar >= 'A' && actChar <= 'F'))
529: break;
530: if (actChar == ';')
531: offset++;
532: state = subState;
533: return HTMLTokenContext.CHARACTER;
534: }
535:
536: offset = ++offset;
537: } // end of while(offset...)
538:
539: /**
540: * At this stage there's no more text in the scanned buffer. Scanner
541: * first checks whether this is completely the last available buffer.
542: */
543: if (lastBuffer) {
544: switch (state) {
545: case INIT:
546: case ISI_TEXT:
547: case ISA_LT:
548: case ISA_SLASH:
549: case ISA_SGML_ESCAPE:
550: case ISA_SGML_DASH:
551: return HTMLTokenContext.TEXT;
552:
553: case ISA_REF:
554: case ISA_REF_HASH:
555: if (subState == ISI_TEXT)
556: return HTMLTokenContext.TEXT;
557: else
558: return HTMLTokenContext.VALUE;
559:
560: case ISI_HTML_COMMENT:
561: case ISA_HTML_COMMENT_DASH:
562: case ISI_HTML_COMMENT_WS:
563: return HTMLTokenContext.BLOCK_COMMENT;
564:
565: case ISI_TAG:
566: case ISI_ENDTAG:
567: return HTMLTokenContext.TAG;
568:
569: case ISI_ARG:
570: return HTMLTokenContext.ARGUMENT;
571:
572: case ISI_ERROR:
573: return HTMLTokenContext.ERROR;
574:
575: case ISP_ARG_WS:
576: case ISP_TAG_WS:
577: case ISP_ENDTAG_WS:
578: case ISP_EQ_WS:
579: return HTMLTokenContext.WS;
580:
581: case ISP_ARG_X:
582: case ISP_TAG_X:
583: case ISP_ENDTAG_X:
584: case ISP_EQ:
585: return HTMLTokenContext.WS;
586:
587: case ISI_VAL:
588: case ISI_VAL_QUOT:
589: case ISI_VAL_DQUOT:
590: return HTMLTokenContext.VALUE;
591:
592: case ISI_SGML_DECL:
593: case ISA_SGML_DECL_DASH:
594: return HTMLTokenContext.DECLARATION;
595:
596: case ISI_SGML_COMMENT:
597: case ISA_SGML_COMMENT_DASH:
598: return HTMLTokenContext.SGML_COMMENT;
599:
600: case ISI_REF_NAME:
601: case ISI_REF_DEC:
602: case ISA_REF_X:
603: case ISI_REF_HEX:
604: return HTMLTokenContext.CHARACTER;
605: }
606: }
607:
608: return null;
609: }
610:
611: public String getStateName(int stateNumber) {
612: switch (stateNumber) {
613: case INIT:
614: return "INIT"; // NOI18N
615: case ISI_TEXT:
616: return "ISI_TEXT"; // NOI18N
617: case ISA_LT:
618: return "ISA_LT"; // NOI18N
619: case ISA_SLASH:
620: return "ISA_SLASH"; // NOI18N
621: case ISA_SGML_ESCAPE:
622: return "ISA_SGML_ESCAPE"; // NOI18N
623: case ISA_SGML_DASH:
624: return "ISA_SGML_DASH"; // NOI18N
625: case ISI_HTML_COMMENT:
626: return "ISI_HTML_COMMENT";// NOI18N
627: case ISA_HTML_COMMENT_DASH:
628: return "ISA_HTML_COMMENT_DASH";// NOI18N
629: case ISI_HTML_COMMENT_WS:
630: return "ISI_HTML_COMMENT_WS";// NOI18N
631: case ISI_TAG:
632: return "ISI_TAG";// NOI18N
633: case ISI_ENDTAG:
634: return "ISI_ENDTAG";// NOI18N
635: case ISI_ARG:
636: return "ISI_ARG";// NOI18N
637: case ISI_ERROR:
638: return "ISI_ERROR";// NOI18N
639: case ISP_ARG_WS:
640: return "ISP_ARG_WS";// NOI18N
641: case ISP_TAG_WS:
642: return "ISP_TAG_WS";// NOI18N
643: case ISP_ENDTAG_WS:
644: return "ISP_ENDTAG_WS";// NOI18N
645: case ISP_ARG_X:
646: return "ISP_ARG_X";// NOI18N
647: case ISP_TAG_X:
648: return "ISP_TAG_X";// NOI18N
649: case ISP_ENDTAG_X:
650: return "ISP_ENDTAG_X";// NOI18N
651: case ISP_EQ:
652: return "ISP_EQ";// NOI18N
653: case ISI_VAL:
654: return "ISI_VAL";// NOI18N
655: case ISI_VAL_QUOT:
656: return "ISI_VAL_QUOT";// NOI18N
657: case ISI_VAL_DQUOT:
658: return "ISI_VAL_DQUOT";// NOI18N
659: case ISI_SGML_DECL:
660: return "ISI_SGML_DECL";// NOI18N
661: case ISA_SGML_DECL_DASH:
662: return "ISA_SGML_DECL_DASH";// NOI18N
663: case ISI_SGML_COMMENT:
664: return "ISI_SGML_COMMENT";// NOI18N
665: case ISA_SGML_COMMENT_DASH:
666: return "ISA_SGML_COMMENT_DASH";// NOI18N
667: case ISA_REF:
668: return "ISA_REF";// NOI18N
669: case ISI_REF_NAME:
670: return "ISI_REF_NAME";// NOI18N
671: case ISA_REF_HASH:
672: return "ISA_REF_HASH";// NOI18N
673: case ISI_REF_DEC:
674: return "ISI_REF_DEC";// NOI18N
675: case ISA_REF_X:
676: return "ISA_REF_X";// NOI18N
677: case ISI_REF_HEX:
678: return "ISI_REF_HEX";// NOI18N
679: default:
680: return super .getStateName(stateNumber);
681: }
682: }
683:
684: /**
685: * Load valid mark state into the analyzer. Offsets are already initialized
686: * when this method is called. This method must get the state from the mark
687: * and set it to the analyzer. Then it must decrease tokenOffset by the
688: * preScan stored in the mark state.
689: *
690: * @param markState
691: * mark state to be loaded into syntax. It must be non-null
692: * value.
693: */
694: public void loadState(StateInfo stateInfo) {
695: super .loadState(stateInfo);
696: subState = ((HTMLStateInfo) stateInfo).getSubState();
697: }
698:
699: /** Store state of this analyzer into given mark state. */
700: public void storeState(StateInfo stateInfo) {
701: super .storeState(stateInfo);
702: ((HTMLStateInfo) stateInfo).setSubState(subState);
703: }
704:
705: /** Compare state of this analyzer to given state info */
706: public int compareState(StateInfo stateInfo) {
707: if (super .compareState(stateInfo) == DIFFERENT_STATE)
708: return DIFFERENT_STATE;
709: return (((HTMLStateInfo) stateInfo).getSubState() == subState) ? EQUAL_STATE
710: : DIFFERENT_STATE;
711: }
712:
713: /** Create state info appropriate for particular analyzer */
714: public StateInfo createStateInfo() {
715: return new HTMLStateInfo();
716: }
717:
718: /** Base implementation of the StateInfo interface */
719: public static class HTMLStateInfo extends Syntax.BaseStateInfo {
720:
721: /** analyzer subState during parsing character references */
722: private int subState;
723:
724: public int getSubState() {
725: return subState;
726: }
727:
728: public void setSubState(int subState) {
729: this .subState = subState;
730: }
731:
732: public String toString(Syntax syntax) {
733: return super .toString(syntax) + ", subState="
734: + syntax.getStateName(getSubState()); // NOI18N
735: }
736:
737: }
738:
739: }
|