001: /*
002: * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
003: *
004: * Copyright 1997-2007 Sun Microsystems, Inc. All rights reserved.
005: *
006: * The contents of this file are subject to the terms of either the GNU
007: * General Public License Version 2 only ("GPL") or the Common
008: * Development and Distribution License("CDDL") (collectively, the
009: * "License"). You may not use this file except in compliance with the
010: * License. You can obtain a copy of the License at
011: * http://www.netbeans.org/cddl-gplv2.html
012: * or nbbuild/licenses/CDDL-GPL-2-CP. See the License for the
013: * specific language governing permissions and limitations under the
014: * License. When distributing the software, include this License Header
015: * Notice in each file and include the License file at
016: * nbbuild/licenses/CDDL-GPL-2-CP. Sun designates this
017: * particular file as subject to the "Classpath" exception as provided
018: * by Sun in the GPL Version 2 section of the License file that
019: * accompanied this code. If applicable, add the following below the
020: * License Header, with the fields enclosed by brackets [] replaced by
021: * your own identifying information:
022: * "Portions Copyrighted [year] [name of copyright owner]"
023: *
024: * Contributor(s):
025: *
026: * The Original Software is NetBeans. The Initial Developer of the Original
027: * Software is Sun Microsystems, Inc. Portions Copyright 1997-2007 Sun
028: * Microsystems, Inc. All Rights Reserved.
029: *
030: * If you wish your version of this file to be governed by only the CDDL
031: * or only the GPL Version 2, indicate your decision by adding
032: * "[Contributor] elects to include this software in this distribution
033: * under the [CDDL or GPL Version 2] license." If you do not indicate a
034: * single choice of license, a recipient has the option to distribute
035: * your version of this file under either the CDDL, the GPL Version 2 or
036: * to extend the choice of license to its licensees as provided above.
037: * However, if you add GPL Version 2 code and therefore, elected the GPL
038: * Version 2 license, then the option applies only if the new code is
039: * made subject to such option by the copyright holder.
040: */
041: package org.netbeans.editor.ext.html.parser;
042:
043: import java.util.ArrayList;
044: import java.util.Collections;
045: import java.util.HashMap;
046: import java.util.List;
047: import java.util.Map;
048: import java.util.logging.Level;
049: import java.util.logging.Logger;
050: import javax.swing.text.BadLocationException;
051: import javax.swing.text.Document;
052: import org.netbeans.api.html.lexer.HTMLTokenId;
053: import org.netbeans.api.lexer.LanguagePath;
054: import org.netbeans.api.lexer.Token;
055: import org.netbeans.api.lexer.TokenHierarchy;
056: import org.netbeans.api.lexer.TokenHierarchyEvent;
057: import org.netbeans.api.lexer.TokenHierarchyEventType;
058: import org.netbeans.api.lexer.TokenHierarchyListener;
059: import org.netbeans.api.lexer.TokenSequence;
060: import org.netbeans.editor.BaseDocument;
061: import org.openide.util.Exceptions;
062: import org.openide.util.RequestProcessor;
063:
064: /**
065: * Simple HTML syntax analyzer
066: *
067: * @author Marek.Fukala@Sun.com
068: */
069: public final class SyntaxParser {
070:
071: private static final Logger LOGGER = Logger
072: .getLogger(SyntaxParser.class.getName());
073: private static final boolean LOG = LOGGER.isLoggable(Level.FINE);
074:
075: private static final int PARSER_DELAY = 1000; //ms (=1second)
076:
077: private final Document doc;
078: private final LanguagePath languagePath;
079: private final TokenHierarchy hi;
080: private final RequestProcessor.Task parserTask;
081: private final ArrayList<SyntaxParserListener> listeners = new ArrayList<SyntaxParserListener>();
082: private final TokenHierarchyListener tokenHierarchyListener = new TokenHierarchyListener() {
083:
084: public void tokenHierarchyChanged(TokenHierarchyEvent evt) {
085: if (evt.type() == TokenHierarchyEventType.MODIFICATION) {
086: restartParser();
087: }
088: }
089: };
090:
091: private List<SyntaxElement> EMPTY_ELEMENTS_LIST = Collections
092: .emptyList();
093: private List<SyntaxElement> parsedElements;
094: private boolean isSuccessfulyParsed = false;
095:
096: protected final ParserSource parserSource;
097:
098: /** Returns an instance of SyntaxParser for given document.
099: * The client is supposed to add a SyntaxParserListener to the obtained instance
100: * to get notification whenever the document changes and is reparsed.
101: */
102: //XXX We cannot create multiple SyntaxParser-s for various languagePaths on one document.
103: public static synchronized SyntaxParser get(Document doc,
104: LanguagePath languagePath) {
105: SyntaxParser parser = (SyntaxParser) doc
106: .getProperty(SyntaxParser.class);
107: if (parser == null) {
108: parser = new SyntaxParser(doc, languagePath);
109: doc.putProperty(SyntaxParser.class, parser);
110: }
111: return parser;
112: }
113:
114: /** Creates a new instance of SyntaxParser parsing the immutable source. */
115: public static SyntaxParser create(CharSequence source) {
116: return new SyntaxParser(source);
117: }
118:
/**
 * Builds a parser over an immutable character sequence. No document is
 * involved, so there is neither a background task nor a hierarchy listener.
 *
 * @param source text to be parsed
 */
private SyntaxParser(final CharSequence source) {
    // document-related members stay unset in this mode
    this.doc = null;
    this.parserTask = null;

    this.languagePath = LanguagePath.get(HTMLTokenId.language());
    this.hi = TokenHierarchy.create(source, HTMLTokenId.language());
    this.parsedElements = EMPTY_ELEMENTS_LIST;

    // element texts are cut directly out of the given sequence
    this.parserSource = new ParserSource() {
        public CharSequence getText(int offset, int length) throws BadLocationException {
            final int endOffset = offset + length;
            return source.subSequence(offset, endOffset);
        }
    };
}
132:
/**
 * Builds a parser bound to a document. The parser registers itself as a
 * listener on the document's token hierarchy and schedules an initial parse.
 *
 * @param document document to be parsed
 * @param languagePath language path whose token sequences are parsed
 * @throws IllegalStateException when no token hierarchy can be obtained for the document
 */
private SyntaxParser(Document document, LanguagePath languagePath) {
    this.doc = document;
    this.languagePath = languagePath;
    this.hi = TokenHierarchy.get(doc);

    if (hi == null) {
        final Object mimeProperty = doc.getProperty("mimeType"); //NOI18N
        final String mimeType = (mimeProperty == null) ? "unknown" : (String) mimeProperty;
        final String message = "Cannot obtain TokenHierarchy instance for document "
                + document + " with " + mimeType + " mimetype."; //NOI18N
        throw new IllegalStateException(message);
    }

    parsedElements = EMPTY_ELEMENTS_LIST;

    // element texts are read from the live document
    this.parserSource = new ParserSource() {
        public String getText(int offset, int length) throws BadLocationException {
            return doc.getText(offset, length);
        }
    };

    // the task only delegates to parse(); it is scheduled by restartParser()
    final Runnable parseJob = new Runnable() {
        public void run() {
            parse();
        }
    };
    parserTask = RequestProcessor.getDefault().create(parseJob);

    //add itself as token hierarchy listener
    hi.addTokenHierarchyListener(tokenHierarchyListener);

    //ensure the document is parsed
    restartParser();
}
172:
173: /** Parses the immutable source. */
174: public List<SyntaxElement> parseImmutableSource() {
175: if (doc != null) {
176: throw new IllegalStateException(
177: "Cannot explicitly parse muttable source!");
178: } else {
179: try {
180: return parseDocument();
181: } catch (BadLocationException ex) {
182: LOGGER.log(Level.WARNING,
183: "Error during parsing html content", ex);
184: return null;
185:
186: }
187: }
188: }
189:
190: //---------------------------- public methods ------------------------------
191: public void addSyntaxParserListener(SyntaxParserListener spl) {
192: listeners.add(spl);
193: }
194:
195: /** Removes the SyntaxParserListener from the listeners list.*/
196: public void removeSyntaxParserListener(SyntaxParserListener spl) {
197: listeners.remove(spl);
198: }
199:
200: //----------------------- package private methods---------------------------
201: /** used by unit tests */
202: void forceParse() {
203: parserTask.cancel();
204: parse();
205: }
206:
207: List<SyntaxElement> elements() {
208: return parsedElements;
209: }
210:
211: //---------------------------- private methods -----------------------------
212: private void restartParser() {
213: if (!parserTask.isFinished()) {
214: parserTask.cancel(); //removes the task from the queue AND INTERRUPTS the thread!
215: }
216: parserTask.schedule(PARSER_DELAY);
217: }
218:
219: private synchronized void parse() {
220: BaseDocument bdoc = (BaseDocument) doc;
221: bdoc.readLock();
222: try {
223: List<SyntaxElement> newElements = parseDocument();
224: parsedElements = newElements;
225: isSuccessfulyParsed = true;
226: } catch (BadLocationException ble) {
227: isSuccessfulyParsed = false;
228: LOGGER.log(Level.WARNING,
229: "Error during parsing html content", ble);
230: } finally {
231: bdoc.readUnlock();
232: }
233:
234: if (isSuccessfulyParsed) {
235: notifyParsingFinished();
236: }
237: }
238:
239: private void notifyParsingFinished() {
240: if (!parsedElements.isEmpty()) {
241:
242: //debug messages
243: if (LOG) {
244: for (SyntaxElement se : parsedElements) {
245: LOGGER.log(Level.FINE, se.toString());
246: System.out.println(se.toString());
247: }
248: }
249:
250: for (SyntaxParserListener spl : listeners) {
251: spl.parsingFinished(parsedElements);
252: }
253: }
254: }
255:
256: private void entityReference() {
257: elements.add(new SyntaxElement(parserSource, start, token
258: .offset(hi)
259: + token.length() - start,
260: SyntaxElement.TYPE_ENTITY_REFERENCE));
261:
262: }
263:
264: private void comment() {
265: elements.add(new SyntaxElement(parserSource, start, token
266: .offset(hi)
267: + token.length() - start, SyntaxElement.TYPE_COMMENT));
268: }
269:
270: private void declaration() {
271: elements.add(new SyntaxElement.Declaration(parserSource, start,
272: token.offset(hi) + token.length() - start,
273: root_element, doctype_public_id, doctype_file));
274: }
275:
276: private void tag(boolean emptyTag) {
277: List<SyntaxElement.TagAttribute> attributes = new ArrayList<SyntaxElement.TagAttribute>();
278: for (int i = 0; i < attr_keys.size(); i++) {
279: Token key = attr_keys.get(i);
280: List<Token> values = attr_values.get(i);
281: StringBuffer joinedValue = new StringBuffer();
282: for (Token t : values) {
283: joinedValue.append(t.text());
284: }
285:
286: Token firstValuePart = values.get(0);
287: Token lastValuePart = values.get(values.size() - 1);
288:
289: SyntaxElement.TagAttribute ta = new SyntaxElement.TagAttribute(
290: key.text().toString(), joinedValue.toString(), key
291: .offset(hi), firstValuePart.offset(hi),
292: lastValuePart.offset(hi) + lastValuePart.length()
293: - firstValuePart.offset(hi));
294: attributes.add(ta);
295: }
296:
297: elements.add(new SyntaxElement.Tag(parserSource, start, token
298: .offset(hi)
299: + token.length() - start, tagName, attributes, openTag,
300: emptyTag));
301:
302: tagName = null;
303: attrib = null;
304: attr_keys = new ArrayList<Token>();
305: attr_values = new ArrayList<List<Token>>();
306: }
307:
308: private void reset() {
309: state = S_INIT;
310: start = -1;
311: backup(1);
312: }
313:
314: private void backup(int tokens) {
315: for (int i = 0; i < tokens; i++) {
316: ts.movePrevious();
317: token = ts.token();
318: }
319: }
320:
321: private static final int S_INIT = 0;
322: private static final int S_TAG_OPEN_SYMBOL = 1;
323: private static final int S_TAG = 2;
324: private static final int S_TAG_ATTR = 3;
325: private static final int S_TAG_VALUE = 4;
326: private static final int S_COMMENT = 5;
327: private static final int S_DECLARATION = 6;
328: private static final int S_DOCTYPE_DECLARATION = 7;
329: private static final int S_DOCTYPE_AFTER_ROOT_ELEMENT = 8;
330: private static final int S_DOCTYPE_PUBLIC_ID = 9;
331: private static final int S_DOCTYPE_FILE = 10;
332:
333: private int state;
334: private int start;
335: private TokenSequence ts;
336: private Token<HTMLTokenId> token;
337: private List<SyntaxElement> elements;
338:
339: private boolean openTag = true;
340: private String tagName = null;
341: private Token attrib = null;
342: private ArrayList<Token> attr_keys = null;
343: private ArrayList<List<Token>> attr_values = null;
344:
345: private String root_element, doctype_public_id, doctype_file;
346:
347: //PENDING: we do not handle incomplete tokens yet - should be added
348: private List<SyntaxElement> parseDocument()
349: throws BadLocationException {
350: elements = new ArrayList<SyntaxElement>();
351: List<TokenSequence<HTMLTokenId>> sequences = hi
352: .tokenSequenceList(languagePath, 0, Integer.MAX_VALUE);
353: state = S_INIT;
354: start = -1;
355: attr_keys = new ArrayList<Token>();
356: attr_values = new ArrayList<List<Token>>();
357:
358: for (TokenSequence _ts : sequences) {
359: ts = _ts;
360: while (ts.moveNext()) {
361: token = ts.token();
362: HTMLTokenId id = token.id();
363:
364: switch (state) {
365: case S_INIT:
366: switch (id) {
367: case CHARACTER:
368: start = ts.offset();
369: entityReference();
370: state = S_INIT;
371: start = -1;
372: break;
373: case TAG_OPEN_SYMBOL:
374: start = ts.offset();
375: state = S_TAG_OPEN_SYMBOL;
376: break;
377: case BLOCK_COMMENT:
378: start = ts.offset();
379: state = S_COMMENT;
380: break;
381: case DECLARATION:
382: start = ts.offset();
383: if (token.text().toString().equals("<!DOCTYPE")) {
384: root_element = null;
385: doctype_public_id = null;
386: doctype_file = null;
387: state = S_DOCTYPE_DECLARATION;
388: } else {
389: state = S_DECLARATION;
390: }
391: break;
392: }
393: break;
394:
395: case S_TAG_OPEN_SYMBOL:
396: switch (id) {
397: case TAG_OPEN:
398: state = S_TAG;
399: openTag = true;
400: tagName = token.text().toString();
401: break;
402: case TAG_CLOSE:
403: state = S_TAG;
404: openTag = false;
405: tagName = token.text().toString();
406: break;
407: default:
408: reset(); //error
409: break;
410: }
411: break;
412:
413: case S_TAG:
414: switch (id) {
415: case WS:
416: case EOL:
417: case ERROR:
418: break;
419: case ARGUMENT:
420: state = S_TAG_ATTR;
421: attrib = token;
422: break;
423: case TAG_CLOSE_SYMBOL:
424: boolean emptyTag = "/>".equals(token.text()
425: .toString());
426: tag(emptyTag);
427: state = S_INIT;
428: start = -1;
429: break;
430: default:
431: reset(); //error
432: break;
433: }
434: break;
435:
436: case S_TAG_ATTR:
437: switch (id) {
438: case OPERATOR:
439: case WS:
440: break;
441: case VALUE:
442: backup(1); //backup the value
443: state = S_TAG_VALUE;
444: break;
445: default:
446: reset(); //error
447: break;
448: }
449: break;
450:
451: case S_TAG_VALUE:
452: switch (id) {
453: case VALUE:
454: int index = attr_keys.indexOf(attrib);
455: if (index == -1) {
456: List<Token> values = new ArrayList<Token>();
457: values.add(token);
458: attr_keys.add(attrib);
459: attr_values.add(values);
460: } else {
461: attr_values.get(index).add(token);
462: }
463:
464: break;
465: default:
466: backup(1);
467: state = S_TAG;
468: break;
469: }
470: break;
471:
472: case S_COMMENT:
473: switch (id) {
474: case BLOCK_COMMENT:
475: case EOL:
476: case WS:
477: break;
478: default:
479: backup(1);
480: comment();
481: state = S_INIT;
482: start = -1;
483: break;
484: }
485: break;
486:
487: case S_DECLARATION:
488: switch (id) {
489: case DECLARATION:
490: case SGML_COMMENT:
491: case EOL:
492: case WS:
493: break;
494: default:
495: backup(1);
496: declaration();
497: state = S_INIT;
498: start = -1;
499: break;
500: }
501: break;
502:
503: case S_DOCTYPE_DECLARATION:
504: switch (id) {
505: case DECLARATION:
506: root_element = token.text().toString();
507: state = S_DOCTYPE_AFTER_ROOT_ELEMENT;
508: break;
509: case SGML_COMMENT:
510: case EOL:
511: case WS:
512: break;
513: default:
514: backup(1);
515: declaration();
516: state = S_INIT;
517: start = -1;
518: break;
519: }
520: break;
521:
522: case S_DOCTYPE_AFTER_ROOT_ELEMENT:
523: switch (id) {
524: case DECLARATION:
525: if (token.text().toString().equals("PUBLIC")) {
526: state = S_DOCTYPE_PUBLIC_ID;
527: break;
528: } else if (token.text().toString().equals(
529: "SYSTEM")) {
530: state = S_DOCTYPE_FILE;
531: break;
532: }
533: //not of the expected
534: backup(1);
535: declaration();
536: state = S_INIT;
537: start = -1;
538:
539: break;
540: case SGML_COMMENT:
541: case EOL:
542: case WS:
543: break;
544: default:
545: backup(1);
546: declaration();
547: state = S_INIT;
548: start = -1;
549: break;
550: }
551: break;
552:
553: case S_DOCTYPE_PUBLIC_ID:
554: switch (id) {
555: case DECLARATION:
556: doctype_public_id = token.text().toString();
557: state = S_DOCTYPE_FILE;
558: break;
559: case SGML_COMMENT:
560: case EOL:
561: case WS:
562: break;
563: default:
564: backup(1);
565: declaration();
566: state = S_INIT;
567: start = -1;
568: break;
569: }
570: break;
571:
572: case S_DOCTYPE_FILE:
573: switch (id) {
574: case DECLARATION:
575: doctype_file = token.text().toString();
576: //jump to simple sgml declaration so potentially
577: //other declaration tokens are inluded
578: state = S_DECLARATION;
579: break;
580: case SGML_COMMENT:
581: case EOL:
582: case WS:
583: break;
584: default:
585: backup(1);
586: declaration();
587: state = S_INIT;
588: start = -1;
589: break;
590: }
591: break;
592:
593: }
594: }
595: }
596:
597: if (state != S_INIT) {
598: //an incomplete syntax element at the end of the file
599: switch (state) {
600: case S_COMMENT:
601: comment();
602: break;
603: case S_DECLARATION:
604: case S_DOCTYPE_AFTER_ROOT_ELEMENT:
605: case S_DOCTYPE_DECLARATION:
606: case S_DOCTYPE_FILE:
607: case S_DOCTYPE_PUBLIC_ID:
608: declaration();
609: break;
610: }
611:
612: }
613:
614: return elements;
615:
616: }
617:
618: }
|