0001: /*
0002: * Java HTML Tidy - JTidy
0003: * HTML parser and pretty printer
0004: *
0005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0006: * Institute of Technology, Institut National de Recherche en
0007: * Informatique et en Automatique, Keio University). All Rights
0008: * Reserved.
0009: *
0010: * Contributing Author(s):
0011: *
0012: * Dave Raggett <dsr@w3.org>
0013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0014: * Gary L Peskin <garyp@firstech.com> (Java development)
0015: * Sami Lempinen <sami@lempinen.net> (release management)
0016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
0017: *
0018: * The contributing author(s) would like to thank all those who
0019: * helped with testing, bug fixes, and patience. This wouldn't
0020: * have been possible without all of you.
0021: *
0022: * COPYRIGHT NOTICE:
0023: *
0024: * This software and documentation is provided "as is," and
0025: * the copyright holders and contributing author(s) make no
0026: * representations or warranties, express or implied, including
0027: * but not limited to, warranties of merchantability or fitness
0028: * for any particular purpose or that the use of the software or
0029: * documentation will not infringe any third party patents,
0030: * copyrights, trademarks or other rights.
0031: *
0032: * The copyright holders and contributing author(s) will not be
0033: * liable for any direct, indirect, special or consequential damages
0034: * arising out of any use of the software or documentation, even if
0035: * advised of the possibility of such damage.
0036: *
0037: * Permission is hereby granted to use, copy, modify, and distribute
0038: * this source code, or portions hereof, documentation and executables,
0039: * for any purpose, without fee, subject to the following restrictions:
0040: *
0041: * 1. The origin of this source code must not be misrepresented.
0042: * 2. Altered versions must be plainly marked as such and must
0043: * not be misrepresented as being the original source.
0044: * 3. This Copyright notice may not be removed or altered from any
0045: * source or altered source distribution.
0046: *
0047: * The copyright holders and contributing author(s) specifically
0048: * permit, without fee, and encourage the use of this source code
0049: * as a component for supporting the Hypertext Markup Language in
0050: * commercial products. If you use this source code in a product,
0051: * acknowledgment is not required but would be appreciated.
0052: *
0053: */
0054: package org.w3c.tidy;
0055:
0056: /**
0057: * Used for elements and text nodes. The element name is null for text nodes. Start and end are offsets into lexbuf, which
0058: * contains the textual content of all elements in the parse tree. Parent and content allow traversal of the parse tree
0059: * in any direction. Attributes are represented as a linked list of AttVal nodes which hold the strings for
0060: * attribute/value pairs.
0061: * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
0062: * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
0063: * @author Fabrizio Giustina
0064: * @version $Revision: 1.31 $ ($Author: fgiust $)
0065: */
0066: public class Node implements Cloneable {
0067:
0068: /**
0069: * node type: root of the parse tree.
0070: */
0071: public static final short ROOT_NODE = 0;
0072:
0073: /**
0074: * node type: doctype declaration.
0075: */
0076: public static final short DOCTYPE_TAG = 1;
0077:
0078: /**
0079: * node type: comment.
0080: */
0081: public static final short COMMENT_TAG = 2;
0082:
0083: /**
0084: * node type: processing instruction.
0085: */
0086: public static final short PROC_INS_TAG = 3;
0087:
0088: /**
0089: * node type: text.
0090: */
0091: public static final short TEXT_NODE = 4;
0092:
0093: /**
0094: * node type: start tag.
0095: */
0096: public static final short START_TAG = 5;
0097:
0098: /**
0099: * node type: end tag.
0100: */
0101: public static final short END_TAG = 6;
0102:
0103: /**
0104: * node type: start-end tag, i.e. a tag written with a trailing slash before the closing bracket.
0105: */
0106: public static final short START_END_TAG = 7;
0107:
0108: /**
0109: * node type: CDATA.
0110: */
0111: public static final short CDATA_TAG = 8;
0112:
0113: /**
0114: * node type: section tag.
0115: */
0116: public static final short SECTION_TAG = 9;
0117:
0118: /**
0119: * node type: asp tag.
0120: */
0121: public static final short ASP_TAG = 10;
0122:
0123: /**
0124: * node type: jste tag.
0125: */
0126: public static final short JSTE_TAG = 11;
0127:
0128: /**
0129: * node type: php tag.
0130: */
0131: public static final short PHP_TAG = 12;
0132:
0133: /**
0134: * node type: xml declaration.
0135: */
0136: public static final short XML_DECL = 13;
0137:
0138: /**
0139: * Description for all the node types. Used in toString.
0140: */
0141: private static final String[] NODETYPE_STRING = { "RootNode",
0142: "DocTypeTag", "CommentTag", "ProcInsTag", "TextNode",
0143: "StartTag", "EndTag", "StartEndTag", "SectionTag",
0144: "AspTag", "PhpTag", "XmlDecl" };
0145:
0146: /**
0147: * parent node.
0148: */
0149: protected Node parent;
0150:
0151: /**
0152: * previous sibling node.
0153: */
0154: protected Node prev;
0155:
0156: /**
0157: * next sibling node.
0158: */
0159: protected Node next;
0160:
0161: /**
0162: * last child node.
0163: */
0164: protected Node last;
0165:
0166: /**
0167: * start of span onto text array.
0168: */
0169: protected int start;
0170:
0171: /**
0172: * end of span onto text array (exclusive).
0173: */
0174: protected int end;
0175:
0176: /**
0177: * the text array.
0178: */
0179: protected byte[] textarray;
0180:
0181: /**
0182: * node type: one of the node type constants (TEXT_NODE, START_TAG, END_TAG etc.).
0183: */
0184: protected short type;
0185:
0186: /**
0187: * true if closed by explicit end tag.
0188: */
0189: protected boolean closed;
0190:
0191: /**
0192: * true if inferred.
0193: */
0194: protected boolean implicit;
0195:
0196: /**
0197: * true if followed by a line break.
0198: */
0199: protected boolean linebreak;
0200:
0201: /**
0202: * old tag when it was changed.
0203: */
0204: protected Dict was;
0205:
0206: /**
0207: * tag's dictionary definition.
0208: */
0209: protected Dict tag;
0210:
0211: /**
0212: * Tag name; null for text nodes.
0213: */
0214: protected String element;
0215:
0216: /**
0217: * Attribute/Value linked list (head of the list).
0218: */
0219: protected AttVal attributes;
0220:
0221: /**
0222: * First child node.
0223: */
0224: protected Node content;
0225:
0226: /**
0227: * DOM adapter.
0228: */
0229: protected org.w3c.dom.Node adapter;
0230:
/**
 * Creates an empty text node: the type is {@link #TEXT_NODE}, there is no text array and the span is 0..0.
 */
public Node() {
    this(TEXT_NODE, null, 0, 0);
}
0237:
/**
 * Creates a detached node of the given type covering a span of a text array.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes holding the text of the node
 * @param start offset of the first byte of the node's text
 * @param end offset just past the last byte of the node's text
 */
public Node(short type, byte[] textarray, int start, int end) {
    // what the node is and which bytes it covers
    this.type = type;
    this.textarray = textarray;
    this.start = start;
    this.end = end;

    // a fresh node is not linked into any tree
    this.parent = null;
    this.prev = null;
    this.next = null;
    this.content = null;
    this.last = null;

    // no tag information yet
    this.element = null;
    this.tag = null;
    this.was = null;
    this.attributes = null;

    // all flags start cleared
    this.closed = false;
    this.implicit = false;
    this.linebreak = false;
}
0265:
/**
 * Creates a detached node with an element name. For start, start-end and end tags the tag table is asked to look up
 * the tag definition for the new node.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes holding the text of the node
 * @param start offset of the first byte of the node's text
 * @param end offset just past the last byte of the node's text
 * @param element tag name
 * @param tt tag table instance; only used when type is a start, start-end or end tag
 */
public Node(short type, byte[] textarray, int start, int end, String element, TagTable tt) {
    // common initialisation is shared with the four-argument constructor
    this(type, textarray, start, end);
    this.element = element;

    switch (type) {
        case START_TAG:
        case START_END_TAG:
        case END_TAG:
            // only real tags have a dictionary entry to look up
            tt.findTag(this);
            break;
        default:
            break;
    }
}
0300:
0301: /**
0302: * Used to clone heading nodes when split by an hr.
0303: * @see java.lang.Object#clone()
0304: */
0305: protected Object clone() {
0306: Node node;
0307: try {
0308: node = (Node) super .clone();
0309: } catch (CloneNotSupportedException e) {
0310: // should never happen
0311: throw new RuntimeException("CloneNotSupportedException "
0312: + e.getMessage());
0313: }
0314: if (this .textarray != null) {
0315: node.textarray = new byte[this .end - this .start];
0316: node.start = 0;
0317: node.end = this .end - this .start;
0318: if (node.end > 0) {
0319: System.arraycopy(this .textarray, this .start,
0320: node.textarray, node.start, node.end);
0321: }
0322: }
0323: if (this .attributes != null) {
0324: node.attributes = (AttVal) this .attributes.clone();
0325: }
0326: return node;
0327: }
0328:
0329: /**
0330: * Returns an attribute with the given name in the current node.
0331: * @param name attribute name.
0332: * @return AttVal instance or null if no attribute with the iven name is found
0333: */
0334: public AttVal getAttrByName(String name) {
0335: AttVal attr;
0336:
0337: for (attr = this .attributes; attr != null; attr = attr.next) {
0338: if (name != null && attr.attribute != null
0339: && attr.attribute.equals(name)) {
0340: break;
0341: }
0342: }
0343:
0344: return attr;
0345: }
0346:
0347: /**
0348: * Default method for checking an element's attributes.
0349: * @param lexer Lexer
0350: */
0351: public void checkAttributes(Lexer lexer) {
0352: AttVal attval;
0353:
0354: for (attval = this .attributes; attval != null; attval = attval.next) {
0355: attval.checkAttribute(lexer, this );
0356: }
0357: }
0358:
0359: /**
0360: * The same attribute name can't be used more than once in each element. Discard or join attributes according to
0361: * configuration.
0362: * @param lexer Lexer
0363: */
0364: public void repairDuplicateAttributes(Lexer lexer) {
0365: AttVal attval;
0366:
0367: for (attval = this .attributes; attval != null;) {
0368: if (attval.asp == null && attval.php == null) {
0369: AttVal current;
0370:
0371: for (current = attval.next; current != null;) {
0372: if (current.asp == null
0373: && current.php == null
0374: && attval.attribute != null
0375: && attval.attribute
0376: .equalsIgnoreCase(current.attribute)) {
0377: AttVal temp;
0378:
0379: if ("class".equalsIgnoreCase(current.attribute)
0380: && lexer.configuration.joinClasses) {
0381: // concatenate classes
0382: current.value = current.value + " "
0383: + attval.value;
0384:
0385: temp = attval.next;
0386:
0387: if (temp.next == null) {
0388: current = null;
0389: } else {
0390: current = current.next;
0391: }
0392:
0393: lexer.report.attrError(lexer, this , attval,
0394: Report.JOINING_ATTRIBUTE);
0395:
0396: removeAttribute(attval);
0397: attval = temp;
0398: } else if ("style"
0399: .equalsIgnoreCase(current.attribute)
0400: && lexer.configuration.joinStyles) {
0401: // concatenate styles
0402:
0403: // this doesn't handle CSS comments and leading/trailing white-space very well see
0404: // http://www.w3.org/TR/css-style-attr
0405:
0406: int end = current.value.length() - 1;
0407:
0408: if (current.value.charAt(end) == ';') {
0409: // attribute ends with declaration seperator
0410: current.value = current.value + " "
0411: + attval.value;
0412: } else if (current.value.charAt(end) == '}') {
0413: // attribute ends with rule set
0414: current.value = current.value + " { "
0415: + attval.value + " }";
0416: } else {
0417: // attribute ends with property value
0418: current.value = current.value + "; "
0419: + attval.value;
0420: }
0421:
0422: temp = attval.next;
0423:
0424: if (temp.next == null) {
0425: current = null;
0426: } else {
0427: current = current.next;
0428: }
0429:
0430: lexer.report.attrError(lexer, this , attval,
0431: Report.JOINING_ATTRIBUTE);
0432:
0433: removeAttribute(attval);
0434: attval = temp;
0435:
0436: } else if (lexer.configuration.duplicateAttrs == Configuration.KEEP_LAST) {
0437: temp = current.next;
0438:
0439: lexer.report.attrError(lexer, this ,
0440: current, Report.REPEATED_ATTRIBUTE);
0441:
0442: removeAttribute(current);
0443: current = temp;
0444: } else {
0445: temp = attval.next;
0446:
0447: if (attval.next == null) {
0448: current = null;
0449: } else {
0450: current = current.next;
0451: }
0452:
0453: lexer.report.attrError(lexer, this , attval,
0454: Report.REPEATED_ATTRIBUTE);
0455:
0456: removeAttribute(attval);
0457: attval = temp;
0458: }
0459: } else {
0460: current = current.next;
0461: }
0462: }
0463: attval = attval.next;
0464: } else {
0465: attval = attval.next;
0466: }
0467: }
0468: }
0469:
0470: /**
0471: * Adds an attribute to the node.
0472: * @param name attribute name
0473: * @param value attribute value
0474: */
0475: public void addAttribute(String name, String value) {
0476: AttVal av = new AttVal(null, null, null, null, '"', name, value);
0477: av.dict = AttributeTable.getDefaultAttributeTable()
0478: .findAttribute(av);
0479:
0480: if (this .attributes == null) {
0481: this .attributes = av;
0482: } else {
0483: // append to end of attributes
0484: AttVal here = this .attributes;
0485:
0486: while (here.next != null) {
0487: here = here.next;
0488: }
0489:
0490: here.next = av;
0491: }
0492: }
0493:
0494: /**
0495: * Remove an attribute from node and then free it.
0496: * @param attr attribute to remove
0497: */
0498: public void removeAttribute(AttVal attr) {
0499: AttVal av;
0500: AttVal prev = null;
0501: AttVal next;
0502:
0503: for (av = this .attributes; av != null; av = next) {
0504: next = av.next;
0505:
0506: if (av == attr) {
0507: if (prev != null) {
0508: prev.next = next;
0509: } else {
0510: this .attributes = next;
0511: }
0512: } else {
0513: prev = av;
0514: }
0515: }
0516: }
0517:
0518: /**
0519: * Find the doctype element.
0520: * @return doctype node or null if not found
0521: */
0522: public Node findDocType() {
0523: Node node = this .content;
0524:
0525: while (node != null && node.type != DOCTYPE_TAG) {
0526: node = node.next;
0527: }
0528:
0529: return node;
0530: }
0531:
0532: /**
0533: * Discard the doctype node.
0534: */
0535: public void discardDocType() {
0536: Node node;
0537:
0538: node = findDocType();
0539: if (node != null) {
0540: if (node.prev != null) {
0541: node.prev.next = node.next;
0542: } else {
0543: node.parent.content = node.next;
0544: }
0545:
0546: if (node.next != null) {
0547: node.next.prev = node.prev;
0548: }
0549:
0550: node.next = null;
0551: }
0552: }
0553:
0554: /**
0555: * Remove node from markup tree and discard it.
0556: * @param element discarded node
0557: * @return next node
0558: */
0559: public static Node discardElement(Node element) {
0560: Node next = null;
0561:
0562: if (element != null) {
0563: next = element.next;
0564: element.removeNode();
0565: }
0566:
0567: return next;
0568: }
0569:
0570: /**
0571: * Insert a node into markup tree.
0572: * @param node to insert
0573: */
0574: public void insertNodeAtStart(Node node) {
0575: node.parent = this ;
0576:
0577: if (this .content == null) {
0578: this .last = node;
0579: } else {
0580: this .content.prev = node; // AQ added 13 Apr 2000
0581: }
0582:
0583: node.next = this .content;
0584: node.prev = null;
0585: this .content = node;
0586: }
0587:
0588: /**
0589: * Insert node into markup tree.
0590: * @param node Node to insert
0591: */
0592: public void insertNodeAtEnd(Node node) {
0593: node.parent = this ;
0594: node.prev = this .last;
0595:
0596: if (this .last != null) {
0597: this .last.next = node;
0598: } else {
0599: this .content = node;
0600: }
0601:
0602: this .last = node;
0603: }
0604:
0605: /**
0606: * Insert node into markup tree in pace of element which is moved to become the child of the node.
0607: * @param element child node. Will be inserted as a child of element
0608: * @param node parent node
0609: */
0610: public static void insertNodeAsParent(Node element, Node node) {
0611: node.content = element;
0612: node.last = element;
0613: node.parent = element.parent;
0614: element.parent = node;
0615:
0616: if (node.parent.content == element) {
0617: node.parent.content = node;
0618: }
0619:
0620: if (node.parent.last == element) {
0621: node.parent.last = node;
0622: }
0623:
0624: node.prev = element.prev;
0625: element.prev = null;
0626:
0627: if (node.prev != null) {
0628: node.prev.next = node;
0629: }
0630:
0631: node.next = element.next;
0632: element.next = null;
0633:
0634: if (node.next != null) {
0635: node.next.prev = node;
0636: }
0637: }
0638:
0639: /**
0640: * Insert node into markup tree before element.
0641: * @param element child node. Will be insertedbefore element
0642: * @param node following node
0643: */
0644: public static void insertNodeBeforeElement(Node element, Node node) {
0645: Node parent;
0646:
0647: parent = element.parent;
0648: node.parent = parent;
0649: node.next = element;
0650: node.prev = element.prev;
0651: element.prev = node;
0652:
0653: if (node.prev != null) {
0654: node.prev.next = node;
0655: }
0656:
0657: if (parent != null && parent.content == element) {
0658: parent.content = node;
0659: }
0660: }
0661:
0662: /**
0663: * Insert node into markup tree after element.
0664: * @param node new node to insert
0665: */
0666: public void insertNodeAfterElement(Node node) {
0667: Node parent;
0668:
0669: parent = this .parent;
0670: node.parent = parent;
0671:
0672: // AQ - 13Jan2000 fix for parent == null
0673: if (parent != null && parent.last == this ) {
0674: parent.last = node;
0675: } else {
0676: node.next = this .next;
0677: // AQ - 13Jan2000 fix for node.next == null
0678: if (node.next != null) {
0679: node.next.prev = node;
0680: }
0681: }
0682:
0683: this .next = node;
0684: node.prev = this ;
0685: }
0686:
0687: /**
0688: * Trim an empty element.
0689: * @param lexer Lexer
0690: * @param element empty node to be removed
0691: */
0692: public static void trimEmptyElement(Lexer lexer, Node element) {
0693: // don't trim if user explicitely set trim-empty-elements to false
0694: // empty element can be needed in css sites
0695: if (lexer.configuration.trimEmpty) {
0696: TagTable tt = lexer.configuration.tt;
0697:
0698: if (lexer.canPrune(element)) {
0699: if (element.type != TEXT_NODE) {
0700: lexer.report.warning(lexer, element, null,
0701: Report.TRIM_EMPTY_ELEMENT);
0702: }
0703:
0704: discardElement(element);
0705: } else if (element.tag == tt.tagP
0706: && element.content == null) {
0707: // replace <p></p> by <br><br> to preserve formatting
0708: Node node = lexer.inferredTag("br");
0709: Node.coerceNode(lexer, element, tt.tagBr);
0710: element.insertNodeAfterElement(node);
0711: }
0712: }
0713: }
0714:
0715: /**
0716: * This maps <em> hello </em> <strong>world </strong> to <em> hello </em> <strong>world </strong>. If last child of
0717: * element is a text node then trim trailing white space character moving it to after element's end tag.
0718: * @param lexer Lexer
0719: * @param element node
0720: * @param last last child of element
0721: */
0722: public static void trimTrailingSpace(Lexer lexer, Node element,
0723: Node last) {
0724: byte c;
0725: TagTable tt = lexer.configuration.tt;
0726:
0727: if (last != null && last.type == Node.TEXT_NODE) {
0728: if (last.end > last.start)
0729:
0730: {
0731: c = lexer.lexbuf[last.end - 1];
0732:
0733: if (c == 160 || c == (byte) ' ') {
0734: // take care with <td> </td>
0735: // fix for [435920]
0736: if (c == 160
0737: && (element.tag == tt.tagTd || element.tag == tt.tagTh)) {
0738: if (last.end > last.start + 1) {
0739: last.end -= 1;
0740: }
0741: } else {
0742: last.end -= 1;
0743:
0744: if (TidyUtils.toBoolean(element.tag.model
0745: & Dict.CM_INLINE)
0746: && !TidyUtils
0747: .toBoolean(element.tag.model
0748: & Dict.CM_FIELD)) {
0749: lexer.insertspace = true;
0750: }
0751: }
0752: }
0753: }
0754: // if empty string then delete from parse tree
0755: if (last.start == last.end) // COMMENT_NBSP_FIX: && tag != tag_td && tag != tag_th
0756: {
0757: trimEmptyElement(lexer, last);
0758: }
0759: }
0760: }
0761:
0762: /**
0763: * Escapes the given tag.
0764: * @param lexer Lexer
0765: * @param element node to be escaped
0766: * @return escaped node
0767: */
0768: protected static Node escapeTag(Lexer lexer, Node element) {
0769: Node node = lexer.newNode();
0770: node.start = lexer.lexsize;
0771: node.textarray = element.textarray; // @todo check it
0772: lexer.addByte('<');
0773:
0774: if (element.type == END_TAG) {
0775: lexer.addByte('/');
0776: }
0777:
0778: if (element.element != null) {
0779: lexer.addStringLiteral(element.element);
0780: } else if (element.type == DOCTYPE_TAG) {
0781: int i;
0782:
0783: lexer.addByte('!');
0784: lexer.addByte('D');
0785: lexer.addByte('O');
0786: lexer.addByte('C');
0787: lexer.addByte('T');
0788: lexer.addByte('Y');
0789: lexer.addByte('P');
0790: lexer.addByte('E');
0791: lexer.addByte(' ');
0792:
0793: for (i = element.start; i < element.end; ++i) {
0794: lexer.addByte(lexer.lexbuf[i]);
0795: }
0796: }
0797:
0798: if (element.type == START_END_TAG) {
0799: lexer.addByte('/');
0800: }
0801:
0802: lexer.addByte('>');
0803: node.end = lexer.lexsize;
0804:
0805: return node;
0806: }
0807:
0808: /**
0809: * Is the node content empty or blank? Assumes node is a text node.
0810: * @param lexer Lexer
0811: * @return <code>true</code> if the node content empty or blank
0812: */
0813: public boolean isBlank(Lexer lexer) {
0814: if (this .type == TEXT_NODE) {
0815: if (this .end == this .start) {
0816: return true;
0817: }
0818: if (this .end == this .start + 1
0819: && lexer.lexbuf[this .end - 1] == ' ') {
0820: return true;
0821: }
0822: }
0823: return false;
0824: }
0825:
0826: /**
0827: * This maps <code><p> hello <em> world </em></code> to <code><p> hello <em> world </em></code>.
0828: * Trims initial space, by moving it before the start tag, or if this element is the first in parent's content, then
0829: * by discarding the space.
0830: * @param lexer Lexer
0831: * @param element parent node
0832: * @param text text node
0833: */
0834: public static void trimInitialSpace(Lexer lexer, Node element,
0835: Node text) {
0836: Node prev, node;
0837:
0838: // #427677 - fix by Gary Peskin 31 Oct 00
0839: if (text.type == TEXT_NODE
0840: && text.textarray[text.start] == (byte) ' '
0841: && (text.start < text.end)) {
0842: if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE)
0843: && !TidyUtils.toBoolean(element.tag.model
0844: & Dict.CM_FIELD)
0845: && element.parent.content != element) {
0846: prev = element.prev;
0847:
0848: if (prev != null && prev.type == TEXT_NODE) {
0849: if (prev.textarray[prev.end - 1] != (byte) ' ') {
0850: prev.textarray[prev.end++] = (byte) ' ';
0851: }
0852:
0853: ++element.start;
0854: } else {
0855: // create new node
0856: node = lexer.newNode();
0857: // Local fix for bug 228486 (GLP). This handles the case
0858: // where we need to create a preceeding text node but there are
0859: // no "slots" in textarray that we can steal from the current
0860: // element. Therefore, we create a new textarray containing
0861: // just the blank. When Tidy is fixed, this should be removed.
0862: if (element.start >= element.end) {
0863: node.start = 0;
0864: node.end = 1;
0865: node.textarray = new byte[1];
0866: } else {
0867: node.start = element.start++;
0868: node.end = element.start;
0869: node.textarray = element.textarray;
0870: }
0871: node.textarray[node.start] = (byte) ' ';
0872: node.prev = prev;
0873: if (prev != null) {
0874: prev.next = node;
0875: }
0876: node.next = element;
0877: element.prev = node;
0878: node.parent = element.parent;
0879: }
0880: }
0881:
0882: // discard the space in current node
0883: ++text.start;
0884: }
0885: }
0886:
0887: /**
0888: * Move initial and trailing space out. This routine maps: hello <em> world </em> to hello <em> world </em> and
0889: * <em> hello </em> <strong>world </strong> to <em> hello </em> <strong>world </strong>.
0890: * @param lexer Lexer
0891: * @param element Node
0892: */
0893: public static void trimSpaces(Lexer lexer, Node element) {
0894: Node text = element.content;
0895: TagTable tt = lexer.configuration.tt;
0896:
0897: if (text != null && text.type == Node.TEXT_NODE
0898: && element.tag != tt.tagPre) {
0899: trimInitialSpace(lexer, element, text);
0900: }
0901:
0902: text = element.last;
0903:
0904: if (text != null && text.type == Node.TEXT_NODE) {
0905: trimTrailingSpace(lexer, element, text);
0906: }
0907: }
0908:
0909: /**
0910: * Is this node contained in a given tag?
0911: * @param tag descendant tag
0912: * @return <code>true</code> if node is contained in tag
0913: */
0914: public boolean isDescendantOf(Dict tag) {
0915: Node parent;
0916:
0917: for (parent = this .parent; parent != null; parent = parent.parent) {
0918: if (parent.tag == tag) {
0919: return true;
0920: }
0921: }
0922:
0923: return false;
0924: }
0925:
0926: /**
0927: * The doctype has been found after other tags, and needs moving to before the html element.
0928: * @param lexer Lexer
0929: * @param element document
0930: * @param doctype doctype node to insert at the beginning of element
0931: */
0932: public static void insertDocType(Lexer lexer, Node element,
0933: Node doctype) {
0934: TagTable tt = lexer.configuration.tt;
0935:
0936: lexer.report.warning(lexer, element, doctype,
0937: Report.DOCTYPE_AFTER_TAGS);
0938:
0939: while (element.tag != tt.tagHtml) {
0940: element = element.parent;
0941: }
0942:
0943: insertNodeBeforeElement(element, doctype);
0944: }
0945:
0946: /**
0947: * Find the body node.
0948: * @param tt tag table
0949: * @return body node
0950: */
0951: public Node findBody(TagTable tt) {
0952: Node node;
0953:
0954: node = this .content;
0955:
0956: while (node != null && node.tag != tt.tagHtml) {
0957: node = node.next;
0958: }
0959:
0960: if (node == null) {
0961: return null;
0962: }
0963:
0964: node = node.content;
0965:
0966: while (node != null && node.tag != tt.tagBody
0967: && node.tag != tt.tagFrameset) {
0968: node = node.next;
0969: }
0970:
0971: if (node.tag == tt.tagFrameset) {
0972: node = node.content;
0973:
0974: while (node != null && node.tag != tt.tagNoframes) {
0975: node = node.next;
0976: }
0977:
0978: if (node != null) {
0979: node = node.content;
0980: while (node != null && node.tag != tt.tagBody) {
0981: node = node.next;
0982: }
0983: }
0984: }
0985:
0986: return node;
0987: }
0988:
0989: /**
0990: * Is the node an element?
0991: * @return <code>true</code> if type is START_TAG | START_END_TAG
0992: */
0993: public boolean isElement() {
0994: return (this .type == START_TAG || this .type == START_END_TAG ? true
0995: : false);
0996: }
0997:
0998: /**
0999: * Unexpected content in table row is moved to just before the table in accordance with Netscape and IE. This code
1000: * assumes that node hasn't been inserted into the row.
1001: * @param row Row node
1002: * @param node Node which should be moved before the table
1003: * @param tt tag table
1004: */
1005: public static void moveBeforeTable(Node row, Node node, TagTable tt) {
1006: Node table;
1007:
1008: /* first find the table element */
1009: for (table = row.parent; table != null; table = table.parent) {
1010: if (table.tag == tt.tagTable) {
1011: if (table.parent.content == table) {
1012: table.parent.content = node;
1013: }
1014:
1015: node.prev = table.prev;
1016: node.next = table;
1017: table.prev = node;
1018: node.parent = table.parent;
1019:
1020: if (node.prev != null) {
1021: node.prev.next = node;
1022: }
1023:
1024: break;
1025: }
1026: }
1027: }
1028:
1029: /**
1030: * If a table row is empty then insert an empty cell.This practice is consistent with browser behavior and avoids
1031: * potential problems with row spanning cells.
1032: * @param lexer Lexer
1033: * @param row row node
1034: */
1035: public static void fixEmptyRow(Lexer lexer, Node row) {
1036: Node cell;
1037:
1038: if (row.content == null) {
1039: cell = lexer.inferredTag("td");
1040: row.insertNodeAtEnd(cell);
1041: lexer.report.warning(lexer, row, cell,
1042: Report.MISSING_STARTTAG);
1043: }
1044: }
1045:
1046: /**
1047: * Coerce a node.
1048: * @param lexer Lexer
1049: * @param node Node
1050: * @param tag tag dictionary reference
1051: */
1052: public static void coerceNode(Lexer lexer, Node node, Dict tag) {
1053: Node tmp = lexer.inferredTag(tag.name);
1054: lexer.report.warning(lexer, node, tmp, Report.OBSOLETE_ELEMENT);
1055: node.was = node.tag;
1056: node.tag = tag;
1057: node.type = START_TAG;
1058: node.implicit = true;
1059: node.element = tag.name;
1060: }
1061:
1062: /**
1063: * Extract this node and its children from a markup tree.
1064: */
1065: public void removeNode() {
1066: if (this .prev != null) {
1067: this .prev.next = this .next;
1068: }
1069:
1070: if (this .next != null) {
1071: this .next.prev = this .prev;
1072: }
1073:
1074: if (this .parent != null) {
1075: if (this .parent.content == this ) {
1076: this .parent.content = this .next;
1077: }
1078:
1079: if (this .parent.last == this ) {
1080: this .parent.last = this .prev;
1081: }
1082: }
1083:
1084: this .parent = null;
1085: this .prev = null;
1086: this .next = null;
1087: }
1088:
1089: /**
1090: * Insert a node at the end.
1091: * @param element parent node
1092: * @param node will be inserted at the end of element
1093: * @return <code>true</code> if the node has been inserted
1094: */
1095: public static boolean insertMisc(Node element, Node node) {
1096: if (node.type == COMMENT_TAG || node.type == PROC_INS_TAG
1097: || node.type == CDATA_TAG || node.type == SECTION_TAG
1098: || node.type == ASP_TAG || node.type == JSTE_TAG
1099: || node.type == PHP_TAG || node.type == XML_DECL) {
1100: element.insertNodeAtEnd(node);
1101: return true;
1102: }
1103:
1104: return false;
1105: }
1106:
1107: /**
1108: * Is this a new (user defined) node? Used to determine how attributes without values should be printed. This was
1109: * introduced to deal with user defined tags e.g. Cold Fusion.
1110: * @return <code>true</code> if this node represents a user-defined tag.
1111: */
1112: public boolean isNewNode() {
1113: if (this .tag != null) {
1114: return TidyUtils.toBoolean(this .tag.model & Dict.CM_NEW);
1115: }
1116:
1117: return true;
1118: }
1119:
1120: /**
1121: * Does the node have one (and only one) child?
1122: * @return <code>true</code> if the node has one child
1123: */
1124: public boolean hasOneChild() {
1125: return (this .content != null && this .content.next == null);
1126: }
1127:
1128: /**
1129: * Find the "html" element.
1130: * @param tt tag table
1131: * @return html node
1132: */
1133: public Node findHTML(TagTable tt) {
1134: Node node;
1135:
1136: for (node = this .content; node != null
1137: && node.tag != tt.tagHtml; node = node.next) {
1138: //
1139: }
1140:
1141: return node;
1142: }
1143:
1144: /**
1145: * Find the head tag.
1146: * @param tt tag table
1147: * @return head node
1148: */
1149: public Node findHEAD(TagTable tt) {
1150: Node node;
1151:
1152: node = this .findHTML(tt);
1153:
1154: if (node != null) {
1155: for (node = node.content; node != null
1156: && node.tag != tt.tagHead; node = node.next) {
1157: //
1158: }
1159: }
1160:
1161: return node;
1162: }
1163:
1164: /**
1165: * Checks for node integrity.
1166: * @return false if node is not consistent
1167: */
1168: public boolean checkNodeIntegrity() {
1169: Node child;
1170: boolean found = false;
1171:
1172: if (this .prev != null) {
1173: if (this .prev.next != this ) {
1174: return false;
1175: }
1176: }
1177:
1178: if (this .next != null) {
1179: if (this .next.prev != this ) {
1180: return false;
1181: }
1182: }
1183:
1184: if (this .parent != null) {
1185: if (this .prev == null && this .parent.content != this ) {
1186: return false;
1187: }
1188:
1189: if (this .next == null && this .parent.last != this ) {
1190: return false;
1191: }
1192:
1193: for (child = this .parent.content; child != null; child = child.next) {
1194: if (child == this ) {
1195: found = true;
1196: break;
1197: }
1198: }
1199:
1200: if (!found) {
1201: return false;
1202: }
1203: }
1204:
1205: for (child = this .content; child != null; child = child.next) {
1206: if (!child.checkNodeIntegrity()) {
1207: return false;
1208: }
1209: }
1210: return true;
1211: }
1212:
1213: /**
1214: * Add a css class to the node. If a class attribute already exists adds the value to the existing attribute.
1215: * @param classname css class name
1216: */
1217: public void addClass(String classname) {
1218: AttVal classattr = this .getAttrByName("class");
1219:
1220: // if there already is a class attribute then append class name after a space
1221: if (classattr != null) {
1222: classattr.value = classattr.value + " " + classname;
1223: } else {
1224: // create new class attribute
1225: this .addAttribute("class", classname);
1226: }
1227: }
1228:
1229: /**
1230: * @see java.lang.Object#toString()
1231: */
1232: public String toString() {
1233: String s = "";
1234: Node n = this ;
1235:
1236: while (n != null) {
1237: s += "[Node type=";
1238: s += NODETYPE_STRING[n.type];
1239: s += ",element=";
1240: if (n.element != null) {
1241: s += n.element;
1242: } else {
1243: s += "null";
1244: }
1245: if (n.type == TEXT_NODE || n.type == COMMENT_TAG
1246: || n.type == PROC_INS_TAG) {
1247: s += ",text=";
1248: if (n.textarray != null && n.start <= n.end) {
1249: s += "\"";
1250: s += TidyUtils.getString(n.textarray, n.start,
1251: n.end - n.start);
1252: s += "\"";
1253: } else {
1254: s += "null";
1255: }
1256: }
1257: s += ",content=";
1258: if (n.content != null) {
1259: s += n.content.toString();
1260: } else {
1261: s += "null";
1262: }
1263: s += "]";
1264: if (n.next != null) {
1265: s += ",";
1266: }
1267: n = n.next;
1268: }
1269: return s;
1270: }
1271:
1272: /**
1273: * Returns a DOM Node which wrap the current tidy Node.
1274: * @return org.w3c.dom.Node instance
1275: */
1276: protected org.w3c.dom.Node getAdapter() {
1277: if (adapter == null) {
1278: switch (this .type) {
1279: case ROOT_NODE:
1280: adapter = new DOMDocumentImpl(this );
1281: break;
1282: case START_TAG:
1283: case START_END_TAG:
1284: adapter = new DOMElementImpl(this );
1285: break;
1286: case DOCTYPE_TAG:
1287: adapter = new DOMDocumentTypeImpl(this );
1288: break;
1289: case COMMENT_TAG:
1290: adapter = new DOMCommentImpl(this );
1291: break;
1292: case TEXT_NODE:
1293: adapter = new DOMTextImpl(this );
1294: break;
1295: case CDATA_TAG:
1296: adapter = new DOMCDATASectionImpl(this );
1297: break;
1298: case PROC_INS_TAG:
1299: adapter = new DOMProcessingInstructionImpl(this );
1300: break;
1301: default:
1302: adapter = new DOMNodeImpl(this );
1303: }
1304: }
1305: return adapter;
1306: }
1307:
1308: /**
1309: * Clone this node.
1310: * @param deep if true deep clone the node (also clones all the contained nodes)
1311: * @return cloned node
1312: */
1313: protected Node cloneNode(boolean deep) {
1314: Node node = (Node) this .clone();
1315: if (deep) {
1316: Node child;
1317: Node newChild;
1318: for (child = this .content; child != null; child = child.next) {
1319: newChild = child.cloneNode(deep);
1320: node.insertNodeAtEnd(newChild);
1321: }
1322: }
1323: return node;
1324: }
1325:
1326: /**
1327: * Setter for node type.
1328: * @param newType a valid node type constant
1329: */
1330: protected void setType(short newType) {
1331: this .type = newType;
1332: }
1333:
1334: /**
1335: * Used to check script node for script language.
1336: * @return <code>true</code> if the script node contains javascript
1337: */
1338: public boolean isJavaScript() {
1339: boolean result = false;
1340: AttVal attr;
1341:
1342: if (this .attributes == null) {
1343: return true;
1344: }
1345:
1346: for (attr = this .attributes; attr != null; attr = attr.next) {
1347: if (("language".equalsIgnoreCase(attr.attribute) || "type"
1348: .equalsIgnoreCase(attr.attribute))
1349: && "javascript".equalsIgnoreCase(attr.value)) {
1350: result = true;
1351: }
1352: }
1353:
1354: return result;
1355: }
1356:
1357: /**
1358: * Does the node expect contents?
1359: * @return <code>false</code> if this node should be empty
1360: */
1361: public boolean expectsContent() {
1362: if (this .type != Node.START_TAG) {
1363: return false;
1364: }
1365:
1366: // unknown element?
1367: if (this .tag == null) {
1368: return true;
1369: }
1370:
1371: if (TidyUtils.toBoolean(this .tag.model & Dict.CM_EMPTY)) {
1372: return false;
1373: }
1374:
1375: return true;
1376: }
1377: }
|