0001: /*
0002: * Java HTML Tidy - JTidy
0003: * HTML parser and pretty printer
0004: *
0005: * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
0006: * Institute of Technology, Institut National de Recherche en
0007: * Informatique et en Automatique, Keio University). All Rights
0008: * Reserved.
0009: *
0010: * Contributing Author(s):
0011: *
0012: * Dave Raggett <dsr@w3.org>
0013: * Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0014: * Gary L Peskin <garyp@firstech.com> (Java development)
0015: * Sami Lempinen <sami@lempinen.net> (release management)
0016: * Fabrizio Giustina <fgiust at users.sourceforge.net>
0017: *
0018: * The contributing author(s) would like to thank all those who
0019: * helped with testing, bug fixes, and patience. This wouldn't
0020: * have been possible without all of you.
0021: *
0022: * COPYRIGHT NOTICE:
0023: *
0024: * This software and documentation is provided "as is," and
0025: * the copyright holders and contributing author(s) make no
0026: * representations or warranties, express or implied, including
0027: * but not limited to, warranties of merchantability or fitness
0028: * for any particular purpose or that the use of the software or
0029: * documentation will not infringe any third party patents,
0030: * copyrights, trademarks or other rights.
0031: *
0032: * The copyright holders and contributing author(s) will not be
0033: * liable for any direct, indirect, special or consequential damages
0034: * arising out of any use of the software or documentation, even if
0035: * advised of the possibility of such damage.
0036: *
0037: * Permission is hereby granted to use, copy, modify, and distribute
0038: * this source code, or portions hereof, documentation and executables,
0039: * for any purpose, without fee, subject to the following restrictions:
0040: *
0041: * 1. The origin of this source code must not be misrepresented.
0042: * 2. Altered versions must be plainly marked as such and must
0043: * not be misrepresented as being the original source.
0044: * 3. This Copyright notice may not be removed or altered from any
0045: * source or altered source distribution.
0046: *
0047: * The copyright holders and contributing author(s) specifically
0048: * permit, without fee, and encourage the use of this source code
0049: * as a component for supporting the Hypertext Markup Language in
0050: * commercial products. If you use this source code in a product,
0051: * acknowledgment is not required but would be appreciated.
0052: *
0053: */
0054: package org.w3c.tidy;
0055:
0056: /**
0057: * Clean up misuse of presentation markup. Filters from other formats such as Microsoft Word often make excessive use of
0058: * presentation markup such as font tags, B, I, and the align attribute. By applying a set of production rules, it is
 * straightforward to transform this to use CSS. Some rules replace some of the children of an element by style
0060: * properties on the element, e.g.
0061: * <p>
0062: * <b>... </b>
0063: * </p>.
0064: * <p style="font-weight: bold">
0065: * ...
0066: * </p>
 * Such rules are applied to the element's content and then to the element itself until no more of the rules apply.
0068: * Having applied all the rules to an element, it will have a style attribute with one or more properties. Other rules
0069: * strip the element they apply to, replacing it by style properties on the contents, e.g. <dir>
0070: * <li>
0071: * <p>
0072: * ...</li>
0073: * </dir>.
 * <p style="margin-left: 1em">
0075: * ... These rules are applied to an element before processing its content and replace the current element by the first
0076: * element in the exposed content. After applying both sets of rules, you can replace the style attribute by a class
0077: * value and style rule in the document head. To support this, an association of styles and class names is built. A
0078: * naive approach is to rely on string matching to test when two property lists are the same. A better approach would be
0079: * to first sort the properties before matching.
0080: * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
0081: * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
0082: * @author Fabrizio Giustina
0083: * @version $Revision: 1.25 $ ($Author: fgiust $)
0084: */
0085: public class Clean {
0086:
0087: /**
0088: * sequential number for generated css classes.
0089: */
0090: private int classNum = 1;
0091:
0092: /**
0093: * Tag table.
0094: */
0095: private TagTable tt;
0096:
0097: /**
0098: * Instantiates a new Clean.
0099: * @param tagTable tag table instance
0100: */
0101: public Clean(TagTable tagTable) {
0102: this .tt = tagTable;
0103: }
0104:
0105: /**
0106: * Insert a css style property.
0107: * @param props StyleProp instance
0108: * @param name property name
0109: * @param value property value
0110: * @return StyleProp containin the given property
0111: */
0112: private StyleProp insertProperty(StyleProp props, String name,
0113: String value) {
0114: StyleProp first, prev, prop;
0115: int cmp;
0116:
0117: prev = null;
0118: first = props;
0119:
0120: while (props != null) {
0121: cmp = props.name.compareTo(name);
0122:
0123: if (cmp == 0) {
0124: // this property is already defined, ignore new value
0125: return first;
0126: }
0127:
0128: if (cmp > 0) // props.name > name
0129: {
0130: // insert before this
0131:
0132: prop = new StyleProp(name, value, props);
0133:
0134: if (prev != null) {
0135: prev.next = prop;
0136: } else {
0137: first = prop;
0138: }
0139:
0140: return first;
0141: }
0142:
0143: prev = props;
0144: props = props.next;
0145: }
0146:
0147: prop = new StyleProp(name, value, null);
0148:
0149: if (prev != null) {
0150: prev.next = prop;
0151: } else {
0152: first = prop;
0153: }
0154:
0155: return first;
0156: }
0157:
0158: /**
0159: * Create sorted linked list of properties from style string.
0160: * @param prop StyleProp
0161: * @param style style string
0162: * @return StyleProp with given style
0163: */
0164: private StyleProp createProps(StyleProp prop, String style) {
0165: int nameEnd;
0166: int valueEnd;
0167: int valueStart = 0;
0168: int nameStart = 0;
0169: boolean more;
0170:
0171: nameStart = 0;
0172: while (nameStart < style.length()) {
0173: while (nameStart < style.length()
0174: && style.charAt(nameStart) == ' ') {
0175: ++nameStart;
0176: }
0177:
0178: nameEnd = nameStart;
0179:
0180: while (nameEnd < style.length()) {
0181: if (style.charAt(nameEnd) == ':') {
0182: valueStart = nameEnd + 1;
0183: break;
0184: }
0185:
0186: ++nameEnd;
0187: }
0188:
0189: if (nameEnd >= style.length()
0190: || style.charAt(nameEnd) != ':') {
0191: break;
0192: }
0193:
0194: while (valueStart < style.length()
0195: && style.charAt(valueStart) == ' ') {
0196: ++valueStart;
0197: }
0198:
0199: valueEnd = valueStart;
0200: more = false;
0201:
0202: while (valueEnd < style.length()) {
0203: if (style.charAt(valueEnd) == ';') {
0204: more = true;
0205: break;
0206: }
0207:
0208: ++valueEnd;
0209: }
0210:
0211: prop = insertProperty(prop, style.substring(nameStart,
0212: nameEnd), style.substring(valueStart, valueEnd));
0213:
0214: if (more) {
0215: nameStart = valueEnd + 1;
0216: continue;
0217: }
0218:
0219: break;
0220: }
0221:
0222: return prop;
0223: }
0224:
0225: /**
0226: * Create a css property.
0227: * @param props StyleProp
0228: * @return css property as String
0229: */
0230: private String createPropString(StyleProp props) {
0231: String style = "";
0232: int len;
0233: StyleProp prop;
0234:
0235: // compute length
0236: for (len = 0, prop = props; prop != null; prop = prop.next) {
0237: len += prop.name.length() + 2;
0238: len += prop.value.length() + 2;
0239: }
0240:
0241: for (prop = props; prop != null; prop = prop.next) {
0242: style = style.concat(prop.name);
0243: style = style.concat(": ");
0244:
0245: style = style.concat(prop.value);
0246:
0247: if (prop.next == null) {
0248: break;
0249: }
0250:
0251: style = style.concat("; ");
0252: }
0253:
0254: return style;
0255: }
0256:
0257: /**
0258: * Creates a string with merged properties.
0259: * @param style css style
0260: * @param property css properties
0261: * @return merged string
0262: */
0263: private String addProperty(String style, String property) {
0264: StyleProp prop;
0265:
0266: prop = createProps(null, style);
0267: prop = createProps(prop, property);
0268: style = createPropString(prop);
0269: return style;
0270: }
0271:
0272: /**
0273: * Generates a new css class name.
0274: * @param lexer Lexer
0275: * @param tag Tag
0276: * @return generated css class
0277: */
0278: private String gensymClass(Lexer lexer, String tag) {
0279: String str;
0280:
0281: str = lexer.configuration.cssPrefix == null ? lexer.configuration.cssPrefix
0282: + this .classNum
0283: : "c" + this .classNum;
0284: this .classNum++;
0285: return str;
0286: }
0287:
0288: /**
0289: * Finds a css style.
0290: * @param lexer Lexer
0291: * @param tag tag name
0292: * @param properties css properties
0293: * @return style string
0294: */
0295: private String findStyle(Lexer lexer, String tag, String properties) {
0296: Style style;
0297:
0298: for (style = lexer.styles; style != null; style = style.next) {
0299: if (style.tag.equals(tag)
0300: && style.properties.equals(properties)) {
0301: return style.tagClass;
0302: }
0303: }
0304:
0305: style = new Style(tag, gensymClass(lexer, tag), properties,
0306: lexer.styles);
0307: lexer.styles = style;
0308: return style.tagClass;
0309: }
0310:
0311: /**
0312: * Find style attribute in node, and replace it by corresponding class attribute. Search for class in style
0313: * dictionary otherwise gensym new class and add to dictionary. Assumes that node doesn't have a class attribute.
0314: * @param lexer Lexer
0315: * @param node node with a style attribute
0316: */
0317: private void style2Rule(Lexer lexer, Node node) {
0318: AttVal styleattr, classattr;
0319: String classname;
0320:
0321: styleattr = node.getAttrByName("style");
0322:
0323: if (styleattr != null) {
0324: classname = findStyle(lexer, node.element, styleattr.value);
0325: classattr = node.getAttrByName("class");
0326:
0327: // if there already is a class attribute then append class name after a space
0328:
0329: if (classattr != null) {
0330: classattr.value = classattr.value + " " + classname;
0331: node.removeAttribute(styleattr);
0332: } else {
0333: // reuse style attribute for class attribute
0334: styleattr.attribute = "class";
0335: styleattr.value = classname;
0336: }
0337: }
0338: }
0339:
0340: /**
0341: * Adds a css rule for color.
0342: * @param lexer Lexer
0343: * @param selector css selector
0344: * @param color color value
0345: */
0346: private void addColorRule(Lexer lexer, String selector, String color) {
0347: if (color != null) {
0348: lexer.addStringLiteral(selector);
0349: lexer.addStringLiteral(" { color: ");
0350: lexer.addStringLiteral(color);
0351: lexer.addStringLiteral(" }\n");
0352: }
0353: }
0354:
0355: /**
0356: * Move presentation attribs from body to style element.
0357: *
0358: * <pre>
0359: * background="foo" . body { background-image: url(foo) }
0360: * bgcolor="foo" . body { background-color: foo }
0361: * text="foo" . body { color: foo }
0362: * link="foo" . :link { color: foo }
0363: * vlink="foo" . :visited { color: foo }
0364: * alink="foo" . :active { color: foo }
0365: * </pre>
0366: *
0367: * @param lexer Lexer
0368: * @param body body node
0369: */
0370: private void cleanBodyAttrs(Lexer lexer, Node body) {
0371: AttVal attr;
0372: String bgurl = null;
0373: String bgcolor = null;
0374: String color = null;
0375:
0376: attr = body.getAttrByName("background");
0377:
0378: if (attr != null) {
0379: bgurl = attr.value;
0380: attr.value = null;
0381: body.removeAttribute(attr);
0382: }
0383:
0384: attr = body.getAttrByName("bgcolor");
0385:
0386: if (attr != null) {
0387: bgcolor = attr.value;
0388: attr.value = null;
0389: body.removeAttribute(attr);
0390: }
0391:
0392: attr = body.getAttrByName("text");
0393:
0394: if (attr != null) {
0395: color = attr.value;
0396: attr.value = null;
0397: body.removeAttribute(attr);
0398: }
0399:
0400: if (bgurl != null || bgcolor != null || color != null) {
0401: lexer.addStringLiteral(" body {\n");
0402:
0403: if (bgurl != null) {
0404: lexer.addStringLiteral(" background-image: url(");
0405: lexer.addStringLiteral(bgurl);
0406: lexer.addStringLiteral(");\n");
0407: }
0408:
0409: if (bgcolor != null) {
0410: lexer.addStringLiteral(" background-color: ");
0411: lexer.addStringLiteral(bgcolor);
0412: lexer.addStringLiteral(";\n");
0413: }
0414:
0415: if (color != null) {
0416: lexer.addStringLiteral(" color: ");
0417: lexer.addStringLiteral(color);
0418: lexer.addStringLiteral(";\n");
0419: }
0420:
0421: lexer.addStringLiteral(" }\n");
0422: }
0423:
0424: attr = body.getAttrByName("link");
0425:
0426: if (attr != null) {
0427: addColorRule(lexer, " :link", attr.value);
0428: body.removeAttribute(attr);
0429: }
0430:
0431: attr = body.getAttrByName("vlink");
0432:
0433: if (attr != null) {
0434: addColorRule(lexer, " :visited", attr.value);
0435: body.removeAttribute(attr);
0436: }
0437:
0438: attr = body.getAttrByName("alink");
0439:
0440: if (attr != null) {
0441: addColorRule(lexer, " :active", attr.value);
0442: body.removeAttribute(attr);
0443: }
0444: }
0445:
0446: /**
0447: * Check deprecated attributes in body tag.
0448: * @param lexer Lexer
0449: * @param doc document root node
0450: * @return <code>true</code> is the body doesn't contain deprecated attributes, false otherwise.
0451: */
0452: private boolean niceBody(Lexer lexer, Node doc) {
0453: Node body = doc.findBody(lexer.configuration.tt);
0454:
0455: if (body != null) {
0456: if (body.getAttrByName("background") != null
0457: || body.getAttrByName("bgcolor") != null
0458: || body.getAttrByName("text") != null
0459: || body.getAttrByName("link") != null
0460: || body.getAttrByName("vlink") != null
0461: || body.getAttrByName("alink") != null) {
0462: lexer.badLayout |= Report.USING_BODY;
0463: return false;
0464: }
0465: }
0466:
0467: return true;
0468: }
0469:
0470: /**
0471: * Create style element using rules from dictionary.
0472: * @param lexer Lexer
0473: * @param doc root node
0474: */
0475: private void createStyleElement(Lexer lexer, Node doc) {
0476: Node node, head, body;
0477: Style style;
0478: AttVal av;
0479:
0480: if (lexer.styles == null && niceBody(lexer, doc)) {
0481: return;
0482: }
0483:
0484: node = lexer.newNode(Node.START_TAG, null, 0, 0, "style");
0485: node.implicit = true;
0486:
0487: // insert type attribute
0488: av = new AttVal(null, null, '"', "type", "text/css");
0489: av.dict = AttributeTable.getDefaultAttributeTable()
0490: .findAttribute(av);
0491: node.attributes = av;
0492:
0493: body = doc.findBody(lexer.configuration.tt);
0494:
0495: lexer.txtstart = lexer.lexsize;
0496:
0497: if (body != null) {
0498: cleanBodyAttrs(lexer, body);
0499: }
0500:
0501: for (style = lexer.styles; style != null; style = style.next) {
0502: lexer.addCharToLexer(' ');
0503: lexer.addStringLiteral(style.tag);
0504: lexer.addCharToLexer('.');
0505: lexer.addStringLiteral(style.tagClass);
0506: lexer.addCharToLexer(' ');
0507: lexer.addCharToLexer('{');
0508: lexer.addStringLiteral(style.properties);
0509: lexer.addCharToLexer('}');
0510: lexer.addCharToLexer('\n');
0511: }
0512:
0513: lexer.txtend = lexer.lexsize;
0514:
0515: node.insertNodeAtEnd(lexer.newNode(Node.TEXT_NODE,
0516: lexer.lexbuf, lexer.txtstart, lexer.txtend));
0517:
0518: // now insert style element into document head doc is root node. search its children for html node the head
0519: // node should be first child of html node
0520:
0521: head = doc.findHEAD(lexer.configuration.tt);
0522:
0523: if (head != null) {
0524: head.insertNodeAtEnd(node);
0525: }
0526: }
0527:
0528: /**
0529: * Ensure bidirectional links are consistent.
0530: * @param node root node
0531: */
0532: private void fixNodeLinks(Node node) {
0533: Node child;
0534:
0535: if (node.prev != null) {
0536: node.prev.next = node;
0537: } else {
0538: node.parent.content = node;
0539: }
0540:
0541: if (node.next != null) {
0542: node.next.prev = node;
0543: } else {
0544: node.parent.last = node;
0545: }
0546:
0547: for (child = node.content; child != null; child = child.next) {
0548: child.parent = node;
0549: }
0550: }
0551:
0552: /**
0553: * Used to strip child of node when the node has one and only one child.
0554: * @param node parent node
0555: */
0556: private void stripOnlyChild(Node node) {
0557: Node child;
0558:
0559: child = node.content;
0560: node.content = child.content;
0561: node.last = child.last;
0562: child.content = null;
0563:
0564: for (child = node.content; child != null; child = child.next) {
0565: child.parent = node;
0566: }
0567: }
0568:
0569: /**
0570: * Used to strip font start and end tags.
0571: * @param element original node
0572: * @param pnode passed in as array to allow modification. pnode[0] will contain the final node
0573: * @todo remove the pnode parameter and make it a return value
0574: */
0575: private void discardContainer(Node element, Node[] pnode) {
0576: Node node;
0577: Node parent = element.parent;
0578:
0579: if (element.content != null) {
0580: element.last.next = element.next;
0581:
0582: if (element.next != null) {
0583: element.next.prev = element.last;
0584: element.last.next = element.next;
0585: } else {
0586: parent.last = element.last;
0587: }
0588:
0589: if (element.prev != null) {
0590: element.content.prev = element.prev;
0591: element.prev.next = element.content;
0592: } else {
0593: parent.content = element.content;
0594: }
0595:
0596: for (node = element.content; node != null; node = node.next) {
0597: node.parent = parent;
0598: }
0599:
0600: pnode[0] = element.content;
0601: } else {
0602: if (element.next != null) {
0603: element.next.prev = element.prev;
0604: } else {
0605: parent.last = element.prev;
0606: }
0607:
0608: if (element.prev != null) {
0609: element.prev.next = element.next;
0610: } else {
0611: parent.content = element.next;
0612: }
0613:
0614: pnode[0] = element.next;
0615: }
0616:
0617: element.next = null;
0618: element.content = null;
0619: }
0620:
0621: /**
0622: * Add style property to element, creating style attribute as needed and adding ; delimiter.
0623: * @param node node
0624: * @param property property added to node
0625: */
0626: private void addStyleProperty(Node node, String property) {
0627: AttVal av;
0628:
0629: for (av = node.attributes; av != null; av = av.next) {
0630: if (av.attribute.equals("style")) {
0631: break;
0632: }
0633: }
0634:
0635: // if style attribute already exists then insert property
0636:
0637: if (av != null) {
0638: String s;
0639:
0640: s = addProperty(av.value, property);
0641: av.value = s;
0642: } else {
0643: // else create new style attribute
0644: av = new AttVal(node.attributes, null, '"', "style",
0645: property);
0646: av.dict = AttributeTable.getDefaultAttributeTable()
0647: .findAttribute(av);
0648: node.attributes = av;
0649: }
0650: }
0651:
0652: /**
0653: * Create new string that consists of the combined style properties in s1 and s2. To merge property lists, we build
0654: * a linked list of property/values and insert properties into the list in order, merging values for the same
0655: * property name.
0656: * @param s1 first property
0657: * @param s2 second property
0658: * @return merged properties
0659: */
0660: private String mergeProperties(String s1, String s2) {
0661: String s;
0662: StyleProp prop;
0663:
0664: prop = createProps(null, s1);
0665: prop = createProps(prop, s2);
0666: s = createPropString(prop);
0667: return s;
0668: }
0669:
0670: /**
0671: * Merge class attributes from 2 nodes.
0672: * @param node Node
0673: * @param child Child node
0674: */
0675: private void mergeClasses(Node node, Node child) {
0676: AttVal av;
0677: String s1, s2, names;
0678:
0679: for (s2 = null, av = child.attributes; av != null; av = av.next) {
0680: if ("class".equals(av.attribute)) {
0681: s2 = av.value;
0682: break;
0683: }
0684: }
0685:
0686: for (s1 = null, av = node.attributes; av != null; av = av.next) {
0687: if ("class".equals(av.attribute)) {
0688: s1 = av.value;
0689: break;
0690: }
0691: }
0692:
0693: if (s1 != null) {
0694: if (s2 != null) // merge class names from both
0695: {
0696: names = s1 + ' ' + s2;
0697: av.value = names;
0698: }
0699: } else if (s2 != null) // copy class names from child
0700: {
0701: av = new AttVal(node.attributes, null, '"', "class", s2);
0702: av.dict = AttributeTable.getDefaultAttributeTable()
0703: .findAttribute(av);
0704: node.attributes = av;
0705: }
0706: }
0707:
0708: /**
0709: * Merge style from 2 nodes.
0710: * @param node Node
0711: * @param child Child node
0712: */
0713: private void mergeStyles(Node node, Node child) {
0714: AttVal av;
0715: String s1, s2, style;
0716:
0717: // the child may have a class attribute used for attaching styles, if so the class name needs to be copied to
0718: // node's class
0719: mergeClasses(node, child);
0720:
0721: for (s2 = null, av = child.attributes; av != null; av = av.next) {
0722: if (av.attribute.equals("style")) {
0723: s2 = av.value;
0724: break;
0725: }
0726: }
0727:
0728: for (s1 = null, av = node.attributes; av != null; av = av.next) {
0729: if (av.attribute.equals("style")) {
0730: s1 = av.value;
0731: break;
0732: }
0733: }
0734:
0735: if (s1 != null) {
0736: if (s2 != null) // merge styles from both
0737: {
0738: style = mergeProperties(s1, s2);
0739: av.value = style;
0740: }
0741: } else if (s2 != null) // copy style of child
0742: {
0743: av = new AttVal(node.attributes, null, '"', "style", s2);
0744: av.dict = AttributeTable.getDefaultAttributeTable()
0745: .findAttribute(av);
0746: node.attributes = av;
0747: }
0748: }
0749:
0750: /**
0751: * Map a % font size to a named font size.
0752: * @param size size in %
0753: * @return font size name
0754: */
0755: private String fontSize2Name(String size) {
0756: String[] sizes = { "60%", "70%", "80%", null, "120%", "150%",
0757: "200%" };
0758: String buf;
0759:
0760: if (size.length() > 0 && '0' <= size.charAt(0)
0761: && size.charAt(0) <= '6') {
0762: int n = size.charAt(0) - '0';
0763: return sizes[n];
0764: }
0765:
0766: if (size.length() > 0 && size.charAt(0) == '-') {
0767: if (size.length() > 1 && '0' <= size.charAt(1)
0768: && size.charAt(1) <= '6') {
0769: int n = size.charAt(1) - '0';
0770: double x;
0771:
0772: for (x = 1.0; n > 0; --n) {
0773: x *= 0.8;
0774: }
0775:
0776: x *= 100.0;
0777: buf = "" + (int) x + "%";
0778:
0779: return buf;
0780: }
0781:
0782: return "smaller"; /* "70%"; */
0783: }
0784:
0785: if (size.length() > 1 && '0' <= size.charAt(1)
0786: && size.charAt(1) <= '6') {
0787: int n = size.charAt(1) - '0';
0788: double x;
0789:
0790: for (x = 1.0; n > 0; --n) {
0791: x *= 1.2;
0792: }
0793:
0794: x *= 100.0;
0795: buf = "" + (int) x + "%";
0796:
0797: return buf;
0798: }
0799:
0800: return "larger"; /* "140%" */
0801: }
0802:
0803: /**
0804: * Adds a font-family style.
0805: * @param node Node
0806: * @param face font face
0807: */
0808: private void addFontFace(Node node, String face) {
0809: addStyleProperty(node, "font-family: " + face);
0810: }
0811:
0812: /**
0813: * Adds a font size style.
0814: * @param node Node
0815: * @param size font size
0816: */
0817: private void addFontSize(Node node, String size) {
0818: String value;
0819:
0820: if (size.equals("6") && node.tag == this .tt.tagP) {
0821: node.element = "h1";
0822: this .tt.findTag(node);
0823: return;
0824: }
0825:
0826: if (size.equals("5") && node.tag == this .tt.tagP) {
0827: node.element = "h2";
0828: this .tt.findTag(node);
0829: return;
0830: }
0831:
0832: if (size.equals("4") && node.tag == this .tt.tagP) {
0833: node.element = "h3";
0834: this .tt.findTag(node);
0835: return;
0836: }
0837:
0838: value = fontSize2Name(size);
0839:
0840: if (value != null) {
0841: addStyleProperty(node, "font-size: " + value);
0842: }
0843: }
0844:
0845: /**
0846: * Adds a font color style.
0847: * @param node Node
0848: * @param color color value
0849: */
0850: private void addFontColor(Node node, String color) {
0851: addStyleProperty(node, "color: " + color);
0852: }
0853:
0854: /**
0855: * Adds an align style.
0856: * @param node Node
0857: * @param align align value
0858: */
0859: private void addAlign(Node node, String align) {
0860: // force alignment value to lower case
0861: addStyleProperty(node, "text-align: " + align.toLowerCase());
0862: }
0863:
0864: /**
0865: * Add style properties to node corresponding to the font face, size and color attributes.
0866: * @param node font tag
0867: * @param av attribute list for node
0868: */
0869: private void addFontStyles(Node node, AttVal av) {
0870: while (av != null) {
0871: if (av.attribute.equals("face")) {
0872: addFontFace(node, av.value);
0873: } else if (av.attribute.equals("size")) {
0874: addFontSize(node, av.value);
0875: } else if (av.attribute.equals("color")) {
0876: addFontColor(node, av.value);
0877: }
0878:
0879: av = av.next;
0880: }
0881: }
0882:
0883: /**
0884: * Symptom: <code><p align=center></code>. Action: <code><p style="text-align: center"></code>.
0885: * @param lexer Lexer
0886: * @param node node with center attribute. Will be modified to use css style.
0887: */
0888: private void textAlign(Lexer lexer, Node node) {
0889: AttVal av, prev;
0890:
0891: prev = null;
0892:
0893: for (av = node.attributes; av != null; av = av.next) {
0894: if (av.attribute.equals("align")) {
0895: if (prev != null) {
0896: prev.next = av.next;
0897: } else {
0898: node.attributes = av.next;
0899: }
0900:
0901: if (av.value != null) {
0902: addAlign(node, av.value);
0903: }
0904:
0905: break;
0906: }
0907:
0908: prev = av;
0909: }
0910: }
0911:
0912: /**
0913: * Symptom: <code><dir><li></code> where <code><li></code> is only child. Action: coerce
0914: * <code><dir> <li></code> to <code><div></code> with indent. The clean up rules use the pnode argument
0915: * to return the next node when the original node has been deleted.
0916: * @param lexer Lexer
0917: * @param node dir tag
0918: * @return <code>true</code> if a dir tag has been coerced to a div
0919: */
0920: private boolean dir2Div(Lexer lexer, Node node) {
0921: Node child;
0922:
0923: if (node.tag == this .tt.tagDir || node.tag == this .tt.tagUl
0924: || node.tag == this .tt.tagOl) {
0925: child = node.content;
0926:
0927: if (child == null) {
0928: return false;
0929: }
0930:
0931: // check child has no peers
0932: if (child.next != null) {
0933: return false;
0934: }
0935:
0936: if (child.tag != this .tt.tagLi) {
0937: return false;
0938: }
0939:
0940: if (!child.implicit) {
0941: return false;
0942: }
0943:
0944: // coerce dir to div
0945: node.tag = this .tt.tagDiv;
0946: node.element = "div";
0947: addStyleProperty(node, "margin-left: 2em");
0948: stripOnlyChild(node);
0949: return true;
0950: }
0951:
0952: return false;
0953: }
0954:
0955: /**
0956: * Symptom:
0957: *
0958: * <pre>
0959: * <center>
0960: * </pre>.
0961: * <p>
0962: * Action: replace <code><center></code> by <code><div style="text-align: center"></code>
0963: * </p>
0964: * @param lexer Lexer
0965: * @param node center tag
0966: * @param pnode pnode[0] is the same as node, passed in as an array to allow modification
0967: * @return <code>true</code> if a center tag has been replaced by a div
0968: */
0969: private boolean center2Div(Lexer lexer, Node node, Node[] pnode) {
0970: if (node.tag == this .tt.tagCenter) {
0971: if (lexer.configuration.dropFontTags) {
0972: if (node.content != null) {
0973: Node last = node.last;
0974: Node parent = node.parent;
0975:
0976: discardContainer(node, pnode);
0977:
0978: node = lexer.inferredTag("br");
0979:
0980: if (last.next != null) {
0981: last.next.prev = node;
0982: }
0983:
0984: node.next = last.next;
0985: last.next = node;
0986: node.prev = last;
0987:
0988: if (parent.last == last) {
0989: parent.last = node;
0990: }
0991:
0992: node.parent = parent;
0993: } else {
0994: Node prev = node.prev;
0995: Node next = node.next;
0996: Node parent = node.parent;
0997: discardContainer(node, pnode);
0998:
0999: node = lexer.inferredTag("br");
1000: node.next = next;
1001: node.prev = prev;
1002: node.parent = parent;
1003:
1004: if (next != null) {
1005: next.prev = node;
1006: } else {
1007: parent.last = node;
1008: }
1009:
1010: if (prev != null) {
1011: prev.next = node;
1012: } else {
1013: parent.content = node;
1014: }
1015: }
1016:
1017: return true;
1018: }
1019: node.tag = this .tt.tagDiv;
1020: node.element = "div";
1021: addStyleProperty(node, "text-align: center");
1022: return true;
1023: }
1024:
1025: return false;
1026: }
1027:
1028: /**
1029: * Symptom: <code><div><div>...</div></div></code> Action: merge the two divs. This is useful after
1030: * nested <dir>s used by Word for indenting have been converted to <div>s.
1031: * @param lexer Lexer
1032: * @param node first div
1033: * @return true if the divs have been merged
1034: */
1035: private boolean mergeDivs(Lexer lexer, Node node) {
1036: Node child;
1037:
1038: if (node.tag != this .tt.tagDiv) {
1039: return false;
1040: }
1041:
1042: child = node.content;
1043:
1044: if (child == null) {
1045: return false;
1046: }
1047:
1048: if (child.tag != this .tt.tagDiv) {
1049: return false;
1050: }
1051:
1052: if (child.next != null) {
1053: return false;
1054: }
1055:
1056: mergeStyles(node, child);
1057: stripOnlyChild(node);
1058: return true;
1059: }
1060:
1061: /**
1062: * Symptom:
1063: * <ul>
1064: * <li>
1065: * <ul>
1066: * ...
1067: * </ul>
1068: * </li>
1069: * </ul>
1070: * Action: discard outer list.
1071: * @param lexer Lexer
1072: * @param node Node
1073: * @param pnode passed in as array to allow modifications.
1074: * @return <code>true</code> if nested lists have been found and replaced
1075: */
1076: private boolean nestedList(Lexer lexer, Node node, Node[] pnode) {
1077: Node child, list;
1078:
1079: if (node.tag == this .tt.tagUl || node.tag == this .tt.tagOl) {
1080: child = node.content;
1081:
1082: if (child == null) {
1083: return false;
1084: }
1085:
1086: // check child has no peers
1087:
1088: if (child.next != null) {
1089: return false;
1090: }
1091:
1092: list = child.content;
1093:
1094: if (list == null) {
1095: return false;
1096: }
1097:
1098: if (list.tag != node.tag) {
1099: return false;
1100: }
1101:
1102: pnode[0] = list; // Set node to resume iteration
1103:
1104: // move inner list node into position of outer node
1105: list.prev = node.prev;
1106: list.next = node.next;
1107: list.parent = node.parent;
1108: fixNodeLinks(list);
1109:
1110: // get rid of outer ul and its li
1111: // XXX: Are we leaking the child node? -creitzel 7 Jun, 01
1112: child.content = null;
1113: node.content = null;
1114: node.next = null;
1115: node = null;
1116:
1117: // If prev node was a list the chances are this node should be appended to that list. Word has no way of
1118: // recognizing nested lists and just uses indents
1119: if (list.prev != null) {
1120: if (list.prev.tag == this .tt.tagUl
1121: || list.prev.tag == this .tt.tagOl) {
1122:
1123: node = list;
1124: list = node.prev;
1125:
1126: list.next = node.next;
1127:
1128: if (list.next != null) {
1129: list.next.prev = list;
1130: }
1131:
1132: child = list.last; /* <li> */
1133:
1134: node.parent = child;
1135: node.next = null;
1136: node.prev = child.last;
1137: fixNodeLinks(node);
1138: cleanNode(lexer, node);
1139: }
1140: }
1141:
1142: return true;
1143: }
1144:
1145: return false;
1146: }
1147:
1148: /**
1149: * Symptom: the only child of a block-level element is a presentation element such as B, I or FONT. Action: add
1150: * style "font-weight: bold" to the block and strip the <b>element, leaving its children. example:
1151: *
1152: * <pre>
1153: * <p>
1154: * <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1155: * </p>
1156: * </pre>
1157: *
1158: * becomes:
1159: *
1160: * <pre>
1161: * <p style="font-weight: bold; font-family: Arial; font-size: 6">
1162: * Draft Recommended Practice
1163: * </p>
1164: * </pre>
1165: *
1166: * <p>
1167: * This code also replaces the align attribute by a style attribute. However, to avoid CSS problems with Navigator
1168: * 4, this isn't done for the elements: caption, tr and table
1169: * </p>
1170: * @param lexer Lexer
1171: * @param node parent node
1172: * @return <code>true</code> if the child node has been removed
1173: */
1174: private boolean blockStyle(Lexer lexer, Node node) {
1175: Node child;
1176:
1177: if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST
1178: | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) {
1179: if (node.tag != this .tt.tagTable
1180: && node.tag != this .tt.tagTr
1181: && node.tag != this .tt.tagLi) {
1182: // check for align attribute
1183: if (node.tag != this .tt.tagCaption) {
1184: textAlign(lexer, node);
1185: }
1186:
1187: child = node.content;
1188:
1189: if (child == null) {
1190: return false;
1191: }
1192:
1193: // check child has no peers
1194: if (child.next != null) {
1195: return false;
1196: }
1197:
1198: if (child.tag == this .tt.tagB) {
1199: mergeStyles(node, child);
1200: addStyleProperty(node, "font-weight: bold");
1201: stripOnlyChild(node);
1202: return true;
1203: }
1204:
1205: if (child.tag == this .tt.tagI) {
1206: mergeStyles(node, child);
1207: addStyleProperty(node, "font-style: italic");
1208: stripOnlyChild(node);
1209: return true;
1210: }
1211:
1212: if (child.tag == this .tt.tagFont) {
1213: mergeStyles(node, child);
1214: addFontStyles(node, child.attributes);
1215: stripOnlyChild(node);
1216: return true;
1217: }
1218: }
1219: }
1220:
1221: return false;
1222: }
1223:
1224: /**
1225: * If the node has only one b, i, or font child remove the child node and add the appropriate style attributes to
1226: * parent.
1227: * @param lexer Lexer
1228: * @param node parent node
1229: * @param pnode passed as an array to allow modifications
1230: * @return <code>true</code> if child node has been stripped, replaced by style attributes.
1231: */
1232: private boolean inlineStyle(Lexer lexer, Node node, Node[] pnode) {
1233: Node child;
1234:
1235: if (node.tag != this .tt.tagFont
1236: && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) {
1237: child = node.content;
1238:
1239: if (child == null) {
1240: return false;
1241: }
1242:
1243: // check child has no peers
1244: if (child.next != null) {
1245: return false;
1246: }
1247:
1248: if (child.tag == this .tt.tagB
1249: && lexer.configuration.logicalEmphasis) {
1250: mergeStyles(node, child);
1251: addStyleProperty(node, "font-weight: bold");
1252: stripOnlyChild(node);
1253: return true;
1254: }
1255:
1256: if (child.tag == this .tt.tagI
1257: && lexer.configuration.logicalEmphasis) {
1258: mergeStyles(node, child);
1259: addStyleProperty(node, "font-style: italic");
1260: stripOnlyChild(node);
1261: return true;
1262: }
1263:
1264: if (child.tag == this .tt.tagFont) {
1265: mergeStyles(node, child);
1266: addFontStyles(node, child.attributes);
1267: stripOnlyChild(node);
1268: return true;
1269: }
1270: }
1271:
1272: return false;
1273: }
1274:
1275: /**
1276: * Replace font elements by span elements, deleting the font element's attributes and replacing them by a single
1277: * style attribute.
1278: * @param lexer Lexer
1279: * @param node font tag
1280: * @param pnode passed as an array to allow modifications
1281: * @return <code>true</code> if a font tag has been dropped and replaced by style attributes
1282: */
1283: private boolean font2Span(Lexer lexer, Node node, Node[] pnode) {
1284: AttVal av, style, next;
1285:
1286: if (node.tag == this .tt.tagFont) {
1287: if (lexer.configuration.dropFontTags) {
1288: discardContainer(node, pnode);
1289: return false;
1290: }
1291:
1292: // if FONT is only child of parent element then leave alone
1293: if (node.parent.content == node && node.next == null) {
1294: return false;
1295: }
1296:
1297: addFontStyles(node, node.attributes);
1298:
1299: // extract style attribute and free the rest
1300: av = node.attributes;
1301: style = null;
1302:
1303: while (av != null) {
1304: next = av.next;
1305:
1306: if (av.attribute.equals("style")) {
1307: av.next = null;
1308: style = av;
1309: }
1310:
1311: av = next;
1312: }
1313:
1314: node.attributes = style;
1315:
1316: node.tag = this .tt.tagSpan;
1317: node.element = "span";
1318:
1319: return true;
1320: }
1321:
1322: return false;
1323: }
1324:
1325: /**
1326: * Applies all matching rules to a node.
1327: * @param lexer Lexer
1328: * @param node original node
1329: * @return cleaned up node
1330: */
1331: private Node cleanNode(Lexer lexer, Node node) {
1332: Node next = null;
1333: Node[] o = new Node[1];
1334: boolean b = false;
1335:
1336: for (next = node; node != null && node.isElement(); node = next) {
1337: o[0] = next;
1338:
1339: b = dir2Div(lexer, node);
1340: next = o[0];
1341: if (b) {
1342: continue;
1343: }
1344:
1345: // Special case: true result means that arg node and its parent no longer exist.
1346: // So we must jump back up the CreateStyleProperties() call stack until we have a valid node reference.
1347: b = nestedList(lexer, node, o);
1348: next = o[0];
1349: if (b) {
1350: return next;
1351: }
1352:
1353: b = center2Div(lexer, node, o);
1354: next = o[0];
1355: if (b) {
1356: continue;
1357: }
1358:
1359: b = mergeDivs(lexer, node);
1360: next = o[0];
1361: if (b) {
1362: continue;
1363: }
1364:
1365: b = blockStyle(lexer, node);
1366: next = o[0];
1367: if (b) {
1368: continue;
1369: }
1370:
1371: b = inlineStyle(lexer, node, o);
1372: next = o[0];
1373: if (b) {
1374: continue;
1375: }
1376:
1377: b = font2Span(lexer, node, o);
1378: next = o[0];
1379: if (b) {
1380: continue;
1381: }
1382:
1383: break;
1384: }
1385:
1386: return next;
1387: }
1388:
1389: /**
1390: * Special case: if the current node is destroyed by CleanNode() lower in the tree, this node and its parent no
1391: * longer exist. So we must jump back up the CreateStyleProperties() call stack until we have a valid node
1392: * reference.
1393: * @param lexer Lexer
1394: * @param node Node
1395: * @param prepl passed in as array to allow modifications
1396: * @return cleaned Node
1397: */
1398: private Node createStyleProperties(Lexer lexer, Node node,
1399: Node[] prepl) {
1400: Node child;
1401:
1402: if (node.content != null) {
1403: Node[] repl = new Node[1];
1404: repl[0] = node;
1405: for (child = node.content; child != null; child = child.next) {
1406: child = createStyleProperties(lexer, child, repl);
1407: if (repl[0] != node) {
1408: return repl[0];
1409: }
1410: }
1411: }
1412:
1413: return cleanNode(lexer, node);
1414: }
1415:
1416: /**
1417: * Find style attribute in node content, and replace it by corresponding class attribute.
1418: * @param lexer Lexer
1419: * @param node parent node
1420: */
1421: private void defineStyleRules(Lexer lexer, Node node) {
1422: Node child;
1423:
1424: if (node.content != null) {
1425: child = node.content;
1426: while (child != null) {
1427: defineStyleRules(lexer, child);
1428: child = child.next;
1429: }
1430: }
1431:
1432: style2Rule(lexer, node);
1433: }
1434:
1435: /**
1436: * Clean an html tree.
1437: * @param lexer Lexer
1438: * @param doc root node
1439: */
1440: public void cleanTree(Lexer lexer, Node doc) {
1441: Node[] repl = new Node[1];
1442: repl[0] = doc;
1443: doc = createStyleProperties(lexer, doc, repl);
1444:
1445: if (!lexer.configuration.makeClean) {
1446: defineStyleRules(lexer, doc);
1447: createStyleElement(lexer, doc);
1448: }
1449: }
1450:
1451: /**
1452: * simplifies <b><b>... </b> ... </b> etc.
1453: * @param node root Node
1454: */
1455: public void nestedEmphasis(Node node) {
1456: Node[] o = new Node[1];
1457: Node next;
1458:
1459: while (node != null) {
1460: next = node.next;
1461:
1462: if ((node.tag == this .tt.tagB || node.tag == this .tt.tagI)
1463: && node.parent != null
1464: && node.parent.tag == node.tag) {
1465: // strip redundant inner element
1466: o[0] = next;
1467: discardContainer(node, o);
1468: next = o[0];
1469: node = next;
1470: continue;
1471: }
1472:
1473: if (node.content != null) {
1474: nestedEmphasis(node.content);
1475: }
1476:
1477: node = next;
1478: }
1479: }
1480:
1481: /**
1482: * Replace i by em and b by strong.
1483: * @param node root Node
1484: */
1485: public void emFromI(Node node) {
1486: while (node != null) {
1487: if (node.tag == this .tt.tagI) {
1488: node.element = this .tt.tagEm.name;
1489: node.tag = this .tt.tagEm;
1490: } else if (node.tag == this .tt.tagB) {
1491: node.element = this .tt.tagStrong.name;
1492: node.tag = this .tt.tagStrong;
1493: }
1494:
1495: if (node.content != null) {
1496: emFromI(node.content);
1497: }
1498:
1499: node = node.next;
1500: }
1501: }
1502:
1503: /**
1504: * Some people use dir or ul without an li to indent the content. The pattern to look for is a list with a single
1505: * implicit li. This is recursively replaced by an implicit blockquote.
1506: * @param node root Node
1507: */
1508: public void list2BQ(Node node) {
1509: while (node != null) {
1510: if (node.content != null) {
1511: list2BQ(node.content);
1512: }
1513:
1514: if (node.tag != null
1515: && node.tag.getParser() == ParserImpl.LIST
1516: && node.hasOneChild() && node.content.implicit) {
1517: stripOnlyChild(node);
1518: node.element = this .tt.tagBlockquote.name;
1519: node.tag = this .tt.tagBlockquote;
1520: node.implicit = true;
1521: }
1522:
1523: node = node.next;
1524: }
1525: }
1526:
1527: /**
1528: * Replace implicit blockquote by div with an indent taking care to reduce nested blockquotes to a single div with
1529: * the indent set to match the nesting depth.
1530: * @param node root Node
1531: */
1532: public void bQ2Div(Node node) {
1533: int indent;
1534: String indentBuf;
1535: AttVal attval;
1536:
1537: while (node != null) {
1538: if (node.tag == this .tt.tagBlockquote && node.implicit) {
1539: indent = 1;
1540:
1541: while (node.hasOneChild()
1542: && node.content.tag == this .tt.tagBlockquote
1543: && node.implicit) {
1544: ++indent;
1545: stripOnlyChild(node);
1546: }
1547:
1548: if (node.content != null) {
1549: bQ2Div(node.content);
1550: }
1551:
1552: indentBuf = "margin-left: "
1553: + (new Integer(2 * indent)).toString() + "em";
1554:
1555: node.element = this .tt.tagDiv.name;
1556: node.tag = this .tt.tagDiv;
1557:
1558: attval = node.getAttrByName("style");
1559:
1560: if (attval != null && attval.value != null) {
1561: attval.value = indentBuf + "; " + attval.value;
1562: } else {
1563: node.addAttribute("style", indentBuf);
1564: }
1565: } else if (node.content != null) {
1566: bQ2Div(node.content);
1567: }
1568:
1569: node = node.next;
1570: }
1571: }
1572:
1573: /**
1574: * Find the enclosing table cell for the given node.
1575: * @param node Node
1576: * @return enclosing cell node
1577: */
1578: Node findEnclosingCell(Node node) {
1579: Node check;
1580:
1581: for (check = node; check != null; check = check.parent) {
1582: if (check.tag == tt.tagTd) {
1583: return check;
1584: }
1585: }
1586: return null;
1587: }
1588:
1589: /**
1590: * node is <code><![if ...]></code> prune up to <code><![endif]></code>.
1591: * @param lexer Lexer
1592: * @param node Node
1593: * @return cleaned up Node
1594: */
1595: public Node pruneSection(Lexer lexer, Node node) {
1596: for (;;) {
1597:
1598: // FG: commented out - don't add to empty cells
1599:
1600: // if ((Lexer.getString(node.textarray, node.start, 21)).equals("if !supportEmptyParas"))
1601: // {
1602: // Node cell = findEnclosingCell(node);
1603: // if (cell != null)
1604: // {
1605: // // Need to put into cell so it doesn't look weird
1606: // char onesixty[] = {(char) 160, (char) 0};
1607: // Node nbsp = lexer.newLiteralTextNode(lexer, onesixty);
1608: // Node.insertNodeBeforeElement(node, nbsp);
1609: // }
1610: // }
1611:
1612: // discard node and returns next
1613: node = Node.discardElement(node);
1614:
1615: if (node == null) {
1616: return null;
1617: }
1618:
1619: if (node.type == Node.SECTION_TAG) {
1620: if ((TidyUtils.getString(node.textarray, node.start, 2))
1621: .equals("if")) {
1622: node = pruneSection(lexer, node);
1623: continue;
1624: }
1625:
1626: if ((TidyUtils.getString(node.textarray, node.start, 5))
1627: .equals("endif")) {
1628: node = Node.discardElement(node);
1629: break;
1630: }
1631: }
1632: }
1633:
1634: return node;
1635: }
1636:
1637: /**
1638: * Drop if/endif sections inserted by word2000.
1639: * @param lexer Lexer
1640: * @param node Node root node
1641: */
1642: public void dropSections(Lexer lexer, Node node) {
1643: while (node != null) {
1644: if (node.type == Node.SECTION_TAG) {
1645: // prune up to matching endif
1646: if ((TidyUtils.getString(node.textarray, node.start, 2))
1647: .equals("if")
1648: && (!(TidyUtils.getString(node.textarray,
1649: node.start, 7)).equals("if !vml"))) // #444394 - fix 13
1650: // Sep 01
1651: {
1652: node = pruneSection(lexer, node);
1653: continue;
1654: }
1655:
1656: // discard others as well
1657: node = Node.discardElement(node);
1658: continue;
1659: }
1660:
1661: if (node.content != null) {
1662: dropSections(lexer, node.content);
1663: }
1664:
1665: node = node.next;
1666: }
1667: }
1668:
1669: /**
1670: * Remove word2000 attributes from node.
1671: * @param node node to cleanup
1672: */
1673: public void purgeWord2000Attributes(Node node) {
1674: AttVal attr = null;
1675: AttVal next = null;
1676: AttVal prev = null;
1677:
1678: for (attr = node.attributes; attr != null; attr = next) {
1679: next = attr.next;
1680:
1681: // special check for class="Code" denoting pre text
1682: // Pass thru user defined styles as HTML class names
1683: if (attr.attribute != null && attr.value != null
1684: && attr.attribute.equals("class")) {
1685: if (attr.value.equals("Code")
1686: || !attr.value.startsWith("Mso")) {
1687: prev = attr;
1688: continue;
1689: }
1690: }
1691:
1692: if (attr.attribute != null
1693: && (attr.attribute.equals("class")
1694: || attr.attribute.equals("style")
1695: || attr.attribute.equals("lang")
1696: || attr.attribute.startsWith("x:") || ((attr.attribute
1697: .equals("height") || attr.attribute
1698: .equals("width")) && //
1699: (node.tag == this .tt.tagTd
1700: || node.tag == this .tt.tagTr || node.tag == this .tt.tagTh)))) {
1701: if (prev != null) {
1702: prev.next = next;
1703: } else {
1704: node.attributes = next;
1705: }
1706:
1707: } else {
1708: prev = attr;
1709: }
1710: }
1711: }
1712:
1713: /**
1714: * Word2000 uses span excessively, so we strip span out.
1715: * @param lexer Lexer
1716: * @param span Node span
1717: * @return cleaned node
1718: */
1719: public Node stripSpan(Lexer lexer, Node span) {
1720: Node node;
1721: Node prev = null;
1722: Node content;
1723:
1724: // deal with span elements that have content by splicing the content in place of the span after having
1725: // processed it
1726:
1727: cleanWord2000(lexer, span.content);
1728: content = span.content;
1729:
1730: if (span.prev != null) {
1731: prev = span.prev;
1732: } else if (content != null) {
1733: node = content;
1734: content = content.next;
1735: node.removeNode();
1736: Node.insertNodeBeforeElement(span, node);
1737: prev = node;
1738: }
1739:
1740: while (content != null) {
1741: node = content;
1742: content = content.next;
1743: node.removeNode();
1744: prev.insertNodeAfterElement(node);
1745: prev = node;
1746: }
1747:
1748: if (span.next == null) {
1749: span.parent.last = prev;
1750: }
1751:
1752: node = span.next;
1753: span.content = null;
1754: Node.discardElement(span);
1755: return node;
1756: }
1757:
1758: /**
1759: * Map non-breaking spaces to regular spaces.
1760: * @param lexer Lexer
1761: * @param node Node
1762: */
1763: private void normalizeSpaces(Lexer lexer, Node node) {
1764: while (node != null) {
1765: if (node.content != null) {
1766: normalizeSpaces(lexer, node.content);
1767: }
1768:
1769: if (node.type == Node.TEXT_NODE) {
1770: int i;
1771: int[] c = new int[1];
1772: int p = node.start;
1773:
1774: for (i = node.start; i < node.end; ++i) {
1775: c[0] = node.textarray[i];
1776:
1777: // look for UTF-8 multibyte character
1778: if (c[0] > 0x7F) {
1779: i += PPrint.getUTF8(node.textarray, i, c);
1780: }
1781:
1782: if (c[0] == 160) {
1783: c[0] = ' ';
1784: }
1785:
1786: p = PPrint.putUTF8(node.textarray, p, c[0]);
1787: }
1788: }
1789:
1790: node = node.next;
1791: }
1792: }
1793:
1794: /**
1795: * Used to hunt for hidden preformatted sections.
1796: * @param node checked node
1797: * @return <code>true</code> if the node has a "margin-top: 0" or "margin-bottom: 0" style
1798: */
1799: boolean noMargins(Node node) {
1800: AttVal attval = node.getAttrByName("style");
1801:
1802: if (attval == null || attval.value == null) {
1803: return false;
1804: }
1805:
1806: // search for substring "margin-top: 0"
1807: if (attval.value.indexOf("margin-top: 0") == -1) {
1808: return false;
1809: }
1810:
1811: // search for substring "margin-top: 0"
1812: if (attval.value.indexOf("margin-bottom: 0") == -1) {
1813: return false;
1814: }
1815:
1816: return true;
1817: }
1818:
1819: /**
1820: * Does element have a single space as its content?
1821: * @param lexer Lexer
1822: * @param node checked node
1823: * @return <code>true</code> if the element has a single space as its content
1824: */
1825: boolean singleSpace(Lexer lexer, Node node) {
1826: if (node.content != null) {
1827: node = node.content;
1828:
1829: if (node.next != null) {
1830: return false;
1831: }
1832:
1833: if (node.type != Node.TEXT_NODE) {
1834: return false;
1835: }
1836:
1837: if (((node.end - node.start) == 1)
1838: && lexer.lexbuf[node.start] == ' ') {
1839: return true;
1840: }
1841:
1842: if ((node.end - node.start) == 2) {
1843: int[] c = new int[1];
1844:
1845: PPrint.getUTF8(lexer.lexbuf, node.start, c);
1846:
1847: if (c[0] == 160) {
1848: return true;
1849: }
1850: }
1851: }
1852:
1853: return false;
1854: }
1855:
1856: /**
1857: * This is a major clean up to strip out all the extra stuff you get when you save as web page from Word 2000. It
1858: * doesn't yet know what to do with VML tags, but these will appear as errors unless you declare them as new tags,
1859: * such as o:p which needs to be declared as inline.
1860: * @param lexer Lexer
1861: * @param node node to clean up
1862: */
1863: public void cleanWord2000(Lexer lexer, Node node) {
1864: // used to a list from a sequence of bulletted p's
1865: Node list = null;
1866:
1867: while (node != null) {
1868:
1869: // get rid of Word's xmlns attributes
1870: if (node.tag == tt.tagHtml) {
1871: // check that it's a Word 2000 document
1872: if ((node.getAttrByName("xmlns:o") == null)) {
1873: return;
1874: }
1875: lexer.configuration.tt.freeAttrs(node);
1876: }
1877:
1878: // fix up preformatted sections by looking for a sequence of paragraphs with zero top/bottom margin
1879: if (node.tag == tt.tagP) {
1880: if (noMargins(node)) {
1881: Node pre;
1882: Node next;
1883: Node.coerceNode(lexer, node, tt.tagPre);
1884:
1885: purgeWord2000Attributes(node);
1886:
1887: if (node.content != null) {
1888: cleanWord2000(lexer, node.content);
1889: }
1890:
1891: pre = node;
1892: node = node.next;
1893:
1894: // continue to strip p's
1895: while (node.tag == tt.tagP && noMargins(node)) {
1896: next = node.next;
1897: node.removeNode();
1898: pre.insertNodeAtEnd(lexer.newLineNode());
1899: pre.insertNodeAtEnd(node);
1900: stripSpan(lexer, node);
1901: node = next;
1902: }
1903:
1904: if (node == null) {
1905: break;
1906: }
1907: }
1908: }
1909:
1910: if (node.tag != null
1911: && TidyUtils.toBoolean(node.tag.model
1912: & Dict.CM_BLOCK)
1913: && singleSpace(lexer, node)) {
1914: node = stripSpan(lexer, node);
1915: continue;
1916: }
1917:
1918: // discard Word's style verbiage
1919: if (node.tag == this .tt.tagStyle
1920: || node.tag == this .tt.tagMeta
1921: || node.type == Node.COMMENT_TAG) {
1922: node = Node.discardElement(node);
1923: continue;
1924: }
1925:
1926: // strip out all span and font tags Word scatters so liberally!
1927: if (node.tag == this .tt.tagSpan
1928: || node.tag == this .tt.tagFont) {
1929: node = stripSpan(lexer, node);
1930: continue;
1931: }
1932:
1933: if (node.tag == this .tt.tagLink) {
1934: AttVal attr = node.getAttrByName("rel");
1935:
1936: if (attr != null && attr.value != null
1937: && attr.value.equals("File-List")) {
1938: node = Node.discardElement(node);
1939: continue;
1940: }
1941: }
1942:
1943: // discard empty paragraphs
1944: if (node.content == null && node.tag == this .tt.tagP) {
1945: node = Node.discardElement(node);
1946: continue;
1947: }
1948:
1949: if (node.tag == this .tt.tagP) {
1950: AttVal attr = node.getAttrByName("class");
1951: AttVal atrStyle = node.getAttrByName("style");
1952:
1953: // (JES) Sometimes Word marks a list item with the following hokie syntax
1954: // <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
1955: // translate these into <li>
1956:
1957: // map sequence of <p class="MsoListBullet"> to <ul> ... </ul>
1958: // map <p class="MsoListNumber"> to <ol>...</ol>
1959: if (attr != null
1960: && attr.value != null
1961: && ((attr.value.equals("MsoListBullet") || attr.value
1962: .equals("MsoListNumber")) //
1963: || (atrStyle != null && (atrStyle.value
1964: .indexOf("mso-list:") != -1)))) // 463066 - fix by Joel
1965: // Shafer 19 Sep 01
1966: {
1967: Dict listType = tt.tagUl;
1968:
1969: if (attr.value.equals("MsoListNumber")) {
1970: listType = tt.tagOl;
1971: }
1972:
1973: Node.coerceNode(lexer, node, this .tt.tagLi);
1974:
1975: if (list == null || list.tag != listType) {
1976: list = lexer.inferredTag(listType.name);
1977: Node.insertNodeBeforeElement(node, list);
1978: }
1979:
1980: purgeWord2000Attributes(node);
1981:
1982: if (node.content != null) {
1983: cleanWord2000(lexer, node.content);
1984: }
1985:
1986: // remove node and append to contents of list
1987: node.removeNode();
1988: list.insertNodeAtEnd(node);
1989: node = list;
1990: }
1991: // map sequence of <p class="Code"> to <pre> ... </pre>
1992: else if (attr != null && attr.value != null
1993: && attr.value.equals("Code")) {
1994: Node br = lexer.newLineNode();
1995: normalizeSpaces(lexer, node);
1996:
1997: if (list == null || list.tag != this .tt.tagPre) {
1998: list = lexer.inferredTag("pre");
1999: Node.insertNodeBeforeElement(node, list);
2000: }
2001:
2002: // remove node and append to contents of list
2003: node.removeNode();
2004: list.insertNodeAtEnd(node);
2005: stripSpan(lexer, node);
2006: list.insertNodeAtEnd(br);
2007: node = list.next;
2008: } else {
2009: list = null;
2010: }
2011: } else {
2012: list = null;
2013: }
2014:
2015: // strip out style and class attributes
2016: if (node.type == Node.START_TAG
2017: || node.type == Node.START_END_TAG) {
2018: purgeWord2000Attributes(node);
2019: }
2020:
2021: if (node.content != null) {
2022: cleanWord2000(lexer, node.content);
2023: }
2024:
2025: node = node.next;
2026: }
2027: }
2028:
2029: /**
2030: * Check if the current document is a converted Word document.
2031: * @param root root Node
2032: * @return <code>true</code> if the document has been geenrated by Microsoft Word.
2033: */
2034: public boolean isWord2000(Node root) {
2035: AttVal attval;
2036: Node node;
2037: Node head;
2038: Node html = root.findHTML(this .tt);
2039:
2040: if (html != null && html.getAttrByName("xmlns:o") != null) {
2041: return true;
2042: }
2043:
2044: // search for <meta name="GENERATOR" content="Microsoft ...">
2045: head = root.findHEAD(tt);
2046:
2047: if (head != null) {
2048: for (node = head.content; node != null; node = node.next) {
2049: if (node.tag != tt.tagMeta) {
2050: continue;
2051: }
2052:
2053: attval = node.getAttrByName("name");
2054:
2055: if (attval == null || attval.value == null) {
2056: continue;
2057: }
2058:
2059: if (!"generator".equals(attval.value)) {
2060: continue;
2061: }
2062:
2063: attval = node.getAttrByName("content");
2064:
2065: if (attval == null || attval.value == null) {
2066: continue;
2067: }
2068:
2069: if (attval.value.indexOf("Microsoft") != -1) {
2070: return true;
2071: }
2072: }
2073: }
2074:
2075: return false;
2076: }
2077:
2078: /**
2079: * Where appropriate move object elements from head to body.
2080: * @param lexer Lexer
2081: * @param html html node
2082: */
2083: static void bumpObject(Lexer lexer, Node html) {
2084: if (html == null) {
2085: return;
2086: }
2087:
2088: Node node, next, head = null, body = null;
2089: TagTable tt = lexer.configuration.tt;
2090: for (node = html.content; node != null; node = node.next) {
2091: if (node.tag == tt.tagHead) {
2092: head = node;
2093: }
2094:
2095: if (node.tag == tt.tagBody) {
2096: body = node;
2097: }
2098: }
2099:
2100: if (head != null && body != null) {
2101: for (node = head.content; node != null; node = next) {
2102: next = node.next;
2103:
2104: if (node.tag == tt.tagObject) {
2105: Node child;
2106: boolean bump = false;
2107:
2108: for (child = node.content; child != null; child = child.next) {
2109: // bump to body unless content is param
2110: if ((child.type == Node.TEXT_NODE && !node
2111: .isBlank(lexer))
2112: || child.tag != tt.tagParam) {
2113: bump = true;
2114: break;
2115: }
2116: }
2117:
2118: if (bump) {
2119: node.removeNode();
2120: body.insertNodeAtStart(node);
2121: }
2122: }
2123: }
2124: }
2125: }
2126:
2127: }
|