0001: /*
0002: * @(#)Clean.java 1.11 2000/08/16
0003: *
0004: */
0005:
0006: package org.w3c.tidy;
0007:
0008: /**
0009: *
0010: * Clean up misuse of presentation markup
0011: *
0012: * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
0013: * See Tidy.java for the copyright notice.
0014: * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
0015: * HTML Tidy Release 4 Aug 2000</a>
0016: *
0017: * @author Dave Raggett <dsr@w3.org>
0018: * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
0019: * @version 1.0, 1999/05/22
0020: * @version 1.0.1, 1999/05/29
0021: * @version 1.1, 1999/06/18 Java Bean
0022: * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
0023: * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
0024: * @version 1.4, 1999/09/04 DOM support
0025: * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
0026: * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
0027: * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
0028: * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
0029: * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
0030: * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
0031: * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
0032: */
0033:
0034: /*
Filters from other formats such as Microsoft Word
often make excessive use of presentation markup such
as font tags, B, I, and the align attribute. By applying
a set of production rules, it is straightforward to
transform this to use CSS.
0040:
0041: Some rules replace some of the children of an element by
0042: style properties on the element, e.g.
0043:
0044: <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
0045:
Such rules are applied to the element's content and then
to the element itself until no more rules apply.
Having applied all the rules to an element, it will have
a style attribute with one or more properties.
0050:
0051: Other rules strip the element they apply to, replacing
0052: it by style properties on the contents, e.g.
0053:
<dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
0055:
0056: These rules are applied to an element before processing
0057: its content and replace the current element by the first
0058: element in the exposed content.
0059:
0060: After applying both sets of rules, you can replace the
0061: style attribute by a class value and style rule in the
0062: document head. To support this, an association of styles
0063: and class names is built.
0064:
0065: A naive approach is to rely on string matching to test
0066: when two property lists are the same. A better approach
0067: would be to first sort the properties before matching.
0068: */
0069:
0070: public class Clean {
0071:
0072: private int classNum = 1;
0073:
0074: private TagTable tt;
0075:
0076: public Clean(TagTable tt) {
0077: this .tt = tt;
0078: }
0079:
0080: private StyleProp insertProperty(StyleProp props, String name,
0081: String value) {
0082: StyleProp first, prev, prop;
0083: int cmp;
0084:
0085: prev = null;
0086: first = props;
0087:
0088: while (props != null) {
0089: cmp = props.name.compareTo(name);
0090:
0091: if (cmp == 0) {
0092: /* this property is already defined, ignore new value */
0093: return first;
0094: }
0095:
0096: if (cmp > 0) // props.name > name
0097: {
0098: /* insert before this */
0099:
0100: prop = new StyleProp(name, value, props);
0101:
0102: if (prev != null)
0103: prev.next = prop;
0104: else
0105: first = prop;
0106:
0107: return first;
0108: }
0109:
0110: prev = props;
0111: props = props.next;
0112: }
0113:
0114: prop = new StyleProp(name, value);
0115:
0116: if (prev != null)
0117: prev.next = prop;
0118: else
0119: first = prop;
0120:
0121: return first;
0122: }
0123:
/*
Create sorted linked list of properties from style string.
The string is scanned for the ':' that ends each property
name and the ';' that ends each value; the name and value
are copied out with substring() and inserted in name order,
so the style string itself is never modified.
*/
0131: private StyleProp createProps(StyleProp prop, String style) {
0132: int name_end;
0133: int value_end;
0134: int value_start = 0;
0135: int name_start = 0;
0136: boolean more;
0137:
0138: name_start = 0;
0139: while (name_start < style.length()) {
0140: while (name_start < style.length()
0141: && style.charAt(name_start) == ' ')
0142: ++name_start;
0143:
0144: name_end = name_start;
0145:
0146: while (name_end < style.length()) {
0147: if (style.charAt(name_end) == ':') {
0148: value_start = name_end + 1;
0149: break;
0150: }
0151:
0152: ++name_end;
0153: }
0154:
0155: if (name_end >= style.length()
0156: || style.charAt(name_end) != ':')
0157: break;
0158:
0159: while (value_start < style.length()
0160: && style.charAt(value_start) == ' ')
0161: ++value_start;
0162:
0163: value_end = value_start;
0164: more = false;
0165:
0166: while (value_end < style.length()) {
0167: if (style.charAt(value_end) == ';') {
0168: more = true;
0169: break;
0170: }
0171:
0172: ++value_end;
0173: }
0174:
0175: prop = insertProperty(prop, style.substring(name_start,
0176: name_end), style.substring(value_start, value_end));
0177:
0178: if (more) {
0179: name_start = value_end + 1;
0180: continue;
0181: }
0182:
0183: break;
0184: }
0185:
0186: return prop;
0187: }
0188:
0189: private String createPropString(StyleProp props) {
0190: String style = "";
0191: int len;
0192: StyleProp prop;
0193:
0194: /* compute length */
0195:
0196: for (len = 0, prop = props; prop != null; prop = prop.next) {
0197: len += prop.name.length() + 2;
0198: len += prop.value.length() + 2;
0199: }
0200:
0201: for (prop = props; prop != null; prop = prop.next) {
0202: style = style.concat(prop.name);
0203: style = style.concat(": ");
0204:
0205: style = style.concat(prop.value);
0206:
0207: if (prop.next == null)
0208: break;
0209:
0210: style = style.concat("; ");
0211: }
0212:
0213: return style;
0214: }
0215:
0216: /*
0217: create string with merged properties
0218: */
0219: private String addProperty(String style, String property) {
0220: StyleProp prop;
0221:
0222: prop = createProps(null, style);
0223: prop = createProps(prop, property);
0224: style = createPropString(prop);
0225: return style;
0226: }
0227:
0228: private String gensymClass(String tag) {
0229: String str;
0230:
0231: str = "c" + classNum;
0232: classNum++;
0233: return str;
0234: }
0235:
0236: private String findStyle(Lexer lexer, String tag, String properties) {
0237: Style style;
0238:
0239: for (style = lexer.styles; style != null; style = style.next) {
0240: if (style.tag.equals(tag)
0241: && style.properties.equals(properties))
0242: return style.tagClass;
0243: }
0244:
0245: style = new Style(tag, gensymClass(tag), properties,
0246: lexer.styles);
0247: lexer.styles = style;
0248: return style.tagClass;
0249: }
0250:
0251: /*
0252: Find style attribute in node, and replace it
0253: by corresponding class attribute. Search for
0254: class in style dictionary otherwise gensym
0255: new class and add to dictionary.
0256:
0257: Assumes that node doesn't have a class attribute
0258: */
0259: private void style2Rule(Lexer lexer, Node node) {
0260: AttVal styleattr, classattr;
0261: String classname;
0262:
0263: styleattr = node.getAttrByName("style");
0264:
0265: if (styleattr != null) {
0266: classname = findStyle(lexer, node.element, styleattr.value);
0267: classattr = node.getAttrByName("class");
0268:
0269: /*
0270: if there already is a class attribute
0271: then append class name after a space
0272: */
0273: if (classattr != null) {
0274: classattr.value = classattr.value + " " + classname;
0275: node.removeAttribute(styleattr);
0276: } else /* reuse style attribute for class attribute */
0277: {
0278: styleattr.attribute = "class";
0279: styleattr.value = classname;
0280: }
0281: }
0282: }
0283:
0284: private void addColorRule(Lexer lexer, String selector, String color) {
0285: if (color != null) {
0286: lexer.addStringLiteral(selector);
0287: lexer.addStringLiteral(" { color: ");
0288: lexer.addStringLiteral(color);
0289: lexer.addStringLiteral(" }\n");
0290: }
0291: }
0292:
0293: /*
0294: move presentation attribs from body to style element
0295:
0296: background="foo" -> body { background-image: url(foo) }
0297: bgcolor="foo" -> body { background-color: foo }
0298: text="foo" -> body { color: foo }
0299: link="foo" -> :link { color: foo }
0300: vlink="foo" -> :visited { color: foo }
0301: alink="foo" -> :active { color: foo }
0302: */
0303: private void cleanBodyAttrs(Lexer lexer, Node body) {
0304: AttVal attr;
0305: String bgurl = null;
0306: String bgcolor = null;
0307: String color = null;
0308:
0309: attr = body.getAttrByName("background");
0310:
0311: if (attr != null) {
0312: bgurl = attr.value;
0313: attr.value = null;
0314: body.removeAttribute(attr);
0315: }
0316:
0317: attr = body.getAttrByName("bgcolor");
0318:
0319: if (attr != null) {
0320: bgcolor = attr.value;
0321: attr.value = null;
0322: body.removeAttribute(attr);
0323: }
0324:
0325: attr = body.getAttrByName("text");
0326:
0327: if (attr != null) {
0328: color = attr.value;
0329: attr.value = null;
0330: body.removeAttribute(attr);
0331: }
0332:
0333: if (bgurl != null || bgcolor != null || color != null) {
0334: lexer.addStringLiteral(" body {\n");
0335:
0336: if (bgurl != null) {
0337: lexer.addStringLiteral(" background-image: url(");
0338: lexer.addStringLiteral(bgurl);
0339: lexer.addStringLiteral(");\n");
0340: }
0341:
0342: if (bgcolor != null) {
0343: lexer.addStringLiteral(" background-color: ");
0344: lexer.addStringLiteral(bgcolor);
0345: lexer.addStringLiteral(";\n");
0346: }
0347:
0348: if (color != null) {
0349: lexer.addStringLiteral(" color: ");
0350: lexer.addStringLiteral(color);
0351: lexer.addStringLiteral(";\n");
0352: }
0353:
0354: lexer.addStringLiteral(" }\n");
0355: }
0356:
0357: attr = body.getAttrByName("link");
0358:
0359: if (attr != null) {
0360: addColorRule(lexer, " :link", attr.value);
0361: body.removeAttribute(attr);
0362: }
0363:
0364: attr = body.getAttrByName("vlink");
0365:
0366: if (attr != null) {
0367: addColorRule(lexer, " :visited", attr.value);
0368: body.removeAttribute(attr);
0369: }
0370:
0371: attr = body.getAttrByName("alink");
0372:
0373: if (attr != null) {
0374: addColorRule(lexer, " :active", attr.value);
0375: body.removeAttribute(attr);
0376: }
0377: }
0378:
0379: private boolean niceBody(Lexer lexer, Node doc) {
0380: Node body = doc.findBody(lexer.configuration.tt);
0381:
0382: if (body != null) {
0383: if (body.getAttrByName("background") != null
0384: || body.getAttrByName("bgcolor") != null
0385: || body.getAttrByName("text") != null
0386: || body.getAttrByName("link") != null
0387: || body.getAttrByName("vlink") != null
0388: || body.getAttrByName("alink") != null) {
0389: lexer.badLayout |= Report.USING_BODY;
0390: return false;
0391: }
0392: }
0393:
0394: return true;
0395: }
0396:
0397: /* create style element using rules from dictionary */
0398: private void createStyleElement(Lexer lexer, Node doc) {
0399: Node node, head, body;
0400: Style style;
0401: AttVal av;
0402:
0403: if (lexer.styles == null && niceBody(lexer, doc))
0404: return;
0405:
0406: node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
0407: node.implicit = true;
0408:
0409: /* insert type attribute */
0410: av = new AttVal(null, null, '"', "type", "text/css");
0411: av.dict = AttributeTable.getDefaultAttributeTable()
0412: .findAttribute(av);
0413: node.attributes = av;
0414:
0415: body = doc.findBody(lexer.configuration.tt);
0416:
0417: lexer.txtstart = lexer.lexsize;
0418:
0419: if (body != null)
0420: cleanBodyAttrs(lexer, body);
0421:
0422: for (style = lexer.styles; style != null; style = style.next) {
0423: lexer.addCharToLexer(' ');
0424: lexer.addStringLiteral(style.tag);
0425: lexer.addCharToLexer('.');
0426: lexer.addStringLiteral(style.tagClass);
0427: lexer.addCharToLexer(' ');
0428: lexer.addCharToLexer('{');
0429: lexer.addStringLiteral(style.properties);
0430: lexer.addCharToLexer('}');
0431: lexer.addCharToLexer('\n');
0432: }
0433:
0434: lexer.txtend = lexer.lexsize;
0435:
0436: Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode,
0437: lexer.lexbuf, lexer.txtstart, lexer.txtend));
0438:
0439: /*
0440: now insert style element into document head
0441:
0442: doc is root node. search its children for html node
0443: the head node should be first child of html node
0444: */
0445:
0446: head = doc.findHEAD(lexer.configuration.tt);
0447:
0448: if (head != null)
0449: Node.insertNodeAtEnd(head, node);
0450: }
0451:
0452: /* ensure bidirectional links are consistent */
0453: private void fixNodeLinks(Node node) {
0454: Node child;
0455:
0456: if (node.prev != null)
0457: node.prev.next = node;
0458: else
0459: node.parent.content = node;
0460:
0461: if (node.next != null)
0462: node.next.prev = node;
0463: else
0464: node.parent.last = node;
0465:
0466: for (child = node.content; child != null; child = child.next)
0467: child.parent = node;
0468: }
0469:
0470: /*
0471: used to strip child of node when
0472: the node has one and only one child
0473: */
0474: private void stripOnlyChild(Node node) {
0475: Node child;
0476:
0477: child = node.content;
0478: node.content = child.content;
0479: node.last = child.last;
0480: child.content = null;
0481:
0482: for (child = node.content; child != null; child = child.next)
0483: child.parent = node;
0484: }
0485:
0486: /* used to strip font start and end tags */
0487: private void discardContainer(Node element, MutableObject pnode) {
0488: Node node;
0489: Node parent = element.parent;
0490:
0491: if (element.content != null) {
0492: element.last.next = element.next;
0493:
0494: if (element.next != null) {
0495: element.next.prev = element.last;
0496: element.last.next = element.next;
0497: } else
0498: parent.last = element.last;
0499:
0500: if (element.prev != null) {
0501: element.content.prev = element.prev;
0502: element.prev.next = element.content;
0503: } else
0504: parent.content = element.content;
0505:
0506: for (node = element.content; node != null; node = node.next)
0507: node.parent = parent;
0508:
0509: pnode.setObject(element.content);
0510: } else {
0511: if (element.next != null)
0512: element.next.prev = element.prev;
0513: else
0514: parent.last = element.prev;
0515:
0516: if (element.prev != null)
0517: element.prev.next = element.next;
0518: else
0519: parent.content = element.next;
0520:
0521: pnode.setObject(element.next);
0522: }
0523:
0524: element.next = null;
0525: element.content = null;
0526: }
0527:
0528: /*
0529: Add style property to element, creating style
0530: attribute as needed and adding ; delimiter
0531: */
0532: private void addStyleProperty(Node node, String property) {
0533: AttVal av;
0534:
0535: for (av = node.attributes; av != null; av = av.next) {
0536: if (av.attribute.equals("style"))
0537: break;
0538: }
0539:
0540: /* if style attribute already exists then insert property */
0541:
0542: if (av != null) {
0543: String s;
0544:
0545: s = addProperty(av.value, property);
0546: av.value = s;
0547: } else /* else create new style attribute */
0548: {
0549: av = new AttVal(node.attributes, null, '"', "style",
0550: property);
0551: av.dict = AttributeTable.getDefaultAttributeTable()
0552: .findAttribute(av);
0553: node.attributes = av;
0554: }
0555: }
0556:
/*
Create new string that consists of the
combined style properties in s1 and s2

To merge property lists, we build a linked
list of property/values and insert properties
into the list in name order. When both strings
define the same property name, the value from
s1 is kept and the one from s2 is ignored.
*/
0566: private String mergeProperties(String s1, String s2) {
0567: String s;
0568: StyleProp prop;
0569:
0570: prop = createProps(null, s1);
0571: prop = createProps(prop, s2);
0572: s = createPropString(prop);
0573: return s;
0574: }
0575:
0576: private void mergeStyles(Node node, Node child) {
0577: AttVal av;
0578: String s1, s2, style;
0579:
0580: for (s2 = null, av = child.attributes; av != null; av = av.next) {
0581: if (av.attribute.equals("style")) {
0582: s2 = av.value;
0583: break;
0584: }
0585: }
0586:
0587: for (s1 = null, av = node.attributes; av != null; av = av.next) {
0588: if (av.attribute.equals("style")) {
0589: s1 = av.value;
0590: break;
0591: }
0592: }
0593:
0594: if (s1 != null) {
0595: if (s2 != null) /* merge styles from both */
0596: {
0597: style = mergeProperties(s1, s2);
0598: av.value = style;
0599: }
0600: } else if (s2 != null) /* copy style of child */
0601: {
0602: av = new AttVal(node.attributes, null, '"', "style", s2);
0603: av.dict = AttributeTable.getDefaultAttributeTable()
0604: .findAttribute(av);
0605: node.attributes = av;
0606: }
0607: }
0608:
0609: private String fontSize2Name(String size) {
0610: /*
0611: String[] sizes =
0612: {
0613: "50%",
0614: "60%",
0615: "80%",
0616: null,
0617: "120%",
0618: "150%",
0619: "200%"
0620: };
0621: */
0622:
0623: String[] sizes = { "60%", "70%", "80%", null, "120%", "150%",
0624: "200%" };
0625: String buf;
0626:
0627: if (size.length() > 0 && '0' <= size.charAt(0)
0628: && size.charAt(0) <= '6') {
0629: int n = size.charAt(0) - '0';
0630: return sizes[n];
0631: }
0632:
0633: if (size.length() > 0 && size.charAt(0) == '-') {
0634: if (size.length() > 1 && '0' <= size.charAt(1)
0635: && size.charAt(1) <= '6') {
0636: int n = size.charAt(1) - '0';
0637: double x;
0638:
0639: for (x = 1.0; n > 0; --n)
0640: x *= 0.8;
0641:
0642: x *= 100.0;
0643: buf = "" + (int) x + "%";
0644:
0645: return buf;
0646: }
0647:
0648: return "smaller"; /*"70%"; */
0649: }
0650:
0651: if (size.length() > 1 && '0' <= size.charAt(1)
0652: && size.charAt(1) <= '6') {
0653: int n = size.charAt(1) - '0';
0654: double x;
0655:
0656: for (x = 1.0; n > 0; --n)
0657: x *= 1.2;
0658:
0659: x *= 100.0;
0660: buf = "" + (int) x + "%";
0661:
0662: return buf;
0663: }
0664:
0665: return "larger"; /* "140%" */
0666: }
0667:
0668: private void addFontFace(Node node, String face) {
0669: addStyleProperty(node, "font-family: " + face);
0670: }
0671:
0672: private void addFontSize(Node node, String size) {
0673: String value;
0674:
0675: if (size.equals("6") && node.tag == tt.tagP) {
0676: node.element = "h1";
0677: tt.findTag(node);
0678: return;
0679: }
0680:
0681: if (size.equals("5") && node.tag == tt.tagP) {
0682: node.element = "h2";
0683: tt.findTag(node);
0684: return;
0685: }
0686:
0687: if (size.equals("4") && node.tag == tt.tagP) {
0688: node.element = "h3";
0689: tt.findTag(node);
0690: return;
0691: }
0692:
0693: value = fontSize2Name(size);
0694:
0695: if (value != null) {
0696: addStyleProperty(node, "font-size: " + value);
0697: }
0698: }
0699:
0700: private void addFontColor(Node node, String color) {
0701: addStyleProperty(node, "color: " + color);
0702: }
0703:
0704: private void addAlign(Node node, String align) {
0705: /* force alignment value to lower case */
0706: addStyleProperty(node, "text-align: " + align.toLowerCase());
0707: }
0708:
0709: /*
0710: add style properties to node corresponding to
0711: the font face, size and color attributes
0712: */
0713: private void addFontStyles(Node node, AttVal av) {
0714: while (av != null) {
0715: if (av.attribute.equals("face"))
0716: addFontFace(node, av.value);
0717: else if (av.attribute.equals("size"))
0718: addFontSize(node, av.value);
0719: else if (av.attribute.equals("color"))
0720: addFontColor(node, av.value);
0721:
0722: av = av.next;
0723: }
0724: }
0725:
0726: /*
0727: Symptom: <p align=center>
0728: Action: <p style="text-align: center">
0729: */
0730: private void textAlign(Lexer lexer, Node node) {
0731: AttVal av, prev;
0732:
0733: prev = null;
0734:
0735: for (av = node.attributes; av != null; av = av.next) {
0736: if (av.attribute.equals("align")) {
0737: if (prev != null)
0738: prev.next = av.next;
0739: else
0740: node.attributes = av.next;
0741:
0742: if (av.value != null) {
0743: addAlign(node, av.value);
0744: }
0745:
0746: break;
0747: }
0748:
0749: prev = av;
0750: }
0751: }
0752:
/*
The clean up rules use the pnode argument to return the
next node when the original node has been deleted
*/
0757:
0758: /*
0759: Symptom: <dir> <li> where <li> is only child
0760: Action: coerce <dir> <li> to <div> with indent.
0761: */
0762:
0763: private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
0764: Node child;
0765:
0766: if (node.tag == tt.tagDir || node.tag == tt.tagUl
0767: || node.tag == tt.tagOl) {
0768: child = node.content;
0769:
0770: if (child == null)
0771: return false;
0772:
0773: /* check child has no peers */
0774:
0775: if (child.next != null)
0776: return false;
0777:
0778: if (child.tag != tt.tagLi)
0779: return false;
0780:
0781: if (!child.implicit)
0782: return false;
0783:
0784: /* coerce dir to div */
0785:
0786: node.tag = tt.tagDiv;
0787: node.element = "div";
0788: addStyleProperty(node, "margin-left: 2em");
0789: stripOnlyChild(node);
0790: return true;
0791:
0792: //#if 0
0793: //Node content;
0794: //Node last;
0795: //content = child.content;
0796: //last = child.last;
0797: //child.content = null;
0798:
0799: /* adjust parent and set margin on contents of <li> */
0800:
0801: //for (child = content; child != null; child = child.next)
0802: //{
0803: // child.parent = node.parent;
0804: // addStyleProperty(child, "margin-left: 1em");
0805: //}
0806: /* hook first/last into sequence */
0807:
0808: //if (content != null)
0809: //{
0810: // content.prev = node.prev;
0811: // last.next = node.next;
0812: // fixNodeLinks(content);
0813: // fixNodeLinks(last);
0814: //}
0815: //node.next = null;
0816: /* ensure that new node is cleaned */
0817: //pnode.setObject(cleanNode(lexer, content));
0818: //return true;
0819: //#endif
0820: }
0821:
0822: return false;
0823: }
0824:
0825: /*
0826: Symptom: <center>
0827: Action: replace <center> by <div style="text-align: center">
0828: */
0829:
0830: private boolean center2Div(Lexer lexer, Node node,
0831: MutableObject pnode) {
0832: if (node.tag == tt.tagCenter) {
0833: if (lexer.configuration.DropFontTags) {
0834: if (node.content != null) {
0835: Node last = node.last;
0836: Node parent = node.parent;
0837:
0838: discardContainer(node, pnode);
0839:
0840: node = lexer.inferredTag("br");
0841:
0842: if (last.next != null)
0843: last.next.prev = node;
0844:
0845: node.next = last.next;
0846: last.next = node;
0847: node.prev = last;
0848:
0849: if (parent.last == last)
0850: parent.last = node;
0851:
0852: node.parent = parent;
0853: } else {
0854: Node prev = node.prev;
0855: Node next = node.next;
0856: Node parent = node.parent;
0857: discardContainer(node, pnode);
0858:
0859: node = lexer.inferredTag("br");
0860: node.next = next;
0861: node.prev = prev;
0862: node.parent = parent;
0863:
0864: if (next != null)
0865: next.prev = node;
0866: else
0867: parent.last = node;
0868:
0869: if (prev != null)
0870: prev.next = node;
0871: else
0872: parent.content = node;
0873: }
0874:
0875: return true;
0876: }
0877: node.tag = tt.tagDiv;
0878: node.element = "div";
0879: addStyleProperty(node, "text-align: center");
0880: return true;
0881: }
0882:
0883: return false;
0884: }
0885:
0886: /*
0887: Symptom <div><div>...</div></div>
0888: Action: merge the two divs
0889:
0890: This is useful after nested <dir>s used by Word
0891: for indenting have been converted to <div>s
0892: */
0893: private boolean mergeDivs(Lexer lexer, Node node,
0894: MutableObject pnode) {
0895: Node child;
0896:
0897: if (node.tag != tt.tagDiv)
0898: return false;
0899:
0900: child = node.content;
0901:
0902: if (child == null)
0903: return false;
0904:
0905: if (child.tag != tt.tagDiv)
0906: return false;
0907:
0908: if (child.next != null)
0909: return false;
0910:
0911: mergeStyles(node, child);
0912: stripOnlyChild(node);
0913: return true;
0914: }
0915:
0916: /*
0917: Symptom: <ul><li><ul>...</ul></li></ul>
0918: Action: discard outer list
0919: */
0920:
0921: private boolean nestedList(Lexer lexer, Node node,
0922: MutableObject pnode) {
0923: Node child, list;
0924:
0925: if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
0926: child = node.content;
0927:
0928: if (child == null)
0929: return false;
0930:
0931: /* check child has no peers */
0932:
0933: if (child.next != null)
0934: return false;
0935:
0936: list = child.content;
0937:
0938: if (list == null)
0939: return false;
0940:
0941: if (list.tag != node.tag)
0942: return false;
0943:
0944: pnode.setObject(node.next);
0945:
0946: /* move inner list node into position of outer node */
0947: list.prev = node.prev;
0948: list.next = node.next;
0949: list.parent = node.parent;
0950: fixNodeLinks(list);
0951:
0952: /* get rid of outer ul and its li */
0953: child.content = null;
0954: node.content = null;
0955: node.next = null;
0956:
0957: /*
0958: If prev node was a list the chances are this node
0959: should be appended to that list. Word has no way of
0960: recognizing nested lists and just uses indents
0961: */
0962:
0963: if (list.prev != null) {
0964: node = list;
0965: list = node.prev;
0966:
0967: if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
0968: list.next = node.next;
0969:
0970: if (list.next != null)
0971: list.next.prev = list;
0972:
0973: child = list.last; /* <li> */
0974:
0975: node.parent = child;
0976: node.next = null;
0977: node.prev = child.last;
0978: fixNodeLinks(node);
0979: }
0980: }
0981:
0982: cleanNode(lexer, node);
0983: return true;
0984: }
0985:
0986: return false;
0987: }
0988:
0989: /*
0990: Symptom: the only child of a block-level element is a
0991: presentation element such as B, I or FONT
0992:
0993: Action: add style "font-weight: bold" to the block and
0994: strip the <b> element, leaving its children.
0995:
0996: example:
0997:
0998: <p>
0999: <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1000: </p>
1001:
1002: becomes:
1003:
1004: <p style="font-weight: bold; font-family: Arial; font-size: 6">
1005: Draft Recommended Practice
1006: </p>
1007:
1008: This code also replaces the align attribute by a style attribute.
1009: However, to avoid CSS problems with Navigator 4, this isn't done
1010: for the elements: caption, tr and table
1011: */
1012: private boolean blockStyle(Lexer lexer, Node node,
1013: MutableObject pnode) {
1014: Node child;
1015:
1016: if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST
1017: | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) {
1018: if (node.tag != tt.tagTable && node.tag != tt.tagTr
1019: && node.tag != tt.tagLi) {
1020: /* check for align attribute */
1021: if (node.tag != tt.tagCaption)
1022: textAlign(lexer, node);
1023:
1024: child = node.content;
1025:
1026: if (child == null)
1027: return false;
1028:
1029: /* check child has no peers */
1030:
1031: if (child.next != null)
1032: return false;
1033:
1034: if (child.tag == tt.tagB) {
1035: mergeStyles(node, child);
1036: addStyleProperty(node, "font-weight: bold");
1037: stripOnlyChild(node);
1038: return true;
1039: }
1040:
1041: if (child.tag == tt.tagI) {
1042: mergeStyles(node, child);
1043: addStyleProperty(node, "font-style: italic");
1044: stripOnlyChild(node);
1045: return true;
1046: }
1047:
1048: if (child.tag == tt.tagFont) {
1049: mergeStyles(node, child);
1050: addFontStyles(node, child.attributes);
1051: stripOnlyChild(node);
1052: return true;
1053: }
1054: }
1055: }
1056:
1057: return false;
1058: }
1059:
1060: /* the only child of table cell or an inline element such as em */
1061: private boolean inlineStyle(Lexer lexer, Node node,
1062: MutableObject pnode) {
1063: Node child;
1064:
1065: if (node.tag != tt.tagFont
1066: && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) {
1067: child = node.content;
1068:
1069: if (child == null)
1070: return false;
1071:
1072: /* check child has no peers */
1073:
1074: if (child.next != null)
1075: return false;
1076:
1077: if (child.tag == tt.tagB
1078: && lexer.configuration.LogicalEmphasis) {
1079: mergeStyles(node, child);
1080: addStyleProperty(node, "font-weight: bold");
1081: stripOnlyChild(node);
1082: return true;
1083: }
1084:
1085: if (child.tag == tt.tagI
1086: && lexer.configuration.LogicalEmphasis) {
1087: mergeStyles(node, child);
1088: addStyleProperty(node, "font-style: italic");
1089: stripOnlyChild(node);
1090: return true;
1091: }
1092:
1093: if (child.tag == tt.tagFont) {
1094: mergeStyles(node, child);
1095: addFontStyles(node, child.attributes);
1096: stripOnlyChild(node);
1097: return true;
1098: }
1099: }
1100:
1101: return false;
1102: }
1103:
1104: /*
1105: Replace font elements by span elements, deleting
1106: the font element's attributes and replacing them
1107: by a single style attribute.
1108: */
1109: private boolean font2Span(Lexer lexer, Node node,
1110: MutableObject pnode) {
1111: AttVal av, style, next;
1112:
1113: if (node.tag == tt.tagFont) {
1114: if (lexer.configuration.DropFontTags) {
1115: discardContainer(node, pnode);
1116: return false;
1117: }
1118:
1119: /* if FONT is only child of parent element then leave alone */
1120: if (node.parent.content == node && node.next == null)
1121: return false;
1122:
1123: addFontStyles(node, node.attributes);
1124:
1125: /* extract style attribute and free the rest */
1126: av = node.attributes;
1127: style = null;
1128:
1129: while (av != null) {
1130: next = av.next;
1131:
1132: if (av.attribute.equals("style")) {
1133: av.next = null;
1134: style = av;
1135: }
1136:
1137: av = next;
1138: }
1139:
1140: node.attributes = style;
1141:
1142: node.tag = tt.tagSpan;
1143: node.element = "span";
1144:
1145: return true;
1146: }
1147:
1148: return false;
1149: }
1150:
1151: /*
1152: Applies all matching rules to a node.
1153: */
1154: private Node cleanNode(Lexer lexer, Node node) {
1155: Node next = null;
1156: MutableObject o = new MutableObject();
1157: boolean b = false;
1158:
1159: for (next = node; node.isElement(); node = next) {
1160: o.setObject(next);
1161:
1162: b = dir2Div(lexer, node, o);
1163: next = (Node) o.getObject();
1164: if (b)
1165: continue;
1166:
1167: b = nestedList(lexer, node, o);
1168: next = (Node) o.getObject();
1169: if (b)
1170: continue;
1171:
1172: b = center2Div(lexer, node, o);
1173: next = (Node) o.getObject();
1174: if (b)
1175: continue;
1176:
1177: b = mergeDivs(lexer, node, o);
1178: next = (Node) o.getObject();
1179: if (b)
1180: continue;
1181:
1182: b = blockStyle(lexer, node, o);
1183: next = (Node) o.getObject();
1184: if (b)
1185: continue;
1186:
1187: b = inlineStyle(lexer, node, o);
1188: next = (Node) o.getObject();
1189: if (b)
1190: continue;
1191:
1192: b = font2Span(lexer, node, o);
1193: next = (Node) o.getObject();
1194: if (b)
1195: continue;
1196:
1197: break;
1198: }
1199:
1200: return next;
1201: }
1202:
1203: private Node createStyleProperties(Lexer lexer, Node node) {
1204: Node child;
1205:
1206: if (node.content != null) {
1207: for (child = node.content; child != null; child = child.next) {
1208: child = createStyleProperties(lexer, child);
1209: }
1210: }
1211:
1212: return cleanNode(lexer, node);
1213: }
1214:
1215: private void defineStyleRules(Lexer lexer, Node node) {
1216: Node child;
1217:
1218: if (node.content != null) {
1219: for (child = node.content; child != null; child = child.next) {
1220: defineStyleRules(lexer, child);
1221: }
1222: }
1223:
1224: style2Rule(lexer, node);
1225: }
1226:
1227: public void cleanTree(Lexer lexer, Node doc) {
1228: doc = createStyleProperties(lexer, doc);
1229:
1230: if (!lexer.configuration.MakeClean) {
1231: defineStyleRules(lexer, doc);
1232: createStyleElement(lexer, doc);
1233: }
1234: }
1235:
1236: /* simplifies <b><b> ... </b> ...</b> etc. */
1237: public void nestedEmphasis(Node node) {
1238: MutableObject o = new MutableObject();
1239: Node next;
1240:
1241: while (node != null) {
1242: next = node.next;
1243:
1244: if ((node.tag == tt.tagB || node.tag == tt.tagI)
1245: && node.parent != null
1246: && node.parent.tag == node.tag) {
1247: /* strip redundant inner element */
1248: o.setObject(next);
1249: discardContainer(node, o);
1250: next = (Node) o.getObject();
1251: node = next;
1252: continue;
1253: }
1254:
1255: if (node.content != null)
1256: nestedEmphasis(node.content);
1257:
1258: node = next;
1259: }
1260: }
1261:
1262: /* replace i by em and b by strong */
1263: public void emFromI(Node node) {
1264: while (node != null) {
1265: if (node.tag == tt.tagI) {
1266: node.element = tt.tagEm.name;
1267: node.tag = tt.tagEm;
1268: } else if (node.tag == tt.tagB) {
1269: node.element = tt.tagStrong.name;
1270: node.tag = tt.tagStrong;
1271: }
1272:
1273: if (node.content != null)
1274: emFromI(node.content);
1275:
1276: node = node.next;
1277: }
1278: }
1279:
1280: /*
1281: Some people use dir or ul without an li
1282: to indent the content. The pattern to
1283: look for is a list with a single implicit
1284: li. This is recursively replaced by an
1285: implicit blockquote.
1286: */
1287: public void list2BQ(Node node) {
1288: while (node != null) {
1289: if (node.content != null)
1290: list2BQ(node.content);
1291:
1292: if (node.tag != null
1293: && node.tag.parser == ParserImpl.getParseList()
1294: && node.hasOneChild() && node.content.implicit) {
1295: stripOnlyChild(node);
1296: node.element = tt.tagBlockquote.name;
1297: node.tag = tt.tagBlockquote;
1298: node.implicit = true;
1299: }
1300:
1301: node = node.next;
1302: }
1303: }
1304:
1305: /*
1306: Replace implicit blockquote by div with an indent
1307: taking care to reduce nested blockquotes to a single
1308: div with the indent set to match the nesting depth
1309: */
1310: public void bQ2Div(Node node) {
1311: int indent;
1312: String indent_buf;
1313:
1314: while (node != null) {
1315: if (node.tag == tt.tagBlockquote && node.implicit) {
1316: indent = 1;
1317:
1318: while (node.hasOneChild()
1319: && node.content.tag == tt.tagBlockquote
1320: && node.implicit) {
1321: ++indent;
1322: stripOnlyChild(node);
1323: }
1324:
1325: if (node.content != null)
1326: bQ2Div(node.content);
1327:
1328: indent_buf = "margin-left: "
1329: + (new Integer(2 * indent)).toString() + "em";
1330:
1331: node.element = tt.tagDiv.name;
1332: node.tag = tt.tagDiv;
1333: node.addAttribute("style", indent_buf);
1334: } else if (node.content != null)
1335: bQ2Div(node.content);
1336:
1337: node = node.next;
1338: }
1339: }
1340:
1341: /* node is <![if ...]> prune up to <![endif]> */
1342: public Node pruneSection(Lexer lexer, Node node) {
1343: for (;;) {
1344: /* discard node and returns next */
1345: node = Node.discardElement(node);
1346:
1347: if (node == null)
1348: return null;
1349:
1350: if (node.type == Node.SectionTag) {
1351: if ((Lexer.getString(node.textarray, node.start, 2))
1352: .equals("if")) {
1353: node = pruneSection(lexer, node);
1354: continue;
1355: }
1356:
1357: if ((Lexer.getString(node.textarray, node.start, 5))
1358: .equals("endif")) {
1359: node = Node.discardElement(node);
1360: break;
1361: }
1362: }
1363: }
1364:
1365: return node;
1366: }
1367:
1368: public void dropSections(Lexer lexer, Node node) {
1369: while (node != null) {
1370: if (node.type == Node.SectionTag) {
1371: /* prune up to matching endif */
1372: if ((Lexer.getString(node.textarray, node.start, 2))
1373: .equals("if")) {
1374: node = pruneSection(lexer, node);
1375: continue;
1376: }
1377:
1378: /* discard others as well */
1379: node = Node.discardElement(node);
1380: continue;
1381: }
1382:
1383: if (node.content != null)
1384: dropSections(lexer, node.content);
1385:
1386: node = node.next;
1387: }
1388: }
1389:
1390: public void purgeAttributes(Node node) {
1391: AttVal attr = node.attributes;
1392: AttVal next = null;
1393: AttVal prev = null;
1394:
1395: while (attr != null) {
1396: next = attr.next;
1397:
1398: /* special check for class="Code" denoting pre text */
1399: if (attr.attribute != null && attr.value != null
1400: && attr.attribute.equals("class")
1401: && attr.value.equals("Code")) {
1402: prev = attr;
1403: } else if (attr.attribute != null
1404: && (attr.attribute.equals("class")
1405: || attr.attribute.equals("style")
1406: || attr.attribute.equals("lang")
1407: || attr.attribute.startsWith("x:") || ((attr.attribute
1408: .equals("height") || attr.attribute
1409: .equals("width")) && (node.tag == tt.tagTd
1410: || node.tag == tt.tagTr || node.tag == tt.tagTh)))) {
1411: if (prev != null)
1412: prev.next = next;
1413: else
1414: node.attributes = next;
1415:
1416: } else
1417: prev = attr;
1418:
1419: attr = next;
1420: }
1421: }
1422:
1423: /* Word2000 uses span excessively, so we strip span out */
1424: public Node stripSpan(Lexer lexer, Node span) {
1425: Node node;
1426: Node prev = null;
1427: Node content;
1428:
1429: /*
1430: deal with span elements that have content
1431: by splicing the content in place of the span
1432: after having processed it
1433: */
1434:
1435: cleanWord2000(lexer, span.content);
1436: content = span.content;
1437:
1438: if (span.prev != null)
1439: prev = span.prev;
1440: else if (content != null) {
1441: node = content;
1442: content = content.next;
1443: Node.removeNode(node);
1444: Node.insertNodeBeforeElement(span, node);
1445: prev = node;
1446: }
1447:
1448: while (content != null) {
1449: node = content;
1450: content = content.next;
1451: Node.removeNode(node);
1452: Node.insertNodeAfterElement(prev, node);
1453: prev = node;
1454: }
1455:
1456: if (span.next == null)
1457: span.parent.last = prev;
1458:
1459: node = span.next;
1460: span.content = null;
1461: Node.discardElement(span);
1462: return node;
1463: }
1464:
1465: /* map non-breaking spaces to regular spaces */
1466: private void normalizeSpaces(Lexer lexer, Node node) {
1467: while (node != null) {
1468: if (node.content != null)
1469: normalizeSpaces(lexer, node.content);
1470:
1471: if (node.type == Node.TextNode) {
1472: int i;
1473: MutableInteger c = new MutableInteger();
1474: int p = node.start;
1475:
1476: for (i = node.start; i < node.end; ++i) {
1477: c.value = (int) node.textarray[i];
1478:
1479: /* look for UTF-8 multibyte character */
1480: if (c.value > 0x7F)
1481: i += PPrint.getUTF8(node.textarray, i, c);
1482:
1483: if (c.value == 160)
1484: c.value = ' ';
1485:
1486: p = PPrint.putUTF8(node.textarray, p, c.value);
1487: }
1488: }
1489:
1490: node = node.next;
1491: }
1492: }
1493:
1494: /*
1495: This is a major clean up to strip out all the extra stuff you get
1496: when you save as web page from Word 2000. It doesn't yet know what
1497: to do with VML tags, but these will appear as errors unless you
1498: declare them as new tags, such as o:p which needs to be declared
1499: as inline.
1500: */
1501: public void cleanWord2000(Lexer lexer, Node node) {
1502: /* used to a list from a sequence of bulletted p's */
1503: Node list = null;
1504:
1505: while (node != null) {
1506: /* discard Word's style verbiage */
1507: if (node.tag == tt.tagStyle || node.tag == tt.tagMeta
1508: || node.type == Node.CommentTag) {
1509: node = Node.discardElement(node);
1510: continue;
1511: }
1512:
1513: /* strip out all span tags Word scatters so liberally! */
1514: if (node.tag == tt.tagSpan) {
1515: node = stripSpan(lexer, node);
1516: continue;
1517: }
1518:
1519: /* get rid of Word's xmlns attributes */
1520: if (node.tag == tt.tagHtml) {
1521: /* check that it's a Word 2000 document */
1522: if (node.getAttrByName("xmlns:o") == null)
1523: return;
1524: }
1525:
1526: if (node.tag == tt.tagLink) {
1527: AttVal attr = node.getAttrByName("rel");
1528:
1529: if (attr != null && attr.value != null
1530: && attr.value.equals("File-List")) {
1531: node = Node.discardElement(node);
1532: continue;
1533: }
1534: }
1535:
1536: /* discard empty paragraphs */
1537: if (node.content == null && node.tag == tt.tagP) {
1538: node = Node.discardElement(node);
1539: continue;
1540: }
1541:
1542: if (node.tag == tt.tagP) {
1543: AttVal attr = node.getAttrByName("class");
1544:
1545: /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1546: if (attr != null && attr.value != null
1547: && attr.value.equals("MsoListBullet")) {
1548: Node.coerceNode(lexer, node, tt.tagLi);
1549:
1550: if (list == null || list.tag != tt.tagUl) {
1551: list = lexer.inferredTag("ul");
1552: Node.insertNodeBeforeElement(node, list);
1553: }
1554:
1555: purgeAttributes(node);
1556:
1557: if (node.content != null)
1558: cleanWord2000(lexer, node.content);
1559:
1560: /* remove node and append to contents of list */
1561: Node.removeNode(node);
1562: Node.insertNodeAtEnd(list, node);
1563: node = list.next;
1564: }
1565: /* map sequence of <p class="Code"> to <pre>...</pre> */
1566: else if (attr != null && attr.value != null
1567: && attr.value.equals("Code")) {
1568: Node br = lexer.newLineNode();
1569: normalizeSpaces(lexer, node);
1570:
1571: if (list == null || list.tag != tt.tagPre) {
1572: list = lexer.inferredTag("pre");
1573: Node.insertNodeBeforeElement(node, list);
1574: }
1575:
1576: /* remove node and append to contents of list */
1577: Node.removeNode(node);
1578: Node.insertNodeAtEnd(list, node);
1579: stripSpan(lexer, node);
1580: Node.insertNodeAtEnd(list, br);
1581: node = list.next;
1582: } else
1583: list = null;
1584: } else
1585: list = null;
1586:
1587: /* strip out style and class attributes */
1588: if (node.type == Node.StartTag
1589: || node.type == Node.StartEndTag)
1590: purgeAttributes(node);
1591:
1592: if (node.content != null)
1593: cleanWord2000(lexer, node.content);
1594:
1595: node = node.next;
1596: }
1597: }
1598:
1599: public boolean isWord2000(Node root, TagTable tt) {
1600: Node html = root.findHTML(tt);
1601:
1602: return (html != null && html.getAttrByName("xmlns:o") != null);
1603: }
1604: }
|