0001: /*
0002: * WebLoader.java
0003: *
0004: * Copyright (C) 1998-2002 Peter Graves
0005: * $Id: WebLoader.java,v 1.1.1.1 2002/09/24 16:09:09 piso Exp $
0006: *
0007: * This program is free software; you can redistribute it and/or
0008: * modify it under the terms of the GNU General Public License
0009: * as published by the Free Software Foundation; either version 2
0010: * of the License, or (at your option) any later version.
0011: *
0012: * This program is distributed in the hope that it will be useful,
0013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0015: * GNU General Public License for more details.
0016: *
0017: * You should have received a copy of the GNU General Public License
0018: * along with this program; if not, write to the Free Software
0019: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
0020: */
0021:
0022: package org.armedbear.j;
0023:
0024: import java.io.BufferedReader;
0025: import java.io.IOException;
0026: import java.io.InputStream;
0027: import java.io.InputStreamReader;
0028: import java.io.PushbackReader;
0029: import java.io.Reader;
0030: import java.util.ArrayList;
0031: import java.util.Hashtable;
0032: import java.util.List;
0033: import java.util.Stack;
0034:
0035: public final class WebLoader implements WebConstants {
0036: private PushbackReader reader;
0037: private final FastStringBuffer textBuffer = new FastStringBuffer();
0038: private final Stack indentStack = new Stack();
0039: private final Stack tableStack = new Stack();
0040: private Table currentTable;
0041: private int sourceOffset;
0042: private int offset;
0043: private final int maxChars = 80;
0044: private LineSegmentList segments;
0045: private LineSequence lines;
0046: private final Hashtable refs = new Hashtable();
0047: private int indentLevel;
0048: private File file;
0049:
0050: public WebLoader(File file) {
0051: this .file = file;
0052: if (file.getEncoding() == null)
0053: file.setEncoding("iso-8859-1");
0054: Debug.assertTrue(file.isLocal());
0055: }
0056:
0057: public WebLoader(Reader reader) {
0058: this .reader = new PushbackReader(new BufferedReader(reader));
0059: }
0060:
0061: public final Hashtable getRefs() {
0062: return refs;
0063: }
0064:
0065: public LineSequence load() {
0066: try {
0067: loadInternal();
0068: } catch (EncodingChangeException e) {
0069: Log.debug("encoding change!");
0070: Log.debug("new encoding = |" + e.getNewEncoding() + "|");
0071: file.setEncoding(e.getNewEncoding());
0072: reader = null;
0073: try {
0074: loadInternal();
0075: } catch (EncodingChangeException ex) {
0076: Log.error(ex);
0077: }
0078: }
0079: // Handle zero length files.
0080: if (lines.getFirstLine() == null)
0081: lines.appendLine(new WebLine(sourceOffset));
0082: return lines;
0083: }
0084:
0085: private void loadInternal() throws EncodingChangeException {
0086: if (reader == null) {
0087: Debug.assertTrue(file != null);
0088: String encoding = file.getEncoding();
0089: if (encoding == null)
0090: encoding = Editor.preferences().getStringProperty(
0091: Property.DEFAULT_ENCODING);
0092: try {
0093: InputStream inputStream = file.getInputStream();
0094: reader = new PushbackReader(new BufferedReader(
0095: new InputStreamReader(inputStream, encoding)));
0096: } catch (IOException e) {
0097: Log.error(e);
0098: return;
0099: }
0100: }
0101: lines = new LineSequence();
0102: sourceOffset = 0;
0103: try {
0104: int c;
0105: while ((c = reader.read()) >= 0) {
0106: // Line separator always counts as 1 char, so count '\n' but
0107: // not '\r'.
0108: if (c != '\r')
0109: ++sourceOffset;
0110: switch (c) {
0111: case '<':
0112: processMarkup();
0113: break;
0114: case '&':
0115: processEntity();
0116: break;
0117: default:
0118: doChar((char) c);
0119: break;
0120: }
0121: }
0122: flushLine();
0123: } catch (IOException e) {
0124: Log.error(e);
0125: }
0126: }
0127:
0128: private boolean bold;
0129: private boolean strong;
0130: private boolean italic;
0131: private boolean emphasis;
0132: private boolean heading;
0133: private boolean h1;
0134: private boolean center;
0135: private boolean preformatted;
0136: private boolean whitespace;
0137: private Link link;
0138:
0139: private final boolean centered() {
0140: return center || h1;
0141: }
0142:
0143: private void processMarkup() throws EncodingChangeException {
0144: final String tag = gatherTag();
0145: if (tag.length() < 3) {
0146: doText(tag);
0147: return;
0148: }
0149: char c = tag.charAt(1);
0150: if (c == '/') {
0151: if (!Character.isLetter(tag.charAt(2))) {
0152: doText(tag);
0153: return;
0154: }
0155: } else {
0156: if (c == '!') {
0157: // We only care about comments.
0158: if (tag.equals("<!--"))
0159: skipComment();
0160: return;
0161: }
0162: if (c == '?') {
0163: // Ignore XML declaration, processing instructions.
0164: return;
0165: }
0166: if (!Character.isLetter(c)) {
0167: doText(tag);
0168: return;
0169: }
0170: }
0171: final String tagName = Utilities.getTagName(tag).toLowerCase()
0172: .intern();
0173:
0174: // Unsupported tags.
0175: if (tagName == "applet") {
0176: skipTag("/applet");
0177: return;
0178: }
0179: if (tagName == "form") {
0180: flushLine();
0181: textBuffer.append("[Form]");
0182: flushSegment(null, FORMAT_DISABLED);
0183: flushLine();
0184: return;
0185: }
0186: if (tagName == "/form") {
0187: flushLine();
0188: textBuffer.append("[End Form]");
0189: flushSegment(null, FORMAT_DISABLED);
0190: newLine();
0191: return;
0192: }
0193: if (tagName == "input") {
0194: List attributes = getAttributes(tag);
0195: String type = getAttribute(attributes, "type");
0196: if (type != null) {
0197: if (type.equalsIgnoreCase("submit")) {
0198: flushSegment();
0199: String value = getAttribute(attributes, "value");
0200: if (value == null)
0201: value = "Submit"; // Default label.
0202: textBuffer.append('[');
0203: textBuffer.append(value);
0204: textBuffer.append(']');
0205: flushSegment(null, FORMAT_DISABLED);
0206: } else if (type.equalsIgnoreCase("image")) {
0207: flushSegment();
0208: textBuffer.append("[Image]");
0209: flushSegment(null, FORMAT_DISABLED);
0210: }
0211: }
0212: return;
0213: }
0214: if (tagName == "object") {
0215: skipTag("/object");
0216: return;
0217: }
0218: if (tagName == "xml") {
0219: skipTag("/xml");
0220: return;
0221: }
0222: if (tagName == "script") {
0223: skipScript();
0224: return;
0225: }
0226:
0227: if (tagName == "title") {
0228: processTitle();
0229: return;
0230: }
0231: if (tagName == "b") {
0232: flushSegment();
0233: if (bold) {
0234: // Two <b>'s in a row. This one is probably a typo for </b>.
0235: bold = false;
0236: } else {
0237: bold = true;
0238: }
0239: return;
0240: }
0241: if (tagName == "/b") {
0242: flushSegment();
0243: bold = false;
0244: return;
0245: }
0246: if (tagName == "strong") {
0247: flushSegment();
0248: strong = true;
0249: return;
0250: }
0251: if (tagName == "/strong") {
0252: flushSegment();
0253: strong = false;
0254: return;
0255: }
0256: if (tagName == "i") {
0257: flushSegment();
0258: italic = true;
0259: return;
0260: }
0261: if (tagName == "/i") {
0262: flushSegment();
0263: italic = false;
0264: return;
0265: }
0266: if (tagName == "em") {
0267: flushSegment();
0268: emphasis = true;
0269: return;
0270: }
0271: if (tagName == "/em") {
0272: flushSegment();
0273: emphasis = false;
0274: return;
0275: }
0276: if (tagName == "q" || tagName == "/q") {
0277: // Indent if we're at the beginning of the line.
0278: maybeIndent();
0279: textBuffer.append('"');
0280: return;
0281: }
0282: if (tagName == "a") {
0283: if (link != null)
0284: // The last <a> tag was never terminated. This is probably a typo for </a>.
0285: processEndAnchor();
0286: else
0287: processAnchor(tag);
0288: return;
0289: }
0290: if (tagName == "/a") {
0291: processEndAnchor();
0292: return;
0293: }
0294: if (tagName == "h1") {
0295: newLine();
0296: heading = true;
0297: h1 = true;
0298: return;
0299: }
0300: if (tagName == "/h1") {
0301: newLine();
0302: heading = false;
0303: h1 = false;
0304: return;
0305: }
0306: if (tagName == "h2" || tagName == "h3" || tagName == "h4"
0307: || tagName == "h5" || tagName == "h6") {
0308: newLine();
0309: heading = true;
0310: return;
0311: }
0312: if (tagName == "/h2" || tagName == "/h3" || tagName == "/h4"
0313: || tagName == "/h5" || tagName == "/h6") {
0314: newLine();
0315: heading = false;
0316: return;
0317: }
0318: if (tagName == "br") {
0319: // Forced line break. If there's no text to flush, append a blank
0320: // line.
0321: if (!flushLine()) {
0322: lines.appendLine(new WebLine(sourceOffset));
0323: ++offset;
0324: }
0325: return;
0326: }
0327: if (tagName == "div") {
0328: flushLine();
0329: return;
0330: }
0331: if (tagName == "/div") {
0332: flushLine();
0333: return;
0334: }
0335: if (tagName == "p") {
0336: newLine();
0337: return;
0338: }
0339: if (tagName == "pre") {
0340: flushLine();
0341: preformatted = true;
0342: return;
0343: }
0344: if (tagName == "/pre") {
0345: newLine();
0346: preformatted = false;
0347: return;
0348: }
0349: if (tagName == "blockquote") {
0350: newLine();
0351: indentStack.push("blockquote");
0352: ++indentLevel;
0353: return;
0354: }
0355: if (tagName == "/blockquote") {
0356: newLine();
0357: if (!indentStack.empty()) {
0358: String s = (String) indentStack.pop();
0359: --indentLevel;
0360: if (!s.equals("blockquote"))
0361: Log.error("**** /blockquote: stack imbalance");
0362: }
0363: return;
0364: }
0365: // Definition list.
0366: if (tagName == "dl") {
0367: newLine();
0368: indentStack.push("dl");
0369: return;
0370: }
0371: // Never omitted.
0372: if (tagName == "/dl") {
0373: newLine();
0374: // Handle unbalanced <dt> and/or <dd> tags.
0375: while (!indentStack.empty()) {
0376: String s = (String) indentStack.peek();
0377: if (s.equals("dd")) {
0378: indentStack.pop();
0379: --indentLevel;
0380: } else if (s.equals("dl")) {
0381: indentStack.pop();
0382: break;
0383: } else {
0384: // Shouldn't happen.
0385: break;
0386: }
0387: }
0388: return;
0389: }
0390: // Definition.
0391: if (tagName == "dd") {
0392: flushLine();
0393: if (!indentStack.empty()) {
0394: String s = (String) indentStack.peek();
0395: if (s.equals("dl"))
0396: ;
0397: else if (s.equals("dd")) {
0398: // Keep same indentation.
0399: return;
0400: } else
0401: Log.error("**** dd: top of stack is " + s);
0402: } else
0403: Log.error("**** dd: indentStack unexpectedly empty");
0404: indentStack.push("dd");
0405: ++indentLevel;
0406: return;
0407: }
0408: // Term to be defined.
0409: if (tagName == "dt") {
0410: flushLine();
0411: if (!indentStack.empty()) {
0412: String s = (String) indentStack.peek();
0413: if (s.equals("dd")) {
0414: indentStack.pop(); // <dt> terminating <dd> (javadoc)
0415: --indentLevel;
0416: } else if (s.equals("dl"))
0417: ;
0418: else
0419: Log.error("**** dt: top of stack is " + s);
0420: } else
0421: Log.error("**** dt: indentStack unexpectedly empty");
0422: return;
0423: }
0424: if (tagName == "img") {
0425: processImg(tag);
0426: return;
0427: }
0428: if (tagName == "center") {
0429: flushLine();
0430: center = true;
0431: return;
0432: }
0433: if (tagName == "/center") {
0434: flushLine();
0435: center = false;
0436: return;
0437: }
0438: if (tagName == "hr") {
0439: flushLine();
0440: link = null;
0441: for (int i = 0; i < maxChars(); i++)
0442: textBuffer.append('-');
0443: flushLine();
0444: return;
0445: }
0446: if (tagName == "ul") {
0447: newLine();
0448: indentStack.push("ul");
0449: ++indentLevel;
0450: }
0451: // Never omitted.
0452: if (tagName == "/ul") {
0453: newLine();
0454: if (!indentStack.empty()) {
0455: indentStack.pop();
0456: --indentLevel;
0457: }
0458: }
0459: // End tag is usually omitted.
0460: if (tagName == "li") {
0461: flushLine();
0462: if (indentStack.size() > 0) {
0463: textBuffer.append(Utilities.spaces(getIndent()));
0464: } else {
0465: textBuffer.append(Utilities.spaces(4));
0466: }
0467: if (textBuffer.length() >= 2)
0468: textBuffer.setCharAt(textBuffer.length() - 2, '\u2022');
0469: flushSegment(null, 0);
0470: return;
0471: }
0472: if (tagName == "style") {
0473: skipTag("/style");
0474: return;
0475: }
0476: if (tagName == "table") {
0477: newLine();
0478: tableStack.push(currentTable);
0479: currentTable = new Table();
0480: return;
0481: }
0482: if (tagName == "/table") {
0483: flushLine();
0484: if (!tableStack.empty())
0485: currentTable = (Table) tableStack.pop();
0486: else
0487: Log
0488: .error("**** /table: table stack imbalance source offset = "
0489: + sourceOffset);
0490: return;
0491: }
0492: // </tr> tag may be omittted.
0493: if (tagName == "tr") {
0494: flushLine();
0495: if (currentTable != null)
0496: currentTable.nextRow();
0497: else
0498: Log
0499: .error("**** tr: currentTable is null source offset = "
0500: + sourceOffset);
0501: return;
0502: }
0503: // </td> tag may be omitted.
0504: if (tagName == "td" || tagName == "th") {
0505: flushSegment();
0506: if (currentTable != null) {
0507: currentTable.nextColumn();
0508: int currentOffset = getCurrentOffset();
0509: // Leave at least one space between columns (but no space
0510: // before the first column).
0511: int numSpaces = 1;
0512: if (currentTable.getColumnIndex() == 0
0513: || currentOffset == 0)
0514: numSpaces = 0;
0515: int minimumOffset = currentTable.getMinimumOffset();
0516: if (minimumOffset > 0) {
0517: if (currentOffset < minimumOffset)
0518: numSpaces = minimumOffset - currentOffset;
0519: }
0520: textBuffer.append(Utilities.spaces(numSpaces));
0521: flushSegment(null, FORMAT_WHITESPACE);
0522: String s = getAttribute(tag, "width");
0523: if (s != null) {
0524: if (s.endsWith("%")) {
0525: s = s.substring(0, s.length() - 1).trim();
0526: if (s.length() > 0) {
0527: try {
0528: int percent = Integer.parseInt(s);
0529: int width = maxChars() * percent / 100;
0530: currentTable.setColumnWidth(width);
0531: } catch (NumberFormatException e) {
0532: Log.error(e);
0533: }
0534: }
0535: } else
0536: ; // Ignore widths specified in pixels.
0537: }
0538: } else
0539: Log.error("**** td: currentTable is null");
0540: return;
0541: }
0542: if (tagName == "meta") {
0543: // Ignore change of encoding if we're not loading a file. This can
0544: // happen when load() is called from MessageBuffer.setText() to
0545: // process an HTML message.
0546: if (file == null)
0547: return;
0548: String encoding = file.getEncoding();
0549: // Ignore the specified encoding if we have already determined the
0550: // encoding from the byte order mark.
0551: if (encoding != null) {
0552: if (encoding.equals("UnicodeBig")
0553: || encoding.equals("UnicodeLittle"))
0554: return;
0555: }
0556: List attributes = getAttributes(tag);
0557: String httpEquiv = getAttribute(attributes, "http-equiv");
0558: if (httpEquiv != null) {
0559: if (httpEquiv.toLowerCase().equals("content-type")) {
0560: String contentType = getAttribute(attributes,
0561: "content");
0562: if (contentType != null) {
0563: String charset = Utilities
0564: .getCharsetFromContentType(contentType);
0565: Log.debug("charset = |" + charset + "|");
0566: if (charset != null && charset.length() > 0) {
0567: String newEncoding = Utilities
0568: .getEncodingFromCharset(charset);
0569: Log.debug("new encoding = " + newEncoding);
0570: if (!newEncoding.equalsIgnoreCase(encoding))
0571: throw new EncodingChangeException(
0572: newEncoding);
0573: Log.debug("no encoding change");
0574: }
0575: }
0576: }
0577: }
0578: return;
0579: }
0580: }
0581:
0582: private void processTitle() {
0583: FastStringBuffer sb = new FastStringBuffer();
0584: try {
0585: int c;
0586: while ((c = reader.read()) >= 0) {
0587: if (c != '\r')
0588: ++sourceOffset;
0589: if (c == '<') {
0590: String tag = gatherTag();
0591: if (!isTag(tag, "/title"))
0592: Log.error("processTitle unexpected tag " + tag);
0593: break;
0594: } else if (c == '&') {
0595: String entity = gatherEntity();
0596: sb.append(substituteEntity(entity));
0597: } else
0598: sb.append((char) c);
0599: }
0600: } catch (IOException e) {
0601: Log.error(e);
0602: }
0603: String title = sb.toString().trim();
0604: if (lines.getFirstLine() == null) {
0605: if (textBuffer.length() == 0) {
0606: if (title.length() < maxChars())
0607: textBuffer.append(Utilities.spaces(maxChars()
0608: - title.length()));
0609: textBuffer.append(title);
0610: flushLine();
0611: }
0612: }
0613: }
0614:
0615: private void processAnchor(String tag) {
0616: flushSegment();
0617: List attributes = getAttributes(tag);
0618: if (attributes != null) {
0619: for (int i = 0; i < attributes.size(); i++) {
0620: StringPair pair = (StringPair) attributes.get(i);
0621: if (pair.first.equals("href"))
0622: link = new Link(pair.second.trim());
0623: else if (pair.first.equals("name"))
0624: addRef(pair.second, offset);
0625: }
0626: }
0627: }
0628:
0629: private void processEndAnchor() {
0630: boolean appendSpace = false;
0631: while (textBuffer.toString().endsWith(" ")) {
0632: appendSpace = true;
0633: textBuffer.setLength(textBuffer.length() - 1);
0634: }
0635: flushSegment();
0636: link = null;
0637: if (appendSpace) {
0638: textBuffer.append(' ');
0639: flushSegment();
0640: }
0641: }
0642:
0643: private void processImg(String tag) {
0644: flushSegment();
0645: List attributes = getAttributes(tag);
0646: String alt = getAttribute(attributes, "alt");
0647: String src = getAttribute(attributes, "src");
0648: String width = getAttribute(attributes, "width");
0649: String height = getAttribute(attributes, "height");
0650: int w = 0;
0651: int h = 0;
0652: if (width != null) {
0653: try {
0654: w = Integer.parseInt(width);
0655: } catch (NumberFormatException e) {
0656: }
0657: }
0658: if (height != null) {
0659: try {
0660: h = Integer.parseInt(height);
0661: } catch (NumberFormatException e) {
0662: }
0663: }
0664: // Create image link if appropriate.
0665: ImageLink imageLink = null;
0666: if (src != null && src.length() > 0) {
0667: String lower = src.toLowerCase();
0668: if (lower.endsWith(".jpg") || lower.endsWith(".gif")
0669: || lower.endsWith(".png")) {
0670: // Only provide image link if image is big enough.
0671: if (w >= 100 && h >= 100)
0672: imageLink = new ImageLink(src);
0673: }
0674: }
0675: if (imageLink != null) {
0676: FastStringBuffer sb = new FastStringBuffer("[IMAGE");
0677: if (width != null && height != null) {
0678: sb.append(' ');
0679: sb.append(width);
0680: sb.append('x');
0681: sb.append(height);
0682: }
0683: sb.append(']');
0684: if (alt != null && (alt = alt.trim()).length() > 0) {
0685: sb.append(' ');
0686: sb.append(alt);
0687: }
0688: imageLink.setText(sb.toString());
0689: textBuffer.append(imageLink.getText());
0690: flushSegment(imageLink, FORMAT_LINK);
0691: }
0692: // Add a space if the last character on the line so far is not
0693: // already a space.
0694: if (segments == null || segments.size() == 0) {
0695: // We don't need to add a space at the beginning of the line.
0696: return;
0697: }
0698: FastStringBuffer sb = new FastStringBuffer();
0699: for (int i = 0; i < segments.size(); i++) {
0700: HtmlLineSegment segment = (HtmlLineSegment) segments
0701: .getSegment(i);
0702: sb.append(segment.getText());
0703: }
0704: if (sb.length() == 0 || sb.charAt(sb.length() - 1) == ' ')
0705: return;
0706: // The last character is not a space, so we need to add one.
0707: textBuffer.append(' ');
0708: flushSegment(null, FORMAT_WHITESPACE);
0709: }
0710:
0711: private final void addRef(String ref, int offset) {
0712: refs.put(ref, new Integer(offset));
0713: }
0714:
0715: private static final String getAttribute(String tag,
0716: String attributeName) {
0717: return getAttribute(getAttributes(tag), attributeName);
0718: }
0719:
0720: private static String getAttribute(List attributes,
0721: String attributeName) {
0722: if (attributes != null) {
0723: for (int i = attributes.size() - 1; i >= 0; i--) {
0724: StringPair pair = (StringPair) attributes.get(i);
0725: if (pair.first.equals(attributeName))
0726: return pair.second;
0727: }
0728: }
0729: return null;
0730: }
0731:
0732: private static List getAttributes(String tag) {
0733: final int NEUTRAL = 0;
0734: final int ATTRIBUTE_NAME = 1;
0735: final int SPACE_BEFORE_EQ = 2;
0736: final int SPACE_AFTER_EQ = 3;
0737: final int ATTRIBUTE_VALUE = 4;
0738:
0739: int state = NEUTRAL;
0740: FastStringBuffer sb = new FastStringBuffer();
0741: String name = null;
0742: String value = null;
0743: ArrayList attributes = null;
0744: char delim = 0;
0745:
0746: final int limit = tag.length();
0747: int i;
0748: // Skip past tag name.
0749: for (i = 0; i < limit; i++) {
0750: char c = tag.charAt(i);
0751: if (c == '>')
0752: return null;
0753: if (Character.isWhitespace(c)) {
0754: ++i;
0755: break;
0756: }
0757: }
0758:
0759: for (; i < limit; i++) {
0760: char c = tag.charAt(i);
0761: switch (state) {
0762: case NEUTRAL:
0763: if (Character.isWhitespace(c))
0764: ;
0765: else {
0766: sb.setLength(0);
0767: sb.append(c);
0768: state = ATTRIBUTE_NAME;
0769: }
0770: break;
0771: case ATTRIBUTE_NAME:
0772: if (c == '=') {
0773: name = sb.toString().toLowerCase();
0774: sb.setLength(0);
0775: state = SPACE_AFTER_EQ;
0776: } else if (Character.isWhitespace(c)) {
0777: name = sb.toString().toLowerCase();
0778: sb.setLength(0);
0779: state = SPACE_BEFORE_EQ;
0780: } else
0781: sb.append(c);
0782: break;
0783: case SPACE_BEFORE_EQ:
0784: if (Character.isWhitespace(c))
0785: ;
0786: else if (c == '=')
0787: state = SPACE_AFTER_EQ;
0788: else {
0789: // An attribute with no value.
0790: sb.setLength(0);
0791: state = NEUTRAL;
0792: if (attributes == null)
0793: attributes = new ArrayList();
0794: attributes.add(new StringPair(name, ""));
0795: name = value = null;
0796: }
0797: break;
0798: case SPACE_AFTER_EQ:
0799: if (Character.isWhitespace(c))
0800: ;
0801: else if (c == '"' || c == '\'') {
0802: delim = c;
0803: sb.setLength(0);
0804: state = ATTRIBUTE_VALUE;
0805: } else {
0806: delim = 0;
0807: sb.setLength(0);
0808: sb.append(c);
0809: state = ATTRIBUTE_VALUE;
0810: }
0811: break;
0812: case ATTRIBUTE_VALUE:
0813: if (delim != 0) {
0814: if (c == delim) {
0815: value = sb.toString();
0816: sb.setLength(0);
0817: state = NEUTRAL;
0818: if (attributes == null)
0819: attributes = new ArrayList();
0820: attributes.add(new StringPair(name, value));
0821: name = value = null;
0822: } else if (c == '&') {
0823: FastStringBuffer sbEntity = new FastStringBuffer();
0824: sbEntity.append('&');
0825: for (++i; i < limit; i++) {
0826: c = tag.charAt(i);
0827: if (c == delim) {
0828: // Not really an entity.
0829: sb.append(sbEntity.toString());
0830: // Let outer loop handle the delimiter.
0831: --i;
0832: break;
0833: }
0834: sbEntity.append(c);
0835: if (c == ';') {
0836: sb.append(substituteEntity(sbEntity
0837: .toString()));
0838: break;
0839: }
0840: }
0841: } else
0842: sb.append(c);
0843: } else {
0844: // Attribute value is not enclosed in quotes.
0845: if (c == '>' || Character.isWhitespace(c)) {
0846: value = sb.toString();
0847: sb.setLength(0);
0848: state = NEUTRAL;
0849: if (attributes == null)
0850: attributes = new ArrayList();
0851: attributes.add(new StringPair(name, value));
0852: name = value = null;
0853: } else if (c == '&') {
0854: FastStringBuffer sbEntity = new FastStringBuffer();
0855: sbEntity.append('&');
0856: for (++i; i < limit; i++) {
0857: c = tag.charAt(i);
0858: if (c == ' ' || c == '>') {
0859: // Reached end of attribute. Back up one char.
0860: --i;
0861: // We've already got the whole entity (if it is one).
0862: break;
0863: }
0864: sbEntity.append(c);
0865: if (c == ';')
0866: break;
0867: }
0868: sb
0869: .append(substituteEntity(sbEntity
0870: .toString()));
0871: } else
0872: sb.append(c);
0873: }
0874: break;
0875: }
0876: }
0877:
0878: return attributes;
0879: }
0880:
0881: // tagName can be e.g. "table" or "/table".
0882: private static boolean isTag(String s, String tagName) {
0883: Debug.assertTrue(tagName.indexOf('<') < 0);
0884: Debug.assertTrue(tagName.indexOf('>') < 0);
0885: Debug.assertTrue(tagName.indexOf(' ') < 0);
0886:
0887: // Shortest possible tag is "<a>".
0888: if (s == null || s.length() < 3)
0889: return false;
0890: if (s.charAt(0) != '<')
0891: return false;
0892: int length = tagName.length();
0893: if (s.length() < length + 2)
0894: return false;
0895: if (!s.regionMatches(true, 1, tagName, 0, length))
0896: return false;
0897: // Char after tag name must be whitespace or '>'.
0898: char c = s.charAt(length + 1);
0899: return c == '>' || Character.isWhitespace(c);
0900: }
0901:
0902: private String gatherTag() {
0903: final int TAG_NAME = 0;
0904: final int NEUTRAL = 1;
0905: final int ATTRIBUTE_NAME = 2;
0906: final int SPACE_BEFORE_EQ = 3;
0907: final int SPACE_AFTER_EQ = 4;
0908: final int ATTRIBUTE_VALUE = 5;
0909: final int MARKED_SECTION = 6;
0910: final int BANG = 7;
0911: final int INVALID = 8;
0912:
0913: FastStringBuffer sb = new FastStringBuffer(256);
0914: sb.append('<');
0915: int length = 1;
0916: int state = TAG_NAME;
0917: char delim = 0;
0918:
0919: int ch;
0920:
0921: try {
0922: while ((ch = reader.read()) >= 0) {
0923: char c = (char) ch;
0924: if (c == '<') {
0925: // We only expect to see a '<' inside a quoted attribute value.
0926: // An actual example from msnbc.com: <a href="<!--none-->">
0927: if (state != ATTRIBUTE_VALUE || delim == 0) {
0928: Log.error("unexpected '<' sourceOffset = "
0929: + sourceOffset);
0930: reader.unread(c);
0931: return sb.toString();
0932: }
0933: }
0934: if (c != '\r')
0935: ++sourceOffset;
0936: // Ignore whitespace after initial "<" or "</".
0937: if (c <= ' ') {
0938: if (length == 1)
0939: continue;
0940: if (length == 2 && sb.charAt(1) == '/')
0941: continue;
0942: }
0943: sb.append(c);
0944: ++length;
0945: switch (state) {
0946: case TAG_NAME:
0947: if (c == '>') {
0948: // End of tag, no attributes.
0949: return sb.toString();
0950: } else if (Character.isWhitespace(c)) {
0951: // Reached end of tag name.
0952: state = NEUTRAL;
0953: } else if (length == 2 && c == '!') {
0954: state = BANG;
0955: } else if ((c >= 'a' && c <= 'z')
0956: || (c >= 'A' && c <= 'Z') || c == '_'
0957: || c == ':') {
0958: ; // OK at any time
0959: } else if (length == 2 && (c == '/' || c == '!')) {
0960: ; // OK as second char only
0961: } else if (length > 2
0962: && ((c >= '0' && c <= '9') || c == '-' || c == '.')) {
0963: ; // OK second char or later
0964: } else {
0965: // Not really a tag.
0966: Log.error("invalid tag sourceOffset = "
0967: + sourceOffset);
0968: state = INVALID;
0969: }
0970: break;
0971: case BANG:
0972: if (c == '>') {
0973: return sb.toString();
0974: } else if (length == 4
0975: && sb.toString().equals("<!--")) {
0976: // Start of comment.
0977: return sb.toString();
0978: } else if (length == 3
0979: && sb.toString().equals("<![")) {
0980: state = MARKED_SECTION;
0981: }
0982: break;
0983: case NEUTRAL:
0984: if (c == '>')
0985: return sb.toString();
0986: else if (!Character.isWhitespace(c))
0987: state = ATTRIBUTE_NAME;
0988: break;
0989: case ATTRIBUTE_NAME:
0990: if (c == '>')
0991: return sb.toString();
0992: else if (c == '=')
0993: state = SPACE_AFTER_EQ;
0994: else if (Character.isWhitespace(c))
0995: state = SPACE_BEFORE_EQ;
0996: break;
0997: case SPACE_BEFORE_EQ:
0998: if (c == '>')
0999: return sb.toString();
1000: else if (Character.isWhitespace(c))
1001: ;
1002: else if (c == '=')
1003: state = SPACE_AFTER_EQ;
1004: else {
1005: // An attribute with no value.
1006: state = NEUTRAL;
1007: }
1008: break;
1009: case SPACE_AFTER_EQ:
1010: if (c == '>')
1011: return sb.toString();
1012: else if (Character.isWhitespace(c))
1013: ;
1014: else if (c == '"' || c == '\'') {
1015: delim = c;
1016: state = ATTRIBUTE_VALUE;
1017: } else {
1018: delim = 0;
1019: state = ATTRIBUTE_VALUE;
1020: }
1021: break;
1022: case ATTRIBUTE_VALUE:
1023: if (delim != 0) {
1024: if (c == delim)
1025: state = NEUTRAL;
1026: } else {
1027: // Attribute value is not enclosed in quotes.
1028: if (c == '>')
1029: return sb.toString();
1030: else if (Character.isWhitespace(c))
1031: state = NEUTRAL;
1032: }
1033: break;
1034: case MARKED_SECTION:
1035: if (c == '>') {
1036: if (sb.toString().endsWith("]>"))
1037: return sb.toString();
1038: }
1039: break;
1040: case INVALID:
1041: if (c == '>') {
1042: Log.error("invalid tag |" + sb.toString()
1043: + "| sourceOffset = " + sourceOffset);
1044: return sb.toString();
1045: }
1046: break;
1047: }
1048: }
1049: } catch (IOException e) {
1050: Log.error(e);
1051: }
1052:
1053: return sb.toString();
1054: }
1055:
1056: private void processEntity() {
1057: String entity = gatherEntity();
1058: doText(substituteEntity(entity));
1059: }
1060:
1061: private String gatherEntity() {
1062: FastStringBuffer sb = new FastStringBuffer('&');
1063: try {
1064: int c;
1065: while ((c = reader.read()) >= 0) {
1066: if (c == '<' || c == '&') {
1067: reader.unread(c);
1068: break;
1069: }
1070: if (c != '\r')
1071: ++sourceOffset;
1072: sb.append((char) c);
1073: if (c == ';')
1074: break;
1075: if (c == ' ')
1076: break;
1077: }
1078: } catch (IOException e) {
1079: Log.error(e);
1080: }
1081: return sb.toString();
1082: }
1083:
1084: private static String substituteEntity(String entity) {
1085: final int length = entity.length();
1086: if (length < 2)
1087: return entity;
1088: if (entity.equals("& "))
1089: return entity; // Not really an entity.
1090: if (entity.charAt(1) == '#') {
1091: // Remove leading "&#" and trailing ';' if present.
1092: String s;
1093: if (entity.charAt(length - 1) == ';')
1094: s = entity.substring(2, length - 1);
1095: else
1096: s = entity.substring(2);
1097:
1098: int n = -1;
1099: try {
1100: n = Integer.parseInt(s);
1101: } catch (NumberFormatException e) {
1102: }
1103:
1104: if (n >= 0) {
1105: switch (n) {
1106: case 145: // Left single quote.
1107: case 146: // Right single quote.
1108: return "'";
1109: case 147: // Left double quote.
1110: case 148: // Right double quote.
1111: return "\"";
1112: case 149: // Bullet.
1113: return String.valueOf((char) 8226);
1114: case 150: // En dash.
1115: return "-";
1116: case 151: // Em dash.
1117: return "--";
1118: case 153:
1119: return "(TM)";
1120: case 174:
1121: return "(R)";
1122: default:
1123: return String.valueOf((char) n);
1124: }
1125: }
1126: }
1127:
1128: // Remove leading '&' and trailing ';' if present.
1129: String s;
1130: if (entity.charAt(length - 1) == ';')
1131: s = entity.substring(1, length - 1).intern();
1132: else
1133: s = entity.substring(1).intern();
1134:
1135: if (s == "quot")
1136: return "\"";
1137: else if (s == "trade") // 153
1138: return "(TM)";
1139: else if (s == "nbsp")
1140: return String.valueOf((char) 160);
1141: else if (s == "copy")
1142: return String.valueOf((char) 169);
1143: else if (s == "laquo")
1144: return String.valueOf((char) 171);
1145: else if (s == "reg") // 174
1146: return "(R)";
1147: else if (s == "acute")
1148: return String.valueOf((char) 180);
1149: else if (s == "auml")
1150: return String.valueOf((char) 228);
1151: else if (s == "middot")
1152: return String.valueOf((char) 183);
1153: else if (s == "raquo")
1154: return String.valueOf((char) 187);
1155: else if (s == "eacute")
1156: return String.valueOf((char) 233);
1157: else if (s == "iuml")
1158: return String.valueOf((char) 239);
1159: else if (s == "bull")
1160: return String.valueOf((char) 8226);
1161: else if (s == "AElig")
1162: return "AE";
1163: else if (s == "amp")
1164: return "&";
1165: else if (s == "lt")
1166: return "<";
1167: else if (s == "gt")
1168: return ">";
1169: else
1170: return entity;
1171: }
1172:
1173: private void skipComment() {
1174: FastStringBuffer sb = new FastStringBuffer();
1175: try {
1176: int c;
1177: while ((c = reader.read()) >= 0) {
1178: if (c != '\r')
1179: ++sourceOffset;
1180: sb.append((char) c);
1181: if (c == '>' && sb.toString().endsWith("-->"))
1182: return;
1183: }
1184: } catch (IOException e) {
1185: Log.error(e);
1186: }
1187: }
1188:
1189: private void skipTag(String tagName) {
1190: try {
1191: int c;
1192: while ((c = reader.read()) >= 0) {
1193: if (c != '\r')
1194: ++sourceOffset;
1195: if (c == '<') {
1196: String tag = gatherTag();
1197: if (isTag(tag, tagName))
1198: return;
1199: }
1200: }
1201: } catch (IOException e) {
1202: Log.error(e);
1203: }
1204: }
1205:
1206: private void skipScript() {
1207: try {
1208: int c;
1209: while ((c = reader.read()) >= 0) {
1210: if (c != '\r')
1211: ++sourceOffset;
1212: if (c == '<') {
1213: if (readEndScriptTag())
1214: return;
1215: }
1216: }
1217: } catch (IOException e) {
1218: Log.error(e);
1219: }
1220: }
1221:
1222: private boolean readEndScriptTag() {
1223: final String s = "</script>";
1224: final int length = s.length();
1225: FastStringBuffer sb = new FastStringBuffer('<');
1226: try {
1227: int c;
1228: while ((c = reader.read()) >= 0) {
1229: if (c != '\r')
1230: ++sourceOffset;
1231: sb.append(Character.toLowerCase((char) c));
1232: if (sb.length() < length) {
1233: if (!s.startsWith(sb.toString()))
1234: return false;
1235: } else
1236: return s.equals(sb.toString());
1237: }
1238: } catch (IOException e) {
1239: Log.error(e);
1240: }
1241: return false;
1242: }
1243:
1244: private void doText(String s) {
1245: final int length = s.length();
1246: for (int i = 0; i < length; i++)
1247: doChar(s.charAt(i));
1248: }
1249:
1250: private void doChar(char c) {
1251: if (preformatted) {
1252: switch (c) {
1253: case '\t':
1254: final int spaces = 8 - getCurrentOffset() % 8;
1255: for (int i = spaces - 1; i >= 0; i--)
1256: textBuffer.append(' ');
1257: break;
1258: case '\r':
1259: break;
1260: case '\n':
1261: flushSegment();
1262: if (segments != null) {
1263: lines
1264: .appendLine(new WebLine(segments,
1265: sourceOffset));
1266: segments = null;
1267: } else
1268: lines.appendLine(new WebLine(sourceOffset));
1269: ++offset; // Line separator always counts as 1.
1270: break;
1271: default:
1272: textBuffer.append(c);
1273: break;
1274: }
1275: return;
1276: }
1277:
1278: switch (c) {
1279: case 133: // Ellipsis.
1280: textBuffer.append("...");
1281: break;
1282: case 145: // Left single quote.
1283: case 146: // Right single quote.
1284: textBuffer.append('\'');
1285: break;
1286: case 147: // Left double quote.
1287: case 148: // Right double quote.
1288: textBuffer.append('"');
1289: break;
1290: case 149: // Bullet.
1291: textBuffer.append((char) 8226);
1292: break;
1293: case 150:
1294: // En dash.
1295: textBuffer.append('-');
1296: break;
1297: case 151:
1298: // Em dash.
1299: textBuffer.append("--");
1300: break;
1301: case 153:
1302: textBuffer.append("(TM)");
1303: break;
1304: case '\n':
1305: case '\t':
1306: case ' ':
1307: // Append a space unless the preceding character was a space
1308: // or non-breaking space.
1309: if (textBuffer.length() > 0) {
1310: char preceding = textBuffer
1311: .charAt(textBuffer.length() - 1);
1312: if (preceding != ' ' && preceding != 160)
1313: textBuffer.append(' ');
1314: } else if (segments != null && segments.size() > 0) {
1315: // Check the last character in the previous segment.
1316: HtmlLineSegment seg = (HtmlLineSegment) segments
1317: .getLastSegment();
1318: String s = seg.getText();
1319: if (s.length() == 0)
1320: textBuffer.append(' ');
1321: else {
1322: char preceding = s.charAt(s.length() - 1);
1323: if (preceding != ' ' && preceding != 160)
1324: textBuffer.append(' ');
1325: }
1326: }
1327: break;
1328: case '\r':
1329: break;
1330: default:
1331: // A non-whitespace character.
1332: // Indent if we're at the beginning of the line.
1333: maybeIndent();
1334: textBuffer.append(c);
1335: break;
1336: }
1337:
1338: if (Character.isWhitespace(c))
1339: maybeWrap();
1340: }
1341:
1342: private void maybeIndent() {
1343: if (indentLevel > 0) {
1344: if (segments == null && textBuffer.length() == 0) {
1345: textBuffer.append(Utilities.spaces(getIndent()));
1346: flushSegment(null, FORMAT_WHITESPACE);
1347: }
1348: }
1349: }
1350:
1351: private final int getIndent() {
1352: return indentLevel * 4;
1353: }
1354:
1355: private int getCurrentOffset() {
1356: int currentOffset = 0;
1357: if (segments != null) {
1358: for (int i = segments.size() - 1; i >= 0; i--)
1359: currentOffset += segments.getSegment(i).length();
1360: }
1361: currentOffset += textBuffer.length();
1362: return currentOffset;
1363: }
1364:
1365: private final void flushSegment() {
1366: flushSegment(true);
1367: }
1368:
1369: private void flushSegment(boolean wrap) {
1370: if (textBuffer.length() > 0) {
1371: if (wrap)
1372: maybeWrap();
1373: int format = 0;
1374: if (link != null)
1375: format |= FORMAT_LINK;
1376: if (bold || strong || heading)
1377: format |= FORMAT_BOLD;
1378: if (italic || emphasis)
1379: format |= FORMAT_ITALIC;
1380: if (whitespace)
1381: format |= FORMAT_WHITESPACE;
1382: if (segments == null)
1383: segments = new LineSegmentList();
1384: segments.addSegment(new HtmlLineSegment(textBuffer
1385: .toString(), format, link));
1386: offset += textBuffer.length();
1387: textBuffer.setLength(0);
1388: }
1389: }
1390:
1391: private void flushSegment(Link link, int format) {
1392: if (textBuffer.length() > 0) {
1393: if (segments == null)
1394: segments = new LineSegmentList();
1395: segments.addSegment(new HtmlLineSegment(textBuffer
1396: .toString(), format, link));
1397: offset += textBuffer.length();
1398: textBuffer.setLength(0);
1399: }
1400: }
1401:
1402: private void maybeWrap() {
1403: if (preformatted)
1404: return;
1405: int currentOffset = getCurrentOffset();
1406: if (currentOffset > maxChars()) {
1407: int length = textBuffer.length();
1408:
1409: // Cumulative length of preceding segments.
1410: int preceding = currentOffset - length;
1411:
1412: final String text = textBuffer.toString();
1413: int index = text.lastIndexOf(' ');
1414: while (index >= 0 && preceding + index > maxChars())
1415: index = text.lastIndexOf(' ', index - 1);
1416:
1417: if (index >= 0) {
1418: // Found a suitable break.
1419: String remainder = text.substring(index + 1);
1420: textBuffer.setLength(index); // Trims trailing space.
1421: flushSegment(false); // No wrap!
1422: if (segments != null) {
1423: lines
1424: .appendLine(new WebLine(segments,
1425: sourceOffset));
1426: ++offset; // Line separator always counts as 1.
1427: segments = null;
1428: }
1429: maybeIndent();
1430: textBuffer.append(remainder);
1431: } else {
1432: // No suitable break in text buffer.
1433: textBuffer.setLength(0);
1434: if (segments != null) {
1435: final int last = segments.size() - 1;
1436: if (last >= 0) {
1437: final HtmlLineSegment lastSegment = (HtmlLineSegment) segments
1438: .getSegment(last);
1439: final String segmentText = lastSegment
1440: .getText();
1441: index = segmentText.lastIndexOf(' ');
1442: if (index >= 0) {
1443: // Found a break.
1444: final String head = segmentText.substring(
1445: 0, index);
1446: final String tail = segmentText
1447: .substring(index + 1);
1448:
1449: // We're removing a trailing space. Adjust offset
1450: // accordingly.
1451: --offset;
1452:
1453: final int format = lastSegment.getFormat();
1454: final Link link = lastSegment.getLink();
1455:
1456: segments.setSegment(last,
1457: new HtmlLineSegment(head, format,
1458: link));
1459: lines.appendLine(new WebLine(segments,
1460: sourceOffset));
1461:
1462: // Line separator always counts as 1.
1463: ++offset;
1464:
1465: segments = null;
1466: if (tail.length() > 0) {
1467: maybeIndent();
1468: if (segments == null)
1469: segments = new LineSegmentList();
1470: segments
1471: .addSegment(new HtmlLineSegment(
1472: tail, format, link));
1473: }
1474: } else {
1475: // No break. Move last segment to current line.
1476: segments.removeSegment(lastSegment);
1477: lines.appendLine(new WebLine(segments,
1478: sourceOffset));
1479:
1480: // Line separator always counts as 1.
1481: ++offset;
1482:
1483: segments = null;
1484: maybeIndent();
1485: if (segments == null)
1486: segments = new LineSegmentList();
1487: segments.addSegment(lastSegment);
1488: }
1489: }
1490: }
1491:
1492: maybeIndent();
1493: textBuffer.append(text);
1494: flushSegment(false); // No wrap!
1495: }
1496: }
1497: }
1498:
1499: // Returns true if it does anything.
1500: private boolean flushLine() {
1501: flushSegment();
1502: if (centered() && currentTable == null && segments != null) {
1503: int length = getCurrentOffset();
1504: if (maxChars() > length) {
1505: int numSpaces = (maxChars() - length) / 2;
1506: if (numSpaces > 0) {
1507: segments.addSegment(0, new HtmlLineSegment(
1508: Utilities.spaces(numSpaces),
1509: FORMAT_WHITESPACE, null));
1510: offset += numSpaces;
1511: }
1512: }
1513: }
1514: if (segments != null) {
1515: lines.appendLine(new WebLine(segments, sourceOffset));
1516: ++offset; // Line separator always counts as 1.
1517: segments = null;
1518: return true;
1519: } else
1520: return false;
1521: }
1522:
1523: private void newLine() {
1524: flushLine();
1525: Line lastLine = lines.getLastLine();
1526: if (lastLine != null && lastLine.length() > 0
1527: && !lastLine.isBlank()) {
1528: lines.appendLine(new WebLine(sourceOffset));
1529: ++offset;
1530: }
1531: }
1532:
1533: private final int maxChars() {
1534: // if (maxChars == 0) {
1535: // // We have to be careful here because this might get called before
1536: // // the display is initialized if we're opening a file on the
1537: // // command line.
1538: // Display display = Editor.currentEditor().getDisplay();
1539: // int displayWidth = display.getWidth();
1540: // if (displayWidth > 0) {
1541: // int charWidth = display.getCharWidth();
1542: // if (charWidth > 0)
1543: // maxChars = display.getWidth() / charWidth - 2;
1544: // }
1545: // if (maxChars <= 0)
1546: // maxChars = 80;
1547: // }
1548: Debug.assertTrue(maxChars == 80);
1549: return maxChars;
1550: }
1551:
1552: private static class EncodingChangeException extends Exception {
1553: private String newEncoding;
1554:
1555: EncodingChangeException(String newEncoding) {
1556: this .newEncoding = newEncoding;
1557: }
1558:
1559: String getNewEncoding() {
1560: return newEncoding;
1561: }
1562: }
1563: }
|