001: /*
002: * argun 1.0
003: * Web 2.0 delivery framework
004: * Copyright (C) 2007 Hammurapi Group
005: *
006: * This program is free software; you can redistribute it and/or
007: * modify it under the terms of the GNU Lesser General Public
008: * License as published by the Free Software Foundation; either
009: * version 2 of the License, or (at your option) any later version.
010: *
011: * This program is distributed in the hope that it will be useful,
012: * but WITHOUT ANY WARRANTY; without even the implied warranty of
013: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
014: * Lesser General Public License for more details.
015: *
016: * You should have received a copy of the GNU Lesser General Public
017: * License along with this library; if not, write to the Free Software
018: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
019: *
020: * URL: http://www.hammurapi.biz
021: * e-Mail: support@hammurapi.biz
022: */
023: package biz.hammurapi.web.util;
024:
025: import java.io.File;
026: import java.io.FileReader;
027: import java.io.FileWriter;
028: import java.io.IOException;
029: import java.io.InputStream;
030: import java.io.Reader;
031: import java.io.Writer;
032:
033: import antlr.Token;
034: import antlr.TokenStreamException;
035:
036: /**
037: * @author Pavel Vlasov
038: * @version $Revision: 1.1 $
039: */
040: public class HtmlParser {
041: public static void main(String[] args) throws Exception {
042: File in = new File(args[0]);
043: if (in.isFile()) {
044: stripFile(in, new File(args[1]));
045: } else if (in.isDirectory()) {
046: File out = new File(args[1]);
047: stripDirectory(in, out);
048: } else {
049: System.err.println("Unknown file type: "
050: + in.getAbsolutePath());
051: System.exit(1);
052: }
053: }
054:
055: /**
056: * @param out
057: * @throws IOException
058: * @throws TokenStreamException
059: */
060: private static void stripDirectory(File in, File out)
061: throws TokenStreamException, IOException {
062: if (!out.exists()) {
063: if (!out.mkdirs()) {
064: System.err.println("Can't create "
065: + out.getAbsolutePath());
066: System.exit(1);
067: }
068: } else if (!out.isDirectory()) {
069: System.err.println("Not a directory: "
070: + out.getAbsolutePath());
071: System.exit(1);
072: }
073:
074: File[] entries = in.listFiles();
075: for (int i = 0; i < entries.length; i++) {
076: File cout = new File(out, entries[i].getName());
077: if (entries[i].isFile()) {
078: stripFile(entries[i], cout);
079: } else if (entries[i].isDirectory()) {
080: stripDirectory(entries[i], cout);
081: } else {
082: System.err.println("WARNING: Not a file or directory: "
083: + entries[i].getAbsolutePath());
084: }
085: }
086: }
087:
088: /**
089: * Strips body, throws away head.
090: * @throws TokenStreamException
091: * @throws IOException
092: */
093: private static void stripFile(File in, File out)
094: throws TokenStreamException, IOException {
095: System.out.println(in.getAbsolutePath() + " -> "
096: + out.getAbsolutePath());
097: parse(new FileReader(in), null, new FileWriter(out), null);
098: }
099:
100: /**
101: * @param reader
102: * @param writer
103: * @param tokenProcessor TODO
104: * @throws TokenStreamException
105: * @throws IOException
106: * @return true if header and/or body was detected.
107: */
108: public static boolean parse(Reader reader, Writer headWriter,
109: Writer bodyWriter, TokenProcessor tokenProcessor)
110: throws TokenStreamException, IOException {
111: HTMLLexer lexer = new HTMLLexer(reader);
112: return parse(lexer, headWriter, bodyWriter, tokenProcessor);
113: }
114:
115: /**
116: * @param lexer
117: * @param bodyWriter
118: * @param tokenProcessor TODO
119: * @throws TokenStreamException
120: * @throws IOException
121: */
122: private static boolean parse(HTMLLexer lexer, Writer headWriter,
123: Writer bodyWriter, TokenProcessor tokenProcessor)
124: throws TokenStreamException, IOException {
125: int bodyLevel = 0;
126: int headLevel = 0;
127: boolean hadBodyOrHeader = false;
128: // int i=0;
129: try {
130: for (Token t = lexer.nextToken(); t != null
131: && t.getType() != HTMLTokenTypes.EOF; t = lexer
132: .nextToken()) {
133: // System.out.println(++i + " " + t.getType() + " " + t.getText() + " "+t.getLine()+":"+t.getColumn());
134: switch (t.getType()) {
135: case HTMLTokenTypes.OBODY:
136: bodyLevel++;
137: hadBodyOrHeader = true;
138: break;
139: case HTMLTokenTypes.CBODY:
140: bodyLevel--;
141: if (bodyLevel == 0) {
142: return true;
143: }
144: break;
145: case HTMLTokenTypes.OHEAD:
146: headLevel++;
147: hadBodyOrHeader = true;
148: break;
149: case HTMLTokenTypes.CHEAD:
150: headLevel--;
151: break;
152: default:
153: if (bodyLevel > 0) {
154: hadBodyOrHeader = true;
155: if (bodyWriter != null) {
156: bodyWriter.write(tokenProcessor == null ? t
157: .getText() : tokenProcessor
158: .getTokenText(t));
159: }
160: } else if (headLevel > 0) {
161: hadBodyOrHeader = true;
162: if (headWriter != null) {
163: headWriter.write(tokenProcessor == null ? t
164: .getText() : tokenProcessor
165: .getTokenText(t));
166: }
167: }
168: }
169: }
170: } finally {
171: if (bodyWriter != null) {
172: bodyWriter.close();
173: }
174:
175: if (headWriter != null) {
176: headWriter.close();
177: }
178: }
179: return hadBodyOrHeader;
180: }
181:
182: /**
183: * @param writer
184: * @param tokenProcessor TODO
185: * @param reader
186: * @throws TokenStreamException
187: * @throws IOException
188: */
189: public static void parse(InputStream in, Writer headWriter,
190: Writer bodyWriter, TokenProcessor tokenProcessor)
191: throws TokenStreamException, IOException {
192: HTMLLexer lexer = new HTMLLexer(in);
193: parse(lexer, headWriter, bodyWriter, tokenProcessor);
194: }
195: }
|