001: /**
002: *
003: */package com.dappit.Dapper.parser.test;
004:
005: import java.io.ByteArrayOutputStream;
006: import java.io.File;
007: import java.io.FileInputStream;
008: import java.io.FileNotFoundException;
009: import java.io.IOException;
010: import java.io.StringReader;
011: import java.net.MalformedURLException;
012: import java.util.Hashtable;
013: import java.util.concurrent.ExecutorService;
014: import java.util.concurrent.Executors;
015: import java.util.concurrent.TimeUnit;
016: import java.util.zip.ZipEntry;
017: import java.util.zip.ZipInputStream;
018:
019: import org.ccil.cowan.tagsoup.Parser;
020: import org.dom4j.DocumentException;
021: import org.dom4j.io.DOMWriter;
022: import org.dom4j.io.SAXReader;
023: import org.w3c.dom.Document;
024:
025: import com.dappit.Dapper.parser.MozillaParser;
026: import com.dappit.Dapper.parser.profiler.SimpleTimeProfiler;
027: import com.dappit.Dapper.parser.test.util.ProgressLogger;
028:
029: /**
030: * @author Ohad Serfaty
031: *
032: */
033: public class ParserComparator {
034:
035: private static volatile double mozillaParsingTime;
036: private static volatile double tagsoupParsingTime;
037:
038: public static byte[] fileGetContentsInBytes(File file)
039: throws FileNotFoundException, IOException {
040: FileInputStream fIS = new FileInputStream(file);
041: ByteArrayOutputStream bIS = new ByteArrayOutputStream();
042: byte[] temp = new byte[256];
043: int bytesRead = 0;
044: while ((bytesRead = fIS.read(temp)) != -1) {
045: bIS.write(temp, 0, bytesRead);
046: }
047: fIS.close();
048: bIS.close();
049:
050: return bIS.toByteArray();
051: }
052:
053: /**
054: * @param youTubeContent
055: * @throws DocumentException
056: * @throws NetworkErrorException
057: * @throws IOException
058: * @throws MalformedURLException
059: */
060: private static void compareMozillaAndTagsoup(String content)
061: throws Exception {
062:
063: SimpleTimeProfiler profiler = new SimpleTimeProfiler();
064:
065: // profile mozilla :
066: profiler.start();
067: // System.out.println("Parsing content : "+ content);
068: MozillaParser parser = new MozillaParser();
069: System.out.println("Mozilla Parsing...");
070: parser.parse(content);
071: mozillaParsingTime += profiler.report("Mozilla:");
072:
073: profiler = new SimpleTimeProfiler();
074: // profile tagsoup :
075: System.out.println("Tagsoup Parsing...");
076: profiler.start();
077: tagSoupParse(content);
078: tagsoupParsingTime += profiler.report("tagsoup:");
079: }
080:
081: private static Document tagSoupParse(String content) {
082: Parser htmlParser = new Parser();
083:
084: SAXReader saxReader = new SAXReader(htmlParser);
085: saxReader.setMergeAdjacentText(true);
086: DOMWriter domWriter = new DOMWriter();
087: try {
088: return domWriter.write(saxReader.read(new StringReader(
089: content)));
090: } catch (Exception e) {
091: e.printStackTrace();
092: }
093: return null;
094: }
095:
096: private static void testZippedContent() throws Exception {
097: ZipInputStream zippedInputStream = new ZipInputStream(
098: new FileInputStream("./test.content.zip"));
099: int counter = 0;
100: int maxCount = 1000;
101: ProgressLogger progressLogger = new ProgressLogger(maxCount);
102: while (counter++ < maxCount) {
103:
104: ZipEntry nextZippedEntry = zippedInputStream.getNextEntry();
105: if (nextZippedEntry == null)
106: break;
107: ByteArrayOutputStream bos = new ByteArrayOutputStream();
108: System.out.println("Reading zipped file :"
109: + nextZippedEntry.getName());
110: byte[] buf = new byte[1024];
111: int len;
112: while ((len = zippedInputStream.read(buf)) > 0) {
113: bos.write(buf, 0, len);
114: }
115: String content = new String(bos.toByteArray());
116: // System.out.println("Content : "+ content);
117: bos.close();
118: compareMozillaAndTagsoup(content);
119:
120: progressLogger.incrementCount();
121: }
122: System.out.println("Mozilla Parsing time :"
123: + mozillaParsingTime + " sec");
124: System.out.println("Tagsoup Parsing time :"
125: + tagsoupParsingTime + " sec");
126:
127: }
128:
129: public static class ZipFileReader {
130:
131: private final String fileName;
132: private ZipInputStream zippedInputStream;
133:
134: public ZipFileReader(String fileName)
135: throws FileNotFoundException {
136: this .fileName = fileName;
137: zippedInputStream = new ZipInputStream(new FileInputStream(
138: this .fileName));
139: }
140:
141: public synchronized String nextContent() throws Exception {
142: ZipEntry nextZippedEntry = zippedInputStream.getNextEntry();
143: if (nextZippedEntry == null)
144: return null;
145: ByteArrayOutputStream bos = new ByteArrayOutputStream();
146: System.out.println("Reading zipped file :"
147: + nextZippedEntry.getName());
148: byte[] buf = new byte[1024];
149: int len;
150: while ((len = zippedInputStream.read(buf)) > 0) {
151: bos.write(buf, 0, len);
152: }
153: String content = new String(bos.toByteArray());
154: // System.out.println("Content : "+ content);
155: bos.close();
156: return content;
157: }
158:
159: }
160:
161: private static void testZippedContentMultithreaded()
162: throws Exception {
163: int maxThreads = 10;
164: ExecutorService mozillThreadPool = Executors
165: .newFixedThreadPool(maxThreads);
166: ExecutorService tagsoupThreadPool = Executors
167: .newFixedThreadPool(maxThreads);
168:
169: ZipFileReader mozillaFileReader = new ZipFileReader(
170: "./test.content.zip");
171: ZipFileReader tagsoupFileReader = new ZipFileReader(
172: "./test.content.zip");
173: int counter = 0;
174: int maxCount = 530;
175:
176: SimpleTimeProfiler mozillaProfiler = new SimpleTimeProfiler();
177: mozillaProfiler.start();
178: // first have Mozilla :
179: while (counter++ < maxCount) {
180: mozillThreadPool.execute(new MozillaParsingThread(
181: mozillaFileReader));
182: }
183: mozillThreadPool.shutdown();
184: mozillThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
185: double mozillaTime = mozillaProfiler
186: .report("Mozilla total time");
187:
188: counter = 0;
189: // then have tagsoup :
190: SimpleTimeProfiler tagsoupProfiler = new SimpleTimeProfiler();
191: tagsoupProfiler.start();
192: while (counter++ < maxCount) {
193: tagsoupThreadPool.execute(new TagsoupParsingThread(
194: tagsoupFileReader));
195: }
196: tagsoupThreadPool.shutdown();
197: tagsoupThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
198:
199: double tagsoupTime = tagsoupProfiler
200: .report("Tagsoup total time");
201:
202: System.out.println("Mozilla Parsing multithreaded time :"
203: + mozillaParsingTime + " sec");
204: System.out.println("Tagsoup Parsing multithreaded time :"
205: + tagsoupParsingTime + " sec");
206:
207: System.out.println("Mozilla Parsing Total time :" + mozillaTime
208: + " sec");
209: System.out.println("Tagsoup Parsing Total time :" + tagsoupTime
210: + " sec");
211:
212: }
213:
214: public static class MozillaParsingThread extends Thread {
215:
216: private final ZipFileReader mozillaFileReader;
217: private boolean synchronize;
218: private static Object SynchronizationObject = new Object();
219: private static Hashtable<String, Document> documentHashTable = new Hashtable<String, Document>();
220:
221: /**
222: * @param tagsoupFileReader
223: */
224: public MozillaParsingThread(ZipFileReader tagsoupFileReader) {
225: this (tagsoupFileReader, false);
226: }
227:
228: /**
229: * @param tagsoupFileReader2
230: * @param b
231: */
232: public MozillaParsingThread(ZipFileReader tagsoupFileReader,
233: boolean synchronize) {
234: this .synchronize = synchronize;
235: this .mozillaFileReader = tagsoupFileReader;
236: }
237:
238: public void run() {
239: String content;
240: try {
241: content = mozillaFileReader.nextContent();
242: SimpleTimeProfiler profiler = new SimpleTimeProfiler();
243: profiler.start();
244: MozillaParser parser = new MozillaParser();
245: org.dom4j.Document document;
246: if (this .synchronize) {
247: synchronized (SynchronizationObject) {
248: document = (org.dom4j.Document) parser
249: .parse(content);
250: }
251: } else {
252: document = (org.dom4j.Document) parser
253: .parse(content);
254: }
255:
256: mozillaParsingTime += profiler.report("Mozilla");
257: // org.dom4j.Document document2 = (org.dom4j.Document) parser.parse(content);
258: // if (!document2.asXML().equals(document.asXML()))
259: // {
260: // System.err.println("------------------------->>> content not equals ????");
261: // }
262:
263: documentHashTable.put(content.hashCode()
264: + Boolean.toString(synchronize),
265: (Document) document);
266: } catch (Exception e) {
267: e.printStackTrace();
268: }
269:
270: }
271:
272: public static Hashtable<String, Document> getDocumentsHashTable() {
273: return documentHashTable;
274: }
275:
276: }
277:
278: public static class TagsoupParsingThread extends Thread {
279:
280: private final ZipFileReader tagsoupFileReader;
281: private final boolean synchronize;
282: private static Object SynchronizationObject = new Object();
283:
284: /**
285: * @param tagsoupFileReader
286: */
287: public TagsoupParsingThread(ZipFileReader tagsoupFileReader) {
288: this (tagsoupFileReader, false);
289: }
290:
291: /**
292: * @param tagsoupFileReader2
293: * @param b
294: */
295: public TagsoupParsingThread(ZipFileReader tagsoupFileReader,
296: boolean synchronize) {
297: this .synchronize = synchronize;
298: this .tagsoupFileReader = tagsoupFileReader;
299: }
300:
301: public void run() {
302: try {
303: String content = tagsoupFileReader.nextContent();
304: SimpleTimeProfiler profiler = new SimpleTimeProfiler();
305: profiler.start();
306: if (synchronize) {
307: synchronized (SynchronizationObject) {
308: tagSoupParse(content);
309: }
310: } else
311: tagSoupParse(content);
312: tagsoupParsingTime += profiler.report("Tagsoup");
313:
314: } catch (Exception e) {
315: // TODO Auto-generated catch block
316: e.printStackTrace();
317: }
318:
319: }
320:
321: }
322:
323: public static void main(String[] args) throws Exception {
324: TestMozillaParser.initTestingXPCOM();
325:
326: // Scheme 1 :
327: testZippedContentMultithreaded();
328:
329: // Scheme 2 :
330: // testTagsoupSynchronizedParsing();
331:
332: // Scheme 3 :
333: // testMozillaSynchronizedParsing();
334: // System.out.println( MozillaParsingThread.getDocumentsHashTable());
335: // Hashtable<String, Document> documentHashTable = MozillaParsingThread.getDocumentsHashTable();
336: // for (String contentType:documentHashTable.keySet())
337: // {
338: // if (contentType.endsWith("true"))
339: // {
340: // org.dom4j.Document synchronizedDocumentResult = (org.dom4j.Document) documentHashTable.get(contentType);
341: // System.out.println(contentType +"->" + synchronizedDocumentResult);
342: //
343: // String parralelScontent = contentType.replace("true", "false");
344: // org.dom4j.Document unsynchronizedDocumentResult = (org.dom4j.Document) documentHashTable.get(parralelScontent);
345: // System.out.println( parralelScontent+"->" +unsynchronizedDocumentResult );
346: // if (!unsynchronizedDocumentResult.asXML().equals(synchronizedDocumentResult.asXML()))
347: // System.err.println("Not Good : " + contentType);
348: // }
349: // }
350: //
351: }
352:
353: /**
354: * @throws Exception
355: *
356: */
357: private static void testTagsoupSynchronizedParsing()
358: throws Exception {
359: tagsoupMultithreadedParse(true, "Tagsoup Synchronized ");
360: tagsoupMultithreadedParse(false, "Tagsoup Parallel ");
361: }
362:
363: /**
364: * @throws Exception
365: *
366: */
367: private static void testMozillaSynchronizedParsing()
368: throws Exception {
369: mozillaMultithreadedParse(true, "Mozilla Synchronized ");
370: mozillaMultithreadedParse(false, "Mozilla Parallel ");
371: }
372:
373: /**
374: * @throws FileNotFoundException
375: * @throws Exception
376: *
377: */
378: private static void mozillaMultithreadedParse(
379: final boolean synchronize, String reportString)
380: throws Exception {
381: int maxThreads = 30;
382: ExecutorService mozillaThreadPool = Executors
383: .newFixedThreadPool(maxThreads);
384: mozillaParsingTime = 0;
385: ZipFileReader tagsoupFileReader = new ZipFileReader(
386: "./test.content.zip");
387: int counter = 0;
388: int maxCount = 530;
389:
390: // then have tagsoup :
391: SimpleTimeProfiler mozillaProfiler = new SimpleTimeProfiler();
392: mozillaProfiler.start();
393: while (counter++ < maxCount) {
394: mozillaThreadPool.execute(new MozillaParsingThread(
395: tagsoupFileReader, synchronize));
396: }
397: mozillaThreadPool.shutdown();
398: mozillaThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
399:
400: double mozillaTime = mozillaProfiler
401: .report("Tagsoup synchronized total time");
402:
403: System.out.println(reportString + " time :"
404: + mozillaParsingTime + " sec");
405: System.out.println(reportString + " Total time :" + mozillaTime
406: + " sec");
407:
408: }
409:
410: /**
411: * @throws FileNotFoundException
412: * @throws Exception
413: *
414: */
415: private static void tagsoupMultithreadedParse(
416: final boolean synchronize, String reportString)
417: throws Exception {
418: int maxThreads = 10;
419: ExecutorService tagsoupThreadPool = Executors
420: .newFixedThreadPool(maxThreads);
421: tagsoupParsingTime = 0;
422: ZipFileReader tagsoupFileReader = new ZipFileReader(
423: "./test.content.zip");
424: int counter = 0;
425: int maxCount = 530;
426:
427: // then have tagsoup :
428: SimpleTimeProfiler tagsoupProfiler = new SimpleTimeProfiler();
429: tagsoupProfiler.start();
430: while (counter++ < maxCount) {
431: tagsoupThreadPool.execute(new TagsoupParsingThread(
432: tagsoupFileReader, synchronize));
433: }
434: tagsoupThreadPool.shutdown();
435: tagsoupThreadPool.awaitTermination(10000, TimeUnit.SECONDS);
436:
437: double tagsoupTime = tagsoupProfiler
438: .report("Tagsoup synchronized total time");
439:
440: System.out.println(reportString + " time :"
441: + tagsoupParsingTime + " sec");
442: System.out.println(reportString + " Total time :" + tagsoupTime
443: + " sec");
444:
445: }
446:
447: }
|