001: /**
002: *
003: */package com.dappit.Dapper.parser.test;
004:
005: import java.io.ByteArrayOutputStream;
006: import java.io.File;
007: import java.io.IOException;
008: import java.io.OutputStreamWriter;
009: import java.io.StringReader;
010: import java.io.StringWriter;
011: import java.util.Random;
012: import java.util.Set;
013: import java.util.Vector;
014: import java.util.regex.Matcher;
015: import java.util.regex.Pattern;
016:
017: import junit.framework.TestCase;
018:
019: import org.ccil.cowan.tagsoup.Parser;
020: import org.dom4j.DocumentException;
021: import org.dom4j.io.DOMReader;
022: import org.dom4j.io.DOMWriter;
023: import org.dom4j.io.HTMLWriter;
024: import org.dom4j.io.OutputFormat;
025: import org.dom4j.io.SAXReader;
026: import org.w3c.dom.Document;
027: import org.w3c.dom.Element;
028: import org.w3c.dom.NodeList;
029:
030: import com.dappit.Dapper.Configuration;
031: import com.dappit.Dapper.parser.EnviromentController;
032: import com.dappit.Dapper.parser.MozillaParser;
033: import com.dappit.Dapper.parser.ParserInitializationException;
034: import com.dappit.Dapper.parser.ParserInstruction;
035: import com.dappit.Dapper.parser.profiler.SimpleMemoryProfiler;
036: import com.sun.org.apache.xml.internal.serialize.XMLSerializer;
037:
038: /**
039: * @author Ohad Serfaty
040: *
041: */
042: public class TestMozillaParser extends TestCase {
043:
// Switch read by parseAndCompare(): when true the serialized DOM is asserted against the
// expected string; when false the input is only parsed and the document returned.
boolean doTesting = true;
045:
/**
 * Initializes XPCOM for the tests through {@code MozillaParser.init}.
 *
 * The native library is looked up as {@code native/bin/MozillaParser} plus the
 * extension reported by {@code EnviromentController}; if that lookup throws, the
 * {@code .dll} path is used instead. Failures in either step are printed and
 * otherwise ignored, so this method never propagates them to the caller.
 */
public static void initTestingXPCOM() {
    File nativeLibrary;
    try {
        nativeLibrary = new File("native/bin/MozillaParser"
                + EnviromentController.getSharedLibraryExtension());
    } catch (Exception extensionLookupFailure) {
        // Fall back to the Windows library name, then report why we had to.
        nativeLibrary = new File("./native/bin/MozillaParser.dll");
        extensionLookupFailure.printStackTrace();
    }

    final String libraryPath = nativeLibrary.getAbsolutePath();
    final String componentsPath = Configuration.getMozillaComponentsPath();

    try {
        System.out.println("Loading and initializing XPCOM from " + libraryPath);
        MozillaParser.init(libraryPath, componentsPath);
        System.out.println("done!");
    } catch (Exception initFailure) {
        initFailure.printStackTrace();
    }
}

// Run the XPCOM initialization once, when this test class is loaded.
static {
    initTestingXPCOM();
}
077:
078: // helper function : get the string of the dom document
079:
080: public static String serialize(Document document)
081: throws IOException {
082: StringWriter stringWriter = new StringWriter();
083: XMLSerializer serializer = new XMLSerializer();
084: serializer.setOutputCharStream(stringWriter);
085: serializer.serialize(document);
086:
087: return stringWriter.toString();
088:
089: }
090:
091: private Document parseAndCompare(String html, String expectedResult)
092: throws Exception {
093: //MozillaParser parser = MozillaParser.getInstance();
094: MozillaParser parser = new MozillaParser();
095: Document document = parser.parse(html);
096: //System.out.println(serialize(document));
097: if (doTesting)
098: assertEquals(expectedResult, serialize(document));
099: return document;
100: }
101:
102: public void testSimple1() throws Exception {
103: String simple1 = "<html>Hello world!</html>";
104: String expected1 = "<?xml version=\"1.0\"?>\n"
105: + "<html><body>Hello world!</body></html>";
106: parseAndCompare(simple1, expected1);
107: }
108:
109: public void testSimple2() throws Exception {
110: String simple2 = "<html>Hello world!</html>";
111: String expected1 = "<?xml version=\"1.0\"?>\n"
112: + "<html><body>Hello world!</body></html>";
113: parseAndCompare(simple2, expected1);
114: }
115:
116: public void testFonts() throws Exception {
117: String html = "<p><p><p><font color=\"steelblue\" size=\"4\" FACE=\"Verdana\"><b>Ledger: The Joker's a \"pure anarchist\"</b></font><font size=\"2\"><br><b>Author:</b> <a href=\"bio_jett.html\" target=\"_blank\">Jett</a> <br><b><font color=\"silver\">Tuesday, December 5, 2006 - 11:32 AM, 8:00 PM:</b></font> Here's a bit from Heath Ledger about his upcoming turn as The Joker in <i>TDK</i>:<p><font size=\"1\">";
118: MozillaParser parser = new MozillaParser();
119: Document document = parser.parse(html);
120: System.out.println(serialize(document));
121: }
122:
123: public void testComment1() throws Exception {
124: String simple2 = "<html><body><p><!-- a comment --></p> <br> Hello world!</html>";
125: String expected1 = "<?xml version=\"1.0\"?>\n"
126: + "<html><body><p><!-- a comment --></p> <br/> Hello world!</body></html>";
127: parseAndCompare(simple2, expected1);
128: // System.out.println(serialize(document));
129: }
130:
131: public void testScriptComment1() throws Exception {
132: String simple2 = "<html><body><script language=\"JavaScript\" > document.write('hell');</script> <br> Hello world!</html>";
133: String expected1 = ""
134: + "<?xml version=\"1.0\"?>\n"
135: + "<html><body><script language=\"JavaScript\">document.write('hell');</script> <br/> Hello world!</body></html>";
136: parseAndCompare(simple2, expected1);
137: // System.out.println(serialize(document));
138: }
139:
140: public void testStyleContent() throws Exception {
141: String simple2 = "<html><head><style > <!-- body,td,a,p,.h{font-family:arial,sans-serif} "
142: + ".h{font-size:20px} "
143: + " .h{color:#3366cc} "
144: + " .q{color:#00c} "
145: + " --></style></head><body> <br> Hello world!</html>";
146: String expected1 = "<?xml version=\"1.0\"?>\n"
147: + "<html><head><style harmless=\"\"><!-- body,td,a,p,.h{font-family:arial,sans-serif} .h{font-size:20px} .h{color:#3366cc} .q{color:#00c} --></style></head><body> <br/> Hello world!</body></html>";
148: parseAndCompare(simple2, expected1);
149: // System.out.println(serialize(document));
150: }
151:
152: public void testAmpReplacer() {
153: String testString = " ";
154: String newString = testString.replaceAll(" ", "");
155: assertEquals("", newString);
156:
157: testString = " 3 1 ";
158: newString = testString.replaceAll(" ", "");
159: assertEquals("31", newString);
160:
161: }
162:
163: public void testStyleReplacer() {
164: String testString = "< style >";
165: String newString = testString.replaceAll("<\\s*style\\s*>",
166: "<style harmless=''> ");
167: assertEquals("<style harmless=''> ", newString);
168:
169: testString = "< style>";
170: newString = testString.replaceAll("<\\s*style\\s*>",
171: "<style harmless=''> ");
172: assertEquals("<style harmless=''> ", newString);
173:
174: testString = "<style>";
175: newString = testString.replaceAll("<\\s*style\\s*>",
176: "<style harmless=''> ");
177: assertEquals("<style harmless=''> ", newString);
178:
179: testString = "< style defer>";
180: newString = testString.replaceAll("<\\s*style\\s*>",
181: "<style harmless=''> ");
182: assertNotSame("<style harmless=''> ", newString);
183:
184: }
185:
186: public void testMultithreadedXPCOMInitialization()
187: throws InterruptedException {
188: Thread thread1 = new Thread() {
189: public void run() {
190: try {
191: initTestingXPCOM();
192: } catch (Exception e) {
193: e.printStackTrace();
194: }
195: }
196: };
197: thread1.start();
198: thread1.join();
199: Thread.sleep(1000);
200: thread1 = new Thread() {
201: public void run() {
202: try {
203: initTestingXPCOM();
204: } catch (Exception e) {
205: e.printStackTrace();
206: }
207: }
208: };
209: thread1.start();
210: thread1.join();
211:
212: }
213:
214: public Document parseRandomHtml(int length)
215: throws ParserInitializationException, DocumentException {
216: String html = "<html><body>";
217: for (int i = 0; i < length; i++)
218: html += "<div>" + Math.random() + "</div>";
219: html += "</body></html>";
220: MozillaParser parser = new MozillaParser();
221: return parser.parse(html);
222: }
223:
224: public void testMultithreaded1() {
225: Thread thread1 = new Thread() {
226: public void run() {
227: try {
228: parseRandomHtml(100);
229: } catch (Exception e) {
230: // TODO Auto-generated catch block
231: e.printStackTrace();
232: }
233: }
234: };
235:
236: Thread thread2 = new Thread() {
237: public void run() {
238: try {
239: parseRandomHtml(100);
240: } catch (Exception e) {
241: // TODO Auto-generated catch block
242: e.printStackTrace();
243: }
244: }
245: };
246:
247: thread1.start();
248: thread2.start();
249: try {
250: thread1.join();
251: thread2.join();
252: } catch (InterruptedException e) {
253: // TODO Auto-generated catch block
254: e.printStackTrace();
255: }
256:
257: }
258:
259: volatile int failed = 0;
260:
261: public void testMultithreaded2() throws InterruptedException {
262:
263: int NUM_THREADS = 50;
264:
265: final Thread[] threadPool = new Thread[NUM_THREADS];
266: final Random random = new Random(0);
267:
268: for (int i = 0; i < NUM_THREADS; i++)
269: threadPool[i] = new Thread() {
270: public void run() {
271: try {
272: double randomNumber = random.nextDouble() * 100000000.0;
273: String html = "<html><body>";
274: for (int i = 0; i < 100; i++)
275: html += "<p>" + randomNumber + "</p>";
276: html += "</body></html>";
277: MozillaParser parser = new MozillaParser();
278: Document document = parser.parse(html);
279: Vector<Integer> instructions = parser
280: .getDomBuilderArguments();
281: int closeNodeCounter = 0;
282: int openNodeCounter = 0;
283: for (int instruction : instructions) {
284: if (instruction == ParserInstruction.CloseNode)
285: closeNodeCounter++;
286: if (instruction == ParserInstruction.OpenNode)
287: openNodeCounter++;
288:
289: }
290: // System.err.println("Close Node Counter :" + closeNodeCounter);
291: // System.err.println("Open Node Counter :" + openNodeCounter);
292:
293: if (!serialize(document).equals(
294: "<?xml version=\"1.0\"?>\n" + html)) {
295: synchronized (threadPool) {
296: System.err.println("Html input was :"
297: + "<?xml version=\"1.0\"?>\n"
298: + html);
299: System.err.println("Failed document :"
300: + serialize(document));
301: parser.dump();
302: System.err.println("Verifying :"
303: + document.getChildNodes()
304: .item(0)
305: .getChildNodes()
306: .item(0)
307: .getChildNodes()
308: .item(0).getNodeName());
309: System.err.println("<p number : > : "
310: + document.getChildNodes()
311: .item(0)
312: .getChildNodes()
313: .item(0)
314: .getChildNodes()
315: .getLength());
316:
317: failed++;
318: }
319: }
320:
321: } catch (Throwable e) {
322: e.printStackTrace();
323: failed++;
324: }
325: }
326: };
327: for (int i = 0; i < NUM_THREADS; i++)
328: threadPool[i].start();
329: for (int i = 0; i < NUM_THREADS; i++)
330: threadPool[i].join();
331: assertEquals(0, failed);
332: }
333:
334: @SuppressWarnings("unchecked")
335: public void testEntityDomWriterBug() throws Exception {
336: String testString = "<!doctype html public \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"
337: + "<html>"
338: + "<body>"
339: +
340: // "<a href=\"http://us.ard.yahoo.com/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=0/SIG=10mgpruen" +
341: // "/*http://www.yahoo.com?fr=yfp-t-501\">Yahoo!</a> " +
342: // "<a href=\"http://us.ard.yahoo.com/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S=2766679:HLSCH/Y=YAHOO" +
343: // "" +
344: // "/EXP=1167961934/A=2828626/R=1/SIG=11nbq2pc6/*http://us.rd.yahoo.com/evt=31554/*http://my.yahoo.com?fr=yfp-t-501\">" +
345: // "My Yahoo!</a>" +
346: // " " +
347: " <a href=\"http://us.ard.yahoo.co"
348: + ""
349: + "m/SIG=12ku07d54/M=289534.6742909.7689533.6551553/D=yahoosrch/S"
350: + ""
351: + "=2766679:HLSCH/Y=YAHOO/EXP=1167961934/A=2828626/R=2/SIG=10n3m6b64/*http"
352: + "://mail.yahoo.com?fr=yfp-t-501\">"
353: + "Mail</a> "
354: + " "
355: + " Welcome, "
356: + "<strong>Guest</strong> [" + "";
357:
358: Document document = new MozillaParser().parse(testString);
359:
360: ByteArrayOutputStream bs = new ByteArrayOutputStream();
361: OutputStreamWriter oSW = null;
362: oSW = new OutputStreamWriter(bs);
363:
364: OutputFormat format = OutputFormat.createPrettyPrint();
365: format.setXHTML(false);
366: format.setExpandEmptyElements(true);
367: HTMLWriter writer = new HTMLWriter(oSW, format);
368: Set tags = writer.getPreformattedTags();
369: tags.add("STYLE");
370: writer.setPreformattedTags(tags);
371:
372: DOMReader domReader = new DOMReader();
373:
374: // System.out.println(" dom serialization : \n "+ serialize(document));
375:
376: writer.write(domReader.read(document));
377: writer.flush();
378:
379: // nhaving no exception means that the test is OK.
380:
381: }
382:
383: // from dapper : TODO : put this in a UTIL class :
384: private String findEncoding(Element rootElement) {
385: String encoding = "UTF-8";
386: NodeList metas = rootElement.getElementsByTagName("meta");
387: for (int m = 0; m < metas.getLength(); m++) {
388: Element meta = (Element) metas.item(m);
389: // find if we have an http-equiv attribute :
390: boolean hasHttpEquivContentType = false; // guilty until proven otherwise.
391: boolean hasNameContentType = false; // guilty until proven otherwise.
392: if (meta.getAttribute("http-equiv").length() > 0) {
393: hasHttpEquivContentType = meta.getAttribute(
394: "http-equiv").toLowerCase().equals(
395: "content-type");
396: } else if (meta.getAttribute("HTTP-EQUIV").length() > 0) {
397: hasHttpEquivContentType = meta.getAttribute(
398: "HTTP-EQUIV").toLowerCase().equals(
399: "content-type");
400: }
401:
402: if (meta.getAttribute("name").length() > 0)
403: hasNameContentType = meta.getAttribute("name")
404: .toLowerCase().equals("content-type");
405: else if (meta.getAttribute("NAME").length() > 0)
406: hasNameContentType = meta.getAttribute("NAME")
407: .toLowerCase().equals("content-type");
408:
409: String contentAttributeStr = null;
410:
411: if (meta.getAttribute("content").length() > 0)
412: contentAttributeStr = meta.getAttribute("content");
413: else if (meta.getAttribute("CONTENT").length() > 0)
414: contentAttributeStr = meta.getAttribute("CONTENT");
415:
416: if ((hasHttpEquivContentType || hasNameContentType)
417: && contentAttributeStr != null) {
418:
419: Pattern pat = Pattern.compile("charset\\s?=\\s?(.+);*",
420: Pattern.CASE_INSENSITIVE);
421: Matcher mat = pat.matcher(contentAttributeStr);
422: if (mat.find()) {
423: encoding = mat.group(1);
424: break;
425: }
426: }
427: }
428:
429: return encoding;
430: }
431:
432: private void printDocumentPreety(Document doc) throws IOException {
433: StringWriter stringWriter = new StringWriter();
434: OutputFormat format = OutputFormat.createPrettyPrint();
435: format.setXHTML(false);
436: format.setEncoding(findEncoding(doc.getDocumentElement()));
437: format.setExpandEmptyElements(true);
438: HTMLWriter writer = new HTMLWriter(stringWriter, format);
439: Set tags = writer.getPreformattedTags();
440: // tags.add("STYLE");
441: tags.clear();
442: writer.setPreformattedTags(tags);
443: DOMReader domReader = new DOMReader();
444: writer.write(domReader.read(doc));
445: // System.out.println("Document:\n" + stringWriter.toString());
446: }
447:
448: // /**
449: // * @param youTubeContent
450: // * @throws DocumentException
451: // * @throws NetworkErrorException
452: // * @throws IOException
453: // * @throws MalformedURLException
454: // */
455: // private void displayMozillaAndTagsoupDoms(Cacher cacher , String url) throws Exception {
456: // String content = null;
457: // try
458: // {
459: // System.err.println("Fetching content from :" + url);
460: // content = cacher.getCache(url);
461: // }
462: // catch (Exception e)
463: // {
464: // System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
465: // content = Util.urlGetContents(new URL(url));
466: // cacher.putCache(url , content);
467: // }
468: //
469: //
470: // // profile mozilla :
471: // Document document = MozillaParser.getInstance().parse(content);
472: //
473: //// System.out.println("Mozilla encoding :" + findEncoding(document.getDocumentElement()));
474: //
475: // printDocumentPreety(document);
476: //
477: // // profile tagsoup :
478: // Parser htmlParser = new Parser();
479: //
480: // SAXReader saxReader = new SAXReader(htmlParser);
481: // saxReader.setMergeAdjacentText(true);
482: // DOMWriter domWriter = new DOMWriter();
483: // document = domWriter.write(saxReader.read(new StringReader(content)));
484: //
485: //// System.out.println("Tagsoup encoding :" + findEncoding(document.getDocumentElement()));
486: //
487: // printDocumentPreety(document);
488: //
489: ////// System.out.println("title :" + );
490: //// String nanaTitle = document.getDocumentElement().getChildNodes().item(0)
491: //// .getChildNodes().item(4).getTextContent();
492: //// for (int i=0; i<nanaTitle.length(); i++)
493: //// System.out.println((int)nanaTitle.charAt(i));
494: // }
495: //
496: // public void testHebrew(){
497: // char dalet = 0xD793;
498: // System.out.println(dalet);
499: // }
500: //
501: //
// // this one is not a true test, just a debug check.
503: // public void testHebrewEncoding() throws Exception
504: // {
505: // Cacher contentCacher = new Cacher("ohad.dappit.com");
506: // displayMozillaAndTagsoupDoms(contentCacher, "http://www.nana.co.il");
507: // }
508: //
509: //
510: // Vector<String> contentList = new Vector<String>();
511: //
512: // public void addToContentList(Cacher cacher , String url) throws Exception{
513: // String content = null;
514: // try
515: // {
516: // System.err.println("Fetching content from :" + url);
517: // content = cacher.getCache(url);
518: // }
519: // catch (Exception e)
520: // {
521: // System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
522: // content = Util.urlGetContents(new URL(url));
523: // cacher.putCache(url , content);
524: // }
525: // contentList.add(content);
526: // }
527: //
528: // public void testMultithreadedPerformance() throws Exception {
529: // Cacher contentCacher = new Cacher("ohad.dappit.com");
530: // contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
531: // addToContentList(contentCacher,"http://www.youtube.com/results?search_query=saddam&search=Search");
532: // addToContentList(contentCacher, "http://www.digg.com");
533: // addToContentList(contentCacher, "http://www.walla.co.il");
534: // addToContentList(contentCacher, "http://www.dappit.com");
535: // addToContentList(contentCacher, "http://www.cnn.com");
536: // addToContentList(contentCacher, "http://slashdot.org");
537: // addToContentList(contentCacher, "http://www.netdimes.org");
538: // addToContentList(contentCacher, "http://www.yahoo.com");
539: // addToContentList(contentCacher, "http://www.mozilla.org");
540: // addToContentList(contentCacher, "http://www.nana.co.il");
541: // addToContentList(contentCacher, "http://www.finance.com");
542: // addToContentList(contentCacher, "http://www.cnn.co.jp/");
543: // addToContentList(contentCacher, "http://www.techcrunch.com/");
544: // addToContentList(contentCacher, "http://freshmeat.net/");
545: //
546: // mozillaParsingTime = 0.0;
547: // tagsoupParsingTime = 0.0;
548: //
549: // System.err.println("Mozilla parsing time :" + mozillaParsingTime +" sec.");
550: // System.err.println("Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
551: //
552: // MozillaParsingThread[] mozillaThreads = new MozillaParsingThread[contentList.size()];
553: // TagSoupParsingThread[] tagsoupThreads = new TagSoupParsingThread[contentList.size()];
554: //
555: // for (int i=0; i<contentList.size(); i++)
556: // {
557: // mozillaThreads[i] = new MozillaParsingThread(contentList.get(i));
558: // tagsoupThreads[i] = new TagSoupParsingThread(contentList.get(i));
559: // }
560: //
561: //
562: // // first do the tagsoup threads :
563: // for (int i=0; i<contentList.size(); i++)
564: // tagsoupThreads[i].start();
565: // for (int i=0; i<contentList.size(); i++)
566: // tagsoupThreads[i].join();
567: //
// // then do mozilla threads :
569: // for (int i=0; i<contentList.size(); i++)
570: // mozillaThreads[i].start();
571: //
572: // for (int i=0; i<contentList.size(); i++)
573: // mozillaThreads[i].join();
574: //
575: // System.err.println("--------------> Mozilla parsing time :" + mozillaParsingTime +" sec.");
576: // System.err.println("--------------> Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
577: //
578: // // assert that mozilla parser works no worse than 1.25 the tagsoup time :
579: // assertTrue(1.25*tagsoupParsingTime > mozillaParsingTime);
580: // }
581: //
582: // class MozillaParsingThread extends Thread {
583: //
584: // private final String content;
585: //
586: // public MozillaParsingThread(String content){
587: // this.content = content;
588: // }
589: //
590: // public void run()
591: // {
592: // SimpleTimeProfiler profiler = new SimpleTimeProfiler();
593: // profiler.start();
594: // MozillaParser.getInstance().parse(content);
595: // mozillaParsingTime += profiler.report("Mozilla:");
596: // }
597: //
598: //
599: // }
600: //
601: //class TagSoupParsingThread extends Thread
602: //{
603: //
604: // private final String content;
605: //
606: // public TagSoupParsingThread(String content){
607: // this.content = content;
608: // }
609: //
610: // public void run()
611: // {
612: // SimpleTimeProfiler profiler = new SimpleTimeProfiler();
613: // profiler.start();
614: // try {
615: // tagSoupParse(content);
616: // } catch (DocumentException e) {
617: // // TODO Auto-generated catch block
618: // e.printStackTrace();
619: // }
620: // tagsoupParsingTime += profiler.report("Tagsoup:");
621: // }
622: //
623: //
624: // }
625: //
626: //
627: //
628: // public void testPerformance() throws Exception
629: // {
630: // mozillaParsingTime = 0.0;
631: // tagsoupParsingTime = 0.0;
632: //
633: // Cacher contentCacher = new Cacher("ohad.dappit.com");
634: // contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
635: //
636: // compareMozillaAndTagsoup(contentCacher,"http://www.youtube.com/results?search_query=saddam&search=Search");
637: //
638: // compareMozillaAndTagsoup(contentCacher, "http://www.digg.com");
639: //
640: // compareMozillaAndTagsoup(contentCacher, "http://www.walla.co.il");
641: //
642: // compareMozillaAndTagsoup(contentCacher, "http://www.dappit.com");
643: //
644: // compareMozillaAndTagsoup(contentCacher, "http://www.cnn.com");
645: //
646: // compareMozillaAndTagsoup(contentCacher, "http://slashdot.org");
647: //
648: // compareMozillaAndTagsoup(contentCacher, "http://www.netdimes.org");
649: //
650: // compareMozillaAndTagsoup(contentCacher, "http://www.yahoo.com");
651: //
652: // compareMozillaAndTagsoup(contentCacher, "http://www.mozilla.org");
653: // compareMozillaAndTagsoup(contentCacher, "http://www.nana.co.il");
654: // compareMozillaAndTagsoup(contentCacher, "http://www.finance.com");
655: // compareMozillaAndTagsoup(contentCacher, "http://www.cnn.co.jp/");
656: // compareMozillaAndTagsoup(contentCacher, "http://www.techcrunch.com/");
657: // compareMozillaAndTagsoup(contentCacher, "http://freshmeat.net/");
658: //
659: //
660: // System.err.println("--------------> Mozilla parsing time :" + mozillaParsingTime +" sec.");
661: // System.err.println("--------------> Tagsoup parsing time :" + tagsoupParsingTime +" sec.");
662: //
663: // // assert that mozilla parser works no worse than 1.25 the tagsoup time :
664: // assertTrue(1.25*tagsoupParsingTime > mozillaParsingTime);
665: //
666: // }
667: //
668: private Document tagSoupParse(String content)
669: throws DocumentException {
670: Parser htmlParser = new Parser();
671:
672: SAXReader saxReader = new SAXReader(htmlParser);
673: saxReader.setMergeAdjacentText(true);
674: DOMWriter domWriter = new DOMWriter();
675: return domWriter.write(saxReader
676: .read(new StringReader(content)));
677: }
678:
679: //
680: // public void testCrawler() throws MalformedURLException, IOException, NetworkErrorException, CacheDirectoryException, CacheWriteException, DocumentException{
681: //
682: // Cacher cacher = new Cacher();
683: // cacher.setCacheLifeTime(Integer.MAX_VALUE);
684: // for (int i=1; i<20 ; i++)
685: // {
686: // int start=10*i;
687: // String googleUrlString = "http://www.google.co.il/search?q=windows&hl=iw&lr=&start=" +start + "&sa=N";
688: // System.out.println("Fetching :" +googleUrlString);
689: // String urlContent = Util.urlGetContents( new URL(googleUrlString));
690: //
691: // Document googleDoc = tagSoupParse(urlContent);
692: //// Document googleDoc = MozillaParser.getInstance().parse(urlContent);
693: // NodeList anchors = googleDoc.getElementsByTagName("a");
694: // System.out.println("number of anchors : " + anchors.getLength());
695: // for (int j=0; j<anchors.getLength() ; j++)
696: // {
697: // Attr hrefAttribute = (Attr)anchors.item(j).getAttributes().getNamedItem("href");
698: // if (hrefAttribute!=null)
699: // {
700: // String attributeValue = hrefAttribute.getValue();
701: // if (attributeValue.startsWith("http://") && !attributeValue.endsWith(".pdf"))
702: // {
703: // System.err.println(i+":"+j+"/"+anchors.getLength()+ " : Fetching from : " + attributeValue);
704: // String urlContent2=null;
705: // try
706: // {
707: // urlContent2 = cacher.getCache(attributeValue);
708: // }
709: // catch (Exception e)
710: // {
711: // try
712: // {
713: // urlContent2 = Util.urlGetContents(new URL(attributeValue));
714: // }
715: // catch (Exception ex)
716: // {
717: // ex.printStackTrace();
718: // urlContent2 = "<html>";
719: // }
720: // cacher.putCache(attributeValue, urlContent2);
721: // }
722: //// tagSoupParse(urlContent2);
723: // MozillaParser.getInstance().parse(urlContent2);
724: // }
725: // }
726: // }
727: //
728: //
729: //
730: // }
731: //
732: //
733: // }
734: //
735: //
736: // volatile double mozillaParsingTime = 0.0;
737: // volatile double tagsoupParsingTime = 0.0;
738: //
739: // /**
740: // * @param youTubeContent
741: // * @throws DocumentException
742: // * @throws NetworkErrorException
743: // * @throws IOException
744: // * @throws MalformedURLException
745: // */
746: // private void compareMozillaAndTagsoup(Cacher cacher , String url) throws Exception {
747: // String content = null;
748: // try
749: // {
750: // System.err.println("Fetching content from :" + url);
751: // content = cacher.getCache(url);
752: // }
753: // catch (Exception e)
754: // {
755: // System.err.println("couldn't find contetn for URL:" + url +". grabbing page from net...");
756: // content = Util.urlGetContents(new URL(url));
757: // cacher.putCache(url , content);
758: // }
759: //
760: // SimpleTimeProfiler profiler = new SimpleTimeProfiler();
761: //
762: // // profile mozilla :
763: // profiler.start();
764: //// System.out.println("Parsing content : "+ content);
765: // MozillaParser.getInstance().parse(content);
766: // mozillaParsingTime += profiler.report("Mozilla:");
767: //
768: // // profile tagsoup :
769: // profiler.start();
770: // tagSoupParse(content);
771: // tagsoupParsingTime+= profiler.report("tagsoup:");
772: // }
773: //
774: //
775: // public void testXClarisWindow() throws Exception
776: // {
777: //
778: // // came across this error that crashed the parser :
779: //// ###!!! ASSERTION: unsupported leaf node type: 'Not Reached', file C:\dapper\mozilla\parser\htmlparser\java\JavaContentSink.cpp, line 782
780: //// Break: at file C:\dapper\mozilla\parser\htmlparser\java\JavaContentSink.cpp, line 782
781: //
782: // Cacher contentCacher = new Cacher("ohad.dappit.com");
783: // contentCacher.setCacheLifeTime(Integer.MAX_VALUE);
784: //
785: // compareMozillaAndTagsoup(contentCacher," http://www.sdcoe.k12.ca.us/score/cla.html");
786: // }
787: //
788: //
789: // // WARNING : THIS TEST IS NOT WORKING AUTOMATICALLY
790: // // YOU MUST CHECK THAT THE MEMORY CONSUMPTION IN NOT INCREASING MANUALLY
791: // // TODO : FIND A BETTER WAY TO HANDLE THIS
792: public void testMemoryLeak() throws Exception {
793: SimpleMemoryProfiler memoryProfiler = new SimpleMemoryProfiler();
794: memoryProfiler.start();
795: for (int i = 0; i < 20000; i++) {
796: testSimple2();
797: }
798: //assertTrue("Memory diff is bigger than 20MB. Please check for memory leak" , memoryProfiler.report("Total memory diff") > -100000.0);
799: }
800:
801: }
|