0001: // plasmaParser.java
0002: // (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
0003: // first published in january 2005 on http://yacy.net
0004: // with contributions 02.05.2005 by Martin Thelian
0005: //
0006: // This is a part of YaCy, a peer-to-peer based web search engine
0007: //
0008: // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
0009: // $LastChangedRevision: 1986 $
0010: // $LastChangedBy: orbiter $
0011: //
0012: // LICENSE
0013: //
0014: // This program is free software; you can redistribute it and/or modify
0015: // it under the terms of the GNU General Public License as published by
0016: // the Free Software Foundation; either version 2 of the License, or
0017: // (at your option) any later version.
0018: //
0019: // This program is distributed in the hope that it will be useful,
0020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
0021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0022: // GNU General Public License for more details.
0023: //
0024: // You should have received a copy of the GNU General Public License
0025: // along with this program; if not, write to the Free Software
0026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
0027:
0028: package de.anomic.plasma;
0029:
0030: import java.io.BufferedInputStream;
0031: import java.io.ByteArrayInputStream;
0032: import java.io.File;
0033: import java.io.FileFilter;
0034: import java.io.FileInputStream;
0035: import java.io.FilenameFilter;
0036: import java.io.IOException;
0037: import java.io.InputStream;
0038: import java.io.UnsupportedEncodingException;
0039: import java.net.MalformedURLException;
0040: import java.net.URI;
0041: import java.util.Arrays;
0042: import java.util.HashMap;
0043: import java.util.HashSet;
0044: import java.util.Hashtable;
0045: import java.util.Iterator;
0046: import java.util.LinkedList;
0047: import java.util.List;
0048: import java.util.Map;
0049: import java.util.Properties;
0050: import java.util.Set;
0051:
0052: import de.anomic.htmlFilter.htmlFilterContentScraper;
0053: import de.anomic.htmlFilter.htmlFilterImageEntry;
0054: import de.anomic.htmlFilter.htmlFilterInputStream;
0055: import de.anomic.htmlFilter.htmlFilterWriter;
0056: import de.anomic.http.httpc;
0057: import de.anomic.plasma.parser.Parser;
0058: import de.anomic.plasma.parser.ParserException;
0059: import de.anomic.plasma.parser.ParserInfo;
0060: import de.anomic.server.serverFileUtils;
0061: import de.anomic.server.logging.serverLog;
0062: import de.anomic.yacy.yacyURL;
0063:
0064: public final class plasmaParser {
0065: public static final String PARSER_MODE_PROXY = "PROXY";
0066: public static final String PARSER_MODE_CRAWLER = "CRAWLER";
0067: public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR";
0068: public static final String PARSER_MODE_ICAP = "ICAP";
0069: public static final String PARSER_MODE_IMAGE = "IMAGE";
0070: public static final HashSet<String> PARSER_MODE = new HashSet<String>(
0071: Arrays.asList(new String[] { PARSER_MODE_PROXY,
0072: PARSER_MODE_CRAWLER, PARSER_MODE_ICAP,
0073: PARSER_MODE_URLREDIRECTOR, PARSER_MODE_IMAGE }));
0074:
0075: private static final HashMap<String, plasmaParserConfig> parserConfigList = new HashMap<String, plasmaParserConfig>();
0076:
0077: /**
0078: * A list containing all installed parsers and the mimeType that they support
0079: * @see #loadAvailableParserList()
0080: */
0081: public static final HashMap<String, ParserInfo> availableParserList = new HashMap<String, ParserInfo>();
0082:
0083: /**
0084: * A list of file extensions and mime types that are supported by the html-parser
0085: */
0086: public static final HashSet<String> supportedHTMLFileExt = new HashSet<String>();
0087: public static final HashSet<String> supportedHTMLMimeTypes = new HashSet<String>();
0088:
/** Lookup table from file extension to mime type, read from the file "httpd.mime". */
private static final Properties mimeTypeLookupByFileExt = new Properties();
static {
    // "httpd.mime" is a relative path, so it is resolved against the
    // current working directory of the process
    BufferedInputStream mimeIn = null;
    try {
        mimeIn = new BufferedInputStream(
                new FileInputStream(new File("httpd.mime")));
        mimeTypeLookupByFileExt.load(mimeIn);
    } catch (IOException e) {
        // the table simply stays empty; lookups then yield their default
        System.err.println("ERROR: httpd.mime not found in settings path");
    } finally {
        if (mimeIn != null) {
            try {
                mimeIn.close();
            } catch (Exception e) {
                // nothing useful can be done if closing fails
            }
        }
    }
}
0108:
0109: /**
0110: * A list of media extensions that should <b>not</b> be handled by the plasmaParser
0111: */
0112: private static final HashSet<String> mediaExtSet = new HashSet<String>();
0113:
0114: /**
0115: * A list of image, audio, video and application extensions
0116: */
0117: private static final HashSet<String> imageExtSet = new HashSet<String>();
0118: private static final HashSet<String> audioExtSet = new HashSet<String>();
0119: private static final HashSet<String> videoExtSet = new HashSet<String>();
0120: private static final HashSet<String> appsExtSet = new HashSet<String>();
0121:
0122: /**
0123: * This {@link FilenameFilter} is used to find all classes based on there filenames
0124: * which seems to be additional content parsers.
0125: * Currently the filenames of all content parser classes must end with <code>Parser.class</code>
0126: */
0127: private static final FilenameFilter parserFileNameFilter = new FilenameFilter() {
0128: public boolean accept(File dir, String name) {
0129: return name.endsWith("Parser.class");
0130: }
0131: };
0132:
0133: /**
0134: * This {@link FileFilter} is used to get all subpackages
0135: * of the parser package.
0136: */
0137: private static final FileFilter parserDirectoryFilter = new FileFilter() {
0138: public boolean accept(File file) {
0139: return file.isDirectory();
0140: }
0141: };
0142:
0143: /**
0144: * Initializing the
0145: * @see #initMediaExt(String)
0146: */
0147: static {
0148: String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar";
0149: String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
0150: String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
0151: String image = "jpg,jpeg,jpe,gif,png,ico,bmp";
0152: initMediaExt(extString2extList(apps + "," + // application container
0153: "tar,gz,bz2,arj,zip,rar," + // archive formats
0154: "ps,xls,ppt,asf," + // text formats without support
0155: audio + "," + // audio formats
0156: video + "," + // video formats
0157: image // image formats
0158: ));
0159: initImageExt(extString2extList(image)); // image formats
0160: initAudioExt(extString2extList(audio)); // audio formats
0161: initVideoExt(extString2extList(video)); // video formats
0162: initAppsExt(extString2extList(apps)); // application formats
0163:
0164: /* ===================================================
0165: * loading a list of available parsers
0166: * =================================================== */
0167: loadAvailableParserList();
0168: }
0169:
0170: private serverLog theLogger = new serverLog("PARSER");
0171:
0172: public serverLog getLogger() {
0173: return this .theLogger;
0174: }
0175:
0176: public static HashMap<String, plasmaParserConfig> getParserConfigList() {
0177: return parserConfigList;
0178: }
0179:
0180: /**
0181: * This function is used to initialize the HTMLParsableMimeTypes List.
0182: * This list contains a list of mimeTypes that can be parsed in realtime by
0183: * the yacy html-Parser
0184: * @param htmlParsableMimeTypes a list of mimetypes that can be parsed by the
0185: * yacy html parser
0186: */
0187: public static void initHTMLParsableMimeTypes(
0188: String htmlParsableMimeTypes) {
0189: LinkedList<String> mimeTypes = new LinkedList<String>();
0190: if ((htmlParsableMimeTypes == null)
0191: || (htmlParsableMimeTypes.length() == 0)) {
0192: return;
0193: }
0194: String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes
0195: .split(",");
0196: for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) {
0197: mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase()
0198: .trim());
0199: }
0200: synchronized (supportedHTMLMimeTypes) {
0201: supportedHTMLMimeTypes.clear();
0202: supportedHTMLMimeTypes.addAll(mimeTypes);
0203: }
0204: }
0205:
0206: public static List<String> extString2extList(String extString) {
0207: LinkedList<String> extensions = new LinkedList<String>();
0208: if ((extString == null) || (extString.length() == 0)) {
0209: return extensions;
0210: } else {
0211: String[] xs = extString.split(",");
0212: for (int i = 0; i < xs.length; i++)
0213: extensions.add(xs[i].toLowerCase().trim());
0214: }
0215: return extensions;
0216: }
0217:
0218: public static void initMediaExt(List<String> mediaExtList) {
0219: synchronized (mediaExtSet) {
0220: mediaExtSet.clear();
0221: mediaExtSet.addAll(mediaExtList);
0222: }
0223: }
0224:
0225: public static void initImageExt(List<String> imageExtList) {
0226: synchronized (imageExtSet) {
0227: imageExtSet.clear();
0228: imageExtSet.addAll(imageExtList);
0229: }
0230: }
0231:
0232: public static void initAudioExt(List<String> audioExtList) {
0233: synchronized (audioExtSet) {
0234: audioExtSet.clear();
0235: audioExtSet.addAll(audioExtList);
0236: }
0237: }
0238:
0239: public static void initVideoExt(List<String> videoExtList) {
0240: synchronized (videoExtSet) {
0241: videoExtSet.clear();
0242: videoExtSet.addAll(videoExtList);
0243: }
0244: }
0245:
0246: public static void initAppsExt(List<String> appsExtList) {
0247: synchronized (appsExtSet) {
0248: appsExtSet.clear();
0249: appsExtSet.addAll(appsExtList);
0250: }
0251: }
0252:
0253: public static String getMediaExtList() {
0254: synchronized (mediaExtSet) {
0255: return mediaExtSet.toString();
0256: }
0257: }
0258:
0259: public static void initSupportedHTMLFileExt(
0260: List<String> supportedRealtimeFileExtList) {
0261: synchronized (supportedHTMLFileExt) {
0262: supportedHTMLFileExt.clear();
0263: supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
0264: }
0265: }
0266:
0267: public static boolean HTMLParsableMimeTypesContains(String mimeType) {
0268: mimeType = normalizeMimeType(mimeType);
0269: synchronized (supportedHTMLMimeTypes) {
0270: return supportedHTMLMimeTypes.contains(mimeType);
0271: }
0272: }
0273:
0274: public static boolean supportedHTMLContent(yacyURL url,
0275: String mimeType) {
0276: return HTMLParsableMimeTypesContains(mimeType)
0277: && supportedHTMLFileExtContains(url);
0278: }
0279:
0280: public static boolean supportedHTMLFileExtContains(yacyURL url) {
0281: String fileExt = getFileExt(url);
0282: synchronized (supportedHTMLFileExt) {
0283: return supportedHTMLFileExt.contains(fileExt);
0284: }
0285: }
0286:
0287: public static String getFileExt(yacyURL url) {
0288: // getting the file path
0289: String name = url.getPath();
0290:
0291: // tetermining last position of / in the file path
0292: int p = name.lastIndexOf('/');
0293: if (p != -1) {
0294: name = name.substring(p);
0295: }
0296:
0297: // termining last position of . in file path
0298: p = name.lastIndexOf('.');
0299: if (p < 0)
0300: return "";
0301: return name.substring(p + 1);
0302: }
0303:
0304: public static boolean mediaExtContains(String mediaExt) {
0305: if (mediaExt == null)
0306: return false;
0307: mediaExt = mediaExt.trim().toLowerCase();
0308:
0309: synchronized (supportedHTMLFileExt) {
0310: if (supportedHTMLFileExt.contains(mediaExt))
0311: return false;
0312: }
0313:
0314: if (supportedFileExtContains(mediaExt))
0315: return false;
0316:
0317: synchronized (mediaExtSet) {
0318: return mediaExtSet.contains(mediaExt);
0319: }
0320: }
0321:
0322: public static boolean imageExtContains(String imageExt) {
0323: if (imageExt == null)
0324: return false;
0325: synchronized (imageExtSet) {
0326: return imageExtSet.contains(imageExt.trim().toLowerCase());
0327: }
0328: }
0329:
0330: public static boolean audioExtContains(String audioExt) {
0331: if (audioExt == null)
0332: return false;
0333: synchronized (audioExtSet) {
0334: return audioExtSet.contains(audioExt.trim().toLowerCase());
0335: }
0336: }
0337:
0338: public static boolean videoExtContains(String videoExt) {
0339: if (videoExt == null)
0340: return false;
0341: synchronized (videoExtSet) {
0342: return videoExtSet.contains(videoExt.trim().toLowerCase());
0343: }
0344: }
0345:
0346: public static boolean appsExtContains(String appsExt) {
0347: if (appsExt == null)
0348: return false;
0349: synchronized (appsExtSet) {
0350: return appsExtSet.contains(appsExt.trim().toLowerCase());
0351: }
0352: }
0353:
0354: public static String getRealCharsetEncoding(String encoding) {
0355: if ((encoding == null) || (encoding.length() == 0))
0356: return "ISO-8859-1";
0357:
0358: // trim encoding string
0359: encoding = encoding.trim();
0360:
0361: if (encoding.toLowerCase().startsWith("windows")
0362: && encoding.length() > 7) {
0363: char c = encoding.charAt(7);
0364: if (c == '_')
0365: encoding = "windows-" + encoding.substring(8);
0366: else if ((c >= '0') && (c <= '9'))
0367: encoding = "windows-" + encoding.substring(7);
0368: }
0369:
0370: if (encoding.toLowerCase().startsWith("iso")
0371: && encoding.length() > 3) {
0372: char c = encoding.charAt(3);
0373: if (c == '_')
0374: encoding = "ISO-" + encoding.substring(4);
0375: else if ((c >= '0') && (c <= '9'))
0376: encoding = "ISO-" + encoding.substring(3);
0377: }
0378:
0379: if (encoding.toLowerCase().startsWith("iso")
0380: && encoding.length() > 8) {
0381: char c = encoding.charAt(8);
0382: if (c == '_')
0383: encoding = encoding.substring(0, 8) + "-"
0384: + encoding.substring(9);
0385: else if ((c >= '0') && (c <= '9'))
0386: encoding = encoding.substring(0, 8) + "-"
0387: + encoding.substring(8);
0388: }
0389:
0390: // converting cp\d{4} -> windows-\d{4}
0391: if (encoding.toLowerCase().matches("cp([_-])?125[0-8]")) {
0392: char c = encoding.charAt(2);
0393: if (c == '_' || c == '-')
0394: encoding = "windows-" + encoding.substring(3);
0395: else if ((c >= '0') && (c <= '9'))
0396: encoding = "windows-" + encoding.substring(2);
0397: }
0398:
0399: if (encoding.toLowerCase().matches("gb[_-]?2312([-_]80)?")) {
0400: encoding = "x-EUC-CN";
0401: }
0402:
0403: if (encoding.toLowerCase().matches(".*utf[-_]?8.*")) {
0404: encoding = "UTF-8";
0405: }
0406:
0407: return encoding;
0408: }
0409:
0410: public static String normalizeMimeType(String mimeType) {
0411: //if (mimeType == null) doMimeTypeAnalysis
0412: if (mimeType == null)
0413: mimeType = "application/octet-stream";
0414: mimeType = mimeType.trim().toLowerCase();
0415:
0416: int pos = mimeType.indexOf(';');
0417: return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
0418: }
0419:
0420: public static String getMimeTypeByFileExt(String fileExt) {
0421: return mimeTypeLookupByFileExt.getProperty(fileExt,
0422: "application/octet-stream");
0423: }
0424:
0425: public HashMap<String, ParserInfo> getAvailableParserList() {
0426: return plasmaParser.availableParserList;
0427: }
0428:
0429: private static void loadAvailableParserList() {
0430: try {
0431: plasmaParser.availableParserList.clear();
0432:
0433: // getting the current java classpath
0434: String javaClassPath = System
0435: .getProperty("java.class.path");
0436:
0437: // getting the current package name
0438: String plasmaParserPkgName = plasmaParser.class
0439: .getPackage().getName()
0440: + ".parser";
0441: serverLog.logInfo("PARSER",
0442: "Searching for additional content parsers in package "
0443: + plasmaParserPkgName);
0444:
0445: // getting an uri to the parser subpackage
0446: String packageURI = plasmaParser.class.getResource(
0447: "/" + plasmaParserPkgName.replace('.', '/'))
0448: .toString();
0449: serverLog.logFine("PARSER", "Parser directory is "
0450: + packageURI);
0451:
0452: // open the parser directory
0453: File parserDir = new File(new URI(packageURI));
0454: if ((parserDir == null) || (!parserDir.exists())
0455: || (!parserDir.isDirectory()))
0456: return;
0457:
0458: /*
0459: * loop through all subdirectories and test if we can
0460: * find an additional parser class
0461: */
0462: File[] parserDirectories = parserDir
0463: .listFiles(parserDirectoryFilter);
0464: if (parserDirectories == null)
0465: return;
0466:
0467: for (int parserDirNr = 0; parserDirNr < parserDirectories.length; parserDirNr++) {
0468: File currentDir = parserDirectories[parserDirNr];
0469: serverLog.logFine("PARSER", "Searching in directory "
0470: + currentDir.toString());
0471:
0472: String[] parserClasses = currentDir
0473: .list(parserFileNameFilter);
0474: if (parserClasses == null)
0475: continue;
0476:
0477: for (int parserNr = 0; parserNr < parserClasses.length; parserNr++) {
0478: serverLog.logFine("PARSER", "Testing parser class "
0479: + parserClasses[parserNr]);
0480: String className = parserClasses[parserNr]
0481: .substring(0, parserClasses[parserNr]
0482: .indexOf(".class"));
0483: String fullClassName = plasmaParserPkgName + "."
0484: + currentDir.getName() + "." + className;
0485: try {
0486: // trying to load the parser class by its name
0487: Class<?> parserClass = Class
0488: .forName(fullClassName);
0489: Object theParser0 = (Parser) parserClass
0490: .newInstance();
0491: if (!(theParser0 instanceof Parser))
0492: continue;
0493: Parser theParser = (Parser) theParser0;
0494:
0495: // testing if all needed libx libraries are available
0496: String[] neededLibx = theParser
0497: .getLibxDependences();
0498: StringBuffer neededLibxBuf = new StringBuffer();
0499: if (neededLibx != null) {
0500: for (int libxId = 0; libxId < neededLibx.length; libxId++) {
0501: if (javaClassPath
0502: .indexOf(neededLibx[libxId]) == -1) {
0503: throw new Exception(
0504: "Missing dependency detected: '"
0505: + neededLibx[libxId]
0506: + "'.");
0507: }
0508: neededLibxBuf
0509: .append(neededLibx[libxId])
0510: .append(",");
0511: }
0512: if (neededLibxBuf.length() > 0)
0513: neededLibxBuf
0514: .deleteCharAt(neededLibxBuf
0515: .length() - 1);
0516: }
0517:
0518: // loading the list of mime-types that are supported by this parser class
0519: Hashtable<String, String> supportedMimeTypes = theParser
0520: .getSupportedMimeTypes();
0521:
0522: // creating a parser info object
0523: ParserInfo parserInfo = new ParserInfo();
0524: parserInfo.parserClass = parserClass;
0525: parserInfo.parserClassName = fullClassName;
0526: parserInfo.libxDependencies = neededLibx;
0527: parserInfo.supportedMimeTypes = supportedMimeTypes;
0528: parserInfo.parserVersionNr = ((Parser) theParser)
0529: .getVersion();
0530: parserInfo.parserName = ((Parser) theParser)
0531: .getName();
0532:
0533: Iterator<String> mimeTypeIterator = supportedMimeTypes
0534: .keySet().iterator();
0535: while (mimeTypeIterator.hasNext()) {
0536: String mimeType = (String) mimeTypeIterator
0537: .next();
0538: availableParserList.put(mimeType,
0539: parserInfo);
0540: serverLog
0541: .logInfo(
0542: "PARSER",
0543: "Found functional parser for mimeType '"
0544: + mimeType
0545: + "'."
0546: + "\n\tName: "
0547: + parserInfo.parserName
0548: + "\n\tVersion: "
0549: + parserInfo.parserVersionNr
0550: + "\n\tClass: "
0551: + parserInfo.parserClassName
0552: + ((neededLibxBuf
0553: .length() > 0) ? "\n\tDependencies: "
0554: + neededLibxBuf
0555: .toString()
0556: : ""));
0557: }
0558:
0559: } catch (Exception e) { /* we can ignore this for the moment */
0560: serverLog
0561: .logWarning(
0562: "PARSER",
0563: "Parser '"
0564: + className
0565: + "' doesn't work correctly and will be ignored.\n ["
0566: + e.getClass()
0567: .getName()
0568: + "]: "
0569: + e.getMessage());
0570: e.printStackTrace();
0571: } catch (Error e) { /* we can ignore this for the moment */
0572: serverLog
0573: .logWarning(
0574: "PARSER",
0575: "Parser '"
0576: + className
0577: + "' doesn't work correctly and will be ignored.\n ["
0578: + e.getClass()
0579: .getName()
0580: + "]: "
0581: + e.getMessage());
0582: e.printStackTrace();
0583: }
0584: }
0585: }
0586:
0587: } catch (Exception e) {
0588: serverLog.logSevere("PARSER",
0589: "Unable to determine all installed parsers. "
0590: + e.getMessage());
0591: }
0592: }
0593:
0594: public void close() {
0595: // clearing the parser list
0596: Iterator<plasmaParserConfig> configs = parserConfigList
0597: .values().iterator();
0598: while (configs.hasNext()) {
0599: plasmaParserConfig currentConfig = configs.next();
0600: synchronized (currentConfig.enabledParserList) {
0601: currentConfig.enabledParserList.clear();
0602: }
0603: }
0604: }
0605:
0606: public plasmaParserDocument parseSource(yacyURL location,
0607: String mimeType, String charset, byte[] sourceArray)
0608: throws InterruptedException, ParserException {
0609: ByteArrayInputStream byteIn = null;
0610: try {
0611: if (this .theLogger.isFine())
0612: this .theLogger.logFine("Parsing '" + location
0613: + "' from byte-array");
0614:
0615: // testing if the resource is not empty
0616: if (sourceArray == null || sourceArray.length == 0) {
0617: String errorMsg = "No resource content available (1).";
0618: this .theLogger.logInfo("Unable to parse '" + location
0619: + "'. " + errorMsg);
0620: throw new ParserException(errorMsg, location,
0621: plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
0622: }
0623:
0624: // creating an InputStream
0625: byteIn = new ByteArrayInputStream(sourceArray);
0626:
0627: // parsing the temp file
0628: return parseSource(location, mimeType, charset,
0629: sourceArray.length, byteIn);
0630:
0631: } catch (Exception e) {
0632: // Interrupted- and Parser-Exceptions should pass through
0633: if (e instanceof InterruptedException)
0634: throw (InterruptedException) e;
0635: if (e instanceof ParserException)
0636: throw (ParserException) e;
0637:
0638: // log unexpected error
0639: this .theLogger.logSevere(
0640: "Unexpected exception in parseSource from byte-array: "
0641: + e.getMessage(), e);
0642: throw new ParserException(
0643: "Unexpected exception while parsing " + location,
0644: location, e);
0645: } finally {
0646: if (byteIn != null)
0647: try {
0648: byteIn.close();
0649: } catch (Exception ex) {/* ignore this */
0650: }
0651: }
0652:
0653: }
0654:
0655: public plasmaParserDocument parseSource(yacyURL location,
0656: String theMimeType, String theDocumentCharset,
0657: File sourceFile) throws InterruptedException,
0658: ParserException {
0659:
0660: BufferedInputStream sourceStream = null;
0661: try {
0662: if (this .theLogger.isFine())
0663: this .theLogger.logFine("Parsing '" + location
0664: + "' from file");
0665:
0666: // testing if the resource is not empty
0667: if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile
0668: .length() > 0)) {
0669: String errorMsg = sourceFile.exists() ? "Empty resource file."
0670: : "No resource content available (2).";
0671: this .theLogger.logInfo("Unable to parse '" + location
0672: + "'. " + errorMsg);
0673: throw new ParserException(errorMsg, location,
0674: plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
0675: }
0676:
0677: // create a new InputStream
0678: sourceStream = new BufferedInputStream(new FileInputStream(
0679: sourceFile));
0680:
0681: // parsing the data
0682: return this .parseSource(location, theMimeType,
0683: theDocumentCharset, sourceFile.length(),
0684: sourceStream);
0685:
0686: } catch (Exception e) {
0687: // Interrupted- and Parser-Exceptions should pass through
0688: if (e instanceof InterruptedException)
0689: throw (InterruptedException) e;
0690: if (e instanceof ParserException)
0691: throw (ParserException) e;
0692:
0693: // log unexpected error
0694: this .theLogger.logSevere(
0695: "Unexpected exception in parseSource from File: "
0696: + e.getMessage(), e);
0697: throw new ParserException(
0698: "Unexpected exception while parsing " + location,
0699: location, e);
0700: } finally {
0701: if (sourceStream != null)
0702: try {
0703: sourceStream.close();
0704: } catch (Exception ex) {/* ignore this */
0705: }
0706: }
0707: }
0708:
0709: /**
0710: * To parse a resource from an {@link InputStream}
0711: * @param location the URL of the resource
0712: * @param theMimeType the resource mimetype (<code>null</code> if unknown)
0713: * @param theDocumentCharset the charset of the resource (<code>null</code> if unknown)
0714: * @param contentLength the content length of the resource (<code>-1</code> if unknown)
0715: * @param sourceStream an {@link InputStream} containing the resource body
0716: * @return the parsed {@link plasmaParserDocument document}
0717: * @throws InterruptedException
0718: * @throws ParserException
0719: */
0720: public plasmaParserDocument parseSource(yacyURL location,
0721: String theMimeType, String theDocumentCharset,
0722: long contentLength, InputStream sourceStream)
0723: throws InterruptedException, ParserException {
0724: Parser theParser = null;
0725: String mimeType = null;
0726: try {
0727: if (this .theLogger.isFine())
0728: this .theLogger.logFine("Parsing '" + location
0729: + "' from stream");
0730:
0731: // getting the mimetype of the document
0732: mimeType = normalizeMimeType(theMimeType);
0733:
0734: // getting the file extension of the document
0735: String fileExt = getFileExt(location);
0736:
0737: // getting the charset of the document
0738: // TODO: do a charset detection here ....
0739: String documentCharset = getRealCharsetEncoding(theDocumentCharset);
0740:
0741: // testing if parsing is supported for this resource
0742: if (!plasmaParser.supportedContent(location, mimeType)) {
0743: String errorMsg = "No parser available to parse mimetype '"
0744: + mimeType + "'";
0745: this .theLogger.logInfo("Unable to parse '" + location
0746: + "'. " + errorMsg);
0747: throw new ParserException(errorMsg, location,
0748: plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
0749: }
0750:
0751: if (this .theLogger.isFine())
0752: this .theLogger.logInfo("Parsing " + location
0753: + " with mimeType '" + mimeType
0754: + "' and file extension '" + fileExt + "'.");
0755:
0756: // getting the correct parser for the given mimeType
0757: theParser = this .getParser(mimeType);
0758:
0759: // if a parser was found we use it ...
0760: plasmaParserDocument doc = null;
0761: if (theParser != null) {
0762: // set the content length of the resource
0763: theParser.setContentLength(contentLength);
0764: // parse the resource
0765: doc = theParser.parse(location, mimeType,
0766: documentCharset, sourceStream);
0767: } else if (HTMLParsableMimeTypesContains(mimeType)) {
0768: doc = parseHtml(location, mimeType, documentCharset,
0769: sourceStream);
0770: } else {
0771: String errorMsg = "No parser available to parse mimetype '"
0772: + mimeType + "'";
0773: this .theLogger.logInfo("Unable to parse '" + location
0774: + "'. " + errorMsg);
0775: throw new ParserException(errorMsg, location,
0776: plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
0777: }
0778:
0779: // check result
0780: if (doc == null) {
0781: String errorMsg = "Unexpected error. Parser returned null.";
0782: this .theLogger.logInfo("Unable to parse '" + location
0783: + "'. " + errorMsg);
0784: throw new ParserException(errorMsg, location);
0785: }
0786: return doc;
0787:
0788: } catch (UnsupportedEncodingException e) {
0789: String errorMsg = "Unsupported charset encoding: "
0790: + e.getMessage();
0791: this .theLogger.logSevere("Unable to parse '" + location
0792: + "'. " + errorMsg);
0793: throw new ParserException(errorMsg, location,
0794: plasmaCrawlEURL.DENIED_UNSUPPORTED_CHARSET);
0795: } catch (Exception e) {
0796: // Interrupted- and Parser-Exceptions should pass through
0797: if (e instanceof InterruptedException)
0798: throw (InterruptedException) e;
0799: if (e instanceof ParserException)
0800: throw (ParserException) e;
0801:
0802: // log unexpected error
0803: String errorMsg = "Unexpected exception. " + e.getMessage();
0804: this .theLogger.logSevere("Unable to parse '" + location
0805: + "'. " + errorMsg, e);
0806: throw new ParserException(errorMsg, location, e);
0807:
0808: } finally {
0809: if (theParser != null) {
0810: theParser = null; // delete object
0811: }
0812: }
0813: }
0814:
0815: private plasmaParserDocument parseHtml(yacyURL location,
0816: String mimeType, String documentCharset,
0817: InputStream sourceStream) throws IOException,
0818: ParserException {
0819:
0820: // make a scraper and transformer
0821: htmlFilterInputStream htmlFilter = new htmlFilterInputStream(
0822: sourceStream, documentCharset, location, null, false);
0823: String charset = htmlFilter.detectCharset();
0824: if (charset == null) {
0825: charset = documentCharset;
0826: } else {
0827: charset = getRealCharsetEncoding(charset);
0828: }
0829:
0830: if (!documentCharset.equalsIgnoreCase(charset)) {
0831: this .theLogger
0832: .logInfo("Charset transformation needed from '"
0833: + documentCharset + "' to '" + charset
0834: + "'");
0835: }
0836:
0837: // parsing the content
0838: htmlFilterContentScraper scraper = new htmlFilterContentScraper(
0839: location);
0840: htmlFilterWriter writer = new htmlFilterWriter(null, null,
0841: scraper, null, false);
0842: serverFileUtils.copy(htmlFilter, writer, charset);
0843: writer.close();
0844: //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
0845: //serverFileUtils.copy(sourceFile, hfos);
0846: //hfos.close();
0847: if (writer.binarySuspect()) {
0848: String errorMsg = "Binary data found in resource";
0849: this .theLogger.logSevere("Unable to parse '" + location
0850: + "'. " + errorMsg);
0851: throw new ParserException(errorMsg, location);
0852: }
0853: return transformScraper(location, mimeType, documentCharset,
0854: scraper);
0855: }
0856:
0857: public plasmaParserDocument transformScraper(yacyURL location,
0858: String mimeType, String charSet,
0859: htmlFilterContentScraper scraper) {
0860: String[] sections = new String[scraper.getHeadlines(1).length
0861: + scraper.getHeadlines(2).length
0862: + scraper.getHeadlines(3).length
0863: + scraper.getHeadlines(4).length];
0864: int p = 0;
0865: for (int i = 1; i <= 4; i++)
0866: for (int j = 0; j < scraper.getHeadlines(i).length; j++)
0867: sections[p++] = scraper.getHeadlines(i)[j];
0868: plasmaParserDocument ppd = new plasmaParserDocument(location,
0869: mimeType, charSet, scraper.getKeywords(), scraper
0870: .getTitle(), scraper.getAuthor(), sections,
0871: scraper.getDescription(), scraper.getText(), scraper
0872: .getAnchors(), scraper.getImages());
0873: //scraper.close();
0874: ppd.setFavicon(scraper.getFavicon());
0875: return ppd;
0876: }
0877:
0878: /**
0879: * This function is used to determine the parser class that should be used for a given
0880: * mimetype ...
0881: * @param mimeType MIME-Type of the resource
0882: * @return the {@link Parser}-class that is supposed to parse the resource of
0883: * the given MIME-Type
0884: */
0885: private Parser getParser(String mimeType) {
0886:
0887: mimeType = normalizeMimeType(mimeType);
0888: try {
0889:
0890: // determining the proper parser class name for the mimeType
0891: String parserClassName = null;
0892: ParserInfo parserInfo = null;
0893: synchronized (plasmaParser.availableParserList) {
0894: if (plasmaParser.availableParserList
0895: .containsKey(mimeType)) {
0896: parserInfo = (ParserInfo) plasmaParser.availableParserList
0897: .get(mimeType);
0898: parserClassName = parserInfo.parserClassName;
0899: } else {
0900: return null;
0901: }
0902: }
0903:
0904: // fetching a new parser object from pool
0905: Parser theParser = makeParser(parserClassName);
0906:
0907: // checking if the created parser really supports the given mimetype
0908: Hashtable<String, String> supportedMimeTypes = theParser
0909: .getSupportedMimeTypes();
0910: if ((supportedMimeTypes != null)
0911: && (supportedMimeTypes.containsKey(mimeType))) {
0912: parserInfo.incUsageCounter();
0913: return theParser;
0914: }
0915:
0916: } catch (Exception e) {
0917: System.err
0918: .println("ERROR: Unable to load the correct parser for type "
0919: + mimeType);
0920: }
0921:
0922: return null;
0923:
0924: }
0925:
0926: static Map<yacyURL, String> allReflinks(Set<?> links) {
0927: // links is either a Set of Strings (with urls) or htmlFilterImageEntries
0928: // we find all links that are part of a reference inside a url
0929: HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
0930: Iterator<?> i = links.iterator();
0931: Object o;
0932: yacyURL url;
0933: String u;
0934: int pos;
0935: loop: while (i.hasNext())
0936: try {
0937: o = i.next();
0938: if (o instanceof yacyURL)
0939: url = (yacyURL) o;
0940: else if (o instanceof String)
0941: url = new yacyURL((String) o, null);
0942: else if (o instanceof htmlFilterImageEntry)
0943: url = ((htmlFilterImageEntry) o).url();
0944: else {
0945: assert false;
0946: continue;
0947: }
0948: u = url.toNormalform(true, true);
0949: if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
0950: i.remove();
0951: u = u.substring(pos);
0952: while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
0953: u = u.substring(pos);
0954: url = new yacyURL(u, null);
0955: if (!(v.containsKey(url)))
0956: v.put(url, "ref");
0957: continue loop;
0958: }
0959: if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) {
0960: i.remove();
0961: u = "http:/" + u.substring(pos);
0962: while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
0963: u = "http:/" + u.substring(pos);
0964: url = new yacyURL(u, null);
0965: if (!(v.containsKey(url)))
0966: v.put(url, "ref");
0967: continue loop;
0968: }
0969: } catch (MalformedURLException e) {
0970: }
0971: return v;
0972: }
0973:
0974: static Map<yacyURL, String> allSubpaths(Set<?> links) {
0975: // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
0976: HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
0977: Iterator<?> i = links.iterator();
0978: Object o;
0979: yacyURL url;
0980: String u;
0981: int pos;
0982: while (i.hasNext())
0983: try {
0984: o = i.next();
0985: if (o instanceof yacyURL)
0986: url = (yacyURL) o;
0987: else if (o instanceof String)
0988: url = new yacyURL((String) o, null);
0989: else if (o instanceof htmlFilterImageEntry)
0990: url = ((htmlFilterImageEntry) o).url();
0991: else {
0992: assert false;
0993: continue;
0994: }
0995: u = url.toNormalform(true, true);
0996: if (u.endsWith("/"))
0997: u = u.substring(0, u.length() - 1);
0998: pos = u.lastIndexOf("/");
0999: while (pos > 8) {
1000: u = u.substring(0, pos + 1);
1001: url = new yacyURL(u, null);
1002: if (!(v.containsKey(url)))
1003: v.put(url, "sub");
1004: u = u.substring(0, pos);
1005: pos = u.lastIndexOf("/");
1006: }
1007: } catch (MalformedURLException e) {
1008: }
1009: return v;
1010: }
1011:
1012: public static void main(String[] args) {
1013: //javac -sourcepath source source/de/anomic/plasma/plasmaParser.java
1014: //java -cp source de.anomic.plasma.plasmaParser bug.html bug.out
1015: httpc remote = null;
1016: try {
1017: Object content = null;
1018: yacyURL contentURL = null;
1019: long contentLength = -1;
1020: String contentMimeType = "application/octet-stream";
1021: String charSet = "UTF-8";
1022:
1023: if (args.length < 2) {
1024: System.err
1025: .println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
1026: }
1027:
1028: String mode = args[0];
1029: if (mode.equalsIgnoreCase("-f")) {
1030: content = new File(args[1]);
1031: contentURL = new yacyURL((File) content);
1032: } else if (mode.equalsIgnoreCase("-u")) {
1033: contentURL = new yacyURL(args[1], null);
1034:
1035: // downloading the document content
1036: remote = new httpc(contentURL.getHost(), contentURL
1037: .getHost(), contentURL.getPort(), 5000,
1038: contentURL.getProtocol().equalsIgnoreCase(
1039: "https"), null, null, null);
1040:
1041: httpc.response res = remote.GET(contentURL.getFile(),
1042: null);
1043: if (res.statusCode != 200) {
1044: System.err.println("Unable to download "
1045: + contentURL + ". " + res.status);
1046: return;
1047: }
1048: content = res.getContentInputStream();
1049: contentMimeType = res.responseHeader.mime();
1050: charSet = res.responseHeader.getCharacterEncoding();
1051: contentLength = res.responseHeader.contentLength();
1052: remote.close();
1053: }
1054:
1055: if ((args.length >= 4) && (args[2].equalsIgnoreCase("-m"))) {
1056: contentMimeType = args[3];
1057: }
1058:
1059: if ((args.length >= 6) && (args[4].equalsIgnoreCase("-c"))) {
1060: charSet = args[5];
1061: }
1062:
1063: // creating a plasma parser
1064: plasmaParser theParser = new plasmaParser();
1065:
1066: // configuring the html parsable mimeTypes
1067: plasmaParser
1068: .initHTMLParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");
1069:
1070: // parsing the content
1071: plasmaParserDocument document = null;
1072: if (content instanceof byte[]) {
1073: document = theParser.parseSource(contentURL,
1074: contentMimeType, charSet, (byte[]) content);
1075: } else if (content instanceof File) {
1076: document = theParser.parseSource(contentURL,
1077: contentMimeType, charSet, (File) content);
1078: } else if (content instanceof InputStream) {
1079: document = theParser.parseSource(contentURL,
1080: contentMimeType, charSet, contentLength,
1081: (InputStream) content);
1082: }
1083:
1084: // printing out all parsed sentences
1085: if (document != null) {
1086: System.out.print("Document titel: ");
1087: System.out.println(document.dc_title());
1088:
1089: // found text
1090: final Iterator<StringBuffer> sentences = document
1091: .getSentences(false);
1092: int i = 0;
1093: if (sentences != null)
1094: while (sentences.hasNext()) {
1095: System.out.print("line " + i + ": ");
1096: System.out.println(sentences.next().toString());
1097: i++;
1098: }
1099:
1100: // found links
1101: int anchorNr = 0;
1102: Map<yacyURL, String> anchors = document.getAnchors();
1103: Iterator<yacyURL> anchorIter = anchors.keySet()
1104: .iterator();
1105: while (anchorIter.hasNext()) {
1106: yacyURL key = anchorIter.next();
1107: System.out
1108: .println("URL " + anchorNr + ":\t"
1109: + key.toString() + " | "
1110: + anchors.get(key));
1111: anchorNr++;
1112: }
1113: document.close();
1114: }
1115: } catch (Exception e) {
1116: e.printStackTrace();
1117: }
1118: }
1119:
1120: public static boolean supportedContent(yacyURL url, String mimeType) {
1121: if (url == null)
1122: throw new NullPointerException();
1123:
1124: Iterator<plasmaParserConfig> configs = parserConfigList
1125: .values().iterator();
1126: while (configs.hasNext()) {
1127: plasmaParserConfig currentConfig = configs.next();
1128: synchronized (currentConfig.enabledParserList) {
1129: if (currentConfig.supportedContent(url, mimeType))
1130: return true;
1131: }
1132: }
1133:
1134: return false;
1135: }
1136:
1137: public static boolean supportedContent(String parserMode,
1138: yacyURL url, String mimeType) {
1139: if (!PARSER_MODE.contains(parserMode))
1140: throw new IllegalArgumentException();
1141: if (url == null)
1142: throw new NullPointerException();
1143:
1144: if (parserMode.equals(PARSER_MODE_IMAGE))
1145: return true;
1146: plasmaParserConfig config = (plasmaParserConfig) parserConfigList
1147: .get(parserMode);
1148: return (config == null) ? false : config.supportedContent(url,
1149: mimeType);
1150: }
1151:
1152: public static void initParseableMimeTypes(String parserMode,
1153: String configStr) {
1154: if (!PARSER_MODE.contains(parserMode))
1155: throw new IllegalArgumentException();
1156:
1157: plasmaParserConfig config = (plasmaParserConfig) parserConfigList
1158: .get(parserMode);
1159: if (config == null) {
1160: config = new plasmaParserConfig(parserMode);
1161: parserConfigList.put(parserMode, config);
1162: }
1163: config.initParseableMimeTypes(configStr);
1164: }
1165:
1166: public static String[] setEnabledParserList(String parserMode,
1167: Set<String> mimeTypeSet) {
1168: if (!PARSER_MODE.contains(parserMode))
1169: throw new IllegalArgumentException();
1170:
1171: plasmaParserConfig config = (plasmaParserConfig) parserConfigList
1172: .get(parserMode);
1173: if (config == null) {
1174: config = new plasmaParserConfig(parserMode);
1175: parserConfigList.put(parserMode, config);
1176: }
1177: return config.setEnabledParserList(mimeTypeSet);
1178: }
1179:
1180: public static boolean supportedFileExtContains(String fileExt) {
1181: Iterator<plasmaParserConfig> configs = parserConfigList
1182: .values().iterator();
1183: while (configs.hasNext()) {
1184: plasmaParserConfig currentConfig = configs.next();
1185: synchronized (currentConfig.enabledParserList) {
1186: if (currentConfig.supportedFileExtContains(fileExt))
1187: return true;
1188: }
1189: }
1190:
1191: return false;
1192: }
1193:
1194: public static boolean supportedMimeTypesContains(String mimeType) {
1195: Iterator<plasmaParserConfig> configs = parserConfigList
1196: .values().iterator();
1197: while (configs.hasNext()) {
1198: plasmaParserConfig currentConfig = configs.next();
1199: synchronized (currentConfig.enabledParserList) {
1200: if (currentConfig.supportedMimeTypesContains(mimeType))
1201: return true;
1202: }
1203: }
1204:
1205: return false;
1206: }
1207:
1208: public static Parser makeParser(Object name) throws Exception {
1209:
1210: if (!(name instanceof String))
1211: throw new IllegalArgumentException(
1212: "The object key must be of type string.");
1213:
1214: // loading class by name
1215: Class<?> moduleClass = Class.forName((String) name);
1216:
1217: // instantiating class
1218: Parser theParser = (Parser) moduleClass.newInstance();
1219:
1220: // setting logger that should by used
1221: String parserShortName = ((String) name).substring(
1222: "de.anomic.plasma.parser.".length(), ((String) name)
1223: .lastIndexOf("."));
1224:
1225: serverLog theLogger = new serverLog("PARSER."
1226: + parserShortName.toUpperCase());
1227: theParser.setLogger(theLogger);
1228:
1229: return theParser;
1230: }
1231:
1232: }
|