Source Code Cross Referenced for plasmaParser.java in  » Search-Engine » yacy » de » anomic » plasma » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » yacy » de.anomic.plasma 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        // plasmaParser.java 
0002:        // (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
0003:        // first published in january 2005 on http://yacy.net
0004:        // with contributions 02.05.2005 by Martin Thelian
0005:        //
0006:        // This is a part of YaCy, a peer-to-peer based web search engine
0007:        //
0008:        // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
0009:        // $LastChangedRevision: 1986 $
0010:        // $LastChangedBy: orbiter $
0011:        //
0012:        // LICENSE
0013:        // 
0014:        // This program is free software; you can redistribute it and/or modify
0015:        // it under the terms of the GNU General Public License as published by
0016:        // the Free Software Foundation; either version 2 of the License, or
0017:        // (at your option) any later version.
0018:        //
0019:        // This program is distributed in the hope that it will be useful,
0020:        // but WITHOUT ANY WARRANTY; without even the implied warranty of
0021:        // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0022:        // GNU General Public License for more details.
0023:        //
0024:        // You should have received a copy of the GNU General Public License
0025:        // along with this program; if not, write to the Free Software
0026:        // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0027:
0028:        package de.anomic.plasma;
0029:
0030:        import java.io.BufferedInputStream;
0031:        import java.io.ByteArrayInputStream;
0032:        import java.io.File;
0033:        import java.io.FileFilter;
0034:        import java.io.FileInputStream;
0035:        import java.io.FilenameFilter;
0036:        import java.io.IOException;
0037:        import java.io.InputStream;
0038:        import java.io.UnsupportedEncodingException;
0039:        import java.net.MalformedURLException;
0040:        import java.net.URI;
0041:        import java.util.Arrays;
0042:        import java.util.HashMap;
0043:        import java.util.HashSet;
0044:        import java.util.Hashtable;
0045:        import java.util.Iterator;
0046:        import java.util.LinkedList;
0047:        import java.util.List;
0048:        import java.util.Map;
0049:        import java.util.Properties;
0050:        import java.util.Set;
0051:
0052:        import de.anomic.htmlFilter.htmlFilterContentScraper;
0053:        import de.anomic.htmlFilter.htmlFilterImageEntry;
0054:        import de.anomic.htmlFilter.htmlFilterInputStream;
0055:        import de.anomic.htmlFilter.htmlFilterWriter;
0056:        import de.anomic.http.httpc;
0057:        import de.anomic.plasma.parser.Parser;
0058:        import de.anomic.plasma.parser.ParserException;
0059:        import de.anomic.plasma.parser.ParserInfo;
0060:        import de.anomic.server.serverFileUtils;
0061:        import de.anomic.server.logging.serverLog;
0062:        import de.anomic.yacy.yacyURL;
0063:
0064:        public final class plasmaParser {
0065:            public static final String PARSER_MODE_PROXY = "PROXY";
0066:            public static final String PARSER_MODE_CRAWLER = "CRAWLER";
0067:            public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR";
0068:            public static final String PARSER_MODE_ICAP = "ICAP";
0069:            public static final String PARSER_MODE_IMAGE = "IMAGE";
0070:            public static final HashSet<String> PARSER_MODE = new HashSet<String>(
0071:                    Arrays.asList(new String[] { PARSER_MODE_PROXY,
0072:                            PARSER_MODE_CRAWLER, PARSER_MODE_ICAP,
0073:                            PARSER_MODE_URLREDIRECTOR, PARSER_MODE_IMAGE }));
0074:
0075:            private static final HashMap<String, plasmaParserConfig> parserConfigList = new HashMap<String, plasmaParserConfig>();
0076:
0077:            /**
0078:             * A list containing all installed parsers and the mimeType that they support
0079:             * @see #loadAvailableParserList()
0080:             */
0081:            public static final HashMap<String, ParserInfo> availableParserList = new HashMap<String, ParserInfo>();
0082:
0083:            /**
0084:             * A list of file extensions and mime types that are supported by the html-parser
0085:             */
0086:            public static final HashSet<String> supportedHTMLFileExt = new HashSet<String>();
0087:            public static final HashSet<String> supportedHTMLMimeTypes = new HashSet<String>();
0088:
/**
 * Lookup table filled from the properties file <code>httpd.mime</code>;
 * queried by {@link #getMimeTypeByFileExt(String)}.
 */
private static final Properties mimeTypeLookupByFileExt = new Properties();
static {
    // read the extension list from file; the stream is closed in any case
    BufferedInputStream mimeIn = null;
    try {
        mimeIn = new BufferedInputStream(new FileInputStream(new File("httpd.mime")));
        mimeTypeLookupByFileExt.load(mimeIn);
    } catch (IOException e) {
        System.err.println("ERROR: httpd.mime not found in settings path");
    } finally {
        if (mimeIn != null) {
            try {
                mimeIn.close();
            } catch (Exception e) {
                // nothing sensible can be done if closing fails
            }
        }
    }
}
0108:
0109:            /**
0110:             * A list of media extensions that should <b>not</b> be handled by the plasmaParser
0111:             */
0112:            private static final HashSet<String> mediaExtSet = new HashSet<String>();
0113:
0114:            /**
0115:             * A list of image, audio, video and application extensions
0116:             */
0117:            private static final HashSet<String> imageExtSet = new HashSet<String>();
0118:            private static final HashSet<String> audioExtSet = new HashSet<String>();
0119:            private static final HashSet<String> videoExtSet = new HashSet<String>();
0120:            private static final HashSet<String> appsExtSet = new HashSet<String>();
0121:
0122:            /**
0123:             * This {@link FilenameFilter} is used to find all classes based on there filenames 
0124:             * which seems to be additional content parsers.
0125:             * Currently the filenames of all content parser classes must end with <code>Parser.class</code> 
0126:             */
0127:            private static final FilenameFilter parserFileNameFilter = new FilenameFilter() {
0128:                public boolean accept(File dir, String name) {
0129:                    return name.endsWith("Parser.class");
0130:                }
0131:            };
0132:
0133:            /**
0134:             * This {@link FileFilter} is used to get all subpackages
0135:             * of the parser package.
0136:             */
0137:            private static final FileFilter parserDirectoryFilter = new FileFilter() {
0138:                public boolean accept(File file) {
0139:                    return file.isDirectory();
0140:                }
0141:            };
0142:
0143:            /**
0144:             * Initializing the 
0145:             * @see #initMediaExt(String)
0146:             */
0147:            static {
0148:                String apps = "sit,hqx,img,dmg,exe,com,bat,sh,vbs,zip,jar";
0149:                String audio = "mp2,mp3,ogg,aac,aif,aiff,wav";
0150:                String video = "swf,avi,wmv,rm,mov,mpg,mpeg,ram,m4v";
0151:                String image = "jpg,jpeg,jpe,gif,png,ico,bmp";
0152:                initMediaExt(extString2extList(apps + "," + // application container
0153:                        "tar,gz,bz2,arj,zip,rar," + // archive formats
0154:                        "ps,xls,ppt,asf," + // text formats without support
0155:                        audio + "," + // audio formats
0156:                        video + "," + // video formats
0157:                        image // image formats
0158:                ));
0159:                initImageExt(extString2extList(image)); // image formats
0160:                initAudioExt(extString2extList(audio)); // audio formats
0161:                initVideoExt(extString2extList(video)); // video formats
0162:                initAppsExt(extString2extList(apps)); // application formats
0163:
0164:                /* ===================================================
0165:                 * loading a list of available parsers
0166:                 * =================================================== */
0167:                loadAvailableParserList();
0168:            }
0169:
0170:            private serverLog theLogger = new serverLog("PARSER");
0171:
0172:            public serverLog getLogger() {
0173:                return this .theLogger;
0174:            }
0175:
0176:            public static HashMap<String, plasmaParserConfig> getParserConfigList() {
0177:                return parserConfigList;
0178:            }
0179:
0180:            /**
0181:             * This function is used to initialize the HTMLParsableMimeTypes List.
0182:             * This list contains a list of mimeTypes that can be parsed in realtime by
0183:             * the yacy html-Parser
0184:             * @param htmlParsableMimeTypes a list of mimetypes that can be parsed by the 
0185:             * yacy html parser
0186:             */
0187:            public static void initHTMLParsableMimeTypes(
0188:                    String htmlParsableMimeTypes) {
0189:                LinkedList<String> mimeTypes = new LinkedList<String>();
0190:                if ((htmlParsableMimeTypes == null)
0191:                        || (htmlParsableMimeTypes.length() == 0)) {
0192:                    return;
0193:                }
0194:                String[] realtimeParsableMimeTypeList = htmlParsableMimeTypes
0195:                        .split(",");
0196:                for (int i = 0; i < realtimeParsableMimeTypeList.length; i++) {
0197:                    mimeTypes.add(realtimeParsableMimeTypeList[i].toLowerCase()
0198:                            .trim());
0199:                }
0200:                synchronized (supportedHTMLMimeTypes) {
0201:                    supportedHTMLMimeTypes.clear();
0202:                    supportedHTMLMimeTypes.addAll(mimeTypes);
0203:                }
0204:            }
0205:
0206:            public static List<String> extString2extList(String extString) {
0207:                LinkedList<String> extensions = new LinkedList<String>();
0208:                if ((extString == null) || (extString.length() == 0)) {
0209:                    return extensions;
0210:                } else {
0211:                    String[] xs = extString.split(",");
0212:                    for (int i = 0; i < xs.length; i++)
0213:                        extensions.add(xs[i].toLowerCase().trim());
0214:                }
0215:                return extensions;
0216:            }
0217:
0218:            public static void initMediaExt(List<String> mediaExtList) {
0219:                synchronized (mediaExtSet) {
0220:                    mediaExtSet.clear();
0221:                    mediaExtSet.addAll(mediaExtList);
0222:                }
0223:            }
0224:
0225:            public static void initImageExt(List<String> imageExtList) {
0226:                synchronized (imageExtSet) {
0227:                    imageExtSet.clear();
0228:                    imageExtSet.addAll(imageExtList);
0229:                }
0230:            }
0231:
0232:            public static void initAudioExt(List<String> audioExtList) {
0233:                synchronized (audioExtSet) {
0234:                    audioExtSet.clear();
0235:                    audioExtSet.addAll(audioExtList);
0236:                }
0237:            }
0238:
0239:            public static void initVideoExt(List<String> videoExtList) {
0240:                synchronized (videoExtSet) {
0241:                    videoExtSet.clear();
0242:                    videoExtSet.addAll(videoExtList);
0243:                }
0244:            }
0245:
0246:            public static void initAppsExt(List<String> appsExtList) {
0247:                synchronized (appsExtSet) {
0248:                    appsExtSet.clear();
0249:                    appsExtSet.addAll(appsExtList);
0250:                }
0251:            }
0252:
0253:            public static String getMediaExtList() {
0254:                synchronized (mediaExtSet) {
0255:                    return mediaExtSet.toString();
0256:                }
0257:            }
0258:
0259:            public static void initSupportedHTMLFileExt(
0260:                    List<String> supportedRealtimeFileExtList) {
0261:                synchronized (supportedHTMLFileExt) {
0262:                    supportedHTMLFileExt.clear();
0263:                    supportedHTMLFileExt.addAll(supportedRealtimeFileExtList);
0264:                }
0265:            }
0266:
0267:            public static boolean HTMLParsableMimeTypesContains(String mimeType) {
0268:                mimeType = normalizeMimeType(mimeType);
0269:                synchronized (supportedHTMLMimeTypes) {
0270:                    return supportedHTMLMimeTypes.contains(mimeType);
0271:                }
0272:            }
0273:
0274:            public static boolean supportedHTMLContent(yacyURL url,
0275:                    String mimeType) {
0276:                return HTMLParsableMimeTypesContains(mimeType)
0277:                        && supportedHTMLFileExtContains(url);
0278:            }
0279:
0280:            public static boolean supportedHTMLFileExtContains(yacyURL url) {
0281:                String fileExt = getFileExt(url);
0282:                synchronized (supportedHTMLFileExt) {
0283:                    return supportedHTMLFileExt.contains(fileExt);
0284:                }
0285:            }
0286:
0287:            public static String getFileExt(yacyURL url) {
0288:                // getting the file path
0289:                String name = url.getPath();
0290:
0291:                // tetermining last position of / in the file path
0292:                int p = name.lastIndexOf('/');
0293:                if (p != -1) {
0294:                    name = name.substring(p);
0295:                }
0296:
0297:                // termining last position of . in file path
0298:                p = name.lastIndexOf('.');
0299:                if (p < 0)
0300:                    return "";
0301:                return name.substring(p + 1);
0302:            }
0303:
0304:            public static boolean mediaExtContains(String mediaExt) {
0305:                if (mediaExt == null)
0306:                    return false;
0307:                mediaExt = mediaExt.trim().toLowerCase();
0308:
0309:                synchronized (supportedHTMLFileExt) {
0310:                    if (supportedHTMLFileExt.contains(mediaExt))
0311:                        return false;
0312:                }
0313:
0314:                if (supportedFileExtContains(mediaExt))
0315:                    return false;
0316:
0317:                synchronized (mediaExtSet) {
0318:                    return mediaExtSet.contains(mediaExt);
0319:                }
0320:            }
0321:
0322:            public static boolean imageExtContains(String imageExt) {
0323:                if (imageExt == null)
0324:                    return false;
0325:                synchronized (imageExtSet) {
0326:                    return imageExtSet.contains(imageExt.trim().toLowerCase());
0327:                }
0328:            }
0329:
0330:            public static boolean audioExtContains(String audioExt) {
0331:                if (audioExt == null)
0332:                    return false;
0333:                synchronized (audioExtSet) {
0334:                    return audioExtSet.contains(audioExt.trim().toLowerCase());
0335:                }
0336:            }
0337:
0338:            public static boolean videoExtContains(String videoExt) {
0339:                if (videoExt == null)
0340:                    return false;
0341:                synchronized (videoExtSet) {
0342:                    return videoExtSet.contains(videoExt.trim().toLowerCase());
0343:                }
0344:            }
0345:
0346:            public static boolean appsExtContains(String appsExt) {
0347:                if (appsExt == null)
0348:                    return false;
0349:                synchronized (appsExtSet) {
0350:                    return appsExtSet.contains(appsExt.trim().toLowerCase());
0351:                }
0352:            }
0353:
0354:            public static String getRealCharsetEncoding(String encoding) {
0355:                if ((encoding == null) || (encoding.length() == 0))
0356:                    return "ISO-8859-1";
0357:
0358:                // trim encoding string
0359:                encoding = encoding.trim();
0360:
0361:                if (encoding.toLowerCase().startsWith("windows")
0362:                        && encoding.length() > 7) {
0363:                    char c = encoding.charAt(7);
0364:                    if (c == '_')
0365:                        encoding = "windows-" + encoding.substring(8);
0366:                    else if ((c >= '0') && (c <= '9'))
0367:                        encoding = "windows-" + encoding.substring(7);
0368:                }
0369:
0370:                if (encoding.toLowerCase().startsWith("iso")
0371:                        && encoding.length() > 3) {
0372:                    char c = encoding.charAt(3);
0373:                    if (c == '_')
0374:                        encoding = "ISO-" + encoding.substring(4);
0375:                    else if ((c >= '0') && (c <= '9'))
0376:                        encoding = "ISO-" + encoding.substring(3);
0377:                }
0378:
0379:                if (encoding.toLowerCase().startsWith("iso")
0380:                        && encoding.length() > 8) {
0381:                    char c = encoding.charAt(8);
0382:                    if (c == '_')
0383:                        encoding = encoding.substring(0, 8) + "-"
0384:                                + encoding.substring(9);
0385:                    else if ((c >= '0') && (c <= '9'))
0386:                        encoding = encoding.substring(0, 8) + "-"
0387:                                + encoding.substring(8);
0388:                }
0389:
0390:                // converting cp\d{4} -> windows-\d{4}
0391:                if (encoding.toLowerCase().matches("cp([_-])?125[0-8]")) {
0392:                    char c = encoding.charAt(2);
0393:                    if (c == '_' || c == '-')
0394:                        encoding = "windows-" + encoding.substring(3);
0395:                    else if ((c >= '0') && (c <= '9'))
0396:                        encoding = "windows-" + encoding.substring(2);
0397:                }
0398:
0399:                if (encoding.toLowerCase().matches("gb[_-]?2312([-_]80)?")) {
0400:                    encoding = "x-EUC-CN";
0401:                }
0402:
0403:                if (encoding.toLowerCase().matches(".*utf[-_]?8.*")) {
0404:                    encoding = "UTF-8";
0405:                }
0406:
0407:                return encoding;
0408:            }
0409:
0410:            public static String normalizeMimeType(String mimeType) {
0411:                //if (mimeType == null) doMimeTypeAnalysis
0412:                if (mimeType == null)
0413:                    mimeType = "application/octet-stream";
0414:                mimeType = mimeType.trim().toLowerCase();
0415:
0416:                int pos = mimeType.indexOf(';');
0417:                return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
0418:            }
0419:
0420:            public static String getMimeTypeByFileExt(String fileExt) {
0421:                return mimeTypeLookupByFileExt.getProperty(fileExt,
0422:                        "application/octet-stream");
0423:            }
0424:
0425:            public HashMap<String, ParserInfo> getAvailableParserList() {
0426:                return plasmaParser.availableParserList;
0427:            }
0428:
0429:            private static void loadAvailableParserList() {
0430:                try {
0431:                    plasmaParser.availableParserList.clear();
0432:
0433:                    // getting the current java classpath
0434:                    String javaClassPath = System
0435:                            .getProperty("java.class.path");
0436:
0437:                    // getting the current package name
0438:                    String plasmaParserPkgName = plasmaParser.class
0439:                            .getPackage().getName()
0440:                            + ".parser";
0441:                    serverLog.logInfo("PARSER",
0442:                            "Searching for additional content parsers in package "
0443:                                    + plasmaParserPkgName);
0444:
0445:                    // getting an uri to the parser subpackage
0446:                    String packageURI = plasmaParser.class.getResource(
0447:                            "/" + plasmaParserPkgName.replace('.', '/'))
0448:                            .toString();
0449:                    serverLog.logFine("PARSER", "Parser directory is "
0450:                            + packageURI);
0451:
0452:                    // open the parser directory
0453:                    File parserDir = new File(new URI(packageURI));
0454:                    if ((parserDir == null) || (!parserDir.exists())
0455:                            || (!parserDir.isDirectory()))
0456:                        return;
0457:
0458:                    /*
0459:                     * loop through all subdirectories and test if we can
0460:                     * find an additional parser class
0461:                     */
0462:                    File[] parserDirectories = parserDir
0463:                            .listFiles(parserDirectoryFilter);
0464:                    if (parserDirectories == null)
0465:                        return;
0466:
0467:                    for (int parserDirNr = 0; parserDirNr < parserDirectories.length; parserDirNr++) {
0468:                        File currentDir = parserDirectories[parserDirNr];
0469:                        serverLog.logFine("PARSER", "Searching in directory "
0470:                                + currentDir.toString());
0471:
0472:                        String[] parserClasses = currentDir
0473:                                .list(parserFileNameFilter);
0474:                        if (parserClasses == null)
0475:                            continue;
0476:
0477:                        for (int parserNr = 0; parserNr < parserClasses.length; parserNr++) {
0478:                            serverLog.logFine("PARSER", "Testing parser class "
0479:                                    + parserClasses[parserNr]);
0480:                            String className = parserClasses[parserNr]
0481:                                    .substring(0, parserClasses[parserNr]
0482:                                            .indexOf(".class"));
0483:                            String fullClassName = plasmaParserPkgName + "."
0484:                                    + currentDir.getName() + "." + className;
0485:                            try {
0486:                                // trying to load the parser class by its name
0487:                                Class<?> parserClass = Class
0488:                                        .forName(fullClassName);
0489:                                Object theParser0 = (Parser) parserClass
0490:                                        .newInstance();
0491:                                if (!(theParser0 instanceof  Parser))
0492:                                    continue;
0493:                                Parser theParser = (Parser) theParser0;
0494:
0495:                                // testing if all needed libx libraries are available
0496:                                String[] neededLibx = theParser
0497:                                        .getLibxDependences();
0498:                                StringBuffer neededLibxBuf = new StringBuffer();
0499:                                if (neededLibx != null) {
0500:                                    for (int libxId = 0; libxId < neededLibx.length; libxId++) {
0501:                                        if (javaClassPath
0502:                                                .indexOf(neededLibx[libxId]) == -1) {
0503:                                            throw new Exception(
0504:                                                    "Missing dependency detected: '"
0505:                                                            + neededLibx[libxId]
0506:                                                            + "'.");
0507:                                        }
0508:                                        neededLibxBuf
0509:                                                .append(neededLibx[libxId])
0510:                                                .append(",");
0511:                                    }
0512:                                    if (neededLibxBuf.length() > 0)
0513:                                        neededLibxBuf
0514:                                                .deleteCharAt(neededLibxBuf
0515:                                                        .length() - 1);
0516:                                }
0517:
0518:                                // loading the list of mime-types that are supported by this parser class
0519:                                Hashtable<String, String> supportedMimeTypes = theParser
0520:                                        .getSupportedMimeTypes();
0521:
0522:                                // creating a parser info object
0523:                                ParserInfo parserInfo = new ParserInfo();
0524:                                parserInfo.parserClass = parserClass;
0525:                                parserInfo.parserClassName = fullClassName;
0526:                                parserInfo.libxDependencies = neededLibx;
0527:                                parserInfo.supportedMimeTypes = supportedMimeTypes;
0528:                                parserInfo.parserVersionNr = ((Parser) theParser)
0529:                                        .getVersion();
0530:                                parserInfo.parserName = ((Parser) theParser)
0531:                                        .getName();
0532:
0533:                                Iterator<String> mimeTypeIterator = supportedMimeTypes
0534:                                        .keySet().iterator();
0535:                                while (mimeTypeIterator.hasNext()) {
0536:                                    String mimeType = (String) mimeTypeIterator
0537:                                            .next();
0538:                                    availableParserList.put(mimeType,
0539:                                            parserInfo);
0540:                                    serverLog
0541:                                            .logInfo(
0542:                                                    "PARSER",
0543:                                                    "Found functional parser for mimeType '"
0544:                                                            + mimeType
0545:                                                            + "'."
0546:                                                            + "\n\tName:    "
0547:                                                            + parserInfo.parserName
0548:                                                            + "\n\tVersion: "
0549:                                                            + parserInfo.parserVersionNr
0550:                                                            + "\n\tClass:   "
0551:                                                            + parserInfo.parserClassName
0552:                                                            + ((neededLibxBuf
0553:                                                                    .length() > 0) ? "\n\tDependencies: "
0554:                                                                    + neededLibxBuf
0555:                                                                            .toString()
0556:                                                                    : ""));
0557:                                }
0558:
0559:                            } catch (Exception e) { /* we can ignore this for the moment */
0560:                                serverLog
0561:                                        .logWarning(
0562:                                                "PARSER",
0563:                                                "Parser '"
0564:                                                        + className
0565:                                                        + "' doesn't work correctly and will be ignored.\n ["
0566:                                                        + e.getClass()
0567:                                                                .getName()
0568:                                                        + "]: "
0569:                                                        + e.getMessage());
0570:                                e.printStackTrace();
0571:                            } catch (Error e) { /* we can ignore this for the moment */
0572:                                serverLog
0573:                                        .logWarning(
0574:                                                "PARSER",
0575:                                                "Parser '"
0576:                                                        + className
0577:                                                        + "' doesn't work correctly and will be ignored.\n ["
0578:                                                        + e.getClass()
0579:                                                                .getName()
0580:                                                        + "]: "
0581:                                                        + e.getMessage());
0582:                                e.printStackTrace();
0583:                            }
0584:                        }
0585:                    }
0586:
0587:                } catch (Exception e) {
0588:                    serverLog.logSevere("PARSER",
0589:                            "Unable to determine all installed parsers. "
0590:                                    + e.getMessage());
0591:                }
0592:            }
0593:
0594:            public void close() {
0595:                // clearing the parser list
0596:                Iterator<plasmaParserConfig> configs = parserConfigList
0597:                        .values().iterator();
0598:                while (configs.hasNext()) {
0599:                    plasmaParserConfig currentConfig = configs.next();
0600:                    synchronized (currentConfig.enabledParserList) {
0601:                        currentConfig.enabledParserList.clear();
0602:                    }
0603:                }
0604:            }
0605:
0606:            public plasmaParserDocument parseSource(yacyURL location,
0607:                    String mimeType, String charset, byte[] sourceArray)
0608:                    throws InterruptedException, ParserException {
0609:                ByteArrayInputStream byteIn = null;
0610:                try {
0611:                    if (this .theLogger.isFine())
0612:                        this .theLogger.logFine("Parsing '" + location
0613:                                + "' from byte-array");
0614:
0615:                    // testing if the resource is not empty
0616:                    if (sourceArray == null || sourceArray.length == 0) {
0617:                        String errorMsg = "No resource content available (1).";
0618:                        this .theLogger.logInfo("Unable to parse '" + location
0619:                                + "'. " + errorMsg);
0620:                        throw new ParserException(errorMsg, location,
0621:                                plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
0622:                    }
0623:
0624:                    // creating an InputStream
0625:                    byteIn = new ByteArrayInputStream(sourceArray);
0626:
0627:                    // parsing the temp file
0628:                    return parseSource(location, mimeType, charset,
0629:                            sourceArray.length, byteIn);
0630:
0631:                } catch (Exception e) {
0632:                    // Interrupted- and Parser-Exceptions should pass through
0633:                    if (e instanceof  InterruptedException)
0634:                        throw (InterruptedException) e;
0635:                    if (e instanceof  ParserException)
0636:                        throw (ParserException) e;
0637:
0638:                    // log unexpected error
0639:                    this .theLogger.logSevere(
0640:                            "Unexpected exception in parseSource from byte-array: "
0641:                                    + e.getMessage(), e);
0642:                    throw new ParserException(
0643:                            "Unexpected exception while parsing " + location,
0644:                            location, e);
0645:                } finally {
0646:                    if (byteIn != null)
0647:                        try {
0648:                            byteIn.close();
0649:                        } catch (Exception ex) {/* ignore this */
0650:                        }
0651:                }
0652:
0653:            }
0654:
0655:            public plasmaParserDocument parseSource(yacyURL location,
0656:                    String theMimeType, String theDocumentCharset,
0657:                    File sourceFile) throws InterruptedException,
0658:                    ParserException {
0659:
0660:                BufferedInputStream sourceStream = null;
0661:                try {
0662:                    if (this .theLogger.isFine())
0663:                        this .theLogger.logFine("Parsing '" + location
0664:                                + "' from file");
0665:
0666:                    // testing if the resource is not empty
0667:                    if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile
0668:                            .length() > 0)) {
0669:                        String errorMsg = sourceFile.exists() ? "Empty resource file."
0670:                                : "No resource content available (2).";
0671:                        this .theLogger.logInfo("Unable to parse '" + location
0672:                                + "'. " + errorMsg);
0673:                        throw new ParserException(errorMsg, location,
0674:                                plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
0675:                    }
0676:
0677:                    // create a new InputStream
0678:                    sourceStream = new BufferedInputStream(new FileInputStream(
0679:                            sourceFile));
0680:
0681:                    // parsing the data
0682:                    return this .parseSource(location, theMimeType,
0683:                            theDocumentCharset, sourceFile.length(),
0684:                            sourceStream);
0685:
0686:                } catch (Exception e) {
0687:                    // Interrupted- and Parser-Exceptions should pass through
0688:                    if (e instanceof  InterruptedException)
0689:                        throw (InterruptedException) e;
0690:                    if (e instanceof  ParserException)
0691:                        throw (ParserException) e;
0692:
0693:                    // log unexpected error
0694:                    this .theLogger.logSevere(
0695:                            "Unexpected exception in parseSource from File: "
0696:                                    + e.getMessage(), e);
0697:                    throw new ParserException(
0698:                            "Unexpected exception while parsing " + location,
0699:                            location, e);
0700:                } finally {
0701:                    if (sourceStream != null)
0702:                        try {
0703:                            sourceStream.close();
0704:                        } catch (Exception ex) {/* ignore this */
0705:                        }
0706:                }
0707:            }
0708:
0709:            /**
0710:             * To parse a resource from an {@link InputStream}
0711:             * @param location the URL of the resource
0712:             * @param theMimeType the resource mimetype (<code>null</code> if unknown)
0713:             * @param theDocumentCharset the charset of the resource (<code>null</code> if unknown)
0714:             * @param contentLength the content length of the resource (<code>-1</code> if unknown)
0715:             * @param sourceStream an {@link InputStream} containing the resource body 
0716:             * @return the parsed {@link plasmaParserDocument document}
0717:             * @throws InterruptedException
0718:             * @throws ParserException
0719:             */
0720:            public plasmaParserDocument parseSource(yacyURL location,
0721:                    String theMimeType, String theDocumentCharset,
0722:                    long contentLength, InputStream sourceStream)
0723:                    throws InterruptedException, ParserException {
0724:                Parser theParser = null;
0725:                String mimeType = null;
0726:                try {
0727:                    if (this .theLogger.isFine())
0728:                        this .theLogger.logFine("Parsing '" + location
0729:                                + "' from stream");
0730:
0731:                    // getting the mimetype of the document
0732:                    mimeType = normalizeMimeType(theMimeType);
0733:
0734:                    // getting the file extension of the document
0735:                    String fileExt = getFileExt(location);
0736:
0737:                    // getting the charset of the document
0738:                    // TODO: do a charset detection here ....
0739:                    String documentCharset = getRealCharsetEncoding(theDocumentCharset);
0740:
0741:                    // testing if parsing is supported for this resource
0742:                    if (!plasmaParser.supportedContent(location, mimeType)) {
0743:                        String errorMsg = "No parser available to parse mimetype '"
0744:                                + mimeType + "'";
0745:                        this .theLogger.logInfo("Unable to parse '" + location
0746:                                + "'. " + errorMsg);
0747:                        throw new ParserException(errorMsg, location,
0748:                                plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
0749:                    }
0750:
0751:                    if (this .theLogger.isFine())
0752:                        this .theLogger.logInfo("Parsing " + location
0753:                                + " with mimeType '" + mimeType
0754:                                + "' and file extension '" + fileExt + "'.");
0755:
0756:                    // getting the correct parser for the given mimeType
0757:                    theParser = this .getParser(mimeType);
0758:
0759:                    // if a parser was found we use it ...
0760:                    plasmaParserDocument doc = null;
0761:                    if (theParser != null) {
0762:                        // set the content length of the resource
0763:                        theParser.setContentLength(contentLength);
0764:                        // parse the resource
0765:                        doc = theParser.parse(location, mimeType,
0766:                                documentCharset, sourceStream);
0767:                    } else if (HTMLParsableMimeTypesContains(mimeType)) {
0768:                        doc = parseHtml(location, mimeType, documentCharset,
0769:                                sourceStream);
0770:                    } else {
0771:                        String errorMsg = "No parser available to parse mimetype '"
0772:                                + mimeType + "'";
0773:                        this .theLogger.logInfo("Unable to parse '" + location
0774:                                + "'. " + errorMsg);
0775:                        throw new ParserException(errorMsg, location,
0776:                                plasmaCrawlEURL.DENIED_WRONG_MIMETYPE_OR_EXT);
0777:                    }
0778:
0779:                    // check result
0780:                    if (doc == null) {
0781:                        String errorMsg = "Unexpected error. Parser returned null.";
0782:                        this .theLogger.logInfo("Unable to parse '" + location
0783:                                + "'. " + errorMsg);
0784:                        throw new ParserException(errorMsg, location);
0785:                    }
0786:                    return doc;
0787:
0788:                } catch (UnsupportedEncodingException e) {
0789:                    String errorMsg = "Unsupported charset encoding: "
0790:                            + e.getMessage();
0791:                    this .theLogger.logSevere("Unable to parse '" + location
0792:                            + "'. " + errorMsg);
0793:                    throw new ParserException(errorMsg, location,
0794:                            plasmaCrawlEURL.DENIED_UNSUPPORTED_CHARSET);
0795:                } catch (Exception e) {
0796:                    // Interrupted- and Parser-Exceptions should pass through
0797:                    if (e instanceof  InterruptedException)
0798:                        throw (InterruptedException) e;
0799:                    if (e instanceof  ParserException)
0800:                        throw (ParserException) e;
0801:
0802:                    // log unexpected error
0803:                    String errorMsg = "Unexpected exception. " + e.getMessage();
0804:                    this .theLogger.logSevere("Unable to parse '" + location
0805:                            + "'. " + errorMsg, e);
0806:                    throw new ParserException(errorMsg, location, e);
0807:
0808:                } finally {
0809:                    if (theParser != null) {
0810:                        theParser = null; // delete object
0811:                    }
0812:                }
0813:            }
0814:
0815:            private plasmaParserDocument parseHtml(yacyURL location,
0816:                    String mimeType, String documentCharset,
0817:                    InputStream sourceStream) throws IOException,
0818:                    ParserException {
0819:
0820:                // make a scraper and transformer
0821:                htmlFilterInputStream htmlFilter = new htmlFilterInputStream(
0822:                        sourceStream, documentCharset, location, null, false);
0823:                String charset = htmlFilter.detectCharset();
0824:                if (charset == null) {
0825:                    charset = documentCharset;
0826:                } else {
0827:                    charset = getRealCharsetEncoding(charset);
0828:                }
0829:
0830:                if (!documentCharset.equalsIgnoreCase(charset)) {
0831:                    this .theLogger
0832:                            .logInfo("Charset transformation needed from '"
0833:                                    + documentCharset + "' to '" + charset
0834:                                    + "'");
0835:                }
0836:
0837:                // parsing the content
0838:                htmlFilterContentScraper scraper = new htmlFilterContentScraper(
0839:                        location);
0840:                htmlFilterWriter writer = new htmlFilterWriter(null, null,
0841:                        scraper, null, false);
0842:                serverFileUtils.copy(htmlFilter, writer, charset);
0843:                writer.close();
0844:                //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);            
0845:                //serverFileUtils.copy(sourceFile, hfos);
0846:                //hfos.close();
0847:                if (writer.binarySuspect()) {
0848:                    String errorMsg = "Binary data found in resource";
0849:                    this .theLogger.logSevere("Unable to parse '" + location
0850:                            + "'. " + errorMsg);
0851:                    throw new ParserException(errorMsg, location);
0852:                }
0853:                return transformScraper(location, mimeType, documentCharset,
0854:                        scraper);
0855:            }
0856:
0857:            public plasmaParserDocument transformScraper(yacyURL location,
0858:                    String mimeType, String charSet,
0859:                    htmlFilterContentScraper scraper) {
0860:                String[] sections = new String[scraper.getHeadlines(1).length
0861:                        + scraper.getHeadlines(2).length
0862:                        + scraper.getHeadlines(3).length
0863:                        + scraper.getHeadlines(4).length];
0864:                int p = 0;
0865:                for (int i = 1; i <= 4; i++)
0866:                    for (int j = 0; j < scraper.getHeadlines(i).length; j++)
0867:                        sections[p++] = scraper.getHeadlines(i)[j];
0868:                plasmaParserDocument ppd = new plasmaParserDocument(location,
0869:                        mimeType, charSet, scraper.getKeywords(), scraper
0870:                                .getTitle(), scraper.getAuthor(), sections,
0871:                        scraper.getDescription(), scraper.getText(), scraper
0872:                                .getAnchors(), scraper.getImages());
0873:                //scraper.close();            
0874:                ppd.setFavicon(scraper.getFavicon());
0875:                return ppd;
0876:            }
0877:
0878:            /**
0879:             * This function is used to determine the parser class that should be used for a given
0880:             * mimetype ...
0881:             * @param mimeType MIME-Type of the resource
0882:             * @return the {@link Parser}-class that is supposed to parse the resource of
0883:             * the given MIME-Type
0884:             */
0885:            private Parser getParser(String mimeType) {
0886:
0887:                mimeType = normalizeMimeType(mimeType);
0888:                try {
0889:
0890:                    // determining the proper parser class name for the mimeType
0891:                    String parserClassName = null;
0892:                    ParserInfo parserInfo = null;
0893:                    synchronized (plasmaParser.availableParserList) {
0894:                        if (plasmaParser.availableParserList
0895:                                .containsKey(mimeType)) {
0896:                            parserInfo = (ParserInfo) plasmaParser.availableParserList
0897:                                    .get(mimeType);
0898:                            parserClassName = parserInfo.parserClassName;
0899:                        } else {
0900:                            return null;
0901:                        }
0902:                    }
0903:
0904:                    // fetching a new parser object from pool  
0905:                    Parser theParser = makeParser(parserClassName);
0906:
0907:                    // checking if the created parser really supports the given mimetype 
0908:                    Hashtable<String, String> supportedMimeTypes = theParser
0909:                            .getSupportedMimeTypes();
0910:                    if ((supportedMimeTypes != null)
0911:                            && (supportedMimeTypes.containsKey(mimeType))) {
0912:                        parserInfo.incUsageCounter();
0913:                        return theParser;
0914:                    }
0915:
0916:                } catch (Exception e) {
0917:                    System.err
0918:                            .println("ERROR: Unable to load the correct parser for type "
0919:                                    + mimeType);
0920:                }
0921:
0922:                return null;
0923:
0924:            }
0925:
0926:            static Map<yacyURL, String> allReflinks(Set<?> links) {
0927:                // links is either a Set of Strings (with urls) or htmlFilterImageEntries
0928:                // we find all links that are part of a reference inside a url
0929:                HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
0930:                Iterator<?> i = links.iterator();
0931:                Object o;
0932:                yacyURL url;
0933:                String u;
0934:                int pos;
0935:                loop: while (i.hasNext())
0936:                    try {
0937:                        o = i.next();
0938:                        if (o instanceof  yacyURL)
0939:                            url = (yacyURL) o;
0940:                        else if (o instanceof  String)
0941:                            url = new yacyURL((String) o, null);
0942:                        else if (o instanceof  htmlFilterImageEntry)
0943:                            url = ((htmlFilterImageEntry) o).url();
0944:                        else {
0945:                            assert false;
0946:                            continue;
0947:                        }
0948:                        u = url.toNormalform(true, true);
0949:                        if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
0950:                            i.remove();
0951:                            u = u.substring(pos);
0952:                            while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
0953:                                u = u.substring(pos);
0954:                            url = new yacyURL(u, null);
0955:                            if (!(v.containsKey(url)))
0956:                                v.put(url, "ref");
0957:                            continue loop;
0958:                        }
0959:                        if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) {
0960:                            i.remove();
0961:                            u = "http:/" + u.substring(pos);
0962:                            while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
0963:                                u = "http:/" + u.substring(pos);
0964:                            url = new yacyURL(u, null);
0965:                            if (!(v.containsKey(url)))
0966:                                v.put(url, "ref");
0967:                            continue loop;
0968:                        }
0969:                    } catch (MalformedURLException e) {
0970:                    }
0971:                return v;
0972:            }
0973:
0974:            static Map<yacyURL, String> allSubpaths(Set<?> links) {
0975:                // links is either a Set of Strings (urls) or a Set of htmlFilterImageEntries
0976:                HashMap<yacyURL, String> v = new HashMap<yacyURL, String>();
0977:                Iterator<?> i = links.iterator();
0978:                Object o;
0979:                yacyURL url;
0980:                String u;
0981:                int pos;
0982:                while (i.hasNext())
0983:                    try {
0984:                        o = i.next();
0985:                        if (o instanceof  yacyURL)
0986:                            url = (yacyURL) o;
0987:                        else if (o instanceof  String)
0988:                            url = new yacyURL((String) o, null);
0989:                        else if (o instanceof  htmlFilterImageEntry)
0990:                            url = ((htmlFilterImageEntry) o).url();
0991:                        else {
0992:                            assert false;
0993:                            continue;
0994:                        }
0995:                        u = url.toNormalform(true, true);
0996:                        if (u.endsWith("/"))
0997:                            u = u.substring(0, u.length() - 1);
0998:                        pos = u.lastIndexOf("/");
0999:                        while (pos > 8) {
1000:                            u = u.substring(0, pos + 1);
1001:                            url = new yacyURL(u, null);
1002:                            if (!(v.containsKey(url)))
1003:                                v.put(url, "sub");
1004:                            u = u.substring(0, pos);
1005:                            pos = u.lastIndexOf("/");
1006:                        }
1007:                    } catch (MalformedURLException e) {
1008:                    }
1009:                return v;
1010:            }
1011:
1012:            public static void main(String[] args) {
1013:                //javac -sourcepath source source/de/anomic/plasma/plasmaParser.java
1014:                //java -cp source de.anomic.plasma.plasmaParser bug.html bug.out
1015:                httpc remote = null;
1016:                try {
1017:                    Object content = null;
1018:                    yacyURL contentURL = null;
1019:                    long contentLength = -1;
1020:                    String contentMimeType = "application/octet-stream";
1021:                    String charSet = "UTF-8";
1022:
1023:                    if (args.length < 2) {
1024:                        System.err
1025:                                .println("Usage: java de.anomic.plasma.plasmaParser (-f filename|-u URL) [-m mimeType]");
1026:                    }
1027:
1028:                    String mode = args[0];
1029:                    if (mode.equalsIgnoreCase("-f")) {
1030:                        content = new File(args[1]);
1031:                        contentURL = new yacyURL((File) content);
1032:                    } else if (mode.equalsIgnoreCase("-u")) {
1033:                        contentURL = new yacyURL(args[1], null);
1034:
1035:                        // downloading the document content
1036:                        remote = new httpc(contentURL.getHost(), contentURL
1037:                                .getHost(), contentURL.getPort(), 5000,
1038:                                contentURL.getProtocol().equalsIgnoreCase(
1039:                                        "https"), null, null, null);
1040:
1041:                        httpc.response res = remote.GET(contentURL.getFile(),
1042:                                null);
1043:                        if (res.statusCode != 200) {
1044:                            System.err.println("Unable to download "
1045:                                    + contentURL + ". " + res.status);
1046:                            return;
1047:                        }
1048:                        content = res.getContentInputStream();
1049:                        contentMimeType = res.responseHeader.mime();
1050:                        charSet = res.responseHeader.getCharacterEncoding();
1051:                        contentLength = res.responseHeader.contentLength();
1052:                        remote.close();
1053:                    }
1054:
1055:                    if ((args.length >= 4) && (args[2].equalsIgnoreCase("-m"))) {
1056:                        contentMimeType = args[3];
1057:                    }
1058:
1059:                    if ((args.length >= 6) && (args[4].equalsIgnoreCase("-c"))) {
1060:                        charSet = args[5];
1061:                    }
1062:
1063:                    // creating a plasma parser
1064:                    plasmaParser theParser = new plasmaParser();
1065:
1066:                    // configuring the html parsable mimeTypes
1067:                    plasmaParser
1068:                            .initHTMLParsableMimeTypes("application/xhtml+xml,text/html,text/plain,text/sgml");
1069:
1070:                    // parsing the content
1071:                    plasmaParserDocument document = null;
1072:                    if (content instanceof  byte[]) {
1073:                        document = theParser.parseSource(contentURL,
1074:                                contentMimeType, charSet, (byte[]) content);
1075:                    } else if (content instanceof  File) {
1076:                        document = theParser.parseSource(contentURL,
1077:                                contentMimeType, charSet, (File) content);
1078:                    } else if (content instanceof  InputStream) {
1079:                        document = theParser.parseSource(contentURL,
1080:                                contentMimeType, charSet, contentLength,
1081:                                (InputStream) content);
1082:                    }
1083:
1084:                    // printing out all parsed sentences
1085:                    if (document != null) {
1086:                        System.out.print("Document titel: ");
1087:                        System.out.println(document.dc_title());
1088:
1089:                        // found text
1090:                        final Iterator<StringBuffer> sentences = document
1091:                                .getSentences(false);
1092:                        int i = 0;
1093:                        if (sentences != null)
1094:                            while (sentences.hasNext()) {
1095:                                System.out.print("line " + i + ": ");
1096:                                System.out.println(sentences.next().toString());
1097:                                i++;
1098:                            }
1099:
1100:                        // found links
1101:                        int anchorNr = 0;
1102:                        Map<yacyURL, String> anchors = document.getAnchors();
1103:                        Iterator<yacyURL> anchorIter = anchors.keySet()
1104:                                .iterator();
1105:                        while (anchorIter.hasNext()) {
1106:                            yacyURL key = anchorIter.next();
1107:                            System.out
1108:                                    .println("URL " + anchorNr + ":\t"
1109:                                            + key.toString() + " | "
1110:                                            + anchors.get(key));
1111:                            anchorNr++;
1112:                        }
1113:                        document.close();
1114:                    }
1115:                } catch (Exception e) {
1116:                    e.printStackTrace();
1117:                }
1118:            }
1119:
1120:            public static boolean supportedContent(yacyURL url, String mimeType) {
1121:                if (url == null)
1122:                    throw new NullPointerException();
1123:
1124:                Iterator<plasmaParserConfig> configs = parserConfigList
1125:                        .values().iterator();
1126:                while (configs.hasNext()) {
1127:                    plasmaParserConfig currentConfig = configs.next();
1128:                    synchronized (currentConfig.enabledParserList) {
1129:                        if (currentConfig.supportedContent(url, mimeType))
1130:                            return true;
1131:                    }
1132:                }
1133:
1134:                return false;
1135:            }
1136:
1137:            public static boolean supportedContent(String parserMode,
1138:                    yacyURL url, String mimeType) {
1139:                if (!PARSER_MODE.contains(parserMode))
1140:                    throw new IllegalArgumentException();
1141:                if (url == null)
1142:                    throw new NullPointerException();
1143:
1144:                if (parserMode.equals(PARSER_MODE_IMAGE))
1145:                    return true;
1146:                plasmaParserConfig config = (plasmaParserConfig) parserConfigList
1147:                        .get(parserMode);
1148:                return (config == null) ? false : config.supportedContent(url,
1149:                        mimeType);
1150:            }
1151:
1152:            public static void initParseableMimeTypes(String parserMode,
1153:                    String configStr) {
1154:                if (!PARSER_MODE.contains(parserMode))
1155:                    throw new IllegalArgumentException();
1156:
1157:                plasmaParserConfig config = (plasmaParserConfig) parserConfigList
1158:                        .get(parserMode);
1159:                if (config == null) {
1160:                    config = new plasmaParserConfig(parserMode);
1161:                    parserConfigList.put(parserMode, config);
1162:                }
1163:                config.initParseableMimeTypes(configStr);
1164:            }
1165:
1166:            public static String[] setEnabledParserList(String parserMode,
1167:                    Set<String> mimeTypeSet) {
1168:                if (!PARSER_MODE.contains(parserMode))
1169:                    throw new IllegalArgumentException();
1170:
1171:                plasmaParserConfig config = (plasmaParserConfig) parserConfigList
1172:                        .get(parserMode);
1173:                if (config == null) {
1174:                    config = new plasmaParserConfig(parserMode);
1175:                    parserConfigList.put(parserMode, config);
1176:                }
1177:                return config.setEnabledParserList(mimeTypeSet);
1178:            }
1179:
1180:            public static boolean supportedFileExtContains(String fileExt) {
1181:                Iterator<plasmaParserConfig> configs = parserConfigList
1182:                        .values().iterator();
1183:                while (configs.hasNext()) {
1184:                    plasmaParserConfig currentConfig = configs.next();
1185:                    synchronized (currentConfig.enabledParserList) {
1186:                        if (currentConfig.supportedFileExtContains(fileExt))
1187:                            return true;
1188:                    }
1189:                }
1190:
1191:                return false;
1192:            }
1193:
1194:            public static boolean supportedMimeTypesContains(String mimeType) {
1195:                Iterator<plasmaParserConfig> configs = parserConfigList
1196:                        .values().iterator();
1197:                while (configs.hasNext()) {
1198:                    plasmaParserConfig currentConfig = configs.next();
1199:                    synchronized (currentConfig.enabledParserList) {
1200:                        if (currentConfig.supportedMimeTypesContains(mimeType))
1201:                            return true;
1202:                    }
1203:                }
1204:
1205:                return false;
1206:            }
1207:
1208:            public static Parser makeParser(Object name) throws Exception {
1209:
1210:                if (!(name instanceof  String))
1211:                    throw new IllegalArgumentException(
1212:                            "The object key must be of type string.");
1213:
1214:                // loading class by name
1215:                Class<?> moduleClass = Class.forName((String) name);
1216:
1217:                // instantiating class
1218:                Parser theParser = (Parser) moduleClass.newInstance();
1219:
1220:                // setting logger that should by used
1221:                String parserShortName = ((String) name).substring(
1222:                        "de.anomic.plasma.parser.".length(), ((String) name)
1223:                                .lastIndexOf("."));
1224:
1225:                serverLog theLogger = new serverLog("PARSER."
1226:                        + parserShortName.toUpperCase());
1227:                theParser.setLogger(theLogger);
1228:
1229:                return theParser;
1230:            }
1231:
1232:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.