001: // /xml/util/gettitle_p.java
002: // -------------------------------
003: // part of the AnomicHTTPD caching proxy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2004, 2005
007: //
008: // last major change: 29.12.2005
009: // this file is contributed by Alexander Schier
010: //
011: // This program is free software; you can redistribute it and/or modify
012: // it under the terms of the GNU General Public License as published by
013: // the Free Software Foundation; either version 2 of the License, or
014: // (at your option) any later version.
015: //
016: // This program is distributed in the hope that it will be useful,
017: // but WITHOUT ANY WARRANTY; without even the implied warranty of
018: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: // GNU General Public License for more details.
020: //
021: // You should have received a copy of the GNU General Public License
022: // along with this program; if not, write to the Free Software
023: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: //
025: // Using this software in any meaning (reading, learning, copying, compiling,
026: // running) means that you agree that the Author(s) is (are) not responsible
027: // for cost, loss of data or any harm that may be caused directly or indirectly
028: // by usage of this softare or this documentation. The usage of this software
029: // is on your own risk. The installation and usage (starting/running) of this
030: // software may allow other people or application to access your computer and
031: // any attached devices and is highly dependent on the configuration of the
032: // software which must be done by the user of the software; the author(s) is
033: // (are) also not responsible for proper configuration and usage of the
034: // software, even if provoked by documentation provided together with
035: // the software.
036: //
037: // Any changes to this file according to the GPL as documented in the file
038: // gpl.txt aside this file in the shipment you received can be done to the
039: // lines that follows this copyright notice here, but changes must not be
040: // done inside the copyright notive above. A re-distribution must contain
041: // the intact and unchanged copyright notice.
042: // Contributions and changes to the program code must be marked as such.
043:
044: // You must compile this file with
// javac -classpath .:../classes xml/util/getpageinfo_p.java
046: // if the shell's current path is HTROOT
047: package xml.util;
048:
049: import java.io.IOException;
050: import java.io.Writer;
051: import java.net.MalformedURLException;
052:
053: import de.anomic.data.robotsParser;
054: import de.anomic.htmlFilter.htmlFilterContentScraper;
055: import de.anomic.htmlFilter.htmlFilterWriter;
056: import de.anomic.http.httpHeader;
057: import de.anomic.http.httpc;
058: import de.anomic.plasma.plasmaSwitchboard;
059: import de.anomic.server.serverFileUtils;
060: import de.anomic.server.serverObjects;
061: import de.anomic.server.serverSwitch;
062: import de.anomic.yacy.yacyURL;
063:
064: public class getpageinfo_p {
065: public static serverObjects respond(httpHeader header,
066: serverObjects post, serverSwitch env) {
067: serverObjects prop = new serverObjects();
068: prop.put("sitemap", "");
069: prop.put("title", "");
070: prop.put("favicon", "");
071: prop.put("robots-allowed", "3"); //unknown
072: String actions = "title";
073: if (post != null && post.containsKey("url")) {
074: if (post.containsKey("actions"))
075: actions = (String) post.get("actions");
076: String url = (String) post.get("url");
077: if (url.toLowerCase().startsWith("ftp://")) {
078: prop.put("robots-allowed", "1");
079: prop.putHTML("title", "FTP: " + url);
080: return prop;
081: } else if (!(url.toLowerCase().startsWith("http://") || url
082: .toLowerCase().startsWith("https://"))) {
083: url = "http://" + url;
084: }
085: if (actions.indexOf("title") >= 0) {
086: try {
087: yacyURL u = new yacyURL(url, null);
088: String contentString = new String(
089: httpc
090: .wget(
091: u,
092: u.getHost(),
093: 6000,
094: null,
095: null,
096: ((plasmaSwitchboard) env).remoteProxyConfig,
097: null, null));
098:
099: htmlFilterContentScraper scraper = new htmlFilterContentScraper(
100: u);
101: //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
102: Writer writer = new htmlFilterWriter(null, null,
103: scraper, null, false);
104: serverFileUtils.write(contentString, writer);
105: writer.close();
106:
107: // put the document title
108: prop.putHTML("title", scraper.getTitle());
109:
110: // put the favicon that belongs to the document
111: prop.put("favicon",
112: (scraper.getFavicon() == null) ? ""
113: : scraper.getFavicon().toString());
114:
115: // put keywords
116: String list[] = scraper.getKeywords();
117: for (int i = 0; i < list.length; i++) {
118: prop.putHTML("tags_" + i + "_tag", list[i]);
119: }
120: prop.put("tags", list.length);
121:
122: } catch (MalformedURLException e) { /* ignore this */
123: } catch (IOException e) { /* ignore this */
124: }
125: }
126: if (actions.indexOf("robots") >= 0) {
127: try {
128: yacyURL theURL = new yacyURL(url, null);
129:
130: // determine if crawling of the current URL is allowed
131: prop.put("robots-allowed", robotsParser
132: .isDisallowed(theURL) ? "0" : "1");
133:
134: // get the sitemap URL of the domain
135: yacyURL sitemapURL = robotsParser
136: .getSitemapURL(theURL);
137: prop.putHTML("sitemap", (sitemapURL == null) ? ""
138: : sitemapURL.toString());
139: } catch (MalformedURLException e) {
140: }
141: }
142:
143: }
144: // return rewrite properties
145: return prop;
146: }
147:
148: }
|