001: /*
002: * $Id: Page.java,v 1.12 2004/10/10 14:25:23 csaltos Exp $
003: *
004: * Copyright 2001 PUCE [http://www.puce.edu.ec]
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018:
019: package org.oxyus.crawler;
020:
021: import java.io.BufferedInputStream;
022: import java.io.IOException;
023: import java.io.InputStream;
024: import java.net.ConnectException;
025: import java.net.HttpURLConnection;
026: import java.net.MalformedURLException;
027: import java.net.URL;
028: import java.net.UnknownHostException;
029: import java.sql.PreparedStatement;
030: import java.sql.ResultSet;
031: import java.sql.SQLException;
032: import java.sql.Statement;
033:
034: import org.apache.log4j.Logger;
035: import org.apache.lucene.document.Document;
036: import org.oxyus.crawler.parser.HTMLParser;
037: import org.oxyus.store.Store;
038: import org.oxyus.util.Path;
039:
040: /**
041: * Page persistance.
042: *
043: * @author Carlos Saltos (csaltos[@]users.sourceforge.net)
044: */
045: public class Page {
046:
047: protected Logger log;
048:
049: /**
050: * Page state for signal a page not collected
051: */
052: public static int PAGE_NOT_COLLECTED = 0;
053:
054: /**
055: * Page state for signal a page beign collected
056: */
057: public static int PAGE_COLLECTING = 1;
058:
059: /**
060: * Page state for signal a page already collected
061: */
062: public static int PAGE_COLLECTED = 2;
063:
064: /**
065: * Primary key
066: */
067: protected int code;
068:
069: /**
070: * Server this page belongs
071: */
072: protected Server server;
073:
074: /**
075: * Page collecting state
076: */
077: protected int state;
078:
079: /**
080: * Ruta de la page.
081: */
082: protected String path;
083:
084: /**
085: * Connection to oxyus repository
086: */
087: protected Store store;
088:
089: /**
090: * Crawling scope
091: */
092: protected Scope scope;
093:
094: /**
095: * Creates a page
096: */
097: public Page() {
098: log = Logger.getLogger(Page.class);
099: server = new Server();
100: reset();
101: }
102:
103: /**
104: * Reset the internal properties
105: */
106: public void reset() {
107: setCode(-1);
108: setState(PAGE_NOT_COLLECTED);
109: setPath(null);
110: if (server != null) {
111: server.reset();
112: }
113: }
114:
115: public void read() throws SQLException {
116: boolean success = false;
117: if (store == null) {
118: throw new SQLException("Connection not opened");
119: }
120: PreparedStatement statement = store
121: .prepareStatement("select code_server, state, path "
122: + "from ox_page where code_page = ?");
123: statement.setInt(1, this .getCode());
124: ResultSet result = statement.executeQuery();
125: if (result.next()) {
126: setState(result.getInt("state"));
127: setPath(result.getString("path"));
128: // Load the server
129: this .server = new Server();
130: this .server.setStore(this .store);
131: this .server.setCode(result.getInt("code_server"));
132: this .server.read();
133: success = true;
134: }
135: result.close();
136: statement.close();
137: if (!success) {
138: throw new SQLException("Page not found");
139: }
140: }
141:
142: /**
143: * Reads the next page in not collected state.
144: */
145: public boolean nextForCollect() throws CrawlingException {
146: try {
147: boolean hasNextForCollect = false;
148: if (store == null) {
149: throw new SQLException("Connection not opened");
150: }
151: PreparedStatement statement = store
152: .prepareStatement("select code_page from ox_page where state = ?");
153: statement.setInt(1, Page.PAGE_NOT_COLLECTED);
154: ResultSet result = statement.executeQuery();
155: if (result.next()) {
156: this .setCode(result.getInt("code_page"));
157: hasNextForCollect = true;
158: }
159: result.close();
160: statement.close();
161: // If a next page for collect was located, read it
162: if (hasNextForCollect) {
163: this .read();
164: }
165: return hasNextForCollect;
166: } catch (SQLException sqle) {
167: log.error("Unable to get the next page for collect", sqle);
168: throw new CrawlingException(
169: "Unable to get the next page for " + "collect",
170: sqle);
171: }
172: }
173:
174: /**
175: * Calculate a page address based in a link without add it to the crawler
176: * scope
177: */
178: public void recordLink(String link) {
179: recordLink(link, false);
180: }
181:
182: /**
183: * Calculate a page address based in a link with the posibility to
184: * add it to the crawler scope
185: */
186: public void recordLink(String link, boolean addToRules) {
187: if (link == null) {
188: log.warn("Attemp to record a null link");
189: return;
190: }
191: link = link.trim();
192: if (link.equals("")) {
193: log.warn("Attemp to record an empty link");
194: return;
195: }
196: URL url = null;
197: // Test if the link is an absolute URL address
198: try {
199: url = new URL(link);
200: } catch (MalformedURLException e) {
201: // The link is not a correct absolute address
202: url = null;
203: }
204: // If the link is not an absolute URL address, so create a complete
205: // URL address using this link as a reference
206: if (url == null) {
207: if (server == null) {
208: log.error("Server NULL registering address: " + link);
209: return;
210: }
211: try {
212: // If the link begins with / is a local absolute link, so
213: // try to create the URL with the direct link
214: if (link.charAt(0) == '/') {
215: url = new URL(server.getProtocol() + "://"
216: + server.getHost() + ":" + server.getPort()
217: + link);
218: } else {
219: // Try to construct a relative address using the link
220: String prefix = getPath();
221: if (prefix == null) {
222: log
223: .error("Path not established registering address: "
224: + link);
225: return;
226: }
227: int length = prefix.length();
228: if (length == 0) {
229: log
230: .error("Path not established registering address: "
231: + link);
232: return;
233: }
234: if (prefix.charAt(length - 1) != '/') {
235: int lastSlash = prefix.lastIndexOf('/');
236: if (lastSlash == -1) {
237: log
238: .error("Path error registering address: "
239: + link);
240: return;
241: }
242: prefix = prefix.substring(0, lastSlash + 1);
243: }
244: url = new URL("http://" + server.getHost() + ":"
245: + server.getPort()
246: + Path.normalize(prefix + link));
247: }
248: } catch (MalformedURLException e) {
249: log
250: .warn("Attemp to create an URL address using the link '"
251: + link + "' has failed");
252: return;
253: }
254: }
255: // Creates a server and page persistance objects and stores them
256: if (url.getProtocol().toLowerCase().equals("http")) {
257: if (addToRules) {
258: scope.acceptDomain(url.getHost());
259: }
260: if (scope.inScope(url.getHost())) {
261: Server server = new Server();
262: server.setStore(store);
263: server.setProtocol(url.getProtocol());
264: server.setHost(url.getHost());
265: server.setPort(url.getPort());
266: log.debug("Port " + server.getPort());
267: try {
268: Page page = new Page();
269: page.setStore(store);
270: page.setServer(server);
271: page.setPath(url.getPath());
272: page.setState(Page.PAGE_NOT_COLLECTED);
273: // Store the page with no duplication
274: page.locateOrCreate();
275: } catch (SQLException e) {
276: log.error("Error registering page: "
277: + url.toExternalForm(), e);
278: }
279: }
280: }
281: }
282:
283: /**
284: * Stores this new page is the path is not duplicated for the same server
285: */
286: public void locateOrCreate() throws SQLException {
287: if (store == null) {
288: throw new SQLException("Connection not opened");
289: }
290: if (this .getServer() == null) {
291: throw new SQLException("Server not specified");
292: }
293: if (this .getPath() == null) {
294: throw new SQLException("Path not specified");
295: }
296: // Locate the server or create it if no exists
297: this .getServer().locateOrCreate();
298: // Check if the path is not duplicated in the same server
299: PreparedStatement statement = store
300: .prepareStatement("select code_page from ox_page, ox_server where "
301: + "ox_page.code_server = ox_server.code_server and "
302: + "ox_server.code_server = ? and " + "path = ?");
303: statement.setInt(1, this .getServer().getCode());
304: statement.setString(2, this .getPath());
305: ResultSet result = statement.executeQuery();
306: if (result.next()) {
307: // If a page with the same path exists for the same server
308: // only loads its code and return
309: this .setCode(result.getInt("code_page"));
310: result.close();
311: statement.close();
312: return;
313: }
314: // If no match page found creates it with not collected state
315: this .setCode(store.nextCode("page"));
316: statement = store
317: .prepareStatement("insert into ox_page(code_page, code_server, state, path) "
318: + "values(?,?,?,?)");
319: statement.setInt(1, this .getCode());
320: statement.setInt(2, this .getServer().getCode());
321: statement.setInt(3, Page.PAGE_NOT_COLLECTED);
322: statement.setString(4, this .getPath());
323: statement.executeUpdate();
324: statement.close();
325: }
326:
327: /**
328: * Sets the page state in PAGE_COLLECTING
329: */
330: public void markAsCollecting() throws CrawlingException {
331: Statement sentencia = null;
332: if (store == null) {
333: throw new CrawlingException("Connection not opened");
334: }
335: try {
336: sentencia = store.createStatement();
337: if (sentencia == null) {
338: throw new CrawlingException("Unable to create sentence");
339: }
340: sentencia.executeUpdate("UPDATE ox_page SET state='"
341: + PAGE_COLLECTING + "' WHERE code_page="
342: + getCode());
343: try {
344: sentencia.close();
345: } catch (SQLException e) {
346: log.error("Unable to close the sentence correctly");
347: }
348: } catch (SQLException e) {
349: throw new CrawlingException("Error closing page", e);
350: }
351: }
352:
353: /**
354: * Sets the page state in PAGE_COLLECTED.
355: */
356: public void markAsCollected() throws CrawlingException {
357: Statement sentencia = null;
358: if (store == null) {
359: throw new CrawlingException("Connection not opened");
360: }
361: try {
362: sentencia = store.createStatement();
363: if (sentencia == null) {
364: throw new CrawlingException("Unable to create sentence");
365: }
366: sentencia
367: .executeUpdate("UPDATE ox_page SET state='"
368: + PAGE_COLLECTED + "' WHERE code_page="
369: + getCode());
370: try {
371: sentencia.close();
372: } catch (SQLException e) {
373: log.error("Unable to close the sentence correctly");
374: }
375: } catch (SQLException e) {
376: throw new CrawlingException("Error closing page", e);
377: }
378: }
379:
380: public Document index() throws CrawlingException {
381: log.debug("BEGINIG indexing page [code=" + getCode() + "] ...");
382: URL url = null;
383: InputStream in = null;
384: String contentType = null;
385: try {
386: // Reconstruct the page's url
387: url = new URL(getServer().getProtocol() + "://"
388: + getServer().getHost() + ":"
389: + getServer().getPort() + getPath());
390: // Open an HTTP connection with the page's url
391: HttpURLConnection pageContent = (HttpURLConnection) url
392: .openConnection();
393: // If the response code of the page's connection is not ok
394: // continue collecting the next page
395: if (pageContent.getResponseCode() != HttpURLConnection.HTTP_OK) {
396: log.debug("page pk[" + getCode() + ","
397: + url.toExternalForm() + "] is invalid");
398: return null;
399: }
400: // If a redirection is detected store it as a new address
401: // and continue collecting the next page
402: String redireccion = pageContent.getHeaderField("location");
403: if (redireccion != null) {
404: log.debug("Page " + url.toExternalForm()
405: + " redirected to " + redireccion);
406: recordLink(redireccion);
407: return null;
408: }
409: contentType = pageContent.getContentType();
410: in = new BufferedInputStream(pageContent.getInputStream(),
411: 32768); // TODO: Add a constant
412: } catch (MalformedURLException e) {
413: log.error("Invalid page address", e);
414: } catch (ConnectException e) {
415: if (getServer() != null) {
416: log.error("Unable to connect to page: "
417: + getServer().getProtocol() + "://"
418: + getServer().getHost() + ":"
419: + getServer().getPort() + getPath(), e);
420: }
421: } catch (UnknownHostException uhe) {
422: log.warn("Unknow host indexing page " + getURL(), uhe);
423: } catch (IOException e) {
424: log.warn("Unable to index page " + getURL(), e);
425: }
426:
427: Document doc = generateDocument(contentType, in);
428:
429: log.debug("END indexing page [code=" + getCode() + "]");
430:
431: return doc;
432:
433: }
434:
435: public Document generateDocument(String contentType, InputStream in)
436: throws CrawlingException {
437: Document doc = null;
438: if (contentType.indexOf("text/html") == 0) {
439: doc = new HTMLParser(this ).collect(in);
440: } else {
441: log.warn("No indexor for content type " + contentType);
442: }
443: return doc;
444: }
445:
446: public String getURL() {
447: String address = null;
448: if (server.getProtocol() != null && server.getHost() != null
449: && this .getPath() != null) {
450: address = server.getProtocol() + "://" + server.getHost()
451: + ":" + server.getPort() + this .getPath();
452: }
453: return address;
454: }
455:
456: public void setStore(Store connection) {
457: this .store = connection;
458: if (server != null) {
459: server.setStore(connection);
460: }
461: }
462:
463: public int getCode() {
464: return code;
465: }
466:
467: public void setCode(int code) {
468: this .code = code;
469: }
470:
471: public Server getServer() {
472: return server;
473: }
474:
475: public void setServer(Server server) {
476: this .server = server;
477: }
478:
479: public int getState() {
480: return state;
481: }
482:
483: public void setState(int state) {
484: if (state != PAGE_NOT_COLLECTED && state != PAGE_COLLECTING
485: && state != PAGE_COLLECTED) {
486: log.error("Invalid page state " + state);
487: throw new IllegalArgumentException("Invalid page state"
488: + state);
489: }
490: this .state = state;
491: }
492:
493: public String getPath() {
494: return path;
495: }
496:
497: public void setPath(String path) {
498: // TODO: shrink path
499: this .path = path;
500: if (path != null) {
501: this .path = path.trim();
502: }
503: }
504:
505: public Scope getScope() {
506: return scope;
507: }
508:
509: public void setScope(Scope dominiosPermitidos) {
510: this.scope = dominiosPermitidos;
511: }
512:
513: }
|