001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of Web-Harvest may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "Web-Harvest" in the
035: subject line.
036: */
037: package org.webharvest.runtime.processors;
038:
039: import org.apache.log4j.Logger;
040: import org.webharvest.definition.BaseElementDef;
041: import org.webharvest.definition.IElementDef;
042: import org.webharvest.runtime.Scraper;
043: import org.webharvest.runtime.ScraperContext;
044: import org.webharvest.runtime.templaters.BaseTemplater;
045: import org.webharvest.runtime.variables.*;
046: import org.webharvest.utils.CommonUtil;
047:
048: import java.io.*;
049:
050: /**
051: * Base processor that contains common processor logic.
052: * All other processors extend this class.
053: */
054: abstract public class BaseProcessor {
055:
056: protected static Logger log = Logger.getLogger(BaseProcessor.class);
057:
058: abstract public IVariable execute(Scraper scraper,
059: ScraperContext context);
060:
061: protected BaseElementDef elementDef;
062:
063: protected BaseProcessor() {
064: }
065:
066: /**
067: * Base constructor - assigns element definition to the processor.
068: * @param elementDef
069: */
070: protected BaseProcessor(BaseElementDef elementDef) {
071: this .elementDef = elementDef;
072: }
073:
074: /**
075: * Wrapper for the execute method. Adds controling and logging logic.
076: */
077: public IVariable run(Scraper scraper, ScraperContext context) {
078: long startTime = System.currentTimeMillis();
079:
080: int runningLevel = scraper.getRunningLevel();
081:
082: String id = (this .elementDef != null) ? BaseTemplater.execute(
083: this .elementDef.getId(), scraper.getScriptEngine())
084: : null;
085: String idDesc = id != null ? "[ID=" + id + "] " : "";
086: String indent = CommonUtil.replicate(" ", runningLevel - 1);
087:
088: log.info(indent + CommonUtil.getClassName(this )
089: + " starts processing..." + idDesc);
090:
091: scraper.increaseRunningLevel();
092: IVariable result = execute(scraper, context);
093: scraper.decreaseRunningLevel();
094:
095: // if debug mode is true and processor ID is not null then write debugging file
096: if (scraper.isDebugMode() && id != null) {
097: writeDebugFile(result, id, scraper);
098: }
099:
100: log.info(indent + CommonUtil.getClassName(this )
101: + " processor executed in "
102: + (System.currentTimeMillis() - startTime) + "ms."
103: + idDesc);
104:
105: return result;
106: }
107:
108: protected IVariable[] executeBody(BaseElementDef elementDef,
109: Scraper scraper, ScraperContext context) {
110: IElementDef[] defs = elementDef.getOperationDefs();
111: IVariable[] result = new IVariable[Math.max(defs.length, 1)]; // at least one element
112:
113: if (defs.length > 0) {
114: for (int i = 0; i < defs.length; i++) {
115: BaseProcessor processor = ProcessorResolver
116: .createProcessor(defs[i]);
117: result[i] = processor.run(scraper, context);
118: }
119: } else {
120: result[0] = new NodeVariable(elementDef.getBodyText());
121: }
122:
123: return result;
124: }
125:
126: protected void debug(BaseElementDef elementDef, Scraper scraper,
127: IVariable variable) {
128: String id = (elementDef != null) ? BaseTemplater.execute(
129: elementDef.getId(), scraper.getScriptEngine()) : null;
130:
131: if (scraper.isDebugMode() && id != null) {
132: if (variable != null) {
133: writeDebugFile(variable, id, scraper);
134: }
135: }
136: }
137:
138: protected IVariable getBodyTextContent(BaseElementDef elementDef,
139: Scraper scraper, ScraperContext context) {
140: if (elementDef == null) {
141: return null;
142: } else if (elementDef.hasOperations()) {
143: IVariable[] vars = executeBody(elementDef, scraper, context);
144: return Appender.appendText(vars);
145: } else {
146: return new NodeVariable(elementDef.getBodyText());
147: }
148: }
149:
150: protected IVariable getBodyBinaryContent(BaseElementDef elementDef,
151: Scraper scraper, ScraperContext context) {
152: if (elementDef == null) {
153: return null;
154: } else if (elementDef.hasOperations()) {
155: IVariable[] vars = executeBody(elementDef, scraper, context);
156: return Appender.appendBinary(vars);
157: } else {
158: return new NodeVariable(elementDef.getBodyText().getBytes());
159: }
160: }
161:
162: protected IVariable getBodyListContent(BaseElementDef elementDef,
163: Scraper scraper, ScraperContext context) {
164: IVariable[] vars = executeBody(elementDef, scraper, context);
165:
166: ListVariable listVariable = new ListVariable();
167: for (int i = 0; i < vars.length; i++) {
168: if (!vars[i].isEmpty()) {
169: listVariable.addVariable(vars[i]);
170: }
171: }
172:
173: return listVariable;
174: }
175:
176: public BaseElementDef getElementDef() {
177: return elementDef;
178: }
179:
180: private void writeDebugFile(IVariable var, String processorId,
181: Scraper scraper) {
182: byte[] data = var == null ? new byte[] {} : var.toString()
183: .getBytes();
184:
185: String workingDir = scraper.getWorkingDir();
186: String dir = CommonUtil.getAbsoluteFilename(workingDir,
187: "_debug");
188:
189: int index = 1;
190: String fullPath = dir + "/" + processorId + "_" + index
191: + ".debug";
192: while (new File(fullPath).exists()) {
193: index++;
194: fullPath = dir + "/" + processorId + "_" + index + ".debug";
195: }
196:
197: FileOutputStream out;
198: try {
199: new File(dir).mkdirs();
200: out = new FileOutputStream(fullPath, false);
201: out.write(data);
202: out.flush();
203: out.close();
204: } catch (FileNotFoundException e) {
205: e.printStackTrace();
206: } catch (IOException e) {
207: e.printStackTrace();
208: }
209: }
210:
211: }
|