001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.io.*;
036: import java.net.URL;
037: import rcm.util.Str;
038:
039: public class RecordTransformer extends RewritableLinkTransformer {
040:
041: String prolog = "<HTML><HEAD><TITLE>Extracted Records</TITLE></HEAD><BODY><TABLE>\n";
042: String epilog = "</TABLE></BODY></HTML>\n";
043:
044: String recordStart = "<TR>\n<TD><A HREF=\"%u\">%n.</A>\n";
045: String recordEnd = "\n";
046: String recordDivider = "";
047:
048: String fieldStart = " <TD>";
049: String fieldEnd = "\n";
050: String fieldDivider = "";
051:
052: int nRecords = 0;
053:
054: public RecordTransformer(String filename) throws IOException {
055: super (filename);
056: }
057:
058: public synchronized void setProlog(String prolog) {
059: this .prolog = prolog;
060: }
061:
062: public synchronized String getProlog() {
063: return prolog;
064: }
065:
066: public synchronized void setEpilog(String epilog) {
067: this .epilog = epilog;
068: }
069:
070: public synchronized String getEpilog() {
071: return epilog;
072: }
073:
074: public synchronized void setRecordStart(String recordStart) {
075: this .recordStart = recordStart;
076: }
077:
078: public synchronized String getRecordStart() {
079: return recordStart;
080: }
081:
082: public synchronized void setRecordEnd(String recordEnd) {
083: this .recordEnd = recordEnd;
084: }
085:
086: public synchronized String getRecordEnd() {
087: return recordEnd;
088: }
089:
090: public synchronized void setRecordDivider(String recordDivider) {
091: this .recordDivider = recordDivider;
092: }
093:
094: public synchronized String getRecordDivider() {
095: return recordDivider;
096: }
097:
098: public synchronized void setFieldStart(String fieldStart) {
099: this .fieldStart = fieldStart;
100: }
101:
102: public synchronized String getFieldStart() {
103: return fieldStart;
104: }
105:
106: public synchronized void setFieldEnd(String fieldEnd) {
107: this .fieldEnd = fieldEnd;
108: }
109:
110: public synchronized String getFieldEnd() {
111: return fieldEnd;
112: }
113:
114: public synchronized void setFieldDivider(String fieldDivider) {
115: this .fieldDivider = fieldDivider;
116: }
117:
118: public synchronized String getFieldDivider() {
119: return fieldDivider;
120: }
121:
122: /**
123: * Flush the record page to disk. Temporarily writes the epilog.
124: */
125: public synchronized void flush() throws IOException {
126: long p = getFilePointer();
127: if (nRecords == 0)
128: emit(prolog);
129: emit(epilog);
130: seek(p);
131: super .flush();
132: }
133:
134: public synchronized int getRecordCount() {
135: return nRecords;
136: }
137:
138: public synchronized void writeRecord(Object[] fields, boolean asText)
139: throws IOException {
140: ++nRecords;
141:
142: emit((nRecords == 1) ? prolog : recordDivider);
143:
144: URL url = urlOfFirstRegion(fields);
145:
146: emitTemplate(recordStart, url, nRecords);
147: for (int i = 0; i < fields.length; ++i) {
148: if (i > 0)
149: emit(fieldDivider);
150: emit(fieldStart);
151:
152: Object f = fields[i];
153: if (f instanceof Region) {
154: Region r = (Region) fields[i];
155: if (asText)
156: write(r.toText());
157: else
158: write(r);
159: } else
160: write(f.toString());
161:
162: emit(fieldEnd);
163: }
164: emitTemplate(recordEnd, url, nRecords);
165: }
166:
167: private URL urlOfFirstRegion(Object[] fields) {
168: for (int i = 0; i < fields.length; ++i)
169: if (fields[i] instanceof Region) {
170: Region r = (Region) fields[i];
171: return r.getSource().getURL();
172: }
173: return null;
174: }
175:
176: private void emitTemplate(String template, URL url, int record)
177: throws IOException {
178: if (template == null || template.length() == 0)
179: return;
180:
181: template = Str.replace(template, "%n", String.valueOf(record));
182: template = Str.replace(template, "%u", url != null ? url
183: .toString() : "");
184: emit(template);
185: }
186:
187: /*
188: * Testing
189: *
190: public static void main (String[] args) throws Exception {
191: Pattern p = new Tagexp (args[0].replace ('_', ' ') );
192: RecordTransformer records = new RecordTransformer (args[1]);
193: for (int i=2; i<args.length; ++i) {
194: Page page = new Page (new Link (args[i]));
195: PatternMatcher m = p.match (page);
196: for (Region r = m.nextMatch(); r != null; r = m.nextMatch())
197: records.writeRecord (r.getFields (Pattern.groups), false);
198: }
199: records.close ();
200: }
201: */
202:
203: }
|