001: /* WriterPoolProcessor
002: *
003: * $Id: WriterPoolProcessor.java 5029 2007-03-29 23:53:50Z gojomo $
004: *
005: * Created on July 19th, 2006
006: *
007: * Copyright (C) 2006 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.framework;
026:
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.StringWriter;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;
import javax.xml.transform.SourceLocator;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.archive.crawler.Heritrix;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlHost;
import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.deciderules.recrawl.IdenticalDigestDecideRule;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.crawler.settings.Type;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.ObjectPlusFilesInputStream;
import org.archive.io.WriterPool;
import org.archive.io.WriterPoolMember;
072:
073: /**
074: * Abstract implementation of a file pool processor.
075: * Subclass to implement for a particular {@link WriterPoolMember} instance.
076: * @author Parker Thompson
077: * @author stack
078: */
public abstract class WriterPoolProcessor extends Processor implements
        CoreAttributeConstants, CrawlStatusListener, FetchStatusCodes {
    /** Logger named for the concrete subclass. */
    private final Logger logger = Logger.getLogger(this.getClass()
            .getName());

    /**
     * Key to use asking settings for file compression value.
     */
    public static final String ATTR_COMPRESS = "compress";

    /**
     * Default as to whether we do compression of files.
     */
    public static final boolean DEFAULT_COMPRESS = true;

    /**
     * Key to use asking settings for file prefix value.
     */
    public static final String ATTR_PREFIX = "prefix";

    /**
     * Key to use asking settings for output path value
     * (one or more directories; see {@link #getOutputDirs()}).
     */
    public static final String ATTR_PATH = "path";

    /**
     * Key to use asking settings for file suffix value.
     */
    public static final String ATTR_SUFFIX = "suffix";

    /**
     * Key to use asking settings for file max size value.
     */
    public static final String ATTR_MAX_SIZE_BYTES = "max-size-bytes";

    /**
     * Key to get maximum pool size.
     *
     * This key is for maximum files active in the pool.
     */
    public static final String ATTR_POOL_MAX_ACTIVE = "pool-max-active";

    /**
     * Key to get maximum wait on pool object before we give up and
     * throw IOException.
     */
    public static final String ATTR_POOL_MAX_WAIT = "pool-max-wait";

    /**
     * Key for the maximum bytes to write attribute.
     */
    public static final String ATTR_MAX_BYTES_WRITTEN = "total-bytes-to-write";

    /**
     * Key for whether to skip writing records of content-digest repeats.
     */
    public static final String ATTR_SKIP_IDENTICAL_DIGESTS = "skip-identical-digests";

    /**
     * CrawlURI annotation indicating no record was written.
     */
    protected static final String ANNOTATION_UNWRITTEN = "unwritten";

    /**
     * Default maximum file size in bytes (100MB, decimal).
     * TODO: Check that subclasses can set a different MAX_FILE_SIZE and
     * it will be used in the constructor as default.
     */
    private static final int DEFAULT_MAX_FILE_SIZE = 100000000;

    /**
     * Default path list.
     *
     * TODO: Confirm this one gets picked up.
     */
    private static final String[] DEFAULT_PATH = { "crawl-store" };

    /**
     * Reference to pool. Transient: rebuilt by {@code setupPool} after
     * deserialization rather than serialized.
     */
    transient private WriterPool pool = null;

    /**
     * Total number of bytes written to disc.
     */
    private long totalBytesWritten = 0;

    /**
     * Calculate metadata once only; lazily cached by cacheMetadata().
     */
    transient private List<String> cachedMetadata = null;
170:
    /**
     * Constructor using a default description.
     * @param name Name of this processor.
     */
    public WriterPoolProcessor(String name) {
        this(name, "Pool of files processor");
    }
177:
178: /**
179: * @param name Name of this processor.
180: * @param description Description for this processor.
181: */
182: public WriterPoolProcessor(final String name,
183: final String description) {
184: super (name, description);
185: Type e = addElementToDefinition(new SimpleType(ATTR_COMPRESS,
186: "Compress files when " + "writing to disk.",
187: new Boolean(DEFAULT_COMPRESS)));
188: e.setOverrideable(false);
189: e = addElementToDefinition(new SimpleType(
190: ATTR_PREFIX,
191: "File prefix. "
192: + "The text supplied here will be used as a prefix naming "
193: + "writer files. For example if the prefix is 'IAH', "
194: + "then file names will look like "
195: + "IAH-20040808101010-0001-HOSTNAME.arc.gz "
196: + "...if writing ARCs (The prefix will be "
197: + "separated from the date by a hyphen).",
198: WriterPoolMember.DEFAULT_PREFIX));
199: e = addElementToDefinition(new SimpleType(
200: ATTR_SUFFIX,
201: "Suffix to tag onto "
202: + "files. If value is '${HOSTNAME}', will use hostname for "
203: + "suffix. If empty, no suffix will be added.",
204: WriterPoolMember.DEFAULT_SUFFIX));
205: e.setOverrideable(false);
206: e = addElementToDefinition(new SimpleType(ATTR_MAX_SIZE_BYTES,
207: "Max size of each file",
208: new Long(DEFAULT_MAX_FILE_SIZE)));
209: e.setOverrideable(false);
210: e = addElementToDefinition(new StringList(
211: ATTR_PATH,
212: "Where to files. "
213: + "Supply absolute or relative path. If relative, files "
214: + "will be written relative to "
215: + "the "
216: + CrawlOrder.ATTR_DISK_PATH
217: + "setting."
218: + " If more than one path specified, we'll round-robin"
219: + " dropping files to each. This setting is safe"
220: + " to change midcrawl (You can remove and add new dirs"
221: + " as the crawler progresses).",
222: getDefaultPath()));
223: e.setOverrideable(false);
224: e = addElementToDefinition(new SimpleType(
225: ATTR_POOL_MAX_ACTIVE,
226: "Maximum active files in pool. "
227: + "This setting cannot be varied over the life of a crawl.",
228: new Integer(WriterPool.DEFAULT_MAX_ACTIVE)));
229: e.setOverrideable(false);
230: e = addElementToDefinition(new SimpleType(
231: ATTR_POOL_MAX_WAIT,
232: "Maximum time to wait on pool element"
233: + " (milliseconds). This setting cannot be varied over the life"
234: + " of a crawl.", new Integer(
235: WriterPool.DEFAULT_MAXIMUM_WAIT)));
236: e.setOverrideable(false);
237: e = addElementToDefinition(new SimpleType(
238: ATTR_MAX_BYTES_WRITTEN,
239: "Total file bytes to write to disk."
240: + " Once the size of all files on disk has exceeded this "
241: + "limit, this processor will stop the crawler. "
242: + "A value of zero means no upper limit.",
243: new Long(0)));
244: e.setOverrideable(false);
245: e.setExpertSetting(true);
246: e = addElementToDefinition(new SimpleType(
247: ATTR_SKIP_IDENTICAL_DIGESTS,
248: "Whether to skip the writing of a record when URI "
249: + "history information is available and indicates the "
250: + "prior fetch had an identical content digest. "
251: + "Default is false.", new Boolean(false)));
252: e.setOverrideable(true);
253: e.setExpertSetting(true);
254: }
255:
256: protected String[] getDefaultPath() {
257: return DEFAULT_PATH;
258: }
259:
260: public synchronized void initialTasks() {
261: // Add this class to crawl state listeners and setup pool.
262: getSettingsHandler().getOrder().getController()
263: .addCrawlStatusListener(this );
264: setupPool(new AtomicInteger());
265: // Run checkpoint recovery code.
266: if (getSettingsHandler().getOrder().getController()
267: .isCheckpointRecover()) {
268: checkpointRecover();
269: }
270: }
271:
272: protected AtomicInteger getSerialNo() {
273: return ((WriterPool) getPool()).getSerialNo();
274: }
275:
    /**
     * Set up pool of files.
     *
     * Called from {@link #initialTasks()}, after a checkpoint closes the
     * pool (to reopen it at the given serial number), and after
     * deserialization.
     *
     * @param serialNo Serial number the pool should continue from.
     */
    protected abstract void setupPool(final AtomicInteger serialNo);

    /**
     * Writes a CrawlURI and its associated data to store file.
     *
     * Currently this method understands the following uri types: dns, http,
     * and https.
     *
     * @param curi CrawlURI to process.
     */
    protected abstract void innerProcess(CrawlURI curi);
290:
291: protected void checkBytesWritten() {
292: long max = getMaxToWrite();
293: if (max <= 0) {
294: return;
295: }
296: if (max <= this .totalBytesWritten) {
297: getController().requestCrawlStop(
298: "Finished - Maximum bytes (" + Long.toString(max)
299: + ") written");
300: }
301: }
302:
303: /**
304: * Whether the given CrawlURI should be written to archive files.
305: * Annotates CrawlURI with a reason for any negative answer.
306: *
307: * @param curi CrawlURI
308: * @return true if URI should be written; false otherwise
309: */
310: protected boolean shouldWrite(CrawlURI curi) {
311: // check for duplicate content write suppression
312: if (((Boolean) getUncheckedAttribute(curi,
313: ATTR_SKIP_IDENTICAL_DIGESTS))
314: && IdenticalDigestDecideRule.hasIdenticalDigest(curi)) {
315: curi.addAnnotation(ANNOTATION_UNWRITTEN
316: + ":identicalDigest");
317: return false;
318: }
319: String scheme = curi.getUURI().getScheme().toLowerCase();
320: // TODO: possibly move this sort of isSuccess() test into CrawlURI
321: boolean retVal;
322: if (scheme.equals("dns")) {
323: retVal = curi.getFetchStatus() == S_DNS_SUCCESS;
324: } else if (scheme.equals("http") || scheme.equals("https")) {
325: retVal = curi.getFetchStatus() > 0
326: && curi.isHttpTransaction();
327: } else if (scheme.equals("ftp")) {
328: retVal = curi.getFetchStatus() == 200;
329: } else {
330: // unsupported scheme
331: curi.addAnnotation(ANNOTATION_UNWRITTEN + ":scheme");
332: return false;
333: }
334: if (retVal == false) {
335: // status not deserving writing
336: curi.addAnnotation(ANNOTATION_UNWRITTEN + ":status");
337: return false;
338: }
339: return true;
340: }
341:
342: /**
343: * Return IP address of given URI suitable for recording (as in a
344: * classic ARC 5-field header line).
345: *
346: * @param curi CrawlURI
347: * @return String of IP address
348: */
349: protected String getHostAddress(CrawlURI curi) {
350: // special handling for DNS URIs: want address of DNS server
351: if (curi.getUURI().getScheme().toLowerCase().equals("dns")) {
352: return curi.getString(A_DNS_SERVER_IP_LABEL);
353: }
354: // otherwise, host referenced in URI
355: CrawlHost h = getController().getServerCache().getHostFor(curi);
356: if (h == null) {
357: throw new NullPointerException("Crawlhost is null for "
358: + curi + " " + curi.getVia());
359: }
360: InetAddress a = h.getIP();
361: if (a == null) {
362: throw new NullPointerException(
363: "Address is null for "
364: + curi
365: + " "
366: + curi.getVia()
367: + ". Address "
368: + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ? "was never looked up."
369: : (System.currentTimeMillis() - h
370: .getIpFetched())
371: + " ms ago."));
372: }
373: return h.getIP().getHostAddress();
374: }
375:
376: /**
377: * Version of getAttributes that catches and logs exceptions
378: * and returns null if failure to fetch the attribute.
379: * @param name Attribute name.
380: * @return Attribute or null.
381: */
382: public Object getAttributeUnchecked(String name) {
383: Object result = null;
384: try {
385: result = super .getAttribute(name);
386: } catch (AttributeNotFoundException e) {
387: logger.warning(e.getLocalizedMessage());
388: } catch (MBeanException e) {
389: logger.warning(e.getLocalizedMessage());
390: } catch (ReflectionException e) {
391: logger.warning(e.getLocalizedMessage());
392: }
393: return result;
394: }
395:
396: /**
397: * Max size we want files to be (bytes).
398: *
399: * Default is ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE. Note that ARC
400: * files will usually be bigger than maxSize; they'll be maxSize + length
401: * to next boundary.
402: * @return ARC maximum size.
403: */
404: public long getMaxSize() {
405: Object obj = getAttributeUnchecked(ATTR_MAX_SIZE_BYTES);
406: return (obj == null) ? DEFAULT_MAX_FILE_SIZE : ((Long) obj)
407: .longValue();
408: }
409:
410: public String getPrefix() {
411: Object obj = getAttributeUnchecked(ATTR_PREFIX);
412: return (obj == null) ? WriterPoolMember.DEFAULT_PREFIX
413: : (String) obj;
414: }
415:
416: public List<File> getOutputDirs() {
417: Object obj = getAttributeUnchecked(ATTR_PATH);
418: List list = (obj == null) ? Arrays.asList(DEFAULT_PATH)
419: : (StringList) obj;
420: ArrayList<File> results = new ArrayList<File>();
421: for (Iterator i = list.iterator(); i.hasNext();) {
422: String path = (String) i.next();
423: File f = new File(path);
424: if (!f.isAbsolute()) {
425: f = new File(getController().getDisk(), path);
426: }
427: if (!f.exists()) {
428: try {
429: f.mkdirs();
430: } catch (Exception e) {
431: e.printStackTrace();
432: continue;
433: }
434: }
435: results.add(f);
436: }
437: return results;
438: }
439:
440: public boolean isCompressed() {
441: Object obj = getAttributeUnchecked(ATTR_COMPRESS);
442: return (obj == null) ? DEFAULT_COMPRESS : ((Boolean) obj)
443: .booleanValue();
444: }
445:
446: /**
447: * @return Returns the poolMaximumActive.
448: */
449: public int getPoolMaximumActive() {
450: Object obj = getAttributeUnchecked(ATTR_POOL_MAX_ACTIVE);
451: return (obj == null) ? WriterPool.DEFAULT_MAX_ACTIVE
452: : ((Integer) obj).intValue();
453: }
454:
455: /**
456: * @return Returns the poolMaximumWait.
457: */
458: public int getPoolMaximumWait() {
459: Object obj = getAttributeUnchecked(ATTR_POOL_MAX_WAIT);
460: return (obj == null) ? WriterPool.DEFAULT_MAXIMUM_WAIT
461: : ((Integer) obj).intValue();
462: }
463:
464: public String getSuffix() {
465: Object obj = getAttributeUnchecked(ATTR_SUFFIX);
466: String sfx = (obj == null) ? WriterPoolMember.DEFAULT_SUFFIX
467: : (String) obj;
468: if (sfx != null
469: && sfx.trim()
470: .equals(WriterPoolMember.HOSTNAME_VARIABLE)) {
471: String str = "localhost.localdomain";
472: try {
473: str = InetAddress.getLocalHost().getHostName();
474: } catch (UnknownHostException ue) {
475: logger.severe("Failed getHostAddress for this host: "
476: + ue);
477: }
478: sfx = str;
479: }
480: return sfx;
481: }
482:
483: public long getMaxToWrite() {
484: Object obj = getAttributeUnchecked(ATTR_MAX_BYTES_WRITTEN);
485: return (obj == null) ? 0 : ((Long) obj).longValue();
486: }
487:
    /** Crawl is ending: close every writer in the pool. */
    public void crawlEnding(String sExitMessage) {
        this.pool.close();
    }

    public void crawlEnded(String sExitMessage) {
        // sExitMessage is unused; pool already closed in crawlEnding.
    }

    /* (non-Javadoc)
     * @see org.archive.crawler.event.CrawlStatusListener#crawlStarted(java.lang.String)
     */
    public void crawlStarted(String message) {
        // No work to do at crawl start; pool is built in initialTasks().
    }
502:
503: protected String getCheckpointStateFile() {
504: return this .getClass().getName() + ".state";
505: }
506:
    /**
     * Checkpoint callback: persist the writer serial number, then close
     * and reopen the pool so files roll over at the checkpoint boundary.
     *
     * @param checkpointDir Directory to write checkpoint state into.
     * @throws IOException If the serial-number state file cannot be written.
     */
    public void crawlCheckpoint(File checkpointDir) throws IOException {
        int serial = getSerialNo().get();
        if (this.pool.getNumActive() > 0) {
            // If we have open active Archive files, up the serial number
            // so after checkpoint, we start at one past current number and
            // so the number we serialize, is one past current serialNo.
            // All this serial number manipulation should be fine in here since
            // we're paused checkpointing (Revisit if this assumption changes).
            serial = getSerialNo().incrementAndGet();
        }
        saveCheckpointSerialNumber(checkpointDir, serial);
        // Close all ARCs on checkpoint.
        try {
            this.pool.close();
        } finally {
            // Reopen on checkpoint, continuing from the saved serial.
            setupPool(new AtomicInteger(serial));
        }
    }
526:
    public void crawlPausing(String statusMessage) {
        // statusMessage is unused; nothing to do on pause request.
    }

    public void crawlPaused(String statusMessage) {
        // statusMessage is unused; nothing to do when paused.
    }

    public void crawlResuming(String statusMessage) {
        // statusMessage is unused; nothing to do on resume.
    }
538:
    /**
     * Custom deserialization: after default field restore, schedule
     * re-creation of the transient writer pool once the whole stream
     * has finished reading.
     * NOTE(review): assumes the stream is an ObjectPlusFilesInputStream;
     * the cast fails for a plain ObjectInputStream — confirm all
     * deserialization paths use that stream type.
     */
    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        ObjectPlusFilesInputStream coistream = (ObjectPlusFilesInputStream) stream;
        coistream.registerFinishTask(new Runnable() {
            public void run() {
                setupPool(new AtomicInteger());
            }
        });
    }
549:
    /** @return The writer pool (null until setupPool has run). */
    protected WriterPool getPool() {
        return pool;
    }

    /** @param pool Writer pool for this processor to draw writers from. */
    protected void setPool(WriterPool pool) {
        this.pool = pool;
    }

    /** @return Running total of bytes written to disk. */
    protected long getTotalBytesWritten() {
        return totalBytesWritten;
    }

    /** @param totalBytesWritten New running total of bytes written. */
    protected void setTotalBytesWritten(long totalBytesWritten) {
        this.totalBytesWritten = totalBytesWritten;
    }
565:
566: /**
567: * Called out of {@link #initialTasks()} when recovering a checkpoint.
568: * Restore state.
569: */
570: protected void checkpointRecover() {
571: int serialNo = loadCheckpointSerialNumber();
572: if (serialNo != -1) {
573: getSerialNo().set(serialNo);
574: }
575: }
576:
577: /**
578: * @return Serial number from checkpoint state file or if unreadable, -1
579: * (Client should check for -1).
580: */
581: protected int loadCheckpointSerialNumber() {
582: int result = -1;
583:
584: // If in recover mode, read in the Writer serial number saved
585: // off when we checkpointed.
586: File stateFile = new File(getSettingsHandler().getOrder()
587: .getController().getCheckpointRecover().getDirectory(),
588: getCheckpointStateFile());
589: if (!stateFile.exists()) {
590: logger
591: .info(stateFile.getAbsolutePath()
592: + " doesn't exist so cannot restore Writer serial number.");
593: } else {
594: DataInputStream dis = null;
595: try {
596: dis = new DataInputStream(
597: new FileInputStream(stateFile));
598: result = dis.readShort();
599: } catch (FileNotFoundException e) {
600: e.printStackTrace();
601: } catch (IOException e) {
602: e.printStackTrace();
603: } finally {
604: try {
605: if (dis != null) {
606: dis.close();
607: }
608: } catch (IOException e) {
609: e.printStackTrace();
610: }
611: }
612: }
613: return result;
614: }
615:
616: protected void saveCheckpointSerialNumber(final File checkpointDir,
617: final int serialNo) throws IOException {
618: // Write out the current state of the ARCWriter serial number.
619: File f = new File(checkpointDir, getCheckpointStateFile());
620: DataOutputStream dos = new DataOutputStream(
621: new FileOutputStream(f));
622: try {
623: dos.writeShort(serialNo);
624: } finally {
625: dos.close();
626: }
627: }
628:
629: /**
630: * Return list of metadatas to add to first arc file metadata record.
631: *
632: * Default is to stylesheet the order file. To specify stylesheet,
633: * override {@link #getFirstrecordStylesheet()}.
634: *
635: * Get xml files from settingshandler. Currently order file is the
636: * only xml file. We're NOT adding seeds to meta data.
637: *
638: * @return List of strings and/or files to add to arc file as metadata or
639: * null.
640: */
641: public synchronized List<String> getMetadata() {
642: if (this .cachedMetadata != null) {
643: return this .cachedMetadata;
644: }
645: return cacheMetadata();
646: }
647:
648: protected synchronized List<String> cacheMetadata() {
649: if (this .cachedMetadata != null) {
650: return this .cachedMetadata;
651: }
652:
653: // If no stylesheet, return empty metadata.
654: if (getFirstrecordStylesheet() == null
655: || getFirstrecordStylesheet().length() == 0) {
656: this .cachedMetadata = new ArrayList<String>(1);
657: this .cachedMetadata.add("");
658: return this .cachedMetadata;
659: }
660:
661: List<String> result = null;
662: if (!XMLSettingsHandler.class.isInstance(getSettingsHandler())) {
663: logger
664: .warning("Expected xml settings handler (No warcinfo).");
665: // Early return
666: return result;
667: }
668:
669: XMLSettingsHandler xsh = (XMLSettingsHandler) getSettingsHandler();
670: File orderFile = xsh.getOrderFile();
671: if (!orderFile.exists() || !orderFile.canRead()) {
672: logger.severe("File " + orderFile.getAbsolutePath()
673: + " is does not exist or is not readable.");
674: } else {
675: result = new ArrayList<String>(1);
676: result.add(getFirstrecordBody(orderFile));
677: }
678: this .cachedMetadata = result;
679: return this .cachedMetadata;
680: }
681:
    /**
     * @return Full path to stylesheet (Its read off the CLASSPATH
     * as resource). Default is null, meaning no stylesheet and hence
     * empty metadata; subclasses override to supply one.
     */
    protected String getFirstrecordStylesheet() {
        return null;
    }
689:
690: /**
691: * Write the arc metadata body content.
692: *
693: * Its based on the order xml file but into this base we'll add other info
694: * such as machine ip.
695: *
696: * @param orderFile Order file.
697:
698: *
699: * @return String that holds the arc metaheader body.
700: */
701: protected String getFirstrecordBody(File orderFile) {
702: String result = null;
703: TransformerFactory factory = TransformerFactory.newInstance();
704: Templates templates = null;
705: Transformer xformer = null;
706: try {
707: templates = factory.newTemplates(new StreamSource(this
708: .getClass().getResourceAsStream(
709: getFirstrecordStylesheet())));
710: xformer = templates.newTransformer();
711: // Below parameter names must match what is in the stylesheet.
712: xformer.setParameter("software", "Heritrix "
713: + Heritrix.getVersion()
714: + " http://crawler.archive.org");
715: xformer.setParameter("ip", InetAddress.getLocalHost()
716: .getHostAddress());
717: xformer.setParameter("hostname", InetAddress.getLocalHost()
718: .getHostName());
719: StreamSource source = new StreamSource(new FileInputStream(
720: orderFile));
721: StringWriter writer = new StringWriter();
722: StreamResult target = new StreamResult(writer);
723: xformer.transform(source, target);
724: result = writer.toString();
725: } catch (TransformerConfigurationException e) {
726: logger.severe("Failed transform " + e);
727: } catch (FileNotFoundException e) {
728: logger.severe("Failed transform, file not found " + e);
729: } catch (UnknownHostException e) {
730: logger.severe("Failed transform, unknown host " + e);
731: } catch (TransformerException e) {
732: SourceLocator locator = e.getLocator();
733: int col = locator.getColumnNumber();
734: int line = locator.getLineNumber();
735: String publicId = locator.getPublicId();
736: String systemId = locator.getSystemId();
737: logger.severe("Transform error " + e + ", col " + col
738: + ", line " + line + ", publicId " + publicId
739: + ", systemId " + systemId);
740: }
741:
742: return result;
743: }
744: }
|