001: /*
002: * AbstractGenerator.java
003: *
004: * Version: $Revision: 1.1 $
005: *
006: * Date: $Date: 2006/03/17 00:04:38 $
007: *
008: * Copyright (c) 2002-2006, Hewlett-Packard Company and Massachusetts
009: * Institute of Technology. All rights reserved.
010: *
011: * Redistribution and use in source and binary forms, with or without
012: * modification, are permitted provided that the following conditions are
013: * met:
014: *
015: * - Redistributions of source code must retain the above copyright
016: * notice, this list of conditions and the following disclaimer.
017: *
018: * - Redistributions in binary form must reproduce the above copyright
019: * notice, this list of conditions and the following disclaimer in the
020: * documentation and/or other materials provided with the distribution.
021: *
022: * - Neither the name of the Hewlett-Packard Company nor the name of the
023: * Massachusetts Institute of Technology nor the names of their
024: * contributors may be used to endorse or promote products derived from
025: * this software without specific prior written permission.
026: *
027: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
028: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
029: * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
030: * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
031: * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
032: * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
033: * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
034: * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
035: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
036: * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
037: * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
038: * DAMAGE.
039: */
040: package org.dspace.app.sitemap;
041:
042: import java.io.File;
043: import java.io.FileOutputStream;
044: import java.io.IOException;
045: import java.io.OutputStream;
046: import java.io.PrintStream;
047: import java.util.Date;
048: import java.util.zip.GZIPOutputStream;
049:
/**
 * Base class for creating sitemaps of various kinds. A sitemap consists of one
 * or more files which list significant URLs on a site for search engines to
 * efficiently crawl. Dates of modification may also be included. A sitemap
 * index file that links to each of the sitemap files is also generated. It is
 * this index file that search engines should be directed towards.
 * <P>
 * Provides most of the required functionality, subclasses need just implement a
 * few methods that specify the "boilerplate" and text for including URLs.
 * <P>
 * All files are written in UTF-8, and the size limit returned by
 * {@link #getMaxSize()} is checked against the UTF-8 encoded (uncompressed)
 * length of the text written. This class performs no synchronization.
 * <P>
 * Typical usage:
 * <pre>
 *   AbstractGenerator g = new FooGenerator(...);
 *   while (...) {
 *       g.addURL(url, date);
 *   }
 *   g.finish();
 * </pre>
 *
 * @author Robert Tansley
 */
public abstract class AbstractGenerator {
    /** Character encoding used for every generated file and for size accounting */
    private static final String ENCODING = "UTF-8";

    /** Number of files written so far */
    protected int fileCount;

    /** Number of (UTF-8 encoded, uncompressed) bytes written to current file */
    protected int bytesWritten;

    /** Number of URLs written to current file */
    protected int urlsWritten;

    /** Directory files are written to */
    protected File outputDir;

    /** Current output, or {@code null} if no sitemap file has been started yet */
    protected PrintStream currentOutput;

    /** Size in bytes of trailing boilerplate, or -1 until it is first needed */
    private int trailingByteCount;

    /**
     * Initialize this generator to write to the given directory. This must be
     * called by any subclass constructor.
     *
     * @param outputDirIn
     *            directory to write sitemap files to
     */
    public AbstractGenerator(File outputDirIn) {
        fileCount = 0;
        outputDir = outputDirIn;
        // The trailing boilerplate is measured lazily in addURL(): calling the
        // overridable getTrailingBoilerPlate() here would run subclass code
        // before the subclass constructor has initialised its own fields.
        trailingByteCount = -1;
        currentOutput = null;
    }

    /**
     * Start writing a new sitemap file. The file is named by
     * {@link #getFilename(int)} for the current file count, and the leading
     * boilerplate is written to it immediately.
     *
     * @throws IOException
     *             if an error occurs creating the file
     */
    protected void startNewFile() throws IOException {
        String lbp = getLeadingBoilerPlate();

        currentOutput = openOutput(getFilename(fileCount));
        currentOutput.print(lbp);
        bytesWritten = encodedLength(lbp);
        urlsWritten = 0;
    }

    /**
     * Add the given URL to the sitemap. If adding it would push the current
     * file over {@link #getMaxSize()} bytes or {@link #getMaxURLs()} URLs, the
     * current file is closed and a new one is started first.
     *
     * @param url
     *            Full URL to add
     * @param lastMod
     *            Date URL was last modified, or {@code null}
     * @throws IOException
     *             if an error occurs writing
     */
    public void addURL(String url, Date lastMod) throws IOException {
        // Kick things off if this is the first call
        if (currentOutput == null) {
            startNewFile();
        }

        if (trailingByteCount < 0) {
            trailingByteCount = encodedLength(getTrailingBoilerPlate());
        }

        String newURLText = getURLText(url, lastMod);
        int newURLBytes = encodedLength(newURLText);

        // long arithmetic so the sum cannot wrap around for very large limits
        long sizeIfAdded = (long) bytesWritten + newURLBytes + trailingByteCount;

        // Only roll over when the current file already holds a URL; otherwise
        // an entry that is too large on its own would leave an empty sitemap
        // file behind and still not fit in the fresh one.
        if (urlsWritten > 0
                && (sizeIfAdded > getMaxSize() || urlsWritten + 1 > getMaxURLs())) {
            closeCurrentFile();
            startNewFile();
        }

        currentOutput.print(newURLText);
        bytesWritten += newURLBytes;
        urlsWritten++;
    }

    /**
     * Finish with the current sitemap file: write the trailing boilerplate,
     * close the stream and increment the file count.
     *
     * @throws IOException
     *             if an error occurred writing to or closing the file
     */
    protected void closeCurrentFile() throws IOException {
        currentOutput.print(getTrailingBoilerPlate());
        currentOutput.close();

        // PrintStream never throws on write failure; it only records it.
        if (currentOutput.checkError()) {
            throw new IOException("Error writing sitemap file "
                    + new File(outputDir, getFilename(fileCount)));
        }

        fileCount++;
    }

    /**
     * Complete writing sitemap files and write the index files. This is invoked
     * when all calls to {@link AbstractGenerator#addURL(String, Date)} have
     * been completed, and invalidates the generator. If no URL was ever added,
     * no sitemap file is written and the index is written for zero sitemaps.
     *
     * @return number of sitemap files written.
     *
     * @throws IOException
     *             if an error occurs writing
     */
    public int finish() throws IOException {
        // No file is open if addURL() was never called
        if (currentOutput != null) {
            closeCurrentFile();
        }

        PrintStream out = openOutput(getIndexFilename());
        try {
            writeIndex(out, fileCount);
        } finally {
            out.close();
        }

        if (out.checkError()) {
            throw new IOException("Error writing sitemap index file "
                    + new File(outputDir, getIndexFilename()));
        }

        return fileCount;
    }

    /**
     * Open a UTF-8 print stream on the named file in the output directory,
     * GZIP-compressed if {@link #useCompression()} says so.
     *
     * @param filename
     *            name of the file to create within the output directory
     * @return the stream to write text to
     * @throws IOException
     *             if the file or the compression stream cannot be created
     */
    private PrintStream openOutput(String filename) throws IOException {
        OutputStream fo = new FileOutputStream(new File(outputDir, filename));
        boolean opened = false;

        try {
            if (useCompression()) {
                fo = new GZIPOutputStream(fo);
            }

            PrintStream result = new PrintStream(fo, false, ENCODING);
            opened = true;
            return result;
        } finally {
            // Do not leak the file handle if wrapping the stream failed
            if (!opened) {
                fo.close();
            }
        }
    }

    /**
     * Return the number of bytes the given text occupies once encoded for
     * output.
     *
     * @param text
     *            text about to be written
     * @return encoded length in bytes
     * @throws IOException
     *             if the output encoding is not supported by this JVM
     */
    private static int encodedLength(String text) throws IOException {
        return text.getBytes(ENCODING).length;
    }

    /**
     * Return marked-up text to be included in a sitemap about a given URL.
     *
     * @param url
     *            URL to add information about
     * @param lastMod
     *            date URL was last modified, or {@code null} if unknown or not
     *            applicable
     * @return the mark-up to include
     */
    public abstract String getURLText(String url, Date lastMod);

    /**
     * Return the boilerplate at the top of a sitemap file.
     *
     * @return The boilerplate markup.
     */
    public abstract String getLeadingBoilerPlate();

    /**
     * Return the boilerplate at the end of a sitemap file.
     *
     * @return The boilerplate markup.
     */
    public abstract String getTrailingBoilerPlate();

    /**
     * Return the maximum size in bytes that an individual sitemap file should
     * be.
     *
     * @return the size in bytes.
     */
    public abstract int getMaxSize();

    /**
     * Return the maximum number of URLs that an individual sitemap file should
     * contain.
     *
     * @return the maximum number of URLs.
     */
    public abstract int getMaxURLs();

    /**
     * Return whether the written sitemap files and index should be
     * GZIP-compressed.
     *
     * @return {@code true} if GZIP compression should be used, {@code false}
     *         otherwise.
     */
    public abstract boolean useCompression();

    /**
     * Return the filename a sitemap at the given index should be stored at.
     *
     * @param number
     *            index of the sitemap file (zero is first).
     * @return the filename to write the sitemap to.
     */
    public abstract String getFilename(int number);

    /**
     * Get the filename the index should be written to.
     *
     * @return the filename of the index.
     */
    public abstract String getIndexFilename();

    /**
     * Write the index file.
     *
     * @param output
     *            stream to write the index to
     * @param sitemapCount
     *            number of sitemaps that were generated
     * @throws IOException
     *             if an IO error occurs
     */
    public abstract void writeIndex(PrintStream output, int sitemapCount)
            throws IOException;
}
|