001: /*
002: * (c) Copyright 2007 by Volker Bergmann. All rights reserved.
003: *
004: * Redistribution and use in source and binary forms, with or without
005: * modification, is permitted under the terms of the
006: * GNU General Public License.
007: *
008: * For redistributing this software or a derivative work under a license other
009: * than the GPL-compatible Free Software License as defined by the Free
010: * Software Foundation or approved by OSI, you must first obtain a commercial
011: * license to this software product from Volker Bergmann.
012: *
013: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
014: * WITHOUT A WARRANTY OF ANY KIND. ALL EXPRESS OR IMPLIED CONDITIONS,
015: * REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF
016: * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE
017: * HEREBY EXCLUDED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
018: * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
019: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
020: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
021: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
022: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
023: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
024: * POSSIBILITY OF SUCH DAMAGE.
025: */
026:
027: package org.databene.benerator.util;
028:
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.databene.benerator.Generator;
import org.databene.benerator.Sequence;
import org.databene.benerator.factory.GeneratorFactory;
import org.databene.commons.IOUtil;
import org.databene.commons.ReaderLineIterator;
import org.databene.commons.StringUtil;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
041:
042: /**
043: * Reads a text file, shuffles its lines and writes it to another file.<br/>
044: * <br/>
045: * Created: 16.07.2007 20:29:10
046: */
047: public class LineShuffler {
048:
049: public static final Log logger = LogFactory
050: .getLog(LineShuffler.class);
051:
052: public static void main(String[] args) throws IOException {
053: if (args.length < 2) {
054: printHelp();
055: System.exit(-1);
056: }
057: String inFilename = args[0];
058: String outFilename = args[1];
059: int bufferSize = (args.length > 2 ? Integer.parseInt(args[2])
060: : 100000);
061: shuffle(inFilename, outFilename, bufferSize);
062: }
063:
064: public static void shuffle(String inFilename, String outFilename,
065: int bufferSize) throws IOException {
066: logger.info("shuffling " + inFilename + " and writing to "
067: + outFilename + " (max. " + bufferSize + " lines)");
068: ReaderLineIterator iterator = new ReaderLineIterator(
069: new BufferedReader(new FileReader(inFilename)));
070: List<String> lines = read(bufferSize, iterator);
071: shuffle(lines);
072: save(lines, outFilename);
073: }
074:
075: public static void shuffle(List<String> lines) {
076: int size = lines.size();
077: Generator<Integer> indexGenerator = GeneratorFactory
078: .getNumberGenerator(Integer.class, 0, size - 1, 1,
079: Sequence.RANDOM, 0);
080: int iterations = size / 2;
081: for (int i = 0; i < iterations; i++) {
082: int i1 = indexGenerator.generate();
083: int i2;
084: do {
085: i2 = indexGenerator.generate();
086: } while (i1 == i2);
087: String tmp = lines.get(i1);
088: lines.set(i1, lines.get(i2));
089: lines.set(i2, tmp);
090: }
091: }
092:
093: // private helpers -------------------------------------------------------------------------------------------------
094:
095: private static List<String> read(int bufferSize,
096: ReaderLineIterator iterator) {
097: List<String> lines = new ArrayList<String>(Math.max(100000,
098: bufferSize));
099: int lineCount = 0;
100: while (iterator.hasNext() && lineCount < bufferSize) {
101: String line = iterator.next();
102: if (!StringUtil.isEmpty(line)) {
103: lines.add(line);
104: lineCount++;
105: if (lineCount % 100000 == 100000)
106: logger.info("parsed " + lineCount + " lines");
107: }
108: }
109: return lines;
110: }
111:
112: private static void save(List<String> lines, String outputFilename)
113: throws IOException {
114: logger.info("saving " + outputFilename + "...");
115: PrintWriter printer = new PrintWriter(new BufferedWriter(
116: new FileWriter(outputFilename)));
117: try {
118: for (String line : lines)
119: printer.println(line);
120: } finally {
121: IOUtil.close(printer);
122: }
123: }
124:
125: private static void printHelp() {
126: System.out.println("Parameters: inFile outFile [buffer size]");
127: }
128: }
|