001: /*
002: * $Id: strings.java,v 1.3 2003/08/12 18:11:30 dfs Exp $
003: *
004: * ====================================================================
005: * The Apache Software License, Version 1.1
006: *
007: * Copyright (c) 2002 The Apache Software Foundation. All rights
008: * reserved.
009: *
010: * Redistribution and use in source and binary forms, with or without
011: * modification, are permitted provided that the following conditions
012: * are met:
013: *
014: * 1. Redistributions of source code must retain the above copyright
015: * notice, this list of conditions and the following disclaimer.
016: *
017: * 2. Redistributions in binary form must reproduce the above copyright
018: * notice, this list of conditions and the following disclaimer in
019: * the documentation and/or other materials provided with the
020: * distribution.
021: *
022: * 3. The end-user documentation included with the redistribution,
023: * if any, must include the following acknowledgment:
024: * "This product includes software developed by the
025: * Apache Software Foundation (http://www.apache.org/)."
026: * Alternately, this acknowledgment may appear in the software itself,
027: * if and wherever such third-party acknowledgments normally appear.
028: *
029: * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
030: * must not be used to endorse or promote products derived from this
031: * software without prior written permission. For written
032: * permission, please contact apache@apache.org.
033: *
034: * 5. Products derived from this software may not be called "Apache"
035: * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
036: * name, without prior written permission of the Apache Software Foundation.
037: *
038: * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
039: * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
040: * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
041: * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
042: * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
043: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
044: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
045: * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
046: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
047: * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
048: * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
049: * SUCH DAMAGE.
050: * ====================================================================
051: *
052: * This software consists of voluntary contributions made by many
053: * individuals on behalf of the Apache Software Foundation. For more
054: * information on the Apache Software Foundation, please see
055: * <http://www.apache.org/>.
056: */
057:
058: package examples.awk;
059:
060: import java.io.*;
061: import org.apache.oro.text.regex.*;
062: import org.apache.oro.text.awk.*;
063:
064: /**
065: * This is a test program demonstrating how to search an input stream
066: * with the jakarta-oro awk package regular expression classes. It
067: * performs a function similar to the Unix <code>strings</code> command,
068: * but is intended to show how matching on a stream is affected by its
069: * character encoding. The most important thing to remember is that
070: * AwkMatcher only matches on 8-bit values. If your input contains
071: * Java characters containing values greater than 255, the pattern
072: * matching process will result in an ArrayIndexOutOfBoundsException.
073: * Therefore, if you want to search a binary file containing arbitrary
074: * bytes, you have to make sure you use an 8-bit character encoding
075: * like ISO-8859-1, so that the mapping between byte-values and character
076: * values will be one to one. Otherwise, the file will be interpreted
077: * as UTF-8 by default, and you will probably wind up with character
078: * values outside of the 8-bit range.
079: *
080: * @version @version@
081: */
082: public final class strings {
083:
084: public static final class StringFinder {
085: /**
086: * Default string expression. Looks for at least 4 contiguous
087: * printable characters. Differs slightly from GNU strings command
088: * in that any printable character may start a string.
089: */
090: public static final String DEFAULT_PATTERN = "[\\x20-\\x7E]{3}[\\x20-\\x7E]+";
091:
092: Pattern pattern;
093: AwkMatcher matcher;
094:
095: public StringFinder(String regex)
096: throws MalformedPatternException {
097: AwkCompiler compiler = new AwkCompiler();
098: pattern = compiler.compile(regex,
099: AwkCompiler.CASE_INSENSITIVE_MASK);
100: matcher = new AwkMatcher();
101: }
102:
103: public StringFinder() throws MalformedPatternException {
104: this (DEFAULT_PATTERN);
105: }
106:
107: public void search(Reader input, PrintWriter output)
108: throws IOException {
109: MatchResult result;
110: AwkStreamInput in = new AwkStreamInput(input);
111:
112: while (matcher.contains(in, pattern)) {
113: result = matcher.getMatch();
114: output.println(result);
115: }
116: output.flush();
117: }
118: }
119:
120: public static final String DEFAULT_ENCODING = "ISO-8859-1";
121:
122: public static final void main(String args[]) {
123: String regex = StringFinder.DEFAULT_PATTERN;
124: String filename, encoding = DEFAULT_ENCODING;
125: StringFinder finder;
126: Reader file = null;
127:
128: // Some users thought it would be useful to use the default pattern
129: // and just pass the encoding as the second parameter. Therefore,
130: // when two arguments are given and the second argument is not a valid
131: // encoding, it is interpreted as a pattern. This means you can't
132: // use a valid encoding name as a pattern without also specifying
133: // an encoding as a third argument.
134: if (args.length < 1) {
135: System.err
136: .println("usage: strings file [pattern|encoding] [encoding]");
137: return;
138: } else if (args.length > 2) {
139: regex = args[1];
140: encoding = args[2];
141: } else if (args.length > 1)
142: encoding = args[1];
143:
144: filename = args[0];
145:
146: try {
147: InputStream fin = new FileInputStream(filename);
148:
149: try {
150: file = new InputStreamReader(fin, encoding);
151: } catch (UnsupportedEncodingException uee) {
152: if (args.length == 2) {
153: regex = encoding;
154: encoding = DEFAULT_ENCODING;
155: file = new InputStreamReader(fin, encoding);
156: } else
157: throw uee;
158: }
159:
160: finder = new StringFinder(regex);
161: finder.search(file, new PrintWriter(new OutputStreamWriter(
162: System.out)));
163: file.close();
164: } catch (Exception e) {
165: e.printStackTrace();
166: return;
167: }
168: }
169: }
|