001: /*
002:
003: This software is OSI Certified Open Source Software.
004: OSI Certified is a certification mark of the Open Source Initiative.
005:
006: The license (Mozilla version 1.0) can be read at the MMBase site.
007: See http://www.MMBase.org/license
008:
009: */
010: package org.mmbase.util.transformers;
011:
012: import java.util.*;
013: import java.io.*;
014:
015: import org.mmbase.util.logging.*;
016:
017: /**
018: * A chunked transformer is a transformer that transforms on a 'chunk by chunk' base. A chunck is
019: * typically a word or a line or so. The type of the 'chunks' is controled by the 'mode' parameter.
020: *
021: * It can ignored existing XML markup (the 'XMLTEXT' modes), and also avoids trailing dots and
022: * comments and surrounding quotes and parentheses.
023: *
024: * @author Michiel Meeuwissen
025: * @since MMBase-1.8
026: */
027:
028: public abstract class ChunkedTransformer extends
029: ConfigurableReaderTransformer implements CharTransformer {
030: private static final Logger log = Logging
031: .getLoggerInstance(ChunkedTransformer.class);
032:
033: /**
034: * Match word by word, but only in PCDATA of xml elements.
035: */
036: public final static int XMLTEXT_WORDS = 1;
037:
038: /**
039: * Match in PCDATA of xml elements.
040: */
041: public final static int XMLTEXT = 2;
042:
043: /**
044: * Match word by word.
045: */
046: public final static int WORDS = 3;
047:
048: /**
049: * Match line by line.
050: */
051: public final static int LINES = 4;
052:
053: /**
054: * Match the entire stream (so, one String must be created).
055: */
056: public final static int ENTIRE = 5;
057:
058: /**
059: * If this is added to the config-int, then only the first match should be used.
060: */
061: public final static int REPLACE_FIRST = 100;
062: /**
063: * If this is added to the config-int, then only the first match should be used.
064: */
065: public final static int REPLACE_FIRST_ALL = 200;
066:
067: protected boolean replaceFirst = false;
068: protected boolean replaceFirstAll = false;
069:
070: public void configure(int i) {
071: if (i >= 200) {
072: replaceFirstAll = true;
073: i -= 200;
074: }
075: if (i >= 100) {
076: replaceFirst = true;
077: i -= 100;
078: }
079: super .configure(i);
080: }
081:
082: protected ChunkedTransformer(int i) {
083: super (i);
084: }
085:
086: public ChunkedTransformer() {
087: this (WORDS);
088: }
089:
090: protected class Status {
091: int replaced = 0;
092: Set<Object> used = null;
093: {
094: if (replaceFirstAll)
095: used = new HashSet<Object>();
096: }
097: }
098:
099: protected Status newStatus() {
100: return new Status();
101:
102: }
103:
104: /**
105: * Implement this. Return true if a replacement done.
106: */
107: protected abstract boolean replace(String string, Writer w,
108: Status status) throws IOException;
109:
110: protected boolean replaceWord(StringBuilder word, Writer writer,
111: Status status) throws IOException {
112: int l = word.length();
113: StringBuilder postFix = null;
114: String w;
115: if (l > 0) {
116:
117: postFix = new StringBuilder();
118:
119: // surrounding quotes might look like " because of earlier escaping, so we take those out of consideration.
120: w = word.toString();
121: while (w.endsWith(""")) {
122: postFix.insert(0, """);
123: l -= 6;
124: word.setLength(l);
125: w = word.toString();
126: }
127: if (l > 0) {
128:
129: // to allow for . , and like in the end, we tear those of.
130: char d = word.charAt(l - 1);
131: while (!Character.isLetterOrDigit(d)) {
132: postFix.insert(0, d);
133: word.setLength(--l);
134: if (l == 0)
135: break;
136: d = word.charAt(l - 1);
137: }
138: }
139: }
140:
141: w = word.toString();
142:
143: // stuff in the beginning:
144: while (w.startsWith(""")) {
145: writer.write(""");
146: word.delete(0, 6);
147: l -= 6;
148: w = word.toString();
149: }
150:
151: // ready to make the replacements now.
152: boolean result = replace(w, writer, status);
153:
154: if (postFix != null) {
155: writer.write(postFix.toString());
156: }
157: return result;
158: }
159:
160: /**
161: * Whether still to do replacing, given status.
162: */
163: protected boolean replace(Status status) {
164: return !replaceFirst || status.replaced == 0;
165: }
166:
167: public Writer transformXmlTextWords(Reader r, Writer w) {
168: Status status = newStatus();
169: StringBuilder word = new StringBuilder(); // current word
170: boolean translating = true;
171: try {
172: log.trace("Starting replacing");
173: while (true) {
174: int c = r.read();
175: if (c == -1)
176: break;
177: if (!replace(status)) {
178: w.write(c);
179: } else if (c == '<') { // don't do it in existing tags and attributes
180: translating = false;
181: replaceWord(word, w, status);
182: w.write(c);
183: } else if (c == '>') {
184: translating = true;
185: word.setLength(0);
186: w.write(c);
187: } else if (!translating) {
188: w.write(c);
189: } else {
190: if (Character.isWhitespace((char) c) || c == '\''
191: || c == '\"' || c == '(' || c == ')') {
192: replaceWord(word, w, status);
193: word.setLength(0);
194: w.write(c);
195: } else {
196: word.append((char) c);
197: }
198: }
199: }
200: // write last word
201: if (replace(status)) {
202: if (translating)
203: replaceWord(word, w, status);
204: } else {
205: w.write(word.toString());
206: }
207: if (log.isDebugEnabled()) {
208: log.debug("Finished replacing. Replaced "
209: + status.replaced + " words");
210: }
211: } catch (java.io.IOException e) {
212: log.error(e.toString());
213: }
214: return w;
215: }
216:
217: public Writer transformXmlText(Reader r, Writer w) {
218: Status status = newStatus();
219: StringBuilder xmltext = new StringBuilder(); // current word
220: boolean translating = true;
221: try {
222: log.trace("Starting replacing");
223: while (true) {
224: int c = r.read();
225: if (c == -1)
226: break;
227: if (!replace(status)) {
228: w.write(c);
229: } else
230: // perhaps better use SAX to decently detect XML, but then it probably won't work
231: // very well on sloppy XML (like HTML).
232: if (c == '<') { // don't do it in existing tags and attributes
233: translating = false;
234: replace(xmltext.toString(), w, status);
235: w.write(c);
236: } else if (c == '>') {
237: translating = true;
238: xmltext.setLength(0);
239: w.write(c);
240: } else if (!translating) {
241: w.write(c);
242: } else {
243: xmltext.append((char) c);
244: }
245: }
246: // write last word
247: if (replace(status)) {
248: if (translating)
249: replace(xmltext.toString(), w, status);
250: } else {
251: w.write(xmltext.toString());
252: }
253: log.debug("Finished replacing. Replaced "
254: + status.replaced + " words");
255: } catch (java.io.IOException e) {
256: log.error(e.toString());
257: }
258: return w;
259: }
260:
261: public Writer transformWords(Reader r, Writer w) {
262: Status status = newStatus();
263: StringBuilder word = new StringBuilder(); // current word
264: try {
265: if (log.isDebugEnabled()) {
266: log.trace("Starting replacing words."
267: + Logging.stackTrace());
268: }
269: while (true) {
270: int c = r.read();
271: if (c == -1)
272: break;
273: if (replace(status)
274: && (Character.isWhitespace((char) c)
275: || c == '\'' || c == '\"' || c == '('
276: || c == ')' || c == '<' || c == '>')) {
277: replaceWord(word, w, status);
278: word.setLength(0);
279: w.write(c);
280: } else {
281: word.append((char) c);
282: }
283: }
284: // write last word
285: if (replace(status)) {
286: replaceWord(word, w, status);
287: } else {
288: w.write(word.toString());
289: }
290: log.debug("Finished replacing. Replaced " + status.replaced
291: + " words");
292: } catch (java.io.IOException e) {
293: log.error(e.toString());
294: }
295: return w;
296: }
297:
298: public Writer transformLines(Reader r, Writer w) {
299: BufferedReader reader = new BufferedReader(r);
300: Status status = newStatus();
301: try {
302: String line = reader.readLine();
303: while (line != null) {
304: if (replace(status)) {
305: replace(line, w, status);
306: } else {
307: w.write(line);
308: }
309: line = reader.readLine();
310: if (line != null) {
311: w.write("\n");
312: }
313: }
314: } catch (java.io.IOException e) {
315: log.error(e.toString());
316: }
317: return w;
318: }
319:
320: public Writer transformEntire(Reader r, Writer w) {
321: StringWriter sw = new StringWriter();
322: Status status = newStatus();
323: try {
324: while (true) {
325: int c = r.read();
326: if (c == -1)
327: break;
328: sw.write(c);
329: }
330: replace(sw.toString(), w, status);
331: } catch (java.io.IOException e) {
332: log.error(e.toString());
333: }
334:
335: return w;
336: }
337:
338: public Writer transform(Reader r, Writer w) {
339: switch (to) {
340: case XMLTEXT_WORDS:
341: return transformXmlTextWords(r, w);
342: case XMLTEXT:
343: return transformXmlText(r, w);
344: case WORDS:
345: return transformWords(r, w);
346: case LINES:
347: return transformLines(r, w);
348: case ENTIRE:
349: return transformEntire(r, w);
350: default:
351: throw new UnknownCodingException(getClass(), to);
352: }
353: }
354:
355: abstract protected String base();
356:
357: public String getEncoding() {
358: switch (to) {
359: case XMLTEXT_WORDS:
360: return base() + "_XMLTEXT_WORDS";
361: case XMLTEXT:
362: return base() + "_XMLTEXT";
363: case WORDS:
364: return base() + "_WORDS";
365: case LINES:
366: return base() + "_LINES";
367: case ENTIRE:
368: return base() + "_ENTIRE";
369: default:
370: throw new UnknownCodingException(getClass(), to);
371: }
372: }
373:
374: public Map<String, Config> transformers() {
375: Map<String, Config> h = new HashMap<String, Config>();
376: h
377: .put(
378: base() + "_XMLTEXT_WORDS",
379: new Config(RegexpReplacer.class, XMLTEXT_WORDS,
380: "Search and replaces regexps word-by-word, only in XML text() blocks."));
381: h
382: .put(
383: base() + "_XMLTEXT",
384: new Config(RegexpReplacer.class, XMLTEXT,
385: "Search and replaces regexps, only in XML text() blocks."));
386: h.put(base() + "_WORDS", new Config(RegexpReplacer.class,
387: WORDS, "Search and replaces regexps word-by-word"));
388: h.put(base() + "_LINES", new Config(RegexpReplacer.class,
389: LINES, "Search and replaces regexps, line-by-line"));
390: h.put(base() + "_ENTIRE", new Config(RegexpReplacer.class,
391: ENTIRE, "Search and replaces regexps"));
392:
393: return Collections.unmodifiableMap(h);
394: }
395:
396: public static void main(String[] argv) {
397: CharTransformer trans = new ChunkedTransformer(XMLTEXT) {
398: protected boolean replace(String string, Writer w,
399: Status status) throws IOException {
400: w.write(string);
401: return false;
402: }
403:
404: protected String base() {
405: return "test";
406: }
407: };
408: CharTransformer trans2 = new BufferedReaderTransformer() {
409: @Override
410: protected boolean transform(PrintWriter bw, String line,
411: Status status) {
412: bw.println(line);
413: return true;
414: }
415:
416: @Override
417: protected Status createNewStatus() {
418: return null;
419: }
420: };
421: long startTime = System.currentTimeMillis();
422: if (argv.length > 0) {
423: if ("buf1".equals(argv[0])) {
424: trans.transform(new BufferedReader(
425: new InputStreamReader(System.in)),
426: new BufferedWriter(new OutputStreamWriter(
427: System.out)));
428: } else if ("buf2".equals(argv[0])) {
429: trans2.transform(new InputStreamReader(System.in),
430: new BufferedWriter(new OutputStreamWriter(
431: System.out)));
432: } else {
433: System.err
434: .println("Don't understand '" + argv[0] + "'");
435: }
436: } else {
437: trans.transform(new InputStreamReader(System.in),
438: new OutputStreamWriter(System.out));
439: }
440: long duration = System.currentTimeMillis() - startTime;
441: System.err.println("Converstion took " + duration + " ms");
442:
443: }
444:
445: }
|