001: /**
002: * $Id: RtfTokeniser.java 2429 2006-10-06 14:58:54Z psoares33 $
003: * $Name$
004: *
005: * Copyright 2006 by Mark Hall
006: *
007: * The contents of this file are subject to the Mozilla Public License Version 1.1
008: * (the "License"); you may not use this file except in compliance with the License.
009: * You may obtain a copy of the License at http://www.mozilla.org/MPL/
010: *
011: * Software distributed under the License is distributed on an "AS IS" basis,
012: * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
013: * for the specific language governing rights and limitations under the License.
014: *
015: * The Original Code is 'iText, a free JAVA-PDF library'.
016: *
017: * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
018: * the Initial Developer are Copyright (C) 1999-2006 by Bruno Lowagie.
019: * All Rights Reserved.
020: * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
021: * are Copyright (C) 2000-2006 by Paulo Soares. All Rights Reserved.
022: *
023: * Contributor(s): all the names of the contributors are added in the source code
024: * where applicable.
025: *
026: * Alternatively, the contents of this file may be used under the terms of the
027: * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
028: * provisions of LGPL are applicable instead of those above. If you wish to
029: * allow use of your version of this file only under the terms of the LGPL
030: * License and not to allow others to use your version of this file under
031: * the MPL, indicate your decision by deleting the provisions above and
032: * replace them with the notice and other provisions required by the LGPL.
033: * If you do not delete the provisions above, a recipient may use your version
034: * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
035: *
036: * This library is free software; you can redistribute it and/or modify it
037: * under the terms of the MPL as stated above or under the terms of the GNU
038: * Library General Public License as published by the Free Software Foundation;
039: * either version 2 of the License, or any later version.
040: *
041: * This library is distributed in the hope that it will be useful, but WITHOUT
042: * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
043: * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
044: * details.
045: *
046: * If you didn't download this code from the following link, you should check if
047: * you aren't using an obsolete version:
048: * http://www.lowagie.com/iText/
049: */package com.lowagie.text.rtf.direct;
050:
051: import java.io.IOException;
052: import java.io.Reader;
053:
054: /**
055: * The RtfTokeniser takes an RTF document stream and
056: * turns it into a set of RTF tokens. Five groups of
057: * tokens are differentiated:
058: *
059: * <ul>
060: * <li>Group opening: {</li>
061: * <li>Group closing: }</li>
062: * <li>Control characters</li>
063: * <li>Control words</li>
064: * <li>Text</li>
065: * </ul>
066: *
067: * @version $Revision: 2429 $
068: * @author Mark Hall (mhall@edu.uni-klu.ac.at)
069: * @author Bullo (bullo70@users.sourceforge.net)
070: */
071: public class RtfTokeniser {
072: /**
073: * The RtfTokeniser is in its ground state. Any token may follow.
074: */
075: private static final int TOKENISER_STATE_READY = 0;
076: /**
077: * The last token parsed was a slash.
078: */
079: private static final int TOKENISER_STATE_SLASH = 1;
080: /**
081: * The RtfTokeniser is currently tokenising a control word.
082: */
083: private static final int TOKENISER_STATE_IN_CTRL_WORD = 2;
084: /**
085: * The RtfTokeniser is currently tokenising a text.
086: */
087: private static final int TOKENISER_STATE_IN_TEXT = 4;
088:
089: /**
090: * The current state of this RtfTokeniser.
091: */
092: private int state = TOKENISER_STATE_READY;
093: /**
094: * The current group nesting level.
095: */
096: private int groupLevel = 0;
097: /**
098: * The RtfParser to send tokens to.
099: */
100: private RtfParser rtfParser = null;
101:
102: /**
103: * Constructs a new RtfTokeniser. The startGroupLevel is required when parsing
104: * RTF fragments, since they are missing the opening group and closing group
105: * and thus this has to be set at the beginning.
106: *
107: * @param rtfParser The RtfParser to send tokens to.
108: * @param startGroupLevel The starting group nesting level. 0 for full documents, 1 for fragments.
109: */
110: public RtfTokeniser(RtfParser rtfParser, int startGroupLevel) {
111: this .rtfParser = rtfParser;
112: this .groupLevel = startGroupLevel;
113: }
114:
115: /**
116: * The main tokenisation method. Implements a LL(1) parser.
117: *
118: * @param reader The Reader to read the RTF document from.
119: * @throws IOException On I/O errors.
120: */
121: public void tokenise(Reader reader) throws IOException {
122: char[] nextChar = new char[1];
123: StringBuffer temp = new StringBuffer();
124: this .state = TOKENISER_STATE_READY;
125: this .groupLevel = 0;
126: while (reader.read(nextChar) != -1) {
127: if (this .state == TOKENISER_STATE_READY) { // No influence from previous characters.
128: if (nextChar[0] == '{') { // Open a group
129: this .rtfParser.handleOpenGroup(this .groupLevel);
130: groupLevel++;
131: } else if (nextChar[0] == '}') { // Close a group
132: this .rtfParser.handleCloseGroup(this .groupLevel);
133: groupLevel--;
134: } else if (nextChar[0] == '\\') {
135: this .state = TOKENISER_STATE_SLASH;
136: temp = new StringBuffer();
137: } else {
138: this .state = TOKENISER_STATE_IN_TEXT;
139: temp.append(nextChar[0]);
140: }
141: } else if ((this .state & TOKENISER_STATE_SLASH) == TOKENISER_STATE_SLASH) { // A slash signals a control character or word or an escaped character
142: if (nextChar[0] == '{') {
143: this .state = TOKENISER_STATE_IN_TEXT;
144: temp.append("\\{");
145: } else if (nextChar[0] == '}') {
146: this .state = TOKENISER_STATE_IN_TEXT;
147: temp.append("\\}");
148: } else if (nextChar[0] == '\\') {
149: this .state = TOKENISER_STATE_IN_TEXT;
150: temp.append("\\\\");
151: } else {
152: if ((this .state & TOKENISER_STATE_IN_TEXT) == TOKENISER_STATE_IN_TEXT) { // A control word or character closes previous text token
153: this .rtfParser.handleText(temp.toString(),
154: this .groupLevel);
155: temp = new StringBuffer();
156: }
157: if (nextChar[0] == '|') {
158: this .state = TOKENISER_STATE_READY;
159: this .rtfParser.handleCtrlCharacter("\\|",
160: this .groupLevel);
161: } else if (nextChar[0] == '~') {
162: this .state = TOKENISER_STATE_READY;
163: this .rtfParser.handleCtrlCharacter("\\~",
164: this .groupLevel);
165: } else if (nextChar[0] == '-') {
166: this .state = TOKENISER_STATE_READY;
167: this .rtfParser.handleCtrlCharacter("\\-",
168: this .groupLevel);
169: } else if (nextChar[0] == '_') {
170: this .state = TOKENISER_STATE_READY;
171: this .rtfParser.handleCtrlCharacter("\\_",
172: this .groupLevel);
173: } else if (nextChar[0] == ':') {
174: this .state = TOKENISER_STATE_READY;
175: this .rtfParser.handleCtrlCharacter("\\:",
176: this .groupLevel);
177: } else if (nextChar[0] == '*') {
178: this .state = TOKENISER_STATE_READY;
179: this .rtfParser.handleCtrlCharacter("\\*",
180: this .groupLevel);
181: } else {
182: this .state = TOKENISER_STATE_IN_CTRL_WORD;
183: temp = new StringBuffer("\\");
184: temp.append(nextChar[0]);
185: }
186: }
187: } else if (this .state == TOKENISER_STATE_IN_CTRL_WORD) { // Control words run until a space, close or open group or another control word is found.
188: if (nextChar[0] == '\n' || nextChar[0] == '\r') {
189: nextChar[0] = ' ';
190: }
191: if (nextChar[0] == '{') {
192: this .rtfParser.handleCtrlWord(temp.toString(),
193: this .groupLevel);
194: this .rtfParser.handleOpenGroup(this .groupLevel);
195: groupLevel++;
196: this .state = TOKENISER_STATE_READY;
197: temp = new StringBuffer();
198: } else if (nextChar[0] == '}') {
199: this .rtfParser.handleCtrlWord(temp.toString(),
200: this .groupLevel);
201: this .rtfParser.handleCloseGroup(this .groupLevel);
202: groupLevel--;
203: this .state = TOKENISER_STATE_READY;
204: temp = new StringBuffer();
205: } else if (nextChar[0] == '\\') {
206: this .rtfParser.handleCtrlWord(temp.toString(),
207: this .groupLevel);
208: this .state = TOKENISER_STATE_SLASH;
209: temp = new StringBuffer();
210: } else if (nextChar[0] == ' ') {
211: this .rtfParser.handleCtrlWord(temp.toString(),
212: this .groupLevel);
213: this .rtfParser.handleText(" ", this .groupLevel);
214: this .state = TOKENISER_STATE_READY;
215: temp = new StringBuffer();
216: } else if (nextChar[0] == ';') {
217: this .rtfParser.handleCtrlWord(temp.toString(),
218: this .groupLevel);
219: this .rtfParser.handleText(";", this .groupLevel);
220: this .state = TOKENISER_STATE_READY;
221: temp = new StringBuffer();
222: } else {
223: temp.append(nextChar[0]);
224: }
225: } else if (this .state == TOKENISER_STATE_IN_TEXT) { // Text tokens are closed by control characters or words or open and close groups
226: if (nextChar[0] == '{') {
227: this .rtfParser.handleText(temp.toString(),
228: this .groupLevel);
229: this .rtfParser.handleOpenGroup(this .groupLevel);
230: groupLevel++;
231: this .state = TOKENISER_STATE_READY;
232: temp = new StringBuffer();
233: } else if (nextChar[0] == '}') {
234: this .rtfParser.handleText(temp.toString(),
235: this .groupLevel);
236: this .rtfParser.handleCloseGroup(this .groupLevel);
237: groupLevel--;
238: this .state = TOKENISER_STATE_READY;
239: temp = new StringBuffer();
240: } else if (nextChar[0] == '\\') {
241: this .state = TOKENISER_STATE_IN_TEXT
242: | TOKENISER_STATE_SLASH;
243: } else {
244: temp.append(nextChar[0]);
245: }
246: }
247: }
248: if ((this .state & TOKENISER_STATE_IN_TEXT) == TOKENISER_STATE_IN_TEXT
249: && !temp.toString().equals("")) { // If at the end a text token was being parsed, emmit that token. Required for RTF fragments
250: this.rtfParser.handleText(temp.toString(), this.groupLevel);
251: }
252: }
253: }
|