001: /**
002: * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
003: * @author Zev Blut zb@ubit.com
004: * @author Romain PELISSE belaran@gmail.com
005: */package net.sourceforge.pmd.cpd;
006:
import java.util.List;
import java.util.Locale;
008:
009: public abstract class AbstractTokenizer implements Tokenizer {
010:
011: protected List<String> stringToken; // List<String>, should be setted by children classes
012: protected List<String> ignorableCharacter; // List<String>, should be setted by children classes
013: // FIXME:Maybe an array of 'char' would be better for perfomance ?
014: protected List<String> ignorableStmt; // List<String>, should be setted by children classes
015: protected char ONE_LINE_COMMENT_CHAR = '#'; // Most script language ( shell, ruby, python,...) use this symbol for comment line
016:
017: private List<String> code;
018: private int lineNumber = 0;
019: private String currentLine;
020:
021: protected boolean spanMultipleLinesString = true; // Most language does, so default is true
022:
023: private boolean downcaseString = true;
024:
025: public void tokenize(SourceCode tokens, Tokens tokenEntries) {
026: this .code = tokens.getCode();
027:
028: for (this .lineNumber = 0; lineNumber < this .code.size(); lineNumber++) {
029: this .currentLine = this .code.get(this .lineNumber);
030: int loc = 0;
031: while (loc < currentLine.length()) {
032: StringBuffer token = new StringBuffer();
033: loc = getTokenFromLine(token, loc);
034: if (token.length() > 0
035: && !isIgnorableString(token.toString())) {
036: if (downcaseString) {
037: token = new StringBuffer(token.toString()
038: .toLowerCase());
039: }
040: if (CPD.debugEnable)
041: System.out.println("Token added:"
042: + token.toString());
043: tokenEntries.add(new TokenEntry(token.toString(),
044: tokens.getFileName(), lineNumber));
045:
046: }
047: }
048: }
049: tokenEntries.add(TokenEntry.getEOF());
050: }
051:
052: private int getTokenFromLine(StringBuffer token, int loc) {
053: for (int j = loc; j < this .currentLine.length(); j++) {
054: char tok = this .currentLine.charAt(j);
055: if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
056: if (isComment(tok)) {
057: if (token.length() > 0) {
058: return j;
059: } else {
060: return getCommentToken(token, loc);
061: }
062: } else if (isString(tok)) {
063: if (token.length() > 0) {
064: return j; // we need to now parse the string as a seperate token.
065: } else {
066: // we are at the start of a string
067: return parseString(token, j, tok);
068: }
069: } else {
070: token.append(tok);
071: }
072: } else {
073: if (token.length() > 0) {
074: return j;
075: }
076: }
077: loc = j;
078: }
079: return loc + 1;
080: }
081:
082: private int parseString(StringBuffer token, int loc,
083: char stringDelimiter) {
084: boolean escaped = false;
085: boolean done = false;
086: char tok = ' '; // this will be replaced.
087: while ((loc < currentLine.length()) && !done) {
088: tok = currentLine.charAt(loc);
089: if (escaped && tok == stringDelimiter) // Found an escaped string
090: escaped = false;
091: else if (tok == stringDelimiter && (token.length() > 0)) // We are done, we found the end of the string...
092: done = true;
093: else if (tok == '\\') // Found an escaped char
094: escaped = true;
095: else
096: // Adding char...
097: escaped = false;
098: //Adding char to String:" + token.toString());
099: token.append(tok);
100: loc++;
101: }
102: // Handling multiple lines string
103: if (!done && // ... we didn't find the end of the string
104: loc >= currentLine.length() && // ... we have reach the end of the line ( the String is incomplete, for the moment at least)
105: this .spanMultipleLinesString && // ... the language allow multiple line span Strings
106: ++this .lineNumber < this .code.size() // ... there is still more lines to parse
107: ) {
108: // parsing new line
109: this .currentLine = this .code.get(this .lineNumber);
110: // Warning : recursive call !
111: loc = this .parseString(token, loc, stringDelimiter);
112: }
113: return loc + 1;
114: }
115:
116: private boolean ignoreCharacter(char tok) {
117: return this .ignorableCharacter.contains("" + tok);
118: }
119:
120: private boolean isString(char tok) {
121: return this .stringToken.contains("" + tok);
122: }
123:
124: private boolean isComment(char tok) {
125: return tok == ONE_LINE_COMMENT_CHAR;
126: }
127:
128: private int getCommentToken(StringBuffer token, int loc) {
129: while (loc < this .currentLine.length()) {
130: token.append(this .currentLine.charAt(loc++));
131: }
132: return loc;
133: }
134:
135: private boolean isIgnorableString(String token) {
136: return this.ignorableStmt.contains(token);
137: }
138: }
|