001: /*
002: * PerlFormatter.java
003: *
004: * Copyright (C) 1998-2003 Peter Graves
005: * $Id: PerlFormatter.java,v 1.2 2003/04/25 14:20:12 piso Exp $
006: *
007: * This program is free software; you can redistribute it and/or
008: * modify it under the terms of the GNU General Public License
009: * as published by the Free Software Foundation; either version 2
010: * of the License, or (at your option) any later version.
011: *
012: * This program is distributed in the hope that it will be useful,
013: * but WITHOUT ANY WARRANTY; without even the implied warranty of
014: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
015: * GNU General Public License for more details.
016: *
017: * You should have received a copy of the GNU General Public License
018: * along with this program; if not, write to the Free Software
019: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
020: */
021:
022: package org.armedbear.j;
023:
024: import gnu.regexp.RE;
025: import gnu.regexp.REMatch;
026: import gnu.regexp.UncheckedRE;
027:
028: public final class PerlFormatter extends Formatter {
029: private static final int STATE_VARIABLE = STATE_LAST + 1;
030: private static final int STATE_HERE_DOCUMENT = STATE_LAST + 2;
031: private static final int STATE_POD = STATE_LAST + 3;
032: private static final int STATE_REGEXP_DELIMITER = STATE_LAST + 4;
033: private static final int STATE_REGEXP = STATE_LAST + 5;
034: private static final int STATE_SUBST = STATE_LAST + 6;
035:
036: private static final String punctuation = "&`^:+#-%'\"/~_";
037:
038: // Formats.
039: private static final int PERL_FORMAT_TEXT = 0;
040: private static final int PERL_FORMAT_COMMENT = 1;
041: private static final int PERL_FORMAT_STRING = 2;
042: private static final int PERL_FORMAT_KEYWORD = 3;
043: private static final int PERL_FORMAT_FUNCTION = 4;
044: private static final int PERL_FORMAT_BRACE = 5;
045: private static final int PERL_FORMAT_NUMBER = 6;
046: private static final int PERL_FORMAT_SCALAR = 7;
047: private static final int PERL_FORMAT_LIST = 8;
048:
049: private static StringSet functions;
050:
051: private FastStringBuffer sb = new FastStringBuffer();
052:
053: private String endOfText;
054:
055: private static RE matchRE = new UncheckedRE(
056: "(=~|!~)[ \t]+m[^a-zA-Z0-9]");
057:
/**
 * Creates a formatter for the given buffer. The shared set of built-in
 * function names is built from perlFunctions the first time any formatter
 * is constructed.
 */
public PerlFormatter(Buffer buffer) {
    if (functions == null) {
        functions = new StringSet(perlFunctions);
    }
    this.buffer = buffer;
}
063:
064: private void endToken(int state) {
065: if (sb.length() > 0) {
066: int format = -1;
067: switch (state) {
068: case STATE_NEUTRAL:
069: break;
070: case STATE_QUOTE:
071: case STATE_SINGLEQUOTE:
072: case STATE_HERE_DOCUMENT:
073: case STATE_REGEXP:
074: case STATE_SUBST:
075: format = PERL_FORMAT_STRING;
076: break;
077: case STATE_REGEXP_DELIMITER:
078: format = PERL_FORMAT_FUNCTION;
079: break;
080: case STATE_IDENTIFIER:
081: break;
082: case STATE_COMMENT:
083: case STATE_POD:
084: format = PERL_FORMAT_COMMENT;
085: break;
086: case STATE_BRACE:
087: format = PERL_FORMAT_BRACE;
088: break;
089: case STATE_NUMBER:
090: case STATE_HEXNUMBER:
091: format = PERL_FORMAT_NUMBER;
092: break;
093: }
094: addSegment(sb.toString(), format);
095: sb.setLength(0);
096: }
097: }
098:
099: private void parseLine(String text, int state) {
100: if (Editor.tabsAreVisible())
101: text = Utilities
102: .makeTabsVisible(text, buffer.getTabWidth());
103: else
104: text = Utilities.detab(text, buffer.getTabWidth());
105: clearSegmentList();
106: sb.setLength(0);
107: int i = 0;
108: if (state == STATE_HERE_DOCUMENT) {
109: if (text.startsWith(endOfText))
110: state = STATE_NEUTRAL;
111: else {
112: sb.append(text);
113: endToken(state);
114: return;
115: }
116: }
117: if (state == STATE_POD) {
118: sb.append(text);
119: endToken(state);
120: return;
121: }
122: final int limit = text.length();
123: char c;
124: // Skip whitespace at start of line.
125: while (i < limit) {
126: c = text.charAt(i);
127: if (Character.isWhitespace(c)) {
128: sb.append(c);
129: ++i;
130: } else {
131: endToken(state);
132: break;
133: }
134: }
135: char delimiter = 0;
136: while (i < limit) {
137: c = text.charAt(i);
138: if (c == '\\') {
139: // Escape.
140: sb.append(c);
141: if (i < limit - 1)
142: sb.append(text.charAt(++i));
143: ++i;
144: continue;
145: }
146: if (state == STATE_QUOTE) {
147: sb.append(c);
148: if (c == '"') {
149: endToken(state);
150: state = STATE_NEUTRAL;
151: }
152: ++i;
153: continue;
154: }
155: if (state == STATE_SINGLEQUOTE) {
156: sb.append(c);
157: if (c == '\'') {
158: endToken(state);
159: state = STATE_NEUTRAL;
160: }
161: ++i;
162: continue;
163: }
164: if (state == STATE_REGEXP) {
165: if (c == delimiter) {
166: endToken(state);
167: sb.append(c);
168: endToken(STATE_REGEXP_DELIMITER);
169: state = STATE_NEUTRAL;
170: } else
171: sb.append(c);
172: ++i;
173: continue;
174: }
175: if (state == STATE_SUBST) {
176: if (c == delimiter) {
177: endToken(state);
178: sb.append(c);
179: endToken(STATE_REGEXP_DELIMITER);
180: state = STATE_REGEXP;
181: } else
182: sb.append(c);
183: ++i;
184: continue;
185: }
186: // Reaching here, we're not in a quoted string or regexp.
187: if (c == '{' || c == '}') {
188: endToken(state);
189: sb.append(c);
190: endToken(STATE_BRACE);
191: state = STATE_NEUTRAL;
192: ++i;
193: continue;
194: }
195: if (state == STATE_VARIABLE) {
196: boolean ok = false;
197: if (PerlMode.isIdentifierChar(c))
198: ok = true;
199: else if (sb.length() == 1
200: && punctuation.indexOf(c) >= 0)
201: ok = true;
202: if (ok)
203: sb.append(c);
204: else {
205: endToken(state);
206: sb.append(c);
207: state = STATE_NEUTRAL;
208: }
209: ++i;
210: continue;
211: }
212: if (c == '"') {
213: endToken(state);
214: sb.append(c);
215: state = STATE_QUOTE;
216: ++i;
217: continue;
218: }
219: if (c == '\'') {
220: endToken(state);
221: sb.append(c);
222: state = STATE_SINGLEQUOTE;
223: ++i;
224: continue;
225: }
226: if (c == '=' || c == '!') {
227: REMatch match = matchRE.getMatch(text.substring(i));
228: if (match != null) {
229: final String s = match.toString();
230: final int length = s.length();
231: // End the previous token.
232: endToken(state);
233: sb.append(s.substring(0, 2));
234: endToken(STATE_NEUTRAL);
235: i += 2;
236: sb.append(s.substring(2));
237: endToken(STATE_REGEXP_DELIMITER);
238: i += length - 2;
239: delimiter = s.charAt(length - 1);
240: if (delimiter == '{')
241: delimiter = '}';
242: state = STATE_REGEXP;
243: } else {
244: sb.append(c);
245: ++i;
246: }
247: continue;
248: }
249: if (c == '/') {
250: if (isSubst(text, i)) {
251: delimiter = '/';
252: sb.append(c);
253: endToken(STATE_REGEXP_DELIMITER);
254: state = STATE_SUBST;
255: } else if (isRegExp(text, i)) {
256: delimiter = '/';
257: // End the previous token unless we've got "m/".
258: if (i > 0 && text.charAt(i - 1) != 'm')
259: endToken(state);
260: sb.append(c);
261: endToken(STATE_REGEXP_DELIMITER);
262: state = STATE_REGEXP;
263: } else {
264: // It's the division operator.
265: sb.append(c);
266: }
267: ++i;
268: continue;
269: }
270: if (c == '#') {
271: endToken(state);
272: state = STATE_COMMENT;
273: sb.append(text.substring(i));
274: endToken(state);
275: return;
276: }
277: if (state == STATE_IDENTIFIER) {
278: if (PerlMode.isIdentifierChar(c))
279: sb.append(c);
280: else {
281: endToken(state);
282: sb.append(c);
283: state = STATE_NEUTRAL;
284: }
285: ++i;
286: continue;
287: }
288: if (state == STATE_NUMBER) {
289: if (Character.isDigit(c))
290: sb.append(c);
291: else if (sb.length() == 1 && c == 'x' || c == 'X') {
292: sb.append(c);
293: state = STATE_HEXNUMBER;
294: } else {
295: endToken(state);
296: sb.append(c);
297: if (PerlMode.isIdentifierChar(c))
298: state = STATE_IDENTIFIER;
299: else
300: state = STATE_NEUTRAL;
301: }
302: ++i;
303: continue;
304: }
305: if (state == STATE_HEXNUMBER) {
306: if (Character.isDigit(c))
307: sb.append(c);
308: else if ((c >= 'a' && c <= 'f')
309: || (c >= 'A' && c <= 'F'))
310: sb.append(c);
311: else {
312: endToken(state);
313: sb.append(c);
314: if (PerlMode.isIdentifierChar(c))
315: state = STATE_IDENTIFIER;
316: else
317: state = STATE_NEUTRAL;
318: }
319: ++i;
320: continue;
321: }
322: if (state == STATE_NEUTRAL) {
323: if (c == '$') {
324: endToken(state);
325: sb.append(c);
326: state = STATE_VARIABLE;
327: } else if (PerlMode.isIdentifierChar(c)) {
328: endToken(state);
329: sb.append(c);
330: state = STATE_IDENTIFIER;
331: } else if (Character.isDigit(c)) {
332: endToken(state);
333: sb.append(c);
334: state = STATE_NUMBER;
335: } else
336: // Still neutral...
337: sb.append(c);
338: }
339: ++i;
340: }
341: endToken(state);
342: }
343:
344: // i is the index of '/'.
345: public static boolean isSubst(String text, int i) {
346: Debug.assertTrue(text.charAt(i) == '/');
347: if (text.regionMatches(i - 2, "tr/", 0, 3)) {
348: if (i < 3)
349: return true;
350: char c = text.charAt(i - 3);
351: if (PerlMode.getMode().isIdentifierPart(c))
352: return false;
353: else
354: return true;
355: }
356: if (text.regionMatches(i - 1, "s/", 0, 2)) {
357: if (i < 2)
358: return true;
359: char c = text.charAt(i - 2);
360: if (PerlMode.getMode().isIdentifierPart(c))
361: return false;
362: else
363: return true;
364: }
365: if (text.regionMatches(i - 1, "y/", 0, 2)) {
366: if (i < 2)
367: return true;
368: char c = text.charAt(i - 2);
369: if (PerlMode.getMode().isIdentifierPart(c))
370: return false;
371: else
372: return true;
373: }
374: return false;
375: }
376:
377: // Make sure the '/' at i is not the division operator.
378: public static boolean isRegExp(String text, int i) {
379: Debug.assertTrue(text.charAt(i) == '/');
380: if (i == 0) {
381: // It's the first character on the line.
382: return true;
383: }
384: // Consider the previous character.
385: char c = text.charAt(i - 1);
386: if (c == '(')
387: return true;
388: if (c == 'm') {
389: if (i - 2 < 0)
390: return true;
391: c = text.charAt(i - 2);
392: if (c == '(' || Character.isWhitespace(c))
393: return true;
394: return false;
395: }
396: // If it's an identifier character, we're not looking at a regexp,
397: // since we've already tested for substitution and translation
398: // patterns and "m/".
399: if (PerlMode.isIdentifierChar(c))
400: return false;
401:
402: if (!Character.isWhitespace(c))
403: return false;
404:
405: // The immediately previous character is whitespace.
406: final String s = text.substring(0, i - 1).trim();
407: final int length = s.length();
408: if (length == 0) {
409: // The '/' is the first non-whitespace character on the line.
410: return true;
411: }
412: c = s.charAt(length - 1);
413: if (c == ')')
414: return false; // "(a + b) / c"
415: if (c == '}')
416: return false;
417: if (!PerlMode.isIdentifierChar(c))
418: return true;
419:
420: // Last non-whitespace character is an identifier character.
421: if (s.endsWith("and")) {
422: if (length == 3
423: || Character.isWhitespace(s.charAt(length - 4)))
424: return true;
425: } else if (s.endsWith("or")) {
426: if (length == 2
427: || Character.isWhitespace(s.charAt(length - 3)))
428: return true;
429: } else if (s.endsWith("not")) {
430: if (length == 3
431: || Character.isWhitespace(s.charAt(length - 4)))
432: return true;
433: }
434:
435: return false;
436: }
437:
438: public LineSegmentList formatLine(Line line) {
439: if (line == null) {
440: clearSegmentList();
441: addSegment("", PERL_FORMAT_TEXT);
442: return segmentList;
443: }
444: parseLine(line.getText(), line.flags());
445: final int tokenCount = segmentList.size();
446: for (int i = 0; i < tokenCount; i++) {
447: LineSegment segment = segmentList.getSegment(i);
448: if (segment.getFormat() >= 0)
449: continue;
450: String s = segment.getText();
451: if (isKeyword(s)) {
452: segment.setFormat(PERL_FORMAT_KEYWORD);
453: continue;
454: }
455: char c = s.charAt(0);
456: if (c == '$') {
457: segment.setFormat(PERL_FORMAT_SCALAR);
458: continue;
459: }
460: if (c == '%' || c == '@') {
461: segment.setFormat(PERL_FORMAT_LIST);
462: continue;
463: }
464: boolean isFunction = false;
465: if (PerlMode.isIdentifierChar(c)) {
466: boolean maybeFunction = true;
467: final int length = s.length();
468: for (int j = 1; j < length; j++) {
469: if (!PerlMode.isIdentifierChar(s.charAt(j))) {
470: maybeFunction = false;
471: break;
472: }
473: }
474: if (maybeFunction) {
475: if (isFunction(s))
476: isFunction = true;
477: else if (i > 1) {
478: // See if "sub" is two segments back (one segment back
479: // would be intervening whitespace).
480: LineSegment prevSegment = segmentList
481: .getSegment(i - 2);
482: if (prevSegment.getText().trim().equals("sub"))
483: isFunction = true;
484: }
485: if (!isFunction && i < segmentList.size() - 1) {
486: LineSegment nextSegment = segmentList
487: .getSegment(i + 1);
488: if (nextSegment.getText().trim()
489: .startsWith("("))
490: isFunction = true;
491: }
492: }
493: }
494: segment.setFormat(isFunction ? PERL_FORMAT_FUNCTION
495: : PERL_FORMAT_TEXT);
496: }
497: return segmentList;
498: }
499:
500: public boolean parseBuffer() {
501: int state = STATE_NEUTRAL;
502: Line line = buffer.getFirstLine();
503: boolean changed = false;
504: while (line != null) {
505: int oldflags = line.flags();
506: if (state == STATE_HERE_DOCUMENT) {
507: if (line.getText().equals(endOfText))
508: state = STATE_NEUTRAL;
509: }
510: if (state == STATE_POD) {
511: if (line.getText().startsWith("=cut")) {
512: if (state != oldflags) {
513: line.setFlags(state);
514: changed = true;
515: }
516: state = STATE_NEUTRAL;
517: line = line.next();
518: continue;
519: }
520: }
521: // Assume no multiline quotes.
522: if (state == STATE_QUOTE || state == STATE_SINGLEQUOTE)
523: state = STATE_NEUTRAL;
524: if (state == STATE_NEUTRAL)
525: if (line.getText().startsWith("="))
526: state = STATE_POD;
527: if (state != oldflags) {
528: line.setFlags(state);
529: changed = true;
530: }
531: if (state == STATE_HERE_DOCUMENT || state == STATE_POD) {
532: line = line.next();
533: continue;
534: }
535: final int limit = line.length();
536: for (int i = 0; i < limit; i++) {
537: char c = line.charAt(i);
538: if (c == '\\' && i < limit - 1) {
539: // Escape.
540: ++i;
541: continue;
542: }
543: if (state == STATE_QUOTE) {
544: if (c == '"')
545: state = STATE_NEUTRAL;
546: continue;
547: }
548: if (state == STATE_SINGLEQUOTE) {
549: if (c == '\'')
550: state = STATE_NEUTRAL;
551: continue;
552: }
553: // Not in comment or quoted string.
554: if (c == '$' && i < limit - 1) {
555: // In effect, another kind of escape.
556: // Next char can be quote or single quote but should be ignored.
557: ++i;
558: continue;
559: }
560: if (c == '<' && i < limit - 2) {
561: if (line.charAt(i + 1) == '<') {
562: // Line must have semicolon at end.
563: if (line.trim().endsWith(";")) {
564: endOfText = line.substring(i + 2).trim();
565: int length = endOfText.length();
566: // Remove ';' at end of line.
567: if (length > 0
568: && endOfText.charAt(length - 1) == ';')
569: endOfText = endOfText.substring(0,
570: --length);
571: // Remove ')' if any.
572: if (length > 0
573: && endOfText.charAt(length - 1) == ')')
574: endOfText = endOfText.substring(0,
575: --length);
576: if (length > 2) {
577: if (endOfText.charAt(0) == '"'
578: && endOfText.charAt(length - 1) == '"')
579: // Removed enclosing double quotes.
580: endOfText = endOfText.substring(1,
581: length - 1);
582: else if (endOfText.charAt(0) == '\''
583: && endOfText.charAt(length - 1) == '\'')
584: // Removed enclosing single quotes.
585: endOfText = endOfText.substring(1,
586: length - 1);
587: }
588: if (endOfText.length() > 0) {
589: // Make sure "<<" is not shift operator.
590: if (Character.isLetter(endOfText
591: .charAt(0))) {
592: state = STATE_HERE_DOCUMENT;
593: break;
594: }
595: }
596: }
597: }
598: continue;
599: }
600: if (c == '#')
601: // Single-line comment beginning. Ignore rest of line.
602: break;
603: else if (c == '"')
604: state = STATE_QUOTE;
605: else if (c == '\'')
606: state = STATE_SINGLEQUOTE;
607: }
608: line = line.next();
609: }
610: buffer.setNeedsParsing(false);
611: return changed;
612: }
613:
614: private static final String[] perlFunctions = { "abs", "accept",
615: "alarm", "atan2", "bind", "binmode", "bless", "caller",
616: "chdir", "chmod", "chomp", "chop", "chown", "chr",
617: "chroot", "close", "closedir", "connect", "cos", "crypt",
618: "dbmclose", "dbmopen", "defined", "delete", "die", "dump",
619: "each", "eof", "eval", "exec", "exists", "exit", "exp",
620: "fcntl", "fileno", "flock", "fork", "format", "formline",
621: "getc", "getgrent", "getgrgid", "getgrnam",
622: "gethostbyaddr", "gethostbyname", "gethostent", "getlogin",
623: "getnetbyaddr", "getnetbyname", "getnetent", "getpeername",
624: "getpgrp", "getppid", "getpriority", "getprotobyname",
625: "getprotobynumber", "getprotoent", "getpwent", "getpwnam",
626: "getpwuid", "getservbyname", "getservbyport", "getservent",
627: "getsockname", "getsockopt", "glob", "gmtime", "grep",
628: "hex", "import", "index", "int", "ioctl", "join", "keys",
629: "kill", "lc", "lcfirst", "length", "link", "listen",
630: "localtime", "log", "lstat", "map", "mkdir", "msgctl",
631: "msgget", "msgrcv", "msgsnd", "oct", "open", "opendir",
632: "ord", "pack", "pipe", "pop", "pos", "print", "printf",
633: "push", "quotemeta", "rand", "read", "readdir", "readlink",
634: "recv", "rename", "reset", "reverse", "rewinddir",
635: "rindex", "rmdir", "scalar", "seek", "seekdir", "select",
636: "semctl", "semget", "semop", "send", "setpgrp",
637: "setpriority", "setsockopt", "shift", "shmctl", "shmget",
638: "shmread", "shmwrite", "shutdown", "sin", "sleep",
639: "socket", "socketpair", "sort", "splice", "split",
640: "sprintf", "sqrt", "srand", "stat", "study", "substr",
641: "symlink", "syscall", "sysopen", "sysread", "system",
642: "syswrite", "tell", "telldir", "time", "times", "truncate",
643: "uc", "ucfirst", "umask", "unlink", "unpack", "unshift",
644: "utime", "values", "vec", "wait", "waitpid", "wantarray",
645: "warn", "write" };
646:
647: private final boolean isFunction(String s) {
648: if (functions == null)
649: return false;
650: return functions.contains(s);
651: }
652:
653: public FormatTable getFormatTable() {
654: if (formatTable == null) {
655: formatTable = new FormatTable("PerlMode");
656: formatTable.addEntryFromPrefs(PERL_FORMAT_TEXT, "text");
657: formatTable.addEntryFromPrefs(PERL_FORMAT_COMMENT,
658: "comment");
659: formatTable.addEntryFromPrefs(PERL_FORMAT_STRING, "string");
660: formatTable.addEntryFromPrefs(PERL_FORMAT_KEYWORD,
661: "keyword");
662: formatTable.addEntryFromPrefs(PERL_FORMAT_FUNCTION,
663: "function");
664: formatTable.addEntryFromPrefs(PERL_FORMAT_BRACE, "brace");
665: formatTable.addEntryFromPrefs(PERL_FORMAT_NUMBER, "number");
666: formatTable.addEntryFromPrefs(PERL_FORMAT_SCALAR, "scalar");
667: formatTable.addEntryFromPrefs(PERL_FORMAT_LIST, "list");
668: }
669: return formatTable;
670: }
671: }
|