001: /*
002:
003: This software is OSI Certified Open Source Software.
004: OSI Certified is a certification mark of the Open Source Initiative.
005:
006: The license (Mozilla version 1.0) can be read at the MMBase site.
007: See http://www.MMBase.org/license
008:
009: */
010:
011: package org.mmbase.util.magicfile;
012:
013: import java.util.*;
014: import java.io.*;
015: import org.mmbase.util.logging.*;
016:
017: /**
018: * A Detector stores one entry from the magic.xml file, and contains
019: * the functionality to determine whether a certain byte[] satisfies it.
020: *
021: * Implementation made on the basis of actual magic file and its manual.<br />
022: *
023: * TODO:<br />
024: * - link the info with mimetypes<br />
025: * - add test modifiers<br />
026: * - add commandline switches for warning, error and debugging messages<br />
027: *<br />
028: * Ignored features of magic:<br />
029: * - date types<br />
030: * - indirect offsets (prefix of '&' in sublevel match or (address+bytes) where offset = value of address plus bytes)<br />
031: * - AND'ing of type<br />
032: *<br />
033: * BUGS:<br />
034: * - test string isn't read when end of line is reached in absence of a message string<br />
035: * <br />
036: *
037: * Tested:<br />
038: * - .doc<br />
039: * - .rtf<br />
040: * - .pdf<br />
041: * - .sh<br />
042: * - .gz<br />
043: * - .bz2<br />
044: * - .html<br />
045: * - .rpm<br />
046: * - .wav<br />
047: *<br />
048: * Not supported by magic file:<br />
049: * - StarOffice<br />
050: * @version $Id: Detector.java,v 1.13 2007/02/24 21:57:50 nklasens Exp $
051: */
052:
053: public class Detector {
054: private static final Logger log = Logging
055: .getLoggerInstance(Detector.class);
056:
057: // No configuration below
058: private static final int BIG_ENDIAN = 0;
059: private static final int LITTLE_ENDIAN = 1;
060: private static final String[] label = new String[] { "big endian",
061: "little endian" };
062:
063: private String rawinput; // Original input line
064: private int offset = -1;
065: private String type;
066: // types: byte, short, long, string, date, beshort, belong, bedate, leshort, lelong, ledate
067: private String typeAND;
068: // Some types are defined as e.g. "belong&0x0000ff70", then typeAND=0x0000ff70 (NOT IMPLEMENTED!)
069: private String test; // Test value
070: private char testComparator; // What the test is like,
071: private String message; // Designation for this type in 'magic' file
072: private List<String> extensions; // Possible file extensions for this type
073: private String mimetype; // MimeType for this type
074:
075: // What are these?
076: private String xString;
077: private int xInt;
078: private char xChar;
079:
080: private List<Detector> childList;
081:
082: private boolean valid; // Set this if parsing of magic file fails
083: private boolean hasX; // Is set when an 'x' value is matched
084:
085: /**
086: * Add an embedded detector object that searches for more details after an initial match.
087: */
088: public void addChild(Detector detector, int level) {
089: if (level == 1) {
090: childList.add(detector);
091: } else if (level > 1) {
092: if (childList.size() == 0) {
093: log.debug("Hm. level = " + level
094: + ", but childList is empty");
095: } else {
096: (childList.get(childList.size() - 1)).addChild(
097: detector, level - 1);
098: }
099: }
100: }
101:
102: /**
103: * Detectors are instanciated by MagicXMLReader, and by Parser.
104: */
105: Detector() {
106: childList = new ArrayList<Detector>();
107: extensions = new ArrayList<String>();
108: mimetype = "application/octet-stream";
109: message = "Unknown";
110: valid = true;
111: }
112:
113: /**
114: * Adds a possible extension. The last added one is the default (returned by 'getExtension').
115: */
116: public void setExtension(String extension) {
117: extensions.add(0, extension);
118: }
119:
120: public String getExtension() {
121: if (extensions.size() == 0) {
122: return "";
123: }
124: return extensions.get(0);
125: }
126:
127: public List<String> getExtensions() {
128: return extensions;
129: }
130:
131: public void setMimeType(String mimetype) {
132: this .mimetype = mimetype;
133: }
134:
135: public String getMimeType() {
136: if (mimetype.equals("???")) {
137: return "application/octet-stream";
138: } else {
139: return mimetype;
140: }
141: }
142:
143: public void setDesignation(String designation) {
144: this .message = designation;
145: }
146:
147: public void setOffset(String offset) {
148: this .offset = Integer.parseInt(offset);
149: }
150:
151: public int getOffset() {
152: return offset;
153: }
154:
155: public void setType(String type) {
156: this .type = type;
157: }
158:
159: public String getType() {
160: return type;
161: }
162:
163: public void setTest(String test) {
164: this .test = test;
165: }
166:
167: public String getTest() {
168: return test;
169: }
170:
171: public void setComparator(char comparator) {
172: this .testComparator = comparator;
173: }
174:
175: public char getComparator() {
176: return testComparator;
177: }
178:
179: /**
180: * @return Whether detector matches the prefix/lithmus of the file
181: */
182: public boolean test(byte[] lithmus) {
183: if (lithmus == null || lithmus.length == 0 || offset == -1) {
184: return false;
185: }
186: boolean hit;
187: //log.debug("TESTING "+rawinput);
188: if (type.equals("string")) {
189: hit = testString(lithmus);
190: } else if (type.equals("beshort")) {
191: hit = testShort(lithmus, BIG_ENDIAN);
192: } else if (type.equals("belong")) {
193: hit = testLong(lithmus, BIG_ENDIAN);
194: } else if (type.equals("leshort")) {
195: hit = testShort(lithmus, LITTLE_ENDIAN);
196: } else if (type.equals("lelong")) {
197: hit = testLong(lithmus, LITTLE_ENDIAN);
198: } else if (type.equals("byte")) {
199: hit = testByte(lithmus);
200: } else {
201: // Date types are not supported
202: hit = false;
203: }
204: if (hit) {
205: log.debug("Detector " + this + " hit");
206: for (int i = 0; i < childList.size(); i++) {
207: Detector child = childList.get(i);
208: if (child.test(lithmus)) {
209: String s = child.getDesignation();
210: if (s.startsWith("\\b")) {
211: s = s.substring(2);
212: }
213: this .message = this .message + " " + s;
214: }
215: }
216: }
217: return hit;
218: }
219:
220: /**
221: * todo: I noticed there is also a %5.5s variation in magic...
222: */
223: public String getDesignation() {
224: if (hasX) {
225: int n = message.indexOf("%d");
226: if (n >= 0) {
227: return message.substring(0, n) + xInt
228: + message.substring(n + 2);
229: }
230:
231: n = message.indexOf("%s");
232: if (n >= 0) {
233: return message.substring(0, n) + xString
234: + message.substring(n + 2);
235: }
236:
237: n = message.indexOf("%c");
238: if (n >= 0) {
239: return message.substring(0, n) + xChar
240: + message.substring(n + 2);
241: }
242: }
243: return message;
244: }
245:
246: public void setInvalid() {
247: valid = false;
248: }
249:
250: /**
251: * @return Whether parsing of magic line for this detector succeeded
252: */
253: public boolean valid() {
254: return valid;
255: }
256:
257: /**
258: * @return Conversion of 2 byte array to integer
259: */
260: private int byteArrayToInt(byte[] ar) {
261: StringBuffer buf = new StringBuffer();
262: for (byte element : ar) {
263: buf.append(Integer.toHexString(element & 0x000000ff));
264: }
265: return Integer.decode("0x" + buf.toString()).intValue();
266: }
267:
268: /**
269: * @return Conversion of 4 byte array to long
270: */
271: private long byteArrayToLong(byte[] ar) {
272: StringBuffer buf = new StringBuffer();
273: for (byte element : ar) {
274: buf.append(Integer.toHexString(element & 0x000000ff));
275: }
276: return Long.decode("0x" + buf.toString()).longValue();
277: }
278:
279: /**
280: * Test whether a string matches
281: */
282: protected boolean testString(byte[] lithmus) {
283:
284: if (test.length() == 0) {
285: log.warn("TEST STRING LENGTH ZERO FOR [" + rawinput + "]");
286: return false;
287: }
288:
289: int maxNeeded = offset + test.length();
290:
291: if (maxNeeded > lithmus.length) {
292: return false;
293: }
294:
295: try {
296: xString = new String(lithmus, offset, test.length(),
297: "US-ASCII");
298: // US-ASCII: fixate the charset, do not depend on platform default:
299: // US-ASCCII: one byte = one char, so length can be predicted
300: } catch (java.io.UnsupportedEncodingException usee) { // could not happen: US-ASCII is supported
301: }
302:
303: log.debug("test string = '" + test + "' (" + message
304: + ") comparing with '" + xString + "'");
305: int n = xString.compareTo(test);
306: switch (testComparator) {
307: case '=':
308: return n == 0;
309: case '>':
310: hasX = true;
311: return n > 0;
312: case '<':
313: hasX = true;
314: return n < 0;
315: default:
316: return false;
317: }
318: }
319:
320: /**
321: * Test whether a short matches
322: */
323: protected boolean testShort(byte[] lithmus, int endian) {
324: log
325: .debug("testing " + label[endian] + " short for "
326: + rawinput);
327: int found = 0;
328: if (endian == BIG_ENDIAN) {
329: found = byteArrayToInt(new byte[] { lithmus[offset],
330: lithmus[offset + 1] });
331: } else if (endian == LITTLE_ENDIAN) {
332: found = byteArrayToInt(new byte[] { lithmus[offset + 1],
333: lithmus[offset] });
334: }
335: xInt = found;
336:
337: if (test.equals("x")) {
338: hasX = true;
339: return true;
340: } else if (test.equals("")) {
341: return false;
342: } else {
343: int v = Integer.decode(test).intValue();
344: // Hm. How did that binary arithmatic go?
345: log
346: .debug("dumb string conversion: 0x"
347: + Integer
348: .toHexString(lithmus[offset] & 0x000000ff)
349: + Integer
350: .toHexString(lithmus[offset + 1] & 0x000000ff));
351:
352: switch (testComparator) {
353: case '=':
354: log.debug(Integer.toHexString(v) + " = "
355: + Integer.toHexString(found));
356: return v == found;
357: case '>':
358: hasX = true;
359: return found > v;
360: case '<':
361: hasX = true;
362: return found < v;
363: default:
364: return false;
365: }
366: }
367: }
368:
369: /**
370: * Test whether a long matches
371: */
372: protected boolean testLong(byte[] lithmus, int endian) {
373: log.debug("testing " + label[endian] + " long for " + rawinput);
374: long found = 0;
375: try {
376: if (endian == BIG_ENDIAN) {
377: found = byteArrayToLong(new byte[] { lithmus[offset],
378: lithmus[offset + 1], lithmus[offset + 2],
379: lithmus[offset + 3] });
380: } else if (endian == LITTLE_ENDIAN) {
381: found = byteArrayToLong(new byte[] {
382: lithmus[offset + 3], lithmus[offset + 2],
383: lithmus[offset + 1], lithmus[offset] });
384: }
385: } catch (ArrayIndexOutOfBoundsException e) {
386: if (!message.equals("")) {
387: log.error("Failed to test " + label[endian]
388: + " long for " + message);
389: } else {
390: log.error("Failed to test " + label[endian] + " long:");
391: }
392: log.error("Offset out of bounds: " + offset
393: + " while max is " /*+BUFSIZE*/);
394: return false;
395: }
396: xInt = (int) found;
397: // If it really is a long, we wouldn't want to know about it
398:
399: if (test.equals("x")) {
400: hasX = true;
401: return true;
402: } else if (test.equals("")) {
403: return false;
404: } else {
405: long v = Long.decode(test).longValue();
406:
407: // Hm. How did that binary arithmatic go?
408:
409: switch (testComparator) {
410: case '=':
411: log.debug("checking " + label[endian] + " long: "
412: + Long.toHexString(v) + " = "
413: + Long.toHexString(found));
414: return v == found;
415: case '>':
416: hasX = true;
417: return found > v;
418: case '<':
419: hasX = true;
420: return found < v;
421: default:
422: return false;
423: }
424: }
425: }
426:
427: /**
428: * Test whether a byte matches
429: */
430: protected boolean testByte(byte[] lithmus) {
431: log.debug("testing byte for " + rawinput);
432: if (test.equals("x")) {
433: hasX = true;
434: xInt = lithmus[offset];
435: xChar = (char) lithmus[offset];
436: xString = "" + xChar;
437: return true;
438: } else if (test.equals("")) {
439: return false;
440: } else {
441: byte b = (byte) Integer.decode(test).intValue();
442: switch (testComparator) {
443: // DOES THIS MAKE ANY SENSE AT ALL!!
444: case '=':
445: return b == lithmus[offset];
446: case '&':
447: // All bits in the test byte should be set in the found byte
448: //log.debug("byte test as string = '"+test+"'");
449: byte filter = (byte) (lithmus[offset] & b);
450: //log.debug("lithmus = "+lithmus[offset]+"; test = "+b+"; filter = "+filter);
451: return filter == b;
452: default:
453: return false;
454: }
455: }
456: }
457:
458: /**
459: * @return Original unprocessed input line
460: * @since MMBase-1.7
461: */
462: public String getRawInput() {
463: return rawinput;
464: }
465:
466: protected String xmlEntities(String s) {
467: StringBuffer res = new StringBuffer();
468: for (int i = 0; i < s.length(); i++) {
469: char c = s.charAt(i);
470: switch (c) {
471: case '>':
472: res.append(">");
473: break;
474: case '<':
475: res.append("<");
476: break;
477: case '&':
478: res.append("&");
479: break;
480: default:
481: // Convert all characters not in the allowed XML character set
482: int n = c;
483: /* -- below is actual xml standard definition of allowed characters
484: if (n == 0x9 || n == 0xA || n == 0xD || (n >= 0x20 && n <= 0xD7FF) || (n >= 0xE000 && n <= 0xFFFD) ||
485: (n >= 0x10000 && n <= 0x10FFFF)) {
486: */
487: if (n == 0x9 || n == 0xA || n == 0xD
488: || (n >= 0x20 && n < 128)) {
489: res.append(c);
490: } else {
491: // octal representation of number; pad with zeros
492: String oct = Integer.toOctalString(n);
493: res.append("\\");
494: for (int j = 3; j > oct.length(); j--) {
495: res.append("0");
496: }
497: res.append(oct);
498: }
499: }
500: }
501: return res.toString();
502: }
503:
504: /**
505: * XML notatie:
506: * <detector>
507: * <mimetype>foo/bar</mimetype>
508: * <extension>bar</extension>
509: * <designation>blablabla</designation>
510: * <test offset="bla" type="bla" comparator="=">test string</test>
511: * <childlist>
512: * <detector>etc</detector>
513: * </childlist>
514: * </detector>
515: *
516: */
517: public void toXML(FileWriter f) throws IOException {
518: toXML(f, 0);
519: }
520:
521: /**
522: * @param level Indicates depth of (child) element
523: */
524: public void toXML(FileWriter f, int level) throws IOException {
525: StringBuffer s = new StringBuffer();
526: String comparatorEntity;
527:
528: char[] pad;
529: if (level > 0) {
530: pad = new char[level * 4];
531: for (int i = 0; i < level * 4; i++) {
532: pad[i] = ' ';
533: }
534: } else {
535: pad = new char[] {};
536: }
537: String padStr = new String(pad);
538:
539: if (testComparator == '>') {
540: comparatorEntity = ">";
541: } else if (testComparator == '<') {
542: comparatorEntity = "<";
543: } else if (testComparator == '&') {
544: comparatorEntity = "&";
545: } else {
546: comparatorEntity = "" + testComparator;
547: }
548: s.append(padStr + "<detector>\n" + padStr + " <mimetype>"
549: + getMimeType() + "</mimetype>\n" + padStr
550: + " <extension>" + getExtension() + "</extension>\n"
551: + padStr + " <designation>" + xmlEntities(message)
552: + "</designation>\n" + padStr + " <test offset=\""
553: + offset + "\" type=\"" + type + "\" comparator=\""
554: + comparatorEntity + "\">" + xmlEntities(test)
555: + "</test>\n");
556: f.write(s.toString());
557: if (childList.size() > 0) {
558: f.write(padStr + " <childlist>\n");
559: for (Detector detector : childList) {
560: detector.toXML(f, level + 1);
561: }
562: f.write(padStr + " </childlist>\n");
563: }
564: f.write(padStr + "</detector>\n");
565:
566: }
567:
568: /**
569: * @return String representation of Detector object.
570: */
571: public String toString() {
572: if (!valid) {
573: return "parse error";
574: } else {
575: StringBuffer res = new StringBuffer("[" + offset + "] {"
576: + type);
577: if (typeAND != null) {
578: res.append("[" + typeAND + "]");
579: }
580: res.append("} " + testComparator + "(" + test + ") "
581: + message);
582: if (childList.size() > 0) {
583: res.append("\n");
584: for (int i = 0; i < childList.size(); i++) {
585: res.append("> ")
586: .append(childList.get(i).toString());
587: }
588: }
589: return res.toString();
590: }
591: }
592: }
|