#! /usr/bin/env python
#
# Part of the A-A-P project: File type detection module
# Copyright (C) 2002-2003 Stichting NLnet Labs
# Permission to copy and use this file is specified in the file COPYING.
# If this file is missing you can find it here: http://www.a-a-p.org/COPYING
# This module detects the type of a file.
# It can be run as a separate program or called from Python.
# Many types are recognized by default. More types can be added dynamically.
# See the Aap reference manual for an explanation.
#
#
# EXTERNAL INTERFACE:
#
# ft_detect(fname [, ignore] [, recdict])
# Detects the type of file "fname".
#
# ft_check_dir(dir [, errmsg] [, recdict])
# Scan directory "dir" for "*.afd" files, which are
# loaded with ft_read_file().
#
# ft_read_file(fname [, recdict])
# Read file "fname" for detection rules.
#
# ft_add_rules(str, lnum [, recdict])
# Add file type detection rules from "str". See
# the Aap reference manual for the syntax.
#
# ft_known(type) Returns True if "type" is a known filetype, False
# otherwise.
#
# ft_declare(type) Declare "type" to be a known filetype.
#
import string
import os.path
import sys
import import_re # import the re module in a special way
import glob
from Util import *
# Snapshot of this module's globals taken right after the imports above, so
# that the imported names can be used when executing Python snippets.
# The copy is made before any of the tables and functions below are defined,
# so it does not contain them.
exec_recdict = globals().copy()
# Set to non-zero when run as a program.  ft_check_dir() reads it to decide
# between printing an error and reporting it through Message.msg_error().
_run_as_program = 0
#
# The default list of detected file types by suffix.
#
# Each entry is a (suffix, filetype) pair; _add_suffixlist() copies the pairs
# into _suffix_dict and registers every filetype in _filetype_dict.
# Suffixes are case sensitive and may contain dots ("tar.gz").  The filetype
# "ignore" means the suffix is stripped and detection continues on the rest
# of the name (see _add_suffix()).
_def_suffix_list = [
    ("aap", "aap"),
    ("abc", "abc"),
    ("abl", "abel"),
    ("wrm", "acedb"),
    ("ada", "ada"),
    ("adb", "ada"),
    ("ads", "ada"),
    ("afd", "afd"),
    ("tdf", "ahdl"),
    ("aml", "aml"),
    ("run", "ampl"),
    ("a", "asm"),
    ("asm", "asm"),
    ("lst", "asm"),
    ("mac", "asm"),
    ("s", "asm"),
    ("asn", "asn"),
    ("asn1", "asn"),
    ("asa", "aspvbs"),
    ("as", "atlas"),
    ("atl", "atlas"),
    ("ave", "ave"),
    ("awk", "awk"),
    ("imp", "b"),
    ("mch", "b"),
    ("ref", "b"),
    ("bc", "bc"),
    ("bdf", "bdf"),
    ("bib", "bib"),
    ("bl", "blank"),
    ("btm", "btm"),
    ("c", "c"),
    ("cdl", "cdl"),
    ("cfi", "cf"),
    ("cfm", "cf"),
    ("chs", "chaskell"),
    ("eni", "cl"),
    ("dcl", "clean"),
    ("icl", "clean"),
    ("prg", "clipper"),
    ("cbl", "cobol"),
    ("cob", "cobol"),
    ("cpy", "cobol"),
    ("c++", "cpp"),
    ("cc", "cpp"),
    ("cpp", "cpp"),
    ("cxx", "cpp"),
    ("h", "cpp"),
    ("hh", "cpp"),
    ("hpp", "cpp"),
    ("hxx", "cpp"),
    ("inl", "cpp"),
    ("tcc", "cpp"),
    ("cs", "cs"),
    ("csc", "csc"),
    ("csh", "csh"),
    ("tcsh", "csh"),
    ("csp", "csp"),
    ("fdr", "csp"),
    ("css", "css"),
    ("con", "cterm"),
    ("pld", "cupl"),
    ("si", "cuplsim"),
    ("cyn", "cynpp"),
    ("d", "d"),
    ("def", "def"),
    ("desc", "desc"),
    ("diff", "diff"),
    ("patch", "diff"),
    ("rej", "diff"),
    ("bat", "dosbatch"),
    ("cmd", "dosbatch"),
    ("sys", "dosbatch"),
    ("ini", "dosini"),
    ("dot", "dot"),
    ("drac", "dracula"),
    ("drc", "dracula"),
    ("dsl", "dsl"),
    ("dtd", "dtd"),
    ("dylan", "dylan"),
    ("intr", "dylanintr"),
    ("lid", "dylanlid"),
    ("ecd", "ecd"),
    ("am", "elf"),
    ("erl", "erlang"),
    ("EC", "esqlc"),
    ("ec", "esqlc"),
    ("exp", "expect"),
    ("4gh", "fgl"),
    ("4gl", "fgl"),
    ("m4gl", "fgl"),
    ("fex", "focexec"),
    ("focexec", "focexec"),
    ("fs", "forth"),
    ("ft", "forth"),
    ("F", "fortran"),
    ("f", "fortran"),
    ("f77", "fortran"),
    ("f90", "fortran"),
    ("f95", "fortran"),
    ("for", "fortran"),
    ("fpp", "fortran"),
    ("ftn", "fortran"),
    ("gdmo", "gdmo"),
    ("mo", "gdmo"),
    ("ged", "gedcom"),
    ("gif", "gif"),
    ("gpi", "gnuplot"),
    ("gp", "gp"),
    ("gsp", "gsp"),
    ("hs", "haskell"),
    ("hb", "hb"),
    ("errsum", "hercules"),
    ("ev", "hercules"),
    ("rs", "hercules"),
    ("sum", "hercules"),
    ("vc", "hercules"),
    ("h32", "hex"),
    ("hex", "hex"),
    ("hog", "hog"),
    ("rules", "hog"),
    ("htm", "html"),
    ("html", "html"),
    ("shtml", "html"),
    ("html.m4", "htmlm4"),
    ("icn", "icon"),
    ("idl", "idl"),
    ("Z", "ignore"),
    ("bak", "ignore"),
    ("bz2", "ignore"),
    ("gz", "ignore"),
    ("in", "ignore"),
    ("new", "ignore"),
    ("old", "ignore"),
    ("orig", "ignore"),
    # RPM leaves "<name>.rpmnew" and "<name>.rpmsave" files behind.
    ("rpmnew", "ignore"),
    ("rpmsave", "ignore"),
    ("indent.pro", "indent"),
    ("INF", "inform"),
    ("inf", "inform"),
    ("iss", "iss"),
    ("ist", "ist"),
    ("mst", "ist"),
    ("jpl", "jam"),
    ("jpr", "jam"),
    ("jav", "java"),
    ("java", "java"),
    ("jj", "javacc"),
    ("jjt", "javacc"),
    ("javascript", "javascript"),
    ("js", "javascript"),
    ("clp", "jess"),
    ("jgr", "jgraph"),
    ("jpg", "jpeg"),
    ("png", "png"),
    ("properties", "jproperties"),
    ("jsp", "jsp"),
    ("kix", "kix"),
    ("ks", "kscript"),
    ("k", "kwt"),
    ("ACE", "lace"),
    ("ace", "lace"),
    ("latte", "latte"),
    ("lte", "latte"),
    ("l", "lex"),
    ("lex", "lex"),
    ("lhs", "lhaskell"),
    ("ll", "lexpp"),
    ("cl", "lisp"),
    ("el", "lisp"),
    ("jl", "lisp"),
    ("lisp", "lisp"),
    ("lsp", "lisp"),
    ("lite", "lite"),
    ("lt", "lite"),
    ("lgt", "logtalk"),
    ("lot", "lotos"),
    ("lotos", "lotos"),
    ("lou", "lout"),
    ("lout", "lout"),
    ("sig", "lprolog"),
    ("lss", "lss"),
    ("lua", "lua"),
    ("mc", "m4"),
    ("eml", "mail"),
    ("dsp", "make"),
    ("mak", "make"),
    ("mk", "make"),
    ("man", "man"),
    ("mpl", "maple"),
    ("mv", "maple"),
    ("mws", "maple"),
    ("mason", "mason"),
    ("mhtml", "mason"),
    ("mel", "mel"),
    ("mf", "mf"),
    ("mgp", "mgp"),
    ("mib", "mib"),
    ("mms", "mmix"),
    ("moc", "moc"),
    ("DEF", "modula2"),
    ("MOD", "modula2"),
    ("m2", "modula2"),
    ("md", "modula2"),
    ("mi", "modula2"),
    ("i3", "modula3"),
    ("ig", "modula3"),
    ("m3", "modula3"),
    ("mg", "modula3"),
    ("isc", "monk"),
    ("monk", "monk"),
    ("ssc", "monk"),
    ("tsc", "monk"),
    ("moo", "moo"),
    ("mp", "mp"),
    ("msql", "msql"),
    ("mush", "mush"),
    ("mysql", "mysql"),
    # NOTE(review): these keys carry a leading dot, unlike every other
    # entry in this table -- confirm against the suffix lookup code.
    (".NSA", "natural"),
    (".NSC", "natural"),
    (".NSG", "natural"),
    (".NSL", "natural"),
    (".NSM", "natural"),
    (".NSN", "natural"),
    (".NSP", "natural"),
    (".NSS", "natural"),
    ("ncf", "ncf"),
    ("nqc", "nqc"),
    ("OPL", "opl"),
    ("OPl", "opl"),
    ("Opl", "opl"),
    ("dpr", "pascal"),
    ("g", "pccts"),
    ("inc", "php"),
    ("ml", "ocaml"),
    ("mli", "ocaml"),
    ("mll", "ocaml"),
    ("mly", "ocaml"),
    ("mm", "nroff"),
    ("nr", "nroff"),
    ("nsi", "nsis"),
    ("o", "object"),
    ("obj", "object"),
    ("opl", "opl"),
    ("or", "openroad"),
    ("ora", "ora"),
    ("papp", "papp"),
    ("pas", "pascal"),
    ("php", "php"),
    ("pl", "perl"),
    ("pxml", "papp"),
    ("pxsl", "papp"),
    ("roff", "nroff"),
    ("sho", "dllobject"),
    ("sob", "dllobject"),
    ("tr", "nroff"),
    ("xin", "omnimark"),
    ("xom", "omnimark"),
    ("php3", "php"),
    ("phtml", "phtml"),
    ("lpc", "pike"),
    ("pike", "pike"),
    ("pmod", "pike"),
    ("ulpc", "pike"),
    ("rcp", "pilrc"),
    ("p36", "plm"),
    ("pac", "plm"),
    ("plm", "plm"),
    ("plp", "plp"),
    ("pls", "plsql"),
    ("plsql", "plsql"),
    ("po", "po"),
    ("pod", "pod"),
    ("eps", "postscript"),
    ("ps", "postscript"),
    ("pov", "pov"),
    ("ppd", "ppd"),
    ("ih", "ppwiz"),
    ("it", "ppwiz"),
    ("pdb", "prolog"),
    ("psf", "psf"),
    ("py", "python"),
    ("mat", "radiance"),
    ("rad", "radiance"),
    ("rc", "rc"),
    ("rex", "rexx"),
    ("rexx", "rexx"),
    ("x", "rpcgen"),
    ("rpl", "rpl"),
    ("rtf", "rtf"),
    ("rb", "ruby"),
    ("rbw", "ruby"),
    ("sas", "sas"),
    ("sa", "sather"),
    ("scm", "scheme"),
    ("sci", "scilab"),
    ("pdl", "sdl"),
    ("pr", "sdl"),
    ("sed", "sed"),
    ("sgm", "sgml"),
    ("sgml", "sgml"),
    ("bash", "sh"),
    ("ebuild", "sh"),
    ("env", "sh"),
    ("ksh", "sh"),
    ("sh", "sh"),
    ("sim", "simula"),
    ("s85", "sinda"),
    ("sin", "sinda"),
    ("il", "skill"),
    ("sl", "slang"),
    ("score", "slrnsc"),
    ("tpl", "smarty"),
    ("smith", "smith"),
    ("smt", "smith"),
    ("sml", "sml"),
    ("sno", "snobol4"),
    ("spec", "spec"),
    ("sp", "spice"),
    ("spice", "spice"),
    ("spd", "spup"),
    ("spdata", "spup"),
    ("speedup", "spup"),
    ("pkb", "sql"),
    ("pks", "sql"),
    ("sql", "sql"),
    ("tyb", "sql"),
    ("tyc", "sql"),
    ("typ", "sql"),
    ("sqlj", "sqlj"),
    ("sqi", "sqr"),
    ("sqr", "sqr"),
    ("s19", "srec"),
    ("s28", "srec"),
    ("s37", "srec"),
    ("cls", "st"),
    ("st", "st"),
    ("stp", "stp"),
    ("tak", "tak"),
    ("itcl", "tcl"),
    ("itk", "tcl"),
    ("tar", "tar"),
    ("tar.bz2", "tarbz2"),
    ("tar.gz", "targz"),
    ("tgz", "targz"),
    ("tcl", "tcl"),
    ("tk", "tcl"),
    ("ti", "terminfo"),
    ("dtx", "tex"),
    ("latex", "tex"),
    ("ltx", "tex"),
    ("sty", "tex"),
    ("tex", "tex"),
    ("texi", "texinfo"),
    ("texinfo", "texinfo"),
    ("txi", "texinfo"),
    ("tf", "tf"),
    ("t.html", "tilde"),
    ("tli", "tli"),
    ("slt", "tsalt"),
    ("tsscl", "tsscl"),
    ("tssgm", "tssgm"),
    ("tssop", "tssop"),
    ("uc", "uc"),
    ("ui", "ui"),
    ("uil", "uil"),
    ("uit", "uil"),
    ("ctl", "vb"),
    ("dsm", "vb"),
    ("sba", "vb"),
    ("vbs", "vb"),
    ("v", "verilog"),
    ("hdl", "vhdl"),
    ("vbe", "vhdl"),
    ("vhd", "vhdl"),
    ("vhdl", "vhdl"),
    ("vst", "vhdl"),
    ("vim", "vim"),
    ("hw", "virata"),
    ("module", "virata"),
    ("pkg", "virata"),
    ("wrl", "vrml"),
    ("wm", "webmacro"),
    ("wbt", "winbatch"),
    ("wml", "wml"),
    ("doc", "word"),
    ("wsc", "wsh"),
    ("wsf", "wsh"),
    ("ad", "xdefaults"),
    ("msc", "xmath"),
    ("msf", "xmath"),
    ("xpm2", "xpm2"),
    ("xs", "xs"),
    ("xsd", "xsd"),
    ("xsl", "xslt"),
    ("y", "yacc"),
    ("yy", "yaccpp"),
    ("zip", "zip"),
    ("z8a", "z8a"),
    ]
#
# The default list of detected file types by regexp.
# The order matters here! Last item is checked first.
#
# Each entry is (regexp, filetype, tail).  _add_regexplist() hands every entry
# to _add_regexp(), which inserts it at the front of _regexp_list; that is why
# the last entry here ends up being checked first.  "tail" is stored on the
# _Ft_re object as "match tail of filename".
# NOTE(review): the matching itself happens outside this table; the patterns
# are written on the assumption that they are anchored at the start of the
# name they are applied to -- confirm against the detection code.
# Literal letters must not be preceded by a backslash: "\t", "\r", "\a", "\d"
# and "\w" are regexp escapes, not the letters themselves.
_def_regexp_list = [
    ("[cC]hange[lL]og", "changelog", 1),
    ("/var/named/", "bindzone", 0),
    ("crontab", "crontab", 1),
    (".*drac\\.", "dracula", 0),
    (".*fvwmrc", "fvwm", 0),
    (".*fvwm95", "fvwm", 0),
    (".*fvwm2rc", "fvwm", 0),
    ("\\.gtkrc", "gtkrc", 1),
    ("gtkrc", "gtkrc", 1),
    ("Prl.*\\.", "jam", 1),
    ("JAM.*\\.", "jam", 1),
    ("[mM]akefile", "make", 1),
    ("muttrc", "muttrc", 1),
    ("tmac\\.", "nroff", 1),
    (".*printcap", "printcap", 0),
    (".*termcap", "termcap", 0),
    (".*vimrc", "vim", 0),
    ("Xresources", "xdefaults", 1),
    (".*/app-defaults/", "xdefaults", 0),
    (".*/Xresources/", "xdefaults", 0),
    ("XF86Config", "xf86conf", 1),
    (".*xmodmap", "xmodmap", 0),
    ("zsh", "zsh", 1),
    ("zlog", "zsh", 1),
    ("xdm-config$", "xdefaults", 1),
    ("\\.Xresources$", "xdefaults", 1),
    ("\\.Xpdefaults$", "xdefaults", 1),
    ("\\.Xdefaults$", "xdefaults", 1),
    ("XF86Config$", "xf86conf", 1),
    ("cvs\\d+$", "cvs", 1),
    ("wvdial\\.conf$", "wvdial", 1),
    ("wgetrc$", "wget", 1),
    ("\\.wgetrc$", "wget", 1),
    ("vgrindefs$", "vgrindefs", 1),
    ("\\.viminfo", "viminfo", 1),
    ("_viminfo", "viminfo", 1),
    (".*\\.vhdl_[0-9]*$", "vhdl", 0),
    ("tidyrc$", "tidy", 1),
    ("\\.tidyrc$", "tidy", 1),
    ("texmf\\.cnf$", "texmf", 1),
    ("tags$", "tags", 1),
    ("squid\\.conf$", "squid", 1),
    ("vision\\.conf$", "hog", 1),
    ("snort\\.conf$", "hog", 1),
    ("\\.slrnrc", "slrnrc", 1),
    ("screenrc$", "screen", 1),
    ("\\.screenrc$", "screen", 1),
    ("\\.zcompdump", "zsh", 1),
    ("\\.zfbfmarks$", "zsh", 1),
    ("\\.zprofile$", "zsh", 1),
    ("\\.zlog", "zsh", 1),
    ("\\.zsh", "zsh", 1),
    ("csh\\.logout$", "csh", 1),
    ("csh\\.login$", "csh", 1),
    ("csh\\.cshrc$", "csh", 1),
    ("\\.alias", "csh", 1),
    ("\\.tcshrc", "csh", 1),
    ("\\.cshrc", "csh", 1),
    ("\\.login", "csh", 1),
    ("\\.profile", "sh", 1),
    ("/etc/profile", "sh", 0),
    ("\\.kshrc", "sh", 1),
    ("\\.bashrc", "sh", 1),
    ("bashrc", "sh", 1),
    ("bash\\.bashrc", "sh", 1),
    ("\\.bash_profile", "sh", 1),
    ("\\.bash_logout", "sh", 1),
    ("sgml\\.catalog", "catalog", 1),
    ("catalog$", "catalog", 1),
    ("sendmail\\.cf", "sendmail", 1),
    ("smb\\.conf", "samba", 1),
    ("robots.txt", "robots", 1),
    ("\\.reminders", "remind", 1),
    ("\\.inputrc$", "readline", 1),
    ("\\.ratpoisonrc$", "ratpoison", 1),
    ("ratpoisonrc$", "ratpoison", 1),
    ("\\.procmail$", "procmail", 1),
    ("\\.procmailrc$", "procmail", 1),
    (".*printcap$", "printcap", 0),
    (".*termcap$", "termcap", 0),
    ("\\.povrayrc$", "povini", 1),
    ("main.cf$", "pfmain", 1),
    ("\\.pinerc$", "pine", 1),
    ("pinerc$", "pine", 1),
    ("\\.muttrc", "muttrc", 1),
    ("\\.mutt/muttrc", "muttrc", 1),
    ("Muttrc$", "muttrc", 1),
    ("[mM]akefile$", "make", 1),
    ("GNUmakefile$", "make", 1),
    ("snd.\\d+$", "mail", 1),
    ("\\.letter$", "mail", 1),
    ("\\.letter\\.\\d+$", "mail", 1),
    ("\\.followup$", "mail", 1),
    ("\\.article$", "mail", 1),
    ("\\.article\\.\\d+$", "mail", 1),
    ("pico\\.\\d+$", "mail", 1),
    ("mutt-.*-\\d+$", "mail", 1),
    ("mutt\\w{6}$", "mail", 1),
    ("ae\\d+\\.txt$", "mail", 1),
    ("/tmp/SLRN[0-9A-Z.]+$", "mail", 0),
    ("\\.emacs$", "lisp", 1),
    ("\\.sawfishrc$", "lisp", 1),
    ("lilo.conf", "lilo", 1),
    ("lftp.conf$", "lftp", 1),
    ("\\.lftprc$", "lftp", 1),
    (".*lftp/rc$", "lftp", 0),
    (".*properties_..$", "jproperties", 0),
    (".*properties_.._..$", "jproperties", 0),
    (".*properties_.._.._.*$", "jproperties", 0),
    ("inittab$", "inittab", 1),
    ("\\.gtkrc$", "gtkrc", 1),
    ("gtkrc$", "gtkrc", 1),
    ("gkrellmrc_.$", "gkrellmrc", 1),
    ("gkrellmrc$", "gkrellmrc", 1),
    ("\\.gdbinit$", "gdb", 1),
    ("fstab$", "fstab", 1),
    ("auto.master$", "conf", 1),
    ("exports$", "exports", 1),
    ("filter-rules$", "elmfilt", 1),
    (".*lvs$", "dracula", 0),
    (".*lpe$", "dracula", 0),
    ("debian/control$", "debcontrol", 1),
    (".*\\.\\.ch$", "ch", 0),
    ("named\\.conf$", "named", 1),
    ("named\\.root$", "bindzone", 1),
    ("build\\.xml$", "ant", 1),
    (".*vimrc$", "vim", 0),
    (".*exrc$", "vim", 0),
    ("configure$", "sh", 1),
    ("configure.ac$", "config", 1),
    (".*COPYING$", "text", 0),
    (".*README$", "text", 0),
    (".*read.me$", "text", 0),
    ("proftpd\\.conf", "apachestyle", 1),
    ("httpd\\.conf", "apache", 1),
    ("srm\\.conf", "apache", 1),
    ("access\\.conf", "apache", 1),
    ("apache\\.conf", "apache", 1),
    ("\\.htaccess$", "apache", 1),
    (".*enlightenment/.*\\.cfg$", "c", 0),
    (".*Eterm/.*\\.cfg$", "eterm", 0),
    ("lynx\\.cfg$", "lynx", 1),
    (".*baseq[2-3]/.*\\.cfg$", "quake", 0),
    (".*id1/.*\\.cfg$", "quake", 0),
    (".*quake[1-3]/.*\\.cfg$", "quake", 0),
    ("crontab$", "crontab", 1),
    ]
#
# The default list of detected file types by script name.
#
# Each entry is (regexp, filetype).  _add_scriptlist() passes every entry to
# _add_script() with append == 0, so each one is inserted at the front of
# _script_list and the entries end up there in reverse order.
# The regexp is matched against the script name in the first line of the file
# (see _add_script()).
_def_script_list = [
    (".*\\bpython", "python"),
    (".*\\bperl", "perl"),
    (".*\\bphp", "php"),
    (".*\\bruby", "ruby"),
    (".*\\bbc\\b", "bc"),
    (".*\\bsed\\b", "sed"),
    (".*\\bocaml", "ocaml"),
    # No leading \b: also matches names that merely end in "awk" or "wml".
    (".*awk\\b", "awk"),
    (".*wml\\b", "wml"),
    (".*\\bksh\\b", "sh"),
    (".*\\bsh\\b", "sh"),
    (".*\\bbash", "sh"),
    # No leading \b: also matches "tcsh".
    (".*csh\\b", "csh"),
    (".*\\bzsh\\b", "zsh"),
    (".*\\btclsh\\b", "tcl"),
    (".*\\bwish\\b", "tcl"),
    (".*\\bexpectk\\b", "tcl"),
    (".*\\bitclsh\\b", "tcl"),
    (".*\\bitwish\\b", "tcl"),
    (".*\\bexpect\\b", "expect"),
    (".*\\bgnuplot\\b", "gnuplot"),
    # No leading \b: also matches names that merely end in "make".
    (".*make\\b", "make"),
    ]
#
# The default list of detected file types with Python code.
#
# Each entry is (suffixes, after, code):
#   suffixes  comma separated list of file name suffixes the code applies
#             to; an empty string puts no restriction on the suffix
#   after     presumably the "after" argument of _add_python(): non-zero
#             means the code runs after the suffix, regexp and script rules
#             -- the function consuming this table is not in view, verify
#   code      Python source; it starts with white space so that
#             _Ft_py.compile() prepends "if 1:" to make the indent legal
# The snippets read "fname", "fname_base" and "ignore" and report their
# result by assigning to "type".  They call string, re, xrange, ft_detect,
# cre_match and cre_search; NOTE(review): these are presumably supplied by
# the code that executes the snippets -- confirm there.
# Regexps are doubly escaped: once for this string literal and once for the
# string literal inside the snippet.
# Reading is deliberately best-effort: a short or unreadable file leaves the
# default type in place.
_def_python_list = [
    ("am", 0,
"""    # Use Python to avoid the .am suffix is recognized
    if string.lower(fname_base) == "makefile.am":
        type = "automake"
"""),
    ("bas,frm", 0,
"""    if string.lower(fname[-3:]) == "frm":
        type = "form"
    else:
        type = "basic"
    f = open(fname)
    l = ''
    try:
        for i in xrange(1,5):
            l = l + f.readline()
    except:
        pass
    f.close()
    if re.search("VB_Name|Begin VB\\\\.(Form|MDIForm|UserControl)", l, re.I):
        type = "vb"
"""),
    ("ch", 0,
"""    type = "ch"
    f = open(fname)
    try:
        for i in xrange(1,10):
            if f.readline()[0] == '@':
                type = "change"
                break
    except:
        pass
    f.close()
"""),
    ("e,E", 0,
"""    type = "eiffel"
    f = open(fname)
    try:
        for i in xrange(1,100):
            if cre_match("\\\\s*(<'|'>)\\\\s*$", f.readline()):
                type = "specman"
                break
    except:
        pass
    f.close()
"""),
    ("ent", 0,
"""    type = "dtd"
    f = open(fname)
    try:
        for i in xrange(1,6):
            l = f.readline()
            if cre_match("\\\\s*[#{]", l):
                type = "cl"
                break
            if not cre_match("\\\\s*$", l):
                break
    except:
        pass
    f.close()
"""),
    ("rul", 0,
"""    type = "diva"
    f = open(fname)
    try:
        for i in xrange(1,6):
            if string.find(f.readline(), "InstallShield") >= 0:
                type = "ishd"
                break
    except:
        pass
    f.close()
"""),
    ("com", 0,
"""    type = "dcl"
    f = open(fname)
    try:
        l1 = f.readline() + f.readline()
        l2 = f.readline() + f.readline()
        if (cre_search("\\\\$ORIGIN|\\\\$TTL|IN\\\\s*SOA", l1)
                or cre_search("BIND.*named", l1 + l2)):
            type = "dns"
    except:
        pass
    f.close()
"""),
    ("in", 0,
"""    # Use Python to avoid the .in suffix is recognized
    if fname_base == "configure.in":
        type = "config"
"""),
    ("m", 0,
"""    type = "matlab"
    f = open(fname)
    try:
        for i in xrange(1,10):
            l = f.readline()
            if cre_match("\\\\s*#(include|import)", l):
                type = "objc"
                break
            if cre_match("\\\\s*%", l):
                break
            if cre_match("\\\\s*\\\\(\\\\*", l):
                type = "mma"
                break
    except:
        pass
    f.close()
"""),
    ("mod", 0,
"""    type = "modsim3"
    f = open(fname)
    try:
        if cre_search("\\\\bmodule\\\\b", f.readline()):
            type = "lprolog"
    except:
        pass
    f.close()
"""),
    ("1,2,3,4,5,6,7,8,9,t,ms", 0,
"""    f = open(fname)
    found = 0
    try:
        for i in xrange(1,5):
            l = f.readline()
            if not l:
                break
            if l[0] == '.':
                type = "nroff"
                found = 1
                break
    except:
        pass
    f.close()
    if not found:
        if fname[-1] == 't':
            type = "tads"
        elif fname[-1] == 's':
            type = "xmath"
"""),
    ("pl", 0,
"""    type = "perl"
    f = open(fname)
    l = ''
    try:
        # Find the first non-blank line; stop at end of file.
        while 1:
            l = f.readline()
            if not l or not cre_match("\\\\s*$", l):
                break
    except:
        pass
    f.close()
    if (cre_search("\\\\bprolog\\\\b|:-", l)
            or cre_match("\\\\s*(%+(\\\\s|$)|/\\\\*)", l)):
        type = "prolog"
"""),
    ("pm", 0,
"""    type = "perl"
    f = open(fname)
    l = ''
    try:
        l = f.readline()
    except:
        pass
    f.close()
    if cre_search("XPM2", l):
        type = "xpm2"
    elif cre_search("XPM", l):
        type = "xpm"
"""),
    ("inc", 0,
"""    type = "php"
    f = open(fname)
    l = ''
    try:
        for i in xrange(1,3):
            l = l + f.readline()
    except:
        pass
    f.close()
    if cre_search("perlscript", l):
        type = "aspperl"
    elif cre_search("<%", l):
        type = "aspvbs"
    elif cre_search("<\\\\?", l):
        type = "php"
    else:
        type = "asm"	# could also be "pov", how to check?
"""),
    ("w", 0,
"""    type = "cweb"
    f = open(fname)
    try:
        if cre_search("&ANALYZE", f.readline()):
            type = "progress"
        else:
            f.readline()
            if cre_search("&GLOBAL-DEFINE", f.readline()):
                type = "progress"
    except:
        pass
    f.close()
"""),
    ("i", 0,
"""    type = "asm"
    f = open(fname)
    found = 0
    try:
        for i in xrange(1,10):
            l = f.readline()
            if l[0] == '*' or cre_match("\\\\s*;", l):
                found = 1
                break
            if not cre_match("\\\\s*$", l) or cre_match("/\\\\*", l):
                break
    except:
        pass
    f.close()
    if not found:
        type = "progress"
"""),
    ("p", 0,
"""    type = "pascal"
    f = open(fname)
    found = 0
    try:
        for i in xrange(1,10):
            l = f.readline()
            if cre_match("\\\\s*((program|procedure|function|const|type|var)\\\\b|{)", l):
                found = 1
                break
            if not cre_match("\\\\s*$", l) or cre_match("/\\\\*", l):
                break
    except:
        pass
    f.close()
    if not found:
        type = "progress"
"""),
    ("reg", 0,
"""    f = open(fname)
    try:
        if cre_match("REGEDIT[0-9]*\\\\s*$", f.readline()):
            type = "registry"
    except:
        pass
    f.close()
"""),
    ("r", 0,
"""    type = "rexx"
    f = open(fname)
    try:
        if cre_match("REBOL", f.readline()):
            type = "rebol"
    except:
        pass
    f.close()
"""),
    ("decl,dcl,dec", 0,
"""    f = open(fname)
    try:
        l = f.readline() + f.readline() + f.readline()
        if cre_match("<!SGML", l):
            type = "sgmldecl"
    except:
        pass
    f.close()
"""),
    ("smil", 0,
"""    type = "smil"
    f = open(fname)
    try:
        if cre_search("<\\\\?\\\\s*xml.*\\\\?>", f.readline()):
            type = "xml"
    except:
        pass
    f.close()
"""),
    ("smi", 0,
"""    type = "mib"
    f = open(fname)
    try:
        if cre_search("\\\\bsmil\\\\b", f.readline()):
            type = "smil"
    except:
        pass
    f.close()
"""),
    ("web", 0,
"""    type = "winbatch"
    f = open(fname)
    try:
        for i in xrange(0,5):
            if f.readline()[0] == '%':
                type = "web"
                break
    except:
        pass
    f.close()
"""),
    ("xpm", 0,
"""    type = "xpm"
    f = open(fname)
    try:
        if cre_search("XPM2", f.readline()):
            type = "xpm2"
    except:
        pass
    f.close()
"""),
    ("xml", 0,
"""    type = "xml"
"""),
    ("", 0,
"""    while 1:
        if fname == "INDEX" or fname == "INFO":
            f = open(fname)
            try:
                if cre_match("\\\\s*(distribution|installed_software|root|bundle|product)\\\\s*$", f.readline()):
                    type = "psf"
                    f.close()
                    break
            except:
                pass
            f.close()
        if string.find(fname, "jarg") >= 0:
            f = open(fname)
            try:
                for i in xrange(0,5):
                    if re.search("THIS IS THE JARGON FILE", f.readline(), re.I):
                        type = "jargon"
                        break
            except:
                pass
            f.close()
        break
"""),
    ("", 1,
"""    if ignore and fname[-1] == '~':
        type = ft_detect(fname[:-1], 1)
"""),
    ("", 1,
"""    f = open(fname)
    line1 = f.readline()
    lines = ['', line1, '', '', '', '']
    for i in xrange(2, 6):
        try:
            lines[i] = f.readline()
        except:
            break
    if line1 in (':\\n', ':'):
        type = "sh"
    elif cre_match("#(compdef|autoload)\\\\b", line1):
        type = "zsh"
    elif cre_match("From [a-zA-Z][a-zA-Z_0-9\\\\.=-]*(@[^ ]*)? .*[12][09]\\\\d\\\\d$", line1):
        type = "mail"
    elif cre_match("<[%&].*>", line1):
        type = "mason"
    elif cre_match('" *[vV]im$', line1):
        type = "vim"
    elif cre_match("\\\\*\\\\* LambdaMOO Database, Format Version", line1):
        type = "moo"
    elif (cre_match("diff\\\\b|Only in |\\\\d+(,\\\\d+)?[cda]\\\\d+\\\\b|# It was generated by makepatch |Index:\\\\s+\\\\S+$|==== //\\\\S+#\\\\d+", line1)
            or (cre_match("--- ", line1) and cre_match("\\\\+\\\\+\\\\+ ", lines[2]))
            or (cre_match("\\\\*\\\\*\\\\* ", line1) and cre_match("--- ", lines[2]))):
        type = "diff"
    elif cre_match("%!\\\\s*PS", line1):
        type = "postscript"
    elif (cre_match("\\\\s*dnl\\\\b", line1)
            or cre_match("\\\\s*dnl\\\\b", lines[2])
            or cre_match("\\\\s*dnl\\\\b", lines[3])
            or cre_match("\\\\s*dnl\\\\b", lines[4])
            or cre_match("\\\\s*dnl\\\\b", lines[5])):
        type = "m4"
    elif re.match(" *proc[nd] *$", line1, re.I):
        type = "sicad"
    elif cre_match("\\\\*\\\\*\\\\* Purify", line1):
        type = "purifylog"
    elif cre_search("<\\\\?\\\\s*xml.*\\\\?>", line1):
        type = "xml"
    elif cre_match("[0-9a-fA-F]{7}: [0-9a-fA-F]{2} [0-9a-fA-F]{2} [0-9a-fA-F]{2} [0-9a-fA-F]{2} ", line1):
        type = "xxd"
    elif cre_match("RCS file:", line1) or cre_match("RCS file:", lines[2]):
        type = "rcslog"
    elif cre_match("CVS:", lines[2]):
        type = "cvs"
    elif cre_match("SEND-PR:", line1):
        type = "sendpr"
    elif cre_match("SNNS network definition file", line1):
        type = "snnsnet"
    elif cre_match("SNNS pattern definition file", line1):
        type = "snnspat"
    elif cre_match("SNNS result file", line1):
        type = "snnsres"
    elif (cre_match("%.*?[Vv]irata", line1)
            or cre_match("%.*?[Vv]irata", lines[2])
            or cre_match("%.*?[Vv]irata", lines[3])
            or cre_match("%.*?[Vv]irata", lines[4])
            or cre_match("%.*?[Vv]irata", lines[5])):
        type = "virata"
    elif cre_match("[0-9]* *execve\\\\(", line1):
        type = "strace"
    elif (cre_search("K & K Associates", lines[4])
            or cre_search("TAK 2000", lines[2])):
        type = "takout"
    elif cre_search("S Y S T E M S I M P R O V E D ", lines[3]):
        type = "sindaout"
    # takcmp and sindacmp skipped
    elif (cre_search("\\\\$ORIGIN|\\\\$TTL|IN\\\\s*SOA", line1 + lines[2])
            or cre_search("BIND.*named", line1 + lines[2] + lines[3] + lines[4])):
        type = "dns"
    elif ((cre_search("\\\\|\\\\*{1,80}", line1)
                and cre_search("VRC ", lines[2]))
            or (cre_search("\\\\|\\\\*{1,80}", lines[2])
                and cre_search("VRC ", lines[3]))):
        type = "baan"
    elif cre_match("==\\\\d+== valgrind", line1):
        type = "valgrind"
    else:
        line = None
        for i in xrange(1,6):
            if not cre_match("\\\\? ", lines[i]):
                line = lines[i]
                break
        if not line:
            while 1:
                try:
                    l = f.readline()
                    if not cre_match("\\\\? ", l):
                        line = l
                        break
                except:
                    break
        if line and cre_match("Index:\\\\s+\\\\S+$", line):
            type = "diff"
    f.close()
"""),
    ("mas,master", 1,
"""    type = "master"
"""),
    ("m4", 1,
"""    type = "m4"
"""),
    ("me", 1,
"""    type = "nroff"
"""),
    ("txt", 1,
"""    type = "text"
"""),
    ("inp", 1,
"""    f = open(fname)
    try:
        l = f.readline()
        if l[0] == '*':
            type = "abaqus"
        else:
            for i in xrange(1, 500):
                if len(l) >= 19 and string.lower(l[:19]) == "header surface data":
                    type = "trasys"
                    break
                l = f.readline()
    except:
        pass
    f.close()
"""),
    ("asp", 1,
"""    type = "aspvbs"
    f = open(fname)
    l = ''
    try:
        l = f.readline()
        l = l + f.readline()
        l = l + f.readline()
    except:
        pass
    if string.find(string.lower(l), "perlscript") >= 0:
        type = "aspperl"
    f.close()
"""),
    ("cfg", 1,
"""    type = "cfg"
"""),
    ]
#
# The extra list of detected file types for case sensitive systems.
#
# Rules in the ft_add_rules() text format; __init__() feeds them to
# ft_add_rules() under the same os.name == "posix" condition, so the name is
# only ever read where it has been defined.
# The upper case suffixes are kept apart from the lower case entries of
# _def_suffix_list ("c" is C, "C" is C++).
if os.name == "posix":
    _case_detect_list = """
suffix L lisp
suffix C cpp
suffix H cpp
"""
# All of the tables below are (re)created by __init__(); the values assigned
# here only make the names exist before that first call.

# List of _Ft_py objects: Python code executed to detect file type.
# Used first.
_py_list_before = []
# Dictionary used to map file name extension to file type.
_suffix_dict = {}
# List of _Ft_re objects; a match of the RE with the file name defines the file
# type.
_regexp_list = []
# List of _Ft_re objects: a match of the RE with the script in the first line
# of the file defines the file type.
_script_list = []
# List of _Ft_py objects: Python code executed to detect file type.
# Used after everything else didn't detect the type.
_py_list_after = []
# The detected file types are cached.  This assumes the file type doesn't
# change while executing recipes.  Would this ever be false?
# Index in the list is "ignore".
_cache_dict = [{}, {}]
# Dictionary of known filetypes (only the keys are important)
_filetype_dict = {}
# List of types from the builtin python scripts.
# Generated with the following shell command:
#
# grep 'type[[:space:]]*=[[:space:]]*"' Filetype.py | \
# sed -e 's,""",,' | \
# sed -e 's,[^"]*",,' -e 's,".*,,' | \
# sort | uniq | \
# sed -e 's,^, ",' -e 's/$/",/'
#
# This is used to pre-populate _filetype_dict. Update this list if the list of
# builtin Python detected-types changes.
# The types set by the suffix, regexp and script tables do not need to be
# listed: _add_suffixlist(), _add_regexp() and _add_script() register those
# themselves.
_filetype_pre_list = [
    "abaqus",
    "asm",
    "aspperl",
    "aspvbs",
    "automake",
    "baan",
    "basic",
    "cfg",
    "ch",
    "change",
    "cl",
    "config",
    "cvs",
    "cweb",
    "dcl",
    "diff",
    "diva",
    "dns",
    "dtd",
    "eiffel",
    "form",
    "ishd",
    "jargon",
    "lprolog",
    "m4",
    "mail",
    "mason",
    "master",
    "matlab",
    "mib",
    "mma",
    "modsim3",
    "moo",
    "nroff",
    "objc",
    "pascal",
    "perl",
    "php",
    "postscript",
    "progress",
    "prolog",
    "psf",
    "purifylog",
    "rcslog",
    "rebol",
    "registry",
    "rexx",
    "sendpr",
    "sgmldecl",
    "sh",
    "sicad",
    "sindaout",
    "smil",
    "snnsnet",
    "snnspat",
    "snnsres",
    "specman",
    "strace",
    "tads",
    "takout",
    "text",
    "trasys",
    "valgrind",
    "vb",
    "vim",
    "virata",
    "web",
    "winbatch",
    "xmath",
    "xml",
    "xpm",
    "xpm2",
    "xxd",
    "zsh",
    # End of the list of grepped types.
    # The remainder of these types is internal to AAP and cannot be detected.
    "libobject",
    "ltobject"
    ]
_did_init = 0       # set to 1 by __init__() once it has started its work
def __init__():
    """Build the detection tables from the built-in rules and *.afd files.

    Only the first call does any work; later calls return at once."""
    global _did_init
    global _py_list_before, _py_list_after
    global _suffix_dict, _regexp_list, _script_list, _filetype_dict

    if _did_init:
        return
    # Raise the flag before loading anything: ft_add_rules() calls
    # __init__() again and must not start a second round.
    _did_init = 1

    # Start from empty tables.
    _py_list_before, _py_list_after = [], []
    _regexp_list, _script_list = [], []
    _suffix_dict, _filetype_dict = {}, {}

    # The built-in detection rules.
    _add_suffixlist(_def_suffix_list)
    _add_regexplist(_def_regexp_list)
    _add_scriptlist(_def_script_list)
    _add_pythonlist(_def_python_list)
    if os.name == "posix":
        ft_add_rules(_case_detect_list, 1)

    # Detection rules from the "afd" subdirectory of every default directory.
    for rule_root in default_dirs({}):
        ft_check_dir(os.path.join(rule_root, "afd"))

    # The filetypes that only the built-in Python snippets can produce.
    for known_type in _filetype_pre_list:
        _filetype_dict[known_type] = 1
class DetectError(Exception):
    """Raised when detection rules cannot be read, parsed or compiled.

    "args" is the error message.  It is handed to Exception so that str()
    of the exception gives the message back; without a message str() gives
    an empty string."""
    def __init__(self, args = None):
        # Do not assign to self.args directly: a string assigned there is
        # split into a tuple of characters and None is rejected.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)
def ft_known(type):
    """Tell whether "type" has been registered as a filetype.

    Returns a true value when it has, a false value otherwise."""
    # The default rules must be loaded before the lookup means anything.
    __init__()
    registry = _filetype_dict
    return registry.has_key(type)
def ft_declare(type):
    """Register "type" as a known filetype."""
    # Load the default rules first; the first __init__() call recreates
    # _filetype_dict and would otherwise drop this entry.
    __init__()
    _filetype_dict.update({type: 1})
def ft_check_dir(dir, errmsg = 0, recdict = None):
    """Load every "*.afd" file found in directory "dir" with ft_read_file().

    A DetectError from one file is reported and the remaining files are
    still read.
    When "dir" is not an existing directory nothing is loaded; an error is
    reported only when "errmsg" is non-zero.
    Errors are printed when run as a program, otherwise they are passed to
    msg_error() together with "recdict"."""
    if not os.path.isdir(dir):
        if errmsg:
            text = _('Directory does not exist: "%s"') % dir
            if not _run_as_program:
                from Message import msg_error
                msg_error(recdict, text)
            else:
                print(text)
        return

    pattern = os.path.join(dir, "*.afd")
    for rulefile in glob.glob(pattern):
        try:
            ft_read_file(rulefile, recdict)
        except DetectError:
            text = str(sys.exc_info()[1])
            if not _run_as_program:
                from Message import msg_error
                msg_error(recdict, text)
            else:
                print(text)
def ft_read_file(fname, recdict = None):
    """Read file "fname" for file type detection rules.

    The whole file is read and handed to ft_add_rules() with 1 as the
    number of the first line.
    Raises DetectError when the file cannot be opened or read; errors in
    the rules themselves come from ft_add_rules()."""
    try:
        fd = open(fname)
    except IOError:
        raise DetectError((_('Cannot open "%s": ') % fname)
                                                + str(sys.exc_info()[1]))
    # Nested try blocks: the file must be closed on a read error as well,
    # and old Python versions do not allow "except" and "finally" together.
    try:
        try:
            s = fd.read()
        except IOError:
            raise DetectError((_('Cannot read "%s": ') % fname)
                                                + str(sys.exc_info()[1]))
    finally:
        fd.close()
    ft_add_rules(s, 1, recdict)
def ft_add_rules(dtstr, recipe_line_nr, recdict = None):
    """Add file type detection rules from string "dtstr".
       "recipe_line_nr" is the first line number in a recipe, zero when not
       reading a recipe.

       Every non-empty line that is not a comment starts with the kind of
       item, followed by its arguments; the first argument may be quoted:
            declare type
            suffix  suffix type
            regexp  regexp type [tail] [append]
            script  regexp type [append]
            python  [suffix] [after] [append]
       A "python" item is followed by lines of Python code that are indented
       more than the "python" line itself.
       "recdict" is accepted but not used in this function.
       Raises DetectError for a missing quote, a missing or illegal argument,
       missing Python commands and an unknown kind of item."""
    # Always load the default rules first (skipped when done already).
    __init__()
    # Split the string into individual lines.
    lines = string.split(dtstr, '\n')
    # Loop over all the lines (may use more than one for python items).
    # Note: using skip_white() and skip_to_white() is avoided here for speed.
    line_idx = 0
    line_count = len(lines)
    while line_idx < line_count:
        line = lines[line_idx]
        # isolate first word: type of detection.
        items = string.split(line, None, 1)
        # ignore empty and comment lines
        if len(items) < 1 or items[0][0] == '#':
            line_idx = line_idx + 1
            continue
        itype = items[0]
        if len(items) < 2:
            rline = ''
        else:
            rline = items[1]
        rline_len = len(rline)
        # isolate first argument, which may be in quotes
        # "n" becomes the index in "rline" just after the first argument.
        astart = 0
        if astart < rline_len:
            if rline[astart] == '"' or rline[astart] == "'":
                quote = rline[astart]
                astart = astart + 1
                aend = astart
                while aend < rline_len and rline[aend] != quote:
                    aend = aend + 1
                if aend == rline_len:
                    raise DetectError, (_('Missing quote in line %d: "%s"')
                                       % (line_idx + recipe_line_nr, line))
                # skip over the closing quote
                n = aend + 1
            else:
                aend = astart
                while aend < rline_len and rline[aend] != ' ' and rline[aend] != '\t':
                    aend = aend + 1
                n = aend
            arg1 = rline[astart:aend]
        else:
            arg1 = ''
            n = rline_len
        # Isolate further arguments (no quotes!).
        # A superfluous argument is silently ignored (could be a comment).
        args = string.split(rline[n:])
        if len(args) >= 1:
            arg2 = args[0]
        else:
            arg2 = ''
        if len(args) >= 2:
            arg3 = args[1]
        else:
            arg3 = ''
        if len(args) >= 3:
            arg4 = args[2]
        else:
            arg4 = ''
        if ((itype in ["suffix", "regexp", "script"] and not arg2)
                or (itype == "declare" and not arg1)):
            raise DetectError, (_('Missing argument in line %d: "%s"')
                                       % (line_idx + recipe_line_nr, line))
        # Just declare a filetype
        if itype == "declare":
            _filetype_dict[arg1] = 1
        # Filetype from file suffix
        elif itype == "suffix":
            _add_suffix(arg1, arg2)
        # Filetype based on a regex match of the filename
        elif itype == "regexp":
            # "tail" and "append" may be given in either order
            _add_regexp(arg1, arg2, arg3 == "tail" or arg4 == "tail",
                                     arg3 == "append" or arg4 == "append")
        # Filetype based on checking the #! line for an interpreter
        elif itype == "script":
            _add_script(arg1, arg2, arg3 and arg3 == "append")
        # Magic python-based filetype detection
        elif itype == "python":
            append = 0
            after = 0
            suffix = None
            # The three optional arguments may come in any order; the one
            # that is not a keyword is the suffix list.
            for arg in [arg1, arg2, arg3]:
                if arg:
                    if arg == "append":
                        append = 1
                    elif arg == "after":
                        after = 1
                    elif not suffix:
                        suffix = arg
                    else:
                        raise DetectError, (
                                _('Illegal argument in line %d: "%s"')
                                       % (line_idx + recipe_line_nr, line))
            start_indent = get_indent(line)
            line_idx = line_idx + 1
            # line number of the first line of Python code, for messages
            start_line_idx = line_idx + recipe_line_nr
            cmds = ""
            # Collect the lines that follow until one is found that has text
            # and is not indented more than the "python" line.
            while line_idx < line_count:
                line = lines[line_idx]
                if get_indent(line) <= start_indent:
                    # Ignore empty and comment lines.
                    i = skip_white(line, 0)
                    if i < len(line) and line[i] != '#':
                        # The increment at the end of the outer loop undoes
                        # this, so that this line is parsed as an item.
                        line_idx = line_idx - 1  # this line has next item
                        break
                cmds = cmds + line + '\n'
                line_idx = line_idx + 1
            if not cmds:
                raise DetectError, (_('Python commands missing in line %d')
                                             % (line_idx + recipe_line_nr))
            _add_python(cmds, _("filetype detection; python code at line %d: ")
                                  % start_line_idx, after, append, suffix)
        else:
            raise DetectError, (
                    _('Illegal item "%s" in argument to ft_add_rules(): %s')
                                                           % (itype, line))
        line_idx = line_idx + 1
class _Ft_re:
    """Class used to store pairs of RE and file type."""
    def __init__(self, regexp, type, tail):
        self.re = regexp        # the regexp as a string
        self.type = type        # the file type for a match
        self.tail = tail        # match tail of filename
        self.cre = None         # the compiled regexp, set by comp()
    def comp(self):
        """Get the compiled regexp, cache the result.

        The regexp is compiled on the first call only; the compiled
        regexp is stored in self.cre and also returned.
        Raises DetectError when the regexp cannot be compiled."""
        if self.cre is None:
            try:
                self.cre = re.compile(self.re)
            except Exception:
                # Not StandardError: the error that re.compile() raises
                # for a bad regexp is not derived from it.
                raise DetectError(
                        (_('Error in filetype detection regexp "%s": ')
                                  % self.re) + str(sys.exc_info()[1]))
        return self.cre
class _Ft_py:
    """Holds one piece of Python code that detects a file type."""
    def __init__(self, code, suffix, error_msg):
        self.error_msg = error_msg  # text put in the message when the code
                                    # fails to compile
        self.suffix = suffix        # the list of required suffixes or None
        self.code = code            # the Python code as a string
        self.ccode = None           # the compiled code, set by compile()
    def compile(self):
        """Compile self.code into self.ccode, unless that was done already.

        Raises DetectError when the code cannot be compiled."""
        if self.ccode:
            return
        source = self.code
        # Indented code is only legal inside a block: wrap it in "if 1:".
        if source[0] in (' ', '\t'):
            source = "if 1:\n" + source
        try:
            self.ccode = compile(source, 'filetype detection rules', 'exec')
        except StandardError:
            raise DetectError((_('Error in Python code (%s): ')
                                  % self.error_msg) + str(sys.exc_info()[1]))
def _add_suffix(suf, type):
    """Map file name extension "suf" to filetype "type".

    The type "ignore" means the suffix is removed and further detection is
    done on the rest.
    The type "remove" is not stored: it deletes the mapping for "suf", when
    there is one."""
    if type != 'remove':
        _suffix_dict[suf] = type
        _filetype_dict[type] = 1
        return
    # Removing a suffix that was never added is not an error.
    try:
        del _suffix_dict[suf]
    except KeyError:
        pass
def _add_suffixlist(list):
    """Register all the (suffix, type) tuples in "list" as suffix rules.
    The pairs are stored as they are: a type of "remove" gets no special
    treatment here."""
    for pair in list:
        suffix, ftype = pair
        _suffix_dict[suffix] = ftype
        _filetype_dict[ftype] = 1
def _add_regexp(regexp, type, tail, append):
    """Add detection of "type" by matching the file name with Python regular
    expression "regexp".
    When "tail" is non-zero the rule is marked to match the tail of the file
    name.
    When append is non-zero, add to the end of the regexp rules, otherwise
    insert before the existing rules.
    When "type" is "remove" every existing detection for "regexp" is
    removed."""
    if type == 'remove':
        # Filter the list in place.  Calling remove() while looping over the
        # same list skips the item that follows each removed one, which left
        # rules behind when "regexp" was present more than once.
        _regexp_list[:] = [r for r in _regexp_list if r.re != regexp]
        return

    f = _Ft_re(regexp, type, tail)
    if append:
        _regexp_list.append(f)
    else:
        _regexp_list.insert(0, f)
    _filetype_dict[type] = 1
def _add_regexplist(list):
    """Register all the (regexp, type, tail) tuples in "list" as regexp
    rules, handing them to _add_regexp() one by one without appending."""
    for entry in list:
        pattern, ftype, tail = entry
        _add_regexp(pattern, ftype, tail, 0)
def _add_script(regexp, type, append):
    """Add detection of "type" by matching the script name in the first line of
    the file with Python regular expression "regexp".
    When append is non-zero, add to the end of the script rules, otherwise
    insert before the existing rules.
    When "type" is "remove" every existing detection for "regexp" is
    removed."""
    if type == 'remove':
        # Filter the list in place.  Calling remove() while looping over the
        # same list skips the item that follows each removed one, which left
        # rules behind when "regexp" was present more than once.
        _script_list[:] = [r for r in _script_list if r.re != regexp]
        return

    f = _Ft_re(regexp, type, 0)
    _filetype_dict[type] = 1
    if append:
        _script_list.append(f)
    else:
        _script_list.insert(0, f)
def _add_scriptlist(list):
    """Register all the (scriptname regexp, type) tuples in "list" as script
    rules, handing them to _add_script() one by one without appending."""
    for entry in list:
        pattern, ftype = entry
        _add_script(pattern, ftype, 0)
def _add_python(code, error_msg, after, append, suffix):
    """Add a detection rule that uses Python code "code".
    Each line in "code" must end in a '\n'.
    "error_msg" is printed when executing the code results in an error.
    A non-zero "after" puts the rule with the ones used after the suffix,
    regexp and script rules, otherwise with the ones used before them.
    A non-zero "append" puts the rule at the end of that list, otherwise at
    the start.
    "suffix" is a comma separated string of suffixes stored with the rule;
    when it is empty an empty list is stored."""
    required = []
    if suffix:
        required = string.split(suffix, ',')
    rule = _Ft_py(code, required, error_msg)

    if after:
        target = _py_list_after
    else:
        target = _py_list_before

    # Inserting at the length of the list is the same as appending.
    if append:
        position = len(target)
    else:
        position = 0
    target.insert(position, rule)
def _add_pythonlist(list):
    """Register all the (suffix, after, script) tuples in "list" as python
    rules.  They all get the same "default rule" error message and are not
    appended."""
    default_msg = _("default rule")
    for entry in list:
        suffix, after, script = entry
        _add_python(script, default_msg, after, 0, suffix)
def _exec_py(fname, item, ignore):
    """Run the code of "item", which was defined with _add_python(), for
    file "fname".
    The code can use "fname", "fname_base", "ft_detect" and "ignore".
    Returns what the code assigned to "type", None when it didn't assign
    anything.
    An IOError from the code is ignored, other errors raise DetectError."""
    # The one exec_recdict is used for all rules.  Refresh the items the code
    # may use and take out a "type" that an earlier rule left behind.
    env = exec_recdict
    env["fname"] = fname
    env["fname_base"] = os.path.basename(fname)
    env["ft_detect"] = ft_detect
    env["ignore"] = ignore
    try:
        del env["type"]
    except KeyError:
        pass

    item.compile()
    try:
        exec(item.ccode, env, env)
    except IOError:
        pass        # ignore errors for reading the file
    except StandardError:
        raise DetectError(_(item.error_msg) + str(sys.exc_info()[1]))

    return env.get("type")
def ft_detect(fname, ignore = 0, recdict = None):
    """Detect the file type for file "fname".
    When "ignore" is non-zero a suffix with type "ignore" is taken off and
    the remaining name is used for detection.  Otherwise "ignore" is returned
    for such a suffix.
    "recdict" is only passed on when calling ft_detect() recursively.
    The rules are tried in this order: early python rules, suffixes, regexps
    on the file name, regexps on the "#!" line, late python rules.
    The result is cached for each value of "ignore", which is used to index
    _cache_dict (NOTE: presumably only 0 and 1 are valid, check the callers).
    Returns the type as a string or None."""
    # On non-Posix systems we ignore case differences by making the name lower
    # case.  This is done before looking in the cache: results are stored
    # under the folded name, looking up the unfolded name would miss them.
    folded = fname_fold(fname)

    # return quickly when already detected before
    if _cache_dict[ignore].has_key(folded):
        return _cache_dict[ignore][folded]

    if os.path.isdir(fname):
        _cache_dict[ignore][folded] = "directory"
        return "directory"

    # Internationalisation inits: setlocale and gettext.
    i18n_init()

    # Initialize (will skip when done already)
    __init__()

    fname = folded
    bn = os.path.basename(fname)

    # The suffix used to select python rules is what follows the last dot in
    # the tail of the name.  Using the whole path would give "b/c" for
    # "a.b/c".  It is the same for the early and the late python rules.
    i = string.rfind(bn, ".")
    if i > 0:
        suffix = bn[i + 1:]
    else:
        suffix = ''

    # Do the early python code checks.  May first check if the suffix matches.
    for p in _py_list_before:
        if not p.suffix or suffix in p.suffix:
            atype = _exec_py(fname, p, ignore)
            if atype:
                _cache_dict[ignore][fname] = atype
                return atype

    # Try the extension, this is fastest.
    # When "fname" has several extensions, try with all of them first, then
    # try by removing the first ones:  "f.html.c": "html.c" then ".c".
    i = string.find(bn, ".")
    while i > 0 and i + 1 < len(bn):
        # Found a dot that's not the first or last character.
        if _suffix_dict.has_key(bn[i + 1:]):
            ft = _suffix_dict[bn[i + 1:]]
            if ft == "ignore" and ignore:
                # remove an ignored extension and detect with that
                ft = ft_detect(fname[:-(len(bn[i:]))], 1, recdict)
            _cache_dict[ignore][fname] = ft
            return ft
        i = string.find(bn, ".", i + 1)

    # match all defined REs with the file name.
    # TODO: handle "/" in RE and fname.
    for r in _regexp_list:
        if not r.cre:
            r.comp()
        if r.tail:
            if r.cre.match(bn):
                _cache_dict[ignore][fname] = r.type
                return r.type
        else:
            if r.cre.match(fname):
                _cache_dict[ignore][fname] = r.type
                return r.type

    # match all defined REs with the script name in the first line of the
    # file.
    try:
        f = open(fname)
        try:
            line = f.readline()
        finally:
            # Also close the file when reading the line failed.
            f.close()
    except EnvironmentError:
        # Errors for files that can't be read are ignored.
        pass
    else:
        if len(line) > 2 and line[:2] == "#!":
            # TODO: remove "env VAR=val" and script arguments from line
            text = line[2:]
            for r in _script_list:
                if not r.cre:
                    r.comp()
                if r.cre.match(text):
                    _cache_dict[ignore][fname] = r.type
                    return r.type

    # Do the late python code checks.  May first check if the suffix matches.
    for p in _py_list_after:
        if not p.suffix or suffix in p.suffix:
            atype = _exec_py(fname, p, ignore)
            if atype:
                _cache_dict[ignore][fname] = atype
                return atype

    _cache_dict[ignore][fname] = None
    return None
def filetype_root(ft):
    """Return the basic filetype of a user-defined filetype: the part of "ft"
    before the first underscore.
    Return None when "ft" has no underscore or starts with one."""
    pos = ft.find('_')
    if pos <= 0:
        return None
    return ft[:pos]
# When executed as a program, detect the type of the specified file.
if __name__ == '__main__':
    # Internationalisation inits: setlocale and gettext.
    i18n_init()

    # Each item is a dictionary with one entry: "dir" for a directory to scan
    # for rules, "file" for a rule file to read.  They are kept in the order
    # of the command line.
    items = []
    checkfile = None
    _run_as_program = 1

    # Check for any "-Idir", "-I dir", "-ffile" and "-f file" arguments.
    next_is_dir = 0
    next_is_file = 0
    for arg in sys.argv[1:]:
        if next_is_dir:
            # append(), not extend(): extending a list with a dictionary adds
            # the keys of the dictionary instead of the dictionary itself.
            items.append({"dir" : arg})
            next_is_dir = 0
        elif next_is_file:
            items.append({"file" : arg})
            next_is_file = 0
        elif arg[:2] == "-I":
            if len(arg) > 2:
                items.append({"dir" : arg[2:]})
            else:
                next_is_dir = 1
        elif arg[:2] == "-f":
            if len(arg) > 2:
                items.append({"file" : arg[2:]})
            else:
                next_is_file = 1
        else:
            if checkfile:
                print(_("Can only check one file"))
                sys.exit(1)
            checkfile = arg

    if next_is_dir:
        print(_("-I argument must be followed by a directory name"))
        sys.exit(1)
    if next_is_file:
        print(_("-f argument must be followed by a file name"))
        sys.exit(1)
    if not checkfile:
        print(_("Usage: %s [-I ruledir] [-f rulefile] filename") % sys.argv[0])
        sys.exit(1)

    # load the built-in default rules
    __init__()

    # Check specified directories for *.afd files and read specified files.
    for item in items:
        if item.has_key("dir"):
            ft_check_dir(item["dir"])
        else:
            try:
                ft_read_file(item["file"])
            except DetectError:
                print(sys.exc_info()[1])

    # Detect the type of the file given on the command line.  This is not
    # always the first argument, -I and -f arguments may come before it.
    try:
        ftype = ft_detect(checkfile)
        if ftype == "ignore":
            print("%s (ignored suffix)" % ft_detect(checkfile, 1))
        else:
            print(ftype)
    except DetectError:
        sys.stderr.write("Detection error: " + str(sys.exc_info()[1]) + "\n")
# vim: set sw=4 et sts=4 tw=79 fo+=l: