charfinder.py :  » IDE » PIDA » pida-0.6beta3 » pida » core » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » IDE » PIDA 
PIDA » pida 0.6beta3 » pida » core » charfinder.py
# -*- coding: utf-8 -*- 
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
"""
    Chartype detection

    :copyright: 2005-2008 by The PIDA Project
    :license: GPL 2 or later (see README/COPYING/LICENSE)
"""
import codecs
import re

# Set of mimetypes that are known to be text (checked in addition to the
# 'text/' prefix test done by detect_text and magic_test).
# Starts out empty; will be filled by the language service.
text_mime = set()

dumb_encodings = ["utf-8", "iso-8859-15", "windows-1252"]

def dumb_detect(stream, filename, mimetype):
    for encoding in dumb_encodings:
        try:
            codecs.open(filename, encoding=encoding).read()
            return encoding
        except UnicodeDecodeError:
            pass

# Encoding declaration in the form given by PEP 263: "coding", then ":" or
# "=", optional blanks and the codec name (letters, digits, "-", "_", ".").
# This also covers editor markers such as "fileencoding=latin-1".
PY_ENC = re.compile(r"coding[:=][ \t]*([-\w.]+)")


def python_detect(stream, filename, mimetype):
    """
    Return the encoding declared in a python source file

    Only the first two lines of the stream are read, as PEP 263 allows the
    declaration on line one or two only. Returns None when neither line
    carries a declaration or when no stream is given.

    @stream: fileobject to read the lines from
    @filename: absolute path (not used)
    @mimetype: mimetype (not used)
    """
    if stream is None:
        return None

    for _ in range(2):
        match = PY_ENC.search(stream.readline())
        if match:
            return match.group(1)

    return None

try:
    from pida.utils import magic
except (ImportError, AttributeError):
    # ImportError: the magic wrapper cannot be imported at all.
    # AttributeError: what this handler caught originally -- presumably
    # raised by the wrapper when the underlying library is missing
    # (TODO confirm against pida.utils.magic).
    print("can't load magic module")

    def magic_detect(*k):
        """Stand-in used without magic: never detects an encoding."""
        return None

    def magic_test(*k):
        """Stand-in used without magic: never decides text vs. binary."""
        return None
else:
    # names accepted as encodings without asking the codecs registry
    WELL_KNOWN = ['ISO-8859', 'ASCII', 'UTF-8', 'UTF-16LE', 'UTF-16BE',
                  'UTF-32BE', 'UTF-32LE']
    # names for which codecs.lookup already failed once
    FAILED = []

    def _magic_enc(type_):
        """
        Return type_ if it names a usable encoding, otherwise None

        Names in WELL_KNOWN are accepted as they are. Everything else is
        checked with codecs.lookup; names that fail are remembered in
        FAILED so that they are not looked up again.
        """
        if type_ in WELL_KNOWN:
            return type_
        if type_ in FAILED:
            return None
        try:
            # codecs offers no list of known encodings, so just try it
            codecs.lookup(type_)
        except LookupError:
            FAILED.append(type_)
            return None
        return type_

    def magic_detect(stream, filename, mimetype):
        """
        Ask magic for the encoding of a file or stream

        The file is preferred when a filename is given; otherwise the
        whole stream is read. Returns None unless magic reports a 'text/'
        mimetype and the first word of its description is an encoding
        accepted by _magic_enc.

        @stream: fileobject
        @filename: absolute path
        @mimetype: mimetype (not used)
        """
        if filename:
            mime = magic.Magic(mime=True).from_file(filename)
            if mime[:5] != 'text/':
                return None
            description = magic.Magic().from_file(filename)
        elif stream:
            # very bad: reads the complete stream into memory
            chunk = stream.read()
            mime = magic.Magic(mime=True).from_buffer(chunk)
            if mime[:5] != 'text/':
                return None
            description = magic.Magic().from_buffer(chunk)
        else:
            return None

        # magic often returns a too specific description whose first word
        # is not an encoding; _magic_enc filters those out
        words = description.split()
        if not words:
            return None
        return _magic_enc(words[0])

    def magic_test(stream, filename, mimetype):
        """
        Ask magic whether a file or stream is text

        Returns True when the mimetype reported by magic starts with
        'text/' or is listed in text_mime, False when it is not, and None
        when neither filename nor stream is given.

        @stream: fileobject
        @filename: absolute path
        @mimetype: mimetype (not used)
        """
        if filename:
            mime = magic.Magic(mime=True).from_file(filename)
        elif stream:
            # very bad: reads the complete stream into memory
            mime = magic.Magic(mime=True).from_buffer(stream.read())
        else:
            return None
        return mime[:5] == 'text/' or mime in text_mime


try:
    from chardet.universaldetector import UniversalDetector
except ImportError:
    # chardet is optional; without it this detector never finds anything
    chardet_sniff = lambda *k: None
else:
    def chardet_sniff(stream, filename, mimetype):
        """
        Let chardet guess the encoding of the stream content

        The stream is fed to a UniversalDetector piece by piece until it
        is exhausted or the detector reports that it is done.

        @stream: fileobject to read from
        @filename: absolute path (not used)
        @mimetype: mimetype (not used)
        """
        sniffer = UniversalDetector()
        while True:
            piece = stream.read(4086)
            if not piece or sniffer.done:
                break
            sniffer.feed(piece)

        sniffer.close()
        return sniffer.result["encoding"]

# Detectors for specific mimetypes, keyed by (major, minor) tuples.
mime_detectors = {
    ('text', 'x-python'): python_detect,
}


def detect_mime(stream, filename, mimetype):
    """
    Run the detector registered for the given mimetype, if there is one

    The mimetype is looked up in mime_detectors as passed in. A string
    such as 'text/x-python' is additionally looked up in its tuple form
    ('text', 'x-python'), which is how the registry is keyed. Returns the
    detector's result, or None when no detector is registered.

    @stream: fileobject
    @filename: absolute path
    @mimetype: mimetype, as (major, minor) tuple or 'major/minor' string
    """
    candidates = [mimetype]
    if hasattr(mimetype, 'split'):
        # string form: 'text/x-python' -> ('text', 'x-python')
        candidates.append(tuple(mimetype.split('/', 1)))

    for key in candidates:
        if key in mime_detectors:
            return mime_detectors[key](stream, filename, mimetype)

    return None

# Encoding detectors, tried in this order by detect_encoding(); the first
# one that returns something other than None wins.
detectors = [detect_mime, magic_detect, chardet_sniff, dumb_detect]
# Text testers, tried in this order by detect_text() before it falls back
# to dumb_detect.
text_detectors = [magic_test]

def detect_encoding(stream, filename, mimetype):
    """
    Detect and return the encoding of the given input

    The functions listed in detectors are asked one after another and the
    first answer that is not None is returned. The stream is rewound to
    its start after every attempt. 'ASCII' is returned when no detector
    finds anything.

    @stream: fileobject
    @filename: absolute path
    @mimetype: mimetype
    """
    for probe in detectors:
        found = probe(stream, filename, mimetype)
        # the probe may have read from the stream, so rewind it
        stream.seek(0)
        if found is None:
            continue
        return found

    # XXX: falling back to ASCII seems a bit insane
    return 'ASCII'

def detect_text(stream, filename, mimetype):
    """
    Detect whether the input is text; return True if it is, else False

    The given mimetype is checked first ('text/' prefix or listed in
    text_mime), then the testers in text_detectors are asked, and finally
    dumb_detect tries to decode the file. The stream, if any, is rewound
    after every step that may have read from it.

    @stream: fileobject (may be None)
    @filename: absolute path
    @mimetype: mimetype
    """
    if mimetype:
        if mimetype[:5] == 'text/' or mimetype in text_mime:
            return True

    for tester in text_detectors:
        verdict = tester(stream, filename, mimetype)
        # a tester may have consumed the stream; leave it rewound
        if stream is not None:
            stream.seek(0)
        if verdict is not None:
            return verdict

    # try the rest..
    # chardet_sniff is not used here, it finds libraries as text
    encoding = dumb_detect(stream, filename, mimetype)
    if stream is not None:
        stream.seek(0)
    return encoding is not None
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.