HtmlRead.py : » Development » SnapLogic » snaplogic » components » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Development » SnapLogic
SnapLogic » snaplogic » components » HtmlRead.py
# $SnapHashLicense:
# 
# SnapLogic - Open source data services
# 
# Copyright (C) 2009, SnapLogic, Inc.  All rights reserved.
# 
# See http://www.snaplogic.org for more information about
# the SnapLogic project. 
# 
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
# 
# "SnapLogic" is a trademark of SnapLogic, Inc.
# 
# 
# $

#     $Id: HtmlRead.py 10330 2009-12-24 22:13:38Z grisha $

"""
    HtmlRead Module and Resource Definition
"""

import os
from decimal import Decimal
from BeautifulSoup import *

import snaplogic.components.FileUtils as FileUtils
from snaplogic.common import snap_log
from snaplogic.common.snap_exceptions import *
from snaplogic.common.data_types import Record
from snaplogic.cc.component_api import ComponentAPI
import snaplogic.cc.prop as prop

from snaplogic.common.SnapReader import SnapReader
from snaplogic.common.SnapReader import SnapFtpReader
from snaplogic.common.SnapReader import SnapHttpReader
from snaplogic.common.SnapReader import SnapFileReader
from snaplogic.common.data_types import SnapNumber,SnapString
from snaplogic.common import version_info
from snaplogic.snapi_base import keys
from snaplogic.cc import component_api

# Public names
__all__ = [ "HtmlRead" ]


class HtmlRead(ComponentAPI):
    """
    This class implements the HTML Read component.

    It reads structured parts of an HTML document into data fields: either
    the rows of one <table> (selected by number), or the <td> cells selected
    by a CSS class name. Cell contents are written to the single output view,
    the cell at position N filling output field N.
    """
    api_version = '1.0'
    component_version = '1.2'

    # Capabilities for HtmlRead: no input views, exactly one output view.
    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT    : 0,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT    : 0,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_ALLOW_PASS_THROUGH        : False
    }
    
    component_description = "Reads structured parts of HTML"
    component_label = "HTML Reader"
    component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/HtmlReader" % \
                                                        version_info.doc_uri_version
 
    def validate_config_file(self):
        """
        Validate the entries of the component config file, if one is provided.

        Supported entries:
          root_directory: a way of specifying that all local files must be
                          read from the given directory. When a value is
                          given, the path must exist.
          schemes:        checked by FileUtils.validate_schemes against
                          FileUtils.reader_schemes.

        Raises SnapComponentConfigError when the root directory does not
        exist or when any other entry is present.
        """
        for key in self.config_dictionary:
            if key == "root_directory":
                root = self.config_dictionary[key]
                # An empty root_directory entry means no root is configured.
                # It is still a known key, so it must not be reported as an
                # unexpected entry.
                if root and not os.path.exists(root):
                    raise SnapComponentConfigError("The path specified for root (%s) is not valid" %
                                                   root)
            elif key == "schemes":
                # Make sure "schemes" contains only the schemes we support
                FileUtils.validate_schemes(self.config_dictionary.get("schemes"), FileUtils.reader_schemes)
            else:
                # No other config file param is supported.
                raise SnapComponentConfigError("Unexpected config file entry (%s) encountered" % key)

    def _input_url_prop(self):
        """
        Build the definition of the inputURL property.

        Shared by create_resource_template() and upgrade_1_0_to_1_1() so that
        both always install the same description text. A new property object
        is created on every call.
        """
        # Every example URI ends with "\n" so that each one is shown on its
        # own line; backslashes in the Windows path are escaped explicitly.
        return prop.SimpleProp("Input URL", SnapString, 
                  "The URI of the page to be read.  You can enter either a URI pointing to a HTTP or FTP server, or a local file path.  The following URI schemes are supported: file, http, https, ftp, file.\n"
                  "\n\nExamples of valid URIs:\n"
                  "http://www.server.com/dir/input.file\n"
                  "https://www.server.com/dir/input.file\n"
                  "/home/user/input.file\n"
                  "c:\\dir\\input.file\n"
                  "file:///c:/dir/input.file\n"
                  "ftp://ftp.server.com/dir/input.file\n"
                  , None, True)
    
    def create_resource_template(self):
        """
        Create HtmlRead resource definition template. It consists of
        
        inputURL:   The URL of the site to read
        username:   Credentials: username needed to read the file.
        password:   Credentials: password needed to read the file.
        table_num:  Table number (in order) to extract from the HTML
        skip_lines: Header lines (rows) to skip in that table
        class:      CSS class to extract (instead of a table number)
        """
        self.set_property_def("inputURL", self._input_url_prop())
        
        # Credentials are username and password
        self.set_property_def("username",
                              prop.SimpleProp("Credentials:  Username", SnapString, 
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_value("username", "")
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString, 
                                              "Password to use if credentials are needed for the accessing data", 
                                              {'obfuscate' : 0}))
        
        self.set_property_def('table_num',
                              prop.SimpleProp("Table number", SnapNumber, "Table to read",
                                              {"min_value": 1}))
        
        self.set_property_def('skip_lines',
                              prop.SimpleProp("Skip lines", SnapNumber, "Number of head table lines to skip",
                                              {"min_value": 0}))
        self.set_property_value('skip_lines', 0)
        
        self.set_property_def('class',
                              prop.SimpleProp("Class name", SnapString, "CSS class to read"))
        
        
    def suggest_resource_values(self, err_obj):
        """Suggest that "0" be used as skip_lines, if no value has been selected."""
        val = self.get_property_value("skip_lines")
        if not val:
            self.set_property_value("skip_lines", 0)
            
    def validate(self, err_obj):
        """
        Validate the resource; problems are recorded as messages on err_obj.

        Checks that 'table_num' and 'class' are not both set, that inputURL
        complies with the allowed URI schemes, and that every output view
        field is a SnapString or a SnapNumber.

        NB: all required properties are handled by the generic validation system
        """
        tablenum = self.get_property_value("table_num")
        classname = self.get_property_value("class")
        if tablenum and classname:
            message = "'Table number' and 'Class name' are mutually exclusive. Only one can be specified."
            err_obj.get_property_err("class").set_message(message)
            err_obj.get_property_err("table_num").set_message(message)
        
        # Validate that the filename complies with the allowed URI schemes,
        # unless it's specified via a parameter 
        FileUtils.validate_filename_property(self.get_property_value("inputURL"), "inputURL", err_obj, 
                                   self.config_dictionary.get("schemes"), FileUtils.reader_schemes)
        
        # Validate that field datatypes are of supported types
        views = self.list_output_view_names()
        view_name = views[keys.SINGLE_VIEW]
        view = self.get_output_view_def(view_name)

        supported_datatypes = (SnapString, SnapNumber)
        for i, field in enumerate(view[keys.VIEW_FIELDS]):
            field_type = field[keys.FIELD_TYPE]
            field_name = field[keys.FIELD_NAME]

            # For each field, check the datatype.
            # If we don't support this datatype add an error. 
            if field_type not in supported_datatypes:
                err_obj.get_output_view_err()[view_name][keys.VIEW_FIELDS][i].set_message(
                        "Output field '%s' datatype '%s' is not supported.  Must be one of: %s" %
                            (field_name, field_type, str(supported_datatypes)))

    def execute(self, input_views, output_views):
        """
        Execute the HtmlRead functionality of the component.

        Reads the document at inputURL, parses it with BeautifulSoup and
        writes records to the output view:
          - without a class name, one record per row of the selected table,
            after skipping the first skip_lines rows (an omitted table number
            selects the first table);
          - with a class name, a single record built from all cells returned
            by findAll('td', <class name>).

        Raises SnapComponentError when no output view is connected, when the
        URI scheme is rejected, or when the requested table is not present
        in the document.
        """
        try:
            self._output_view = output_views.values()[keys.SINGLE_VIEW] 
        except IndexError:
            raise SnapComponentError("No output view connected.")

        self._inputurl = self.get_property_value("inputURL")
        self._username = self.get_property_value("username")
        self._password = self.get_property_value("password")
        self._table_num = self.get_property_value("table_num")
        # A missing skip_lines value counts as 0, the same default that
        # suggest_resource_values() proposes, instead of failing in int().
        self._skip_lines = int(self.get_property_value("skip_lines") or 0)
        self._class = self.get_property_value("class")

        # Make sure the filename is always qualified
        self._inputurl = FileUtils.qualify_filename(self._inputurl)

        # Validate filename URI scheme
        error = FileUtils.validate_file_scheme(self._inputurl, self.config_dictionary.get("schemes"), FileUtils.reader_schemes)
        if error is not None:
            raise SnapComponentError(error)

        self._inputurl = FileUtils.get_file_location(self._inputurl, self.config_dictionary)
            
        self._rdr = SnapReader.create(self._inputurl, self._username, self._password, None, self.env)

        # Read the document and write records out.
        if self._rdr.scheme.startswith('http'):
            # NOTE(review): the result of open() is handed straight to
            # BeautifulSoup and the reader is never closed on this path;
            # confirm whether the http reader needs an explicit close().
            root = BeautifulSoup(self._rdr.open(), convertEntities='html')
        else:
            # Use Reader method to read the input and pass the data to BeautifulSoup.
            self._rdr.open()
            try:
                root = BeautifulSoup(self._rdr.read())
            finally:
                # Release the reader even when reading or parsing fails.
                self._rdr.close()

        tables = root.findAll('table')
        self.log(snap_log.LEVEL_DEBUG, "HtmlRead: found %s tables" % len(tables))

        if not self._class:
            self.log(snap_log.LEVEL_DEBUG, "HtmlRead: processing table %s " % self._table_num)
            if not self._table_num:
                # Be forgiving: assume omitted table number means the first table on the page
                tablenum = 0
            else:
                # Table numbers are 1-based in the resource definition.
                tablenum = int(self._table_num) - 1
            try:
                table = tables[tablenum]
            except IndexError:
                # Report a missing table as a component error with context
                # rather than letting a bare IndexError escape.
                raise SnapComponentError("Table number %s was requested, but the document contains %s table(s)."
                                         % (tablenum + 1, len(tables)))
            rows = table.findAll('tr')
            for row in rows[self._skip_lines:]:
                cells = row.findAll('td')
                self.processCells(cells)
        else:
            self.log(snap_log.LEVEL_DEBUG, "HtmlRead: processing class %s" % self._class)
            cells = root.findAll('td', self._class)
            self.processCells(cells)
        self._output_view.completed()


    def _cell_text(self, cell):
        """
        Return the text of one cell with surrounding whitespace stripped.

        A cell holding exactly one plain string yields that string; any other
        cell yields the concatenation of all of its text nodes, each stripped
        individually.
        """
        contents = cell.contents
        # The exact type comparison (not isinstance) is kept from the original
        # code, which had the isinstance form commented out -- presumably so
        # that NavigableString subclasses take the general path; TODO confirm.
        if len(contents) == 1 and type(contents[0]) == NavigableString:
            text = contents[0].strip()
            self.log(snap_log.LEVEL_DEBUG, "p1: %s " % text)
        else:
            text = ''.join(piece.strip() for piece in cell.findAll(text=True))
            self.log(snap_log.LEVEL_DEBUG, "p2: %s " % text)
        return text

    def processCells(self, cells):
        """
        Build one output record from a sequence of cells and write it out.

        The cell at position N fills output field N. For SnapNumber fields an
        empty cell is stored as None and any other text is converted with
        Decimal; for all other fields the text is stored as a string.

        Processing is best-effort: a cell that cannot be handled (no matching
        output field, text that is not a valid number, ...) is logged at
        debug level and skipped, and the record is written regardless.
        """
        # sys.exc_info() is used to obtain the exception for logging so that
        # no version-specific "except ... as" syntax is needed.
        import sys

        out_rec = self._output_view.create_record()
        self.log(snap_log.LEVEL_DEBUG, "HtmlRead: found %s cells" % len(cells))

        # enumerate() ties the field index to the cell position, so a cell
        # that fails does not shift the following cells into earlier fields.
        for i, cell in enumerate(cells):
            try:
                field_name = out_rec.field_names[i]
                field_type = self._output_view.field_types[i]
                self.log(snap_log.LEVEL_DEBUG, "field %s" % field_name)
                
                text = self._cell_text(cell)
                
                if field_type == SnapNumber:
                    # Numeric field
                    if not text:
                        out_rec[field_name] = None
                    else:
                        out_rec[field_name] = Decimal(text)
                else:
                    # String field
                    try: 
                        # Convert to unicode... 
                        value = unicode(text, 'utf-8') 
                    except (TypeError, UnicodeError): 
                        # ... which may fail because the string already is
                        # unicode, or because it is not valid UTF-8; in both
                        # cases the text is stored unchanged.
                        value = text 
                    out_rec[field_name] = value
            except Exception:
                self.log(snap_log.LEVEL_DEBUG, "HtmlRead: exception %s " % sys.exc_info()[1])

        self._output_view.write_record(out_rec)

    def upgrade_1_0_to_1_1(self):
        """ 
        Upgrade resource from version 1.0 to version 1.1.
        In version 1.0 credentials were stored as a single user:passwd string separated by colon.
        In version 1.1 it's stored as two separate properties, and password is obfuscated. 

        Also, change the description of the inputURL property.
        The description was changed in release 2.1.0.
        """
        # Old credentials were stored as user:password, split them into two variables
        credentials = self.get_property_value("credential")
        username = None
        password = None

        if credentials is not None:
            # Colons are allowed in passwords, so split it at the first colon
            cred_list = credentials.split(':', 1)
            # split() always returns at least one element: the username
            username = cred_list[0]
            if len(cred_list) == 2:
                # If there is a password, it's the second element of the list 
                password = cred_list[1]

        # Delete the old credentials property
        self.del_property_def("credential")
        
        # Create the new credentials properties
        self.set_property_def("username",
                              prop.SimpleProp("Credentials:  Username", SnapString, 
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString, 
                                              "Password to use if credentials are needed for the accessing data", 
                                              {'obfuscate' : 0}))

        # Set the new credentials properties
        self.set_property_value("username", username)
        self.set_property_value("password", password)
        
        # Recreate the inputURL property, keeping its current value
        filename = self.get_property_value("inputURL")
        self.del_property_def("inputURL")
        self.set_property_def("inputURL", self._input_url_prop())
        self.set_property_value("inputURL", filename)

    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade. It exists only so that the component doc URI is
        changed during the upgrade, which will be done by cc_info before
        calling this method.
        """
        pass
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.