# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: HtmlRead.py 10330 2009-12-24 22:13:38Z grisha $
"""
HtmlRead Module and Resource Definition
"""
import os
from decimal import Decimal
from BeautifulSoup import *
import snaplogic.components.FileUtils as FileUtils
from snaplogic.common import snap_log
from snaplogic.common.snap_exceptions import *
from snaplogic.common.data_types import Record
from snaplogic.cc.component_api import ComponentAPI
import snaplogic.cc.prop as prop
from snaplogic.common.SnapReader import SnapReader
from snaplogic.common.SnapReader import SnapFtpReader
from snaplogic.common.SnapReader import SnapHttpReader
from snaplogic.common.SnapReader import SnapFileReader
from snaplogic.common.data_types import SnapNumber,SnapString
from snaplogic.common import version_info
from snaplogic.snapi_base import keys
from snaplogic.cc import component_api
# Public names: the only symbol exported by "from <this module> import *".
__all__ = [ "HtmlRead" ]
class HtmlRead(ComponentAPI):
    """
    This class implements the HTML Read component.

    It reads an HTML document from a URL or local file and turns structured
    parts of it into records on the single output view. Two extraction
    modes exist:

    - table mode: every row of the selected table becomes one record, the
      row's <td> cells being assigned to the output fields in order;
    - class mode: all <td> cells carrying the given CSS class are gathered
      into one single record.

    """

    api_version = '1.0'
    component_version = '1.2'

    # Capabilities for HtmlRead: no input views, exactly one output view,
    # pass-through not allowed.
    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT    : 0,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT    : 0,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_ALLOW_PASS_THROUGH        : False
    }

    component_description = "Reads structured parts of HTML"
    component_label = "HTML Reader"
    component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/HtmlReader" % \
                        version_info.doc_uri_version

    def validate_config_file(self):
        """
        Validate the entries of the component config file, if one was provided.

        Two entries are understood:
          root_directory: directory all local files must be read from. When a
                          value is given, the path must exist.
          schemes:        URI schemes allowed for this component; checked
                          against the reader schemes known to FileUtils.

        @raise SnapComponentConfigError: If the root directory does not exist
            or if any other entry is present in the config file.

        """
        for k in self.config_dictionary:
            if k == "root_directory":
                root = self.config_dictionary[k]
                # An empty value just means "no root directory configured";
                # it must not be reported as an unexpected entry.
                if root and not os.path.exists(root):
                    raise SnapComponentConfigError("The path specified for root (%s) is not valid" % root)
            elif k == "schemes":
                # Make sure "schemes" contains only the schemes we support
                FileUtils.validate_schemes(self.config_dictionary.get("schemes"), FileUtils.reader_schemes)
            else:
                # No other config file param is supported.
                raise SnapComponentConfigError("Unexpected config file entry (%s) encountered" % k)

    def _create_input_url_prop(self):
        """
        Build the definition of the inputURL property.

        Shared by create_resource_template() and upgrade_1_0_to_1_1() so that
        both always produce the same description text.

        @return: The property definition for inputURL.

        """
        return prop.SimpleProp("Input URL", SnapString,
            "The URI of the page to be read. You can enter either a URI pointing to a HTTP or FTP server, or a local file path. The following URI schemes are supported: file, http, https, ftp.\n"
            "\n\nExamples of valid URIs:\n"
            "http://www.server.com/dir/input.file\n"
            "https://www.server.com/dir/input.file\n"
            "/home/user/input.file\n"
            "c:\\dir\\input.file\n"
            "file:///c:/dir/input.file\n"
            "ftp://ftp.server.com/dir/input.file\n",
            None, True)

    def _define_credential_props(self):
        """
        Define the username and password properties.

        Shared by create_resource_template() and upgrade_1_0_to_1_1().

        """
        self.set_property_def("username",
                              prop.SimpleProp("Credentials:  Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

    def create_resource_template(self):
        """
        Create HtmlRead resource definition template. It consists of

        inputURL:   The URL of the site to read
        username:   Credentials: username needed to read the file.
        password:   Credentials: password needed to read the file.
        table_num:  Table number (in order) to extract from the HTML
        skip_lines: Header lines (rows) to skip in that table
        class:      CSS class to extract (instead of a table number)

        """
        self.set_property_def("inputURL", self._create_input_url_prop())

        # Credentials are username and password
        self._define_credential_props()
        self.set_property_value("username", "")

        self.set_property_def('table_num',
                              prop.SimpleProp("Table number", SnapNumber, "Table to read",
                                              {"min_value": 1}))
        self.set_property_def('skip_lines',
                              prop.SimpleProp("Skip lines", SnapNumber, "Number of head table lines to skip",
                                              {"min_value": 0}))
        self.set_property_value('skip_lines', 0)
        self.set_property_def('class',
                              prop.SimpleProp("Class name", SnapString, "CSS class to read"))

    def suggest_resource_values(self, err_obj):
        """
        Suggest that "0" be used as skip_lines, if no value has been selected.

        @param err_obj: Error object of the resource; not used here.

        """
        if not self.get_property_value("skip_lines"):
            self.set_property_value("skip_lines", 0)

    def validate(self, err_obj):
        """
        Validate the resource definition.

        NB: all required properties are handled by the generic validation
        system. The checks made here are:
          - 'table_num' and 'class' are not both specified,
          - inputURL complies with the allowed URI schemes,
          - every output field is of type SnapString or SnapNumber.

        @param err_obj: Error object on which problems are reported.

        """
        tablenum = self.get_property_value("table_num")
        classname = self.get_property_value("class")
        if tablenum and classname:
            message = "'Table number' and 'Class name' are mutually exclusive. Only one can be specified."
            err_obj.get_property_err("class").set_message(message)
            err_obj.get_property_err("table_num").set_message(message)

        # Validate that the filename complies with the allowed URI schemes,
        # unless it's specified via a parameter
        FileUtils.validate_filename_property(self.get_property_value("inputURL"), "inputURL", err_obj,
                                             self.config_dictionary.get("schemes"), FileUtils.reader_schemes)

        # Validate that field datatypes are of supported types
        views = self.list_output_view_names()
        view_name = views[keys.SINGLE_VIEW]
        view = self.get_output_view_def(view_name)
        supported_datatypes = (SnapString, SnapNumber)
        for i, field in enumerate(view[keys.VIEW_FIELDS]):
            field_type = field[keys.FIELD_TYPE]
            field_name = field[keys.FIELD_NAME]
            # For each field, check the datatype.
            # If we don't support this datatype add an error.
            if field_type not in supported_datatypes:
                err_obj.get_output_view_err()[view_name][keys.VIEW_FIELDS][i].set_message(
                    "Output field '%s' datatype '%s' is not supported.  Must be one of: %s" %
                    (field_name, field_type, str(supported_datatypes)))

    def execute(self, input_views, output_views):
        """
        Execute the HtmlRead functionality of the component.

        Reads the document named by inputURL, parses it with BeautifulSoup
        and writes the selected table rows (or the cells of the selected CSS
        class) to the output view, which is then marked as completed.

        @param input_views: Input views; this component has none.
        @param output_views: Dictionary of output views; the single connected
            view receives the records.

        @raise SnapComponentError: If no output view is connected, if the URI
            scheme of inputURL is not allowed, or if the requested table does
            not exist in the document.

        """
        try:
            # list() so that indexing does not depend on values() returning a list
            self._output_view = list(output_views.values())[keys.SINGLE_VIEW]
        except IndexError:
            raise SnapComponentError("No output view connected.")

        self._inputurl = self.get_property_value("inputURL")
        self._username = self.get_property_value("username")
        self._password = self.get_property_value("password")
        self._table_num = self.get_property_value("table_num")
        # A missing skip_lines value means that no line is skipped
        self._skip_lines = int(self.get_property_value("skip_lines") or 0)
        self._class = self.get_property_value("class")

        # Make sure the filename is always qualified
        self._inputurl = FileUtils.qualify_filename(self._inputurl)

        # Validate filename URI scheme
        error = FileUtils.validate_file_scheme(self._inputurl, self.config_dictionary.get("schemes"),
                                               FileUtils.reader_schemes)
        if error is not None:
            raise SnapComponentError(error)

        self._inputurl = FileUtils.get_file_location(self._inputurl, self.config_dictionary)
        self._rdr = SnapReader.create(self._inputurl, self._username, self._password, None, self.env)

        # Read the document and write records out.
        handle = self._rdr.open()
        try:
            if self._rdr.scheme.startswith('http'):
                # For HTTP(S) the object returned by open() is given to the parser directly
                root = BeautifulSoup(handle, convertEntities='html')
            else:
                # Use Reader method to read the input and pass the data to BeautifulSoup.
                root = BeautifulSoup(self._rdr.read())
        finally:
            # Release the reader even if reading or parsing failed
            self._rdr.close()

        tables = root.findAll('table')
        self.log(snap_log.LEVEL_DEBUG, "HtmlRead: found %s tables" % len(tables))
        self.log(snap_log.LEVEL_DEBUG, "HtmlRead: processing table %s " % self._table_num)

        if not self._class:
            if not self._table_num:
                # Be forgiving: assume omitted table number means the first table on the page
                tablenum = 0
            else:
                # The property is 1-based, the list of tables is 0-based
                tablenum = int(self._table_num) - 1
            if tablenum < 0 or tablenum >= len(tables):
                raise SnapComponentError("Table number %s was requested, but the document contains %s table(s)."
                                         % (tablenum + 1, len(tables)))
            rows = tables[tablenum].findAll('tr')
            # One record per table row, header rows excluded
            for row in rows[self._skip_lines:]:
                self.processCells(row.findAll('td'))
        else:
            self.log(snap_log.LEVEL_DEBUG, "HtmlRead: processing class %s" % self._class)
            # All cells of the class end up in one single record
            self.processCells(root.findAll('td', self._class))

        self._output_view.completed()

    def processCells(self, cells):
        """
        Build one output record from the given cells and write it out.

        The text of each cell is assigned to the output fields in field
        order. SnapNumber fields get a Decimal (None for an empty cell), all
        other fields get a unicode string.

        A cell that cannot be stored (for instance, text that is not a valid
        number, or more cells than output fields) is logged at debug level
        and skipped. The field position only advances when a cell was stored,
        so the cell following a skipped one is assigned to the same field.

        @param cells: Sequence of BeautifulSoup <td> tags.

        """
        # Local import: the module itself does not import sys.
        import sys

        out_rec = self._output_view.create_record()
        self.log(snap_log.LEVEL_DEBUG, "HtmlRead: found %s cells" % len(cells))
        i = 0
        for cell in cells:
            try:
                field_name = out_rec.field_names[i]
                field_type = self._output_view.field_types[i]
                self.log(snap_log.LEVEL_DEBUG, "field %s" % field_name)

                # Exact type comparison kept from the original code (which had an
                # isinstance() variant commented out): NavigableString subclasses
                # go through the generic branch below.
                if len(cell.contents) == 1 and type(cell.contents[0]) is NavigableString:
                    text = cell.contents[0].strip()
                    self.log(snap_log.LEVEL_DEBUG, "p1: %s " % text)
                else:
                    # Nested markup: concatenate all text nodes found below the cell
                    text = ''.join([piece.strip() for piece in cell.findAll(text=True)])
                    self.log(snap_log.LEVEL_DEBUG, "p2: %s " % text)

                if field_type == SnapNumber:
                    # Numeric field
                    if not text:
                        out_rec[field_name] = None
                    else:
                        out_rec[field_name] = Decimal(text)
                else:
                    # String field
                    if isinstance(text, unicode):
                        value = text
                    else:
                        try:
                            value = unicode(text, 'utf-8')
                        except UnicodeError:
                            # Not valid UTF-8: pass the raw string on unchanged
                            value = text
                    out_rec[field_name] = value
                i = i + 1
            except Exception:
                # Best effort: a bad cell must not abort the whole record.
                # sys.exc_info() avoids version specific "except ..., e" syntax.
                self.log(snap_log.LEVEL_DEBUG, "HtmlRead: exception %s " % sys.exc_info()[1])
        self._output_view.write_record(out_rec)

    def upgrade_1_0_to_1_1(self):
        """
        Upgrade resource from version 1.0 to version 1.1.

        In version 1.0 credentials were stored as a single user:passwd string separated by colon.
        In version 1.1 it's stored as two separate properties, and password is obfuscated.

        Also, change the description of the inputURL property.
        The description was changed in release 2.1.0.

        """
        # Old credentials were stored as user:password, split them into two variables
        credentials = self.get_property_value("credential")
        username = None
        password = None
        if credentials is not None:
            # Colons are allowed in passwords, so split it at the first colon.
            # split() always yields at least one element: the username.
            cred_list = credentials.split(':', 1)
            username = cred_list[0]
            if len(cred_list) == 2:
                # If there is a password, it's the second element of the list
                password = cred_list[1]

        # Delete the old credentials property
        self.del_property_def("credential")

        # Create the new credentials properties
        self._define_credential_props()

        # Set the new credentials properties
        self.set_property_value("username", username)
        self.set_property_value("password", password)

        # Recreate the inputURL property, keeping its current value
        filename = self.get_property_value("inputURL")
        self.del_property_def("inputURL")
        self.set_property_def("inputURL", self._create_input_url_prop())
        self.set_property_value("inputURL", filename)

    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade only to change component doc URI during the upgrade
        which will be done by cc_info before calling this method.

        """
        pass
|