# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: XmlRead.py 10330 2009-12-24 22:13:38Z grisha $
"""
XmlRead Module and Resource Definition
"""
import os
from xml.dom import minidom
from xml import xpath
from decimal import Decimal
from snaplogic.common import snap_log
from snaplogic.common.snap_exceptions import SnapComponentError,SnapComponentConfigError
from snaplogic.common.SnapReader import SnapReader
from snaplogic.common.SnapReader import SnapFtpReader
from snaplogic.common.SnapReader import SnapHttpReader
from snaplogic.common.SnapReader import SnapFileReader
from snaplogic.common.data_types import Record
from snaplogic.common.data_types import SnapString,SnapNumber,SnapDateTime
from snaplogic.common import version_info
from snaplogic.cc.component_api import ComponentAPI
from snaplogic.components import FileUtils
import snaplogic.cc.prop as prop
from snaplogic.cc import component_api
from snaplogic.snapi_base import keys
# Key recognised in the component config file (allowed URI schemes).
CONFIG_SCHEMES = "schemes"

# Resource property names: data source and credentials.
PROP_FILENAME = "filename"
PROP_USERNAME = "username"
PROP_PASSWORD = "password"
PROP_IS_INPUT_A_LIST = "is_input_a_list"

# Resource property names: XPath configuration.
PROP_XPATH_ROOT = "XPath root"
PROP_XPATH_SUBX = "XPath subexpression"
PROP_XPATH_SUBXS = "XPath subexpressions"
class XmlRead(ComponentAPI):
    """
    Reads data from an XML file as flat records, determined by the XPath
    expressions provided.

    For example, given the following XML file

        <root>
        <record><name>John</name><age>32</age><gender>M</gender></record>
        <record><name>Jim</name><age>44</age></record>
        <record><name>Jane</name><gender>F</gender></record>
        </root>

    and given the XPath root '//record' and the subexpressions
    ('./name', './age', './gender'), the output record will consist of three
    fields. The XmlRead component will output three records, with fields

        'John', '32', 'M'
        'Jim', '44', None
        'Jane', None, 'F'

    It is the responsibility of the user to make sure that the number of
    fields in the output view corresponds to the number of XPath expressions
    in the "XPath subexpressions" list property.
    """

    api_version = '1.0'
    component_version = '1.2'

    # No input views; exactly one output view.
    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT : 0,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT : 0,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT : 1,
    }

    component_description = "This component creates records from XML data using XPath expressions."
    component_label = "XML Read"
    component_doc_uri = ("https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/XMLRead" %
                         version_info.doc_uri_version)

    # Output field types that execute() knows how to produce.
    supported_datatypes = [SnapString, SnapNumber]

    def validate_config_file(self):
        """
        Validate the entries of the component config file.

        Only two entries are accepted: the root directory (which, when set,
        must be an existing path) and the list of allowed URI schemes.
        An empty root directory entry is treated as "no root configured".

        @raise SnapComponentConfigError: If the root directory does not exist
            or if an unsupported entry is present.
        """
        for key in self.config_dictionary:
            value = self.config_dictionary[key]
            if key == FileUtils.CONFIG_ROOT_DIRECTORY:
                # An empty value means no root restriction; only a non-empty
                # value has to point at an existing path.
                if value and not os.path.exists(value):
                    raise SnapComponentConfigError("The path specified for root (%s) is not valid" %
                                                   value)
            elif key == CONFIG_SCHEMES:
                # Make sure CONFIG_SCHEMES contains only the schemes we support
                FileUtils.validate_schemes(value, FileUtils.reader_schemes)
            else:
                # No other config file param is supported.
                raise SnapComponentConfigError("Unexpected config file entry (%s) encountered" % key)

    @staticmethod
    def _filename_prop_def():
        """
        Build the definition of the file name property.

        Shared by create_resource_template() and upgrade_1_0_to_1_1() so that
        both always define the property identically.

        @return: Property definition for the file name.
        @rtype: L{prop.SimpleProp}
        """
        return prop.SimpleProp("File name", SnapString,
            "The URI of the file to be read. You can enter either a local file path, or a remote location on a FTP or HTTP server. The following URI schemes are supported: file, http, https, ftp, file.\n"
            "\n\nExamples of valid URIs:\n"
            "/home/user/input.file\n"
            "c:\\dir\\input.file\n"
            "file:///c:/dir/input.file\n"
            "ftp://ftp.server.com/dir/input.file\n"
            "http://www.server.com/dir/input.file\n"
            "https://www.server.com/dir/input.file"
            , None, True)

    def create_resource_template(self):
        """
        Create XmlRead resource template.

        Defines the file name, "is input a list", XPath root, credential and
        XPath subexpression properties, and sets the default values of
        "is input a list" (False) and username (empty string).
        """
        self.set_property_def(PROP_FILENAME, self._filename_prop_def())

        self.set_property_def(PROP_IS_INPUT_A_LIST,
                              prop.SimpleProp("Is input a list", "boolean", "Is input a list", required=True))
        self.set_property_value(PROP_IS_INPUT_A_LIST, False)

        xpath_desc = ("\n"
                      "Name of the property specifying the XPath of the node to start reading from.\n"
                      "For the syntax of the expressions, see http://pyxml.sourceforge.net/topics/howto/section-XPath.html.\n")
        self.set_property_def(PROP_XPATH_ROOT,
                              prop.SimpleProp(PROP_XPATH_ROOT, SnapString, xpath_desc, required=True))

        # Credentials are username and password
        self.set_property_def(PROP_USERNAME,
                              prop.SimpleProp("Credentials: Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_value(PROP_USERNAME, "")
        self.set_property_def(PROP_PASSWORD,
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

        # One list entry per output field, each an XPath relative to the root.
        xpath_sub_prop = prop.SimpleProp(PROP_XPATH_SUBX, SnapString,
            "Specifies an XPath expression (relative to XPath root) which determine fields in the outgoing record", required=True)
        self.set_property_def(PROP_XPATH_SUBXS, prop.ListProp(PROP_XPATH_SUBXS, xpath_sub_prop, required=True))

    def validate(self, err_obj):
        """
        Component-specific validation logic.

        Validates that the URI scheme specified for the filename is one of
        the allowed schemes as specified in the component config file, that
        every output field has a supported datatype, and that the number of
        XPath subexpressions matches the number of output view fields.

        @param err_obj: Object for error reporting
        @type err_obj: L{SimplePropErr} or L{ListPropErr} or L{DictPropErr}
        """
        # Validate that the filename complies with the allowed URI schemes,
        # unless it's specified via a parameter
        FileUtils.validate_filename_property(self.get_property_value(PROP_FILENAME), "filename", err_obj,
                                             self.config_dictionary.get(CONFIG_SCHEMES), FileUtils.reader_schemes)

        output_view_name = self.list_output_view_names()[keys.SINGLE_VIEW]
        output_view = self.get_output_view_def(output_view_name)
        view_fields = output_view[keys.VIEW_FIELDS]

        # For each field, check the datatype.
        # If we don't support this datatype add an error.
        for i, field in enumerate(view_fields):
            field_name = field[keys.FIELD_NAME]
            field_type = field[keys.FIELD_TYPE]
            if field_type not in self.supported_datatypes:
                err_obj.get_output_view_err()[output_view_name][keys.VIEW_FIELDS][i].set_message(
                    "Output field '%s' datatype '%s' is not supported. Must be one of: %s" %
                    (field_name, field_type, str(self.supported_datatypes)))

        # Each output field needs a corresponding XPath subexpression.
        subx_specs = self.get_property_value(PROP_XPATH_SUBXS)
        output_view_count = len(view_fields)
        if output_view_count != len(subx_specs):
            # The error is attached to the last subexpression entry.
            # NOTE(review): with zero subexpressions this indexes [-1];
            # confirm how the list error object handles that.
            err_obj.get_property_err(PROP_XPATH_SUBXS)[len(subx_specs) - 1].set_message(
                "Number of XPath subexpressions '%d' does not match number of output view fields '%d'" %
                (len(subx_specs), output_view_count))

    def _read_document(self, source, username, password):
        """
        Read one data source completely and parse it into a DOM document.

        @param source: Location of the XML data.
        @param username: Username for the data source.
        @param password: Password for the data source.

        @return: Parsed document; the caller must unlink() it when done.
        @rtype: L{minidom.Document}
        """
        rdr = SnapReader.create(source, username, password, None, self.env)
        rdr.open()
        try:
            return minidom.parseString(rdr.read())
        finally:
            # Release the reader once the content is in memory.
            # NOTE(review): the reader API is not visible from this file, so
            # close() is looked up defensively - confirm and simplify.
            close = getattr(rdr, "close", None)
            if close is not None:
                close()

    @staticmethod
    def _extract_text(node, sub_xpath):
        """
        Evaluate a subexpression against a node and serialize the match.

        The result is the concatenated XML of all children of all nodes
        matched by the subexpression.

        @param node: Context node the subexpression is evaluated against.
        @param sub_xpath: XPath expression relative to node.
        @type sub_xpath: str

        @return: The concatenated XML, or None if nothing was produced.
        """
        parts = []
        for sub_node in xpath.Evaluate(sub_xpath, node):
            child = sub_node.firstChild
            while child is not None:
                parts.append(child.toxml())
                child = child.nextSibling
        return "".join(parts) or None

    @staticmethod
    def _to_number(field_name, text):
        """
        Convert extracted text to a Decimal for a number field.

        @param field_name: Name of the output field, used for error reporting.
        @param text: Extracted text, or None.

        @return: Decimal value, or None if text is None or blank.

        @raise SnapComponentError: If the text cannot be converted.
        """
        # Empty field is interpreted as None
        if text is None:
            return None
        text = text.strip()
        if not text:
            return None
        try:
            return Decimal(text)
        except Exception as e:
            # Conversion failed, throw an appropriate exception
            raise SnapComponentError("Failed to cast field %s value '%s' to type 'number' (%s)" %
                                     (field_name, text, e))

    def execute(self, input_views, output_views):
        """
        Read the configured XML source(s) and write one record per node
        matched by the XPath root expression.

        @param input_views: Input views (not used by this component).
        @param output_views: Dictionary of output views; the single view in it
            receives the records.

        @raise SnapComponentError: If no output view is connected, the file
            URI scheme is not allowed, or a value cannot be converted to a
            number.
        """
        try:
            output_view = list(output_views.values())[keys.SINGLE_VIEW]
        except IndexError:
            raise SnapComponentError("No output view connected.")

        self._filename = self.get_property_value(PROP_FILENAME)
        self._username = self.get_property_value(PROP_USERNAME)
        self._password = self.get_property_value(PROP_PASSWORD)
        self._is_input_a_list = self.get_property_value(PROP_IS_INPUT_A_LIST)
        xpath_exp = self.get_property_value(PROP_XPATH_ROOT)
        sub_xpaths = self.get_property_value(PROP_XPATH_SUBXS)

        self._filename = FileUtils.get_file_location(self._filename, self.config_dictionary)

        # Validate filename URI scheme
        error = FileUtils.validate_file_scheme(self._filename, self.config_dictionary.get(CONFIG_SCHEMES),
                                               FileUtils.reader_schemes)
        if error is not None:
            raise SnapComponentError(error)

        if self._is_input_a_list:
            self._data_sources = FileUtils.read_input_list(self._filename, self._username, self._password)
        else:
            self._data_sources = [ ( self._filename, self._username, self._password ) ]

        # is_string_field[i] is True if output field i is a string field.
        # It depends only on the output view, so compute it once up front
        # instead of once per input or per record.
        is_string_field = [field_type == SnapString for field_type in output_view.field_types]

        for (source, username, password) in self._data_sources:
            if self._is_input_a_list:
                source = FileUtils.get_file_location(source, self.config_dictionary)
            self.log(snap_log.LEVEL_DEBUG, "Input: %s" % source)

            doc = self._read_document(source, username, password)
            try:
                for node in xpath.Evaluate(xpath_exp, doc.documentElement):
                    out_rec = output_view.create_record()
                    for field_num, sub_xpath in enumerate(sub_xpaths):
                        field_name = out_rec.field_names[field_num]
                        value = self._extract_text(node, sub_xpath)
                        if not is_string_field[field_num]:
                            # Non-string fields are numbers.
                            value = self._to_number(field_name, value)
                        out_rec[field_name] = value
                    output_view.write_record(out_rec)
            finally:
                # Break the DOM's reference cycles so the tree is freed
                # promptly rather than accumulating across input files.
                doc.unlink()

        output_view.completed()

    def upgrade_1_0_to_1_1(self):
        """
        Upgrade resource from version 1.0 to version 1.1.

        In version 1.0 credentials were stored as a single user:passwd string separated by colon.
        In version 1.1 it's stored as two separate properties, and password is obfuscated.

        Also, change the description of the filename property.
        The description was changed in release 2.1.0.
        """
        # Old credentials were stored as user:password, split them into two variables
        credentials = self.get_property_value("credential")
        username = None
        password = None
        if credentials is not None:
            # Colons are allowed in passwords, so split it at the first colon.
            # split() always yields at least one element: the username.
            cred_list = credentials.split(':', 1)
            username = cred_list[0]
            if len(cred_list) == 2:
                # If there is a password, it's the second element of the list
                password = cred_list[1]

        # Delete the old credentials property
        self.del_property_def("credential")

        # Create the new credentials properties
        self.set_property_def("username",
                              prop.SimpleProp("Credentials: Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

        # Set the new credentials properties
        self.set_property_value("username", username)
        self.set_property_value("password", password)

        # Recreate the filename property, preserving its current value
        filename = self.get_property_value(PROP_FILENAME)
        self.del_property_def(PROP_FILENAME)
        self.set_property_def(PROP_FILENAME, self._filename_prop_def())
        self.set_property_value(PROP_FILENAME, filename)

    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade only to change component doc URI during the upgrade
        which will be done by cc_info before calling this method.
        """
        pass
|