XmlRead.py :  » Development » SnapLogic » snaplogic » components » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Development » SnapLogic 
SnapLogic » snaplogic » components » XmlRead.py
# $SnapHashLicense:
# 
# SnapLogic - Open source data services
# 
# Copyright (C) 2009, SnapLogic, Inc.  All rights reserved.
# 
# See http://www.snaplogic.org for more information about
# the SnapLogic project. 
# 
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
# 
# "SnapLogic" is a trademark of SnapLogic, Inc.
# 
# 
# $

#     $Id: XmlRead.py 10330 2009-12-24 22:13:38Z grisha $

"""
XmlRead Module and Resource Definition

"""

import os

from xml.dom import minidom
from xml import xpath
from decimal import Decimal

from snaplogic.common import snap_log
from snaplogic.common.snap_exceptions import SnapComponentError,SnapComponentConfigError
from snaplogic.common.SnapReader import SnapReader
from snaplogic.common.SnapReader import SnapFtpReader
from snaplogic.common.SnapReader import SnapHttpReader
from snaplogic.common.SnapReader import SnapFileReader
from snaplogic.common.data_types import Record
from snaplogic.common.data_types import SnapString,SnapNumber,SnapDateTime
from snaplogic.common import version_info
from snaplogic.cc.component_api import ComponentAPI
from snaplogic.components import FileUtils
import snaplogic.cc.prop as prop
from snaplogic.cc import component_api
from snaplogic.snapi_base import keys

# Names of the component config-file entries and resource properties
# used by the XmlRead component.

# Config-file key looked up in the component's config dictionary to find
# the URI schemes that are allowed.
CONFIG_SCHEMES = "schemes"

# URI (or local path) of the input to read.
PROP_FILENAME = "filename"

# Flag selecting whether the filename names a list of inputs rather than
# a single XML document.
PROP_IS_INPUT_A_LIST = "is_input_a_list"

# Credentials handed to the reader when opening the input.
PROP_USERNAME = "username"
PROP_PASSWORD = "password"

# XPath expression evaluated against the document element; each node it
# returns yields one output record.
PROP_XPATH_ROOT = "XPath root"

# List property holding one XPath subexpression per output field, and the
# name used for each item of that list.
PROP_XPATH_SUBXS = "XPath subexpressions"
PROP_XPATH_SUBX = "XPath subexpression"

class XmlRead(ComponentAPI):
    """
    Reads data from an XML document as flat records, determined by the XPath
    expressions provided.

    For example, given the following XML file

    <root>
        <record><name>John</name><age>32</age><gender>M</gender></record>
        <record><name>Jim</name><age>44</age></record>
        <record><name>Jane</name><gender>F</gender></record>
    </root>

    and given the XPath root '//record' and the subexpressions

        ('./name', './age', './gender')

    each output record will consist of three fields. The XmlRead component
    will output three records, with fields

    'John', '32', 'M'
    'Jim', '44', None
    'Jane', None, 'F'

    It is the responsibility of the user to make sure that the number of
    fields in the output view corresponds to the number of XPath expressions
    in the "XPath subexpressions" list property; L{validate} reports a
    mismatch between the two.

    """

    api_version = '1.0'
    component_version = '1.2'

    # No input views; exactly one output view.
    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT    : 0,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT    : 0,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT   : 1,
    }

    component_description = "This component creates records from XML data using XPath expressions."
    component_label = "XML Read"
    component_doc_uri = ("https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/XMLRead"
                         % version_info.doc_uri_version)

    # Output field types this component can produce; anything else is
    # reported as an error by validate().
    supported_datatypes = [SnapString, SnapNumber]

    def validate_config_file(self):
        """
        Validate the entries of the component config file.

        Only two entries are accepted: the root directory (a way of
        specifying that all local files must be read from that directory),
        which must exist on disk, and the list of allowed URI schemes,
        which is checked against the schemes the readers support.

        @raise SnapComponentConfigError: If the root directory does not exist
            or if any other entry is present in the config file.

        """
        for key in self.config_dictionary:
            value = self.config_dictionary[key]
            # NOTE(review): a root directory entry with an empty value falls
            # through to the final branch and is reported as "unexpected".
            # Behaviour kept as-is; confirm whether that is intended.
            if key == FileUtils.CONFIG_ROOT_DIRECTORY and value:
                if not os.path.exists(value):
                    raise SnapComponentConfigError("The path specified for root (%s) is not valid" %
                                                   value)
            elif key == CONFIG_SCHEMES:
                # Make sure the schemes entry contains only the schemes we support
                FileUtils.validate_schemes(self.config_dictionary.get(CONFIG_SCHEMES), FileUtils.reader_schemes)
            else:
                # No other config file param is supported.
                raise SnapComponentConfigError("Unexpected config file entry (%s) encountered" % key)

    def _make_filename_prop(self):
        """
        Build the definition of the filename property.

        Shared by L{create_resource_template} and L{upgrade_1_0_to_1_1} so
        that both always use the same label and description.

        @return: A new property definition for the filename property.

        """
        return prop.SimpleProp("File name", SnapString,
                  "The URI of the file to be read.  You can enter either a local file path, or a remote location on a FTP or HTTP server.  The following URI schemes are supported: file, http, https, ftp, file.\n"
                  "\n\nExamples of valid URIs:\n"
                  "/home/user/input.file\n"
                  "c:\\dir\\input.file\n"
                  "file:///c:/dir/input.file\n"
                  "ftp://ftp.server.com/dir/input.file\n"
                  "http://www.server.com/dir/input.file\n"
                  "https://www.server.com/dir/input.file"
                  , None, True)

    def create_resource_template(self):
        """
        Create XmlRead resource template.

        Defines the filename, "is input a list", XPath root, username,
        password and XPath subexpressions properties, and sets initial
        values for the "is input a list" flag (False) and the username
        (empty string).

        """
        self.set_property_def(PROP_FILENAME, self._make_filename_prop())

        self.set_property_def(PROP_IS_INPUT_A_LIST,
                              prop.SimpleProp("Is input a list", "boolean", "Is input a list", required=True))
        self.set_property_value(PROP_IS_INPUT_A_LIST, False)

        xpath_desc = """
                Name of the property specifying the XPath of the node to start reading from. 
                For the syntax of the expressions, see http://pyxml.sourceforge.net/topics/howto/section-XPath.html.
                """
        self.set_property_def(PROP_XPATH_ROOT, prop.SimpleProp(PROP_XPATH_ROOT, SnapString, xpath_desc, required=True))

        # Credentials are username and password
        self.set_property_def(PROP_USERNAME,
                              prop.SimpleProp("Credentials:  Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_value(PROP_USERNAME, "")
        self.set_property_def(PROP_PASSWORD,
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

        # One subexpression per output field, kept in a list property.
        xpath_sub_prop = prop.SimpleProp(PROP_XPATH_SUBX, SnapString,
            "Specifies an XPath expression (relative to XPath root) which determine fields in the outgoing record", required=True)
        self.set_property_def(PROP_XPATH_SUBXS, prop.ListProp(PROP_XPATH_SUBXS, xpath_sub_prop, required=True))

    def validate(self, err_obj):
        """
        Component-specific validation logic.

        Validates that the URI scheme specified for the filename is one of
        the allowed schemes as specified in the component config file, that
        every output field has a supported datatype, and that the number of
        XPath subexpressions equals the number of output view fields.

        @param err_obj: Object for error reporting
        @type err_obj: L{SimplePropErr} or L{ListPropErr} or L{DictPropErr}
        """
        # Validate that the filename complies with the allowed URI schemes,
        # unless it's specified via a parameter
        FileUtils.validate_filename_property(self.get_property_value(PROP_FILENAME), "filename", err_obj,
                                   self.config_dictionary.get(CONFIG_SCHEMES), FileUtils.reader_schemes)

        # We can check that each output field has a corresponding xpath expression.
        output_views = self.list_output_view_names()
        output_view_name = output_views[keys.SINGLE_VIEW]
        output_view = self.get_output_view_def(output_view_name)
        output_view_fields = [ d[keys.FIELD_NAME] for d in output_view[keys.VIEW_FIELDS] ]
        output_view_field_types = [ d[keys.FIELD_TYPE] for d in output_view[keys.VIEW_FIELDS] ]
        output_view_count = len(output_view_fields)

        # For each field, check the datatype.
        # If we don't support this datatype add an error.
        for i, (field_name, field_type) in enumerate(zip(output_view_fields, output_view_field_types)):
            if field_type not in self.supported_datatypes:
                err_obj.get_output_view_err()[output_view_name][keys.VIEW_FIELDS][i].set_message(
                        "Output field '%s' datatype '%s' is not supported.  Must be one of: %s" %
                            (field_name, field_type, str(self.supported_datatypes)))

        # Get the subexpressions
        subx_specs = self.get_property_value(PROP_XPATH_SUBXS)

        # The number of fields should match; the error is attached to the
        # last subexpression in the list.
        # NOTE(review): with an empty subexpression list this indexes
        # position -1 of the error object - confirm that is handled there.
        if output_view_count != len(subx_specs):
            err_obj.get_property_err(PROP_XPATH_SUBXS)[len(subx_specs) - 1].set_message(
                "Number of XPath subexpressions '%d' does not match number of output view fields '%d'" %
                (len(subx_specs), output_view_count))

    def _xpath_text(self, sub_xpath, node):
        """
        Evaluate a subexpression against a record node and return its content.

        The serialized XML of every child of every matching node is
        concatenated in document order.

        @param sub_xpath: XPath expression evaluated relative to node.
        @param node: Node selected by the XPath root expression.

        @return: The concatenated content, or None if nothing matched or
            the content is empty.

        """
        pieces = []
        for sub_node in xpath.Evaluate(sub_xpath, node):
            child = sub_node.firstChild
            while child is not None:
                pieces.append(child.toxml())
                child = child.nextSibling
        return "".join(pieces) or None

    def _text_to_number(self, field_name, text):
        """
        Convert the extracted text of a number field to a Decimal.

        @param field_name: Name of the output field, used in the error message.
        @param text: Extracted text, or None.

        @return: A Decimal, or None if the text is None or blank.

        @raise SnapComponentError: If the text cannot be converted.

        """
        # Empty field is interpreted as None
        if text is None:
            return None
        text = text.strip()
        if not text:
            return None
        try:
            return Decimal(text)
        except Exception as e:
            # Conversion failed, throw an appropriate exception
            raise SnapComponentError("Failed to cast field %s value '%s' to type 'number' (%s)" %
                                         (field_name, text, e))

    def execute(self, input_views, output_views):
        """
        Read the configured XML input(s) and write one record per matching node.

        Each data source is read completely and parsed into a DOM. The XPath
        root expression is evaluated against the document element, and for
        every resulting node each subexpression fills the output field at
        the same position. String fields receive the extracted text as-is;
        other fields are converted to Decimal.

        @param input_views: Input views; not used by this component.
        @param output_views: Output views; the single connected view
            receives the records.

        @raise SnapComponentError: If no output view is connected, the
            filename URI scheme is not allowed, or a value cannot be
            converted to a number.

        """
        try:
            # list() keeps the positional lookup working when values() is a view.
            output_view = list(output_views.values())[keys.SINGLE_VIEW]
        except IndexError:
            raise SnapComponentError("No output view connected.")

        self._filename = self.get_property_value(PROP_FILENAME)
        self._username = self.get_property_value(PROP_USERNAME)
        self._password = self.get_property_value(PROP_PASSWORD)
        self._is_input_a_list = self.get_property_value(PROP_IS_INPUT_A_LIST)

        xpath_exp  = self.get_property_value(PROP_XPATH_ROOT)
        sub_xpaths = self.get_property_value(PROP_XPATH_SUBXS)

        self._filename = FileUtils.get_file_location(self._filename, self.config_dictionary)

        # Validate filename URI scheme
        error = FileUtils.validate_file_scheme(self._filename, self.config_dictionary.get(CONFIG_SCHEMES), FileUtils.reader_schemes)
        if error is not None:
            raise SnapComponentError(error)

        if self._is_input_a_list:
            self._data_sources = FileUtils.read_input_list(self._filename, self._username, self._password)
        else:
            self._data_sources = [ ( self._filename, self._username, self._password ) ]

        # is_string_field[i] is True if output field i is a string field.
        # It depends only on the output view, so it is computed once rather
        # than once per data source.
        is_string_field = [field_type == SnapString for field_type in output_view.field_types]

        for (source, username, password) in self._data_sources:
            if self._is_input_a_list:
                source = FileUtils.get_file_location(source, self.config_dictionary)
            self.log(snap_log.LEVEL_DEBUG, "Input: %s" % source)
            rdr = SnapReader.create(source, username, password, None, self.env)
            rdr.open()
            try:
                doc = minidom.parseString(rdr.read())
            finally:
                # The whole document is in memory once read() returns, so the
                # reader is released here even when reading or parsing fails.
                rdr.close()
            nodes = xpath.Evaluate(xpath_exp, doc.documentElement)

            for node in nodes:
                out_rec = output_view.create_record()
                for field_num, sub_xpath in enumerate(sub_xpaths):
                    text = self._xpath_text(sub_xpath, node)
                    field_name = out_rec.field_names[field_num]
                    if is_string_field[field_num]:
                        out_rec[field_name] = text
                    else:
                        # Attempt to convert to a number
                        out_rec[field_name] = self._text_to_number(field_name, text)
                output_view.write_record(out_rec)
        output_view.completed()

    def upgrade_1_0_to_1_1(self):
        """
        Upgrade resource from version 1.0 to version 1.1.
        In version 1.0 credentials were stored as a single user:passwd string separated by colon.
        In version 1.1 it's stored as two separate properties, and password is obfuscated.

        Also, change the description of the filename property.
        The description was changed in release 2.1.0.
        """
        # Old credentials were stored as user:password, split them into two variables
        credentials = self.get_property_value("credential")
        username = None
        password = None

        if credentials is not None:
            # Colons are allowed in passwords, so split it at the first colon.
            # split() always returns at least one element: the username.
            cred_list = credentials.split(':', 1)
            username = cred_list[0]
            if len(cred_list) == 2:
                # If there is a password, it's the second element of the list
                password = cred_list[1]

        # Delete the old credentials property
        self.del_property_def("credential")

        # Create the new credentials properties
        self.set_property_def("username",
                              prop.SimpleProp("Credentials:  Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

        # Set the new credentials properties
        self.set_property_value("username", username)
        self.set_property_value("password", password)

        # Recreate the filename property, carrying its current value over
        filename = self.get_property_value(PROP_FILENAME)
        self.del_property_def(PROP_FILENAME)
        self.set_property_def(PROP_FILENAME, self._make_filename_prop())
        self.set_property_value(PROP_FILENAME, filename)

    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade only to change component doc URI during the upgrade,
        which will be done by cc_info before calling this method.

        """
        pass
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.