Regex.py :  » Development » SnapLogic » snaplogic » components » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Development » SnapLogic 
SnapLogic » snaplogic » components » Regex.py
# $SnapHashLicense:
# 
# SnapLogic - Open source data services
# 
# Copyright (C) 2008 - 2009, SnapLogic, Inc.  All rights reserved.
# 
# See http://www.snaplogic.org for more information about
# the SnapLogic project. 
# 
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
# 
# "SnapLogic" is a trademark of SnapLogic, Inc.
# 
# $

#    $Id: Regex.py 10330 2009-12-24 22:13:38Z grisha $

import re
from decimal import Decimal
from snaplogic.cc import prop
from snaplogic.cc.component_api import ComponentAPI
from snaplogic.common.snap_exceptions import SnapComponentError
from snaplogic.common import version_info
from snaplogic.snapi_base import resdef,keys
from snaplogic.common.data_types import SnapString,SnapNumber,SnapDateTime
from sets import Set

# Property names defined as constants.
# These strings are used both as the property names registered with the
# resource definition and as the keys of each regex specification dictionary.

# Key of the spec entry naming the input field the pattern is matched against.
INPUT_FIELD = "Input field"
# Key of the spec entry holding the regular expression itself.
REGEX = "Regex"
# Key of the spec entry naming the output field that receives the result.
OUTPUT_FIELD = "Output field"
# Name of the dictionary property describing a single regex specification.
REGEX_SPEC = "Regex specification"
# Name of the list property holding all regex specifications.
REGEX_SPECS = "Regex specifications"

class Regex(ComponentAPI):
    """
    Regex component
    
    Regex component uses regular expressions to search input fields
    for patterns and assign search results to output fields. 
    You specify what input field should be searched using what pattern,
    and what output field the results should go to.
    In the pattern, you must use parenthesis to group portions
    of the string you'd like captured and sent to the output field.
    
    Implementation notes:

    1) We are using python's "re" module.  Specifically, we are using 
    the "match" method (not the "search" method).  When using "match"  
    your pattern must match the string starting at the beginning,
    as opposed to "search" method which matches anywhere in the string.  

    2) If you define multiple groups in the pattern, we take 
    concatentate the values captured by all of the groups to assign
    to the output field.
    
    3) You can use string or numbers for either input or output fields, 
    and the component will do the conversion.  

    
    Example:
    Input field value  Pattern        Output field value
    ---------------------------------------------------------
    abc                a(.*)          bc
    abcdef             a(.*)d(.*)     bcef 
    abc345            ([0-9]*)        <empty string>
                                      (because pattern isn't matched at
                                       the beginning of the string, see 
                                       implementation notes above, and 
                                       corrected pattern below)
    abc345            [a-z]*([0-9]*)  345
    3.1415            .*\.([0-9]*)    1415
    
    """

   
    api_version = '1.0'
    component_version = '1.2'
    
    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT    : 1,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT    : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_ALLOW_PASS_THROUGH        : True
    }
    
    component_description = "This component allows you to search input fields for patterns and capture selected information into output fields."
    component_label = "Regex"
    component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/Regex" % \
                                                        version_info.doc_uri_version

    # We only support numbers and strings as field datatype.
    # Because dates can be represented in many different ways as a string,
    # supporting dates would require defining masks for conversion,
    # and we don't do this yet.   
    supported_datatypes = [SnapString, SnapNumber]

    def create_resource_template(self):
        """
        Create Regex resource template.
        """

        input_field = prop.SimpleProp(INPUT_FIELD, 
                                      SnapString, 
                                      "Input field against which the pattern is matched", 
                                      {'lov': [ keys.CONSTRAINT_LOV_INPUT_FIELD] }, 
                                      True)
        regex = prop.SimpleProp(REGEX, 
                                SnapString,
                                "Regex expression", 
                                None, 
                                True)
        
        output_field = prop.SimpleProp(OUTPUT_FIELD, 
                                       SnapString,
                                       "What output field the result corresponds to", 
                                       {'lov': [ keys.CONSTRAINT_LOV_OUTPUT_FIELD] }, 
                                       True)

        regex_spec = prop.DictProp(REGEX_SPEC, 
                                   input_field, 
                                   "Regex definition dictionary", 
                                   3,
                                   3,
                                   True,
                                   True)
        
        regex_spec[INPUT_FIELD] = input_field
        regex_spec[REGEX] = regex
        regex_spec[OUTPUT_FIELD] = output_field
        
        regex_specs = prop.ListProp(REGEX_SPECS, 
                                    regex_spec, 
                                    "Regex specification properties", 
                                    1,
                                    resdef.UNLIMITED_ENTRIES, 
                                    True)
        self.set_property_def(REGEX_SPECS, regex_specs)
        
    def validate(self, err_obj):
        """
        Validate regex definition
        """
        
        input_views = self.list_input_view_names()
        input_view_name = input_views[keys.SINGLE_VIEW]
        input_view = self.get_input_view_def(input_view_name)
        input_field_names = [ d[keys.FIELD_NAME] for d in input_view[keys.VIEW_FIELDS] ]
        input_field_types = [ d[keys.FIELD_TYPE] for d in input_view[keys.VIEW_FIELDS] ]
        
        output_views = self.list_output_view_names()
        output_view_name = output_views[keys.SINGLE_VIEW]
        output_view = self.get_output_view_def(output_view_name)
        output_field_names = [ d[keys.FIELD_NAME] for d in output_view[keys.VIEW_FIELDS] ]
        output_field_types = [ d[keys.FIELD_TYPE] for d in output_view[keys.VIEW_FIELDS] ]

        used_output_fields = {}
        
        # Validate each regex spec supplied
        regex_specs = self.get_property_value(REGEX_SPECS)
        for i, spec in enumerate(regex_specs):
            regex = spec[REGEX]
            input_field_name = spec[INPUT_FIELD] 
            output_field_name = spec[OUTPUT_FIELD]
            
            # Check that the input field datatype is supported
            input_field_type = self._resdef.get_input_field_type(input_view_name, input_field_name)
            if input_field_type not in self.supported_datatypes:
                err_obj.get_property_err(REGEX_SPECS)[i][INPUT_FIELD].set_message(
                    "Input field '%s' datatype '%s' is not supported.  Must be one of: %s" % 
                        (input_field_name, input_field_type, str(self.supported_datatypes)))
                
            # Check that the output field datatype is supported
            output_field_type = self._resdef.get_output_field_type(output_view_name, output_field_name)
            if output_field_type not in self.supported_datatypes:
                err_obj.get_property_err(REGEX_SPECS)[i][OUTPUT_FIELD].set_message(
                    "Output field '%s' datatype '%s' is not supported.  Must be one of: %s" % 
                        (output_field_name, output_field_type, str(self.supported_datatypes)))
            
            # Check that they don't use the same output field twice
            if used_output_fields.has_key(output_field_name):
                err_obj.get_property_err(REGEX_SPECS)[i][OUTPUT_FIELD].set_message(
                    "Output field '%s' is associated with more than one input field: '%s' and '%s'." 
                        % (output_field_name, used_output_fields[output_field_name], input_field_name))

            used_output_fields[output_field_name] = input_field_name
            
            # Check that regex compiles
            try:
                re.compile(regex)
            except Exception, e:
                err_obj.get_property_err(REGEX_SPECS)[i][REGEX].set_message(
                    "Regex '%s' is invalid: %s" % (regex, e.message))
                
        # Now check that all output fields are mapped
        # From the set of output field names subtract the set of used output field names
        # and also subtract the set of input field names.
        # That gives us the set containing unmapped output fields:
        # fields that neither have the same names as input field names,
        # nor do they have a regex defined.
        unmapped_fields = Set(output_field_names).difference(used_output_fields.keys())
        unmapped_fields = unmapped_fields.difference(input_field_names)
        for field in unmapped_fields:
            err_obj.get_property_err(REGEX_SPECS).set_message("Output field '%s' is not associated with any input field" % field)
        

    def execute(self, input_views, output_views):
        """
        Execute regex matching
        """
        
        try:
            output_view = output_views.values()[keys.SINGLE_VIEW] 
        except IndexError:
            raise SnapComponentError("No output view connected.")
        try: 
            input_view = input_views.values()[keys.SINGLE_VIEW]
        except IndexError:
            raise SnapComponentError("No input view connected.")
        
        # Make a list of common fields: fields with same names in the input and output views
        common_fields = Set(output_view.field_names) & Set(input_view.field_names)
        
        regex_specs = self.get_property_value(REGEX_SPECS)       
        
        # Compile the regexes and put them in a dictionary
        # where output field name maps to compiled regex.
        compiled_regexes = {} 
        for regex_spec in regex_specs:
            output_field_name = regex_spec[OUTPUT_FIELD]
            regex = regex_spec[REGEX]
            compiled_regexes[output_field_name] = re.compile(regex)
            
        # For output fields, create a dictionary that maps
        # field name to a boolean that's true if field type is string 
        is_string_field = {}
        for name, type in zip(output_view.field_names, output_view.field_types):
            is_string_field[name] = type == SnapString 

        # Process input records
        input_record = input_view.read_record()
        while input_record is not None:
            output_record = output_view.create_record()

            # Go through all regexes specified for the component,
            # match input fields and assign output fields.
            for regex_spec in regex_specs:
                input_field_name = regex_spec[INPUT_FIELD]
                input_field_value = input_record[input_field_name]
                output_field_name = regex_spec[OUTPUT_FIELD]
                compiled_regex = compiled_regexes[output_field_name]

                if input_field_value is not None:
                    # Match input field on regex
                    # If the pattern captured multiple groups, concatenate them
                    match = compiled_regex.match(unicode(input_field_value))
                    if match is not None:
                        output = u''
                        # Note: we're iterating instead of using ''.join
                        # because a group may be None which causes
                        # a runtime exception
                        for group in match.groups():
                            # Concatenate the groups and assign the output field
                            if group is not None:
                                output += group
                    else:
                        # If there is no match assign None
                        output = None

                    # Convert string to another datatype if necessary
                    if is_string_field[output_field_name] or output is None:
                        # If it's a string field or we have None value
                        # there is no need for conversion.
                        output_converted = output
                    else:
                        # Convert string to number
                        if output == '':
                            # If output is an empty string, and output type is numeric,
                            # assign output field to None.
                            output_converted = None
                        else:
                            try:
                                # Convert to decimal
                                output_converted = Decimal(str(output))
                            except Exception, e:
                                # Conversion failed, throw an appropriate exception
                                raise SnapComponentError("Failed to cast output field %s value '%s' to type 'number' (%s)" %
                                                         (output_field_name, output, e))
                else:
                    # Input field is None, so output field is None
                    output_converted = None

                output_record[output_field_name] = output_converted
                
            # Handle pass-through fields
            output_record.transfer_pass_through_fields(input_record)

            # Transfer fields matched by name
            output_record.transfer_matching_fields(input_record, common_fields)

            # Write the record we've created
            output_view.write_record(output_record)    
            
            # Read next record
            input_record = input_view.read_record()
            
        # We are done
        output_view.completed()

    def upgrade_1_0_to_1_1(self):
        """
         Add source constraint to Field property
         
        """
        
        # Save the property value.
        # We need to recreate the property, which resets the value
        property_value = self.get_property_value(REGEX_SPECS)
        
        input_field = prop.SimpleProp(INPUT_FIELD, 
                                      SnapString, 
                                      "Input field against which the pattern is matched", 
                                      {'lov': [ keys.CONSTRAINT_LOV_INPUT_FIELD] }, 
                                      True)
        regex = prop.SimpleProp(REGEX, 
                                SnapString,
                                "Regex expression", 
                                None, 
                                True)
        
        output_field = prop.SimpleProp(OUTPUT_FIELD, 
                                       SnapString,
                                       "What output field the result corresponds to", 
                                       {'lov': [ keys.CONSTRAINT_LOV_OUTPUT_FIELD] }, 
                                       True)

        regex_spec = prop.DictProp(REGEX_SPEC, 
                                   input_field, 
                                   "Regex definition dictionary", 
                                   3,
                                   3,
                                   True,
                                   True)
        
        regex_spec[INPUT_FIELD] = input_field
        regex_spec[REGEX] = regex
        regex_spec[OUTPUT_FIELD] = output_field
        
        regex_specs = prop.ListProp(REGEX_SPECS, 
                                    regex_spec, 
                                    "Regex specification properties", 
                                    1,
                                    resdef.UNLIMITED_ENTRIES, 
                                    True)
        self.set_property_def(REGEX_SPECS, regex_specs)
        
        # Restore the value
        self.set_property_value(REGEX_SPECS, property_value)
    
    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade only to change component doc URI during the upgrade
        which will be by cc_info before calling this method.
        
        """
        pass
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.