# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2008 - 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
# $
# $Id: Regex.py 10330 2009-12-24 22:13:38Z grisha $
import re
import sys
from decimal import Decimal
from sets import Set

from snaplogic.cc import prop
from snaplogic.cc.component_api import ComponentAPI
from snaplogic.common.snap_exceptions import SnapComponentError
from snaplogic.common import version_info
from snaplogic.snapi_base import resdef,keys
from snaplogic.common.data_types import SnapString,SnapNumber,SnapDateTime
# Property names defined as constants.
# These strings are used both when the property definitions are created and
# when the property values are read back, so they act as runtime keys.

# Key of the entry in a regex specification that names the input field to match.
INPUT_FIELD = "Input field"
# Key of the entry in a regex specification that holds the pattern.
REGEX = "Regex"
# Key of the entry in a regex specification that names the output field to assign.
OUTPUT_FIELD = "Output field"
# Name given to the dictionary property describing a single regex specification.
REGEX_SPEC = "Regex specification"
# Name of the component property holding the list of regex specifications.
REGEX_SPECS = "Regex specifications"
class Regex(ComponentAPI):
    r"""
    Regex component

    Regex component uses regular expressions to search input fields
    for patterns and assign search results to output fields.

    You specify what input field should be searched using what pattern,
    and what output field the results should go to.

    In the pattern, you must use parentheses to group portions
    of the string you'd like captured and sent to the output field.

    Implementation notes:

    1) We are using python's "re" module. Specifically, we are using
       the "match" method (not the "search" method). When using "match"
       your pattern must match the string starting at the beginning,
       as opposed to "search" method which matches anywhere in the string.

    2) If you define multiple groups in the pattern, we concatenate
       the values captured by all of the groups to assign
       to the output field.

    3) You can use string or numbers for either input or output fields,
       and the component will do the conversion.

    4) If the input field is None, or the pattern does not match, the
       output field is set to None. For a numeric output field an empty
       captured string is also converted to None.

    Example:

    Input field value   Pattern            Output field value
    ---------------------------------------------------------
    abc                 a(.*)              bc
    abcdef              a(.*)d(.*)         bcef
    abc345              ([0-9]*)           <empty string>
                                           (because the digits are not at
                                           the beginning of the string the
                                           group captures nothing, see
                                           implementation notes above, and
                                           corrected pattern below)
    abc345              [a-z]*([0-9]*)     345
    3.1415              .*\.([0-9]*)       1415
    """

    api_version = '1.0'
    component_version = '1.2'

    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT    : 1,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT    : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT   : 1,
        ComponentAPI.CAPABILITY_ALLOW_PASS_THROUGH        : True
    }

    component_description = "This component allows you to search input fields for patterns and capture selected information into output fields."
    component_label = "Regex"
    component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/Regex" % \
                        version_info.doc_uri_version

    # We only support numbers and strings as field datatype.
    # Because dates can be represented in many different ways as a string,
    # supporting dates would require defining masks for conversion,
    # and we don't do this yet.
    supported_datatypes = [SnapString, SnapNumber]

    def _build_regex_specs_def(self):
        """
        Build the definition of the "Regex specifications" property.

        The definition is a list property whose entries are dictionaries
        with three string entries: the input field, the regex and the
        output field. It is shared by create_resource_template and
        upgrade_1_0_to_1_1 so that both always produce the same definition.

        @return: The list property definition.
        """
        # NOTE(review): the positional arguments after the description are
        # passed exactly as before; their meaning (presumably constraint /
        # size limits / required flags) should be confirmed against the
        # prop module.
        input_field = prop.SimpleProp(INPUT_FIELD,
                                      SnapString,
                                      "Input field against which the pattern is matched",
                                      {'lov': [ keys.CONSTRAINT_LOV_INPUT_FIELD] },
                                      True)
        regex = prop.SimpleProp(REGEX,
                                SnapString,
                                "Regex expression",
                                None,
                                True)
        output_field = prop.SimpleProp(OUTPUT_FIELD,
                                       SnapString,
                                       "What output field the result corresponds to",
                                       {'lov': [ keys.CONSTRAINT_LOV_OUTPUT_FIELD] },
                                       True)
        regex_spec = prop.DictProp(REGEX_SPEC,
                                   input_field,
                                   "Regex definition dictionary",
                                   3,
                                   3,
                                   True,
                                   True)
        regex_spec[INPUT_FIELD] = input_field
        regex_spec[REGEX] = regex
        regex_spec[OUTPUT_FIELD] = output_field
        return prop.ListProp(REGEX_SPECS,
                             regex_spec,
                             "Regex specification properties",
                             1,
                             resdef.UNLIMITED_ENTRIES,
                             True)

    def create_resource_template(self):
        """
        Create Regex resource template.

        Registers the "Regex specifications" property definition.
        """
        self.set_property_def(REGEX_SPECS, self._build_regex_specs_def())

    def validate(self, err_obj):
        """
        Validate regex definition

        For every regex specification this checks that the input and output
        field datatypes are supported, that no output field is the target
        of more than one specification, and that the pattern compiles.
        Finally it checks that every output field either has a regex
        specification or has the same name as an input field.

        @param err_obj: Error object on which validation messages are set.
        """
        input_view_name = self.list_input_view_names()[keys.SINGLE_VIEW]
        input_view = self.get_input_view_def(input_view_name)
        input_field_names = [ d[keys.FIELD_NAME] for d in input_view[keys.VIEW_FIELDS] ]

        output_view_name = self.list_output_view_names()[keys.SINGLE_VIEW]
        output_view = self.get_output_view_def(output_view_name)
        output_field_names = [ d[keys.FIELD_NAME] for d in output_view[keys.VIEW_FIELDS] ]

        # Maps each output field that has a regex to the input field feeding it
        used_output_fields = {}

        # Validate each regex spec supplied
        regex_specs = self.get_property_value(REGEX_SPECS)
        for i, spec in enumerate(regex_specs):
            regex = spec[REGEX]
            input_field_name = spec[INPUT_FIELD]
            output_field_name = spec[OUTPUT_FIELD]

            # Check that the input field datatype is supported
            input_field_type = self._resdef.get_input_field_type(input_view_name, input_field_name)
            if input_field_type not in self.supported_datatypes:
                err_obj.get_property_err(REGEX_SPECS)[i][INPUT_FIELD].set_message(
                    "Input field '%s' datatype '%s' is not supported.  Must be one of: %s" %
                    (input_field_name, input_field_type, str(self.supported_datatypes)))

            # Check that the output field datatype is supported
            output_field_type = self._resdef.get_output_field_type(output_view_name, output_field_name)
            if output_field_type not in self.supported_datatypes:
                err_obj.get_property_err(REGEX_SPECS)[i][OUTPUT_FIELD].set_message(
                    "Output field '%s' datatype '%s' is not supported.  Must be one of: %s" %
                    (output_field_name, output_field_type, str(self.supported_datatypes)))

            # Check that they don't use the same output field twice
            if output_field_name in used_output_fields:
                err_obj.get_property_err(REGEX_SPECS)[i][OUTPUT_FIELD].set_message(
                    "Output field '%s' is associated with more than one input field: '%s' and '%s'."
                    % (output_field_name, used_output_fields[output_field_name], input_field_name))
            used_output_fields[output_field_name] = input_field_name

            # Check that regex compiles
            try:
                re.compile(regex)
            except Exception:
                # sys.exc_info() is used to obtain the exception because it
                # parses on every Python version, unlike "except X, e".
                error = sys.exc_info()[1]
                err_obj.get_property_err(REGEX_SPECS)[i][REGEX].set_message(
                    "Regex '%s' is invalid: %s" % (regex, error))

        # Now check that all output fields are mapped
        # From the set of output field names subtract the set of used output field names
        # and also subtract the set of input field names.
        # That gives us the set containing unmapped output fields:
        # fields that neither have the same names as input field names,
        # nor do they have a regex defined.
        unmapped_fields = Set(output_field_names).difference(used_output_fields.keys())
        unmapped_fields = unmapped_fields.difference(input_field_names)
        if unmapped_fields:
            # Report all unmapped fields in a single message.
            # NOTE(review): set_message presumably replaces any earlier
            # message on the same error object, so setting it once per
            # field would only keep the last field - confirm.
            messages = [ "Output field '%s' is not associated with any input field" % field
                         for field in sorted(unmapped_fields) ]
            err_obj.get_property_err(REGEX_SPECS).set_message("; ".join(messages))

    def _extract_regex_groups(self, compiled_regex, input_field_value):
        """
        Match a single input value and return the captured text.

        @param compiled_regex: Compiled pattern to match with.
        @param input_field_value: Input field value; may be None.
        @return: Concatenation of all captured groups as a unicode string,
                 or None if the input value is None or the pattern does
                 not match at the beginning of the value.
        """
        if input_field_value is None:
            return None
        match = compiled_regex.match(unicode(input_field_value))
        if match is None:
            return None
        # A group that did not participate in the match is None
        # and must be skipped, otherwise join would fail.
        return u''.join([ group for group in match.groups() if group is not None ])

    def _convert_regex_output(self, output_field_name, output, is_string):
        """
        Convert captured text to the datatype of the output field.

        @param output_field_name: Output field name, used in the error message.
        @param output: Captured text, or None.
        @param is_string: True if the output field is a string field.
        @return: The text unchanged for string fields, None for a None or
                 (numeric field only) empty value, otherwise a Decimal.
        @raise SnapComponentError: If the text cannot be converted to a number.
        """
        if is_string or output is None:
            # If it's a string field or we have None value
            # there is no need for conversion.
            return output
        if output == '':
            # If output is an empty string, and output type is numeric,
            # assign output field to None.
            return None
        try:
            return Decimal(str(output))
        except Exception:
            error = sys.exc_info()[1]
            # Conversion failed, throw an appropriate exception
            raise SnapComponentError("Failed to cast output field %s value '%s' to type 'number' (%s)" %
                                     (output_field_name, output, error))

    def execute(self, input_views, output_views):
        """
        Execute regex matching

        Reads records from the single input view until read_record returns
        None. For each record every regex specification is applied and the
        result is assigned to its output field, then the record is written
        to the single output view.

        @param input_views: Dictionary of input views.
        @param output_views: Dictionary of output views.
        @raise SnapComponentError: If no input or output view is connected,
                                   or a captured value cannot be converted
                                   to a number.
        """
        try:
            output_view = output_views.values()[keys.SINGLE_VIEW]
        except IndexError:
            raise SnapComponentError("No output view connected.")
        try:
            input_view = input_views.values()[keys.SINGLE_VIEW]
        except IndexError:
            raise SnapComponentError("No input view connected.")

        regex_specs = self.get_property_value(REGEX_SPECS)

        # Compile every regex once, before reading any records. Each compiled
        # pattern is kept together with its own specification, so that
        # two specifications can never end up sharing a pattern.
        rules = [ (spec[INPUT_FIELD], spec[OUTPUT_FIELD], re.compile(spec[REGEX]))
                  for spec in regex_specs ]

        # Make a list of common fields: fields with same names in the input
        # and output views. Output fields that are assigned by a regex are
        # left out.
        # NOTE(review): transfer_matching_fields presumably copies the named
        # fields from the input record; a regex target with the same name as
        # an input field would then have its regex result replaced by the
        # raw input value - confirm.
        regex_output_fields = Set([ spec[OUTPUT_FIELD] for spec in regex_specs ])
        common_fields = (Set(output_view.field_names) & Set(input_view.field_names)) - regex_output_fields

        # For output fields, create a dictionary that maps
        # field name to a boolean that's true if field type is string
        is_string_field = {}
        for field_name, field_type in zip(output_view.field_names, output_view.field_types):
            is_string_field[field_name] = field_type == SnapString

        # Process input records
        input_record = input_view.read_record()
        while input_record is not None:
            output_record = output_view.create_record()

            # Go through all regexes specified for the component,
            # match input fields and assign output fields.
            for input_field_name, output_field_name, compiled_regex in rules:
                captured = self._extract_regex_groups(compiled_regex,
                                                      input_record[input_field_name])
                output_record[output_field_name] = self._convert_regex_output(
                    output_field_name, captured, is_string_field[output_field_name])

            # Handle pass-through fields
            output_record.transfer_pass_through_fields(input_record)

            # Transfer fields matched by name
            output_record.transfer_matching_fields(input_record, common_fields)

            # Write the record we've created
            output_view.write_record(output_record)

            # Read next record
            input_record = input_view.read_record()

        # We are done
        output_view.completed()

    def upgrade_1_0_to_1_1(self):
        """
        Add source constraint to Field property

        The property definition is recreated and the previously stored
        value is put back afterwards.
        """
        # Save the property value.
        # We need to recreate the property, which resets the value
        property_value = self.get_property_value(REGEX_SPECS)

        self.set_property_def(REGEX_SPECS, self._build_regex_specs_def())

        # Restore the value
        self.set_property_value(REGEX_SPECS, property_value)

    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade only to change component doc URI during the upgrade
        which will be done by cc_info before calling this method.
        """
        pass
|