# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: Unique.py 10330 2009-12-24 22:13:38Z grisha $
"""
Unique Module and Resource Definition
"""
from snaplogic.common.snap_exceptions import SnapObjTypeError
import tempfile
import os
from decimal import Decimal
from datetime import datetime
import time
from sqlite3 import dbapi2
import cPickle as pickle
import snaplogic.components as components
from snaplogic.components import computils
from snaplogic.common import snap_log,sqlite_iter
from snaplogic.common import version_info
from snaplogic.cc import component_api
from snaplogic.cc.component_api import ComponentAPI
from snaplogic.cc import prop
from snaplogic.snapi_base import keys
from snaplogic.common.snap_exceptions import SnapComponentError
from snaplogic.common.data_types import SnapNumber,SnapString,SnapDateTime
# Name of the boolean resource property that tells the component whether
# duplicate records arrive adjacent to each other in the input stream.
CONTIGUOUS_PROP = "Contiguous Duplicates"
# strftime/strptime format used when round-tripping SnapDateTime values
# through SQLite DATETIME columns (which store them as text).
DATE_FORMAT_STRING = "%Y-%m-%d %H:%M:%S"
# Length of a date formatted with DATE_FORMAT_STRING (always 19, since the
# format is fixed-width); used to split the microseconds suffix off a
# "YYYY-MM-DD HH:MM:SS.ffffff" string coming back from the database.
FORMATTED_DATE_LENGTH = len(time.strftime(DATE_FORMAT_STRING, datetime.today().timetuple()))
# Couple of field names that are only used in this component code
# and are not visible outside the component.
INTERNAL_PASS_THRU = "__INT_PASS_THRU__...%%%"
EXTERNAL_PASS_THRU = "__EXT_PASS_THRU__...%%%"
class Unique(ComponentAPI):
"""
This component filters out duplicate records. Here 'duplicate' is defined as the records that have the
same value for the fields specified in the input view. Any pass-through fields are ignored in this
comparison. This component can run in two different modes.
1) If the input records to the component are ordered in such a way that the duplicate records are adjacent
to each other, then the property '%s' (%s) should be set to True. This allows the component to run in
an optimized manner and the records can be processed with no buffering.
2) If the input records have no such ordering, then the property '%s' (%s) should be set to false. The
component in this mode will have to buffer all the records and reorder them before eliminating the
duplicate records.
""" % (CONTIGUOUS_PROP, CONTIGUOUS_PROP, CONTIGUOUS_PROP, CONTIGUOUS_PROP)
api_version = '1.0'
component_version = '1.1'
capabilities = {
ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT : 1,
ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT : 1,
ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT : 1,
ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT : 1,
ComponentAPI.CAPABILITY_ALLOW_PASS_THROUGH : True
}
component_description = "Unique"
component_label = "Unique"
component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/Unique" % \
version_info.doc_uri_version
def create_resource_template(self):
"""Create Unique resource template."""
p = prop.SimpleProp(CONTIGUOUS_PROP,
"boolean",
"Set to true if duplicate records are adjacent to each other in the input stream. "
"The component executes much faster if this is the case. ",
required=True)
self.set_property_def(CONTIGUOUS_PROP, p)
self.set_property_value(CONTIGUOUS_PROP, False)
def validate(self, err_obj):
"""Validate the Unique Resource."""
# Validate that the output view is the same as the input view.
# Make sure that the output view matches the input view. (at least the types)
input_views = self.list_input_view_names()
input_view_name = input_views[keys.SINGLE_VIEW]
input_view = self.get_input_view_def(input_view_name)
input_view_fields = input_view[keys.VIEW_FIELDS]
output_views = self.list_output_view_names()
output_view_name = output_views[keys.SINGLE_VIEW]
output_view = self.get_output_view_def(output_view_name)
# Field count matches?
if len(output_view[keys.VIEW_FIELDS]) != len(input_view_fields):
err_obj.get_output_view_err()[output_views[keys.SINGLE_VIEW]].set_message(
"Output view '%s' field count '%d' does not match corresponding input view '%s' field count '%d'." \
% (output_view_name, len(output_view[keys.VIEW_FIELDS]),
input_view_name, len(input_view_fields)))
else:
# Field types match?
for i, output_field in enumerate(output_view[keys.VIEW_FIELDS]):
output_field_name = output_field[keys.FIELD_NAME]
output_field_type = output_field[keys.FIELD_TYPE]
input_field_name = input_view_fields[i][keys.FIELD_NAME]
input_field_type = input_view_fields[i][keys.FIELD_TYPE]
if output_field_type != input_field_type:
err_obj.get_output_view_err()[output_views[keys.SINGLE_VIEW]][keys.VIEW_FIELDS][i].set_message(
"Output view '%s' field '%s' type '%s' does not match corresponding input view '%s' field '%s' type '%s'." \
% (output_view_name, output_field_name, output_field_type,
input_view_name, input_field_name, input_field_type))
def execute(self, input_views, output_views):
try:
output_view = output_views.values()[keys.SINGLE_VIEW]
except IndexError:
raise SnapComponentError("No output view connected.")
try:
input_view = input_views.values()[keys.SINGLE_VIEW]
except IndexError:
raise SnapComponentError("No input view connected.")
if self.get_property_value(CONTIGUOUS_PROP):
self.unique_in_stream(input_view, output_view)
else:
self.unique_in_db(input_view, output_view)
def unique_in_stream(self, input_view, output_view):
"""
Process the data under the assumption that duplicate records are adjacent to each other.
@param input_view: The input view object to read records from.
@type input_view: L{InputView}
@param output_view: The output view object to write unique records to.
@type output_view: L{OutputView}
"""
prev_items = None
rec = input_view.read_record()
while rec is not None:
rec_items = rec.items()
# A record structure has other hidden fields (like pass-through), so specifically compare just
# the key/value pairs.
if rec_items != prev_items:
# A unique record.
output_view.write_record(rec)
prev_items = rec_items
rec = input_view.read_record()
output_view.completed()
def unique_in_db(self, input_view, output_view):
"""
Process the records by entering them into a database and then retrieving the unique records.
@param input_view: The input view object to read records from.
@type input_view: L{InputView}
@param output_view: The output view object to write unique records to.
@type output_view: L{OutputView}
"""
fields_to_db_fields = {}
db_fields_to_fields = {}
field_types = {}
db_fields = []
fields = []
input_view_fields = list(input_view.fields)
# Here we introduce two fake input fields, which will be used to hold internal pass-through and
# external pass-through fields in the form of a string.
input_view_fields.append((INTERNAL_PASS_THRU, SnapString,
"Dummy field which holds all the internal pass through fields serialized to a string."))
input_view_fields.append((EXTERNAL_PASS_THRU, SnapString,
"Dummy field which holds all the external pass through fields serialized to a string."))
# Create mappings between the DB column names and the input view fields.
for (i, input_field) in enumerate(input_view_fields):
field = input_field[keys.FIELD_NAME]
field_type = input_field[keys.FIELD_TYPE]
field_types[field] = field_type
fields.append(field)
db_field = "field%s" % i
i += 1
db_fields_to_fields[db_field] = field
fields_to_db_fields[field] = db_field
db_fields = [fields_to_db_fields[field] for field in fields]
db_file = None
db_file_name = None
cursor = None
con = None
try:
# Create a temp file to hold the SQLite database.
# Note that mkstemp opens the file as well, which we don't need,
# so close the temp file after it's been created.
(db_file, db_file_name) = tempfile.mkstemp(".db","snapunique")
os.close(db_file)
con = sqlite.connect(db_file_name)
cursor = con.cursor()
sqlite.register_adapter(Decimal, self._adapt_Decimal)
first = True
stmt = 'CREATE TABLE unique_tbl ('
for db_field in db_fields:
if not first:
stmt += ", "
stmt += db_field + " "
field = db_fields_to_fields[db_field]
field_type = field_types[field]
if field_type == SnapString:
stmt += "TEXT"
elif field_type == SnapNumber:
stmt += "DECIMAL"
elif field_type == SnapDateTime:
stmt += 'DATETIME'
else:
raise SnapObjTypeError('Unknown type %s', field_type)
if first:
first = False
# Unique on all fields other than the pass-through fields which are the last two columns of the table.
stmt += (", UNIQUE(%s) ON CONFLICT IGNORE)" % ", ".join(db_fields[:-2]))
cursor.execute(stmt)
insert_stmt = "INSERT INTO unique_tbl (" + \
",".join(db_fields) + \
") VALUES (" + \
",".join(['?' for i in db_fields]) + \
")"
self._process_records(input_view, cursor, insert_stmt)
stmt = "SELECT " + ",".join(db_fields) + " FROM unique_tbl"
cursor.execute(stmt)
output_field_types = list(output_view.field_types)
for row in sqlite_iter(cursor):
out_rec = output_view.create_record()
i = 0
for field in out_rec.field_names:
if row[i] is None:
out_rec[field] = row[i]
elif output_field_types[i] == SnapNumber:
out_rec[field] = Decimal(str(row[i]))
elif output_field_types[i] == SnapDateTime:
no_micros = row[i][:FORMATTED_DATE_LENGTH]
micros = row[i][FORMATTED_DATE_LENGTH+1:]
formatted = time.strptime(no_micros, DATE_FORMAT_STRING)
time_tuple = formatted[0:6]
dt = datetime(*time_tuple)
if micros:
dt = dt.replace(microsecond=int(micros))
out_rec[field] = dt
else:
out_rec[field] = row[i]
i += 1
# Process the pass-through fields if any.
if row[i] is not None:
# Have to use str() here, as pickle.loads does not take unicode. Tests show that
# using str() is not an issue, even when unicode values have been serialized by
# pickle.
out_rec._internal_pass_through[input_view.name] = pickle.loads(str(row[i]))
i += 1
if row[i] is not None:
out_rec._external_pass_through[input_view.name] = pickle.loads(str(row[i]))
output_view.write_record(out_rec)
output_view.completed()
except Exception, e:
e2 = SnapComponentError("Failed to unique the records")
e2.append(e)
raise e2
finally:
if cursor:
try:
cursor.close()
except:
# Can't do much here
pass
if con:
try:
con.close()
except:
pass
if db_file_name:
try:
os.remove(db_file_name)
except:
pass
def _adapt_Decimal(self, dec):
return str(dec)
def _process_records(self, input_view, cursor, insert_stmt):
while True:
record = input_view.read_record()
if record is None:
break
vals = [record[field_name] for field_name in record.field_names]
# Dump the internal and external pass-through fields as strings and append
# them to the end of the record.
# TODO: The record object should support methods that set and get internal and external
# pass-through fields.
# TODO: This manner of dumping pass-through fields to store in DB needs to be supported
# with some utility methods, as it can be reused in components like sort. Cannot create these
# utilities right now, as the customer cannot wait for 2.1.3 to be released. This component
# has to function with utilities in 2.1.2 for now.
if record.view_name in record._internal_pass_through:
internal_pt = pickle.dumps(record._internal_pass_through[record.view_name])
vals.append(internal_pt)
external_pt = pickle.dumps(record._external_pass_through[record.view_name])
vals.append(external_pt)
else:
vals.extend([None, None])
# Insert the fields into DB.
cursor.execute(insert_stmt, vals)
def upgrade_1_0_to_1_1(self):
"""
No-op upgrade only to change component doc URI during the upgrade
which will be by cc_info before calling this method.
"""
pass
|