# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: Unique.py 10330 2009-12-24 22:13:38Z grisha $
"""
Unique Module and Resource Definition
"""
from snaplogic.common.snap_exceptions import SnapObjTypeError
import tempfile
import os
from decimal import Decimal
from datetime import datetime
import time
from sqlite3 import dbapi2
import cPickle as pickle
import snaplogic.components as components
from snaplogic.components import computils
from snaplogic.common import snap_log,sqlite_iter
from snaplogic.common import version_info
from snaplogic.cc import component_api
from snaplogic.cc.component_api import ComponentAPI
from snaplogic.cc import prop
from snaplogic.snapi_base import keys
from snaplogic.common.snap_exceptions import SnapComponentError
from snaplogic.common.data_types import SnapNumber,SnapString,SnapDateTime
# Name of the boolean resource property that tells the component whether
# duplicate records arrive adjacent to each other in the input stream.
CONTIGUOUS_PROP = "Contiguous Duplicates"
# strftime/strptime format used when round-tripping SnapDateTime values
# through SQLite DATETIME columns (which store them as text).
DATE_FORMAT_STRING = "%Y-%m-%d %H:%M:%S"
# Length of a date formatted with DATE_FORMAT_STRING (always 19, since the
# format is fixed-width); used to split the microseconds suffix off a
# "YYYY-MM-DD HH:MM:SS.ffffff" string coming back from the database.
FORMATTED_DATE_LENGTH = len(time.strftime(DATE_FORMAT_STRING, datetime.today().timetuple()))
# Couple of field names that are only used in this component code
# and are not visible outside the component.
INTERNAL_PASS_THRU = "__INT_PASS_THRU__...%%%"
EXTERNAL_PASS_THRU = "__EXT_PASS_THRU__...%%%"
class Unique(ComponentAPI):
"""
This component filters out duplicate records. Here 'duplicate' is defined as the records that have the
same value for the fields specified in the input view. Any pass-through fields are ignored in this
comparison. This component can run in two different modes.
1) If the input records to the component are ordered in such a way that the duplicate records are adjacent
to each other, then the property '%s' (%s) should be set to True. This allows the component to run in
an optimized manner and the records can be processed with no buffering.
2) If the input records have no such ordering, then the property '%s' (%s) should be set to false. The
component in this mode will have to buffer all the records and reorder them before eliminating the
duplicate records.
""" % (CONTIGUOUS_PROP, CONTIGUOUS_PROP, CONTIGUOUS_PROP, CONTIGUOUS_PROP)
api_version = '1.0'
component_version = '1.1'
capabilities = {
ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT : 1,
ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT : 1,
ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT : 1,
ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT : 1,
ComponentAPI.CAPABILITY_ALLOW_PASS_THROUGH : True
}
component_description = "Unique"
component_label = "Unique"
component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/Unique" % \
version_info.doc_uri_version
def create_resource_template(self):
"""Create Unique resource template."""
p = prop.SimpleProp(CONTIGUOUS_PROP,
"boolean",
"Set to true if duplicate records are adjacent to each other in the input stream. "
"The component executes much faster if this is the case. ",
required=True)
self.set_property_def(CONTIGUOUS_PROP, p)
self.set_property_value(CONTIGUOUS_PROP, False)
def validate(self, err_obj):
"""Validate the Unique Resource."""
# Validate that the output view is the same as the input view.
# Make sure that the output view matches the input view. (at least the types)
input_views = self.list_input_view_names()
input_view_name = input_views[keys.SINGLE_VIEW]
input_view = self.get_input_view_def(input_view_name)
input_view_fields = input_view[keys.VIEW_FIELDS]
output_views = self.list_output_view_names()
output_view_name = output_views[keys.SINGLE_VIEW]
output_view = self.get_output_view_def(output_view_name)
# Field count matches?
if len(output_view[keys.VIEW_FIELDS]) != len(input_view_fields):
err_obj.get_output_view_err()[output_views[keys.SINGLE_VIEW]].set_message(
"Output view '%s' field count '%d' does not match corresponding input view '%s' field count '%d'." \
% (output_view_name, len(output_view[keys.VIEW_FIELDS]),
input_view_name, len(input_view_fields)))
else:
# Field types match?
for i, output_field in enumerate(output_view[keys.VIEW_FIELDS]):
output_field_name = output_field[keys.FIELD_NAME]
output_field_type = output_field[keys.FIELD_TYPE]
input_field_name = input_view_fields[i][keys.FIELD_NAME]
input_field_type = input_view_fields[i][keys.FIELD_TYPE]
if output_field_type != input_field_type:
err_obj.get_output_view_err()[output_views[keys.SINGLE_VIEW]][keys.VIEW_FIELDS][i].set_message(
"Output view '%s' field '%s' type '%s' does not match corresponding input view '%s' field '%s' type '%s'." \
% (output_view_name, output_field_name, output_field_type,
input_view_name, input_field_name, input_field_type))
def execute(self, input_views, output_views):
try:
output_view = output_views.values()[keys.SINGLE_VIEW]
except IndexError:
raise SnapComponentError("No output view connected.")
try:
input_view = input_views.values()[keys.SINGLE_VIEW]
except IndexError:
raise SnapComponentError("No input view connected.")
if self.get_property_value(CONTIGUOUS_PROP):
self.unique_in_stream(input_view, output_view)
else:
self.unique_in_db(input_view, output_view)
def unique_in_stream(self, input_view, output_view):
"""
Process the data under the assumption that duplicate records are adjacent to each other.
@param input_view: The input view object to read records from.
@type input_view: L{InputView}
@param output_view: The output view object to write unique records to.
@type output_view: L{OutputView}
"""
prev_items = None
rec = input_view.read_record()
while rec is not None:
rec_items = rec.items()
# A record structure has other hidden fields (like pass-through), so specifically compare just
# the key/value pairs.
if rec_items != prev_items:
# A unique record.
output_view.write_record(rec)
prev_items = rec_items
rec = input_view.read_record()
output_view.completed()
def unique_in_db(self, input_view, output_view):
"""
Process the records by entering them into a database and then retrieving the unique records.
@param input_view: The input view object to read records from.
@type input_view: L{InputView}
@param output_view: The output view object to write unique records to.
@type output_view: L{OutputView}
"""
fields_to_db_fields = {}
db_fields_to_fields = {}
field_types = {}
db_fields = []
fields = []
input_view_fields = list(input_view.fields)
# Here we introduce two fake input fields, which will be used to hold internal pass-through and
# external pass-through fields in the form of a string.
input_view_fields.append((INTERNAL_PASS_THRU, SnapString,
"Dummy field which holds all the internal pass through fields serialized to a string."))
input_view_fields.append((EXTERNAL_PASS_THRU, SnapString,
"Dummy field which holds all the external pass through fields serialized to a string."))
# Create mappings between the DB column names and the input view fields.
for (i, input_field) in enumerate(input_view_fields):
field = input_field[keys.FIELD_NAME]
field_type = input_field[keys.FIELD_TYPE]
field_types[field] = field_type
fields.append(field)
db_field = "field%s" % i
i += 1
db_fields_to_fields[db_field] = field
fields_to_db_fields[field] = db_field
db_fields = [fields_to_db_fields[field] for field in fields]
db_file = None
db_file_name = None
cursor = None
con = None
try:
# Create a temp file to hold the SQLite database.
# Note that mkstemp opens the file as well, which we don't need,
# so close the temp file after it's been created.
(db_file, db_file_name) = tempfile.mkstemp(".db","snapunique")
os.close(db_file)
con = sqlite.connect(db_file_name)
cursor = con.cursor()
sqlite.register_adapter(Decimal, self._adapt_Decimal)
first = True
stmt = 'CREATE TABLE unique_tbl ('
for db_field in db_fields:
if not first:
stmt += ", "
stmt += db_field + " "
field = db_fields_to_fields[db_field]
field_type = field_types[field]
if field_type == SnapString:
stmt += "TEXT"
elif field_type == SnapNumber:
stmt += "DECIMAL"
elif field_type == SnapDateTime:
stmt += 'DATETIME'
else:
raise SnapObjTypeError('Unknown type %s', field_type)
if first:
first = False
# Unique on all fields other than the pass-through fields which are the last two columns of the table.
stmt += (", UNIQUE(%s) ON CONFLICT IGNORE)" % ", ".join(db_fields[:-2]))
cursor.execute(stmt)
insert_stmt = "INSERT INTO unique_tbl (" + \
",".join(db_fields) + \
") VALUES (" + \
",".join(['?' for i in db_fields]) + \
")"
self._process_records(input_view, cursor, insert_stmt)
stmt = "SELECT " + ",".join(db_fields) + " FROM unique_tbl"
cursor.execute(stmt)
output_field_types = list(output_view.field_types)
for row in sqlite_iter(cursor):
out_rec = output_view.create_record()
i = 0
for field in out_rec.field_names:
if row[i] is None:
out_rec[field] = row[i]
elif output_field_types[i] == SnapNumber:
out_rec[field] = Decimal(str(row[i]))
elif output_field_types[i] == SnapDateTime:
no_micros = row[i][:FORMATTED_DATE_LENGTH]
micros = row[i][FORMATTED_DATE_LENGTH+1:]
formatted = time.strptime(no_micros, DATE_FORMAT_STRING)
time_tuple = formatted[0:6]
dt = datetime(*time_tuple)
if micros:
dt = dt.replace(microsecond=int(micros))
out_rec[field] = dt
else:
out_rec[field] = row[i]
i += 1
# Process the pass-through fields if any.
if row[i] is not None:
# Have to use str() here, as pickle.loads does not take unicode. Tests show that
# using str() is not an issue, even when unicode values have been serialized by
# pickle.
out_rec._internal_pass_through[input_view.name] = pickle.loads(str(row[i]))
i += 1
if row[i] is not None:
out_rec._external_pass_through[input_view.name] = pickle.loads(str(row[i]))
output_view.write_record(out_rec)
output_view.completed()
except Exception, e:
e2 = SnapComponentError("Failed to unique the records")
e2.append(e)
raise e2
finally:
if cursor:
try:
cursor.close()
except:
# Can't do much here
pass
if con:
try:
con.close()
except:
pass
if db_file_name:
try:
os.remove(db_file_name)
except:
pass
def _adapt_Decimal(self, dec):
return str(dec)
def _process_records(self, input_view, cursor, insert_stmt):
while True:
record = input_view.read_record()
if record is None:
break
vals = [record[field_name] for field_name in record.field_names]
# Dump the internal and external pass-through fields as strings and append
# them to the end of the record.
# TODO: The record object should support methods that set and get internal and external
# pass-through fields.
# TODO: This manner of dumping pass-through fields to store in DB needs to be supported
# with some utility methods, as it can be reused in components like sort. Cannot create these
# utilities right now, as the customer cannot wait for 2.1.3 to be released. This component
# has to function with utilities in 2.1.2 for now.
if record.view_name in record._internal_pass_through:
internal_pt = pickle.dumps(record._internal_pass_through[record.view_name])
vals.append(internal_pt)
external_pt = pickle.dumps(record._external_pass_through[record.view_name])
vals.append(external_pt)
else:
vals.extend([None, None])
# Insert the fields into DB.
cursor.execute(insert_stmt, vals)
def upgrade_1_0_to_1_1(self):
"""
No-op upgrade only to change component doc URI during the upgrade
which will be by cc_info before calling this method.
"""
pass
|