
Commit 3a06ef8

Merge pull request #414 from CodeForPhilly/403-db-matching
pipeline matching using database
2 parents 19dad1e + 7df4c51 commit 3a06ef8

9 files changed, +357 -113 lines changed

src/server/alembic/versions/783cabf889d9_inital_schema_setup.py

Lines changed: 2 additions & 1 deletion
@@ -5,10 +5,12 @@
 Create Date: 2020-12-16 01:47:43.686881
 
 """
+from sqlalchemy.sql.expression import null
 from alembic import op
 import sqlalchemy as sa
 
 
+
 # revision identifiers, used by Alembic.
 revision = '783cabf889d9'
 down_revision = None
@@ -33,6 +35,5 @@ def upgrade():
         sa.Column('created', sa.DateTime,nullable=False, server_default='now()')
     )
 
-
 def downgrade():
     pass
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+"""Merges heads '8f4, '28b
+
+Revision ID: fc7325372396
+Revises: a3ba63dee8f4, fd187937528b
+Create Date: 2022-01-17 22:05:05.824901
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'fc7325372396'
+down_revision = ('a3ba63dee8f4', 'fd187937528b')
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    pass
+
+
+def downgrade():
+    pass
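This merge revision carries no schema changes; it only gives Alembic a single head again by pointing down_revision at both branch tips (a3ba63dee8f4 and fd187937528b). As a hedged aside that is not part of this commit, a revision like this is normally generated rather than hand-written; a rough programmatic sketch, where the alembic.ini path and the message are assumptions:

from alembic import command
from alembic.config import Config

def merge_migration_heads(ini_path='alembic.ini'):
    # Roughly equivalent to `alembic merge heads -m "..."` followed by `alembic upgrade head`
    cfg = Config(ini_path)
    command.merge(cfg, 'heads', message='merge a3ba63dee8f4 and fd187937528b')
    command.upgrade(cfg, 'head')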
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+"""create pdp_contacts table
+
+Revision ID: fd187937528b
+Revises: 57b547e9b464
+Create Date: 2021-08-10 20:16:54.169168
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import JSONB
+import datetime
+
+# revision identifiers, used by Alembic.
+revision = 'fd187937528b'
+down_revision = '57b547e9b464'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+
+    op.create_table('pdp_contacts',
+        sa.Column('_id', sa.Integer, primary_key=True, autoincrement=True),
+        sa.Column('matching_id', sa.Integer, primary_key=True),
+        sa.Column('source_type', sa.String, nullable=False),
+        sa.Column('source_id', sa.String, nullable=False),
+        sa.Column('is_organization', sa.Boolean),
+        sa.Column('first_name', sa.String),
+        sa.Column('last_name', sa.String),
+        sa.Column('email', sa.String),
+        sa.Column('mobile', sa.String),
+        sa.Column('street_and_number', sa.String),
+        sa.Column('apartment', sa.String),
+        sa.Column('city', sa.String),
+        sa.Column('state', sa.String),
+        sa.Column('zip', sa.String),
+        sa.Column('json', JSONB),
+        sa.Column('created_date', sa.DateTime, default=datetime.datetime.utcnow),
+        sa.Column('archived_date', sa.DateTime, default=None)
+    )
+
+def downgrade():
+
+    op.drop_table("pdp_contacts")
+    op.drop_table("pdp_contact_types")

src/server/config.py

Lines changed: 0 additions & 4 deletions
@@ -31,10 +31,6 @@
 
 engine = db.create_engine(DB)
 
-with engine.connect() as connection:
-    models.Base.metadata.create_all(connection)
-    # This is safe: by default, will check first to ensure tables don't already exist
-
 # Run Alembic to create managed tables
 # from alembic.config import Config
 # from alembic import command
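Dropping the create_all() block leaves schema management to Alembic, which the commented-out lines below it already hint at. A minimal sketch of what that wiring might look like if it were enabled; the alembic.ini path is an assumption, and this PR keeps the lines commented:

from alembic.config import Config
from alembic import command

def run_migrations(ini_path='alembic.ini'):
    # Create/upgrade the managed tables via migrations instead of Base.metadata.create_all()
    alembic_cfg = Config(ini_path)
    command.upgrade(alembic_cfg, 'head')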

src/server/datasource_manager.py

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,7 @@ def __clean_csv_headers(header):
                    'Apartment', 'City', 'State', 'Zip', 'Email', 'Phone', 'Animal_ids'],
     'volgistics': ['Last name', 'First name', 'Middle name', 'Number', 'Complete address', 'Street 1', 'Street 2',
                    'Street 3', 'City', 'State', 'Zip', 'All phone numbers', 'Home', 'Work', 'Cell', 'Email'],
-    'salesforcecontacts': ['Contact ID 18', 'First Name', 'Last Name', 'Mailing Street', 'Mailing City',
+    'salesforcecontacts': ['Account Name', 'Contact ID 18', 'First Name', 'Last Name', 'Mailing Street', 'Mailing City',
                            'Mailing State/Province', 'Mailing Zip/Postal Code', 'Mailing Country', 'Phone', 'Mobile',
                            'Email', 'Account ID 18', 'Volgistics ID', 'Person ID'],
     'volgisticsshifts': ['Number', 'Place', 'Assignment', 'From date', 'To date', 'Hours'],
@@ -115,6 +115,7 @@ def normalize_phone_number(number):
         "city": "mailing_city",
         "state": "mailing_state_province",
         "zip": "mailing_zip_postal_code",
+        "account_name": "account_name",
         "others": {
             "should_drop_first_column": True
         }
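The second hunk extends the salesforcecontacts column map with an account_name entry so the new 'Account Name' header survives normalization. The code that applies this map lives elsewhere in datasource_manager.py and is not shown in this diff; a hedged sketch of the usual pandas pattern for a map like this, treating the non-column 'others' key as flags (function name and exact behavior are assumptions):

import pandas as pd

def apply_column_map(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    flags = column_map.get('others', {})
    if flags.get('should_drop_first_column'):
        df = df.iloc[:, 1:]  # drop the leading index-like column
    renames = {old: new for old, new in column_map.items() if old != 'others'}
    return df.rename(columns=renames)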

src/server/pipeline/calssify_new_data.py

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@ def start(pdp_contacts_df, normalized_data):
 
     incoming_ids = normalized_data[["source_id", "source_type"]].drop_duplicates()
     existing_ids = pdp_contacts_df[["source_id", "source_type"]].drop_duplicates()
+    # probably need a smarter method of dropping duplicates, e.g. row with least amount of null values
+    normalized_data = normalized_data.drop_duplicates(["source_id", "source_type"])
     new_ids, reused_ids, old_ids = venn_diagram_join(incoming_ids, existing_ids)
     current_app.logger.info(" - ID's identified as {} new, {} reused, and {} old".format(
         new_ids.shape[0], reused_ids.shape[0], old_ids.shape[0]
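The added comment flags the quick drop_duplicates() call as a stopgap. A sketch of the "smarter" variant it suggests, keeping the most complete row per (source_id, source_type) pair rather than an arbitrary one; the helper name is hypothetical and not part of this commit:

def drop_duplicates_keep_most_complete(normalized_data):
    # Rank rows by how many fields are null, then keep the least-null row per key
    counts = normalized_data.isnull().sum(axis=1)
    return (normalized_data
            .assign(_null_count=counts)
            .sort_values('_null_count')
            .drop_duplicates(['source_id', 'source_type'], keep='first')
            .drop(columns='_null_count'))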

src/server/pipeline/flow_script.py

Lines changed: 56 additions & 59 deletions
@@ -7,10 +7,13 @@
 from config import RAW_DATA_PATH
 from config import engine
 from models import Base
+
+import time
+
 from rfm_funcs.create_scores import create_scores
 
 def start_flow():
-
+    start = time.time()
     job_id = admin_api.start_job()
     job_outcome = None
     trace_back_string = None
@@ -22,71 +25,64 @@ def start_flow():
 
     else:
 
+
         try:
 
             log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
 
             file_path_list = os.listdir(RAW_DATA_PATH)
 
             if file_path_list:
-                with engine.connect() as connection:
-                    Base.metadata.create_all(connection)
-
-                    # Get previous version of pdp_contacts table, which is used later to classify new records
-                    pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
-                    pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
-                    pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
-
-                    current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
-
-                    # Clean the input data and normalize/rename columns
-                    # Populate new records in secondary tables (donations, volunteer shifts)
-                    # input - existing files in path
-                    # output - normalized object of all entries, as well as the input json rows for primary sources
-                    log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
-                    normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
-
-                    # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
-                    # (If additional inconsistencies are encountered, may need to enforce the schema of
-                    # the contacts loader by initializing it from pdp_contacts.)
-                    normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-                    normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
-
-                    # Classifies rows to old rows that haven't changed, updated rows and new rows - compared to the existing state of the DB
-                    log_db.log_exec_status(job_id, 'classify', 'executing', '')
-                    rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
-
-                    # Archives rows the were updated in the current state of the DB (changes their archived_date to now)
-                    archive_rows.archive(connection, rows_classified["updated"])
-
-                    # Match new+updated records against previous version of pdp_contacts database, and
-                    # write these rows to the database.
-                    match_data.start(connection, rows_classified, manual_matches_df, job_id)
-
-                    # Copy raw input rows to json fields in pdp_contacts,
-                    # using a temporary table to simplify the update code.
-                    current_app.logger.info('Saving json of original rows to pdp_contacts')
-                    source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-                    # https://www.postgresql.org/docs/8.4/sql-update.html
-                    connection.execute('''
-                        UPDATE pdp_contacts pdp
-                        SET json = to_json(temp.json)
-                        FROM _temp_pdp_contacts_loader temp
-                        WHERE
-                            pdp.source_type = temp.source_type AND
-                            pdp.source_id = temp.source_id AND
-                            pdp.archived_date IS NULL
-                    ''')
-
-                    current_app.logger.info('Finished flow script run, running RFM scoring')
-
-                    score_result = create_scores() # Run RFM scoring on newly-processed donations
-                    current_app.logger.info('Scored ' + str(score_result) + ' tuples')
-
-                    job_outcome = 'completed'
-                    log_db.log_exec_status(job_id, 'flow', 'complete', '' )
-
-
+                with engine.begin() as connection:
+
+                    # Get previous version of pdp_contacts table, which is used later to classify new records
+                    pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
+                    pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
+                    pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
+
+                    current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
+
+                    # Clean the input data and normalize/rename columns
+                    # Populate new records in secondary tables (donations, volunteer shifts)
+                    # input - existing files in path
+                    # output - normalized object of all entries, as well as the input json rows for primary sources
+                    log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
+                    normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
+                    # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
+                    # (If additional inconsistencies are encountered, may need to enforce the schema of
+                    # the contacts loader by initializing it from pdp_contacts.)
+                    normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                    normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
+
+                    # Classifies rows to old rows that haven't changed, updated rows and new rows - compared to the existing state of the DB
+                    log_db.log_exec_status(job_id, 'classify', 'executing', '')
+                    rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
+
+                    # Archives rows the were updated in the current state of the DB (changes their archived_date to now)
+                    archive_rows.archive(connection, rows_classified["updated"])
+
+                    # Match new+updated records against previous version of pdp_contacts database, and
+                    # write these rows to the database.
+                    match_data.start(connection, rows_classified, manual_matches_df, job_id)
+
+                    # Copy raw input rows to json fields in pdp_contacts,
+                    # using a temporary table to simplify the update code.
+                    current_app.logger.info('Saving json of original rows to pdp_contacts')
+                    source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                    # https://www.postgresql.org/docs/8.4/sql-update.html
+                    connection.execute('''
+                        UPDATE pdp_contacts pdp
+                        SET json = to_json(temp.json)
+                        FROM _temp_pdp_contacts_loader temp
+                        WHERE
+                            pdp.source_type = temp.source_type AND
+                            pdp.source_id = temp.source_id AND
+                            pdp.archived_date IS NULL
+                    ''')
+
+                    current_app.logger.info('Finished flow script run')
+                    job_outcome = 'completed'
+                    log_db.log_exec_status(job_id, 'flow', 'complete', '' )
 
             else: # No files in list
                 current_app.logger.info('No files to process')
@@ -107,4 +103,5 @@ def start_flow():
         job_outcome = 'error'
         return 'error'
 
+    current_app.logger.info('Pipeline execution took {} seconds '.format(time.time() - start))
     return job_outcome
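Alongside the timing log, a key change in flow_script.py is swapping engine.connect() for engine.begin(), so the whole pipeline run now executes inside one transaction that commits on success and rolls back if any step raises. A minimal, standalone illustration of that standard SQLAlchemy behavior, not code from this PR; the connection string is a placeholder:

from sqlalchemy import create_engine, text

engine = create_engine('postgresql://user:pass@localhost/paws')  # placeholder DSN

# engine.begin(): commit at the end of the block, automatic rollback on error
with engine.begin() as connection:
    connection.execute(text("UPDATE pdp_contacts SET archived_date = now() WHERE _id = 1"))

# engine.connect(): transaction boundaries are left to the caller
with engine.connect() as connection:
    with connection.begin():
        connection.execute(text("UPDATE pdp_contacts SET archived_date = now() WHERE _id = 1"))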
