
Commit 3a06ef8

Merge pull request #414 from CodeForPhilly/403-db-matching
pipeline matching using database
2 parents 19dad1e + 7df4c51 commit 3a06ef8

9 files changed, +357 -113 lines changed

src/server/alembic/versions/783cabf889d9_inital_schema_setup.py

Lines changed: 2 additions & 1 deletion
@@ -5,10 +5,12 @@
 Create Date: 2020-12-16 01:47:43.686881
 
 """
+from sqlalchemy.sql.expression import null
 from alembic import op
 import sqlalchemy as sa
 
 
+
 # revision identifiers, used by Alembic.
 revision = '783cabf889d9'
 down_revision = None
@@ -33,6 +35,5 @@ def upgrade():
         sa.Column('created', sa.DateTime,nullable=False, server_default='now()')
     )
 
-
 def downgrade():
     pass
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+"""Merges heads '8f4, '28b
+
+Revision ID: fc7325372396
+Revises: a3ba63dee8f4, fd187937528b
+Create Date: 2022-01-17 22:05:05.824901
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'fc7325372396'
+down_revision = ('a3ba63dee8f4', 'fd187937528b')
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    pass
+
+
+def downgrade():
+    pass
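This merge revision carries no schema changes; it only gives Alembic a single head again by pointing down_revision at both branch tips (a3ba63dee8f4 and fd187937528b). As a hedged aside that is not part of this commit, a revision like this is normally generated rather than hand-written; a rough programmatic sketch, where the alembic.ini path and the message are assumptions:

from alembic import command
from alembic.config import Config

def merge_migration_heads(ini_path='alembic.ini'):
    # Roughly equivalent to `alembic merge heads -m "..."` followed by `alembic upgrade head`
    cfg = Config(ini_path)
    command.merge(cfg, 'heads', message='merge a3ba63dee8f4 and fd187937528b')
    command.upgrade(cfg, 'head')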
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+"""create pdp_contacts table
+
+Revision ID: fd187937528b
+Revises: 57b547e9b464
+Create Date: 2021-08-10 20:16:54.169168
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects.postgresql import JSONB
+import datetime
+
+# revision identifiers, used by Alembic.
+revision = 'fd187937528b'
+down_revision = '57b547e9b464'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+
+    op.create_table('pdp_contacts',
+        sa.Column('_id', sa.Integer, primary_key=True, autoincrement=True),
+        sa.Column('matching_id', sa.Integer, primary_key=True),
+        sa.Column('source_type', sa.String, nullable=False),
+        sa.Column('source_id', sa.String, nullable=False),
+        sa.Column('is_organization', sa.Boolean),
+        sa.Column('first_name', sa.String),
+        sa.Column('last_name', sa.String),
+        sa.Column('email', sa.String),
+        sa.Column('mobile', sa.String),
+        sa.Column('street_and_number', sa.String),
+        sa.Column('apartment', sa.String),
+        sa.Column('city', sa.String),
+        sa.Column('state', sa.String),
+        sa.Column('zip', sa.String),
+        sa.Column('json', JSONB),
+        sa.Column('created_date', sa.DateTime, default=datetime.datetime.utcnow),
+        sa.Column('archived_date', sa.DateTime, default=None)
+    )
+
+def downgrade():
+
+    op.drop_table("pdp_contacts")
+    op.drop_table("pdp_contact_types")

src/server/config.py

Lines changed: 0 additions & 4 deletions
@@ -31,10 +31,6 @@
 
 engine = db.create_engine(DB)
 
-with engine.connect() as connection:
-    models.Base.metadata.create_all(connection)
-    # This is safe: by default, will check first to ensure tables don't already exist
-
 # Run Alembic to create managed tables
 # from alembic.config import Config
 # from alembic import command
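Dropping the create_all() block leaves schema management to Alembic, which the commented-out lines below it already hint at. A minimal sketch of what that wiring might look like if it were enabled; the alembic.ini path is an assumption, and this PR keeps the lines commented:

from alembic.config import Config
from alembic import command

def run_migrations(ini_path='alembic.ini'):
    # Create/upgrade the managed tables via migrations instead of Base.metadata.create_all()
    alembic_cfg = Config(ini_path)
    command.upgrade(alembic_cfg, 'head')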

src/server/datasource_manager.py

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,7 @@ def __clean_csv_headers(header):
                    'Apartment', 'City', 'State', 'Zip', 'Email', 'Phone', 'Animal_ids'],
     'volgistics': ['Last name', 'First name', 'Middle name', 'Number', 'Complete address', 'Street 1', 'Street 2',
                    'Street 3', 'City', 'State', 'Zip', 'All phone numbers', 'Home', 'Work', 'Cell', 'Email'],
-    'salesforcecontacts': ['Contact ID 18', 'First Name', 'Last Name', 'Mailing Street', 'Mailing City',
+    'salesforcecontacts': ['Account Name', 'Contact ID 18', 'First Name', 'Last Name', 'Mailing Street', 'Mailing City',
                            'Mailing State/Province', 'Mailing Zip/Postal Code', 'Mailing Country', 'Phone', 'Mobile',
                            'Email', 'Account ID 18', 'Volgistics ID', 'Person ID'],
     'volgisticsshifts': ['Number', 'Place', 'Assignment', 'From date', 'To date', 'Hours'],
@@ -115,6 +115,7 @@ def normalize_phone_number(number):
         "city": "mailing_city",
         "state": "mailing_state_province",
         "zip": "mailing_zip_postal_code",
+        "account_name": "account_name",
         "others": {
             "should_drop_first_column": True
         }
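The second hunk extends the salesforcecontacts column map with an account_name entry so the new 'Account Name' header survives normalization. The code that applies this map lives elsewhere in datasource_manager.py and is not shown in this diff; a hedged sketch of the usual pandas pattern for a map like this, treating the non-column 'others' key as flags (function name and exact behavior are assumptions):

import pandas as pd

def apply_column_map(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    flags = column_map.get('others', {})
    if flags.get('should_drop_first_column'):
        df = df.iloc[:, 1:]  # drop the leading index-like column
    renames = {old: new for old, new in column_map.items() if old != 'others'}
    return df.rename(columns=renames)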

src/server/pipeline/calssify_new_data.py

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@ def start(pdp_contacts_df, normalized_data):
 
     incoming_ids = normalized_data[["source_id", "source_type"]].drop_duplicates()
     existing_ids = pdp_contacts_df[["source_id", "source_type"]].drop_duplicates()
+    # probably need a smarter method of dropping duplicates, e.g. row with least amount of null values
+    normalized_data = normalized_data.drop_duplicates(["source_id", "source_type"])
     new_ids, reused_ids, old_ids = venn_diagram_join(incoming_ids, existing_ids)
     current_app.logger.info(" - ID's identified as {} new, {} reused, and {} old".format(
         new_ids.shape[0], reused_ids.shape[0], old_ids.shape[0]
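The added comment flags the quick drop_duplicates() call as a stopgap. A sketch of the "smarter" variant it suggests, keeping the most complete row per (source_id, source_type) pair rather than an arbitrary one; the helper name is hypothetical and not part of this commit:

def drop_duplicates_keep_most_complete(normalized_data):
    # Rank rows by how many fields are null, then keep the least-null row per key
    counts = normalized_data.isnull().sum(axis=1)
    return (normalized_data
            .assign(_null_count=counts)
            .sort_values('_null_count')
            .drop_duplicates(['source_id', 'source_type'], keep='first')
            .drop(columns='_null_count'))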

src/server/pipeline/flow_script.py

Lines changed: 56 additions & 59 deletions
@@ -7,10 +7,13 @@
 from config import RAW_DATA_PATH
 from config import engine
 from models import Base
+
+import time
+
 from rfm_funcs.create_scores import create_scores
 
 def start_flow():
-
+    start = time.time()
     job_id = admin_api.start_job()
     job_outcome = None
     trace_back_string = None
@@ -22,71 +25,64 @@ def start_flow():
 
     else:
 
+
         try:
 
             log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
 
             file_path_list = os.listdir(RAW_DATA_PATH)
 
             if file_path_list:
-                with engine.connect() as connection:
-                    Base.metadata.create_all(connection)
-
-                    # Get previous version of pdp_contacts table, which is used later to classify new records
-                    pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
-                    pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
-                    pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
-
-                    current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
-
-                    # Clean the input data and normalize/rename columns
-                    # Populate new records in secondary tables (donations, volunteer shifts)
-                    # input - existing files in path
-                    # output - normalized object of all entries, as well as the input json rows for primary sources
-                    log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
-                    normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
-
-                    # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
-                    # (If additional inconsistencies are encountered, may need to enforce the schema of
-                    # the contacts loader by initializing it from pdp_contacts.)
-                    normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-                    normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
-
-                    # Classifies rows to old rows that haven't changed, updated rows and new rows - compared to the existing state of the DB
-                    log_db.log_exec_status(job_id, 'classify', 'executing', '')
-                    rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
-
-                    # Archives rows the were updated in the current state of the DB (changes their archived_date to now)
-                    archive_rows.archive(connection, rows_classified["updated"])
-
-                    # Match new+updated records against previous version of pdp_contacts database, and
-                    # write these rows to the database.
-                    match_data.start(connection, rows_classified, manual_matches_df, job_id)
-
-                    # Copy raw input rows to json fields in pdp_contacts,
-                    # using a temporary table to simplify the update code.
-                    current_app.logger.info('Saving json of original rows to pdp_contacts')
-                    source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-                    # https://www.postgresql.org/docs/8.4/sql-update.html
-                    connection.execute('''
-                        UPDATE pdp_contacts pdp
-                        SET json = to_json(temp.json)
-                        FROM _temp_pdp_contacts_loader temp
-                        WHERE
-                            pdp.source_type = temp.source_type AND
-                            pdp.source_id = temp.source_id AND
-                            pdp.archived_date IS NULL
-                    ''')
-
-                    current_app.logger.info('Finished flow script run, running RFM scoring')
-
-                    score_result = create_scores() # Run RFM scoring on newly-processed donations
-                    current_app.logger.info('Scored ' + str(score_result) + ' tuples')
-
-                    job_outcome = 'completed'
-                    log_db.log_exec_status(job_id, 'flow', 'complete', '' )
-
-
+                with engine.begin() as connection:
+
+                    # Get previous version of pdp_contacts table, which is used later to classify new records
+                    pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
+                    pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
+                    pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
+
+                    current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
+
+                    # Clean the input data and normalize/rename columns
+                    # Populate new records in secondary tables (donations, volunteer shifts)
+                    # input - existing files in path
+                    # output - normalized object of all entries, as well as the input json rows for primary sources
+                    log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
+                    normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
+                    # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
+                    # (If additional inconsistencies are encountered, may need to enforce the schema of
+                    # the contacts loader by initializing it from pdp_contacts.)
+                    normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                    normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
+
+                    # Classifies rows to old rows that haven't changed, updated rows and new rows - compared to the existing state of the DB
+                    log_db.log_exec_status(job_id, 'classify', 'executing', '')
+                    rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
+
+                    # Archives rows the were updated in the current state of the DB (changes their archived_date to now)
+                    archive_rows.archive(connection, rows_classified["updated"])
+
+                    # Match new+updated records against previous version of pdp_contacts database, and
+                    # write these rows to the database.
+                    match_data.start(connection, rows_classified, manual_matches_df, job_id)
+
+                    # Copy raw input rows to json fields in pdp_contacts,
+                    # using a temporary table to simplify the update code.
+                    current_app.logger.info('Saving json of original rows to pdp_contacts')
+                    source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                    # https://www.postgresql.org/docs/8.4/sql-update.html
+                    connection.execute('''
+                        UPDATE pdp_contacts pdp
+                        SET json = to_json(temp.json)
+                        FROM _temp_pdp_contacts_loader temp
+                        WHERE
+                            pdp.source_type = temp.source_type AND
+                            pdp.source_id = temp.source_id AND
+                            pdp.archived_date IS NULL
+                    ''')
+
+                    current_app.logger.info('Finished flow script run')
+                    job_outcome = 'completed'
+                    log_db.log_exec_status(job_id, 'flow', 'complete', '' )
 
             else: # No files in list
                 current_app.logger.info('No files to process')
@@ -107,4 +103,5 @@ def start_flow():
         job_outcome = 'error'
         return 'error'
 
+    current_app.logger.info('Pipeline execution took {} seconds '.format(time.time() - start))
     return job_outcome
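Alongside the timing log, a key change in flow_script.py is swapping engine.connect() for engine.begin(), so the whole pipeline run now executes inside one transaction that commits on success and rolls back if any step raises. A minimal, standalone illustration of that standard SQLAlchemy behavior, not code from this PR; the connection string is a placeholder:

from sqlalchemy import create_engine, text

engine = create_engine('postgresql://user:pass@localhost/paws')  # placeholder DSN

# engine.begin(): commit at the end of the block, automatic rollback on error
with engine.begin() as connection:
    connection.execute(text("UPDATE pdp_contacts SET archived_date = now() WHERE _id = 1"))

# engine.connect(): transaction boundaries are left to the caller
with engine.connect() as connection:
    with connection.begin():
        connection.execute(text("UPDATE pdp_contacts SET archived_date = now() WHERE _id = 1"))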
