@@ -1,4 +1,4 @@
-import os
+import os, sys, traceback
 import pandas as pd
 
 from flask import current_app
@@ -12,73 +12,95 @@
 def start_flow():
 
     job_id = admin_api.start_job()
+    job_outcome = None
+    trace_back_string = None
+
 
     if not job_id:
         current_app.logger.info('Failed to get job_id')
         job_outcome = 'busy'
 
     else:
-        log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
-
-        file_path_list = os.listdir(RAW_DATA_PATH)
-
-        if file_path_list:
-            with engine.connect() as connection:
-                Base.metadata.create_all(connection)
-
-                # Get previous version of pdp_contacts table, which is used later to classify new records
-                pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
-                pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
-                pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
-
-                current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
-
-                # Clean the input data and normalize/rename columns
-                # Populate new records in secondary tables (donations, volunteer shifts)
-                # input - existing files in path
-                # output - normalized object of all entries, as well as the input json rows for primary sources
-                log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
-                normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
-
-                # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
-                # (If additional inconsistencies are encountered, may need to enforce the schema of
-                # the contacts loader by initializing it from pdp_contacts.)
-                normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-                normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
-
-                # Classifies rows to old rows that haven't changed, updated rows and new rows - compared to the existing state of the DB
-                log_db.log_exec_status(job_id, 'classify', 'executing', '')
-                rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
-
-                # Archives rows the were updated in the current state of the DB (changes their archived_date to now)
-                archive_rows.archive(connection, rows_classified["updated"])
-
-                # Match new+updated records against previous version of pdp_contacts database, and
-                # write these rows to the database.
-                match_data.start(connection, rows_classified, manual_matches_df, job_id)
-
-                # Copy raw input rows to json fields in pdp_contacts,
-                # using a temporary table to simplify the update code.
-                current_app.logger.info('Saving json of original rows to pdp_contacts')
-                source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
-                # https://www.postgresql.org/docs/8.4/sql-update.html
-                connection.execute('''
-                    UPDATE pdp_contacts pdp
-                    SET json = to_json(temp.json)
-                    FROM _temp_pdp_contacts_loader temp
-                    WHERE
-                        pdp.source_type = temp.source_type AND
-                        pdp.source_id = temp.source_id AND
-                        pdp.archived_date IS NULL
-                ''')
-
-                current_app.logger.info('Finished flow script run')
-                job_outcome = 'completed'
-
-        else:  # No files in list
-            current_app.logger.info('No files to process')
-            job_outcome = 'nothing to do'
-
-        log_db.log_exec_status(job_id, 'flow', 'complete', '')
+
+        try:
+
+            log_db.log_exec_status(job_id, 'start_flow', 'executing', '')
+
+            file_path_list = os.listdir(RAW_DATA_PATH)
+
+            if file_path_list:
+                with engine.connect() as connection:
+                    Base.metadata.create_all(connection)
+
+                    # Get previous version of pdp_contacts table, which is used later to classify new records
+                    pdp_contacts_df = pd.read_sql_table('pdp_contacts', connection)
+                    pdp_contacts_df = pdp_contacts_df[pdp_contacts_df["archived_date"].isnull()]
+                    pdp_contacts_df = pdp_contacts_df.drop(columns=['archived_date', 'created_date', '_id', 'matching_id'])
+
+                    current_app.logger.info('Loaded {} records from pdp_contacts table'.format(pdp_contacts_df.shape[0]))
+
+                    # Clean the input data and normalize/rename columns
+                    # Populate new records in secondary tables (donations, volunteer shifts)
+                    # input - existing files in path
+                    # output - normalized object of all entries, as well as the input json rows for primary sources
+                    log_db.log_exec_status(job_id, 'clean_and_load', 'executing', '')
+                    normalized_data, source_json, manual_matches_df = clean_and_load_data.start(connection, pdp_contacts_df, file_path_list)
+
+                    # Standardize column data types via postgres (e.g. reading a csv column as int vs. str)
+                    # (If additional inconsistencies are encountered, may need to enforce the schema of
+                    # the contacts loader by initializing it from pdp_contacts.)
+                    normalized_data.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                    normalized_data = pd.read_sql_table('_temp_pdp_contacts_loader', connection)
+
+                    # Classifies rows as unchanged, updated, or new, compared to the existing state of the DB
+                    log_db.log_exec_status(job_id, 'classify', 'executing', '')
+                    rows_classified = calssify_new_data.start(pdp_contacts_df, normalized_data)
+
+                    # Archives rows that were updated in the current state of the DB (changes their archived_date to now)
+                    archive_rows.archive(connection, rows_classified["updated"])
+
+                    # Match new+updated records against previous version of pdp_contacts database, and
+                    # write these rows to the database.
+                    match_data.start(connection, rows_classified, manual_matches_df, job_id)
+
+                    # Copy raw input rows to json fields in pdp_contacts,
+                    # using a temporary table to simplify the update code.
+                    current_app.logger.info('Saving json of original rows to pdp_contacts')
+                    source_json.to_sql('_temp_pdp_contacts_loader', connection, index=False, if_exists='replace')
+                    # https://www.postgresql.org/docs/8.4/sql-update.html
+                    connection.execute('''
+                        UPDATE pdp_contacts pdp
+                        SET json = to_json(temp.json)
+                        FROM _temp_pdp_contacts_loader temp
+                        WHERE
+                            pdp.source_type = temp.source_type AND
+                            pdp.source_id = temp.source_id AND
+                            pdp.archived_date IS NULL
+                    ''')
+
+                    current_app.logger.info('Finished flow script run')
+                    job_outcome = 'completed'
+                    log_db.log_exec_status(job_id, 'flow', 'complete', '')
+
+
+
+            else:  # No files in list
+                current_app.logger.info('No files to process')
+                job_outcome = 'nothing to do'
+                log_db.log_exec_status(job_id, 'flow', 'complete', '')
+
+
+        except Exception as e:
+            current_app.logger.error(e)
+            trace_back_string = traceback.format_exc()
+            current_app.logger.error(trace_back_string)
+
+        finally:
+            if job_outcome not in ('completed', 'nothing to do'):
+
+                log_db.log_exec_status(job_id, 'flow', 'error', trace_back_string)
+                current_app.logger.error("Uncaught error status, setting job status to 'error'")
+                job_outcome = 'error'
+                return 'error'
 
     return job_outcome
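
The heart of this change is the try/except/finally outcome handling: any failing step leaves a traceback in the exec-status log and the job ends in an 'error' state. Below is a minimal, runnable sketch of that pattern, with a hypothetical log() function standing in for log_db.log_exec_status and a simulated failure standing in for any pipeline step raising:

import traceback

def log(job_id, stage, status, details):
    # Hypothetical stand-in for log_db.log_exec_status.
    print(job_id, stage, status, (details or '')[:72])

def run_flow(fail):
    job_id = 1
    job_outcome = None
    trace_back_string = None
    try:
        log(job_id, 'start_flow', 'executing', '')
        if fail:
            # Stands in for any pipeline step raising (load, classify, match, ...).
            raise RuntimeError('simulated step failure')
        job_outcome = 'completed'
        log(job_id, 'flow', 'complete', '')
    except Exception:
        # Capture the traceback so the finally block can persist it.
        trace_back_string = traceback.format_exc()
    finally:
        if job_outcome != 'completed':
            # A run that never reached 'completed' is recorded as an error.
            # Note: a return inside finally overrides any earlier return.
            log(job_id, 'flow', 'error', trace_back_string)
            return 'error'
    return job_outcome

assert run_flow(fail=False) == 'completed'
assert run_flow(fail=True) == 'error'

In the flow itself, 'nothing to do' is likewise a successful outcome, which is why its finally guard checks job_outcome not in ('completed', 'nothing to do') rather than a single value.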