Tweaks for create_scores

c-simpson · c-simpson · commit 93b95d7dc296 · 2021-08-16T19:16:39.000-04:00
diff --git a/src/server/api/admin_api.py b/src/server/api/admin_api.py
@@ -403,3 +403,14 @@ def hit_gdrs():
 #     d = read_rfm_edges()        # read it again     
 #     print("round-trip d is : \n " + str(d) )
 #     return "OK"
+
+# NOTE(review): create_scores.py imports this module back as `api.admin_api`; confirm the `server.` prefix resolves from the same import root.
+from server.rfm_funcs.create_scores import create_scores
+@admin_api.route("/api/admin/test_create_scores", methods=["GET"])
+def hit_create_scores():
+    """Run create_scores() for a fixed test date and return the score count it reports."""
+    current_app.logger.info("Hitting create_scores() ")
+    tuple_count = create_scores('2021-07-27')
+    # create_scores() returns an int, so str + int concatenation would raise TypeError; use lazy %-formatting.
+    current_app.logger.info("create_scores()  processed %s scores", tuple_count)
+    return "create_scores()  processed " + str(tuple_count) + " scores"   # a view must return a response, not None
diff --git a/src/server/rfm_funcs/create_scores.py b/src/server/rfm_funcs/create_scores.py
@@ -1,111 +1,121 @@
-def create_scores(connection, query_date):
+from config import engine
+
+import pandas as pd
+import numpy as np
+from datetime import datetime, date
+from collections import Counter
+
+def date_difference(my_date, query_date):
     '''
-    requires query date as input-- must be string in the following format "%Y-%m-%d"
-    returns a list of matching_ids and scores as tuples
-    will also insert rfm scores into rfm_scores table----see src/server/api/admin_api.py
+    Returns the timedelta (query_date - my_date) between a single donation date (one dataframe row) and the date on which matching occurs.
+    Both arguments are passed through str() and must then be in "%Y-%m-%d" format; pipeline matching should provide the query_date.
     '''
-    # Import dependencies
-    import pandas as pd
-    import numpy as np
-    from datetime import datetime, date
-    from collections import Counter
-    from api.admin_api import read_rfm_edges,  insert_rfm_scores
-
 
-    # read in data from database via pull_donations_for_rfm() func (reads in as a list of tuples)
-    df = pd.read_sql(
-        """
-        select pc.matching_id, s.amount, s.close_date 
-        from salesforcedonations s 
-        inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts'
-        where pc.archived_date is null order by matching_id
-        """
-        , connection)
-    df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date'])
+    d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
+    d2 = datetime.strptime(str(query_date), "%Y-%m-%d")
+    diff = (d2 - d1)
+    return diff
 
-    # read in labels and bin edges from table
-    recency_labels = [5,4,3,2,1]
-    recency_bins =   list(read_rfm_edges('r').values())    #imported from table
 
-    frequency_labels = [1,2,3,4,5]
-    frequency_bins  =  list(read_rfm_edges('f').values())    #imported from table
+def create_scores(query_date):
+    '''
+    requires query date as input-- must be string in the following format "%Y-%m-%d"
+    returns the number of score tuples generated (the tuples themselves are passed to insert_rfm_scores)
+    will also insert rfm scores into rfm_scores table----see src/server/api/admin_api.py
+    '''
 
-    monetary_labels = [ 1,2,3,4,5]
-    monetary_bins =   list(read_rfm_edges('m').values())      #imported from table
+    with engine.connect() as connection:
 
+        # read donations joined to contacts whose archived_date is null, directly into a dataframe
+        df = pd.read_sql(
+            """
+            select pc.matching_id, s.amount, s.close_date 
+            from salesforcedonations s 
+            inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts'
+            where pc.archived_date is null order by matching_id
+            """
+            , connection)
+        df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date'])
 
-    ########################## recency #########################################
+        from api.admin_api import read_rfm_edges,  insert_rfm_scores  # Avoid circular import issues
 
+        rfm_dict = read_rfm_edges()  # NOTE(review): old code used read_rfm_edges('r').values(); if entries are dicts, list() below yields keys -- confirm
+        recency_labels = [5,4,3,2,1]
+        recency_bins =   list(rfm_dict['r'])    #imported from table
 
-    donations_past_year = df
-    donations_past_year['close_date'] =pd.to_datetime(donations_past_year['close_date']).dt.date
+        frequency_labels = [1,2,3,4,5]
+        frequency_bins  =  list(rfm_dict['f'])    #imported from table
 
-        # calculate date difference between input date and individual row close date
-    from rfm_functions import date_difference
-    days = []
-    for ii in donations_past_year['close_date']:
-        days.append(date_difference(ii, str(query_date)))
-    donations_past_year['days_since'] = days
+        monetary_labels = [ 1,2,3,4,5]
+        monetary_bins =   list(rfm_dict['m'])      #imported from table
 
-    grouped_past_year = donations_past_year.groupby('_id').agg({'days_since': ['min']}).reset_index()
 
-    grouped_past_year[('days_since', 'min')]= grouped_past_year[('days_since', 'min')].dt.days
+        ########################## recency #########################################
 
-    recency_bins.append(grouped_past_year[('days_since', 'min')].max())
+        donations_past_year = df  # same object as df (not a copy), so days_since is added to df as well
+        donations_past_year['close_date'] =pd.to_datetime(donations_past_year['close_date']).dt.date
 
-    grouped_past_year['recency_score'] = pd.cut(grouped_past_year[('days_since','min')], bins= recency_bins, labels=recency_labels, include_lowest = True)
-    grouped_past_year.rename(columns={('recency_score', ''): 'recency_score'})
+            # calculate date difference between input date and individual row close date
 
+        days = []
+        for ii in donations_past_year['close_date']:
+            days.append(date_difference(ii, str(query_date)))
+        donations_past_year['days_since'] = days
 
-    ################################## frequency ###############################
+        grouped_past_year = donations_past_year.groupby('matching_id', as_index=False)['days_since'].min()  # df has no '_id' column; keep columns flat
 
-    df['close_date'] = pd.DatetimeIndex(df['close_date'])
+        grouped_past_year['days_since'] = grouped_past_year['days_since'].dt.days
 
-    df_grouped = df.groupby(['matching_id', pd.Grouper(key = 'close_date', freq = 'Q')]).count().max(level=0)
+        recency_bins.append(grouped_past_year['days_since'].max())
 
-    df_grouped = df_grouped.reset_index()
+        grouped_past_year['recency_score'] = pd.cut(grouped_past_year['days_since'], bins= recency_bins, labels=recency_labels, include_lowest = True)
+        # columns are flat (no MultiIndex), so no rename is needed before the merge further down
 
-    frequency_bins.append(np.inf)
+        ################################## frequency ###############################
 
-    df_frequency = df_grouped[['matching_id' , 'opp_id']]
+        df['close_date'] = pd.DatetimeIndex(df['close_date'])
 
-    df_frequency['frequency_score'] = pd.cut(df_frequency['opp_id'],
-                                               bins = frequency_bins, labels=frequency_labels, include_lowest=True)
+        df_grouped = df.groupby(['matching_id', pd.Grouper(key = 'close_date', freq = 'Q')]).count().groupby(level=0).max()  # per-quarter counts, then max per matching_id
 
+        df_grouped = df_grouped.reset_index()
 
+        frequency_bins.append(np.inf)
 
-    ################################## amount ##################################
+        df_frequency = df_grouped[['matching_id', 'amount']].rename(columns={'amount': 'opp_id'})  # count() yields per-column counts; there is no 'opp_id' column
 
-    monetary_bins.append(np.inf)
+        df_frequency['frequency_score'] = pd.cut(df_frequency['opp_id'],
+                                                bins = frequency_bins, labels=frequency_labels, include_lowest=True)
 
-    df_amount = df.groupby(df['matching_id'], as_index=False).amount.max()
+        ################################## amount ##################################
 
-    df_amount['amount_score'] = pd.cut(df_amount['amount'], bins= monetary_bins, include_lowest=True, labels = monetary_labels)
+        monetary_bins.append(np.inf)
 
+        df_amount = df.groupby(df['matching_id'], as_index=False).amount.max()
 
+        df_amount['amount_score'] = pd.cut(df_amount['amount'], bins= monetary_bins, include_lowest=True, labels = monetary_labels)
 
 
-    # Concatenate rfm scores
-        # merge monetary df and frequency df
-    df_semi = df_amount.merge(df_frequency, left_on='matching_id', right_on= 'matching_id')
-    print(grouped_past_year.head())
-    print(df_semi.head())
-    df_final = df_semi.merge(grouped_past_year, left_on='matching_id', right_on= '_id')        # merge monetary/frequency dfs to recency df
+        # Concatenate rfm scores
+            # merge monetary df and frequency df
+        df_semi = df_amount.merge(df_frequency, left_on='matching_id', right_on= 'matching_id')
+        print(grouped_past_year.head())
+        print(df_semi.head())
+        df_final = df_semi.merge(grouped_past_year, left_on='matching_id', right_on= 'matching_id')        # merge monetary/frequency dfs to recency df
 
-    ### get avg fm score and merge with df_final
-    # df_final['f_m_AVG_score'] = df_final[['frequency_score', 'amount_score']].mean(axis=1)
+        ### get avg fm score and merge with df_final
+        # df_final['f_m_AVG_score'] = df_final[['frequency_score', 'amount_score']].mean(axis=1)
 
 
-    # import function: rfm_concat, which will catenate integers as a string and then convert back to a single integer
-    from rfm_functions import rfm_concat
-    rfm_score = rfm_concat(df_final[('recency_score'), ''], df_final['frequency_score'], df_final['amount_score'])
+        # import function: rfm_concat, which will catenate integers as a string and then convert back to a single integer
+        from rfm_functions import rfm_concat
+        rfm_score = rfm_concat(df_final['recency_score'], df_final['frequency_score'], df_final['amount_score'])
 
-    # Append rfm score to final df
-    df_final['rfm_score'] = rfm_score
+        # Append rfm score to final df
+        df_final['rfm_score'] = rfm_score
 
-    from rfm_functions import merge_series
-    score_tuples = merge_series((df_final['matching_id']), df_final['rfm_score'])
+        from rfm_functions import merge_series
+        score_tuples = merge_series((df_final['matching_id']), df_final['rfm_score'])
 
-    insert_rfm_scores(score_tuples)
+        insert_rfm_scores(score_tuples)
 
-    return score_tuples
+        return len(score_tuples)   # callers only need the count; the tuples were already handed to insert_rfm_scores
diff --git a/src/server/rfm_funcs/rfm_functions.py b/src/server/rfm_funcs/rfm_functions.py
@@ -2,17 +2,17 @@
 
 ### A number of RFM functions which are called by the main create_scores function.
 
-def date_difference(my_date, query_date):
-    '''
-    This function takes in a single date from the donations dataframe (per row) and compares the difference between that date and the date in which matching occurs.
-    I.e. pipeline matching should provide a query_date so that this can work.
-    '''
-    from datetime import datetime, date
-
-    d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
-    d2 = datetime.strptime(str(query_date), "%Y-%m-%d")
-    diff = (d2 - d1)
-    return diff
+# def date_difference(my_date, query_date):
+#     '''
+#     This function takes in a single date from the donations dataframe (per row) and compares the difference between that date and the date in which matching occurs.
+#     I.e. pipeline matching should provide a query_date so that this can work.
+#     '''
+#     from datetime import datetime, date
+
+#     d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
+#     d2 = datetime.strptime(str(query_date), "%Y-%m-%d")
+#     diff = (d2 - d1)
+#     return diff