Merge pull request #422 from CodeForPhilly/285-rfm-model

c-simpson · web-flow · commit 57a6426284a5 · 2021-10-03T17:52:49.000-04:00
Merging to enable PAWS testing. See #436 for some needed work.
diff --git a/src/server/alembic/populate_rfm_mapping.sql b/src/server/alembic/populate_rfm_mapping.sql
@@ -1,9 +1,9 @@
 -- Run this script in your SQL query tool
 -- Run truncate command if this table is already populated
+
 -- TRUNCATE TABLE rfm_mapping; 
 -- BEGIN;
 -- Fields are                 (rfm_score, label, (background) color, text color)
-        
 insert into rfm_mapping values('111', 'Low impact, disengaged','#eed0aa', '#000000');
 insert into rfm_mapping values('112', 'Low impact, disengaged','#eed0aa', '#000000');
 insert into rfm_mapping values('113', 'Low impact, disengaged','#eed0aa', '#000000');
@@ -129,4 +129,4 @@ insert into rfm_mapping values('552', 'High impact, engaged','#034858', '#ffffff
 insert into rfm_mapping values('553', 'High impact, engaged','#034858', '#ffffff');
 insert into rfm_mapping values('554', 'High impact, engaged','#034858', '#ffffff');
 insert into rfm_mapping values('555', 'High impact, engaged','#034858', '#ffffff');
--- COMMIT;
+COMMIT;
diff --git a/src/server/api/admin_api.py b/src/server/api/admin_api.py
@@ -406,3 +406,11 @@ def hit_gdrs():
 #     d = read_rfm_edges()        # read it again     
 #     print("round-trip d is : \n " + str(d) )
 #     return "OK"
+
+from rfm_funcs.create_scores import create_scores
+@admin_api.route("/api/admin/test_create_scores", methods=["GET"])
+def hit_create_scores():
+    """Admin test endpoint: run create_scores() once and log how many scores it processed.
+
+    The query date handed to create_scores() is a fixed test value.
+    Responds with the JSON value 200.
+    """
+    log = current_app.logger
+    log.info("Hitting create_scores() ")
+    processed = create_scores('2021-07-27')
+    log.info(f"create_scores()  processed {processed} scores")
+    return jsonify(200)
diff --git a/src/server/rfm_funcs/__init__.py b/src/server/rfm_funcs/__init__.py
diff --git a/src/server/rfm_funcs/create_bins.py b/src/server/rfm_funcs/create_bins.py
@@ -0,0 +1,55 @@
+def create_bins(data, query_date):
+    '''Compute candidate RFM bin edges for all PAWS donations.
+
+    data = donation rows, presumably a list of (matching_id, amount, close_date) tuples
+    query_date = date data was queried; handed on to recency_bins()
+
+    NOTE(review): work in progress. The recency, frequency and amount edges are
+    computed, but bins_dict is still empty and nothing is returned yet.
+    '''
+
+    import pandas as pd
+
+    ####
+    # Build the donations frame from the rows that were passed in. The earlier draft
+    # ignored `data` and called pull_donations_for_rfm(), which this module neither
+    # defines nor imports.
+    donations_df = pd.DataFrame(data, columns=['matching_id', 'amount', 'close_date'])
+
+    # The frame's column is 'close_date'; reading 'Close_Date' raised KeyError.
+    # NOTE(review): the converted dates are still stored under 'Close_Date' as in the
+    # earlier draft -- confirm which column name the *_bins helpers actually read.
+    donations_df['Close_Date'] = pd.to_datetime(donations_df['close_date']).dt.date
+
+    ##################################################################################
+    # Calculate recency bins
+    # (imported under an alias so the result does not shadow the helper function)
+    from recency_bins import recency_bins as compute_recency_bins
+    recency_bins, quantile_scores = compute_recency_bins(donations_df, query_date)
+
+    ###################################################################################
+    # Calculate frequency bins
+    from frequency_bins import frequency_bins
+
+    jenks_frequency_bins, human_frequency_bins = frequency_bins(donations_df)
+
+    # If the Jenks breaks contain a repeated edge, fall back to the human-chosen
+    # edges; otherwise keep the Jenks edges.
+    has_duplicates = len(set(jenks_frequency_bins)) != len(jenks_frequency_bins)
+    final_frequency_bins = human_frequency_bins if has_duplicates else jenks_frequency_bins
+
+    ###################################################################################
+    # Calculate Amount bins
+    from amount_bins import amount_bins
+
+    amount_jenks_bins, human_amount_bins = amount_bins(donations_df)
+
+    ###################################################################################
+    # Write bins to dict
+    # TODO: populate with the edges computed above once the storage format is settled
+    bins_dict = {}
diff --git a/src/server/rfm_funcs/create_scores.py b/src/server/rfm_funcs/create_scores.py
@@ -0,0 +1,131 @@
+from config import engine
+
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from collections import Counter
+
+def date_difference(my_date, max_date):
+    '''
+    Return the time elapsed from my_date to max_date as a datetime.timedelta.
+
+    Both arguments are converted with str() and parsed as "%Y-%m-%d", so they may be
+    date objects or strings already in that format.
+    The result is negative when my_date is later than max_date.
+    '''
+    date_format = "%Y-%m-%d"
+    start, end = (datetime.strptime(str(value), date_format) for value in (my_date, max_date))
+    return end - start
+
+
+def create_scores(query_date):
+    '''
+    Compute an RFM (recency, frequency, monetary) score per matching_id and store the scores.
+
+    query_date: documented as a string in the format "%Y-%m-%d".
+        NOTE(review): the body never reads query_date -- recency is measured against the
+        latest close_date found in the data instead. Confirm whether that is intended.
+
+    Steps: read donations joined to pdp_contacts, bin each of the three measures with pd.cut
+    using the edges returned by read_rfm_edges(), concatenate the three scores with rfm_concat,
+    and pass the (matching_id, rfm_score) tuples to insert_rfm_scores
+    (see src/server/api/admin_api.py).
+
+    Returns the number of score tuples that were produced (an int), not the tuples themselves.
+    '''
+
+    with engine.connect() as connection:
+
+        # read the donation rows (matching_id, amount, close_date) directly with pd.read_sql
+        df = pd.read_sql(
+            """
+            select pc.matching_id, s.amount, s.close_date 
+            from salesforcedonations s 
+            inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts'
+            where pc.archived_date is null order by matching_id
+            """
+            , connection)
+        # re-wrap the query result, keeping exactly these three columns
+        df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date'])
+
+        from api.admin_api import read_rfm_edges,  insert_rfm_scores  # Avoid circular import issues
+
+        # bin edges come from read_rfm_edges(); one mapping each under the keys 'r', 'f' and 'm'
+        rfm_dict = read_rfm_edges()
+        # recency labels run 5..1: the first (lowest days_since) bin gets the highest score
+        recency_labels = [5,4,3,2,1]
+        recency_bins =   list(rfm_dict['r'].values())    #imported from table
+
+        frequency_labels = [1,2,3,4,5]
+        frequency_bins  =  list(rfm_dict['f'].values())    #imported from table
+
+        monetary_labels = [1,2,3,4,5]
+        monetary_bins =   list(rfm_dict['m'].values())      #imported from table
+
+
+        ########################## recency #########################################
+
+        # NOTE: this is the same object as df (no copy, and no date filter is applied
+        # despite the name), so the columns changed below are changed on df as well
+        donations_past_year = df
+        donations_past_year['close_date'] =pd.to_datetime(donations_past_year['close_date']).dt.date
+
+        # calculate the difference between the latest close date in the data and each row's close date
+
+        days = []
+        max_close_date = donations_past_year['close_date'].max()
+        for ii in donations_past_year['close_date']:
+            days.append(date_difference(ii, max_close_date))
+        donations_past_year['days_since'] = days
+
+        # smallest days_since per matching_id, i.e. the most recent donation;
+        # the list-valued agg spec yields the two-level column ('days_since', 'min')
+        grouped_past_year = donations_past_year.groupby('matching_id').agg({'days_since': ['min']}).reset_index()
+        print(grouped_past_year.head())
+    
+        # convert the timedeltas to whole days
+        grouped_past_year[('days_since', 'min')]= grouped_past_year[('days_since', 'min')].dt.days
+
+        max_maybe = grouped_past_year[('days_since', 'min')].max()
+
+        # the closing edge must cover the largest observed value and exceed the last stored edge
+        real_max = max(max_maybe,  max(recency_bins)+1 )
+
+        recency_bins.append(real_max)
+
+
+
+        grouped_past_year['recency_score'] = pd.cut(grouped_past_year[('days_since','min')], bins= recency_bins, labels=recency_labels, include_lowest = True)
+        # NOTE(review): the result of rename() is not assigned, so this line has no effect;
+        # the column is later read back under the tuple key ('recency_score', '')
+        grouped_past_year.rename(columns={('recency_score', ''): 'recency_score'})
+
+        ################################## frequency ###############################
+
+        # close_date holds plain date objects at this point; Grouper needs datetimes
+        df['close_date'] = pd.DatetimeIndex(df['close_date'])
+
+        # count rows per matching_id per quarter, then keep each matching_id's largest quarterly count
+        # NOTE(review): the level= argument of max() is deprecated in newer pandas releases --
+        # confirm against the pinned pandas version
+        df_grouped = df.groupby(['matching_id', pd.Grouper(key = 'close_date', freq = 'Q')]).count().max(level=0)
+
+        df_grouped = df_grouped.reset_index()
+
+        # open-ended top edge
+        frequency_bins.append(np.inf)
+
+        df_frequency = df_grouped[['matching_id' , 'amount']] # amount is a placeholder as the groupby step just gives a frequency count, the value doesn't correspond to donation monetary amount.
+
+        df_frequency = df_frequency.rename(columns = {'amount':'frequency'}) #renaming amount to frequency
+
+        df_frequency['frequency_score'] = pd.cut(df_frequency['frequency'],
+                                                bins = frequency_bins, labels=frequency_labels, include_lowest=True)
+
+        ################################## amount ##################################
+
+        # open-ended top edge
+        monetary_bins.append(np.inf)
+
+        # largest single donation amount per matching_id
+        df_amount = df.groupby(df['matching_id'], as_index=False).amount.max()
+
+        df_amount['amount_score'] = pd.cut(df_amount['amount'], bins= monetary_bins, include_lowest=True, labels = monetary_labels)
+
+
+        # Concatenate rfm scores
+        # merge monetary df and frequency df
+        df_semi = df_amount.merge(df_frequency, left_on='matching_id', right_on= 'matching_id')
+        print(grouped_past_year.head())
+        print(df_semi.head())
+        df_final = df_semi.merge(grouped_past_year, left_on='matching_id', right_on= 'matching_id')        # merge monetary/frequency dfs to recency df
+
+        ### get avg fm score and merge with df_final
+        # df_final['f_m_AVG_score'] = df_final[['frequency_score', 'amount_score']].mean(axis=1)
+
+
+        # import function: rfm_concat, which will catenate integers as a string and then convert back to a single integer
+        from rfm_funcs.rfm_functions import rfm_concat
+        # the recency column is addressed by the tuple key ('recency_score', '')
+        rfm_score = rfm_concat(df_final[('recency_score'), ''], df_final['frequency_score'], df_final['amount_score'])
+
+        # Append rfm score to final df
+        df_final['rfm_score'] = rfm_score
+
+        from rfm_funcs.rfm_functions import merge_series
+        # pair each matching_id with its rfm_score
+        score_tuples = merge_series((df_final['matching_id']), df_final['rfm_score'])
+
+        insert_rfm_scores(score_tuples)
+
+        return len(score_tuples)   # Not sure there's anything to do with them at this point
diff --git a/src/server/rfm_funcs/rfm_functions.py b/src/server/rfm_funcs/rfm_functions.py
@@ -0,0 +1,80 @@
+# rfm_funcs
+
+### A number of RFM functions which are called by the main create_scores function.
+
+# def date_difference(my_date, query_date):
+#     '''
+#     This function takes in a single date from the donations dataframe (per row) and compares the difference between that date and the date in which matching occurs.
+#     I.e. pipeline matching should provide a query_date so that this can work.
+#     '''
+#     from datetime import datetime, date
+
+#     d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
+#     d2 = datetime.strptime(str(query_date), "%Y-%m-%d")
+#     diff = (d2 - d1)
+#     return diff
+
+
+
+
+
+def rfm_concat(days_score, frequency_score, amount_score):
+    '''
+    Combine per-row recency, frequency and monetary scores into single RFM scores.
+
+    days_score:      iterable (e.g. pandas.Series) of recency scores
+    frequency_score: iterable (e.g. pandas.Series) of frequency scores
+    amount_score:    iterable (e.g. pandas.Series) of monetary scores
+
+    The three inputs are walked in parallel; at each position the three values are
+    written side by side and the result is parsed as an int (5, 1, 3 -> 513).
+    Returns a list of ints as long as the shortest input.
+    '''
+    return [
+        int(f"{recency}{frequency}{monetary}")
+        for recency, frequency, monetary in zip(days_score, frequency_score, amount_score)
+    ]
+
+
+
+def merge_series(list1, list2):
+    '''
+    Pair up the elements of two iterables position by position.
+
+    Returns a tuple of 2-tuples (item from list1, item from list2); pairing stops
+    at the end of the shorter input.
+    '''
+    first, second = list(list1), list(list2)
+    return tuple(zip(first, second))
+
+
+
+def create_bins_dict(recency_edges, frequency_edges, monetary_edges):
+    '''
+    Create_bins_dict-- creates one label -> edge dictionary per score.
+
+    Each argument is a sequence of user defined bin edges, given in the order of the
+    labels they belong to. Recency edges are paired with the labels 5,4,3,2,1;
+    frequency and monetary edges with the labels 1,2,3,4,5. Labels are stored as
+    string keys. Pairing stops at the shorter of labels/edges, so at most five
+    edges per score are used.
+
+    e.g.
+    recency_edges = np.array([0, 1., 2.,4., 10.])
+
+    Returns the tuple (recency_dict, frequency_dict, monetary_dict).
+    '''
+
+    def pair_labels_with_edges(labels, edges):
+        # string keys, same as the former "{0}".format(label)
+        return {str(label): edge for label, edge in zip(labels, edges)}
+
+    # list(5,4,3,2,1) raised TypeError: list() takes a single iterable, so the
+    # labels are now written as literals.
+    recency_dict = pair_labels_with_edges((5, 4, 3, 2, 1), recency_edges)
+    frequency_dict = pair_labels_with_edges((1, 2, 3, 4, 5), frequency_edges)
+    monetary_dict = pair_labels_with_edges((1, 2, 3, 4, 5), monetary_edges)
+
+    return recency_dict, frequency_dict, monetary_dict
diff --git a/src/server/rfm_funcs/rfm_instructions.md b/src/server/rfm_funcs/rfm_instructions.md
@@ -0,0 +1,19 @@
+# RFM code run instructions
+
+A few prerequisites must be met before RFM scores can be computed.
+
+1. The most up to date bin edges must be stored within the postgres database.
+2. If the bin edges need to be updated, call the following function from "src/server/api/admin_api.py":
+    write_rfm_edges(rfm_dict : dict)
+
+Once the prerequisites above are satisfied, create_scores can be run.
+create_scores (in create_scores.py) is the main function: it builds a list of (matching_id, score) tuples, stores them via insert_rfm_scores, and returns how many were produced.
+This function takes a single input, query_date.
+    query_date should be pulled from the most recent data ingestion (once per week).
+
+create_scores.py runs in 4 distinct steps
+1. calculate recency since last donation over the total lifespan of data collection
+2. calculate frequency over the past year from query date.
+3. calculate monetary donations from the individual's max donation over the course of data lifespan.
+4. concatenate recency, frequency, and monetary values into a single integer and pair these with individual matching ids to update via 'insert_rfm_scores'
+
diff --git a/src/server/rfm_funcs/test_rfm.py b/src/server/rfm_funcs/test_rfm.py
@@ -0,0 +1,18 @@
+# This module is meant to test the create_scores function in create_scores.py.
+
+'''
+Things needed
+1. Create mock data
+    a. Mock data must be realistic
+    b. mock data must have 5^3 possibilities for RFM score, i.e., 1 RFM score each. 
+    c. Therefore we need 125 unique rows.
+    d. Recency needs to have at least 5 different dates
+    e. Frequency needs to have at least 5 different IDs
+    f. Monetary needs to have at least 5 different amounts
+    g. Each subject ID will get an RFM score. 
+2. create_scores.py will accept this mock data and then generate a new RFM score
+3. The final step of this function will perform a Jaccard similarity analysis to determine whether the vectors
+match; the result should be exactly 1.0
+
+'''
+