diff --git a/src/server/alembic/populate_rfm_mapping.sql b/src/server/alembic/populate_rfm_mapping.sql
index f000caad..61cc1800 100644
--- a/src/server/alembic/populate_rfm_mapping.sql
+++ b/src/server/alembic/populate_rfm_mapping.sql
@@ -1,9 +1,9 @@
 -- Run this script in your SQL query tool
 -- Run truncate command if this table is already populated
+-- TRUNCATE TABLE rfm_mapping;
 -- BEGIN;
 -- Fields are (rfm_score, label, (background) color, text color)
-insert into rfm_mapping values('111', 'Low impact, disengaged','#eed0aa', '#000000');
 insert into rfm_mapping values('112', 'Low impact, disengaged','#eed0aa', '#000000');
 insert into rfm_mapping values('113', 'Low impact, disengaged','#eed0aa', '#000000');
@@ -129,4 +129,4 @@ insert into rfm_mapping values('552', 'High impact, engaged','#034858', '#ffffff
 insert into rfm_mapping values('553', 'High impact, engaged','#034858', '#ffffff');
 insert into rfm_mapping values('554', 'High impact, engaged','#034858', '#ffffff');
 insert into rfm_mapping values('555', 'High impact, engaged','#034858', '#ffffff');
--- COMMIT;
+COMMIT;
diff --git a/src/server/api/admin_api.py b/src/server/api/admin_api.py
index 2bd7e8ec..76905705 100644
--- a/src/server/api/admin_api.py
+++ b/src/server/api/admin_api.py
@@ -406,3 +406,11 @@ def hit_gdrs():
 #     d = read_rfm_edges() # read it again
 #     print("round-trip d is : \n " + str(d) )
 #     return "OK"
+
+from rfm_funcs.create_scores import create_scores
+@admin_api.route("/api/admin/test_create_scores", methods=["GET"])
+def hit_create_scores():
+    current_app.logger.info("Hitting create_scores()")
+    tuple_count = create_scores('2021-07-27')
+    current_app.logger.info("create_scores() processed " + str(tuple_count) + " scores")
+    return jsonify(200)
diff --git a/src/server/rfm_funcs/__init__.py b/src/server/rfm_funcs/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/server/rfm_funcs/create_bins.py b/src/server/rfm_funcs/create_bins.py
new file mode 100644
index 00000000..59b2db2f
--- /dev/null
+++ b/src/server/rfm_funcs/create_bins.py
@@ -0,0 +1,55 @@
+def create_bins(data, query_date):
+    '''Compute RFM bin edges (recency, frequency, amount) for all PAWS donations.
+
+    query_date = date the data was queried
+    '''
+
+    import pandas as pd
+    import numpy as np
+    import jenkspy
+    from datetime import datetime, date
+    import os
+
+    ####
+    # read in data from the database as a list of tuples
+    # (pull_donations_for_rfm() is not defined in this diff and is assumed to be imported from elsewhere)
+    df = pull_donations_for_rfm()
+    donations_df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date'])
+    donations_df['close_date'] = pd.to_datetime(donations_df['close_date']).dt.date
+
+    ##################################################################################
+    # Calculate recency bins
+    from recency_bins import recency_bins
+    recency_bins, quantile_scores = recency_bins(donations_df, query_date)
+
+    ###################################################################################
+    # Calculate frequency bins
+    from frequency_bins import frequency_bins
+    jenks_frequency_bins, human_frequency_bins = frequency_bins(donations_df)
+
+    def checkIfDuplicates(listOfElems):
+        '''Check if the given list contains any duplicates.'''
+        for elem in listOfElems:
+            if listOfElems.count(elem) > 1:
+                return True
+        return False
+
+    # Jenks breaks can produce duplicate edges; fall back to the rounded "human" bins if so
+    duplicates_bool = checkIfDuplicates(jenks_frequency_bins)
+    if duplicates_bool:
+        final_frequency_bins = human_frequency_bins
+    else:
+        final_frequency_bins = jenks_frequency_bins
+
+    ###################################################################################
+    # Calculate amount bins
+    from amount_bins import amount_bins
+
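+    # Note: recency_bins, frequency_bins, and amount_bins are separate helper modules that are not
+    # included in this diff. Based on how they are called here, amount_bins (like frequency_bins)
+    # is assumed to return a jenks-derived edge list plus a rounded, human-readable alternative.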
+    amount_jenks_bins, human_amount_bins = amount_bins(donations_df)
+
+    ###################################################################################
+    # Write bins to dict
+    bins_dict = {}
diff --git a/src/server/rfm_funcs/create_scores.py b/src/server/rfm_funcs/create_scores.py
new file mode 100644
index 00000000..44e5518d
--- /dev/null
+++ b/src/server/rfm_funcs/create_scores.py
@@ -0,0 +1,131 @@
+from config import engine
+
+import pandas as pd
+import numpy as np
+from datetime import datetime
+from collections import Counter
+
+
+def date_difference(my_date, max_date):
+    '''
+    Take a single close date from the donations dataframe (per row) and return the time difference
+    between that date and the most recent close date in the data (max_date).
+    '''
+    d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
+    d2 = datetime.strptime(str(max_date), "%Y-%m-%d")
+    diff = (d2 - d1)
+    return diff
+
+
+def create_scores(query_date):
+    '''
+    Requires a query date as input -- must be a string in the format "%Y-%m-%d".
+    Computes a (matching_id, rfm_score) tuple per donor, inserts the scores into the rfm_scores
+    table (see src/server/api/admin_api.py), and returns the number of tuples written.
+    '''
+
+    with engine.connect() as connection:
+
+        # read donations joined to matched contacts into a dataframe
+        df = pd.read_sql(
+            """
+            select pc.matching_id, s.amount, s.close_date
+            from salesforcedonations s
+            inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts'
+            where pc.archived_date is null order by matching_id
+            """
+            , connection)
+        df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date'])
+
+        from api.admin_api import read_rfm_edges, insert_rfm_scores  # avoid circular import issues
+
+        rfm_dict = read_rfm_edges()
+        recency_labels = [5, 4, 3, 2, 1]
+        recency_bins = list(rfm_dict['r'].values())    # imported from table
+
+        frequency_labels = [1, 2, 3, 4, 5]
+        frequency_bins = list(rfm_dict['f'].values())  # imported from table
+
+        monetary_labels = [1, 2, 3, 4, 5]
+        monetary_bins = list(rfm_dict['m'].values())   # imported from table
+
+        ########################## recency #########################################
+
+        donations_past_year = df
+        donations_past_year['close_date'] = pd.to_datetime(donations_past_year['close_date']).dt.date
+
+        # calculate the difference between each row's close date and the most recent close date
+        days = []
+        max_close_date = donations_past_year['close_date'].max()
+        for ii in donations_past_year['close_date']:
+            days.append(date_difference(ii, max_close_date))
+        donations_past_year['days_since'] = days
+
+        grouped_past_year = donations_past_year.groupby('matching_id').agg({'days_since': ['min']}).reset_index()
+        grouped_past_year.columns = ['matching_id', 'days_since_min']  # flatten the MultiIndex columns from .agg()
+        print(grouped_past_year.head())
+
+        grouped_past_year['days_since_min'] = grouped_past_year['days_since_min'].dt.days
+
+        # make sure the final bin edge covers the largest observed gap
+        max_maybe = grouped_past_year['days_since_min'].max()
+        real_max = max(max_maybe, max(recency_bins) + 1)
+        recency_bins.append(real_max)
+
+        grouped_past_year['recency_score'] = pd.cut(grouped_past_year['days_since_min'],
+                                                    bins=recency_bins, labels=recency_labels, include_lowest=True)
+
+        ################################## frequency ###############################
+
+        df['close_date'] = pd.DatetimeIndex(df['close_date'])
+
+        df_grouped = df.groupby(['matching_id', pd.Grouper(key='close_date', freq='Q')]).count().max(level=0)
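+        # df_grouped now holds, per matching_id, the count of donations in that donor's busiest
+        # calendar quarter: freq='Q' buckets close dates by quarter and .max(level=0) keeps each
+        # donor's highest quarterly count, which is what gets binned into the frequency score below.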
+        df_grouped = df_grouped.reset_index()
+
+        frequency_bins.append(np.inf)
+
+        # 'amount' here is just a per-quarter row count from the groupby/count step above;
+        # the value no longer corresponds to a donation's monetary amount.
+        df_frequency = df_grouped[['matching_id', 'amount']].copy()
+        df_frequency = df_frequency.rename(columns={'amount': 'frequency'})
+
+        df_frequency['frequency_score'] = pd.cut(df_frequency['frequency'],
+                                                 bins=frequency_bins, labels=frequency_labels, include_lowest=True)
+
+        ################################## amount ##################################
+
+        monetary_bins.append(np.inf)
+
+        # score each donor on their single largest donation
+        df_amount = df.groupby('matching_id', as_index=False).amount.max()
+        df_amount['amount_score'] = pd.cut(df_amount['amount'], bins=monetary_bins, include_lowest=True, labels=monetary_labels)
+
+        # Concatenate rfm scores:
+        # merge the monetary and frequency dataframes, then merge the result with the recency dataframe
+        df_semi = df_amount.merge(df_frequency, left_on='matching_id', right_on='matching_id')
+        print(grouped_past_year.head())
+        print(df_semi.head())
+        df_final = df_semi.merge(grouped_past_year, left_on='matching_id', right_on='matching_id')
+
+        ### get avg fm score and merge with df_final
+        # df_final['f_m_AVG_score'] = df_final[['frequency_score', 'amount_score']].mean(axis=1)
+
+        # rfm_concat concatenates the three scores as a string and converts the result back to a single integer
+        from rfm_funcs.rfm_functions import rfm_concat
+        rfm_score = rfm_concat(df_final['recency_score'], df_final['frequency_score'], df_final['amount_score'])
+
+        # Append rfm score to final df
+        df_final['rfm_score'] = rfm_score
+
+        from rfm_funcs.rfm_functions import merge_series
+        score_tuples = merge_series(df_final['matching_id'], df_final['rfm_score'])
+
+        insert_rfm_scores(score_tuples)
+
+        return len(score_tuples)  # return the number of scores written so callers (e.g. the admin endpoint) can log it
diff --git a/src/server/rfm_funcs/rfm_functions.py b/src/server/rfm_funcs/rfm_functions.py
new file mode 100644
index 00000000..0fdd1f59
--- /dev/null
+++ b/src/server/rfm_funcs/rfm_functions.py
@@ -0,0 +1,80 @@
+# rfm_funcs
+
+### A number of RFM helper functions called by the main create_scores function.
+
+# def date_difference(my_date, query_date):
+#     '''
+#     This function takes in a single date from the donations dataframe (per row) and compares the
+#     difference between that date and the date on which matching occurs, i.e. pipeline matching
+#     should provide a query_date so that this can work.
+#     '''
+#     from datetime import datetime, date
+#
+#     d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
+#     d2 = datetime.strptime(str(query_date), "%Y-%m-%d")
+#     diff = (d2 - d1)
+#     return diff
+
+
+def rfm_concat(days_score, frequency_score, amount_score):
+    '''
+    Concatenate the three per-donor scores into a single three-digit RFM score.
+    Assumes the arguments are, in order, the Recency, Frequency, and Monetary scores.
+    days_score: pandas.Series
+    frequency_score: pandas.Series
+    amount_score: pandas.Series
+    '''
+    def concat(a, b, c):
+        return int(f"{a}{b}{c}")
+
+    rfm_score = list()
+    for ii, jj, kk in zip(days_score, frequency_score, amount_score):
+        rfm_score.append(concat(ii, jj, kk))
+
+    return rfm_score
+
+
+def merge_series(list1, list2):
+    '''
+    Zip two equal-length series/lists (matching_ids and rfm_scores) into a tuple of (matching_id, rfm_score) pairs.
+    '''
+    merged_list = tuple(zip(list(list1), list(list2)))
+    return merged_list
+
+
+def create_bins_dict(recency_edges, frequency_edges, monetary_edges):
+    '''
+    create_bins_dict -- builds a {label: edge} dictionary for each of the three scores.
+    Takes user-defined bin edges for each score; the edges must be listed in the order that
+    corresponds to their labels.
+
+    e.g.
+    recency_edges = np.array([0, 1., 2., 4., 10.])
+    '''
+
+    recency_dict = {}
+    recency_labels = [5, 4, 3, 2, 1]
+    for ii, jj in zip(recency_labels, recency_edges):
+        recency_dict["{0}".format(ii)] = jj
+
+    frequency_dict = {}
+    frequency_labels = [1, 2, 3, 4, 5]
+    for tt, kk in zip(frequency_labels, frequency_edges):
+        frequency_dict["{0}".format(tt)] = kk
+
+    monetary_dict = {}
+    monetary_labels = [1, 2, 3, 4, 5]
+    for ww, hh in zip(monetary_labels, monetary_edges):
+        monetary_dict["{0}".format(ww)] = hh
+
+    return recency_dict, frequency_dict, monetary_dict
diff --git a/src/server/rfm_funcs/rfm_instructions.md b/src/server/rfm_funcs/rfm_instructions.md
new file mode 100644
index 00000000..bd683f37
--- /dev/null
+++ b/src/server/rfm_funcs/rfm_instructions.md
@@ -0,0 +1,19 @@
+# RFM code run instructions
+
+Obtaining RFM scores requires a few things to be in place first:
+
+1. The most up-to-date bin edges must be stored in the postgres database.
+2. If the bin edges need to be updated, use write_rfm_edges(rfm_dict: dict) in "src/server/api/admin_api.py".
+
+Once the above requirements are satisfied, create_scores can be run.
+create_scores.py is the main function; it computes a (matching_id, score) pair for every donor and writes them to the rfm_scores table.
+It requires a single input, query_date, which should be taken from the most recent data ingestion (run once per week).
+
+create_scores.py runs in 4 distinct steps:
+1. Calculate recency from the last donation over the total lifespan of data collection.
+2. Calculate frequency over the past year from the query date.
+3. Calculate the monetary value from the individual's largest donation over the data lifespan.
+4. Concatenate the recency, frequency, and monetary values into a single integer and pair these with the individual matching ids, which are written via 'insert_rfm_scores'.
+
diff --git a/src/server/rfm_funcs/test_rfm.py b/src/server/rfm_funcs/test_rfm.py
new file mode 100644
index 00000000..fd4fee67
--- /dev/null
+++ b/src/server/rfm_funcs/test_rfm.py
@@ -0,0 +1,18 @@
+# This file is meant to test the RFM create_scores.py function.
+
+'''
+Things needed
+1. Create mock data
+    a. Mock data must be realistic.
+    b. Mock data must cover all 5^3 possible RFM scores, i.e. one row per RFM score.
+    c. Therefore we need 125 unique rows.
+    d. Recency needs at least 5 different dates.
+    e. Frequency needs at least 5 different donation counts per ID.
+    f. Monetary needs at least 5 different amounts.
+    g. Each subject ID will get an RFM score.
+2. create_scores.py will accept this mock data and then generate a new RFM score.
+3. The final step of this test will perform a Jaccard similarity analysis to determine whether the
+vectors match; the result should be exactly 1.0.
+
'''
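The sketch below is an editorial illustration of step 1 of the plan in test_rfm.py, not part of this diff: one way to build the 125-row mock dataset. The helper name make_mock_donations and the specific offsets, frequencies, and amounts are hypothetical placeholders.

# Illustrative sketch only -- not part of this diff.
import itertools
from datetime import date, timedelta

import pandas as pd


def make_mock_donations(query_date=date(2021, 7, 27)):
    '''Return a donations frame with one donor per (recency, frequency, amount) combination.'''
    recency_offsets = [10, 100, 300, 600, 1000]   # days before query_date of the donor's gifts
    frequencies = [1, 2, 4, 8, 16]                # number of gifts per donor
    amounts = [5, 25, 100, 500, 2500]             # size of the donor's largest gift
    rows = []
    for matching_id, (offset, freq, amount) in enumerate(
            itertools.product(recency_offsets, frequencies, amounts), start=1):
        close_date = query_date - timedelta(days=offset)
        for _ in range(freq):
            rows.append({'matching_id': matching_id, 'amount': amount, 'close_date': close_date})
    return pd.DataFrame(rows)  # 5 * 5 * 5 = 125 donors

A test built on this could load the frame into the donations tables (or stub out the query inside create_scores) and then check that the 125 resulting scores cover every combination from 111 to 555.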