Skip to content

Commit 57a6426

Browse files
authored
Merge pull request #422 from CodeForPhilly/285-rfm-model
Merging to enable PAWS testing. See #436 for some needed work.
2 parents 4c920f2 + 87cf8bb commit 57a6426

File tree

8 files changed

+313
-2
lines changed

8 files changed

+313
-2
lines changed

src/server/alembic/populate_rfm_mapping.sql

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
-- Run this script in your SQL query tool
22
-- Run truncate command if this table is already populated
3+
34
-- TRUNCATE TABLE rfm_mapping;
45
-- BEGIN;
56
-- Fields are (rfm_score, label, (background) color, text color)
6-
77
insert into rfm_mapping values('111', 'Low impact, disengaged','#eed0aa', '#000000');
88
insert into rfm_mapping values('112', 'Low impact, disengaged','#eed0aa', '#000000');
99
insert into rfm_mapping values('113', 'Low impact, disengaged','#eed0aa', '#000000');
@@ -129,4 +129,4 @@ insert into rfm_mapping values('552', 'High impact, engaged','#034858', '#ffffff
129129
insert into rfm_mapping values('553', 'High impact, engaged','#034858', '#ffffff');
130130
insert into rfm_mapping values('554', 'High impact, engaged','#034858', '#ffffff');
131131
insert into rfm_mapping values('555', 'High impact, engaged','#034858', '#ffffff');
132-
-- COMMIT;
132+
COMMIT;

src/server/api/admin_api.py

+8
Original file line numberDiff line numberDiff line change
@@ -406,3 +406,11 @@ def hit_gdrs():
406406
# d = read_rfm_edges() # read it again
407407
# print("round-trip d is : \n " + str(d) )
408408
# return "OK"
409+
410+
from rfm_funcs.create_scores import create_scores
411+
@admin_api.route("/api/admin/test_create_scores", methods=["GET"])
def hit_create_scores():
    """Test endpoint: run ``create_scores`` once and log how many scores it processed.

    The query date is hard-coded to '2021-07-27'.

    Returns:
        The result of ``jsonify(200)``.
    """
    current_app.logger.info("Hitting create_scores() ")
    tuple_count = create_scores('2021-07-27')
    # Lazy %-style arguments: the message is only formatted if the record is emitted.
    current_app.logger.info("create_scores() processed %s scores", tuple_count)
    return jsonify(200)

src/server/rfm_funcs/__init__.py

Whitespace-only changes.

src/server/rfm_funcs/create_bins.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
def create_bins(data, query_date):
    """Compute recency, frequency and amount bin edges for RFM scoring.

    Args:
        data: Currently unused; the donations are fetched with
            ``pull_donations_for_rfm()`` instead.
            NOTE(review): confirm whether ``data`` was meant to replace that call.
        query_date: Date the data was queried; passed through to ``recency_bins``.

    Returns:
        None. NOTE(review): ``bins_dict`` is created but never filled or returned;
        the step that writes the bins out is not implemented yet.
    """
    import pandas as pd

    from recency_bins import recency_bins
    from frequency_bins import frequency_bins
    from amount_bins import amount_bins

    ####
    # read in data from database as list of tuples
    # NOTE(review): pull_donations_for_rfm is not defined or imported in this
    # module; it has to be brought into scope before this function can run.
    donations_df = pd.DataFrame(
        pull_donations_for_rfm(),
        columns=['matching_id', 'amount', 'close_date'],
    )

    # The frame only has a lower-case 'close_date' column, so read from that.
    # The derived column keeps the 'Close_Date' name the original code wrote;
    # presumably the bin helpers read it -- TODO confirm against those modules.
    donations_df['Close_Date'] = pd.to_datetime(donations_df['close_date']).dt.date

    ##################################################################################
    # Calculate recency bins
    recency_edges, quantile_scores = recency_bins(donations_df, query_date)

    ###################################################################################
    # Calculate frequency bins
    jenks_frequency_bins, human_frequency_bins = frequency_bins(donations_df)

    # Fall back to the human-defined edges when the jenks edges contain
    # duplicates. NOTE(review): using the jenks edges otherwise is assumed to be
    # the intent; the original left final_frequency_bins unset in that case.
    has_duplicates = len(set(jenks_frequency_bins)) != len(jenks_frequency_bins)
    final_frequency_bins = human_frequency_bins if has_duplicates else jenks_frequency_bins

    ###################################################################################
    # Calculate Amount bins
    amount_jenks_bins, human_amount_bins = amount_bins(donations_df)

    ###################################################################################
    # Write bins to dict
    bins_dict = {}

src/server/rfm_funcs/create_scores.py

+131
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
from config import engine
2+
3+
import pandas as pd
4+
import numpy as np
5+
from datetime import datetime
6+
from collections import Counter
7+
8+
def date_difference(my_date, max_date):
    """Return the time elapsed from ``my_date`` to ``max_date``.

    Both values are reduced to a calendar day before subtracting, so the result
    is always a whole number of days.

    Args:
        my_date: The earlier date, e.g. one donation's close date. Either a
            ``datetime`` (its time of day is discarded) or any value whose
            ``str()`` has the form ``%Y-%m-%d``, such as a ``datetime.date``
            or a string.
        max_date: The reference date to measure against; same accepted forms.

    Returns:
        A ``datetime.timedelta`` equal to ``max_date - my_date``; negative when
        ``my_date`` is later than ``max_date``.

    Raises:
        ValueError: If a non-datetime value does not format as ``%Y-%m-%d``.
    """
    def _to_midnight(value):
        # datetime instances stringify with a time component, which the
        # "%Y-%m-%d" parse below rejects, so truncate them directly instead.
        if isinstance(value, datetime):
            return datetime.combine(value.date(), datetime.min.time())
        return datetime.strptime(str(value), "%Y-%m-%d")

    return _to_midnight(max_date) - _to_midnight(my_date)
18+
19+
20+
def create_scores(query_date):
    """Compute an RFM score per matching_id and write the scores to the database.

    Steps:
      1. Recency: days between each contact's most recent donation and the
         latest close_date found in the data.
      2. Frequency: the largest number of donations the contact made within
         any single calendar quarter.
      3. Monetary: the contact's largest single donation amount.
      4. Each value is binned with the edges returned by ``read_rfm_edges()``
         and the three scores are concatenated into one integer (recency,
         frequency, monetary).

    Args:
        query_date: Currently unused; recency is measured against the latest
            close_date in the data rather than against this value.
            NOTE(review): callers pass a "%Y-%m-%d" string -- confirm whether
            recency should be anchored on it instead.

    Returns:
        The number of (matching_id, rfm_score) tuples handed to
        ``insert_rfm_scores``; 0 when the query yields no donations, in which
        case nothing is inserted.
    """
    # Imported here rather than at module level to avoid circular import issues.
    from api.admin_api import read_rfm_edges, insert_rfm_scores
    from rfm_funcs.rfm_functions import rfm_concat, merge_series

    # The connection is only needed for the read, so release it straight after.
    with engine.connect() as connection:
        df = pd.read_sql(
            """
            select pc.matching_id, s.amount, s.close_date
            from salesforcedonations s
            inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts'
            where pc.archived_date is null order by matching_id
            """,
            connection,
        )
    df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date'])

    if df.empty:
        # Nothing to score; the binning below cannot handle an empty frame.
        return 0

    rfm_dict = read_rfm_edges()

    recency_labels = [5, 4, 3, 2, 1]  # fewer days since last donation => higher score
    recency_bins = list(rfm_dict['r'].values())  # imported from table

    frequency_labels = [1, 2, 3, 4, 5]
    frequency_bins = list(rfm_dict['f'].values())  # imported from table

    monetary_labels = [1, 2, 3, 4, 5]
    monetary_bins = list(rfm_dict['m'].values())  # imported from table

    ########################## recency #########################################

    # Work on whole calendar days; vectorised instead of a per-row Python loop.
    df['close_date'] = pd.to_datetime(df['close_date']).dt.normalize()
    df['days_since'] = (df['close_date'].max() - df['close_date']).dt.days

    # Selecting the column before aggregating keeps the columns single-level,
    # so the merges below never mix frames with different column level counts.
    df_recency = df.groupby('matching_id', as_index=False)['days_since'].min()

    # Close the last bin at the largest observed value, but always above the
    # last stored edge so the edges stay strictly increasing.
    recency_bins.append(max(df_recency['days_since'].max(), max(recency_bins) + 1))

    df_recency['recency_score'] = pd.cut(
        df_recency['days_since'],
        bins=recency_bins,
        labels=recency_labels,
        include_lowest=True,
    )

    ################################## frequency ###############################

    # Count donations per contact per quarter, then keep each contact's busiest
    # quarter. groupby(level=0).max() replaces .max(level=0), which pandas 2 removed.
    # NOTE(review): newer pandas releases prefer the 'QE' alias over 'Q'.
    per_quarter = df.groupby(
        ['matching_id', pd.Grouper(key='close_date', freq='Q')]
    )['amount'].count()
    df_frequency = per_quarter.groupby(level=0).max().rename('frequency').reset_index()

    frequency_bins.append(np.inf)  # open-ended top bin

    df_frequency['frequency_score'] = pd.cut(
        df_frequency['frequency'],
        bins=frequency_bins,
        labels=frequency_labels,
        include_lowest=True,
    )

    ################################## amount ##################################

    monetary_bins.append(np.inf)  # open-ended top bin

    df_amount = df.groupby('matching_id', as_index=False)['amount'].max()

    df_amount['amount_score'] = pd.cut(
        df_amount['amount'],
        bins=monetary_bins,
        labels=monetary_labels,
        include_lowest=True,
    )

    # Join the three per-contact score frames on matching_id.
    df_final = (
        df_amount
        .merge(df_frequency, on='matching_id')
        .merge(df_recency, on='matching_id')
    )

    # Concatenate the three scores into a single integer per contact.
    df_final['rfm_score'] = rfm_concat(
        df_final['recency_score'],
        df_final['frequency_score'],
        df_final['amount_score'],
    )

    score_tuples = merge_series(df_final['matching_id'], df_final['rfm_score'])

    insert_rfm_scores(score_tuples)

    return len(score_tuples)

src/server/rfm_funcs/rfm_functions.py

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# rfm_funcs
2+
3+
### A number of RFM functions which are called by the main create_scores function.
4+
5+
# def date_difference(my_date, query_date):
6+
# '''
7+
# This function takes in a single date from the donations dataframe (per row) and compares the difference between that date and the date in which matching occurs.
8+
# I.e. pipeline matching should provide a query_date so that this can work.
9+
# '''
10+
# from datetime import datetime, date
11+
12+
# d1 = datetime.strptime(str(my_date), "%Y-%m-%d")
13+
# d2 = datetime.strptime(str(query_date), "%Y-%m-%d")
14+
# diff = (d2 - d1)
15+
# return diff
16+
17+
18+
19+
20+
21+
def rfm_concat(days_score, frequency_score, amount_score):
    """Combine per-contact R, F and M scores into single integer RFM scores.

    The three inputs are walked in lockstep; for each position the scores are
    written side by side and the result is read back as an integer, e.g.
    (5, 3, 4) becomes 534. Iteration stops at the shortest input.

    Args:
        days_score: Iterable of recency scores (e.g. a pandas Series).
        frequency_score: Iterable of frequency scores.
        amount_score: Iterable of monetary scores.

    Returns:
        A list of ints, one per position.

    Raises:
        ValueError: If the concatenated text for a position is not a valid
            integer (for example when one of the scores is NaN).
    """
    return [
        int(f"{recency}{frequency}{amount}")
        for recency, frequency, amount in zip(days_score, frequency_score, amount_score)
    ]
41+
42+
43+
44+
def merge_series(list1, list2):
    """Pair up the elements of two iterables position by position.

    Args:
        list1: First iterable (e.g. a pandas Series of matching_ids).
        list2: Second iterable (e.g. the corresponding rfm scores).

    Returns:
        A tuple of 2-tuples ``(list1[i], list2[i])``; pairing stops at the
        shorter input.
    """
    # zip consumes any iterable directly, so no intermediate lists are needed.
    return tuple(zip(list1, list2))
50+
51+
52+
53+
def create_bins_dict(recency_edges, frequency_edges, monetary_edges):
    """Build label -> bin-edge dictionaries for the three RFM dimensions.

    Each list of edges is paired in order with a fixed set of five labels:
    recency uses 5, 4, 3, 2, 1 and frequency/monetary use 1, 2, 3, 4, 5.
    Labels are stored as string keys. Pairing stops at the shorter of the
    labels and the edges.

    e.g.
        recency_edges = np.array([0, 1., 2., 4., 10.])
        -> {'5': 0.0, '4': 1.0, '3': 2.0, '2': 4.0, '1': 10.0}

    Args:
        recency_edges: Iterable of recency bin edges.
        frequency_edges: Iterable of frequency bin edges.
        monetary_edges: Iterable of monetary bin edges.

    Returns:
        A tuple ``(recency_dict, frequency_dict, monetary_dict)``.
    """
    # The original built these with list(5, 4, 3, 2, 1), which raises TypeError
    # because list() accepts a single iterable argument.
    recency_labels = (5, 4, 3, 2, 1)
    ascending_labels = (1, 2, 3, 4, 5)

    recency_dict = {
        str(label): edge for label, edge in zip(recency_labels, recency_edges)
    }
    frequency_dict = {
        str(label): edge for label, edge in zip(ascending_labels, frequency_edges)
    }
    monetary_dict = {
        str(label): edge for label, edge in zip(ascending_labels, monetary_edges)
    }

    return recency_dict, frequency_dict, monetary_dict
+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# RFM code run instructions
2+
3+
In order to obtain rfm scores a few dependencies will be required.
4+
5+
1. The most up to date bin edges must be stored within the postgres database.
6+
2. If bin edges must be updated use the following---via "src/server/api/admin_api.py"
7+
write_rfm_edges(rfm_dict : dict)
8+
9+
Once all above situations are satisfied create_scores can be run.
10+
create_scores (in create_scores.py) is the main function: it builds a list of (matching_id, score) tuples, writes them to the database via insert_rfm_scores, and returns the number of tuples written.
11+
This function requires a single input, query_date
12+
query_date should be pulled from the most recent data ingestion----once per week.
13+
14+
create_scores.py runs in 4 distinct steps
15+
1. calculate recency since last donation over the total lifespan of data collection
16+
2. calculate frequency over the past year from query date.
17+
3. calculate monetary donations from the individual's max donation over the course of data lifespan.
18+
4. concatenate recency, frequency, and monetary values into a single integer and pair these with individual matching ids to update via 'insert_rfm_scores'
19+

src/server/rfm_funcs/test_rfm.py

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# This function is meant to test the RFM create_scores.py function.
2+
3+
'''
4+
Things needed
5+
1. Create mock data
6+
a. Mock data must be realistic
7+
b. mock data must have 5^3 possibilities for RFM score, i.e., 1 RFM score each.
8+
c. Therefore we need 125 unique rows.
9+
d. Recency needs to have at least 5 different dates
10+
e. Frequency needs to have at least 5 different IDs
11+
f. Monetary needs to have at least 5 different amounts
12+
g. Each subject ID will get an RFM score.
13+
2. create_scores.py will accept this mock data and then generate a new RFM score
14+
3. final step of this function will perform a jaccard similarity analysis to determine if the vectors
15+
match, where the result should be exactly 1.0
16+
17+
'''
18+

0 commit comments

Comments
 (0)