|
| 1 | +from config import engine |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +import numpy as np |
| 5 | +from datetime import datetime |
| 6 | +from collections import Counter |
| 7 | + |
def date_difference(my_date, max_date):
    """Return the elapsed time from ``my_date`` to ``max_date``.

    Each argument may be a ``"%Y-%m-%d"`` string, a ``datetime.date``, or a
    ``datetime.datetime`` (or a subclass of it). Datetime values are reduced
    to their calendar date first, so the result is always a whole number of
    days.

    Args:
        my_date: The date to measure from, e.g. one donation's close date.
        max_date: The reference date the difference is measured against.

    Returns:
        A ``datetime.timedelta`` equal to ``max_date - my_date``. It is
        negative when ``my_date`` falls after ``max_date``.

    Raises:
        ValueError: If an argument does not render as ``YYYY-MM-DD``.
    """

    def _to_midnight(value):
        # str() of a datetime carries a time component that "%Y-%m-%d"
        # rejects, so drop the time of day before the string round trip.
        if isinstance(value, datetime):
            value = value.date()
        return datetime.strptime(str(value), "%Y-%m-%d")

    return _to_midnight(max_date) - _to_midnight(my_date)
| 18 | + |
| 19 | + |
def create_scores(query_date):
    """Compute an RFM score for every matched contact and store the scores.

    Donations are read from ``salesforcedonations`` joined to the
    non-archived Salesforce rows of ``pdp_contacts``. For each
    ``matching_id`` three component scores are derived with ``pd.cut``,
    using the bin edges returned by ``read_rfm_edges()``:

    * recency   -- days between the contact's most recent ``close_date`` and
      the most recent ``close_date`` in the whole result set; labels run
      5..1, so fewer days gives a higher score.
    * frequency -- the largest number of donations (rows with a non-null
      ``amount``) the contact made in any single calendar quarter; labels
      run 1..5.
    * monetary  -- the contact's largest single ``amount``; labels run 1..5.

    The three scores are combined by ``rfm_concat`` and written with
    ``insert_rfm_scores`` (see src/server/api/admin_api.py).

    Args:
        query_date: Date string in ``"%Y-%m-%d"`` format.
            NOTE(review): this value is currently not used -- recency is
            measured against the newest ``close_date`` in the data rather
            than against ``query_date``. Confirm which is intended.

    Returns:
        The number of (matching_id, rfm_score) entries handed to
        ``insert_rfm_scores``.
    """
    # Imported here rather than at module level to avoid circular imports.
    from api.admin_api import read_rfm_edges, insert_rfm_scores
    from rfm_funcs.rfm_functions import rfm_concat, merge_series

    # The connection is only needed for the read, so release it before the
    # scoring work and the insert.
    with engine.connect() as connection:
        df = pd.read_sql(
            """
            select pc.matching_id, s.amount, s.close_date
            from salesforcedonations s
            inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts'
            where pc.archived_date is null order by matching_id
            """,
            connection)

    rfm_edges = read_rfm_edges()
    recency_labels = [5, 4, 3, 2, 1]
    frequency_labels = [1, 2, 3, 4, 5]
    monetary_labels = [1, 2, 3, 4, 5]
    recency_bins = list(rfm_edges['r'].values())  # imported from table
    # Frequency and monetary bins are open-ended at the top.
    frequency_bins = list(rfm_edges['f'].values()) + [np.inf]
    monetary_bins = list(rfm_edges['m'].values()) + [np.inf]

    # Work on calendar dates only: drop any timezone and time of day.
    close_dates = pd.to_datetime(df['close_date']).dt.tz_localize(None).dt.normalize()
    df['close_date'] = close_dates

    ########################## recency #########################################

    # Whole days between each donation and the newest donation in the data.
    days_since = (close_dates.max() - close_dates).dt.days
    # Flat column names keep every frame single-level for the merges below.
    recency = (
        df.assign(days_since=days_since)
        .groupby('matching_id', as_index=False)['days_since']
        .min()
    )

    # Close the last recency bin with an edge that covers the oldest donor
    # and is strictly above the configured edges.
    recency_bins.append(max(recency['days_since'].max(), max(recency_bins) + 1))
    recency['recency_score'] = pd.cut(
        recency['days_since'], bins=recency_bins,
        labels=recency_labels, include_lowest=True)

    ################################## frequency ###############################

    # Donations per contact per quarter, then each contact's busiest quarter.
    # NOTE: newer pandas releases spell the quarter-end alias 'QE'.
    per_quarter = df.groupby(
        ['matching_id', pd.Grouper(key='close_date', freq='Q')])['amount'].count()
    frequency = per_quarter.groupby(level=0).max().rename('frequency').reset_index()
    frequency['frequency_score'] = pd.cut(
        frequency['frequency'], bins=frequency_bins,
        labels=frequency_labels, include_lowest=True)

    ################################## amount ##################################

    monetary = df.groupby('matching_id', as_index=False)['amount'].max()
    monetary['amount_score'] = pd.cut(
        monetary['amount'], bins=monetary_bins,
        labels=monetary_labels, include_lowest=True)

    # Combine the three component frames; the default inner merges keep only
    # contacts present in all of them.
    df_final = (
        monetary
        .merge(frequency, on='matching_id')
        .merge(recency, on='matching_id')
    )

    # rfm_concat is expected to join the three digits into one integer score
    # (per the original comment here; its implementation is not shown).
    df_final['rfm_score'] = rfm_concat(
        df_final['recency_score'],
        df_final['frequency_score'],
        df_final['amount_score'])

    score_tuples = merge_series(df_final['matching_id'], df_final['rfm_score'])

    insert_rfm_scores(score_tuples)

    return len(score_tuples)
0 commit comments