|
1 |
| -def create_scores(connection, query_date): |
| 1 | +from config import engine |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +import numpy as np |
| 5 | +from datetime import datetime, date |
| 6 | +from collections import Counter |
| 7 | + |
| 8 | +def date_difference(my_date, query_date): |
2 | 9 | '''
|
3 |
| - requires query date as input-- must be string in the following format "%Y-%m-%d" |
4 |
| - returns a list of matching_ids and scores as tuples |
5 |
| - will also insert rfm scores into rfm_scores table----see src/server/api/admin_api.py |
| 10 | + This function takes a single date from the donations dataframe (one per row) and returns the difference between the date on which matching occurs (query_date) and that date, i.e. query_date - my_date. |
| 11 | + The matching pipeline must therefore supply a query_date; both dates must be in "%Y-%m-%d" format. |
6 | 12 | '''
|
7 |
| - # Import dependencies |
8 |
| - import pandas as pd |
9 |
| - import numpy as np |
10 |
| - from datetime import datetime, date |
11 |
| - from collections import Counter |
12 |
| - from api.admin_api import read_rfm_edges, insert_rfm_scores |
13 |
| - |
14 | 13 |
|
15 |
| - # read in data from database via pull_donations_for_rfm() func (reads in as a list of tuples) |
16 |
| - df = pd.read_sql( |
17 |
| - """ |
18 |
| - select pc.matching_id, s.amount, s.close_date |
19 |
| - from salesforcedonations s |
20 |
| - inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts' |
21 |
| - where pc.archived_date is null order by matching_id |
22 |
| - """ |
23 |
| - , connection) |
24 |
| - df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date']) |
| 14 | + d1 = datetime.strptime(str(my_date), "%Y-%m-%d") |
| 15 | + d2 = datetime.strptime(str(query_date), "%Y-%m-%d") |
| 16 | + diff = (d2 - d1) |
| 17 | + return diff |
25 | 18 |
|
26 |
| - # read in labels and bin edges from table |
27 |
| - recency_labels = [5,4,3,2,1] |
28 |
| - recency_bins = list(read_rfm_edges('r').values()) #imported from table |
29 | 19 |
|
30 |
| - frequency_labels = [1,2,3,4,5] |
31 |
| - frequency_bins = list(read_rfm_edges('f').values()) #imported from table |
| 20 | +def create_scores(query_date): |
| 21 | + ''' |
| 22 | + requires query date as input-- must be string in the following format "%Y-%m-%d" |
| 23 | + returns the number of (matching_id, rfm_score) tuples that were computed |
| 24 | + will also insert rfm scores into rfm_scores table----see src/server/api/admin_api.py |
| 25 | + ''' |
32 | 26 |
|
33 |
| - monetary_labels = [ 1,2,3,4,5] |
34 |
| - monetary_bins = list(read_rfm_edges('m').values()) #imported from table |
| 27 | + with engine.connect() as connection: |
35 | 28 |
|
| 29 | + # read donation rows from the database with pd.read_sql (loaded as a DataFrame) |
| 30 | + df = pd.read_sql( |
| 31 | + """ |
| 32 | + select pc.matching_id, s.amount, s.close_date |
| 33 | + from salesforcedonations s |
| 34 | + inner join pdp_contacts pc on pc.source_id = s.contact_id and pc.source_type = 'salesforcecontacts' |
| 35 | + where pc.archived_date is null order by matching_id |
| 36 | + """ |
| 37 | + , connection) |
| 38 | + df = pd.DataFrame(df, columns=['matching_id', 'amount', 'close_date']) |
36 | 39 |
|
37 |
| - ########################## recency ######################################### |
| 40 | + from api.admin_api import read_rfm_edges, insert_rfm_scores # Avoid circular import issues |
38 | 41 |
|
| 42 | + rfm_dict = read_rfm_edges() |
| 43 | + recency_labels = [5,4,3,2,1] |
| 44 | + recency_bins = list(rfm_dict['r']) #imported from table |
39 | 45 |
|
40 |
| - donations_past_year = df |
41 |
| - donations_past_year['close_date'] =pd.to_datetime(donations_past_year['close_date']).dt.date |
| 46 | + frequency_labels = [1,2,3,4,5] |
| 47 | + frequency_bins = list(rfm_dict['f']) #imported from table |
42 | 48 |
|
43 |
| - # calculate date difference between input date and individual row close date |
44 |
| - from rfm_functions import date_difference |
45 |
| - days = [] |
46 |
| - for ii in donations_past_year['close_date']: |
47 |
| - days.append(date_difference(ii, str(query_date))) |
48 |
| - donations_past_year['days_since'] = days |
| 49 | + monetary_labels = [ 1,2,3,4,5] |
| 50 | + monetary_bins = list(rfm_dict['m']) #imported from table |
49 | 51 |
|
50 |
| - grouped_past_year = donations_past_year.groupby('_id').agg({'days_since': ['min']}).reset_index() |
51 | 52 |
|
52 |
| - grouped_past_year[('days_since', 'min')]= grouped_past_year[('days_since', 'min')].dt.days |
| 53 | + ########################## recency ######################################### |
53 | 54 |
|
54 |
| - recency_bins.append(grouped_past_year[('days_since', 'min')].max()) |
| 55 | + donations_past_year = df |
| 56 | + donations_past_year['close_date'] =pd.to_datetime(donations_past_year['close_date']).dt.date |
55 | 57 |
|
56 |
| - grouped_past_year['recency_score'] = pd.cut(grouped_past_year[('days_since','min')], bins= recency_bins, labels=recency_labels, include_lowest = True) |
57 |
| - grouped_past_year.rename(columns={('recency_score', ''): 'recency_score'}) |
| 58 | + # calculate date difference between input date and individual row close date |
58 | 59 |
|
| 60 | + days = [] |
| 61 | + for ii in donations_past_year['close_date']: |
| 62 | + days.append(date_difference(ii, str(query_date))) |
| 63 | + donations_past_year['days_since'] = days |
59 | 64 |
|
60 |
| - ################################## frequency ############################### |
| 65 | + grouped_past_year = donations_past_year.groupby('_id').agg({'days_since': ['min']}).reset_index() |
61 | 66 |
|
62 |
| - df['close_date'] = pd.DatetimeIndex(df['close_date']) |
| 67 | + grouped_past_year[('days_since', 'min')]= grouped_past_year[('days_since', 'min')].dt.days |
63 | 68 |
|
64 |
| - df_grouped = df.groupby(['matching_id', pd.Grouper(key = 'close_date', freq = 'Q')]).count().max(level=0) |
| 69 | + recency_bins.append(grouped_past_year[('days_since', 'min')].max()) |
65 | 70 |
|
66 |
| - df_grouped = df_grouped.reset_index() |
| 71 | + grouped_past_year['recency_score'] = pd.cut(grouped_past_year[('days_since','min')], bins= recency_bins, labels=recency_labels, include_lowest = True) |
| 72 | + grouped_past_year.rename(columns={('recency_score', ''): 'recency_score'}) |
67 | 73 |
|
68 |
| - frequency_bins.append(np.inf) |
| 74 | + ################################## frequency ############################### |
69 | 75 |
|
70 |
| - df_frequency = df_grouped[['matching_id' , 'opp_id']] |
| 76 | + df['close_date'] = pd.DatetimeIndex(df['close_date']) |
71 | 77 |
|
72 |
| - df_frequency['frequency_score'] = pd.cut(df_frequency['opp_id'], |
73 |
| - bins = frequency_bins, labels=frequency_labels, include_lowest=True) |
| 78 | + df_grouped = df.groupby(['matching_id', pd.Grouper(key = 'close_date', freq = 'Q')]).count().max(level=0) |
74 | 79 |
|
| 80 | + df_grouped = df_grouped.reset_index() |
75 | 81 |
|
| 82 | + frequency_bins.append(np.inf) |
76 | 83 |
|
77 |
| - ################################## amount ################################## |
| 84 | + df_frequency = df_grouped[['matching_id' , 'opp_id']] |
78 | 85 |
|
79 |
| - monetary_bins.append(np.inf) |
| 86 | + df_frequency['frequency_score'] = pd.cut(df_frequency['opp_id'], |
| 87 | + bins = frequency_bins, labels=frequency_labels, include_lowest=True) |
80 | 88 |
|
81 |
| - df_amount = df.groupby(df['matching_id'], as_index=False).amount.max() |
| 89 | + ################################## amount ################################## |
82 | 90 |
|
83 |
| - df_amount['amount_score'] = pd.cut(df_amount['amount'], bins= monetary_bins, include_lowest=True, labels = monetary_labels) |
| 91 | + monetary_bins.append(np.inf) |
84 | 92 |
|
| 93 | + df_amount = df.groupby(df['matching_id'], as_index=False).amount.max() |
85 | 94 |
|
| 95 | + df_amount['amount_score'] = pd.cut(df_amount['amount'], bins= monetary_bins, include_lowest=True, labels = monetary_labels) |
86 | 96 |
|
87 | 97 |
|
88 |
| - # Concatenate rfm scores |
89 |
| - # merge monetary df and frequency df |
90 |
| - df_semi = df_amount.merge(df_frequency, left_on='matching_id', right_on= 'matching_id') |
91 |
| - print(grouped_past_year.head()) |
92 |
| - print(df_semi.head()) |
93 |
| - df_final = df_semi.merge(grouped_past_year, left_on='matching_id', right_on= '_id') # merge monetary/frequency dfs to recency df |
| 98 | + # Concatenate rfm scores |
| 99 | + # merge monetary df and frequency df |
| 100 | + df_semi = df_amount.merge(df_frequency, left_on='matching_id', right_on= 'matching_id') |
| 101 | + print(grouped_past_year.head()) |
| 102 | + print(df_semi.head()) |
| 103 | + df_final = df_semi.merge(grouped_past_year, left_on='matching_id', right_on= '_id') # merge monetary/frequency dfs to recency df |
94 | 104 |
|
95 |
| - ### get avg fm score and merge with df_final |
96 |
| - # df_final['f_m_AVG_score'] = df_final[['frequency_score', 'amount_score']].mean(axis=1) |
| 105 | + ### get avg fm score and merge with df_final |
| 106 | + # df_final['f_m_AVG_score'] = df_final[['frequency_score', 'amount_score']].mean(axis=1) |
97 | 107 |
|
98 | 108 |
|
99 |
| - # import function: rfm_concat, which will catenate integers as a string and then convert back to a single integer |
100 |
| - from rfm_functions import rfm_concat |
101 |
| - rfm_score = rfm_concat(df_final[('recency_score'), ''], df_final['frequency_score'], df_final['amount_score']) |
| 109 | + # import function: rfm_concat, which will catenate integers as a string and then convert back to a single integer |
| 110 | + from rfm_functions import rfm_concat |
| 111 | + rfm_score = rfm_concat(df_final[('recency_score'), ''], df_final['frequency_score'], df_final['amount_score']) |
102 | 112 |
|
103 |
| - # Append rfm score to final df |
104 |
| - df_final['rfm_score'] = rfm_score |
| 113 | + # Append rfm score to final df |
| 114 | + df_final['rfm_score'] = rfm_score |
105 | 115 |
|
106 |
| - from rfm_functions import merge_series |
107 |
| - score_tuples = merge_series((df_final['matching_id']), df_final['rfm_score']) |
| 116 | + from rfm_functions import merge_series |
| 117 | + score_tuples = merge_series((df_final['matching_id']), df_final['rfm_score']) |
108 | 118 |
|
109 |
| - insert_rfm_scores(score_tuples) |
| 119 | + insert_rfm_scores(score_tuples) |
110 | 120 |
|
111 |
| - return score_tuples |
| 121 | + return len(score_tuples) # Not sure there's anything to do with them at this point |
0 commit comments