Commit d21d503

feature engineering, category cleaning

1 parent f614033 commit d21d503

File tree

5 files changed: +936 additions, -913 deletions


Master Notebook.ipynb

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stop and Frisk"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Modeling what are effective stops - which lead to arrests, non-arrests. Unreasonable search and seizure is unconstitutional; racial bias is illegal; police activity without any effectiveness on crime is wasteful."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Ideally we'll examine outcomes more closely. Multi-class logistic regression and random forest. You can use multiclass or holdouts. Remember that with logistic regression you drop the first dummy variable; with random forest you don't need to. Also remember to use n_jobs. Gridsearch probably should be done to optimize model. Worth adding borough."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* populate missing city fields from pct\n",
+    "* sector is less of a problem but still lots of missing values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Using a balanced accuracy score."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
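The modeling plan sketched in the notebook above (multi-class logistic regression and a random forest, dropping the first dummy level for the regression, n_jobs for parallelism, a grid search, and balanced accuracy as the score) could be wired up roughly as follows. This is a minimal sketch, not code from this commit: the csv path, the outcome column, and the parameter grids are placeholder assumptions.

# Minimal sketch of the notebook's modeling plan; the csv path, the 'outcome'
# column, and the parameter grids are placeholder assumptions, not project code.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

df = pd.read_csv('cleaned_sqf.csv')            # hypothetical cleaned extract
y = df['outcome']                              # e.g. arrest / summons / no action
X = pd.get_dummies(df.drop(columns=['outcome']), drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

# Logistic regression: drop_first=True above avoids the redundant dummy column.
logreg = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=-1),
                      param_grid={'C': [0.01, 0.1, 1, 10]},
                      scoring='balanced_accuracy', cv=5, n_jobs=-1)

# Random forest: trees do not need the first dummy dropped; the same matrix is
# reused here only to keep the sketch short.
forest = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=0),
                      param_grid={'n_estimators': [100, 300],
                                  'max_depth': [None, 10, 20]},
                      scoring='balanced_accuracy', cv=5, n_jobs=-1)

for name, model in [('logreg', logreg), ('forest', forest)]:
    model.fit(X_train, y_train)
    print(name, model.best_params_, model.score(X_test, y_test))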

data_cleaner.py

Lines changed: 50 additions & 19 deletions
@@ -88,7 +88,7 @@ def add_month_weekday(data):
 
 def add_pct_sector(data):
     """sectors are subdivisions of precincts"""
-    data['pct_sector'] = data.pct.astype(int).astype(str) + data.sector.astype('object').fillna('').astype(str)
+    data['pct_sector'] = data.pct.astype(int).astype(str) + '-' + data.sector.astype('object').fillna('').astype(str)
     data.pct_sector = data.pct_sector.astype('category')
     return data
 
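The effect of the new '-' separator is easiest to see on toy values (illustrative only, not from the dataset):

# Toy illustration of the pct_sector change above; the values are made up.
import numpy as np
import pandas as pd

data = pd.DataFrame({'pct': [14.0, 14.0], 'sector': ['A', np.nan]})
data['pct_sector'] = (data.pct.astype(int).astype(str) + '-'
                      + data.sector.astype('object').fillna('').astype(str))
print(data.pct_sector.tolist())   # ['14-A', '14-'] rather than ['14A', '14']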
@@ -107,6 +107,7 @@ def y_n_to_1_0_cols(data, cols=Y_N_COLS, yes_values=['Y'], set_na=True):
     for y_n_col in Y_N_COLS:
         if y_n_col in data:
             data[y_n_col] = y_n_to_1_0(data[y_n_col], yes_values, set_na)
+    return data
 
 def height_to_feet_inch(data, height_col):
     """Convert the height_col column to ht_feet, ht_inch
@@ -158,14 +159,48 @@ def convert_17_18_data(data):
 
     return data
 
+def gen_replace_dict(key_list_dict):
+    """change { a : [1, 2, 3]} to {1 : a, 2 : a, 3 : a}"""
+    return { val : key for key, val_list in key_list_dict.items() for val in val_list}
+
+def simplify_category(data, catname):
+    """clean category catname"""
+    data = data.astype({catname : 'object'})
+    data = data.replace({catname : gen_replace_dict(REPLACE_REVERSE_DICT[catname])})
+    valid_vals = list(set(USE_OTHER_VALUES[catname]) | set(REPLACE_REVERSE_DICT[catname]))
+    data.loc[~data[catname].isin(valid_vals), catname] = "OTHER"
+    data = data.astype({catname : 'category'})
+    return data
+
 def clean_categories(data):
     """get data categories ready for one-hot-encoding"""
-    data = data.astype('object') \
+    cat_set = set(CLEAN_CAT_VALUES) | set(CAT_FILL_NA_VALUES)
+    data = data.astype({cat : 'object' for cat in cat_set}) \
               .replace(CLEAN_CAT_VALUES) \
               .fillna(CAT_FILL_NA_VALUES) \
               .dropna(subset=CLEAN_CAT_VALUES.keys()) \
-              .drop(columns=MODEL_IGNORE_COLS) \
-              .astype('category')
+              .drop(columns=MODEL_IGNORE_COLS, errors='ignore') \
+              .astype({cat : 'category' for cat in cat_set})
+    for catname in REPLACE_REVERSE_DICT:
+        data = simplify_category(data, catname)
+    return data
+
+
+
+def engineer_features(data):
+    """Engineer features. Probably should be in data_modeler"""
+    # add date-time columns
+    data = add_datetimestop(data)
+    data = add_month_weekday(data)
+    data = data.dropna(subset=['pct'])
+    # convert ht_feet, ht_inch to height
+    data = add_height(data)
+    # add pct-sector column
+    data = add_pct_sector(data)
+    # convert yes-no columns to 1-0
+    data = y_n_to_1_0_cols(data)
+    # clean categories
+    data = clean_categories(data)
     return data
 
 
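A rough walk-through of how gen_replace_dict and simplify_category fit together. REPLACE_REVERSE_DICT and USE_OTHER_VALUES are module constants that do not appear in this diff, so the small stand-in dictionaries below are invented for the example:

# Toy walk-through of the category-cleaning logic added above. The two
# dictionaries are invented stand-ins for module constants not shown in the diff.
import pandas as pd

REPLACE_REVERSE_DICT = {'eyecolor': {'BROWN': ['BR', 'BRO'], 'BLUE': ['BL', 'BLU']}}
USE_OTHER_VALUES = {'eyecolor': ['BLACK']}

def gen_replace_dict(key_list_dict):
    """change { a : [1, 2, 3]} to {1 : a, 2 : a, 3 : a}"""
    return {val: key for key, val_list in key_list_dict.items() for val in val_list}

data = pd.DataFrame({'eyecolor': ['BR', 'BLU', 'BLACK', 'HAZEL']})
catname = 'eyecolor'

# Same steps as simplify_category: map raw codes to canonical labels, then
# bucket anything not on the approved list as "OTHER".
data = data.replace({catname: gen_replace_dict(REPLACE_REVERSE_DICT[catname])})
valid_vals = list(set(USE_OTHER_VALUES[catname]) | set(REPLACE_REVERSE_DICT[catname]))
data.loc[~data[catname].isin(valid_vals), catname] = 'OTHER'
print(data[catname].tolist())   # ['BROWN', 'BLUE', 'BLACK', 'OTHER']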
@@ -200,10 +235,13 @@ def get_dtypes(on_input=True):
             dtypes.pop(key)
 
     return dtypes
-
-def load_sqf(year, dirname='../data/stop_frisk', convert=True):
+
+
+
+def load_sqf(year, dirname='../data/stop_frisk', convert=True, engineer=True):
     """Load and clean sqf csv file by year.
-    convert=True if 2017, 2018 should be converted to pre-2017 format."""
+    convert=True if 2017, 2018 should be converted to pre-2017 format.
+    engineer=True to engineer features for modeling."""
     print(f'Loading {year}...')
     # '*' is a na_value for the beat variable
     # '12311900' is a na_value for DOB
@@ -236,21 +274,14 @@ def load_sqf(year, dirname='../data/stop_frisk', convert=True):
                              'details_' : 'detailcm'})
     if convert or year < 2017:
         data.columns = data.columns.str.lower()
-    # add date-time columns
-    data = add_datetimestop(data)
-    data = add_month_weekday(data)
-    # 999 is a na_value for the precinct variable
     data = data.replace(REPLACE_VALUES)
-    data = data.dropna(subset=['pct'])
-    # convert ht_feet, ht_inch to height
-    data = add_height(data)
-    # add pct-sector column
-    data = add_pct_sector(data)
-    # convert yes-no columns to 1-0
-    y_n_to_1_0_cols(data)
-
+    if engineer:
+        data = engineer_features(data)
     return data
 
+
+
+
 def load_sqfs(start=2003, end=2018, dirname='../data/stop_frisk'):
     """Loads sqf data in format dir/<year>.csv into dict of dataframes
     Currently works for years in 2003 to 2016"""