Commit d21d503

feature engineering, category cleaning

1 parent f614033 commit d21d503

File tree

5 files changed: +936 additions, -913 deletions


Master Notebook.ipynb

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stop and Frisk"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Modeling what are effective stops - which lead to arrests, non-arrests. Unreasonable search and seizure is unconstitutional; racial bias is illegal; police activity without any effectiveness on crime is wasteful."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Ideally we'll examine outcomes more closely. Multi-class logistic regression and random forest. You can use multiclass or holdouts. Remember that with logistic regression you drop the first dummy variable; with random forest you don't need to. Also remember to use n_jobs. Gridsearch probably should be done to optimize model. Worth adding borough."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO..."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* populate missing city fields from pct\n",
+    "* sector is less of a problem but still lots of missing values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Using a balanced accuracy score."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
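The modeling plan sketched in the notebook above (multi-class logistic regression and a random forest, dropping the first dummy level for the regression, n_jobs for parallelism, a grid search, and balanced accuracy as the score) could be wired up roughly as follows. This is a minimal sketch, not code from this commit: the csv path, the outcome column, and the parameter grids are placeholder assumptions.

# Minimal sketch of the notebook's modeling plan; the csv path, the 'outcome'
# column, and the parameter grids are placeholder assumptions, not project code.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split

df = pd.read_csv('cleaned_sqf.csv')            # hypothetical cleaned extract
y = df['outcome']                              # e.g. arrest / summons / no action
X = pd.get_dummies(df.drop(columns=['outcome']), drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

# Logistic regression: drop_first=True above avoids the redundant dummy column.
logreg = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=-1),
                      param_grid={'C': [0.01, 0.1, 1, 10]},
                      scoring='balanced_accuracy', cv=5, n_jobs=-1)

# Random forest: trees do not need the first dummy dropped; the same matrix is
# reused here only to keep the sketch short.
forest = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=0),
                      param_grid={'n_estimators': [100, 300],
                                  'max_depth': [None, 10, 20]},
                      scoring='balanced_accuracy', cv=5, n_jobs=-1)

for name, model in [('logreg', logreg), ('forest', forest)]:
    model.fit(X_train, y_train)
    print(name, model.best_params_, model.score(X_test, y_test))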

data_cleaner.py

Lines changed: 50 additions & 19 deletions
@@ -88,7 +88,7 @@ def add_month_weekday(data):
 
 def add_pct_sector(data):
     """sectors are subdivisions of precincts"""
-    data['pct_sector'] = data.pct.astype(int).astype(str) + data.sector.astype('object').fillna('').astype(str)
+    data['pct_sector'] = data.pct.astype(int).astype(str) + '-' + data.sector.astype('object').fillna('').astype(str)
     data.pct_sector = data.pct_sector.astype('category')
     return data
 
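The effect of the new '-' separator is easiest to see on toy values (illustrative only, not from the dataset):

# Toy illustration of the pct_sector change above; the values are made up.
import numpy as np
import pandas as pd

data = pd.DataFrame({'pct': [14.0, 14.0], 'sector': ['A', np.nan]})
data['pct_sector'] = (data.pct.astype(int).astype(str) + '-'
                      + data.sector.astype('object').fillna('').astype(str))
print(data.pct_sector.tolist())   # ['14-A', '14-'] rather than ['14A', '14']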
@@ -107,6 +107,7 @@ def y_n_to_1_0_cols(data, cols=Y_N_COLS, yes_values=['Y'], set_na=True):
     for y_n_col in Y_N_COLS:
         if y_n_col in data:
             data[y_n_col] = y_n_to_1_0(data[y_n_col], yes_values, set_na)
+    return data
 
 def height_to_feet_inch(data, height_col):
     """Convert the height_col column to ht_feet, ht_inch
@@ -158,14 +159,48 @@ def convert_17_18_data(data):
 
     return data
 
+def gen_replace_dict(key_list_dict):
+    """change { a : [1, 2, 3]} to {1 : a, 2 : a, 3 : a}"""
+    return { val : key for key, val_list in key_list_dict.items() for val in val_list}
+
+def simplify_category(data, catname):
+    """clean category catname"""
+    data = data.astype({catname : 'object'})
+    data = data.replace({catname : gen_replace_dict(REPLACE_REVERSE_DICT[catname])})
+    valid_vals = list(set(USE_OTHER_VALUES[catname]) | set(REPLACE_REVERSE_DICT[catname]))
+    data.loc[~data[catname].isin(valid_vals), catname] = "OTHER"
+    data = data.astype({catname : 'category'})
+    return data
+
 def clean_categories(data):
     """get data categories ready for one-hot-encoding"""
-    data = data.astype('object') \
+    cat_set = set(CLEAN_CAT_VALUES) | set(CAT_FILL_NA_VALUES)
+    data = data.astype({cat : 'object' for cat in cat_set}) \
               .replace(CLEAN_CAT_VALUES) \
               .fillna(CAT_FILL_NA_VALUES) \
               .dropna(subset=CLEAN_CAT_VALUES.keys()) \
-              .drop(columns=MODEL_IGNORE_COLS) \
-              .astype('category')
+              .drop(columns=MODEL_IGNORE_COLS, errors='ignore') \
+              .astype({cat : 'category' for cat in cat_set})
+    for catname in REPLACE_REVERSE_DICT:
+        data = simplify_category(data, catname)
+    return data
+
+
+
+def engineer_features(data):
+    """Engineer features. Probably should be in data_modeler"""
+    # add date-time columns
+    data = add_datetimestop(data)
+    data = add_month_weekday(data)
+    data = data.dropna(subset=['pct'])
+    # convert ht_feet, ht_inch to height
+    data = add_height(data)
+    # add pct-sector column
+    data = add_pct_sector(data)
+    # convert yes-no columns to 1-0
+    data = y_n_to_1_0_cols(data)
+    # clean categories
+    data = clean_categories(data)
     return data
 
 
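A rough walk-through of how gen_replace_dict and simplify_category fit together. REPLACE_REVERSE_DICT and USE_OTHER_VALUES are module constants that do not appear in this diff, so the small stand-in dictionaries below are invented for the example:

# Toy walk-through of the category-cleaning logic added above. The two
# dictionaries are invented stand-ins for module constants not shown in the diff.
import pandas as pd

REPLACE_REVERSE_DICT = {'eyecolor': {'BROWN': ['BR', 'BRO'], 'BLUE': ['BL', 'BLU']}}
USE_OTHER_VALUES = {'eyecolor': ['BLACK']}

def gen_replace_dict(key_list_dict):
    """change { a : [1, 2, 3]} to {1 : a, 2 : a, 3 : a}"""
    return {val: key for key, val_list in key_list_dict.items() for val in val_list}

data = pd.DataFrame({'eyecolor': ['BR', 'BLU', 'BLACK', 'HAZEL']})
catname = 'eyecolor'

# Same steps as simplify_category: map raw codes to canonical labels, then
# bucket anything not on the approved list as "OTHER".
data = data.replace({catname: gen_replace_dict(REPLACE_REVERSE_DICT[catname])})
valid_vals = list(set(USE_OTHER_VALUES[catname]) | set(REPLACE_REVERSE_DICT[catname]))
data.loc[~data[catname].isin(valid_vals), catname] = 'OTHER'
print(data[catname].tolist())   # ['BROWN', 'BLUE', 'BLACK', 'OTHER']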
@@ -200,10 +235,13 @@ def get_dtypes(on_input=True):
             dtypes.pop(key)
 
     return dtypes
-
-def load_sqf(year, dirname='../data/stop_frisk', convert=True):
+
+
+
+def load_sqf(year, dirname='../data/stop_frisk', convert=True, engineer=True):
     """Load and clean sqf csv file by year.
-    convert=True if 2017, 2018 should be converted to pre-2017 format."""
+    convert=True if 2017, 2018 should be converted to pre-2017 format.
+    engineer=True to engineer features for modeling."""
     print(f'Loading {year}...')
     # '*' is a na_value for the beat variable
     # '12311900' is a na_value for DOB
@@ -236,21 +274,14 @@ def load_sqf(year, dirname='../data/stop_frisk', convert=True):
                              'details_' : 'detailcm'})
     if convert or year < 2017:
         data.columns = data.columns.str.lower()
-    # add date-time columns
-    data = add_datetimestop(data)
-    data = add_month_weekday(data)
-    # 999 is a na_value for the precinct variable
     data = data.replace(REPLACE_VALUES)
-    data = data.dropna(subset=['pct'])
-    # convert ht_feet, ht_inch to height
-    data = add_height(data)
-    # add pct-sector column
-    data = add_pct_sector(data)
-    # convert yes-no columns to 1-0
-    y_n_to_1_0_cols(data)
-
+    if engineer:
+        data = engineer_features(data)
     return data
 
+
+
+
 def load_sqfs(start=2003, end=2018, dirname='../data/stop_frisk'):
     """Loads sqf data in format dir/<year>.csv into dict of dataframes
     Currently works for years in 2003 to 2016"""