
Commit aacc545

using dask for read/write large files

1 parent 9740899

2 files changed: +23 -11 lines changed

doctor_visits/delphi_doctor_visits/update_sensor.py

Lines changed: 22 additions & 11 deletions
```diff
@@ -14,9 +14,11 @@
 from pathlib import Path
 
 # third party
+import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 
+
 # first party
 from delphi_utils import Weekday
 from .config import Config
@@ -89,22 +91,31 @@ def update_sensor(
     # value cols: Denominator, Covid_like, Flu_like, Flu1, Mixed
     filename = Path(filepath).name
     data = pd.read_csv(
+    ddata = dd.read_csv(
         filepath,
+        compression="gzip",
         dtype=Config.DTYPES,
+        blocksize=None,
     )
-    logger.info(f"Starting processing {filename} ")
-    data.rename(columns=Config.DEVIANT_COLS_MAP, inplace=True)
-    data = data[Config.FILT_COLS]
-    data[Config.DATE_COL] = data[Config.DATE_COL].apply(pd.to_datetime)
-    logger.info(f"finished processing {filename} ")
-    assert (
-        np.sum(data.duplicated(subset=Config.ID_COLS)) == 0
-    ), "Duplicated data! Check the input file"
 
-    # drop HRR columns - unused for now since we assign HRRs by FIPS
-    data.drop(columns=Config.HRR_COLS, inplace=True)
-    data.dropna(inplace=True) # drop rows with any missing entries
+    ddata = ddata.dropna()
+    ddata = ddata.rename(columns=Config.DEVIANT_COLS_MAP)
+    ddata = ddata[Config.FILT_COLS]
+
+
+    data = ddata.compute()
 
+    # data.dropna(inplace=True) # drop rows with any missing entries
+
+    # data.columns = data.columns.to_series().replace(Config.DEVIANT_COLS_MAP)
+    #
+    # data = data[Config.FILT_COLS]
+    #
+    # # drop HRR columns - unused for now since we assign HRRs by FIPS
+    # data.drop(columns=Config.HRR_COLS, inplace=True)
+    # data.dropna(inplace=True) # drop rows with any missing entries
+
+    data[Config.DATE_COL] = data[Config.DATE_COL].apply(pd.to_datetime)
     # aggregate age groups (so data is unique by service date and FIPS)
     data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
     assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
```
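The substantive change is reading the gzipped input with dask instead of pandas and only materializing a pandas DataFrame after the lazy cleanup steps. A minimal standalone sketch of that pattern — the file name and the stand-ins for `Config.DTYPES`, `Config.DEVIANT_COLS_MAP`, and `Config.FILT_COLS` are illustrative, not the pipeline's real values:

```python
import dask.dataframe as dd
import pandas as pd

# Illustrative stand-ins for the pipeline's Config constants.
DTYPES = {"ServiceDate": str, "PatCountyFIPS": str, "Denominator": "float64"}
DEVIANT_COLS_MAP = {"PatCountyFIPS": "FIPS"}
FILT_COLS = ["ServiceDate", "FIPS", "Denominator"]

# gzip is not a splittable format, so dask cannot chunk one file by byte
# ranges; blocksize=None reads each input file as a single partition.
ddata = dd.read_csv(
    "claims_input.csv.gz",  # hypothetical input path
    compression="gzip",
    dtype=DTYPES,
    blocksize=None,
)

# These calls only build a lazy task graph; nothing is read yet.
ddata = ddata.dropna()
ddata = ddata.rename(columns=DEVIANT_COLS_MAP)
ddata = ddata[FILT_COLS]

# compute() runs the graph and returns an ordinary pandas DataFrame,
# after which the existing pandas post-processing applies unchanged.
data = ddata.compute()
assert isinstance(data, pd.DataFrame)
```

Note that with a single gzipped file, `blocksize=None` means one partition, so there is no intra-file parallelism; `dd.read_csv` also accepts a glob of paths, in which case each file becomes its own partition and the dropna/rename/filter steps run per file in parallel.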

doctor_visits/setup.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -11,6 +11,7 @@
     "pytest-cov",
     "pytest",
     "scikit-learn",
+    "dask",
 ]
 
 setup(
```
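For completeness, a sketch of where the new dependency lands — assuming the list above is a `required` list that `doctor_visits/setup.py` passes to setuptools, with the other fields assumed rather than copied from the repo:

```python
from setuptools import find_packages, setup

# Assumed surrounding structure; only the "dask" entry comes from this commit.
required = [
    "pytest-cov",
    "pytest",
    "scikit-learn",
    "dask",
]

setup(
    name="delphi_doctor_visits",  # hypothetical metadata for the sketch
    install_requires=required,
    packages=find_packages(),
)
```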
