Skip to content

Commit b55ff8a

Browse files
committed
debug jhu, still msa issue
1 parent eee0acf commit b55ff8a

File tree

7 files changed

+57
-177
lines changed

7 files changed

+57
-177
lines changed

_delphi_utils_python/delphi_utils/geomap.py

Lines changed: 27 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -51,16 +51,17 @@ class GeoMapper:
5151
+ county -> state : unweighted
5252
+ county -> msa : unweighted
5353
+ county -> megacounty
54+
+ county -> hrr
55+
+ county -> zip
5456
- zip -> state
5557
- zip -> msa
56-
- fips -> hrr
5758
5859
Geotypes (listed by default column name):
5960
zip: zip5, length 5 str of 0-9 with leading 0's
6061
fips: county, length 5 str of 0-9 with leading 0's
6162
msa: metro stat area, length 5 str of 0-9 with leading 0's
62-
st_code: state code, str of 0-9
63-
state_id: state id, str of A-Z
63+
st_code: state code, str in 1-99
64+
state_id: state id, str in AA-ZZ
6465
hrr: hrr id, int 1-500
6566
"""
6667

@@ -147,11 +148,26 @@ def load_jhu_uid_fips_cross(self):
147148
'weight': float})
148149
self.jhu_uid_fips_cross = GeoMapper.convert_int_to_str5(self.jhu_uid_fips_cross, int_col='fips', str_col='fips')
149150

151+
@staticmethod
152+
def convert_int_to_str5(data,
153+
int_col='fips',
154+
str_col='fips'):
155+
"""convert int to a string of length 5"""
156+
data = data.copy()
157+
data[str_col] = data[int_col].astype(str).str.zfill(5)
158+
return data
150159

151-
def convert_fips_to_stcode(self,
160+
def convert_intfips_to_str(self,
152161
data,
153-
fips_col='fips',
154-
stcode_col='st_code'):
162+
intfips_col='fips',
163+
strfips_col='fips'):
164+
"""convert fips to a string of length 5"""
165+
return GeoMapper.convert_int_to_str5(data,int_col=intfips_col,str_col=strfips_col)
166+
167+
def convert_fips_to_stcode(self,
168+
data: pd.DataFrame,
169+
fips_col: str = 'fips',
170+
stcode_col: str = 'st_code'):
155171
"""create st_code column from fips column
156172
157173
Args:
@@ -168,23 +184,6 @@ def convert_fips_to_stcode(self,
168184
data[stcode_col] = data[fips_col].str[:2]
169185
return data
170186

171-
@staticmethod
172-
def convert_int_to_str5(data,
173-
int_col='fips',
174-
str_col='fips'):
175-
"""convert int to a string of length 5"""
176-
data = data.copy()
177-
data[str_col] = data[int_col].astype(str).str.zfill(5)
178-
return data
179-
180-
def convert_intfips_to_str(self,
181-
data,
182-
intfips_col='fips',
183-
strfips_col='fips'):
184-
"""convert fips to a string of length 5"""
185-
return GeoMapper.convert_int_to_str5(data,int_col=intfips_col,str_col=strfips_col)
186-
187-
188187
def convert_stcode_to_state_id(self,
189188
data,
190189
stcode_col='st_code',
@@ -365,16 +364,11 @@ def county_to_state(self,
365364
if count_cols:
366365
data = data[[fips_col,date_col] + count_cols].copy()
367366
data = self.convert_fips_to_state_id(data,fips_col=fips_col,state_id_col=state_id_col,full=full)
368-
data.dropna(subset=['st_code'], axis=0, inplace=True)
367+
data.dropna(subset=[state_id_col], axis=0, inplace=True)
369368
data.drop([fips_col,'st_code'],axis=1,inplace=True)
370-
# assert not data[state_id_col].isnull().values.any(), "nan states, probably invalid fips"
371-
if date_col:
372-
assert not data[date_col].isnull().values.any(), "nan dates not allowed"
373-
data.fillna(0,inplace=True)
374-
data = data.groupby([date_col,state_id_col]).sum()
375-
else:
376-
data.fillna(0,inplace=True)
377-
data = data.groupby(state_id_col).sum()
369+
assert not data[date_col].isnull().values.any(), "nan dates not allowed"
370+
# data.fillna(0,inplace=True)
371+
data = data.groupby([date_col,state_id_col]).sum()
378372
return data.reset_index()
379373

380374
def county_to_msa(self,
@@ -828,4 +822,4 @@ def county_to_hrr(self,
828822
zip_col = "_zip_col_temp"
829823
data = self.county_to_zip(data, fips_col=fips_col, zip_col=zip_col, date_col=date_col, count_cols=count_cols)
830824
data = self.zip_to_hrr(data, zip_col=zip_col, date_col=date_col, count_cols=count_cols, hrr_col=hrr_col)
831-
return data.astype(dtype={hrr_col: int})
825+
return data.astype(dtype={hrr_col: int})

_delphi_utils_python/tests/test_geomap.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,6 @@
1-
import pytest
2-
import os
1+
from delphi_utils.geomap import GeoMapper
32
import pandas as pd
43
import numpy as np
5-
from delphi_utils.geomap import GeoMapper
64

75

86
class TestGeoMapper:

jhu/delphi_jhu/geo.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ def geo_map(df: pd.DataFrame, geo_res: str, sensor: str):
3737
elif geo_res == "msa":
3838
df = gmpr.county_to_msa(df, fips_col="fips", msa_col="geo_id", date_col="timestamp")
3939
df['geo_id'] = df['geo_id'].astype(int)
40+
print(df[df['population']==0])
4041
elif geo_res == 'hrr':
4142
df = gmpr.county_to_hrr(df, fips_col="fips", hrr_col="geo_id", date_col="timestamp")
4243
df['geo_id'] = df['geo_id'].astype(int)

jhu/delphi_jhu/pull.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,9 @@
11
# -*- coding: utf-8 -*-
2-
import numpy as np
3-
import pandas as pd
4-
import re
52

63
from delphi_utils import GeoMapper
4+
import pandas as pd
5+
import numpy as np
6+
import re
77

88
def detect_date_col(col_name: str):
99
"""determine if column name is a date"""

jhu/tests/compare_receiving.py

Lines changed: 22 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -5,33 +5,40 @@
55
import pandas as pd
66
import os
77

8-
def load_files():
8+
rec_pattern = ""
9+
# rec_pattern = "county_deaths_incidence_num"
10+
11+
def load_files(pattern = "", num = 1000):
912
rec_dir = os.listdir('../receiving')
10-
rec_stable_dir = os.listdir('../receiving_rf')
11-
rec_common = set(rec_dir) & set(rec_stable_dir)
13+
suff = "stable"
14+
rec_stable_dir = os.listdir(f'../receiving_{suff}')
15+
rec_common = list(set(rec_dir) & set(rec_stable_dir))
1216
print(set(rec_dir).symmetric_difference(rec_stable_dir))
17+
num_iter = 0
1318
for rec in rec_common:
14-
df_rec = pd.read_csv(f'../receiving/{rec}').set_index('geo_id')
15-
df_stable = pd.read_csv(f'../receiving_rf/{rec}').set_index('geo_id')
16-
try:
17-
df_join = df_rec.join(df_stable, rsuffix='_stable' )
18-
except:
19-
print(df_rec.info())
20-
print(df_stable.info())
21-
assert False, f"failed join on {rec}"
22-
yield rec, df_join
19+
if num_iter <= num:
20+
num_iter += 1
21+
df_rec = pd.read_csv(f'../receiving/{rec}').set_index('geo_id')
22+
df_stable = pd.read_csv(f'../receiving_{suff}/{rec}').set_index('geo_id')
23+
try:
24+
df_join = df_rec.join(df_stable, rsuffix='_stable' )
25+
except:
26+
print(df_rec.info())
27+
print(df_stable.info())
28+
assert False, f"failed join on {rec}"
29+
yield rec, df_join
2330

2431
def main():
25-
load_iter = load_files()
32+
load_iter = load_files(rec_pattern)
2633
for rec, df in load_iter:
2734
if ('msa' in rec) and False:
2835
msa_ds = (df['val'] - df['val_stable']).sum()
2936
print(f'{msa_ds} value diff')
30-
if (df.eval('abs(val - val_stable)').sum() > 0.01) and ("county" in rec):
37+
if (df.eval('abs(val - val_stable)').sum() > 0.01):
3138
print(f'Printing {rec} difference')
3239
df_diff = df[df.eval('val != val_stable')]
33-
print(df_diff.head())
3440
print(df_diff.shape)
41+
df_diff.to_csv(f'rec_diffs/diff_{rec}.csv')
3542
# assert "county_confirmed_7dav_incidence_num" not in rec, f"{rec}!!!"
3643
#input('w')
3744

jhu/tests/receiving/.gitignore

Lines changed: 0 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -1,120 +0,0 @@
1-
# You should hard commit a prototype for this file, but we
2-
# want to avoid accidental adding of API tokens and other
3-
# private data parameters
4-
params.json
5-
6-
# Do not commit output files
7-
receiving/*.csv
8-
9-
# Remove macOS files
10-
.DS_Store
11-
12-
# virtual environment
13-
dview/
14-
15-
# Byte-compiled / optimized / DLL files
16-
__pycache__/
17-
*.py[cod]
18-
*$py.class
19-
20-
# C extensions
21-
*.so
22-
23-
# Distribution / packaging
24-
coverage.xml
25-
.Python
26-
build/
27-
develop-eggs/
28-
dist/
29-
downloads/
30-
eggs/
31-
.eggs/
32-
lib/
33-
lib64/
34-
parts/
35-
sdist/
36-
var/
37-
wheels/
38-
*.egg-info/
39-
.installed.cfg
40-
*.egg
41-
MANIFEST
42-
43-
# PyInstaller
44-
# Usually these files are written by a python script from a template
45-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
46-
*.manifest
47-
*.spec
48-
49-
# Installer logs
50-
pip-log.txt
51-
pip-delete-this-directory.txt
52-
53-
# Unit test / coverage reports
54-
htmlcov/
55-
.tox/
56-
.coverage
57-
.coverage.*
58-
.cache
59-
nosetests.xml
60-
coverage.xml
61-
*.cover
62-
.hypothesis/
63-
.pytest_cache/
64-
65-
# Translations
66-
*.mo
67-
*.pot
68-
69-
# Django stuff:
70-
*.log
71-
.static_storage/
72-
.media/
73-
local_settings.py
74-
75-
# Flask stuff:
76-
instance/
77-
.webassets-cache
78-
79-
# Scrapy stuff:
80-
.scrapy
81-
82-
# Sphinx documentation
83-
docs/_build/
84-
85-
# PyBuilder
86-
target/
87-
88-
# Jupyter Notebook
89-
.ipynb_checkpoints
90-
91-
# pyenv
92-
.python-version
93-
94-
# celery beat schedule file
95-
celerybeat-schedule
96-
97-
# SageMath parsed files
98-
*.sage.py
99-
100-
# Environments
101-
.env
102-
.venv
103-
env/
104-
venv/
105-
ENV/
106-
env.bak/
107-
venv.bak/
108-
109-
# Spyder project settings
110-
.spyderproject
111-
.spyproject
112-
113-
# Rope project settings
114-
.ropeproject
115-
116-
# mkdocs documentation
117-
/site
118-
119-
# mypy
120-
.mypy_cache/

jhu/tests/test_smooth.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,16 +10,16 @@
1010
class TestSmooth:
1111
def test_output_files_smoothed(self, run_as_module):
1212

13-
dates = [str(x) for x in range(20200304, 20200311)]
13+
dates = [str(x) for x in range(20200701, 20200730)]
1414

1515
smoothed = pd.read_csv(
16-
join("receiving",
16+
join("../receiving",
1717
f"{dates[-1]}_state_confirmed_7dav_cumulative_num.csv")
1818
)
1919

2020
raw = pd.concat([
2121
pd.read_csv(
22-
join("receiving",
22+
join("../receiving",
2323
f"{date}_state_confirmed_cumulative_num.csv")
2424
) for date in dates
2525
])

0 commit comments

Comments
 (0)