Skip to content

Commit 8e8e7db

Browse files
authored
Merge pull request #417 from dshemetov/main
NAN coding database and acquisition changes
2 parents 78df911 + e9b080c commit 8e8e7db

File tree

15 files changed

+665
-205
lines changed

15 files changed

+665
-205
lines changed

docs/api/covidcast.md

+3
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,9 @@ require knowing when we last confirmed an unchanged value, please get in touch.
168168
| `epidata[].sample_size` | number of "data points" used in computing the statistic, `null` when not applicable | float |
169169
| `epidata[].issue` | time unit (e.g. date) when this statistic was published | integer |
170170
| `epidata[].lag` | time delta (e.g. days) between when the underlying events happened and when this statistic was published | integer |
171+
| `epidata[].missing_value` | an integer code that is zero when the `value` field is present and non-zero when the data is missing (see [missing codes](missing_codes.md)) | integer |
172+
| `epidata[].missing_stderr` | an integer code that is zero when the `stderr` field is present and non-zero when the data is missing (see [missing codes](missing_codes.md)) | integer |
173+
| `epidata[].missing_sample_size` | an integer code that is zero when the `sample_size` field is present and non-zero when the data is missing (see [missing codes](missing_codes.md)) | integer |
171174
| `message` | `success` or error message | string |
172175

173176
**Note:** `result` code 2, "too many results", means that the number of results

docs/api/missing_codes.md

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
---
2+
title: NaN Missing Codes
3+
parent: COVIDcast Epidata API
4+
nav_order: 5
5+
---
6+
7+
# Missing Value Coding
8+
9+
Occasionally, data will be missing from our database and will be explicitly coded as NaN.
10+
In these cases, we strive to supply our best-known reason for the value to be missing by
11+
providing an integer code in the corresponding `missing_` column (e.g. `missing_value`
12+
corresponds to the `value` column). The integer codes are as follows:
13+
14+
| Code | Name | Description |
15+
| --- | --- | --- |
16+
| 0 | NOT MISSING | This is the default value for when the field is not missing. |
17+
| 1 | NOT APPLICABLE | This value is used when the field is not expected to have a value (e.g. stderr for a signal that is not estimated from a sample). |
18+
| 2 | REGION EXCEPTION | This value is used when the field is not reported because the particular indicator does not serve the geographical region requested. |
19+
| 3 | CENSORED | This value is used when the field has been censored for data privacy reasons. This could be due to reasons such as low sample sizes or simply a requirement from our data partners. |
20+
| 4 | DELETED | This value is used when the field was present in previous issues, but is no longer reported. Deletions can arise due to bug fixes, changing censorship requirements, or data corrections from the source. |
21+
| 5 | OTHER | This value is used when the field is missing, but does not fall into any of the categories above. |
22+
23+
These codes are supplied as part of the `delphi_utils` Python library (see [here](https://pypi.org/project/delphi-utils/)).

integrations/acquisition/covidcast/test_covidcast_meta_caching.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import requests
1010

1111
# first party
12+
from delphi_utils import Nans
1213
from delphi.epidata.client.delphi_epidata import Epidata
1314
import delphi.operations.secrets as secrets
1415
import delphi.epidata.acquisition.covidcast.database as live
@@ -65,17 +66,17 @@ def test_caching(self):
6566
"""Populate, query, cache, query, and verify the cache."""
6667

6768
# insert dummy data
68-
self.cur.execute('''
69+
self.cur.execute(f'''
6970
insert into covidcast values
7071
(0, 'src', 'sig', 'day', 'state', 20200422, 'pa',
71-
123, 1, 2, 3, 456, 1, 20200422, 0, 1, False),
72+
123, 1, 2, 3, 456, 1, 20200422, 0, 1, False, {Nans.NOT_MISSING}, {Nans.NOT_MISSING}, {Nans.NOT_MISSING}),
7273
(0, 'src', 'sig', 'day', 'state', 20200422, 'wa',
73-
789, 1, 2, 3, 456, 1, 20200423, 1, 1, False)
74+
789, 1, 2, 3, 456, 1, 20200423, 1, 1, False, {Nans.NOT_MISSING}, {Nans.NOT_MISSING}, {Nans.NOT_MISSING})
7475
''')
75-
self.cur.execute('''
76+
self.cur.execute(f'''
7677
insert into covidcast values
7778
(100, 'src', 'wip_sig', 'day', 'state', 20200422, 'pa',
78-
456, 4, 5, 6, 789, -1, 20200422, 0, 1, True)
79+
456, 4, 5, 6, 789, -1, 20200422, 0, 1, True, {Nans.NOT_MISSING}, {Nans.NOT_MISSING}, {Nans.NOT_MISSING})
7980
''')
8081

8182
self.cnx.commit()

integrations/acquisition/covidcast/test_csv_uploading.py

+124-13
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import mysql.connector
1111

1212
# first party
13+
from delphi_utils import Nans
1314
from delphi.epidata.client.delphi_epidata import Epidata
1415
from delphi.epidata.acquisition.covidcast.csv_to_database import main
1516
import delphi.operations.secrets as secrets
@@ -66,27 +67,49 @@ def test_uploading(self):
6667

6768
# valid
6869
with open(source_receiving_dir + '/20200419_state_test.csv', 'w') as f:
70+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
71+
f.write(f'ca,1,0.1,10,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
72+
f.write(f'tx,2,0.2,20,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
73+
f.write(f'fl,3,0.3,30,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
74+
75+
# valid, old style no missing cols should have intelligent defaults
76+
with open(source_receiving_dir + '/20200419_state_test_no_missing.csv', 'w') as f:
6977
f.write('geo_id,val,se,sample_size\n')
7078
f.write('ca,1,0.1,10\n')
71-
f.write('tx,2,0.2,20\n')
72-
f.write('fl,3,0.3,30\n')
79+
f.write('tx,NA,0.2,20\n')
80+
f.write('wa,3,0.3,30\n')
81+
82+
# invalid, missing with an inf value
83+
with open(source_receiving_dir + '/20200419_state_test_missing1.csv', 'w') as f:
84+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
85+
f.write(f'fl,inf,0.3,30,{Nans.OTHER},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
86+
87+
# invalid, missing with an incorrect missing code
88+
with open(source_receiving_dir + '/20200419_state_test_missing2.csv', 'w') as f:
89+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
90+
f.write(f'tx,NA,0.2,20,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
91+
92+
# invalid, no missing with an incorrect missing code
93+
with open(source_receiving_dir + '/20200419_state_test_missing3.csv', 'w') as f:
94+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
95+
f.write(f'wa,3,0.3,30,{Nans.OTHER},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
7396

7497
# valid wip
7598
with open(source_receiving_dir + '/20200419_state_wip_prototype.csv', 'w') as f:
76-
f.write('geo_id,val,se,sample_size\n')
77-
f.write('me,10,0.01,100\n')
78-
f.write('nd,20,0.02,200\n')
79-
f.write('wa,30,0.03,300\n')
99+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
100+
f.write(f'me,10,0.01,100,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
101+
f.write(f'nd,20,0.02,200,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
102+
f.write(f'wa,30,0.03,300,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
80103

81104
# invalid
82105
with open(source_receiving_dir + '/20200419_state_wip_really_long_name_that_will_be_accepted.csv', 'w') as f:
83-
f.write('geo_id,val,se,sample_size\n')
84-
f.write('pa,100,5.4,624\n')
106+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
107+
f.write(f'pa,100,5.4,624,{Nans.NOT_MISSING},{Nans.NOT_MISSING},{Nans.NOT_MISSING}\n')
85108

86109
# invalid
87110
with open(source_receiving_dir + '/20200419_state_wip_really_long_name_that_will_get_truncated_lorem_ipsum_dolor_sit_amet.csv', 'w') as f:
88-
f.write('geo_id,val,se,sample_size\n')
89-
f.write('pa,100,5.4,624\n')
111+
f.write('geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n')
112+
f.write(f'pa,100,5.4,624,{Nans.NOT_MISSING}, {Nans.NOT_MISSING}, {Nans.NOT_MISSING}\n')
90113

91114
# invalid
92115
with open(source_receiving_dir + '/20200420_state_test.csv', 'w') as f:
@@ -137,6 +160,9 @@ def apply_lag(expected_epidata):
137160
'sample_size': 10,
138161
'direction': None,
139162
'signal': 'test',
163+
'missing_value': Nans.NOT_MISSING,
164+
'missing_stderr': Nans.NOT_MISSING,
165+
'missing_sample_size': Nans.NOT_MISSING,
140166
},
141167
{
142168
'time_value': 20200419,
@@ -146,6 +172,9 @@ def apply_lag(expected_epidata):
146172
'sample_size': 30,
147173
'direction': None,
148174
'signal': 'test',
175+
'missing_value': Nans.NOT_MISSING,
176+
'missing_stderr': Nans.NOT_MISSING,
177+
'missing_sample_size': Nans.NOT_MISSING,
149178
},
150179
{
151180
'time_value': 20200419,
@@ -155,16 +184,87 @@ def apply_lag(expected_epidata):
155184
'sample_size': 20,
156185
'direction': None,
157186
'signal': 'test',
187+
'missing_value': Nans.NOT_MISSING,
188+
'missing_stderr': Nans.NOT_MISSING,
189+
'missing_sample_size': Nans.NOT_MISSING,
158190
},
159191
]),
160192
'message': 'success',
161193
})
162194

195+
# request CSV data from the API on the test with missing values
196+
response = Epidata.covidcast(
197+
'src-name', 'test_no_missing', 'day', 'state', 20200419, '*')
198+
199+
# verify data matches the CSV
200+
# NB these are ordered by geo_value
201+
self.assertEqual(response, {
202+
'result': 1,
203+
'epidata': apply_lag([
204+
{
205+
'time_value': 20200419,
206+
'geo_value': 'ca',
207+
'value': 1,
208+
'stderr': 0.1,
209+
'sample_size': 10,
210+
'direction': None,
211+
'signal': 'test_no_missing',
212+
'missing_value': Nans.NOT_MISSING,
213+
'missing_stderr': Nans.NOT_MISSING,
214+
'missing_sample_size': Nans.NOT_MISSING,
215+
},
216+
{
217+
'time_value': 20200419,
218+
'geo_value': 'tx',
219+
'value': None,
220+
'stderr': 0.2,
221+
'sample_size': 20,
222+
'direction': None,
223+
'signal': 'test_no_missing',
224+
'missing_value': Nans.OTHER,
225+
'missing_stderr': Nans.NOT_MISSING,
226+
'missing_sample_size': Nans.NOT_MISSING,
227+
},
228+
{
229+
'time_value': 20200419,
230+
'geo_value': 'wa',
231+
'value': 3,
232+
'stderr': 0.3,
233+
'sample_size': 30,
234+
'direction': None,
235+
'signal': 'test_no_missing',
236+
'missing_value': Nans.NOT_MISSING,
237+
'missing_stderr': Nans.NOT_MISSING,
238+
'missing_sample_size': Nans.NOT_MISSING,
239+
},
240+
]),
241+
'message': 'success',
242+
})
243+
244+
# invalid missing files
245+
response = Epidata.covidcast(
246+
'src-name', 'test_missing1', 'day', 'state', 20200419, '*')
247+
self.assertEqual(response, {
248+
'result': -2,
249+
'message': 'no results',
250+
})
251+
response = Epidata.covidcast(
252+
'src-name', 'test_missing2', 'day', 'state', 20200419, '*')
253+
self.assertEqual(response, {
254+
'result': -2,
255+
'message': 'no results',
256+
})
257+
response = Epidata.covidcast(
258+
'src-name', 'test_missing3', 'day', 'state', 20200419, '*')
259+
self.assertEqual(response, {
260+
'result': -2,
261+
'message': 'no results',
262+
})
263+
163264
# request CSV data from the API on WIP signal
164265
response = Epidata.covidcast(
165266
'src-name', 'wip_prototype', 'day', 'state', 20200419, '*')
166267

167-
168268
# verify data matches the CSV
169269
# NB these are ordered by geo_value
170270
self.assertEqual(response, {
@@ -178,6 +278,9 @@ def apply_lag(expected_epidata):
178278
'sample_size': 100,
179279
'direction': None,
180280
'signal': 'wip_prototype',
281+
'missing_value': Nans.NOT_MISSING,
282+
'missing_stderr': Nans.NOT_MISSING,
283+
'missing_sample_size': Nans.NOT_MISSING,
181284
},
182285
{
183286
'time_value': 20200419,
@@ -187,6 +290,9 @@ def apply_lag(expected_epidata):
187290
'sample_size': 200,
188291
'direction': None,
189292
'signal': 'wip_prototype',
293+
'missing_value': Nans.NOT_MISSING,
294+
'missing_stderr': Nans.NOT_MISSING,
295+
'missing_sample_size': Nans.NOT_MISSING,
190296
},
191297
{
192298
'time_value': 20200419,
@@ -196,12 +302,14 @@ def apply_lag(expected_epidata):
196302
'sample_size': 300,
197303
'direction': None,
198304
'signal': 'wip_prototype',
305+
'missing_value': Nans.NOT_MISSING,
306+
'missing_stderr': Nans.NOT_MISSING,
307+
'missing_sample_size': Nans.NOT_MISSING,
199308
},
200309
]),
201310
'message': 'success',
202311
})
203312

204-
205313
# request CSV data from the API on the signal with name length 32<x<64
206314
response = Epidata.covidcast(
207315
'src-name', 'wip_really_long_name_that_will_be_accepted', 'day', 'state', 20200419, '*')
@@ -218,7 +326,10 @@ def apply_lag(expected_epidata):
218326
'stderr': 5.4,
219327
'sample_size': 624,
220328
'direction': None,
221-
'signal': 'wip_really_long_name_that_will_be_accepted',
329+
'signal': 'wip_really_long_name_that_will_be_accepted',\
330+
'missing_value': Nans.NOT_MISSING,
331+
'missing_stderr': Nans.NOT_MISSING,
332+
'missing_sample_size': Nans.NOT_MISSING,
222333
},
223334
])
224335
})

0 commit comments

Comments
 (0)