Skip to content

Commit 67a312c

Browse files
committed
moving wrapper into a separate module
1 parent 4afe0f0 commit 67a312c

File tree

2 files changed

+207
-43
lines changed

2 files changed

+207
-43
lines changed
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
from datetime import datetime, date, timedelta
2+
from typing import List, Tuple, Union, Iterable
3+
4+
import pandas as pd
5+
6+
from delphi_epidata import Epidata
7+
8+
def date_generator(startdate, enddate):
    """Yield each day from ``startdate`` through ``enddate`` as a string.

    :param startdate: First day to yield. Must support ``<=`` comparison with
      ``enddate``, addition of a ``timedelta`` and ``strftime`` (for example a
      ``datetime.date``).
    :param enddate: Last day to yield, inclusive.
    :returns: A generator of ``'%Y-%m-%d'`` formatted strings, one per day.
      Nothing is yielded when ``startdate`` is after ``enddate``.
    """
    one_day = timedelta(days=1)
    # Walk a local cursor instead of rebinding the ``startdate`` parameter.
    current = startdate
    while current <= enddate:
        yield current.strftime('%Y-%m-%d')
        current = current + one_day
12+
13+
14+
15+
def metadata():
    """Fetch the ``covidcast_meta`` endpoint and return it as a data frame.

    :returns: A Pandas data frame built from the ``"epidata"`` field of the
      API response.
    :raises RuntimeError: If the response's ``"result"`` field is not ``1``.
      The API's ``"message"`` (when present) is attached as the second
      exception argument.
    """
    response = Epidata._request("covidcast_meta")  # pylint: disable=protected-access

    if response["result"] != 1:
        # Something failed in the API and we did not get real metadata.
        # ``.get`` keeps a response without a "message" field from raising a
        # KeyError that would hide the actual failure.
        raise RuntimeError("Error when fetching metadata from the API",
                           response.get("message"))

    return pd.DataFrame.from_dict(response["epidata"])
25+
26+
27+
def signal(
    data_source: str,
    signal: str,  # pylint: disable=W0621
    start_day: date = None,
    end_day: date = None,
    geo_type: str = "county",
    geo_values: Union[str, Iterable[str]] = "*",
    as_of: date = None,
    issues: Union[date, Tuple[date], List[date]] = None,
    lag: int = None,
    time_type: str = "day",
) -> Union[pd.DataFrame, None]:
    """Download a Pandas data frame for one signal.

    Obtains data for the selected date range from the COVIDcast API through
    ``Epidata.covidcast``. Available data sources and signals are documented
    in the `COVIDcast signal documentation
    <https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html>`_.
    Most (but not all) data sources are available at the county level, but the
    API can also return data aggregated to metropolitan statistical areas,
    hospital referral regions, or states, as desired, by using the ``geo_type``
    argument.

    The COVIDcast API tracks updates and changes to its underlying data, and
    records the first date each observation became available. For example, a
    data source may report its estimate for a specific state on June 3rd on June
    5th, once records become available. This data is considered "issued" on June
    5th. Later, the data source may update its estimate for June 3rd based on
    revised data, creating a new issue on June 8th. The ``as_of``, ``issues``,
    and ``lag`` parameters allow the user to select specific issues. These
    options are mutually exclusive; if you specify more than one, ``as_of``
    takes priority over ``issues``, which takes priority over ``lag``.

    Note that the API only tracks the initial value of an estimate and *changes*
    to that value. If a value was first issued on June 5th and never updated,
    asking for data issued on June 6th (using ``issues`` or ``lag``) would *not*
    return that value, though asking for data ``as_of`` June 6th would.

    The whole date range is requested in a single API call; this function does
    not split large queries, so very large requests are subject to whatever
    limits the API itself enforces.

    See the `COVIDcast API documentation
    <https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html>`_ for more
    information on available geography types, signals, and data formats, and
    further discussion of issue dates and data versioning.

    :param data_source: String identifying the data source to query, such as
      ``"fb-survey"``.
    :param signal: String identifying the signal from that source to query,
      such as ``"smoothed_cli"``.
    :param start_day: Query data beginning on this date. Provided as a
      ``datetime.date`` object. Required: although the parameter defaults to
      ``None`` for signature compatibility, this wrapper does not look up the
      first available day and raises ``ValueError`` when it is missing.
    :param end_day: Query data up to this date, inclusive. Provided as a
      ``datetime.date`` object. Required, for the same reason as ``start_day``.
    :param geo_type: The geography type for which to request this data, such as
      ``"county"`` or ``"state"``. Available types are described in the
      COVIDcast signal documentation. Defaults to ``"county"``.
    :param geo_values: The geographies to fetch data for, passed through to
      ``Epidata.covidcast`` unchanged. The default, ``"*"``, fetches all
      geographies.
    :param as_of: Fetch only data that was available on or before this date,
      provided as a ``datetime.date`` object and passed through to
      ``Epidata.covidcast`` unchanged. When given, ``issues`` and ``lag`` are
      ignored.
    :param issues: Fetch only data that was published or updated ("issued") on
      these dates. Provided as either a single ``datetime.date`` object,
      indicating a single date to fetch data issued on, or a tuple or list
      specifying (start, end) dates, in which case every day in that inclusive
      range is requested. There may be multiple rows for each observation,
      indicating several updates to its value. When given (and ``as_of`` is
      not), ``lag`` is ignored.
    :param lag: Integer. If, for example, ``lag=3``, fetch only data that was
      published or updated exactly 3 days after the date. For example, a row
      with ``time_value`` of June 3 will only be included in the results if its
      data was issued or updated on June 6.
    :param time_type: The temporal resolution to request, passed through to
      the API unchanged (``"day"`` by default). NOTE(review): the requested
      ``time_values`` are always generated one per calendar day and result
      dates are parsed as ``%Y%m%d``, so resolutions other than ``"day"`` are
      not specially handled here.
    :returns: A Pandas data frame with matching data, or ``None`` if the API
      reports no results. Each row is one observation. Columns come from the
      API response (``direction`` is dropped when present); the following are
      set or converted by this function:

      ``time_value``
        Converted to pandas Timestamps identifying the date the estimate is
        for.

      ``issue``
        Converted to pandas Timestamps identifying the date the estimate was
        issued. For example, an estimate with a ``time_value`` of June 3 might
        have been issued on June 5, after the data for June 3rd was collected
        and ingested into the API.

      ``geo_type``
        Geography type for the signal, same as the value of the ``geo_type``
        input argument. Used for downstream functions to parse ``geo_value``
        correctly.

      ``data_source``
        Name of the signal source, same as the value of the ``data_source``
        input argument.

      ``signal``
        Name of the signal, same as the value of the ``signal`` input argument.

      Other columns such as ``geo_value``, ``value``, ``stderr``,
      ``sample_size`` and ``lag`` are whatever the API returned; consult the
      `signal documentation
      <https://cmu-delphi.github.io/delphi-epidata/api/covidcast_signals.html>`_
      for details on how values and standard errors are calculated for
      specific signals.
    :raises ValueError: If ``start_day`` or ``end_day`` is missing, or if
      ``start_day`` is after ``end_day``.
    :raises RuntimeError: If the API reports a failure other than "no
      results".
    """
    if start_day is None or end_day is None:
        # Comparing None with a date would otherwise fail with an opaque
        # TypeError further down.
        raise ValueError(
            "start_day and end_day must both be provided, but "
            f"start_day = '{start_day}', end_day = '{end_day}'"
        )
    if start_day > end_day:
        raise ValueError(
            "end_day must be on or after start_day, but " f"start_day = '{start_day}', end_day = '{end_day}'"
        )

    # Enforce the documented priority: as_of > issues > lag.
    if as_of is not None:
        issues = None
        lag = None
    elif issues is not None:
        lag = None

    if issues is not None:
        # A single date means "issued on that day"; a (start, end) pair means
        # every day in the inclusive range. Both are expanded with the same
        # helper used for time_values so the date formatting stays consistent.
        if isinstance(issues, (tuple, list)):
            first_issue, last_issue = issues[0], issues[-1]
        else:
            first_issue = last_issue = issues
        issues = list(date_generator(first_issue, last_issue))

    time_values = list(date_generator(start_day, end_day))
    response = Epidata.covidcast(data_source, signal, time_type=time_type,
                                 geo_type=geo_type, time_values=time_values,
                                 geo_value=geo_values, as_of=as_of,
                                 issues=issues, lag=lag)

    if response["result"] != 1:
        # NOTE(review): result code -2 / message "no results" is taken to be
        # the API's "query succeeded but matched nothing" reply; confirm
        # against the Epidata API documentation.
        if response["result"] == -2 or response.get("message") == "no results":
            return None
        # Something failed in the API and we did not get real signal data.
        raise RuntimeError("Error when fetching signal data from the API",
                           response.get("message"))

    rows = response.get("epidata")
    if not rows:
        # Success reported but nothing to build a frame from.
        return None

    api_df = pd.DataFrame.from_dict(rows)
    api_df["issue"] = pd.to_datetime(api_df["issue"], format='%Y%m%d')
    api_df["time_value"] = pd.to_datetime(api_df["time_value"], format='%Y%m%d')
    # Tolerate responses that do not carry a "direction" column.
    api_df = api_df.drop(columns="direction", errors="ignore")
    api_df["geo_type"] = geo_type
    api_df["data_source"] = data_source
    api_df["signal"] = signal

    return api_df

_delphi_utils_python/delphi_utils/validator/datafetcher.py

Lines changed: 5 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
import requests
1010
import pandas as pd
1111
import numpy as np
12-
import covidcast
13-
from delphi_epidata import Epidata
12+
13+
from ..covidcast_wrapper import metadata, signal
1414
from .errors import APIDataFetchError, ValidationFailure
1515

1616
FILENAME_REGEX = re.compile(
@@ -117,14 +117,7 @@ def get_geo_signal_combos(data_source, api_key):
117117
source_signal_mappings = {i['source']:i['db_source'] for i in
118118
meta_response.json()}
119119

120-
response = Epidata._request("covidcast_meta")
121-
122-
if response["result"] != 1:
123-
# Something failed in the API and we did not get real metadata
124-
raise RuntimeError("Error when fetching metadata from the API",
125-
response["message"])
126-
127-
meta = pd.DataFrame.from_dict(response["epidata"])
120+
meta = metadata()
128121

129122
source_meta = meta[meta['data_source'] == data_source]
130123
# Need to convert np.records to tuples so they are hashable and can be used in sets and dicts.
@@ -169,39 +162,8 @@ def fetch_api_reference(data_source, start_date, end_date, geo_type, signal_type
169162
Formatting is changed to match that of source data CSVs.
170163
"""
171164
with warnings.catch_warnings():
172-
warnings.simplefilter("ignore")
173-
# api_df = covidcast.signal(
174-
# data_source, signal_type, start_date, end_date, geo_type)
175-
176-
response = Epidata.covidcast(data_source, signal_type, time_type="day",
177-
geo_type=geo_type, time_values=[start_date, end_date],
178-
geo_value="*", as_of=None,
179-
issues=None, lag=None)
180-
181-
if response["result"] != 1:
182-
# Something failed in the API and we did not get real metadata
183-
raise RuntimeError("Error when fetching metadata from the API",
184-
response["message"])
185-
186-
api_df = pd.DataFrame.from_dict(response["epidata"])
187-
188-
# # Two possible error conditions: no data or too much data.
189-
# if day_data["message"] == "no results":
190-
# warnings.warn(f"No {data_source} {signal} data found on {day_str} "
191-
# f"for geography '{geo_type}'",
192-
# NoDataWarning)
193-
# if day_data["message"] not in {"success", "no results"}:
194-
# warnings.warn(f"Problem obtaining {data_source} {signal} data on {day_str} "
195-
# f"for geography '{geo_type}': {day_data['message']}",
196-
# RuntimeWarning)
197-
#
198-
# # In the too-much-data case, we continue to try putting the truncated
199-
# # data in our results. In the no-data case, skip this day entirely,
200-
# # since there is no "epidata" in the response.
201-
# if day_data.get("epidata"):
202-
# dfs.append(pd.DataFrame.from_dict(day_data["epidata"]))
203-
# cur_day += timedelta(1) if time_type == "day" else timedelta(7)
204-
#
165+
api_df = signal(data_source, signal_type, start_date, end_date, geo_type)
166+
205167

206168
error_context = f"when fetching reference data from {start_date} to {end_date} " +\
207169
f"for data source: {data_source}, signal type: {signal_type}, geo type: {geo_type}"

0 commit comments

Comments
 (0)