Skip to content

Commit 30f3df6

Browse files
nmdefriescchuong
andauthored
rvdss interface and new fn layout so current/historical data can be easily fetched (#1551)
* add basic sql tables -- needs update with real col names * rename files * add main fn with CLI; remove date range params in package frontend fn stubs * start filling out historical fn stubs * rest of new fn layout. adds CLI * dashboard results can be stored directly in list in fetch_historical_dashboard_data * Add in archived dashboards, and calculate start year from data * address todos and fix historical fetching * Change misspelled CB to BC * Update imports --------- Co-authored-by: cchuong <[email protected]>
1 parent 5696636 commit 30f3df6

File tree

7 files changed

+488
-188
lines changed

7 files changed

+488
-188
lines changed

src/acquisition/rvdss/constants.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from datetime import datetime
2+
13
# The dataset calls the same viruses, provinces, regions (province groups),
24
# and country by multiple names. Map each of those to a common abbreviation.
35
VIRUSES = {
@@ -34,7 +36,7 @@
3436
"saskatchewan":"sk",
3537
"alberta": "ab",
3638
"british columbia" :"bc",
37-
"yukon" : "yk",
39+
"yukon" : "yt",
3840
"northwest territories" : "nt",
3941
"nunavut" : "nu",
4042
"canada":"ca",
@@ -54,6 +56,8 @@
5456
# Construct dashboard and data report URLS.
5557
DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
5658
DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"
59+
60+
# May not need this since we write a function for this in pull_historic
5761
DASHBOARD_BASE_URLS_2023_2024_SEASON = (
5862
DASHBOARD_W_DATE_URL.format(date = date) for date in
5963
(
@@ -74,6 +78,7 @@
7478
SEASON_BASE_URL = "https://www.canada.ca"
7579
ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
7680
HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL+"/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"
81+
DASHBOARD_ARCHIVED_DATES_URL= "https://health-infobase.canada.ca/src/js/respiratory-virus-detections/ArchiveData.json"
7782

7883
# Each URL created here points to a list of all data reports made during that
7984
# season, e.g.
@@ -82,7 +87,7 @@
8287
# disease data in a dashboard with a static URL. Therefore, this collection
8388
# of URLs does _NOT_ need to be updated. It is used for fetching historical
8489
# data (for dates on or before June 8, 2024) only.
85-
HISTORIC_SEASON_URL = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
90+
HISTORIC_SEASON_URLS = (HISTORIC_SEASON_REPORTS_URL.format(year_range = year_range) for year_range in
8691
(
8792
"2013-2014",
8893
"2014-2015",
@@ -101,7 +106,12 @@
101106
DASHBOARD_UPDATE_DATE_FILE = "RVD_UpdateDate.csv"
102107
DASHBOARD_DATA_FILE = "RVD_WeeklyData.csv"
103108

104-
RESP_COUNTS_OUTPUT_FILE = "respiratory_detections.csv"
109+
110+
RESP_DETECTIONS_OUTPUT_FILE = "respiratory_detections.csv"
105111
POSITIVE_TESTS_OUTPUT_FILE = "positive_tests.csv"
112+
COUNTS_OUTPUT_FILE = "number_of_detections.csv"
113+
114+
FIRST_WEEK_OF_YEAR = 35
106115

107-
LAST_WEEK_OF_YEAR = 35
116+
UPDATE_DATES_FILE = "update_dates.txt"
117+
NOW = datetime.now()

src/acquisition/rvdss/database.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
"""
2+
===============
3+
=== Purpose ===
4+
===============
5+
6+
Stores data provided by rvdss Corp., which contains flu lab test results.
7+
See: rvdss.py
8+
9+
10+
=======================
11+
=== Data Dictionary ===
12+
=======================
13+
14+
`rvdss` is the table where rvdss data is stored.
15+
+----------+-------------+------+-----+---------+----------------+
16+
| Field | Type | Null | Key | Default | Extra |
17+
+----------+-------------+------+-----+---------+----------------+
18+
| id | int(11) | NO | PRI | NULL | auto_increment |
19+
| location | varchar(8) | NO | MUL | NULL | |
20+
| epiweek | int(11) | NO | MUL | NULL | |
21+
| value | float | NO | | NULL | |
22+
+----------+-------------+------+-----+---------+----------------+
23+
id: unique identifier for each record
24+
location: hhs1-10
25+
epiweek: the epiweek during which the queries were executed
26+
value: number of total test records per facility, within each epiweek
27+
28+
=================
29+
=== Changelog ===
30+
=================
31+
2017-12-14:
32+
* add "need update" check
33+
34+
2017-12-02:
35+
* original version
36+
"""
37+
38+
# standard library
39+
import argparse
40+
41+
# third party
42+
import mysql.connector
43+
44+
# first party
45+
from delphi.epidata.acquisition.rvdss import rvdss
46+
import delphi.operations.secrets as secrets
47+
from delphi.utils.epidate import EpiDate
48+
import delphi.utils.epiweek as flu
49+
from delphi.utils.geo.locations import Locations
50+
51+
LOCATIONS = Locations.hhs_list
52+
DATAPATH = "/home/automation/rvdss_data"
53+
54+
55+
def update(locations, first=None, last=None, force_update=False, load_email=True):
56+
# download and prepare data first
57+
qd = rvdss.rvdssData(DATAPATH, load_email)
58+
if not qd.need_update and not force_update:
59+
print("Data not updated, nothing needs change.")
60+
return
61+
62+
qd_data = qd.load_csv()
63+
qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4)
64+
qd_ts = rvdss.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last)
65+
# connect to the database
66+
u, p = secrets.db.epi
67+
cnx = mysql.connector.connect(user=u, password=p, database="epidata")
68+
cur = cnx.cursor()
69+
70+
def get_num_rows():
71+
cur.execute("SELECT count(1) `num` FROM `rvdss`")
72+
for (num,) in cur:
73+
pass
74+
return num
75+
76+
# check from 4 weeks preceeding the last week with data through this week
77+
cur.execute("SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `rvdss`")
78+
for (ew0, ew1) in cur:
79+
ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4)
80+
ew0 = ew0 if first is None else first
81+
ew1 = ew1 if last is None else last
82+
print(f"Checking epiweeks between {int(ew0)} and {int(ew1)}...")
83+
84+
# keep track of how many rows were added
85+
rows_before = get_num_rows()
86+
87+
# check rvdss for new and/or revised data
88+
sql = """
89+
INSERT INTO
90+
`rvdss` (`location`, `epiweek`, `value`)
91+
VALUES
92+
(%s, %s, %s)
93+
ON DUPLICATE KEY UPDATE
94+
`value` = %s
95+
"""
96+
97+
total_rows = 0
98+
99+
for location in locations:
100+
if location not in qd_ts:
101+
continue
102+
ews = sorted(qd_ts[location].keys())
103+
num_missing = 0
104+
for ew in ews:
105+
v = qd_ts[location][ew]
106+
sql_data = (location, ew, v, v)
107+
cur.execute(sql, sql_data)
108+
total_rows += 1
109+
if v == 0:
110+
num_missing += 1
111+
if num_missing > 0:
112+
print(f" [{location}] missing {int(num_missing)}/{len(ews)} value(s)")
113+
114+
# keep track of how many rows were added
115+
rows_after = get_num_rows()
116+
print(f"Inserted {int(rows_after - rows_before)}/{int(total_rows)} row(s)")
117+
118+
# cleanup
119+
cur.close()
120+
cnx.commit()
121+
cnx.close()

0 commit comments

Comments
 (0)