
Commit ecca542

add more comments
1 parent 00f3f9a commit ecca542

2 files changed: +60 -19 lines changed


src/acquisition/rvdss/constants.py

Lines changed: 2 additions & 2 deletions
@@ -53,7 +53,7 @@
 # Construct dashboard and data report URLS.
 DASHBOARD_BASE_URL = "https://health-infobase.canada.ca/src/data/respiratory-virus-detections/"
 DASHBOARD_W_DATE_URL = DASHBOARD_BASE_URL + "archive/{date}/"
-DASHBOARD_BASE_URLS_2023 = (
+DASHBOARD_BASE_URLS_2023_2024_SEASON = (
     DASHBOARD_W_DATE_URL.format(date = date) for date in
     (
         "2024-06-20",
@@ -72,7 +72,7 @@

 SEASON_BASE_URL = "https://www.canada.ca"
 ALTERNATIVE_SEASON_BASE_URL = "www.phac-aspc.gc.ca/bid-bmi/dsd-dsm/rvdi-divr/"
-HISTORIC_SEASON_REPORTS_URL = "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"
+HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL+"/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"

 # Each URL created here points to a list of all data reports made during that
 # season, e.g.
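
The fix above turns HISTORIC_SEASON_REPORTS_URL from a bare path into a fully qualified URL. As a quick illustration, here is a minimal sketch of how the corrected constant expands into a season landing page; the year_range value "2018-2019" is an assumption for the example, not something taken from this commit:

# Sketch only: constants copied from the diff above; the year_range value is illustrative.
SEASON_BASE_URL = "https://www.canada.ca"
HISTORIC_SEASON_REPORTS_URL = SEASON_BASE_URL + "/en/public-health/services/surveillance/respiratory-virus-detections-canada/{year_range}.html"

url = HISTORIC_SEASON_REPORTS_URL.format(year_range="2018-2019")
print(url)
# https://www.canada.ca/en/public-health/services/surveillance/respiratory-virus-detections-canada/2018-2019.html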

src/acquisition/rvdss/rvdss_historic.py

Lines changed: 58 additions & 17 deletions
@@ -15,7 +15,7 @@
 import math

 from delphi.epidata.acquisition.rvdss.constants import (
-    DASHBOARD_BASE_URLS_2023, HISTORIC_SEASON_URL,
+    DASHBOARD_BASE_URLS_2023_2024_SEASON, HISTORIC_SEASON_URL,
     ALTERNATIVE_SEASON_BASE_URL, SEASON_BASE_URL, LAST_WEEK_OF_YEAR,
     RESP_COUNTS_OUTPUT_FILE, POSITIVE_TESTS_OUTPUT_FILE
 )
@@ -91,9 +91,8 @@ def get_report_date(week,start_year,epi=False):
         report_date = str(epi_week)

     return(report_date)
-

-def parse_table_captions(soup):
+def extract_captions_of_interest(soup):
     """
     finds all the table captions for the current week so tables can be identified
@@ -369,6 +368,8 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
     return(table)

 def get_season_reports(url):
+    # From the url, go to the main landing page for a season
+    # which contains all the links to each week in the season
     page=requests.get(url)
     soup=BeautifulSoup(page.text,'html.parser')


@@ -387,7 +388,9 @@ def get_season_reports(url):
         current_week = weeks[week_num]
         current_week_end = end_dates[week_num]

-        # Skip empty pages
+        # In the 2019-2020 season, the webpages for weeks 5 and 47 only have
+        # the abbreviations table and the headers for the respiratory detections
+        # table, so they are effectively empty and are skipped
         if season[0] == '2019':
             if current_week == 5 or current_week == 47:
                 continue
@@ -396,7 +399,7 @@ def get_season_reports(url):
         temp_url=urls[week_num]
         temp_page=requests.get(temp_url)
         new_soup = BeautifulSoup(temp_page.text, 'html.parser')
-        captions = parse_table_captions(new_soup)
+        captions = extract_captions_of_interest(new_soup)
         modified_date = get_modified_dates(new_soup,current_week_end)

         positive_tables=[]
@@ -405,55 +408,87 @@ def get_season_reports(url):
             caption=captions[i]
             tab = caption.find_next('table')

-            # Remove footers from tables
+            # Remove footers from tables so the text isn't read in as a table row
             if tab.find('tfoot'):
                 tab.tfoot.decompose()

-            # Delete duplicate entry from week 35 of the 2019-2020 season
+            # In the positive adenovirus table in week 35 of the 2019-2020 season,
+            # the week number has been duplicated, which shifts all the entries in the
+            # table one column to the right of where they should be. To fix this, the
+            # duplicated entry (which is the first "td" element in the html) is deleted
             if season[0] == '2019' and current_week == 35:
                 if "Positive Adenovirus" in caption.text:
                     tab.select_one('td').decompose()

             # Replace commas with periods
+            # Some "number of detections" tables have numbers with commas (e.g. 1,000);
+            # in this case the commas must be deleted. Otherwise commas are turned into periods,
+            # because some tables have commas instead of decimal points
             if "number" not in caption.text.lower():
                 tab = re.sub(",",r".",str(tab))
             else:
                 tab = re.sub(",","",str(tab))

-            # Read table
+            # Read table, coding all the abbreviations for missing data into NA
+            # Also use dropna because removing footers causes the html to have an empty row
             na_values = ['N.A.','N.A', 'N.C.','N.R.','Not Available','Not Tested',"N.D.","-"]
             table = pd.read_html(tab,na_values=na_values)[0].dropna(how="all")

             # Check for multiline headers
+            # If there are any, combine them into a single line header
             if isinstance(table.columns, pd.MultiIndex):
                 table.columns = [c[0] + " " + c[1] if c[0] != c[1] else c[0] for c in table.columns]

             # Make column names lowercase
             table.columns=table.columns.str.lower()

+            # One-off edge cases where tables need to be manually adjusted because
+            # they will cause errors otherwise
             if season[0] == '2017':
                 if current_week == 35 and "entero" in caption.text.lower():
-                    # Remove french from headers in week 35 for the entero table
+                    # The positive enterovirus table in week 35 of the 2017-2018 season has french
+                    # in the headers, so the french needs to be removed
                     table.columns = ['week', 'week end', 'canada tests', 'entero/rhino%', 'at tests',
                                      'entero/rhino%.1', 'qc tests', 'entero/rhino%.2', 'on tests',
                                      'entero/rhino%.3', 'pr tests', 'entero/rhino%.4', 'bc tests',
                                      'entero/rhino%.5']
                 elif current_week == 35 and "adeno" in caption.text.lower():
-                    # Remove > from column name
+                    # In week 35 of the 2017-2018 season, the positive adenovirus table has ">week end"
+                    # instead of "week end", so remove > from the column name
                     table = table.rename(columns={'>week end':"week end"})
                 elif current_week == 47 and "rsv" in caption.text.lower():
-                    # fix date written as 201-11-25
+                    # In week 47 of the 2017-2018 season, a date is written as 201-11-25
+                    # instead of 2017-11-25
                     table.loc[table['week'] == 47, 'week end'] = "2017-11-25"
             elif season[0] == '2015' and current_week == 41:
-                # Fix date written m-d-y not d-m-y
+                # In week 41 of the 2015-2016 season, a date is written in m-d-y format instead of d-m-y
                 table=table.replace("10-17-2015","17-10-2015",regex=True)
             elif season[0] == '2022' and current_week == 11 and "hmpv" in caption.text.lower():
-                # fix date written as 022-09-03
+                # In week 11 of the 2022-2023 season, in the positive hmpv table,
+                # a date is written as 022-09-03 instead of 2022-09-03
                 table.loc[table['week'] == 35, 'week end'] = "2022-09-03"

             # Rename columns
             table= preprocess_table_columns(table)

+            # If "reporting laboratory" is one of the columns of the table, the table must be
+            # the "Respiratory virus detections" table for a given week.
+            # This is the lab level table that has weekly positive tests for each virus, with no revisions,
+            # and each row represents a lab
+
+            # If "number" is in the table caption, the table must be the
+            # "Number of positive respiratory detections" table for a given week.
+            # This is a national level table, reporting the number of detections for each virus.
+            # This table has revisions, so each row is a week in the season, with weeks going from the
+            # start of the season up to and including the current week
+
+            # If "positive" is in the table caption, the table must be one of the
+            # "Positive [virus] Tests (%)" tables for a given week.
+            # This is a region level table, reporting the total tests and percent positive tests for each virus.
+            # This table has revisions, so each row is a week in the season, with weeks going from the
+            # start of the season up to and including the current week.
+            # The columns carry the region information (e.g. Pr tests, meaning this column has the tests for the prairies)
+
             if "reporting laboratory" in str(table.columns):
                 respiratory_detection_table = create_detections_table(table,modified_date,current_week,current_week_end,season[0])
                 respiratory_detection_table = respiratory_detection_table.set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
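
The expanded comma-handling comment above distinguishes commas used as thousands separators in the "number of detections" tables from commas used in place of decimal points elsewhere. Here is a self-contained sketch of that behaviour on made-up HTML fragments (the table contents are illustrative, not taken from the real reports, and pd.read_html needs an HTML parser such as lxml installed):

import re
from io import StringIO

import pandas as pd

# Illustrative fragments, not real report tables.
counts_html = "<table><tr><th>week</th><th>tests</th></tr><tr><td>40</td><td>1,234</td></tr></table>"
percent_html = "<table><tr><th>week</th><th>rsv%</th></tr><tr><td>40</td><td>3,5</td></tr></table>"

# Caption contains "number": commas are thousands separators, so drop them.
cleaned_counts = re.sub(",", "", counts_html)
# Otherwise: commas stand in for decimal points, so turn them into periods.
cleaned_percent = re.sub(",", r".", percent_html)

print(pd.read_html(StringIO(cleaned_counts))[0])   # tests column parses as 1234
print(pd.read_html(StringIO(cleaned_percent))[0])  # rsv% column parses as 3.5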
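
The multiline-header comment refers to tables that pd.read_html parses into a pandas MultiIndex of column labels. A small sketch of the same flattening rule on a made-up frame (the column labels are invented for the example):

import pandas as pd

# Two-level header of the kind produced when a table has merged header rows.
table = pd.DataFrame(
    [[40, 120, 5.0]],
    columns=pd.MultiIndex.from_tuples(
        [("week", "week"), ("canada", "tests"), ("canada", "rsv%")]
    ),
)

# Same rule as the diff: join the two levels with a space unless they repeat the same label.
if isinstance(table.columns, pd.MultiIndex):
    table.columns = [c[0] + " " + c[1] if c[0] != c[1] else c[0] for c in table.columns]

print(list(table.columns))  # ['week', 'canada tests', 'canada rsv%']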
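
The long comment block at the end of the hunk explains how each table on a weekly report page is recognised: by a "reporting laboratory" column, by "number" in the caption, or by "positive" in the caption. A hedged sketch of that three-way dispatch as a standalone helper; classify_table and its return labels are invented here and are not names from the real module:

def classify_table(caption_text, columns):
    caption = caption_text.lower()
    if "reporting laboratory" in str(columns):
        # Lab-level weekly detections table: no revisions, one row per lab.
        return "respiratory detections"
    if "number" in caption:
        # National counts table: revised weekly, one row per week so far this season.
        return "number of detections"
    if "positive" in caption:
        # Region-level percent-positive table: revised weekly, one row per week so far.
        return "positive tests"
    return "unknown"

print(classify_table("Positive Respiratory Syncytial Virus Tests (%)", ["week", "pr tests"]))
# positive tests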
@@ -465,9 +500,13 @@ def get_season_reports(url):
                 flu = " influenza" in caption.text.lower()

                 # tables are missing week 53
-                if season[0]=="2014" and current_week==2:
-                    overwrite_weeks=True
-                elif season[0]=="2014" and current_week==3:
+                # In the 2014-2015 season the year ends at week 53 before starting at week 1 again.
+                # Weeks 53, 2 and 3 skip week 53 in the positive detection tables, going from 52 to 1;
+                # this means the week numbers following 52 are 1 larger than they should be.
+                # Fix this by overwriting the week number columns
+
+                missing_week_53 = [53,2,3]
+                if season[0]=="2014" and current_week in missing_week_53:
                     overwrite_weeks=True
                 else:
                     overwrite_weeks=False
@@ -491,6 +530,8 @@ def get_season_reports(url):

         # Check if the indices are already in the season table
         # If not, add the weeks tables into the season table
+
+        # check for deduplication pandas
         if not respiratory_detection_table.index.isin(all_respiratory_detection_table.index).any():
             all_respiratory_detection_table= pd.concat([all_respiratory_detection_table,respiratory_detection_table])

@@ -519,7 +560,7 @@ def main():
     old_detection_data = pd.read_csv('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
     old_positive_data = pd.read_csv('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])

-    for base_url in DASHBOARD_BASE_URLS_2023:
+    for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON:
         # Get weekly dashboard data
         weekly_data = get_weekly_data(base_url,2023).set_index(['epiweek', 'time_value', 'issue', 'geo_type', 'geo_value'])
         positive_data = get_revised_data(base_url)
