15
15
import math
16
16
17
17
from delphi .epidata .acquisition .rvdss .constants import (
18
- DASHBOARD_BASE_URLS_2023 , HISTORIC_SEASON_URL ,
18
+ DASHBOARD_BASE_URLS_2023_2024_SEASON , HISTORIC_SEASON_URL ,
19
19
ALTERNATIVE_SEASON_BASE_URL , SEASON_BASE_URL , LAST_WEEK_OF_YEAR ,
20
20
RESP_COUNTS_OUTPUT_FILE , POSITIVE_TESTS_OUTPUT_FILE
21
21
)
@@ -91,9 +91,8 @@ def get_report_date(week,start_year,epi=False):
91
91
report_date = str (epi_week )
92
92
93
93
return (report_date )
94
-
95
94
96
- def parse_table_captions (soup ):
95
+ def extract_captions_of_interest (soup ):
97
96
"""
98
97
finds all the table captions for the current week so tables can be identified
99
98
@@ -369,6 +368,8 @@ def create_percent_positive_detection_table(table,modified_date,start_year, flu=
369
368
return (table )
370
369
371
370
def get_season_reports (url ):
371
+ # From the url, go to the main landing page for a season
372
+ # which contains all the links to each week in the season
372
373
page = requests .get (url )
373
374
soup = BeautifulSoup (page .text ,'html.parser' )
374
375
@@ -387,7 +388,9 @@ def get_season_reports(url):
387
388
current_week = weeks [week_num ]
388
389
current_week_end = end_dates [week_num ]
389
390
390
- # Skip empty pages
391
+ # In the 2019-2020 season, the webpages for weeks 5 and 47 only have
392
+ # the abbreviations table and the headers for the respiratory detections
393
+ # table, so they are effectively empty, and skipped
391
394
if season [0 ] == '2019' :
392
395
if current_week == 5 or current_week == 47 :
393
396
continue
@@ -396,7 +399,7 @@ def get_season_reports(url):
396
399
temp_url = urls [week_num ]
397
400
temp_page = requests .get (temp_url )
398
401
new_soup = BeautifulSoup (temp_page .text , 'html.parser' )
399
- captions = parse_table_captions (new_soup )
402
+ captions = extract_captions_of_interest (new_soup )
400
403
modified_date = get_modified_dates (new_soup ,current_week_end )
401
404
402
405
positive_tables = []
@@ -405,55 +408,87 @@ def get_season_reports(url):
405
408
caption = captions [i ]
406
409
tab = caption .find_next ('table' )
407
410
408
- # Remove footers from tables
411
+ # Remove footers from tables so the text isn't read in as a table row
409
412
if tab .find ('tfoot' ):
410
413
tab .tfoot .decompose ()
411
414
412
- # Delete duplicate entry from week 35 of the 2019-2020 season
415
+ # In the positive adenovirus table in week 35 of the 2019-2020 season
416
+ # the week number has been duplicated, which shifts all the entries in the table
417
+ # one column to the right of where they should be. To fix this, the duplicated
418
+ # entry in the table (which is the first "td" element in the html) is deleted
413
419
if season [0 ] == '2019' and current_week == 35 :
414
420
if "Positive Adenovirus" in caption .text :
415
421
tab .select_one ('td' ).decompose ()
416
422
417
423
# Replace commas with periods
424
+ # Some "number of detections" tables have numbers with commas (e.g. 1,000)
425
+ # In this case the commas must be deleted; otherwise they are turned into periods
426
+ # because some tables have commas instead of decimal points
418
427
if "number" not in caption .text .lower ():
419
428
tab = re .sub ("," ,r"." ,str (tab ))
420
429
else :
421
430
tab = re .sub ("," ,"" ,str (tab ))
422
431
423
- # Read table
432
+ # Read table, coding all the abbreviations for missing data into NA
433
+ # Also use dropna because removing footers causes the html to have an empty row
424
434
na_values = ['N.A.' ,'N.A' , 'N.C.' ,'N.R.' ,'Not Available' ,'Not Tested' ,"N.D." ,"-" ]
425
435
table = pd .read_html (tab ,na_values = na_values )[0 ].dropna (how = "all" )
426
436
427
437
# Check for multiline headers
438
+ # If there are any, combine them into a single line header
428
439
if isinstance (table .columns , pd .MultiIndex ):
429
440
table .columns = [c [0 ] + " " + c [1 ] if c [0 ] != c [1 ] else c [0 ] for c in table .columns ]
430
441
431
442
# Make column names lowercase
432
443
table .columns = table .columns .str .lower ()
433
444
445
+ # One-off edge cases where tables need to be manually adjusted because
446
+ # they will cause errors otherwise
434
447
if season [0 ] == '2017' :
435
448
if current_week == 35 and "entero" in caption .text .lower ():
436
- # Remove french from headers in week 35 for the entero table
449
+ # The positive enterovirus table in week 35 of the 2017-2018 season has French
450
+ # in the headers, so the French needs to be removed
437
451
table .columns = ['week' , 'week end' , 'canada tests' , 'entero/rhino%' , 'at tests' ,
438
452
'entero/rhino%.1' , 'qc tests' , 'entero/rhino%.2' , 'on tests' ,
439
453
'entero/rhino%.3' , 'pr tests' , 'entero/rhino%.4' , 'bc tests' ,
440
454
'entero/rhino%.5' ]
441
455
elif current_week == 35 and "adeno" in caption .text .lower ():
442
- # Remove > from column name
456
+ # In week 35 of the 2017-2018 season, the positive adenovirus table has ">week end"
457
+ # instead of "week end", so remove > from the column
443
458
table = table .rename (columns = {'>week end' :"week end" })
444
459
elif current_week == 47 and "rsv" in caption .text .lower ():
445
- # fix date written as 201-11-25
460
+ # In week 47 of the 2017-2018 season, a date is written as 201-11-25,
461
+ # instead of 2017-11-25
446
462
table .loc [table ['week' ] == 47 , 'week end' ] = "2017-11-25"
447
463
elif season [0 ] == '2015' and current_week == 41 :
448
- # Fix date written m-d-y not d-m-y
464
+ # In week 41 of the 2015-2016 season, a date is written in m-d-y format instead of d-m-y
449
465
table = table .replace ("10-17-2015" ,"17-10-2015" ,regex = True )
450
466
elif season [0 ] == '2022' and current_week == 11 and "hmpv" in caption .text .lower ():
451
- # fix date written as 022-09-03
467
+ # In week 11 of the 2022-2023 season, in the positive hmpv table,
468
+ # a date is written as 022-09-03, instead of 2022-09-03
452
469
table .loc [table ['week' ] == 35 , 'week end' ] = "2022-09-03"
453
470
454
471
# Rename columns
455
472
table = preprocess_table_columns (table )
456
473
474
+ # If "reporting laboratory" is one of the columns of the table, the table must be
475
+ # the "Respiratory virus detections " table for a given week
476
+ # this is the lab level table that has weekly positive tests for each virus, with no revisions
477
+ # and each row represents a lab
478
+
479
+ # If "number" is in the table caption, the table must be the
480
+ # "Number of positive respiratory detections" table, for a given week
481
+ # this is a national level table, reporting the number of detections for each virus,
482
+ # this table has revisions, so each row is a week in the season, with weeks going from the
483
+ # start of the season up to and including the current week
484
+
485
+ # If "positive" is in the table caption, the table must be one of the
486
+ # "Positive [virus] Tests (%)" table, for a given week
487
+ # This is a region level table, reporting the total tests and percent positive tests for each virus,
488
+ # this table has revisions, so each row is a week in the season, with weeks going from the
489
+ # start of the season up to and including the current week
490
+ # The columns have the region information (e.g. Pr tests, meaning this column has the tests for the prairies)
491
+
457
492
if "reporting laboratory" in str (table .columns ):
458
493
respiratory_detection_table = create_detections_table (table ,modified_date ,current_week ,current_week_end ,season [0 ])
459
494
respiratory_detection_table = respiratory_detection_table .set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
@@ -465,9 +500,13 @@ def get_season_reports(url):
465
500
flu = " influenza" in caption .text .lower ()
466
501
467
502
# tables are missing week 53
468
- if season [0 ]== "2014" and current_week == 2 :
469
- overwrite_weeks = True
470
- elif season [0 ]== "2014" and current_week == 3 :
503
+ # In the 2014-2015 season the year ends at week 53 before starting at week 1 again.
504
+ # weeks 53,2 and 3 skip week 53 in the positive detection tables, going from 52 to 1,
505
+ # this means the week numbers following 52 are 1 larger than they should be
506
+ # fix this by overwriting the week number columns
507
+
508
+ missing_week_53 = [53 ,2 ,3 ]
509
+ if season [0 ]== "2014" and current_week in missing_week_53 :
471
510
overwrite_weeks = True
472
511
else :
473
512
overwrite_weeks = False
@@ -491,6 +530,8 @@ def get_season_reports(url):
491
530
492
531
# Check if the indices are already in the season table
493
532
# If not, add the weeks tables into the season table
533
+
534
+ # check for deduplication pandas
494
535
if not respiratory_detection_table .index .isin (all_respiratory_detection_table .index ).any ():
495
536
all_respiratory_detection_table = pd .concat ([all_respiratory_detection_table ,respiratory_detection_table ])
496
537
@@ -519,7 +560,7 @@ def main():
519
560
old_detection_data = pd .read_csv ('season_2023_2024/' + RESP_COUNTS_OUTPUT_FILE ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
520
561
old_positive_data = pd .read_csv ('season_2023_2024/' + POSITIVE_TESTS_OUTPUT_FILE ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
521
562
522
- for base_url in DASHBOARD_BASE_URLS_2023 :
563
+ for base_url in DASHBOARD_BASE_URLS_2023_2024_SEASON :
523
564
# Get weekly dashboard data
524
565
weekly_data = get_weekly_data (base_url ,2023 ).set_index (['epiweek' , 'time_value' , 'issue' , 'geo_type' , 'geo_value' ])
525
566
positive_data = get_revised_data (base_url )
0 commit comments