From 47a8c5242d9186d0baf60cf424d4e0a4c7c47151 Mon Sep 17 00:00:00 2001 From: george haff Date: Thu, 23 Jul 2020 15:33:56 -0400 Subject: [PATCH 01/15] sane *_updated_timestamp field names --- .../covidcast/test_csv_uploading.py | 8 +-- .../covidcast/test_direction_updating.py | 2 +- src/acquisition/covidcast/database.py | 52 +++++++++---------- src/acquisition/covidcast/direction.py | 12 ++--- .../covidcast/direction_updater.py | 30 +++++------ src/ddl/covidcast.sql | 46 ++++++++-------- .../proc_db_backups_pd.py | 12 ++--- tests/acquisition/covidcast/test_database.py | 16 +++--- tests/acquisition/covidcast/test_direction.py | 6 +-- .../covidcast/test_direction_updater.py | 4 +- 10 files changed, 94 insertions(+), 94 deletions(-) diff --git a/integrations/acquisition/covidcast/test_csv_uploading.py b/integrations/acquisition/covidcast/test_csv_uploading.py index 848dea269..0d26a5d60 100644 --- a/integrations/acquisition/covidcast/test_csv_uploading.py +++ b/integrations/acquisition/covidcast/test_csv_uploading.py @@ -192,10 +192,10 @@ def apply_lag(expected_epidata): }) # verify timestamps and default values are reasonable - self.cur.execute('select timestamp1, timestamp2, direction from covidcast') - for timestamp1, timestamp2, direction in self.cur: - self.assertGreater(timestamp1, 0) - self.assertEqual(timestamp2, 0) + self.cur.execute('select value_updated_timestamp, direction_updated_timestamp, direction from covidcast') + for value_updated_timestamp, direction_updated_timestamp, direction in self.cur: + self.assertGreater(value_updated_timestamp, 0) + self.assertEqual(direction_updated_timestamp, 0) self.assertIsNone(direction) # verify that the CSVs were archived diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index e27360a8e..fb33e929c 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -188,7 +188,7 @@ def test_uploading(self): }) # verify secondary timestamps were updated - self.cur.execute('select timestamp2 from covidcast order by id asc') + self.cur.execute('select direction_updated_timestamp from covidcast order by id asc') timestamps = [t for (t,) in self.cur] for t in timestamps[:6]: # first 6 rows had `direction` updated diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 8a664e8c7..dc6ef0ddc 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -42,7 +42,7 @@ def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, v self.value = value # ... self.stderr = stderr # ... 
self.sample_size = sample_size # from CSV row - self.timestamp2 = 0 + self.direction_updated_timestamp = 0 self.direction = None self.issue = issue self.lag = lag @@ -101,8 +101,8 @@ def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): sql = ''' INSERT INTO `covidcast` (`id`, `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, - `timestamp1`, `value`, `stderr`, `sample_size`, - `timestamp2`, `direction`, + `value_updated_timestamp`, `value`, `stderr`, `sample_size`, + `direction_updated_timestamp`, `direction`, `issue`, `lag`) VALUES (0, %s, %s, %s, %s, %s, %s, @@ -110,12 +110,12 @@ def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): 0, NULL, %s, %s) ON DUPLICATE KEY UPDATE - `timestamp1` = VALUES(`timestamp1`), + `value_updated_timestamp` = VALUES(`value_updated_timestamp`), `value` = VALUES(`value`), `stderr` = VALUES(`stderr`), `sample_size` = VALUES(`sample_size`) ''' - # TODO: ^ do we want to reset `timestamp2` and `direction` in the duplicate key case? + # TODO: ^ do we want to reset `direction_updated_timestamp` and `direction` in the duplicate key case? # TODO: consider handling cc_rows as a generator instead of a list num_rows = len(cc_rows) @@ -175,7 +175,7 @@ def insert_or_update( INSERT INTO `covidcast` VALUES (0, %s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s) ON DUPLICATE KEY UPDATE - `timestamp1` = VALUES(`timestamp1`), + `value_updated_timestamp` = VALUES(`value_updated_timestamp`), `value` = VALUES(`value`), `stderr` = VALUES(`stderr`), `sample_size` = VALUES(`sample_size`) @@ -246,7 +246,7 @@ def update_direction( UPDATE `covidcast` SET - `timestamp2` = UNIX_TIMESTAMP(NOW()), + `direction_updated_timestamp` = UNIX_TIMESTAMP(NOW()), `direction` = %s WHERE `source` = %s AND @@ -286,9 +286,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type` varchar(12), `geo_value` varchar(12), `time_value` int(11), - `timestamp1` int(11), + `value_updated_timestamp` int(11), `value` double, - `timestamp2` int(11), + `direction_updated_timestamp` int(11), `direction` int(11), PRIMARY KEY(`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; @@ -305,9 +305,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value`, `time_value`, - `timestamp1`, + `value_updated_timestamp`, `value`, - `timestamp2`, + `direction_updated_timestamp`, `direction` FROM ( @@ -358,7 +358,7 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value` HAVING - MAX(`timestamp1`) > MIN(`timestamp2`) + MAX(`value_updated_timestamp`) > MIN(`direction_updated_timestamp`) ''' # A query that selects rows of the time-series selected by stale_ts_key_sql query. @@ -372,9 +372,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value`, `time_value`, - `timestamp1`, + `value_updated_timestamp`, `value`, - `timestamp2`, + `direction_updated_timestamp`, `direction` FROM ({stale_ts_key_sql}) AS t2 LEFT JOIN `latest_issues` AS t3 @@ -424,8 +424,8 @@ def drop_temporary_table(self, tmp_table_name): sql = f'DROP TEMPORARY TABLE `{tmp_table_name}`;' self._cursor.execute(sql) - def update_timestamp2_from_temporary_table(self, tmp_table_name): - """Updates the `timestamp2` column of `covidcast` table for all the rows with id value in `tmp_table_name`. 
+ def update_direction_updated_timestamp_from_temporary_table(self, tmp_table_name): + """Updates the `direction_updated_timestamp` column of `covidcast` table for all the rows with id value in `tmp_table_name`. `tmp_table_name`: name of the temporary table. """ @@ -437,7 +437,7 @@ def update_timestamp2_from_temporary_table(self, tmp_table_name): ON `covidcast`.id=t.id SET - `covidcast`.timestamp2=UNIX_TIMESTAMP(NOW()) + `covidcast`.direction_updated_timestamp=UNIX_TIMESTAMP(NOW()) ''' self._cursor.execute(sql) @@ -457,8 +457,8 @@ def get_keys_with_potentially_stale_direction(self): `signal`, `geo_type`, `geo_value`, - MAX(`timestamp1`) AS `max_timestamp1`, - MIN(`timestamp2`) AS `min_timestamp2`, + MAX(`value_updated_timestamp`) AS `max_value_updated_timestamp`, + MIN(`direction_updated_timestamp`) AS `min_direction_updated_timestamp`, MIN(`time_value`) AS `min_day`, MAX(`time_value`) AS `max_day`, COUNT(1) AS `series_length` @@ -473,7 +473,7 @@ def get_keys_with_potentially_stale_direction(self): `geo_type`, `geo_value` HAVING - MAX(`timestamp1`) > MIN(`timestamp2`) + MAX(`value_updated_timestamp`) > MIN(`direction_updated_timestamp`) ''' self._cursor.execute(sql) @@ -488,8 +488,8 @@ def get_daily_timeseries_for_direction_update( DATEDIFF(`time_value`, %s) AS `offset`, `time_value` AS `day`, `value`, - `timestamp1`, - `timestamp2` + `value_updated_timestamp`, + `direction_updated_timestamp` FROM `covidcast` WHERE @@ -507,9 +507,9 @@ def get_daily_timeseries_for_direction_update( self._cursor.execute(sql, args) return list(self._cursor) - def update_timeseries_timestamp2( + def update_timeseries_direction_updated_timestamp( self, source, signal, time_type, geo_type, geo_value): - """Update the `timestamp2` column for an entire time-series. + """Update the `direction_updated_timestamp` column for an entire time-series. For daily time-series, this implies that all `direction` values in the specified time-series are confirmed fresh as of the current time. Even if @@ -523,7 +523,7 @@ def update_timeseries_timestamp2( UPDATE `covidcast` SET - `timestamp2` = UNIX_TIMESTAMP(NOW()) + `direction_updated_timestamp` = UNIX_TIMESTAMP(NOW()) WHERE `source` = %s AND `signal` = %s AND @@ -552,7 +552,7 @@ def get_covidcast_meta(self): MAX(`value`) AS `max_value`, ROUND(AVG(`value`),7) AS `mean_value`, ROUND(STD(`value`),7) AS `stdev_value`, - MAX(`timestamp1`) AS `last_update`, + MAX(`value_updated_timestamp`) AS `last_update`, MAX(`issue`) as `max_issue`, MIN(`lag`) as `min_lag`, MAX(`lag`) as `max_lag` diff --git a/src/acquisition/covidcast/direction.py b/src/acquisition/covidcast/direction.py index 7444a8340..f364ae6be 100644 --- a/src/acquisition/covidcast/direction.py +++ b/src/acquisition/covidcast/direction.py @@ -64,8 +64,8 @@ def scan_timeseries( offsets, days, values, - timestamp1s, - timestamp2s, + value_updated_timestamps, + direction_updated_timestamps, get_direction_impl): """Scan an entire time-series and return fresh direction updates. @@ -73,9 +73,9 @@ def scan_timeseries( each day in `days` `days`: day (YYYYMMDD) corresponding to each row in the other arrays `values`: value of the signal on each day - `timestamp1s`: primary timestamp for each row (i.e. when `value` was + `value_updated_timestamps`: primary timestamp for each row (i.e. when `value` was updated) - `timestamp2s`: secondary timestamp for each row (i.e. when `direction` was + `direction_updated_timestamps`: secondary timestamp for each row (i.e. 
when `direction` was last deemed to be fresh, relative to associated `value`s) `get_direction_impl`: a function which takes two arrays (time and value) and returns a classification of the direction (i.e. as -1, 0, +1) @@ -100,8 +100,8 @@ def scan_timeseries( start += 1 # check whether this row needs an update - direction_time = timestamp2s[end] - value_time = np.max(timestamp1s[start:end + 1]) + direction_time = direction_updated_timestamps[end] + value_time = np.max(value_updated_timestamps[start:end + 1]) if direction_time > value_time: # this row is fresh continue diff --git a/src/acquisition/covidcast/direction_updater.py b/src/acquisition/covidcast/direction_updater.py index d8167c2f7..1183e0949 100644 --- a/src/acquisition/covidcast/direction_updater.py +++ b/src/acquisition/covidcast/direction_updater.py @@ -79,8 +79,8 @@ def update_loop(database, direction_impl=Direction): signal, geo_type, geo_value, - max_timestamp1, - min_timestamp2, + max_value_updated_timestamp, + min_direction_updated_timestamp, min_day, max_day, series_length, @@ -108,7 +108,7 @@ def update_loop(database, direction_impl=Direction): min_day, max_day, series_length, - max_timestamp1 - min_timestamp2, + max_value_updated_timestamp - min_direction_updated_timestamp, ) print(msg % args) @@ -118,12 +118,12 @@ def update_loop(database, direction_impl=Direction): # transpose result set and cast data types data = np.array(timeseries_rows) - offsets, days, values, timestamp1s, timestamp2s = data.T + offsets, days, values, value_updated_timestamps, direction_updated_timestamps = data.T offsets = offsets.astype(np.int64) days = days.astype(np.int64) values = values.astype(np.float64) - timestamp1s = timestamp1s.astype(np.int64) - timestamp2s = timestamp2s.astype(np.int64) + value_updated_timestamps = value_updated_timestamps.astype(np.int64) + direction_updated_timestamps = direction_updated_timestamps.astype(np.int64) # create a direction classifier for this signal data_stdev = data_stdevs[source][signal][geo_type] @@ -135,7 +135,7 @@ def get_direction_impl(x, y): # recompute any stale directions days, directions = direction_impl.scan_timeseries( - offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) + offsets, days, values, value_updated_timestamps, direction_updated_timestamps, get_direction_impl) if be_verbose: print(' computed %d direction updates' % len(directions)) @@ -148,7 +148,7 @@ def get_direction_impl(x, y): source, signal, 'day', geo_type, day, geo_value, direction) # mark the entire time-series as fresh with respect to direction - database.update_timeseries_timestamp2( + database.update_timeseries_direction_updated_timestamp( source, signal, 'day', geo_type, geo_value) @@ -162,7 +162,7 @@ def optimized_update_loop(database, direction_impl=Direction): # A pandas DataFrame that will hold all rows from potentially stale time-series df_all = pd.DataFrame(columns=['id', 'source', 'signal', 'time_type', 'geo_type', 'geo_value', 'time_value', - 'timestamp1', 'value', 'timestamp2', 'direction'], + 'value_updated_timestamp', 'value', 'direction_updated_timestamp', 'direction'], data=database.get_all_record_values_of_timeseries_with_potentially_stale_direction( tmp_table_name)) df_all.drop(columns=['time_type'], inplace=True) @@ -220,15 +220,15 @@ def optimized_update_loop(database, direction_impl=Direction): ts_rows.time_value.min(), ts_rows.time_value.max(), len(ts_rows), - ts_rows.timestamp1.max() - ts_rows.timestamp2.min() + ts_rows.value_updated_timestamp.max() - 
ts_rows.direction_updated_timestamp.min() ) print(msg % args) offsets = ts_rows.offsets.values.astype(np.int64) days = ts_rows.time_value.values.astype(np.int64) values = ts_rows.value.values.astype(np.float64) - timestamp1s = ts_rows.timestamp1.values.astype(np.int64) - timestamp2s = ts_rows.timestamp2.values.astype(np.int64) + value_updated_timestamps = ts_rows.value_updated_timestamp.values.astype(np.int64) + direction_updated_timestamps = ts_rows.direction_updated_timestamp.values.astype(np.int64) # create a direction classifier for this signal data_stdev = data_stdevs[source][signal][geo_type] @@ -240,7 +240,7 @@ def get_direction_impl(x, y): # recompute any stale directions days, directions = direction_impl.scan_timeseries( - offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) + offsets, days, values, value_updated_timestamps, direction_updated_timestamps, get_direction_impl) if be_verbose: print(' computed %d direction updates' % len(directions)) @@ -264,8 +264,8 @@ def get_direction_impl(x, y): for v, id_list in changed_rows.items(): database.batched_update_direction(v, id_list) - # Updating timestamp2 - database.update_timestamp2_from_temporary_table(tmp_table_name) + # Updating direction_updated_timestamp + database.update_direction_updated_timestamp_from_temporary_table(tmp_table_name) # Dropping temporary table database.drop_temporary_table(tmp_table_name) diff --git a/src/ddl/covidcast.sql b/src/ddl/covidcast.sql index 795c5523e..f94259107 100644 --- a/src/ddl/covidcast.sql +++ b/src/ddl/covidcast.sql @@ -8,25 +8,25 @@ Delphi's COVID-19 surveillance streams. Data is public. -+-------------+-------------+------+-----+---------+----------------+ -| Field | Type | Null | Key | Default | Extra | -+-------------+-------------+------+-----+---------+----------------+ -| id | int(11) | NO | PRI | NULL | auto_increment | -| source | varchar(32) | NO | MUL | NULL | | -| signal | varchar(32) | NO | | NULL | | -| time_type | varchar(12) | NO | | NULL | | -| geo_type | varchar(12) | NO | | NULL | | -| time_value | int(11) | NO | | NULL | | -| geo_value | varchar(12) | NO | | NULL | | -| timestamp1 | int(11) | NO | | NULL | | -| value | double | NO | | NULL | | -| stderr | double | YES | | NULL | | -| sample_size | double | YES | | NULL | | -| timestamp2 | int(11) | NO | | NULL | | -| direction | int(11) | YES | | NULL | | -| issue | int(11) | NO | | NULL | | -| lag | int(11) | NO | | NULL | | -+-------------+-------------+------+-----+---------+----------------+ ++------------------------------+-------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++------------------------------+-------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| source | varchar(32) | NO | MUL | NULL | | +| signal | varchar(32) | NO | | NULL | | +| time_type | varchar(12) | NO | | NULL | | +| geo_type | varchar(12) | NO | | NULL | | +| time_value | int(11) | NO | | NULL | | +| geo_value | varchar(12) | NO | | NULL | | +| value_updated_timestamp | int(11) | NO | | NULL | | +| value | double | NO | | NULL | | +| stderr | double | YES | | NULL | | +| sample_size | double | YES | | NULL | | +| direction_updated_timestamp | int(11) | NO | | NULL | | +| direction | int(11) | YES | | NULL | | +| issue | int(11) | NO | | NULL | | +| lag | int(11) | NO | | NULL | | ++------------------------------+-------------+------+-----+---------+----------------+ - `id` unique identifier for each record @@ -48,7 
+48,7 @@ Data is public. - HRR: hospital referral region (HRR) number - DMA: designated market area (DMA) code - state: two-letter state abbreviation -- `timestamp1` +- `value_updated_timestamp` time when primary data (e.g. `value`) was last updated - `value` value (statistic) derived from the underlying data source @@ -56,7 +56,7 @@ Data is public. standard error of the statistic with respect to its sampling distribution - `sample_size` (NULL when not applicable) number of "data points" used in computing the statistic -- `timestamp2` +- `direction_updated_timestamp` time when secondary data (e.g. `direction`) was last updated - `direction` (NULL when not applicable) trend classifier with possible values: @@ -78,12 +78,12 @@ CREATE TABLE `covidcast` ( `time_value` int(11) NOT NULL, `geo_value` varchar(12) NOT NULL, -- "primary" values are derived from the upstream data source - `timestamp1` int(11) NOT NULL, + `value_updated_timestamp` int(11) NOT NULL, `value` double NOT NULL, `stderr` double, `sample_size` double, -- "secondary" values are derived from data in this table - `timestamp2` int(11) NOT NULL, + `direction_updated_timestamp` int(11) NOT NULL, `direction` int(11), `issue` int(11) NOT NULL, `lag` int(11) NOT NULL, diff --git a/src/server/covidcast_issues_migration/proc_db_backups_pd.py b/src/server/covidcast_issues_migration/proc_db_backups_pd.py index e88f86dbd..1aa2cbe1b 100755 --- a/src/server/covidcast_issues_migration/proc_db_backups_pd.py +++ b/src/server/covidcast_issues_migration/proc_db_backups_pd.py @@ -25,7 +25,7 @@ # Column names INDEX_COLS = ["source", "signal", "time_type", "geo_type", "time_value", "geo_value"] -VALUE_COLS = ["timestamp1", "value", "stderr", "sample_size", "timestamp2", "direction"] +VALUE_COLS = ["value_updated_timestamp", "value", "stderr", "sample_size", "direction_updated_timestamp", "direction"] ALL_COLS = INDEX_COLS + VALUE_COLS ALL_COLS_WITH_PK = ["id"] + ALL_COLS @@ -39,11 +39,11 @@ # time_value as str, because we need this parsed as a datetime anyway "time_value": "str", "geo_value": "category", - "timestamp1": "int", + "value_updated_timestamp": "int", "value": "str", "stderr": "str", "sample_size": "str", - "timestamp2": "int", + "direction_updated_timestamp": "int", "direction": "category" } @@ -432,8 +432,8 @@ def pd_csvdiff( # Since df_before is filled with NaN for new indices, new indices turn false in same_mask same_mask = (df_before.reindex(df_after.index) == df_after) - # Ignore timestamp2 in the diff - is_diff = ~(same_mask.loc[:, same_mask.columns != "timestamp2"].all(axis=1)) + # Ignore direction_updated_timestamp in the diff + is_diff = ~(same_mask.loc[:, same_mask.columns != "direction_updated_timestamp"].all(axis=1)) # Removed indices can be found via index difference, but is expensive if find_removals: @@ -469,7 +469,7 @@ def generate_issues( row_fmt = "(" \ "{id},{source},{signal},{time_type},{geo_type},{time_value},{geo_value}," \ - "{row.timestamp1},{row.value},{row.stderr},{row.sample_size},{row.timestamp2},{row.direction}," \ + "{row.value_updated_timestamp},{row.value},{row.stderr},{row.sample_size},{row.direction_updated_timestamp},{row.direction}," \ "{issue},{row.lag})" try: diff --git a/tests/acquisition/covidcast/test_database.py b/tests/acquisition/covidcast/test_database.py index 666d4ca52..b58af60b7 100644 --- a/tests/acquisition/covidcast/test_database.py +++ b/tests/acquisition/covidcast/test_database.py @@ -149,7 +149,7 @@ def test_update_direction_query(self): sql = sql.lower() self.assertIn('update', sql) 
self.assertIn('`covidcast`', sql) - self.assertIn('`timestamp2` = unix_timestamp', sql) + self.assertIn('`direction_updated_timestamp` = unix_timestamp', sql) self.assertIn('`direction` = %s', sql) def test_get_data_stdev_across_locations_query(self): @@ -197,8 +197,8 @@ def test_get_keys_with_potentially_stale_direction_query(self): sql = cursor.execute.call_args[0][0].lower() self.assertIn('select', sql) self.assertIn('`covidcast`', sql) - self.assertIn('timestamp1', sql) - self.assertIn('timestamp2', sql) + self.assertIn('value_updated_timestamp', sql) + self.assertIn('direction_updated_timestamp', sql) def test_get_daily_timeseries_for_direction_update_query(self): """Query to get a daily time-series looks sensible. @@ -232,10 +232,10 @@ def test_get_daily_timeseries_for_direction_update_query(self): sql = sql.lower() self.assertIn('select', sql) self.assertIn('`covidcast`', sql) - self.assertIn('timestamp1', sql) - self.assertIn('timestamp2', sql) + self.assertIn('value_updated_timestamp', sql) + self.assertIn('direction_updated_timestamp', sql) - def test_update_timeseries_timestamp2_query(self): + def test_update_timeseries_direction_updated_timestamp_query(self): """Query to update the secondary timestamp of a time-series looks sensible. NOTE: Actual behavior is tested by integration test. @@ -246,7 +246,7 @@ def test_update_timeseries_timestamp2_query(self): database = Database() database.connect(connector_impl=mock_connector) - database.update_timeseries_timestamp2(*args) + database.update_timeseries_direction_updated_timestamp(*args) connection = mock_connector.connect() cursor = connection.cursor() @@ -259,7 +259,7 @@ def test_update_timeseries_timestamp2_query(self): sql = sql.lower() self.assertIn('update', sql) self.assertIn('`covidcast`', sql) - self.assertIn('timestamp2', sql) + self.assertIn('direction_updated_timestamp', sql) self.assertIn('unix_timestamp(now())', sql) def test_update_covidcast_meta_cache_query(self): diff --git a/tests/acquisition/covidcast/test_direction.py b/tests/acquisition/covidcast/test_direction.py index 529056248..b6f6e2a12 100644 --- a/tests/acquisition/covidcast/test_direction.py +++ b/tests/acquisition/covidcast/test_direction.py @@ -87,7 +87,7 @@ def test_get_direction_validates_arguments(self): def test_scan_timeseries(self): """Scan a time-series and update stale directions.""" - offsets, days, values, timestamp1s, timestamp2s = [ + offsets, days, values, value_updated_timestamps, direction_updated_timestamps = [ # missing days '230', '240', and '250' (the gap helps test windowing) [100, 101, 102, 106, 107, 108], [200, 210, 220, 260, 270, 280], @@ -104,8 +104,8 @@ def test_scan_timeseries(self): offsets, days, values, - timestamp1s, - timestamp2s, + value_updated_timestamps, + direction_updated_timestamps, get_direction_impl) self.assertEqual(days, [210, 280]) diff --git a/tests/acquisition/covidcast/test_direction_updater.py b/tests/acquisition/covidcast/test_direction_updater.py index 4e73ad1b1..6f73bc49b 100644 --- a/tests/acquisition/covidcast/test_direction_updater.py +++ b/tests/acquisition/covidcast/test_direction_updater.py @@ -110,7 +110,7 @@ def test_update_loop(self): ) self.assertEqual(call_args_list[1][0], expected_args) - self.assertTrue(mock_database.update_timeseries_timestamp2.called) - args = mock_database.update_timeseries_timestamp2.call_args[0] + self.assertTrue(mock_database.update_timeseries_direction_updated_timestamp.called) + args = mock_database.update_timeseries_direction_updated_timestamp.call_args[0] 
expected_args = ('source', 'signal', 'day', 'geo_type', 'geo_value') self.assertEqual(args, expected_args) From d3dd3814a3c8050e9527dbe5c34b72fa4df4f3bb Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Fri, 24 Jul 2020 11:12:31 -0400 Subject: [PATCH 02/15] Updated Direction Updater to be processed in partitions --- .../covidcast/test_direction_updating.py | 2 +- src/acquisition/covidcast/database.py | 6 +- .../covidcast/direction_updater.py | 63 ++++++++++++------- .../covidcast/test_direction_updater.py | 4 +- 4 files changed, 47 insertions(+), 28 deletions(-) diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index fb33e929c..4fa0de039 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -82,7 +82,7 @@ def test_uploading(self): self.cnx.commit() # update direction (only 20200417 has enough history) - args = None + args = get_argument_parser().parse_args('') main(args) # request data from the API diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index dc6ef0ddc..f1566ffa7 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -269,7 +269,8 @@ def update_direction( self._cursor.execute(sql, args) - def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, temporary_table=None): + def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, temporary_table=None, + partition_condition='(TRUE)'): """Return the rows of all daily time-series with potentially stale directions; only rows corresponding to the most recent issue for each time_value are returned. @@ -321,7 +322,8 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t MAX(`issue`) AS `issue` FROM `covidcast` WHERE - `time_type` = 'day' + `time_type` = 'day' AND + {partition_condition} GROUP BY `source`, `signal`, diff --git a/src/acquisition/covidcast/direction_updater.py b/src/acquisition/covidcast/direction_updater.py index 1183e0949..dd0becf62 100644 --- a/src/acquisition/covidcast/direction_updater.py +++ b/src/acquisition/covidcast/direction_updater.py @@ -39,11 +39,26 @@ class Constants: ) +# partition configuration +PARTITION_VARIABLE = 'geo_value' +PARTITION_SPLITS = ["'05101'", "'101'", "'13071'", "'15007'", "'17161'", "'19039'", "'20123'", "'21213'", "'24035'", + "'27005'", "'28115'", "'29510'", "'31161'", "'35100'", "'37117'", "'39081'", "'41013'", "'44140'", + "'47027'", "'48140'", "'48461'", "'51169'", "'55033'"] + + + def get_argument_parser(): """Define command line arguments.""" # there are no flags, but --help will still work - return argparse.ArgumentParser() + parser = argparse.ArgumentParser() + parser.add_argument( + '--partitions', + type=int, + nargs='+', + default=list(range(len(PARTITION_SPLITS) + 1)), # default is to process all partitions + help='a list of integers for indexes of partitions to be processed') + return parser def update_loop(database, direction_impl=Direction): @@ -152,11 +167,19 @@ def get_direction_impl(x, y): source, signal, 'day', geo_type, geo_value) -def optimized_update_loop(database, direction_impl=Direction): +def optimized_update_loop(database, partition_index, direction_impl=Direction): """An optimized implementation of update_loop; finds and updates rows with a stale `direction` value.
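+ Rows are processed one `geo_value` partition at a time: for a given `partition_index` in the range 0..len(PARTITION_SPLITS), only rows whose `geo_value` falls between the corresponding consecutive entries of `PARTITION_SPLITS` are selected (the first and last partitions are unbounded below and above, respectively).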
`database`: an open connection to the epidata database + `partition_index`: the index of the partition to be processed """ + + # constructing the partition condition from the partition index + ge_condition = 'TRUE' if partition_index == 0 else f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index-1]}' + l_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else \ + f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}' + partition_condition = f'({ge_condition}) AND ({l_condition})' + # Name of temporary table, which will store all rows from potentially stale time-series tmp_table_name = 'tmp_ts_rows' @@ -164,7 +187,7 @@ df_all = pd.DataFrame(columns=['id', 'source', 'signal', 'time_type', 'geo_type', 'geo_value', 'time_value', 'value_updated_timestamp', 'value', 'direction_updated_timestamp', 'direction'], data=database.get_all_record_values_of_timeseries_with_potentially_stale_direction( - tmp_table_name)) + tmp_table_name, partition_condition)) df_all.drop(columns=['time_type'], inplace=True) df_all['time_value_datetime'] = pd.to_datetime(df_all.time_value, format="%Y%m%d") df_all.direction = df_all.direction.astype(np.float64) @@ -249,11 +272,6 @@ def get_direction_impl(x, y): ts_pot_changed = ts_rows.set_index('time_value').loc[days] ts_pot_changed['new_direction'] = np.array(directions, np.float64) - # is_eq_nan = ts_pot_changed.direction.isnull() & ts_pot_changed.new_direction.isnull() - # is_eq_num = ts_pot_changed.direction == ts_pot_changed.new_direction - # changed_mask = ~(is_eq_nan | is_eq_num) - # ts_changed = ts_pot_changed[changed_mask] - # Adding changed values to the changed_rows dictionary gb_o = ts_pot_changed.groupby('new_direction') for v in gb_o.groups: @@ -278,21 +296,20 @@ def main( `args`: parsed command-line arguments """ - - database = database_impl() - database.connect() - commit = False - - try: - update_loop_impl(database) - # only commit on success so that directions are consistent with respect - # to methodology - commit = True - finally: - # no catch block so that an exception above will cause the program to - # fail after the following cleanup - database.disconnect(commit) - print('committed=%s' % str(commit)) + for partition_index in args.partitions: + database = database_impl() + database.connect() + commit = False + try: + update_loop_impl(database, partition_index) + # only commit on success so that directions are consistent with respect + # to methodology + commit = True + finally: + # no catch block so that an exception above will cause the program to + # fail after the following cleanup + database.disconnect(commit) + print('partition %d committed=%s' % (partition_index, str(commit))) if __name__ == '__main__': diff --git a/tests/acquisition/covidcast/test_direction_updater.py b/tests/acquisition/covidcast/test_direction_updater.py index 6f73bc49b..64060e07a 100644 --- a/tests/acquisition/covidcast/test_direction_updater.py +++ b/tests/acquisition/covidcast/test_direction_updater.py @@ -20,7 +20,7 @@ def test_get_argument_parser(self): def test_main_successful(self): """Run the main program, and successfully commit changes.""" - args = None + args = MagicMock(partitions=[0, 1]) mock_database = MagicMock() fake_database_impl = lambda: mock_database mock_update_loop = MagicMock() @@ -38,7 +38,7 @@ def test_main_successful(self): def test_main_unsuccessful(self): """Run the main program, but don't commit changes on failure.""" - args = None + args =
MagicMock(partitions=[0, 1]) mock_database = MagicMock() fake_database_impl = lambda: mock_database mock_update_loop = MagicMock(side_effect=Exception('testing')) From 55a8f103c7beff7b330507667677e692d8ee5e2b Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Fri, 24 Jul 2020 11:24:54 -0400 Subject: [PATCH 03/15] Added argument description --- src/acquisition/covidcast/database.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index f1566ffa7..3e73415e3 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -276,6 +276,7 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `temporary_table`: if provided, a temporary table with the name `temporary_table` is created and the result is also stored in that table. + `partition_condition`: a condition that defines the partition to be processed. """ create_tmp_table_sql = f''' From 2b183d70e2b313c0b1df43fbac1cad1fa27f6258 Mon Sep 17 00:00:00 2001 From: george haff Date: Fri, 24 Jul 2020 15:59:25 -0400 Subject: [PATCH 04/15] Added support in DB for `is_wip` flag to (hopefully) speed up cache updates. Also fixed/improved setUp in integrations/server/test_covidcast_meta.py and added skeleton code for new `missing_*` columns. --- .../covidcast/test_covidcast_meta_caching.py | 6 +-- .../covidcast/test_direction_updating.py | 18 ++++---- integrations/client/test_delphi_epidata.py | 12 ++--- integrations/server/test_covidcast.py | 44 +++++++++---------- integrations/server/test_covidcast_meta.py | 15 ++++--- src/acquisition/covidcast/csv_to_database.py | 7 ++- src/acquisition/covidcast/database.py | 28 +++++++----- src/ddl/covidcast.sql | 16 +++++++ .../covidcast/test_csv_to_database.py | 14 +++--- tests/acquisition/covidcast/test_database.py | 1 + 10 files changed, 96 insertions(+), 65 deletions(-) diff --git a/integrations/acquisition/covidcast/test_covidcast_meta_caching.py b/integrations/acquisition/covidcast/test_covidcast_meta_caching.py index eba061406..0b5d5c222 100644 --- a/integrations/acquisition/covidcast/test_covidcast_meta_caching.py +++ b/integrations/acquisition/covidcast/test_covidcast_meta_caching.py @@ -66,14 +66,14 @@ def test_caching(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'state', 20200422, 'pa', - 123, 1, 2, 3, 456, 1, 20200422, 0), + 123, 1, 2, 3, 456, 1, 20200422, 0, False), (0, 'src', 'sig', 'day', 'state', 20200422, 'wa', - 789, 1, 2, 3, 456, 1, 20200423, 1) + 789, 1, 2, 3, 456, 1, 20200423, 1, False) ''') self.cur.execute(''' insert into covidcast values (100, 'src', 'wip_sig', 'day', 'state', 20200422, 'pa', - 456, 4, 5, 6, 789, -1, 20200422, 0) + 456, 4, 5, 6, 789, -1, 20200422, 0, True) ''') self.cnx.commit() diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index fb33e929c..54a33af9a 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -61,23 +61,23 @@ def test_uploading(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', - 123, 2, 0, 0, 0, NULL, 20200228, 0), + 123, 2, 0, 0, 0, NULL, 20200228, 0, False), (0, 'src', 'sig', 'day', 'state', 20200229, 'ca', - 123, 6, 0, 0, 0, NULL, 20200229, 0), + 123, 6, 0, 0, 0, NULL, 20200229, 0, False), (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', - 123,
5, 0, 0, 0, NULL, 20200301, 0), + 123, 5, 0, 0, 0, NULL, 20200301, 0, False), (0, 'src', 'sig', 'day', 'state', 20200511, 'fl', - 123, 1, 0, 0, 0, NULL, 20200511, 0), + 123, 1, 0, 0, 0, NULL, 20200511, 0, False), (0, 'src', 'sig', 'day', 'state', 20200512, 'fl', - 123, 2, 0, 0, 0, NULL, 20200512, 0), + 123, 2, 0, 0, 0, NULL, 20200512, 0, False), (0, 'src', 'sig', 'day', 'state', 20200517, 'fl', - 123, 2, 0, 0, 0, NULL, 20200517, 0), + 123, 2, 0, 0, 0, NULL, 20200517, 0, False), (0, 'src', 'sig', 'day', 'state', 20200615, 'tx', - 123, 9, 0, 0, 456, NULL, 20200615, 0), + 123, 9, 0, 0, 456, NULL, 20200615, 0, False), (0, 'src', 'sig', 'day', 'state', 20200616, 'tx', - 123, 5, 0, 0, 456, NULL, 20200616, 0), + 123, 5, 0, 0, 456, NULL, 20200616, 0, False), (0, 'src', 'sig', 'day', 'state', 20200617, 'tx', - 123, 1, 0, 0, 456, 1, 20200617, 0) + 123, 1, 0, 0, 456, 1, 20200617, 0, False) ''') self.cnx.commit() diff --git a/integrations/client/test_delphi_epidata.py b/integrations/client/test_delphi_epidata.py index fcab62ad0..3f1e57b47 100644 --- a/integrations/client/test_delphi_epidata.py +++ b/integrations/client/test_delphi_epidata.py @@ -50,11 +50,11 @@ def test_covidcast(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0), + 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0, False), (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 456, 5.5, 1.2, 10.5, 789, 0, 20200415, 1), + 456, 5.5, 1.2, 10.5, 789, 0, 20200415, 1, False), (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 345, 6.5, 2.2, 11.5, 678, 0, 20200416, 2) + 345, 6.5, 2.2, 11.5, 678, 0, 20200416, 2, False) ''') self.cnx.commit() @@ -157,11 +157,11 @@ def test_covidcast_meta(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0), + 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0, False), (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 345, 6.0, 2.2, 11.5, 678, 0, 20200416, 2), + 345, 6.0, 2.2, 11.5, 678, 0, 20200416, 2, False), (0, 'src', 'sig', 'day', 'county', 20200415, '01234', - 345, 7.0, 2.0, 12.5, 678, 0, 20200416, 1) + 345, 7.0, 2.0, 12.5, 678, 0, 20200416, 1, False) ''') self.cnx.commit() diff --git a/integrations/server/test_covidcast.py b/integrations/server/test_covidcast.py index 58c34614c..7119de393 100644 --- a/integrations/server/test_covidcast.py +++ b/integrations/server/test_covidcast.py @@ -45,7 +45,7 @@ def test_round_trip(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0) + 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0, False) ''') self.cnx.commit() @@ -85,17 +85,17 @@ def test_location_wildcard(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '11111', - 123, 10, 11, 12, 456, 13, 20200414, 0), + 123, 10, 11, 12, 456, 13, 20200414, 0, False), (0, 'src', 'sig', 'day', 'county', 20200414, '22222', - 123, 20, 21, 22, 456, 23, 20200414, 0), + 123, 20, 21, 22, 456, 23, 20200414, 0, False), (0, 'src', 'sig', 'day', 'county', 20200414, '33333', - 123, 30, 31, 32, 456, 33, 20200414, 0), + 123, 30, 31, 32, 456, 33, 20200414, 0, False), (0, 'src', 'sig', 'day', 'msa', 20200414, '11111', - 123, 40, 41, 42, 456, 43, 20200414, 0), + 123, 40, 41, 42, 456, 43, 20200414, 0, False), (0, 'src', 'sig', 'day', 'msa', 20200414, '22222', - 123, 50, 51, 52, 456, 53, 20200414, 0), + 123, 50, 
51, 52, 456, 53, 20200414, 0, False), (0, 'src', 'sig', 'day', 'msa', 20200414, '33333', - 123, 60, 61, 62, 456, 634, 20200414, 0) + 123, 60, 61, 62, 456, 634, 20200414, 0, False) ''') self.cnx.commit() @@ -155,17 +155,17 @@ def test_location_timeline(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200411, '01234', - 123, 10, 11, 12, 456, 13, 20200413, 2), + 123, 10, 11, 12, 456, 13, 20200413, 2, False), (0, 'src', 'sig', 'day', 'county', 20200412, '01234', - 123, 20, 21, 22, 456, 23, 20200413, 1), + 123, 20, 21, 22, 456, 23, 20200413, 1, False), (0, 'src', 'sig', 'day', 'county', 20200413, '01234', - 123, 30, 31, 32, 456, 33, 20200413, 0), + 123, 30, 31, 32, 456, 33, 20200413, 0, False), (0, 'src', 'sig', 'day', 'county', 20200411, '11111', - 123, 40, 41, 42, 456, 43, 20200413, 2), + 123, 40, 41, 42, 456, 43, 20200413, 2, False), (0, 'src', 'sig', 'day', 'county', 20200412, '22222', - 123, 50, 51, 52, 456, 53, 20200413, 1), + 123, 50, 51, 52, 456, 53, 20200413, 1, False), (0, 'src', 'sig', 'day', 'county', 20200413, '33333', - 123, 60, 61, 62, 456, 63, 20200413, 0) + 123, 60, 61, 62, 456, 63, 20200413, 0, False) ''') self.cnx.commit() @@ -225,7 +225,7 @@ def test_unique_key_constraint(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 0, 0, 0, 0, 0, 0, 20200414, 0) + 0, 0, 0, 0, 0, 0, 20200414, 0, False) ''') self.cnx.commit() @@ -234,14 +234,14 @@ def test_unique_key_constraint(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 1, 1, 1, 1, 1, 1, 20200414, 0) + 1, 1, 1, 1, 1, 1, 20200414, 0, False) ''') # succeed to insert different dummy data under a different issue self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 1, 1, 1, 1, 1, 1, 20200415, 1) + 1, 1, 1, 1, 1, 1, 20200415, 1, False) ''') def test_nullable_columns(self): @@ -251,7 +251,7 @@ def test_nullable_columns(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 0.123, NULL, NULL, 456, NULL, 20200414, 0) + 123, 0.123, NULL, NULL, 456, NULL, 20200414, 0, False) ''') self.cnx.commit() @@ -291,15 +291,15 @@ def test_temporal_partitioning(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'hour', 'state', 2020041714, 'vi', - 123, 10, 11, 12, 456, 13, 2020041714, 0), + 123, 10, 11, 12, 456, 13, 2020041714, 0, False), (0, 'src', 'sig', 'day', 'state', 20200417, 'vi', - 123, 20, 21, 22, 456, 23, 20200417, 00), + 123, 20, 21, 22, 456, 23, 20200417, 00, False), (0, 'src', 'sig', 'week', 'state', 202016, 'vi', - 123, 30, 31, 32, 456, 33, 202016, 0), + 123, 30, 31, 32, 456, 33, 202016, 0, False), (0, 'src', 'sig', 'month', 'state', 202004, 'vi', - 123, 40, 41, 42, 456, 43, 202004, 0), + 123, 40, 41, 42, 456, 43, 202004, 0, False), (0, 'src', 'sig', 'year', 'state', 2020, 'vi', - 123, 50, 51, 52, 456, 53, 2020, 0) + 123, 50, 51, 52, 456, 53, 2020, 0, False) ''') self.cnx.commit() diff --git a/integrations/server/test_covidcast_meta.py b/integrations/server/test_covidcast_meta.py index 9966b3f17..7d8aec8e3 100644 --- a/integrations/server/test_covidcast_meta.py +++ b/integrations/server/test_covidcast_meta.py @@ -28,6 +28,7 @@ def setUp(self): database='epidata') cur = cnx.cursor() cur.execute('truncate table covidcast') + cur.execute('update covidcast_meta_cache set timestamp = 0, epidata = ""') cnx.commit() cur.close() @@ -46,7 
+47,7 @@ def test_round_trip(self): # insert dummy data and accumulate expected results (in sort order) template = ''' insert into covidcast values - (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0) + (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0, %d) ''' expected = [] for src in ('src1', 'src2'): @@ -72,7 +73,7 @@ def test_round_trip(self): }) for tv in (1, 2): for gv, v in zip(('geo1', 'geo2'), (10, 20)): - self.cur.execute(template % (src, sig, tt, gt, tv, gv, v, tv)) + self.cur.execute(template % (src, sig, tt, gt, tv, gv, v, tv, False)) self.cnx.commit() update_cache(args=None) @@ -94,14 +95,18 @@ def test_suppress_work_in_progress(self): # insert dummy data and accumulate expected results (in sort order) template = ''' insert into covidcast values - (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0) + (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0, %d) ''' expected = [] for src in ('src1', 'src2'): for sig in ('sig1', 'sig2', 'wip_sig3'): for tt in ('day', 'week'): for gt in ('hrr', 'msa'): - if sig != 'wip_sig3': + + if sig == 'wip_sig3': + is_wip = True + else: + is_wip = False expected.append({ 'data_source': src, 'signal': sig, @@ -121,7 +126,7 @@ def test_suppress_work_in_progress(self): }) for tv in (1, 2): for gv, v in zip(('geo1', 'geo2'), (10, 20)): - self.cur.execute(template % (src, sig, tt, gt, tv, gv, v, tv)) + self.cur.execute(template % (src, sig, tt, gt, tv, gv, v, tv, is_wip)) self.cnx.commit() update_cache(args=None) diff --git a/src/acquisition/covidcast/csv_to_database.py b/src/acquisition/covidcast/csv_to_database.py index a95f857e2..bcf74413a 100644 --- a/src/acquisition/covidcast/csv_to_database.py +++ b/src/acquisition/covidcast/csv_to_database.py @@ -74,11 +74,16 @@ def archive_as_successful(path_src, filename, source): (source, signal, time_type, geo_type, time_value, issue, lag) = details + is_wip = False + if signal[:4].lower() == "wip_": + is_wip = True + print(signal, is_wip) + csv_rows = csv_importer_impl.load_csv(path, geo_type) all_rows_valid = False try: - cc_rows = CovidcastRow.fromCsvRows(csv_rows, source, signal, time_type, geo_type, time_value, issue, lag) + cc_rows = CovidcastRow.fromCsvRows(csv_rows, source, signal, time_type, geo_type, time_value, issue, lag, is_wip) rows_list = list(cc_rows) if not rows_list: raise ValueError("No data") diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index dc6ef0ddc..a31d9f032 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -17,21 +17,21 @@ class CovidcastRow(): """A container for all the values of a single covidcast row.""" @staticmethod - def fromCsvRowValue(row_value, source, signal, time_type, geo_type, time_value, issue, lag): + def fromCsvRowValue(row_value, source, signal, time_type, geo_type, time_value, issue, lag, is_wip): return CovidcastRow(source, signal, time_type, geo_type, time_value, row_value.geo_value, row_value.value, row_value.stderr, row_value.sample_size, - issue, lag) + issue, lag, is_wip) @staticmethod - def fromCsvRows(row_values, source, signal, time_type, geo_type, time_value, issue, lag): + def fromCsvRows(row_values, source, signal, time_type, geo_type, time_value, issue, lag, is_wip): # NOTE: returns a generator, as row_values is expected to be a generator - return (CovidcastRow.fromCsvRowValue(row_value, source, signal, time_type, geo_type, time_value, issue, lag) + return 
(CovidcastRow.fromCsvRowValue(row_value, source, signal, time_type, geo_type, time_value, issue, lag, is_wip) for row_value in row_values) - def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, value, stderr, sample_size, issue, lag): + def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, value, stderr, sample_size, issue, lag, is_wip): self.id = None self.source = source self.signal = signal @@ -46,6 +46,7 @@ def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, v self.direction = None self.issue = issue self.lag = lag + self.is_wip = is_wip class Database: @@ -103,12 +104,12 @@ def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): (`id`, `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, `value_updated_timestamp`, `value`, `stderr`, `sample_size`, `direction_updated_timestamp`, `direction`, - `issue`, `lag`) + `issue`, `lag`, `is_wip`) VALUES (0, %s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, - %s, %s) + %s, %s, %s) ON DUPLICATE KEY UPDATE `value_updated_timestamp` = VALUES(`value_updated_timestamp`), `value` = VALUES(`value`), @@ -139,7 +140,8 @@ def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): row.stderr, row.sample_size, row.issue, - row.lag + row.lag, + row.is_wip ) for row in cc_rows[start:end]] result = self._cursor.executemany(sql, args) @@ -164,7 +166,8 @@ def insert_or_update( stderr, sample_size, issue, - lag): + lag, + is_wip): """ Insert a new row, or update an existing row, in the `covidcast` table. @@ -173,7 +176,7 @@ def insert_or_update( sql = ''' INSERT INTO `covidcast` VALUES - (0, %s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s) + (0, %s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s, %s) ON DUPLICATE KEY UPDATE `value_updated_timestamp` = VALUES(`value_updated_timestamp`), `value` = VALUES(`value`), @@ -192,7 +195,8 @@ def insert_or_update( stderr, sample_size, issue, - lag + lag, + is_wip ) self._cursor.execute(sql, args) @@ -587,7 +591,7 @@ def get_covidcast_meta(self): x.`geo_type` = t.`geo_type` AND x.`geo_value` = t.`geo_value` WHERE - t.`signal` NOT LIKE 'wip_%' + NOT t.`is_wip` GROUP BY t.`source`, t.`signal`, diff --git a/src/ddl/covidcast.sql b/src/ddl/covidcast.sql index f94259107..4bea9aff0 100644 --- a/src/ddl/covidcast.sql +++ b/src/ddl/covidcast.sql @@ -26,6 +26,10 @@ Data is public. | direction | int(11) | YES | | NULL | | | issue | int(11) | NO | | NULL | | | lag | int(11) | NO | | NULL | | +| is_wip | binary(1) | YES | | NULL | | +| missing_value | int(11) | YES | | NULL | | +| missing_std | int(11) | YES | | NULL | | +| missing_sample_size | int(11) | YES | | NULL | | +------------------------------+-------------+------+-----+---------+----------------+ - `id` @@ -67,6 +71,14 @@ Data is public. the time_value of publication - `lag` the number of time_type units between `time_value` and `issue` +- `is_wip` + flag indicating that the signal is a 'work in progress'. this should be True iff `signal` has a 'wip_' prefix. 
+- `missing_value` + ~ENUM for the reason a `value` was deleted +- `missing_std` + ~ENUM for the reason a `stderr` was deleted +- `missing_sample_size` + ~ENUM for the reason a `sample_size` was deleted */ CREATE TABLE `covidcast` ( @@ -87,6 +99,10 @@ CREATE TABLE `covidcast` ( `direction` int(11), `issue` int(11) NOT NULL, `lag` int(11) NOT NULL, + `is_wip` binary(1) DEFAULT NULL, + -- TODO: `missing_value` int(11) DEFAULT NULL, + -- TODO: `missing_std` int(11) DEFAULT NULL, + -- TODO: `missing_sample_size` int(11) DEFAULT NULL, PRIMARY KEY (`id`), -- for uniqueness, and also fast lookup of all locations on a given date UNIQUE KEY (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, `issue`), diff --git a/tests/acquisition/covidcast/test_csv_to_database.py b/tests/acquisition/covidcast/test_csv_to_database.py index 2727b7d5a..8c624324f 100644 --- a/tests/acquisition/covidcast/test_csv_to_database.py +++ b/tests/acquisition/covidcast/test_csv_to_database.py @@ -55,8 +55,8 @@ def load_csv_impl(path, *args): ('path/b.csv', ('src_b', 'sig_b', 'week', 'msa', 202016, 202017, 1)), # emulate a file that's named incorrectly ('path/c.csv', None), - # another good file - ('path/d.csv', ('src_d', 'sig_d', 'week', 'msa', 202016, 202017, 1)), + # another good file w/ wip + ('path/d.csv', ('src_d', 'wip_sig_d', 'week', 'msa', 202016, 202017, 1)), ] mock_csv_importer.load_csv = load_csv_impl mock_file_archiver = MagicMock() @@ -71,13 +71,13 @@ def load_csv_impl(path, *args): self.assertEqual(mock_database.insert_or_update_bulk.call_count, 2) call_args_list = mock_database.insert_or_update_bulk.call_args_list actual_args = [[(a.source, a.signal, a.time_type, a.geo_type, a.time_value, - a.geo_value, a.value, a.stderr, a.sample_size, a.issue, a.lag) + a.geo_value, a.value, a.stderr, a.sample_size, a.issue, a.lag, a.is_wip) for a in call.args[0]] for call in call_args_list] expected_args = [ - [('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a1', 'a1', 'a1', 'a1', 20200420, 1), - ('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a2', 'a2', 'a2', 'a2', 20200420, 1), - ('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a3', 'a3', 'a3', 'a3', 20200420, 1)], - [('src_d', 'sig_d', 'week', 'msa', 202016, 'd1', 'd1', 'd1', 'd1', 202017, 1)] + [('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a1', 'a1', 'a1', 'a1', 20200420, 1, False), + ('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a2', 'a2', 'a2', 'a2', 20200420, 1, False), + ('src_a', 'sig_a', 'day', 'hrr', 20200419, 'a3', 'a3', 'a3', 'a3', 20200420, 1, False)], + [('src_d', 'wip_sig_d', 'week', 'msa', 202016, 'd1', 'd1', 'd1', 'd1', 202017, 1, True)] ] self.assertEqual(actual_args, expected_args) diff --git a/tests/acquisition/covidcast/test_database.py b/tests/acquisition/covidcast/test_database.py index b58af60b7..55a077f7c 100644 --- a/tests/acquisition/covidcast/test_database.py +++ b/tests/acquisition/covidcast/test_database.py @@ -89,6 +89,7 @@ def test_insert_or_update_query(self): 'sample_size', 'issue', 'lag', + 'is_wip' ) mock_connector = MagicMock() database = Database() From 972df51f38940d3a6605a1575b44294dcc62911a Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Mon, 27 Jul 2020 13:34:46 -0400 Subject: [PATCH 05/15] Changed column names Changed column names back to timestamp1, timestamp2 for testing purposes. 
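Note: reverting the source alone does not rename columns in a database that was already migrated; a minimal rollback sketch, assuming a MySQL `covidcast` table that already carries the `*_updated_timestamp` names from PATCH 01 (illustrative only, not part of this patch): ALTER TABLE `covidcast` CHANGE `value_updated_timestamp` `timestamp1` int(11) NOT NULL, CHANGE `direction_updated_timestamp` `timestamp2` int(11) NOT NULL;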
--- .../covidcast/test_csv_uploading.py | 8 +-- .../covidcast/test_direction_updating.py | 2 +- src/acquisition/covidcast/database.py | 52 +++++++++---------- src/acquisition/covidcast/direction.py | 12 ++--- .../covidcast/direction_updater.py | 30 +++++------ src/ddl/covidcast.sql | 12 ++--- .../proc_db_backups_pd.py | 12 ++--- tests/acquisition/covidcast/test_database.py | 16 +++--- tests/acquisition/covidcast/test_direction.py | 6 +-- .../covidcast/test_direction_updater.py | 4 +- 10 files changed, 77 insertions(+), 77 deletions(-) diff --git a/integrations/acquisition/covidcast/test_csv_uploading.py b/integrations/acquisition/covidcast/test_csv_uploading.py index 0d26a5d60..848dea269 100644 --- a/integrations/acquisition/covidcast/test_csv_uploading.py +++ b/integrations/acquisition/covidcast/test_csv_uploading.py @@ -192,10 +192,10 @@ def apply_lag(expected_epidata): }) # verify timestamps and default values are reasonable - self.cur.execute('select value_updated_timestamp, direction_updated_timestamp, direction from covidcast') - for value_updated_timestamp, direction_updated_timestamp, direction in self.cur: - self.assertGreater(value_updated_timestamp, 0) - self.assertEqual(direction_updated_timestamp, 0) + self.cur.execute('select timestamp1, timestamp2, direction from covidcast') + for timestamp1, timestamp2, direction in self.cur: + self.assertGreater(timestamp1, 0) + self.assertEqual(timestamp2, 0) self.assertIsNone(direction) # verify that the CSVs were archived diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index 4fa0de039..7721e809a 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -188,7 +188,7 @@ def test_uploading(self): }) # verify secondary timestamps were updated - self.cur.execute('select direction_updated_timestamp from covidcast order by id asc') + self.cur.execute('select timestamp2 from covidcast order by id asc') timestamps = [t for (t,) in self.cur] for t in timestamps[:6]: # first 6 rows had `direction` updated diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 3e73415e3..550b96777 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -42,7 +42,7 @@ def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, v self.value = value # ... self.stderr = stderr # ... 
self.sample_size = sample_size # from CSV row - self.direction_updated_timestamp = 0 + self.timestamp2 = 0 self.direction = None self.issue = issue self.lag = lag @@ -101,8 +101,8 @@ def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): sql = ''' INSERT INTO `covidcast` (`id`, `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, - `value_updated_timestamp`, `value`, `stderr`, `sample_size`, - `direction_updated_timestamp`, `direction`, + `timestamp1`, `value`, `stderr`, `sample_size`, + `timestamp2`, `direction`, `issue`, `lag`) VALUES (0, %s, %s, %s, %s, %s, %s, @@ -110,12 +110,12 @@ def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): 0, NULL, %s, %s) ON DUPLICATE KEY UPDATE - `value_updated_timestamp` = VALUES(`value_updated_timestamp`), + `timestamp1` = VALUES(`timestamp1`), `value` = VALUES(`value`), `stderr` = VALUES(`stderr`), `sample_size` = VALUES(`sample_size`) ''' - # TODO: ^ do we want to reset `direction_updated_timestamp` and `direction` in the duplicate key case? + # TODO: ^ do we want to reset `timestamp2` and `direction` in the duplicate key case? # TODO: consider handling cc_rows as a generator instead of a list num_rows = len(cc_rows) @@ -175,7 +175,7 @@ def insert_or_update( INSERT INTO `covidcast` VALUES (0, %s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s) ON DUPLICATE KEY UPDATE - `value_updated_timestamp` = VALUES(`value_updated_timestamp`), + `timestamp1` = VALUES(`timestamp1`), `value` = VALUES(`value`), `stderr` = VALUES(`stderr`), `sample_size` = VALUES(`sample_size`) @@ -246,7 +246,7 @@ def update_direction( UPDATE `covidcast` SET - `direction_updated_timestamp` = UNIX_TIMESTAMP(NOW()), + `timestamp2` = UNIX_TIMESTAMP(NOW()), `direction` = %s WHERE `source` = %s AND @@ -288,9 +288,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type` varchar(12), `geo_value` varchar(12), `time_value` int(11), - `value_updated_timestamp` int(11), + `timestamp1` int(11), `value` double, - `direction_updated_timestamp` int(11), + `timestamp2` int(11), `direction` int(11), PRIMARY KEY(`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; @@ -307,9 +307,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value`, `time_value`, - `value_updated_timestamp`, + `timestamp1`, `value`, - `direction_updated_timestamp`, + `timestamp2`, `direction` FROM ( @@ -361,7 +361,7 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value` HAVING - MAX(`value_updated_timestamp`) > MIN(`direction_updated_timestamp`) + MAX(`timestamp1`) > MIN(`timestamp2`) ''' # A query that selects rows of the time-series selected by stale_ts_key_sql query. @@ -375,9 +375,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value`, `time_value`, - `value_updated_timestamp`, + `timestamp1`, `value`, - `direction_updated_timestamp`, + `timestamp2`, `direction` FROM ({stale_ts_key_sql}) AS t2 LEFT JOIN `latest_issues` AS t3 @@ -427,8 +427,8 @@ def drop_temporary_table(self, tmp_table_name): sql = f'DROP TEMPORARY TABLE `{tmp_table_name}`;' self._cursor.execute(sql) - def update_direction_updated_timestamp_from_temporary_table(self, tmp_table_name): - """Updates the `direction_updated_timestamp` column of `covidcast` table for all the rows with id value in `tmp_table_name`. 
+ def update_timestamp2_from_temporary_table(self, tmp_table_name): + """Updates the `timestamp2` column of `covidcast` table for all the rows with id value in `tmp_table_name`. `tmp_table_name`: name of the temporary table. """ @@ -440,7 +440,7 @@ def update_direction_updated_timestamp_from_temporary_table(self, tmp_table_name ON `covidcast`.id=t.id SET - `covidcast`.direction_updated_timestamp=UNIX_TIMESTAMP(NOW()) + `covidcast`.timestamp2=UNIX_TIMESTAMP(NOW()) ''' self._cursor.execute(sql) @@ -460,8 +460,8 @@ def get_keys_with_potentially_stale_direction(self): `signal`, `geo_type`, `geo_value`, - MAX(`value_updated_timestamp`) AS `max_value_updated_timestamp`, - MIN(`direction_updated_timestamp`) AS `min_direction_updated_timestamp`, + MAX(`timestamp1`) AS `max_timestamp1`, + MIN(`timestamp2`) AS `min_timestamp2`, MIN(`time_value`) AS `min_day`, MAX(`time_value`) AS `max_day`, COUNT(1) AS `series_length` @@ -476,7 +476,7 @@ def get_keys_with_potentially_stale_direction(self): `geo_type`, `geo_value` HAVING - MAX(`value_updated_timestamp`) > MIN(`direction_updated_timestamp`) + MAX(`timestamp1`) > MIN(`timestamp2`) ''' self._cursor.execute(sql) @@ -491,8 +491,8 @@ def get_daily_timeseries_for_direction_update( DATEDIFF(`time_value`, %s) AS `offset`, `time_value` AS `day`, `value`, - `value_updated_timestamp`, - `direction_updated_timestamp` + `timestamp1`, + `timestamp2` FROM `covidcast` WHERE @@ -510,9 +510,9 @@ def get_daily_timeseries_for_direction_update( self._cursor.execute(sql, args) return list(self._cursor) - def update_timeseries_direction_updated_timestamp( + def update_timeseries_timestamp2( self, source, signal, time_type, geo_type, geo_value): - """Update the `direction_updated_timestamp` column for an entire time-series. + """Update the `timestamp2` column for an entire time-series. For daily time-series, this implies that all `direction` values in the specified time-series are confirmed fresh as of the current time. Even if @@ -526,7 +526,7 @@ def update_timeseries_direction_updated_timestamp( UPDATE `covidcast` SET - `direction_updated_timestamp` = UNIX_TIMESTAMP(NOW()) + `timestamp2` = UNIX_TIMESTAMP(NOW()) WHERE `source` = %s AND `signal` = %s AND @@ -555,7 +555,7 @@ def get_covidcast_meta(self): MAX(`value`) AS `max_value`, ROUND(AVG(`value`),7) AS `mean_value`, ROUND(STD(`value`),7) AS `stdev_value`, - MAX(`value_updated_timestamp`) AS `last_update`, + MAX(`timestamp1`) AS `last_update`, MAX(`issue`) as `max_issue`, MIN(`lag`) as `min_lag`, MAX(`lag`) as `max_lag` diff --git a/src/acquisition/covidcast/direction.py b/src/acquisition/covidcast/direction.py index f364ae6be..7444a8340 100644 --- a/src/acquisition/covidcast/direction.py +++ b/src/acquisition/covidcast/direction.py @@ -64,8 +64,8 @@ def scan_timeseries( offsets, days, values, - value_updated_timestamps, - direction_updated_timestamps, + timestamp1s, + timestamp2s, get_direction_impl): """Scan an entire time-series and return fresh direction updates. @@ -73,9 +73,9 @@ def scan_timeseries( each day in `days` `days`: day (YYYYMMDD) corresponding to each row in the other arrays `values`: value of the signal on each day - `value_updated_timestamps`: primary timestamp for each row (i.e. when `value` was + `timestamp1s`: primary timestamp for each row (i.e. when `value` was updated) - `direction_updated_timestamps`: secondary timestamp for each row (i.e. when `direction` was + `timestamp2s`: secondary timestamp for each row (i.e. 
when `direction` was last deemed to be fresh, relative to associated `value`s) `get_direction_impl`: a function which takes two arrays (time and value) and returns a classification of the direction (i.e. as -1, 0, +1) @@ -100,8 +100,8 @@ def scan_timeseries( start += 1 # check whether this row needs an update - direction_time = direction_updated_timestamps[end] - value_time = np.max(value_updated_timestamps[start:end + 1]) + direction_time = timestamp2s[end] + value_time = np.max(timestamp1s[start:end + 1]) if direction_time > value_time: # this row is fresh continue diff --git a/src/acquisition/covidcast/direction_updater.py b/src/acquisition/covidcast/direction_updater.py index dd0becf62..8338a6f49 100644 --- a/src/acquisition/covidcast/direction_updater.py +++ b/src/acquisition/covidcast/direction_updater.py @@ -94,8 +94,8 @@ def update_loop(database, direction_impl=Direction): signal, geo_type, geo_value, - max_value_updated_timestamp, - min_direction_updated_timestamp, + max_timestamp1, + min_timestamp2, min_day, max_day, series_length, @@ -123,7 +123,7 @@ def update_loop(database, direction_impl=Direction): min_day, max_day, series_length, - max_value_updated_timestamp - min_direction_updated_timestamp, + max_timestamp1 - min_timestamp2, ) print(msg % args) @@ -133,12 +133,12 @@ def update_loop(database, direction_impl=Direction): # transpose result set and cast data types data = np.array(timeseries_rows) - offsets, days, values, value_updated_timestamps, direction_updated_timestamps = data.T + offsets, days, values, timestamp1s, timestamp2s = data.T offsets = offsets.astype(np.int64) days = days.astype(np.int64) values = values.astype(np.float64) - value_updated_timestamps = value_updated_timestamps.astype(np.int64) - direction_updated_timestamps = direction_updated_timestamps.astype(np.int64) + timestamp1s = timestamp1s.astype(np.int64) + timestamp2s = timestamp2s.astype(np.int64) # create a direction classifier for this signal data_stdev = data_stdevs[source][signal][geo_type] @@ -150,7 +150,7 @@ def get_direction_impl(x, y): # recompute any stale directions days, directions = direction_impl.scan_timeseries( - offsets, days, values, value_updated_timestamps, direction_updated_timestamps, get_direction_impl) + offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) if be_verbose: print(' computed %d direction updates' % len(directions)) @@ -163,7 +163,7 @@ def get_direction_impl(x, y): source, signal, 'day', geo_type, day, geo_value, direction) # mark the entire time-series as fresh with respect to direction - database.update_timeseries_direction_updated_timestamp( + database.update_timeseries_timestamp2( source, signal, 'day', geo_type, geo_value) @@ -185,7 +185,7 @@ def optimized_update_loop(database, partition_index, direction_impl=Direction): # A pandas DataFrame that will hold all rows from potentially stale time-series df_all = pd.DataFrame(columns=['id', 'source', 'signal', 'time_type', 'geo_type', 'geo_value', 'time_value', - 'value_updated_timestamp', 'value', 'direction_updated_timestamp', 'direction'], + 'timestamp1', 'value', 'timestamp2', 'direction'], data=database.get_all_record_values_of_timeseries_with_potentially_stale_direction( tmp_table_name, partition_condition)) df_all.drop(columns=['time_type'], inplace=True) @@ -243,15 +243,15 @@ def optimized_update_loop(database, partition_index, direction_impl=Direction): ts_rows.time_value.min(), ts_rows.time_value.max(), len(ts_rows), - ts_rows.value_updated_timestamp.max() - 
ts_rows.direction_updated_timestamp.min() + ts_rows.timestamp1.max() - ts_rows.timestamp2.min() ) print(msg % args) offsets = ts_rows.offsets.values.astype(np.int64) days = ts_rows.time_value.values.astype(np.int64) values = ts_rows.value.values.astype(np.float64) - value_updated_timestamps = ts_rows.value_updated_timestamp.values.astype(np.int64) - direction_updated_timestamps = ts_rows.direction_updated_timestamp.values.astype(np.int64) + timestamp1s = ts_rows.timestamp1.values.astype(np.int64) + timestamp2s = ts_rows.timestamp2.values.astype(np.int64) # create a direction classifier for this signal data_stdev = data_stdevs[source][signal][geo_type] @@ -263,7 +263,7 @@ def get_direction_impl(x, y): # recompute any stale directions days, directions = direction_impl.scan_timeseries( - offsets, days, values, value_updated_timestamps, direction_updated_timestamps, get_direction_impl) + offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) if be_verbose: print(' computed %d direction updates' % len(directions)) @@ -282,8 +282,8 @@ def get_direction_impl(x, y): for v, id_list in changed_rows.items(): database.batched_update_direction(v, id_list) - # Updating direction_updated_timestamp - database.update_direction_updated_timestamp_from_temporary_table(tmp_table_name) + # Updating timestamp2 + database.update_timestamp2_from_temporary_table(tmp_table_name) # Dropping temporary table database.drop_temporary_table(tmp_table_name) diff --git a/src/ddl/covidcast.sql b/src/ddl/covidcast.sql index f94259107..2032540f5 100644 --- a/src/ddl/covidcast.sql +++ b/src/ddl/covidcast.sql @@ -18,11 +18,11 @@ Data is public. | geo_type | varchar(12) | NO | | NULL | | | time_value | int(11) | NO | | NULL | | | geo_value | varchar(12) | NO | | NULL | | -| value_updated_timestamp | int(11) | NO | | NULL | | +| timestamp1 | int(11) | NO | | NULL | | | value | double | NO | | NULL | | | stderr | double | YES | | NULL | | | sample_size | double | YES | | NULL | | -| direction_updated_timestamp | int(11) | NO | | NULL | | +| timestamp2 | int(11) | NO | | NULL | | | direction | int(11) | YES | | NULL | | | issue | int(11) | NO | | NULL | | | lag | int(11) | NO | | NULL | | @@ -48,7 +48,7 @@ Data is public. - HRR: hospital referral region (HRR) number - DMA: designated market area (DMA) code - state: two-letter state abbreviation -- `value_updated_timestamp` +- `timestamp1` time when primary data (e.g. `value`) was last updated - `value` value (statistic) derived from the underlying data source @@ -56,7 +56,7 @@ Data is public. standard error of the statistic with respect to its sampling distribution - `sample_size` (NULL when not applicable) number of "data points" used in computing the statistic -- `direction_updated_timestamp` +- `timestamp2` time when secondary data (e.g. 
`direction`) was last updated - `direction` (NULL when not applicable) trend classifier with possible values: @@ -78,12 +78,12 @@ CREATE TABLE `covidcast` ( `time_value` int(11) NOT NULL, `geo_value` varchar(12) NOT NULL, -- "primary" values are derived from the upstream data source - `value_updated_timestamp` int(11) NOT NULL, + `timestamp1` int(11) NOT NULL, `value` double NOT NULL, `stderr` double, `sample_size` double, -- "secondary" values are derived from data in this table - `direction_updated_timestamp` int(11) NOT NULL, + `timestamp2` int(11) NOT NULL, `direction` int(11), `issue` int(11) NOT NULL, `lag` int(11) NOT NULL, diff --git a/src/server/covidcast_issues_migration/proc_db_backups_pd.py b/src/server/covidcast_issues_migration/proc_db_backups_pd.py index 1aa2cbe1b..e88f86dbd 100755 --- a/src/server/covidcast_issues_migration/proc_db_backups_pd.py +++ b/src/server/covidcast_issues_migration/proc_db_backups_pd.py @@ -25,7 +25,7 @@ # Column names INDEX_COLS = ["source", "signal", "time_type", "geo_type", "time_value", "geo_value"] -VALUE_COLS = ["value_updated_timestamp", "value", "stderr", "sample_size", "direction_updated_timestamp", "direction"] +VALUE_COLS = ["timestamp1", "value", "stderr", "sample_size", "timestamp2", "direction"] ALL_COLS = INDEX_COLS + VALUE_COLS ALL_COLS_WITH_PK = ["id"] + ALL_COLS @@ -39,11 +39,11 @@ # time_value as str, because we need this parsed as a datetime anyway "time_value": "str", "geo_value": "category", - "value_updated_timestamp": "int", + "timestamp1": "int", "value": "str", "stderr": "str", "sample_size": "str", - "direction_updated_timestamp": "int", + "timestamp2": "int", "direction": "category" } @@ -432,8 +432,8 @@ def pd_csvdiff( # Since df_before is filled with NaN for new indices, new indices turn false in same_mask same_mask = (df_before.reindex(df_after.index) == df_after) - # Ignore direction_updated_timestamp in the diff - is_diff = ~(same_mask.loc[:, same_mask.columns != "direction_updated_timestamp"].all(axis=1)) + # Ignore timestamp2 in the diff + is_diff = ~(same_mask.loc[:, same_mask.columns != "timestamp2"].all(axis=1)) # Removed indices can be found via index difference, but is expensive if find_removals: @@ -469,7 +469,7 @@ def generate_issues( row_fmt = "(" \ "{id},{source},{signal},{time_type},{geo_type},{time_value},{geo_value}," \ - "{row.value_updated_timestamp},{row.value},{row.stderr},{row.sample_size},{row.direction_updated_timestamp},{row.direction}," \ + "{row.timestamp1},{row.value},{row.stderr},{row.sample_size},{row.timestamp2},{row.direction}," \ "{issue},{row.lag})" try: diff --git a/tests/acquisition/covidcast/test_database.py b/tests/acquisition/covidcast/test_database.py index b58af60b7..666d4ca52 100644 --- a/tests/acquisition/covidcast/test_database.py +++ b/tests/acquisition/covidcast/test_database.py @@ -149,7 +149,7 @@ def test_update_direction_query(self): sql = sql.lower() self.assertIn('update', sql) self.assertIn('`covidcast`', sql) - self.assertIn('`direction_updated_timestamp` = unix_timestamp', sql) + self.assertIn('`timestamp2` = unix_timestamp', sql) self.assertIn('`direction` = %s', sql) def test_get_data_stdev_across_locations_query(self): @@ -197,8 +197,8 @@ def test_get_keys_with_potentially_stale_direction_query(self): sql = cursor.execute.call_args[0][0].lower() self.assertIn('select', sql) self.assertIn('`covidcast`', sql) - self.assertIn('value_updated_timestamp', sql) - self.assertIn('direction_updated_timestamp', sql) + self.assertIn('timestamp1', sql) + 
self.assertIn('timestamp2', sql) def test_get_daily_timeseries_for_direction_update_query(self): """Query to get a daily time-series looks sensible. @@ -232,10 +232,10 @@ def test_get_daily_timeseries_for_direction_update_query(self): sql = sql.lower() self.assertIn('select', sql) self.assertIn('`covidcast`', sql) - self.assertIn('value_updated_timestamp', sql) - self.assertIn('direction_updated_timestamp', sql) + self.assertIn('timestamp1', sql) + self.assertIn('timestamp2', sql) - def test_update_timeseries_direction_updated_timestamp_query(self): + def test_update_timeseries_timestamp2_query(self): """Query to update the secondary timestamp of a time-series looks sensible. NOTE: Actual behavior is tested by integration test. @@ -246,7 +246,7 @@ def test_update_timeseries_direction_updated_timestamp_query(self): database = Database() database.connect(connector_impl=mock_connector) - database.update_timeseries_direction_updated_timestamp(*args) + database.update_timeseries_timestamp2(*args) connection = mock_connector.connect() cursor = connection.cursor() @@ -259,7 +259,7 @@ def test_update_timeseries_direction_updated_timestamp_query(self): sql = sql.lower() self.assertIn('update', sql) self.assertIn('`covidcast`', sql) - self.assertIn('direction_updated_timestamp', sql) + self.assertIn('timestamp2', sql) self.assertIn('unix_timestamp(now())', sql) def test_update_covidcast_meta_cache_query(self): diff --git a/tests/acquisition/covidcast/test_direction.py b/tests/acquisition/covidcast/test_direction.py index b6f6e2a12..529056248 100644 --- a/tests/acquisition/covidcast/test_direction.py +++ b/tests/acquisition/covidcast/test_direction.py @@ -87,7 +87,7 @@ def test_get_direction_validates_arguments(self): def test_scan_timeseries(self): """Scan a time-series and update stale directions.""" - offsets, days, values, value_updated_timestamps, direction_updated_timestamps = [ + offsets, days, values, timestamp1s, timestamp2s = [ # missing days '230', '240', and '250' (the gap helps test windowing) [100, 101, 102, 106, 107, 108], [200, 210, 220, 260, 270, 280], @@ -104,8 +104,8 @@ def test_scan_timeseries(self): offsets, days, values, - value_updated_timestamps, - direction_updated_timestamps, + timestamp1s, + timestamp2s, get_direction_impl) self.assertEqual(days, [210, 280]) diff --git a/tests/acquisition/covidcast/test_direction_updater.py b/tests/acquisition/covidcast/test_direction_updater.py index 64060e07a..6dcc638be 100644 --- a/tests/acquisition/covidcast/test_direction_updater.py +++ b/tests/acquisition/covidcast/test_direction_updater.py @@ -110,7 +110,7 @@ def test_update_loop(self): ) self.assertEqual(call_args_list[1][0], expected_args) - self.assertTrue(mock_database.update_timeseries_direction_updated_timestamp.called) - args = mock_database.update_timeseries_direction_updated_timestamp.call_args[0] + self.assertTrue(mock_database.update_timeseries_timestamp2.called) + args = mock_database.update_timeseries_timestamp2.call_args[0] expected_args = ('source', 'signal', 'day', 'geo_type', 'geo_value') self.assertEqual(args, expected_args) From bf86b290e9192535d14aec142130ad9b9d637437 Mon Sep 17 00:00:00 2001 From: george haff Date: Wed, 29 Jul 2020 12:03:37 -0400 Subject: [PATCH 06/15] piecewise metadata computation tests changed as i pulled the serialization down a layer --- .../covidcast/covidcast_meta_cache_updater.py | 6 +-- src/acquisition/covidcast/database.py | 52 +++++++++++++------ .../test_covidcast_meta_cache_updater.py | 3 +- 
tests/acquisition/covidcast/test_database.py | 4 +- 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/src/acquisition/covidcast/covidcast_meta_cache_updater.py b/src/acquisition/covidcast/covidcast_meta_cache_updater.py index dca7df8df..e28fbeb10 100644 --- a/src/acquisition/covidcast/covidcast_meta_cache_updater.py +++ b/src/acquisition/covidcast/covidcast_meta_cache_updater.py @@ -2,7 +2,6 @@ # standard library import argparse -import json import sys # first party @@ -43,12 +42,9 @@ def main(args, epidata_impl=Epidata, database_impl=Database): print('unable to cache epidata') return False - # serialize the data - epidata_json = json.dumps(metadata) - # update the cache try: - database.update_covidcast_meta_cache(epidata_json) + database.update_covidcast_meta_cache(metadata) print('successfully cached epidata') finally: # no catch block so that an exception above will cause the program to diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index a31d9f032..c70744681 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -4,13 +4,14 @@ """ # third party +import json import mysql.connector import numpy as np +from math import ceil # first party import delphi.operations.secrets as secrets -from math import ceil class CovidcastRow(): @@ -543,7 +544,13 @@ def update_timeseries_direction_updated_timestamp( def get_covidcast_meta(self): """Compute and return metadata on all non-WIP COVIDcast signals.""" - sql = ''' + meta = [] + + sql = 'SELECT `source`, `signal` FROM covidcast GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;' + self._cursor.execute(sql) + for source, signal in [ss for ss in self._cursor]: #NOTE: this obfuscation protects the integrity of the cursor; using the cursor as a generator will cause contention w/ subsequent queries + + sql = ''' SELECT t.`source` AS `data_source`, t.`signal`, @@ -574,39 +581,38 @@ def get_covidcast_meta(self): `geo_value` FROM `covidcast` + WHERE + `source` = %s AND + `signal` = %s GROUP BY `time_value`, `time_type`, `geo_type`, - `source`, - `signal`, `geo_value` ) x ON x.`max_issue` = t.`issue` AND x.`time_type` = t.`time_type` AND x.`time_value` = t.`time_value` AND - x.`source` = t.`source` AND - x.`signal` = t.`signal` AND x.`geo_type` = t.`geo_type` AND x.`geo_value` = t.`geo_value` WHERE - NOT t.`is_wip` + NOT t.`is_wip` AND + t.`source` = %s AND + t.`signal` = %s GROUP BY - t.`source`, - t.`signal`, t.`time_type`, t.`geo_type` ORDER BY - t.`source` ASC, - t.`signal` ASC, t.`time_type` ASC, t.`geo_type` ASC - ''' - self._cursor.execute(sql) - return list(dict(zip(self._cursor.column_names,x)) for x in self._cursor) + ''' + self._cursor.execute(sql, (source, signal, source, signal)) + meta.extend(list(dict(zip(self._cursor.column_names,x)) for x in self._cursor)) - def update_covidcast_meta_cache(self, epidata_json): + return meta + + def update_covidcast_meta_cache(self, metadata): """Updates the `covidcast_meta_cache` table.""" sql = ''' @@ -616,5 +622,21 @@ def update_covidcast_meta_cache(self, epidata_json): `timestamp` = UNIX_TIMESTAMP(NOW()), `epidata` = %s ''' + epidata_json = json.dumps(metadata) self._cursor.execute(sql, (epidata_json,)) + + def retrieve_covidcast_meta_cache(self): + sql = ''' + SELECT `epidata` + FROM `covidcast_meta_cache` + ORDER BY `timestamp` DESC + LIMIT 1; + ''' + self._cursor.execute(sql) + cache_json = self._cursor.fetchone()[0] + cache = json.loads(cache_json) + cache_hash = {} + for entry in cache: + 
cache_hash[(entry['data_source'], entry['signal'], entry['time_type'], entry['geo_type'])] = entry + return cache_hash diff --git a/tests/acquisition/covidcast/test_covidcast_meta_cache_updater.py b/tests/acquisition/covidcast/test_covidcast_meta_cache_updater.py index 7429049d8..71e043e6a 100644 --- a/tests/acquisition/covidcast/test_covidcast_meta_cache_updater.py +++ b/tests/acquisition/covidcast/test_covidcast_meta_cache_updater.py @@ -2,7 +2,6 @@ # standard library import argparse -import json import unittest from unittest.mock import MagicMock @@ -49,7 +48,7 @@ def test_main_successful(self): self.assertTrue(mock_database.update_covidcast_meta_cache.called) actual_args = mock_database.update_covidcast_meta_cache.call_args[0] - expected_args = (json.dumps(api_response['epidata']),) + expected_args = (api_response['epidata'],) self.assertEqual(actual_args, expected_args) self.assertTrue(mock_database.disconnect.called) diff --git a/tests/acquisition/covidcast/test_database.py b/tests/acquisition/covidcast/test_database.py index 55a077f7c..491829ad3 100644 --- a/tests/acquisition/covidcast/test_database.py +++ b/tests/acquisition/covidcast/test_database.py @@ -269,7 +269,7 @@ def test_update_covidcast_meta_cache_query(self): NOTE: Actual behavior is tested by integration test. """ - args = ('epidata_json',) + args = ('epidata_json_str',) mock_connector = MagicMock() database = Database() database.connect(connector_impl=mock_connector) @@ -281,7 +281,7 @@ def test_update_covidcast_meta_cache_query(self): self.assertTrue(cursor.execute.called) sql, args = cursor.execute.call_args[0] - expected_args = ('epidata_json',) + expected_args = ('"epidata_json_str"',) self.assertEqual(args, expected_args) sql = sql.lower() From a310ae087c6d45f4ae9f9ba2e7700c83576f0944 Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Wed, 29 Jul 2020 16:42:16 -0400 Subject: [PATCH 07/15] Updated Direction Updater * Implemented the Quick Fix: Direction should be updated to NA wherever there are no historical data to compute the stdev from. * Modified the integration test to test for this behavior --- .../covidcast/test_direction_updating.py | 54 +++++++++++++++++-- .../covidcast/direction_updater.py | 36 ++++++++----- 2 files changed, 73 insertions(+), 17 deletions(-) diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index 7721e809a..a5117ae37 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -51,6 +51,8 @@ def test_uploading(self): """Update rows having a stale `direction` field and serve the results.""" # insert some sample data + # src, sig1, 1111: + # direction should be updated to None as there are no historical data for (src, sig1, state). 
# CA 20200301: # timeline should be x=[-2, -1, 0], y=[2, 6, 5] with direction=1 # FL 20200517: @@ -60,6 +62,12 @@ def test_uploading(self): # wrong) is fresh self.cur.execute(''' insert into covidcast values + (0, 'src', 'sig1', 'day', 'state', 20201028, '1111', + 123, 2, 0, 0, 0, -1, 20201028, 0), + (0, 'src', 'sig1', 'day', 'state', 20201029, '1111', + 123, 6, 0, 0, 0, 0, 20201029, 0), + (0, 'src', 'sig1', 'day', 'state', 20201030, '1111', + 123, 5, 0, 0, 0, 1, 20201030, 0), (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', 123, 2, 0, 0, 0, NULL, 20200228, 0), (0, 'src', 'sig', 'day', 'state', 20200229, 'ca', @@ -85,6 +93,46 @@ def test_uploading(self): args = get_argument_parser().parse_args('') main(args) + # The Quick-Fix is working + response = Epidata.covidcast( + 'src', 'sig1', 'day', 'state', '20200101-20201231', '*') + + self.assertEqual(response, { + 'result': 1, + 'epidata': [{ + 'time_value': 20201028, + 'geo_value': '1111', + 'value': 2, + 'stderr': 0, + 'sample_size': 0, + 'direction': None, + 'issue': 20201028, + 'lag': 0 + }, + { + 'time_value': 20201029, + 'geo_value': '1111', + 'value': 6, + 'stderr': 0, + 'sample_size': 0, + 'direction': None, + 'issue': 20201029, + 'lag': 0 + }, + { + 'time_value': 20201030, + 'geo_value': '1111', + 'value': 5, + 'stderr': 0, + 'sample_size': 0, + 'direction': None, + 'issue': 20201030, + 'lag': 0 + }, + ], + 'message': 'success', + }) + # request data from the API response = Epidata.covidcast( 'src', 'sig', 'day', 'state', '20200101-20201231', '*') @@ -190,9 +238,9 @@ def test_uploading(self): # verify secondary timestamps were updated self.cur.execute('select timestamp2 from covidcast order by id asc') timestamps = [t for (t,) in self.cur] - for t in timestamps[:6]: - # first 6 rows had `direction` updated + for t in timestamps[:9]: + # first 9 rows had `direction` updated self.assertGreater(t, 0) - for t in timestamps[6:]: + for t in timestamps[9:]: # last 3 rows were not updated self.assertEqual(t, 456) diff --git a/src/acquisition/covidcast/direction_updater.py b/src/acquisition/covidcast/direction_updater.py index 8338a6f49..74b24092d 100644 --- a/src/acquisition/covidcast/direction_updater.py +++ b/src/acquisition/covidcast/direction_updater.py @@ -253,24 +253,30 @@ def optimized_update_loop(database, partition_index, direction_impl=Direction): timestamp1s = ts_rows.timestamp1.values.astype(np.int64) timestamp2s = ts_rows.timestamp2.values.astype(np.int64) - # create a direction classifier for this signal - data_stdev = data_stdevs[source][signal][geo_type] - slope_threshold = data_stdev * Constants.BASE_SLOPE_THRESHOLD + if (source in data_stdevs) and (signal in data_stdevs[source]) and (geo_type in data_stdevs[source][signal]): + # create a direction classifier for this signal + data_stdev = data_stdevs[source][signal][geo_type] + slope_threshold = data_stdev * Constants.BASE_SLOPE_THRESHOLD - def get_direction_impl(x, y): - return direction_impl.get_direction( - x, y, n=Constants.SLOPE_STERR_SCALE, limit=slope_threshold) + def get_direction_impl(x, y): + return direction_impl.get_direction( + x, y, n=Constants.SLOPE_STERR_SCALE, limit=slope_threshold) - # recompute any stale directions - days, directions = direction_impl.scan_timeseries( - offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) + # recompute any stale directions + days, directions = direction_impl.scan_timeseries( + offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) - if be_verbose: - print(' computed %d direction updates' % 
len(directions)) + if be_verbose: + print(' computed %d direction updates' % len(directions)) + + # A DataFrame holding rows that potentially changed direction value + ts_pot_changed = ts_rows.set_index('time_value').loc[days] + ts_pot_changed['new_direction'] = np.array(directions, np.float64) - # A DataFrame holding rows that potentially changed direction value - ts_pot_changed = ts_rows.set_index('time_value').loc[days] - ts_pot_changed['new_direction'] = np.array(directions, np.float64) + # This is a Quick-Fix [in case no data for (source, signal, geo_type) exists before Constants.SIGNAL_STDEV_MAX_DAY] + else: + ts_pot_changed = ts_rows.set_index('time_value') + ts_pot_changed['new_direction'] = np.nan # Adding changed values to the changed_rows dictionary gb_o = ts_pot_changed.groupby('new_direction') @@ -305,6 +311,8 @@ def main( # only commit on success so that directions are consistent with respect # to methodology commit = True + except Exception as e: + raise e finally: # no catch block so that an exception above will cause the program to # fail after the following cleanup From 5a33dc3c16616bfae28c9a1786832524347538c0 Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Mon, 3 Aug 2020 15:11:44 -0400 Subject: [PATCH 08/15] Added `is_latest_issue` column --- .../covidcast/test_covidcast_meta_caching.py | 6 +- .../covidcast/test_direction_updating.py | 24 +- integrations/client/test_delphi_epidata.py | 12 +- integrations/server/test_covidcast.py | 44 ++-- integrations/server/test_covidcast_meta.py | 4 +- src/acquisition/covidcast/database.py | 229 +++++++++--------- src/ddl/covidcast.sql | 8 +- tests/acquisition/covidcast/test_database.py | 38 --- 8 files changed, 171 insertions(+), 194 deletions(-) diff --git a/integrations/acquisition/covidcast/test_covidcast_meta_caching.py b/integrations/acquisition/covidcast/test_covidcast_meta_caching.py index eba061406..efc5d67d1 100644 --- a/integrations/acquisition/covidcast/test_covidcast_meta_caching.py +++ b/integrations/acquisition/covidcast/test_covidcast_meta_caching.py @@ -66,14 +66,14 @@ def test_caching(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'state', 20200422, 'pa', - 123, 1, 2, 3, 456, 1, 20200422, 0), + 123, 1, 2, 3, 456, 1, 20200422, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200422, 'wa', - 789, 1, 2, 3, 456, 1, 20200423, 1) + 789, 1, 2, 3, 456, 1, 20200423, 1, 1) ''') self.cur.execute(''' insert into covidcast values (100, 'src', 'wip_sig', 'day', 'state', 20200422, 'pa', - 456, 4, 5, 6, 789, -1, 20200422, 0) + 456, 4, 5, 6, 789, -1, 20200422, 0, 1) ''') self.cnx.commit() diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index a5117ae37..fa04afb11 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -63,29 +63,29 @@ def test_uploading(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig1', 'day', 'state', 20201028, '1111', - 123, 2, 0, 0, 0, -1, 20201028, 0), + 123, 2, 0, 0, 0, -1, 20201028, 0, 1), (0, 'src', 'sig1', 'day', 'state', 20201029, '1111', - 123, 6, 0, 0, 0, 0, 20201029, 0), + 123, 6, 0, 0, 0, 0, 20201029, 0, 1), (0, 'src', 'sig1', 'day', 'state', 20201030, '1111', - 123, 5, 0, 0, 0, 1, 20201030, 0), + 123, 5, 0, 0, 0, 1, 20201030, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', - 123, 2, 0, 0, 0, NULL, 20200228, 0), + 123, 2, 0, 0, 0, NULL, 20200228, 0, 1), (0, 
'src', 'sig', 'day', 'state', 20200229, 'ca', - 123, 6, 0, 0, 0, NULL, 20200229, 0), + 123, 6, 0, 0, 0, NULL, 20200229, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', - 123, 5, 0, 0, 0, NULL, 20200301, 0), + 123, 5, 0, 0, 0, NULL, 20200301, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200511, 'fl', - 123, 1, 0, 0, 0, NULL, 20200511, 0), + 123, 1, 0, 0, 0, NULL, 20200511, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200512, 'fl', - 123, 2, 0, 0, 0, NULL, 20200512, 0), + 123, 2, 0, 0, 0, NULL, 20200512, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200517, 'fl', - 123, 2, 0, 0, 0, NULL, 20200517, 0), + 123, 2, 0, 0, 0, NULL, 20200517, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200615, 'tx', - 123, 9, 0, 0, 456, NULL, 20200615, 0), + 123, 9, 0, 0, 456, NULL, 20200615, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200616, 'tx', - 123, 5, 0, 0, 456, NULL, 20200616, 0), + 123, 5, 0, 0, 456, NULL, 20200616, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200617, 'tx', - 123, 1, 0, 0, 456, 1, 20200617, 0) + 123, 1, 0, 0, 456, 1, 20200617, 0, 1) ''') self.cnx.commit() diff --git a/integrations/client/test_delphi_epidata.py b/integrations/client/test_delphi_epidata.py index fcab62ad0..f7836a4f5 100644 --- a/integrations/client/test_delphi_epidata.py +++ b/integrations/client/test_delphi_epidata.py @@ -50,11 +50,11 @@ def test_covidcast(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0), + 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0, 0), (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 456, 5.5, 1.2, 10.5, 789, 0, 20200415, 1), + 456, 5.5, 1.2, 10.5, 789, 0, 20200415, 1, 0), (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 345, 6.5, 2.2, 11.5, 678, 0, 20200416, 2) + 345, 6.5, 2.2, 11.5, 678, 0, 20200416, 2, 1) ''') self.cnx.commit() @@ -157,11 +157,11 @@ def test_covidcast_meta(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0), + 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0, 0), (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 345, 6.0, 2.2, 11.5, 678, 0, 20200416, 2), + 345, 6.0, 2.2, 11.5, 678, 0, 20200416, 2, 1), (0, 'src', 'sig', 'day', 'county', 20200415, '01234', - 345, 7.0, 2.0, 12.5, 678, 0, 20200416, 1) + 345, 7.0, 2.0, 12.5, 678, 0, 20200416, 1, 1) ''') self.cnx.commit() diff --git a/integrations/server/test_covidcast.py b/integrations/server/test_covidcast.py index 58c34614c..22f57d245 100644 --- a/integrations/server/test_covidcast.py +++ b/integrations/server/test_covidcast.py @@ -45,7 +45,7 @@ def test_round_trip(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0) + 123, 1.5, 2.5, 3.5, 456, 4, 20200414, 0, 1) ''') self.cnx.commit() @@ -85,17 +85,17 @@ def test_location_wildcard(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '11111', - 123, 10, 11, 12, 456, 13, 20200414, 0), + 123, 10, 11, 12, 456, 13, 20200414, 0, 1), (0, 'src', 'sig', 'day', 'county', 20200414, '22222', - 123, 20, 21, 22, 456, 23, 20200414, 0), + 123, 20, 21, 22, 456, 23, 20200414, 0, 1), (0, 'src', 'sig', 'day', 'county', 20200414, '33333', - 123, 30, 31, 32, 456, 33, 20200414, 0), + 123, 30, 31, 32, 456, 33, 20200414, 0, 1), (0, 'src', 'sig', 'day', 'msa', 20200414, '11111', - 123, 40, 41, 42, 456, 43, 20200414, 0), + 123, 40, 41, 42, 456, 43, 20200414, 
0, 1), (0, 'src', 'sig', 'day', 'msa', 20200414, '22222', - 123, 50, 51, 52, 456, 53, 20200414, 0), + 123, 50, 51, 52, 456, 53, 20200414, 0, 1), (0, 'src', 'sig', 'day', 'msa', 20200414, '33333', - 123, 60, 61, 62, 456, 634, 20200414, 0) + 123, 60, 61, 62, 456, 634, 20200414, 0, 1) ''') self.cnx.commit() @@ -155,17 +155,17 @@ def test_location_timeline(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200411, '01234', - 123, 10, 11, 12, 456, 13, 20200413, 2), + 123, 10, 11, 12, 456, 13, 20200413, 2, 1), (0, 'src', 'sig', 'day', 'county', 20200412, '01234', - 123, 20, 21, 22, 456, 23, 20200413, 1), + 123, 20, 21, 22, 456, 23, 20200413, 1, 1), (0, 'src', 'sig', 'day', 'county', 20200413, '01234', - 123, 30, 31, 32, 456, 33, 20200413, 0), + 123, 30, 31, 32, 456, 33, 20200413, 0, 1), (0, 'src', 'sig', 'day', 'county', 20200411, '11111', - 123, 40, 41, 42, 456, 43, 20200413, 2), + 123, 40, 41, 42, 456, 43, 20200413, 2, 1), (0, 'src', 'sig', 'day', 'county', 20200412, '22222', - 123, 50, 51, 52, 456, 53, 20200413, 1), + 123, 50, 51, 52, 456, 53, 20200413, 1, 1), (0, 'src', 'sig', 'day', 'county', 20200413, '33333', - 123, 60, 61, 62, 456, 63, 20200413, 0) + 123, 60, 61, 62, 456, 63, 20200413, 0, 1) ''') self.cnx.commit() @@ -225,7 +225,7 @@ def test_unique_key_constraint(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 0, 0, 0, 0, 0, 0, 20200414, 0) + 0, 0, 0, 0, 0, 0, 20200414, 0, 1) ''') self.cnx.commit() @@ -234,14 +234,14 @@ def test_unique_key_constraint(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 1, 1, 1, 1, 1, 1, 20200414, 0) + 1, 1, 1, 1, 1, 1, 20200414, 0, 1) ''') # succeed to insert different dummy data under a different issue self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 1, 1, 1, 1, 1, 1, 20200415, 1) + 1, 1, 1, 1, 1, 1, 20200415, 1, 1) ''') def test_nullable_columns(self): @@ -251,7 +251,7 @@ def test_nullable_columns(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'county', 20200414, '01234', - 123, 0.123, NULL, NULL, 456, NULL, 20200414, 0) + 123, 0.123, NULL, NULL, 456, NULL, 20200414, 0, 1) ''') self.cnx.commit() @@ -291,15 +291,15 @@ def test_temporal_partitioning(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'hour', 'state', 2020041714, 'vi', - 123, 10, 11, 12, 456, 13, 2020041714, 0), + 123, 10, 11, 12, 456, 13, 2020041714, 0, 1), (0, 'src', 'sig', 'day', 'state', 20200417, 'vi', - 123, 20, 21, 22, 456, 23, 20200417, 00), + 123, 20, 21, 22, 456, 23, 20200417, 00, 1), (0, 'src', 'sig', 'week', 'state', 202016, 'vi', - 123, 30, 31, 32, 456, 33, 202016, 0), + 123, 30, 31, 32, 456, 33, 202016, 0, 1), (0, 'src', 'sig', 'month', 'state', 202004, 'vi', - 123, 40, 41, 42, 456, 43, 202004, 0), + 123, 40, 41, 42, 456, 43, 202004, 0, 1), (0, 'src', 'sig', 'year', 'state', 2020, 'vi', - 123, 50, 51, 52, 456, 53, 2020, 0) + 123, 50, 51, 52, 456, 53, 2020, 0, 1) ''') self.cnx.commit() diff --git a/integrations/server/test_covidcast_meta.py b/integrations/server/test_covidcast_meta.py index 9966b3f17..901dda445 100644 --- a/integrations/server/test_covidcast_meta.py +++ b/integrations/server/test_covidcast_meta.py @@ -46,7 +46,7 @@ def test_round_trip(self): # insert dummy data and accumulate expected results (in sort order) template = ''' insert into covidcast values - (0, "%s", "%s", "%s", "%s", 
%d, "%s", 123, %d, 0, 0, 456, 0, %d, 0) + (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0, 1) ''' expected = [] for src in ('src1', 'src2'): @@ -94,7 +94,7 @@ def test_suppress_work_in_progress(self): # insert dummy data and accumulate expected results (in sort order) template = ''' insert into covidcast values - (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0) + (0, "%s", "%s", "%s", "%s", %d, "%s", 123, %d, 0, 0, 456, 0, %d, 0, 1) ''' expected = [] for src in ('src1', 'src2'): diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 550b96777..29e864aef 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -92,110 +92,141 @@ def count_all_rows(self): def insert_or_update_bulk(self, cc_rows): return self.insert_or_update_batch(cc_rows) - def insert_or_update_batch(self, cc_rows, batch_size=0, commit_partial=False): + def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False): """ Insert new rows (or update existing) in the `covidcast` table. This has the intentional side effect of updating the primary timestamp. + + """ - sql = ''' - INSERT INTO `covidcast` - (`id`, `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, - `timestamp1`, `value`, `stderr`, `sample_size`, - `timestamp2`, `direction`, - `issue`, `lag`) - VALUES - (0, %s, %s, %s, %s, %s, %s, - UNIX_TIMESTAMP(NOW()), %s, %s, %s, - 0, NULL, - %s, %s) - ON DUPLICATE KEY UPDATE - `timestamp1` = VALUES(`timestamp1`), - `value` = VALUES(`value`), - `stderr` = VALUES(`stderr`), - `sample_size` = VALUES(`sample_size`) + + tmp_table_name = 'tmp_insert_update_table' + + create_tmp_table_sql = f''' + CREATE TEMPORARY TABLE `{tmp_table_name}` ( + `source` varchar(32) NOT NULL, + `signal` varchar(32) NOT NULL, + `time_type` varchar(12) NOT NULL, + `geo_type` varchar(12) NOT NULL, + `time_value` int(11) NOT NULL, + `geo_value` varchar(12) NOT NULL, + `timestamp1` int(11) NOT NULL, + `value` double NOT NULL, + `stderr` double, + `sample_size` double, + `timestamp2` int(11) NOT NULL, + `direction` int(11), + `issue` int(11) NOT NULL, + `lag` int(11) NOT NULL, + `is_latest_issue` BINARY(1) NOT NULL + ) ENGINE=InnoDB DEFAULT CHARSET=utf8; ''' - # TODO: ^ do we want to reset `timestamp2` and `direction` in the duplicate key case? - # TODO: consider handling cc_rows as a generator instead of a list - num_rows = len(cc_rows) - total = 0 - if not batch_size: - batch_size = num_rows - num_batches = ceil(num_rows/batch_size) - for batch_num in range(num_batches): - start = batch_num * batch_size - end = min(num_rows, start + batch_size) - length = end - start - - args = [( - row.source, - row.signal, - row.time_type, - row.geo_type, - row.time_value, - row.geo_value, - row.value, - row.stderr, - row.sample_size, - row.issue, - row.lag - ) for row in cc_rows[start:end]] - - result = self._cursor.executemany(sql, args) - if result is None: - # the SQL connector does not support returning number of rows affected - total = None - else: - total += result - if commit_partial: - self._connection.commit() - return total - - def insert_or_update( - self, - source, - signal, - time_type, - geo_type, - time_value, - geo_value, - value, - stderr, - sample_size, - issue, - lag): - """ - Insert a new row, or update an existing row, in the `covidcast` table. 
+ truncate_tmp_table_sql = f'TRUNCATE TABLE {tmp_table_name};' + drop_tmp_table_sql = f'DROP TEMPORARY TABLE {tmp_table_name}' - This has the intentional side effect of updating the primary timestamp. - """ + insert_into_tmp_sql = f''' + INSERT INTO `{tmp_table_name}` + (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, + `timestamp1`, `value`, `stderr`, `sample_size`, `timestamp2`, `direction`, + `issue`, `lag`, `is_latest_issue`) + VALUES + (%s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s, 0) + ''' - sql = ''' - INSERT INTO `covidcast` VALUES - (0, %s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s) + insert_or_update_sql = f''' + INSERT INTO `covidcast` + (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, + `timestamp1`, `value`, `stderr`, `sample_size`, `timestamp2`, `direction`, + `issue`, `lag`, `is_latest_issue`) + SELECT * FROM `{tmp_table_name}` ON DUPLICATE KEY UPDATE `timestamp1` = VALUES(`timestamp1`), `value` = VALUES(`value`), `stderr` = VALUES(`stderr`), `sample_size` = VALUES(`sample_size`) ''' + zero_is_latest_issue_sql = f''' + UPDATE + ( + SELECT DISTINCT `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value` + FROM `{tmp_table_name}` + ) AS TMP + LEFT JOIN `covidcast` + USING (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`) + SET `is_latest_issue`=0 + ''' + set_is_latest_issue_sql = f''' + UPDATE + ( + SELECT `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, MAX(`issue`) AS `issue` + FROM + ( + SELECT DISTINCT `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value` + FROM `{tmp_table_name}` + ) AS TMP + LEFT JOIN `covidcast` + USING (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`) + GROUP BY `source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value` + ) AS TMP + LEFT JOIN `covidcast` + USING (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, `issue`) + SET `is_latest_issue`=1 + ''' - args = ( - source, - signal, - time_type, - geo_type, - time_value, - geo_value, - value, - stderr, - sample_size, - issue, - lag - ) + # TODO: ^ do we want to reset `timestamp2` and `direction` in the duplicate key case? 
- self._cursor.execute(sql, args) + # TODO: consider handling cc_rows as a generator instead of a list + self._cursor.execute(create_tmp_table_sql) + + try: + + num_rows = len(cc_rows) + total = 0 + if not batch_size: + batch_size = num_rows + num_batches = ceil(num_rows/batch_size) + for batch_num in range(num_batches): + start = batch_num * batch_size + end = min(num_rows, start + batch_size) + length = end - start + + args = [( + row.source, + row.signal, + row.time_type, + row.geo_type, + row.time_value, + row.geo_value, + row.value, + row.stderr, + row.sample_size, + row.issue, + row.lag + ) for row in cc_rows[start:end]] + + + result = self._cursor.executemany(insert_into_tmp_sql, args) + self._cursor.execute(insert_or_update_sql) + self._cursor.execute(zero_is_latest_issue_sql) + self._cursor.execute(set_is_latest_issue_sql) + self._cursor.execute(truncate_tmp_table_sql) + + if result is None: + # the SQL connector does not support returning number of rows affected + total = None + else: + total += result + if commit_partial: + self._connection.commit() + except Exception as e: + print('AAA') + print(e) + raise e + finally: + self._cursor.execute(drop_tmp_table_sql) + return total def get_data_stdev_across_locations(self, max_day): """ @@ -311,30 +342,10 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `value`, `timestamp2`, `direction` - FROM - ( - SELECT - `source`, - `signal`, - `time_type`, - `geo_type`, - `geo_value`, - `time_value`, - MAX(`issue`) AS `issue` - FROM `covidcast` - WHERE + FROM `covidcast` + WHERE `is_latest_issue` = 1 AND `time_type` = 'day' AND {partition_condition} - GROUP BY - `source`, - `signal`, - `time_type`, - `geo_type`, - `geo_value`, - `time_value` - ) b - LEFT JOIN `covidcast` a - USING (`source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, `issue`) ''' cte_definition = f''' diff --git a/src/ddl/covidcast.sql b/src/ddl/covidcast.sql index 2032540f5..48c3ead6f 100644 --- a/src/ddl/covidcast.sql +++ b/src/ddl/covidcast.sql @@ -18,14 +18,15 @@ Data is public. | geo_type | varchar(12) | NO | | NULL | | | time_value | int(11) | NO | | NULL | | | geo_value | varchar(12) | NO | | NULL | | -| timestamp1 | int(11) | NO | | NULL | | +| timestamp1 | int(11) | NO | | NULL | | | value | double | NO | | NULL | | | stderr | double | YES | | NULL | | | sample_size | double | YES | | NULL | | -| timestamp2 | int(11) | NO | | NULL | | +| timestamp2 | int(11) | NO | | NULL | | | direction | int(11) | YES | | NULL | | | issue | int(11) | NO | | NULL | | | lag | int(11) | NO | | NULL | | +| is_latest_issue | boolean | NO | | NULL | | +------------------------------+-------------+------+-----+---------+----------------+ - `id` @@ -67,6 +68,8 @@ Data is public. 
the time_value of publication - `lag` the number of time_type units between `time_value` and `issue` +- `is_latest_issue` + a boolean flag which indicates whether or not the row corresponds to the latest issue for its key */ CREATE TABLE `covidcast` ( @@ -87,6 +90,7 @@ CREATE TABLE `covidcast` ( `direction` int(11), `issue` int(11) NOT NULL, `lag` int(11) NOT NULL, + `is_latest_issue` BINARY(1) NOT NULL, PRIMARY KEY (`id`), -- for uniqueness, and also fast lookup of all locations on a given date UNIQUE KEY (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, `issue`), diff --git a/tests/acquisition/covidcast/test_database.py b/tests/acquisition/covidcast/test_database.py index 666d4ca52..958e8cd9e 100644 --- a/tests/acquisition/covidcast/test_database.py +++ b/tests/acquisition/covidcast/test_database.py @@ -71,44 +71,6 @@ def test_count_all_rows_query(self): self.assertIn('select count(1)', sql) self.assertIn('from `covidcast`', sql) - def test_insert_or_update_query(self): - """Query to insert/update a row looks sensible. - - NOTE: Actual behavior is tested by integration test. - """ - - row = ( - 'source', - 'signal', - 'time_type', - 'geo_type', - 'time_value', - 'geo_value', - 'value', - 'stderr', - 'sample_size', - 'issue', - 'lag', - ) - mock_connector = MagicMock() - database = Database() - database.connect(connector_impl=mock_connector) - - database.insert_or_update(*row) - - connection = mock_connector.connect() - cursor = connection.cursor() - self.assertTrue(cursor.execute.called) - - sql, args = cursor.execute.call_args[0] - self.assertEqual(args, row) - - sql = sql.lower() - self.assertIn('insert into', sql) - self.assertIn('`covidcast`', sql) - self.assertIn('unix_timestamp', sql) - self.assertIn('on duplicate key update', sql) - def test_update_direction_query(self): """Query to update a row's `direction` looks sensible. From b148a766ba9add932c2f502a48341706394a7005 Mon Sep 17 00:00:00 2001 From: "Wael H. 
Al Saeed" Date: Mon, 3 Aug 2020 15:33:46 -0400 Subject: [PATCH 09/15] Added fill is_latest_issue script --- .../covidcast/test_fill_is_latest_issue.py | 94 +++++++++++++++++++ .../covidcast/fill_is_latest_issue.py | 86 +++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 integrations/acquisition/covidcast/test_fill_is_latest_issue.py create mode 100644 src/acquisition/covidcast/fill_is_latest_issue.py diff --git a/integrations/acquisition/covidcast/test_fill_is_latest_issue.py b/integrations/acquisition/covidcast/test_fill_is_latest_issue.py new file mode 100644 index 000000000..33f77c41f --- /dev/null +++ b/integrations/acquisition/covidcast/test_fill_is_latest_issue.py @@ -0,0 +1,94 @@ +"""Integration tests for covidcast's direction updating.""" + +# standard library +import unittest + +# third party +import mysql.connector + +# first party +from delphi.epidata.client.delphi_epidata import Epidata +import delphi.operations.secrets as secrets + +# py3tester coverage target (equivalent to `import *`) +__test_target__ = 'delphi.epidata.acquisition.covidcast.fill_is_latest_issue' + + +class FillIsLatestIssueTests(unittest.TestCase): + """Tests filling is_latest_issue column""" + + def setUp(self): + """Perform per-test setup.""" + + # connect to the `epidata` database and clear the `covidcast` table + cnx = mysql.connector.connect( + user='user', + password='pass', + host='delphi_database_epidata', + database='epidata') + cur = cnx.cursor() + cur.execute('truncate table covidcast') + cnx.commit() + cur.close() + + # make connection and cursor available to test cases + self.cnx = cnx + self.cur = cnx.cursor() + + # use the local instance of the epidata database + secrets.db.host = 'delphi_database_epidata' + secrets.db.epi = ('user', 'pass') + + # use the local instance of the Epidata API + Epidata.BASE_URL = 'http://delphi_web_epidata/epidata/api.php' + + def tearDown(self): + """Perform per-test teardown.""" + self.cur.close() + self.cnx.close() + + def test_fill_is_latest_issue(self): + """Update rows having a stale `direction` field and serve the results.""" + + self.cur.execute(''' + insert into covidcast values + (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', + 123, 2, 5, 5, 5, NULL, 20200228, 0, 1), + (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', + 123, 2, 0, 0, 0, NULL, 20200229, 1, 1), + (0, 'src', 'sig', 'day', 'state', 20200229, 'ca', + 123, 6, 0, 0, 0, NULL, 20200301, 1, 1), + (0, 'src', 'sig', 'day', 'state', 20200229, 'ca', + 123, 6, 9, 9, 9, NULL, 20200229, 0, 1), + (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', + 123, 5, 0, 0, 0, NULL, 20200303, 2, 1), + (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', + 123, 5, 5, 5, 5, NULL, 20200302, 1, 1), + (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', + 123, 5, 9, 8, 7, NULL, 20200301, 0, 1) + ''') + self.cnx.commit() + + # fill is_latest_issue + main() + + self.cur.execute('''select * from covidcast''') + result = list(self.cur) + expected = [ + (1, 'src', 'sig', 'day', 'state', 20200228, 'ca', + 123, 2, 5, 5, 5, None, 20200228, 0, bytearray(b'0')), + (2, 'src', 'sig', 'day', 'state', 20200228, 'ca', + 123, 2, 0, 0, 0, None, 20200229, 1, bytearray(b'1')), + (3, 'src', 'sig', 'day', 'state', 20200229, 'ca', + 123, 6, 0, 0, 0, None, 20200301, 1, bytearray(b'1')), + (4, 'src', 'sig', 'day', 'state', 20200229, 'ca', + 123, 6, 9, 9, 9, None, 20200229, 0, bytearray(b'0')), + (5, 'src', 'sig', 'day', 'state', 20200301, 'ca', + 123, 5, 0, 0, 0, None, 20200303, 2, bytearray(b'1')), + (6, 'src', 'sig', 'day', 
'state', 20200301, 'ca', + 123, 5, 5, 5, 5, None, 20200302, 1, bytearray(b'0')), + (7, 'src', 'sig', 'day', 'state', 20200301, 'ca', + 123, 5, 9, 8, 7, None, 20200301, 0, bytearray(b'0')) + ] + + self.assertEqual(result, expected) diff --git a/src/acquisition/covidcast/fill_is_latest_issue.py b/src/acquisition/covidcast/fill_is_latest_issue.py new file mode 100644 index 000000000..a6c0b358c --- /dev/null +++ b/src/acquisition/covidcast/fill_is_latest_issue.py @@ -0,0 +1,86 @@ +"""Computes and updates the `is_latest_issue` column in the `covidcast` table. + +This update is only needed to be run once. +""" + +# third party +import mysql.connector + +# first party +import delphi.operations.secrets as secrets + + +# partition configuration +PARTITION_VARIABLE = 'geo_value' +PARTITION_SPLITS = ["'05101'", "'101'", "'13071'", "'15007'", "'17161'", "'19039'", "'20123'", "'21213'", "'24035'", + "'27005'", "'28115'", "'29510'", "'31161'", "'35100'", "'37117'", "'39081'", "'41013'", "'44140'", + "'47027'", "'48140'", "'48461'", "'51169'", "'55033'"] + +def main(): + + u, p = secrets.db.epi + connection = mysql.connector.connect( + host=secrets.db.host, + user=u, + password=p, + database='epidata') + cursor = connection.cursor() + + set_partition_to_one_query = ''' + UPDATE + ( + SELECT + `source`, + `signal`, + `time_type`, + `geo_type`, + `geo_value`, + `time_value`, + MAX(`issue`) AS `issue` + FROM `covidcast` + WHERE + `time_type` = 'day' AND + %s + GROUP BY + `source`, + `signal`, + `time_type`, + `geo_type`, + `geo_value`, + `time_value` + ) b + LEFT JOIN `covidcast` a + USING (`source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, `issue`) + SET `is_latest_issue`=1 + ''' + + set_to_zero_query = ''' + UPDATE `covidcast` + SET `is_latest_issue` = 0; + ''' + + commit = False + try: + cursor.execute(set_to_zero_query) + for partition_index in range(24): + # constructing the partitoin condition from partition index + ge_condition = 'TRUE' if partition_index == 0 else \ + f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index - 1]}' + l_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else \ + f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}' + partition_condition = f'({ge_condition}) AND ({l_condition})' + + cursor.execute(set_partition_to_one_query % partition_condition) + + commit = True + except Exception as e: + connection.rollback() + raise e + finally: + cursor.close() + if commit: + connection.commit() + connection.close() + +if __name__=='__main__': + main() \ No newline at end of file From 3ffc9ee31bdf889d8f4eec020155e67dab898f91 Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Mon, 3 Aug 2020 15:40:15 -0400 Subject: [PATCH 10/15] Updated column names * Updated column names back to value_updated_timestamp and direction_updated_timestamp. 
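Note: this patch renames the columns in the DDL and in application code only; it does not migrate an existing database. A live deployment would additionally need a one-time schema change along these lines (a sketch under that assumption; the actual migration statement is not part of this patch series):

    -- hypothetical one-time migration for already-deployed databases (not included in this series)
    ALTER TABLE `covidcast`
      CHANGE `timestamp1` `value_updated_timestamp` int(11) NOT NULL,
      CHANGE `timestamp2` `direction_updated_timestamp` int(11) NOT NULL;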
--- .../covidcast/test_csv_uploading.py | 8 +-- .../covidcast/test_direction_updating.py | 2 +- src/acquisition/covidcast/database.py | 54 +++++++++---------- src/acquisition/covidcast/direction.py | 12 ++--- .../covidcast/direction_updater.py | 30 +++++------ src/ddl/covidcast.sql | 14 ++--- .../proc_db_backups_pd.py | 12 ++--- tests/acquisition/covidcast/test_database.py | 16 +++--- tests/acquisition/covidcast/test_direction.py | 6 +-- .../covidcast/test_direction_updater.py | 4 +- 10 files changed, 79 insertions(+), 79 deletions(-) diff --git a/integrations/acquisition/covidcast/test_csv_uploading.py b/integrations/acquisition/covidcast/test_csv_uploading.py index 848dea269..0d26a5d60 100644 --- a/integrations/acquisition/covidcast/test_csv_uploading.py +++ b/integrations/acquisition/covidcast/test_csv_uploading.py @@ -192,10 +192,10 @@ def apply_lag(expected_epidata): }) # verify timestamps and default values are reasonable - self.cur.execute('select timestamp1, timestamp2, direction from covidcast') - for timestamp1, timestamp2, direction in self.cur: - self.assertGreater(timestamp1, 0) - self.assertEqual(timestamp2, 0) + self.cur.execute('select value_updated_timestamp, direction_updated_timestamp, direction from covidcast') + for value_updated_timestamp, direction_updated_timestamp, direction in self.cur: + self.assertGreater(value_updated_timestamp, 0) + self.assertEqual(direction_updated_timestamp, 0) self.assertIsNone(direction) # verify that the CSVs were archived diff --git a/integrations/acquisition/covidcast/test_direction_updating.py b/integrations/acquisition/covidcast/test_direction_updating.py index fa04afb11..f9824e77b 100644 --- a/integrations/acquisition/covidcast/test_direction_updating.py +++ b/integrations/acquisition/covidcast/test_direction_updating.py @@ -236,7 +236,7 @@ def test_uploading(self): }) # verify secondary timestamps were updated - self.cur.execute('select timestamp2 from covidcast order by id asc') + self.cur.execute('select direction_updated_timestamp from covidcast order by id asc') timestamps = [t for (t,) in self.cur] for t in timestamps[:9]: # first 9 rows had `direction` updated diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 29e864aef..2fe7c7502 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -42,7 +42,7 @@ def __init__(self, source, signal, time_type, geo_type, time_value, geo_value, v self.value = value # ... self.stderr = stderr # ... 
self.sample_size = sample_size # from CSV row - self.timestamp2 = 0 + self.direction_updated_timestamp = 0 self.direction = None self.issue = issue self.lag = lag @@ -111,11 +111,11 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False `geo_type` varchar(12) NOT NULL, `time_value` int(11) NOT NULL, `geo_value` varchar(12) NOT NULL, - `timestamp1` int(11) NOT NULL, + `value_updated_timestamp` int(11) NOT NULL, `value` double NOT NULL, `stderr` double, `sample_size` double, - `timestamp2` int(11) NOT NULL, + `direction_updated_timestamp` int(11) NOT NULL, `direction` int(11), `issue` int(11) NOT NULL, `lag` int(11) NOT NULL, @@ -129,7 +129,7 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False insert_into_tmp_sql = f''' INSERT INTO `{tmp_table_name}` (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, - `timestamp1`, `value`, `stderr`, `sample_size`, `timestamp2`, `direction`, + `value_updated_timestamp`, `value`, `stderr`, `sample_size`, `direction_updated_timestamp`, `direction`, `issue`, `lag`, `is_latest_issue`) VALUES (%s, %s, %s, %s, %s, %s, UNIX_TIMESTAMP(NOW()), %s, %s, %s, 0, NULL, %s, %s, 0) @@ -138,11 +138,11 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False insert_or_update_sql = f''' INSERT INTO `covidcast` (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, - `timestamp1`, `value`, `stderr`, `sample_size`, `timestamp2`, `direction`, + `value_updated_timestamp`, `value`, `stderr`, `sample_size`, `direction_updated_timestamp`, `direction`, `issue`, `lag`, `is_latest_issue`) SELECT * FROM `{tmp_table_name}` ON DUPLICATE KEY UPDATE - `timestamp1` = VALUES(`timestamp1`), + `value_updated_timestamp` = VALUES(`value_updated_timestamp`), `value` = VALUES(`value`), `stderr` = VALUES(`stderr`), `sample_size` = VALUES(`sample_size`) @@ -175,7 +175,7 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False SET `is_latest_issue`=1 ''' - # TODO: ^ do we want to reset `timestamp2` and `direction` in the duplicate key case? + # TODO: ^ do we want to reset `direction_updated_timestamp` and `direction` in the duplicate key case? 
# TODO: consider handling cc_rows as a generator instead of a list self._cursor.execute(create_tmp_table_sql) @@ -277,7 +277,7 @@ def update_direction( UPDATE `covidcast` SET - `timestamp2` = UNIX_TIMESTAMP(NOW()), + `direction_updated_timestamp` = UNIX_TIMESTAMP(NOW()), `direction` = %s WHERE `source` = %s AND @@ -319,9 +319,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type` varchar(12), `geo_value` varchar(12), `time_value` int(11), - `timestamp1` int(11), + `value_updated_timestamp` int(11), `value` double, - `timestamp2` int(11), + `direction_updated_timestamp` int(11), `direction` int(11), PRIMARY KEY(`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8; @@ -338,9 +338,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value`, `time_value`, - `timestamp1`, + `value_updated_timestamp`, `value`, - `timestamp2`, + `direction_updated_timestamp`, `direction` FROM `covidcast` WHERE `is_latest_issue` = 1 AND @@ -372,7 +372,7 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value` HAVING - MAX(`timestamp1`) > MIN(`timestamp2`) + MAX(`value_updated_timestamp`) > MIN(`direction_updated_timestamp`) ''' # A query that selects rows of the time-series selected by stale_ts_key_sql query. @@ -386,9 +386,9 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t `geo_type`, `geo_value`, `time_value`, - `timestamp1`, + `value_updated_timestamp`, `value`, - `timestamp2`, + `direction_updated_timestamp`, `direction` FROM ({stale_ts_key_sql}) AS t2 LEFT JOIN `latest_issues` AS t3 @@ -438,8 +438,8 @@ def drop_temporary_table(self, tmp_table_name): sql = f'DROP TEMPORARY TABLE `{tmp_table_name}`;' self._cursor.execute(sql) - def update_timestamp2_from_temporary_table(self, tmp_table_name): - """Updates the `timestamp2` column of `covidcast` table for all the rows with id value in `tmp_table_name`. + def update_direction_updated_timestamp_from_temporary_table(self, tmp_table_name): + """Updates the `direction_updated_timestamp` column of `covidcast` table for all the rows with id value in `tmp_table_name`. `tmp_table_name`: name of the temporary table. 
""" @@ -451,7 +451,7 @@ def update_timestamp2_from_temporary_table(self, tmp_table_name): ON `covidcast`.id=t.id SET - `covidcast`.timestamp2=UNIX_TIMESTAMP(NOW()) + `covidcast`.direction_updated_timestamp=UNIX_TIMESTAMP(NOW()) ''' self._cursor.execute(sql) @@ -471,8 +471,8 @@ def get_keys_with_potentially_stale_direction(self): `signal`, `geo_type`, `geo_value`, - MAX(`timestamp1`) AS `max_timestamp1`, - MIN(`timestamp2`) AS `min_timestamp2`, + MAX(`value_updated_timestamp`) AS `max_value_updated_timestamp`, + MIN(`direction_updated_timestamp`) AS `min_direction_updated_timestamp`, MIN(`time_value`) AS `min_day`, MAX(`time_value`) AS `max_day`, COUNT(1) AS `series_length` @@ -487,7 +487,7 @@ def get_keys_with_potentially_stale_direction(self): `geo_type`, `geo_value` HAVING - MAX(`timestamp1`) > MIN(`timestamp2`) + MAX(`value_updated_timestamp`) > MIN(`direction_updated_timestamp`) ''' self._cursor.execute(sql) @@ -502,8 +502,8 @@ def get_daily_timeseries_for_direction_update( DATEDIFF(`time_value`, %s) AS `offset`, `time_value` AS `day`, `value`, - `timestamp1`, - `timestamp2` + `value_updated_timestamp`, + `direction_updated_timestamp` FROM `covidcast` WHERE @@ -521,9 +521,9 @@ def get_daily_timeseries_for_direction_update( self._cursor.execute(sql, args) return list(self._cursor) - def update_timeseries_timestamp2( + def update_timeseries_direction_updated_timestamp( self, source, signal, time_type, geo_type, geo_value): - """Update the `timestamp2` column for an entire time-series. + """Update the `direction_updated_timestamp` column for an entire time-series. For daily time-series, this implies that all `direction` values in the specified time-series are confirmed fresh as of the current time. Even if @@ -537,7 +537,7 @@ def update_timeseries_timestamp2( UPDATE `covidcast` SET - `timestamp2` = UNIX_TIMESTAMP(NOW()) + `direction_updated_timestamp` = UNIX_TIMESTAMP(NOW()) WHERE `source` = %s AND `signal` = %s AND @@ -566,7 +566,7 @@ def get_covidcast_meta(self): MAX(`value`) AS `max_value`, ROUND(AVG(`value`),7) AS `mean_value`, ROUND(STD(`value`),7) AS `stdev_value`, - MAX(`timestamp1`) AS `last_update`, + MAX(`value_updated_timestamp`) AS `last_update`, MAX(`issue`) as `max_issue`, MIN(`lag`) as `min_lag`, MAX(`lag`) as `max_lag` diff --git a/src/acquisition/covidcast/direction.py b/src/acquisition/covidcast/direction.py index 7444a8340..f364ae6be 100644 --- a/src/acquisition/covidcast/direction.py +++ b/src/acquisition/covidcast/direction.py @@ -64,8 +64,8 @@ def scan_timeseries( offsets, days, values, - timestamp1s, - timestamp2s, + value_updated_timestamps, + direction_updated_timestamps, get_direction_impl): """Scan an entire time-series and return fresh direction updates. @@ -73,9 +73,9 @@ def scan_timeseries( each day in `days` `days`: day (YYYYMMDD) corresponding to each row in the other arrays `values`: value of the signal on each day - `timestamp1s`: primary timestamp for each row (i.e. when `value` was + `value_updated_timestamps`: primary timestamp for each row (i.e. when `value` was updated) - `timestamp2s`: secondary timestamp for each row (i.e. when `direction` was + `direction_updated_timestamps`: secondary timestamp for each row (i.e. when `direction` was last deemed to be fresh, relative to associated `value`s) `get_direction_impl`: a function which takes two arrays (time and value) and returns a classification of the direction (i.e. 
as -1, 0, +1) @@ -100,8 +100,8 @@ def scan_timeseries( start += 1 # check whether this row needs an update - direction_time = timestamp2s[end] - value_time = np.max(timestamp1s[start:end + 1]) + direction_time = direction_updated_timestamps[end] + value_time = np.max(value_updated_timestamps[start:end + 1]) if direction_time > value_time: # this row is fresh continue diff --git a/src/acquisition/covidcast/direction_updater.py b/src/acquisition/covidcast/direction_updater.py index 74b24092d..44968a645 100644 --- a/src/acquisition/covidcast/direction_updater.py +++ b/src/acquisition/covidcast/direction_updater.py @@ -94,8 +94,8 @@ def update_loop(database, direction_impl=Direction): signal, geo_type, geo_value, - max_timestamp1, - min_timestamp2, + max_value_updated_timestamp, + min_direction_updated_timestamp, min_day, max_day, series_length, @@ -123,7 +123,7 @@ def update_loop(database, direction_impl=Direction): min_day, max_day, series_length, - max_timestamp1 - min_timestamp2, + max_value_updated_timestamp - min_direction_updated_timestamp, ) print(msg % args) @@ -133,12 +133,12 @@ def update_loop(database, direction_impl=Direction): # transpose result set and cast data types data = np.array(timeseries_rows) - offsets, days, values, timestamp1s, timestamp2s = data.T + offsets, days, values, value_updated_timestamps, direction_updated_timestamps = data.T offsets = offsets.astype(np.int64) days = days.astype(np.int64) values = values.astype(np.float64) - timestamp1s = timestamp1s.astype(np.int64) - timestamp2s = timestamp2s.astype(np.int64) + value_updated_timestamps = value_updated_timestamps.astype(np.int64) + direction_updated_timestamps = direction_updated_timestamps.astype(np.int64) # create a direction classifier for this signal data_stdev = data_stdevs[source][signal][geo_type] @@ -150,7 +150,7 @@ def get_direction_impl(x, y): # recompute any stale directions days, directions = direction_impl.scan_timeseries( - offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) + offsets, days, values, value_updated_timestamps, direction_updated_timestamps, get_direction_impl) if be_verbose: print(' computed %d direction updates' % len(directions)) @@ -163,7 +163,7 @@ def get_direction_impl(x, y): source, signal, 'day', geo_type, day, geo_value, direction) # mark the entire time-series as fresh with respect to direction - database.update_timeseries_timestamp2( + database.update_timeseries_direction_updated_timestamp( source, signal, 'day', geo_type, geo_value) @@ -185,7 +185,7 @@ def optimized_update_loop(database, partition_index, direction_impl=Direction): # A pandas DataFrame that will hold all rows from potentially stale time-series df_all = pd.DataFrame(columns=['id', 'source', 'signal', 'time_type', 'geo_type', 'geo_value', 'time_value', - 'timestamp1', 'value', 'timestamp2', 'direction'], + 'value_updated_timestamp', 'value', 'direction_updated_timestamp', 'direction'], data=database.get_all_record_values_of_timeseries_with_potentially_stale_direction( tmp_table_name, partition_condition)) df_all.drop(columns=['time_type'], inplace=True) @@ -243,15 +243,15 @@ def optimized_update_loop(database, partition_index, direction_impl=Direction): ts_rows.time_value.min(), ts_rows.time_value.max(), len(ts_rows), - ts_rows.timestamp1.max() - ts_rows.timestamp2.min() + ts_rows.value_updated_timestamp.max() - ts_rows.direction_updated_timestamp.min() ) print(msg % args) offsets = ts_rows.offsets.values.astype(np.int64) days = ts_rows.time_value.values.astype(np.int64) values = 
ts_rows.value.values.astype(np.float64) - timestamp1s = ts_rows.timestamp1.values.astype(np.int64) - timestamp2s = ts_rows.timestamp2.values.astype(np.int64) + value_updated_timestamps = ts_rows.value_updated_timestamp.values.astype(np.int64) + direction_updated_timestamps = ts_rows.direction_updated_timestamp.values.astype(np.int64) if (source in data_stdevs) and (signal in data_stdevs[source]) and (geo_type in data_stdevs[source][signal]): # create a direction classifier for this signal @@ -264,7 +264,7 @@ def get_direction_impl(x, y): # recompute any stale directions days, directions = direction_impl.scan_timeseries( - offsets, days, values, timestamp1s, timestamp2s, get_direction_impl) + offsets, days, values, value_updated_timestamps, direction_updated_timestamps, get_direction_impl) if be_verbose: print(' computed %d direction updates' % len(directions)) @@ -288,8 +288,8 @@ def get_direction_impl(x, y): for v, id_list in changed_rows.items(): database.batched_update_direction(v, id_list) - # Updating timestamp2 - database.update_timestamp2_from_temporary_table(tmp_table_name) + # Updating direction_updated_timestamp + database.update_direction_updated_timestamp_from_temporary_table(tmp_table_name) # Dropping temporary table database.drop_temporary_table(tmp_table_name) diff --git a/src/ddl/covidcast.sql b/src/ddl/covidcast.sql index 48c3ead6f..dcc7e284a 100644 --- a/src/ddl/covidcast.sql +++ b/src/ddl/covidcast.sql @@ -18,15 +18,15 @@ Data is public. | geo_type | varchar(12) | NO | | NULL | | | time_value | int(11) | NO | | NULL | | | geo_value | varchar(12) | NO | | NULL | | -| timestamp1 | int(11) | NO | | NULL | | +| value_updated_timestamp | int(11) | NO | | NULL | | | value | double | NO | | NULL | | | stderr | double | YES | | NULL | | | sample_size | double | YES | | NULL | | -| timestamp2 | int(11) | NO | | NULL | | +| direction_updated_timestamp | int(11) | NO | | NULL | | | direction | int(11) | YES | | NULL | | | issue | int(11) | NO | | NULL | | | lag | int(11) | NO | | NULL | | -| is_latest_issue | boolean | NO | | NULL | | +| is_latest_issue | BINARY(1) | NO | | NULL | | +------------------------------+-------------+------+-----+---------+----------------+ - `id` @@ -49,7 +49,7 @@ Data is public. - HRR: hospital referral region (HRR) number - DMA: designated market area (DMA) code - state: two-letter state abbreviation -- `timestamp1` +- `value_updated_timestamp` time when primary data (e.g. `value`) was last updated - `value` value (statistic) derived from the underlying data source @@ -57,7 +57,7 @@ Data is public. standard error of the statistic with respect to its sampling distribution - `sample_size` (NULL when not applicable) number of "data points" used in computing the statistic -- `timestamp2` +- `direction_updated_timestamp` time when secondary data (e.g. 
`direction`) was last updated - `direction` (NULL when not applicable) trend classifier with possible values: @@ -81,12 +81,12 @@ CREATE TABLE `covidcast` ( `time_value` int(11) NOT NULL, `geo_value` varchar(12) NOT NULL, -- "primary" values are derived from the upstream data source - `timestamp1` int(11) NOT NULL, + `value_updated_timestamp` int(11) NOT NULL, `value` double NOT NULL, `stderr` double, `sample_size` double, -- "secondary" values are derived from data in this table - `timestamp2` int(11) NOT NULL, + `direction_updated_timestamp` int(11) NOT NULL, `direction` int(11), `issue` int(11) NOT NULL, `lag` int(11) NOT NULL, diff --git a/src/server/covidcast_issues_migration/proc_db_backups_pd.py b/src/server/covidcast_issues_migration/proc_db_backups_pd.py index e88f86dbd..1aa2cbe1b 100755 --- a/src/server/covidcast_issues_migration/proc_db_backups_pd.py +++ b/src/server/covidcast_issues_migration/proc_db_backups_pd.py @@ -25,7 +25,7 @@ # Column names INDEX_COLS = ["source", "signal", "time_type", "geo_type", "time_value", "geo_value"] -VALUE_COLS = ["timestamp1", "value", "stderr", "sample_size", "timestamp2", "direction"] +VALUE_COLS = ["value_updated_timestamp", "value", "stderr", "sample_size", "direction_updated_timestamp", "direction"] ALL_COLS = INDEX_COLS + VALUE_COLS ALL_COLS_WITH_PK = ["id"] + ALL_COLS @@ -39,11 +39,11 @@ # time_value as str, because we need this parsed as a datetime anyway "time_value": "str", "geo_value": "category", - "timestamp1": "int", + "value_updated_timestamp": "int", "value": "str", "stderr": "str", "sample_size": "str", - "timestamp2": "int", + "direction_updated_timestamp": "int", "direction": "category" } @@ -432,8 +432,8 @@ def pd_csvdiff( # Since df_before is filled with NaN for new indices, new indices turn false in same_mask same_mask = (df_before.reindex(df_after.index) == df_after) - # Ignore timestamp2 in the diff - is_diff = ~(same_mask.loc[:, same_mask.columns != "timestamp2"].all(axis=1)) + # Ignore direction_updated_timestamp in the diff + is_diff = ~(same_mask.loc[:, same_mask.columns != "direction_updated_timestamp"].all(axis=1)) # Removed indices can be found via index difference, but is expensive if find_removals: @@ -469,7 +469,7 @@ def generate_issues( row_fmt = "(" \ "{id},{source},{signal},{time_type},{geo_type},{time_value},{geo_value}," \ - "{row.timestamp1},{row.value},{row.stderr},{row.sample_size},{row.timestamp2},{row.direction}," \ + "{row.value_updated_timestamp},{row.value},{row.stderr},{row.sample_size},{row.direction_updated_timestamp},{row.direction}," \ "{issue},{row.lag})" try: diff --git a/tests/acquisition/covidcast/test_database.py b/tests/acquisition/covidcast/test_database.py index 958e8cd9e..cfe4e6fbf 100644 --- a/tests/acquisition/covidcast/test_database.py +++ b/tests/acquisition/covidcast/test_database.py @@ -111,7 +111,7 @@ def test_update_direction_query(self): sql = sql.lower() self.assertIn('update', sql) self.assertIn('`covidcast`', sql) - self.assertIn('`timestamp2` = unix_timestamp', sql) + self.assertIn('`direction_updated_timestamp` = unix_timestamp', sql) self.assertIn('`direction` = %s', sql) def test_get_data_stdev_across_locations_query(self): @@ -159,8 +159,8 @@ def test_get_keys_with_potentially_stale_direction_query(self): sql = cursor.execute.call_args[0][0].lower() self.assertIn('select', sql) self.assertIn('`covidcast`', sql) - self.assertIn('timestamp1', sql) - self.assertIn('timestamp2', sql) + self.assertIn('value_updated_timestamp', sql) + 
self.assertIn('direction_updated_timestamp', sql) def test_get_daily_timeseries_for_direction_update_query(self): """Query to get a daily time-series looks sensible. @@ -194,10 +194,10 @@ def test_get_daily_timeseries_for_direction_update_query(self): sql = sql.lower() self.assertIn('select', sql) self.assertIn('`covidcast`', sql) - self.assertIn('timestamp1', sql) - self.assertIn('timestamp2', sql) + self.assertIn('value_updated_timestamp', sql) + self.assertIn('direction_updated_timestamp', sql) - def test_update_timeseries_timestamp2_query(self): + def test_update_timeseries_direction_updated_timestamp_query(self): """Query to update the secondary timestamp of a time-series looks sensible. NOTE: Actual behavior is tested by integration test. @@ -208,7 +208,7 @@ def test_update_timeseries_timestamp2_query(self): database = Database() database.connect(connector_impl=mock_connector) - database.update_timeseries_timestamp2(*args) + database.update_timeseries_direction_updated_timestamp(*args) connection = mock_connector.connect() cursor = connection.cursor() @@ -221,7 +221,7 @@ def test_update_timeseries_timestamp2_query(self): sql = sql.lower() self.assertIn('update', sql) self.assertIn('`covidcast`', sql) - self.assertIn('timestamp2', sql) + self.assertIn('direction_updated_timestamp', sql) self.assertIn('unix_timestamp(now())', sql) def test_update_covidcast_meta_cache_query(self): diff --git a/tests/acquisition/covidcast/test_direction.py b/tests/acquisition/covidcast/test_direction.py index 529056248..b6f6e2a12 100644 --- a/tests/acquisition/covidcast/test_direction.py +++ b/tests/acquisition/covidcast/test_direction.py @@ -87,7 +87,7 @@ def test_get_direction_validates_arguments(self): def test_scan_timeseries(self): """Scan a time-series and update stale directions.""" - offsets, days, values, timestamp1s, timestamp2s = [ + offsets, days, values, value_updated_timestamps, direction_updated_timestamps = [ # missing days '230', '240', and '250' (the gap helps test windowing) [100, 101, 102, 106, 107, 108], [200, 210, 220, 260, 270, 280], @@ -104,8 +104,8 @@ def test_scan_timeseries(self): offsets, days, values, - timestamp1s, - timestamp2s, + value_updated_timestamps, + direction_updated_timestamps, get_direction_impl) self.assertEqual(days, [210, 280]) diff --git a/tests/acquisition/covidcast/test_direction_updater.py b/tests/acquisition/covidcast/test_direction_updater.py index 6dcc638be..64060e07a 100644 --- a/tests/acquisition/covidcast/test_direction_updater.py +++ b/tests/acquisition/covidcast/test_direction_updater.py @@ -110,7 +110,7 @@ def test_update_loop(self): ) self.assertEqual(call_args_list[1][0], expected_args) - self.assertTrue(mock_database.update_timeseries_timestamp2.called) - args = mock_database.update_timeseries_timestamp2.call_args[0] + self.assertTrue(mock_database.update_timeseries_direction_updated_timestamp.called) + args = mock_database.update_timeseries_direction_updated_timestamp.call_args[0] expected_args = ('source', 'signal', 'day', 'geo_type', 'geo_value') self.assertEqual(args, expected_args) From 47880d1696de9c753518433eccd3d65ac6f82bb7 Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Tue, 4 Aug 2020 09:47:14 -0400 Subject: [PATCH 11/15] Change signal type in temp tables change signal type from VARCHAR(32) to VARCHAR(64) in temporary tables. 
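
To make the rationale concrete: the temporary staging tables created in
database.py should declare `signal` at least as wide as the `signal` column
of the main `covidcast` table (widened to VARCHAR(64) later in this series),
or long signal names are truncated (or rejected in strict SQL mode) during
staging, before they ever reach `covidcast`. A minimal runnable sketch of
the invariant follows; `tmp_staging` is a hypothetical name used only for
illustration, not the actual table name generated by the code.

    # sketch only: the staging DDL mirrors the widened `signal` column
    TMP_TABLE_DDL = '''
      CREATE TEMPORARY TABLE `tmp_staging` (
        `source` varchar(32) NOT NULL,
        `signal` varchar(64) NOT NULL -- was varchar(32); must match `covidcast`
      ) ENGINE=InnoDB;
    '''

    signal = 'wip_really_long_name_that_will_be_accepted'  # 42 characters
    assert len(signal) <= 64  # fits varchar(64); would overflow varchar(32)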
--- src/acquisition/covidcast/database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 2fe7c7502..99f0d3c25 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -106,7 +106,7 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False create_tmp_table_sql = f''' CREATE TEMPORARY TABLE `{tmp_table_name}` ( `source` varchar(32) NOT NULL, - `signal` varchar(32) NOT NULL, + `signal` varchar(64) NOT NULL, `time_type` varchar(12) NOT NULL, `geo_type` varchar(12) NOT NULL, `time_value` int(11) NOT NULL, @@ -314,7 +314,7 @@ def get_all_record_values_of_timeseries_with_potentially_stale_direction(self, t CREATE TEMPORARY TABLE `{temporary_table}` ( `id` int(11) NOT NULL, `source` varchar(32), - `signal` varchar(32), + `signal` varchar(64), `time_type` varchar(12), `geo_type` varchar(12), `geo_value` varchar(12), From d87fcb11a8eb3aad1dd3b81cabc5eb91547c8170 Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Wed, 5 Aug 2020 11:57:17 -0400 Subject: [PATCH 12/15] Removed outdated lines --- src/acquisition/covidcast/database.py | 2 -- src/acquisition/covidcast/direction_updater.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 99f0d3c25..1f181118d 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -221,8 +221,6 @@ def insert_or_update_batch(self, cc_rows, batch_size=2**20, commit_partial=False if commit_partial: self._connection.commit() except Exception as e: - print('AAA') - print(e) raise e finally: self._cursor.execute(drop_tmp_table_sql) diff --git a/src/acquisition/covidcast/direction_updater.py b/src/acquisition/covidcast/direction_updater.py index 44968a645..0f09886c9 100644 --- a/src/acquisition/covidcast/direction_updater.py +++ b/src/acquisition/covidcast/direction_updater.py @@ -50,7 +50,6 @@ class Constants: def get_argument_parser(): """Define command line arguments.""" - # there are no flags, but --help will still work parser = argparse.ArgumentParser() parser.add_argument( '--partitions', @@ -314,8 +313,6 @@ def main( except Exception as e: raise e finally: - # no catch block so that an exception above will cause the program to - # fail after the following cleanup database.disconnect(commit) print('partition %d committed=%s' % (partition_index, str(commit))) From 4e56dcb643f5c804986474258bb74cc89d19b2c0 Mon Sep 17 00:00:00 2001 From: george haff Date: Wed, 5 Aug 2020 14:41:14 -0400 Subject: [PATCH 13/15] review suggestions --- src/acquisition/covidcast/database.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index c70744681..0eefc0462 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -546,7 +546,7 @@ def get_covidcast_meta(self): meta = [] - sql = 'SELECT `source`, `signal` FROM covidcast GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;' + sql = 'SELECT `source`, `signal` FROM covidcast WHERE NOT `is_wip` GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;' self._cursor.execute(sql) for source, signal in [ss for ss in self._cursor]: #NOTE: this obfuscation protects the integrity of the cursor; using the cursor as a generator will cause contention w/ subsequent queries @@ -594,12 +594,10 
@@ def get_covidcast_meta(self): x.`max_issue` = t.`issue` AND x.`time_type` = t.`time_type` AND x.`time_value` = t.`time_value` AND + x.`source` = t.`source` AND + x.`signal` = t.`signal` AND x.`geo_type` = t.`geo_type` AND x.`geo_value` = t.`geo_value` - WHERE - NOT t.`is_wip` AND - t.`source` = %s AND - t.`signal` = %s GROUP BY t.`time_type`, t.`geo_type` @@ -607,7 +605,7 @@ def get_covidcast_meta(self): t.`time_type` ASC, t.`geo_type` ASC ''' - self._cursor.execute(sql, (source, signal, source, signal)) + self._cursor.execute(sql, (source, signal)) meta.extend(list(dict(zip(self._cursor.column_names,x)) for x in self._cursor)) return meta @@ -627,6 +625,8 @@ def update_covidcast_meta_cache(self, metadata): self._cursor.execute(sql, (epidata_json,)) def retrieve_covidcast_meta_cache(self): + """Useful for viewing cache entries (was used in debugging)""" + sql = ''' SELECT `epidata` FROM `covidcast_meta_cache` From 32b2603c3a8af2382c80090713b989ee05eb6099 Mon Sep 17 00:00:00 2001 From: "Wael H. Al Saeed" Date: Wed, 5 Aug 2020 15:03:45 -0400 Subject: [PATCH 14/15] Update test_fill_is_latest_issue.py --- .../covidcast/test_fill_is_latest_issue.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/integrations/acquisition/covidcast/test_fill_is_latest_issue.py b/integrations/acquisition/covidcast/test_fill_is_latest_issue.py index 33f77c41f..b00e1215e 100644 --- a/integrations/acquisition/covidcast/test_fill_is_latest_issue.py +++ b/integrations/acquisition/covidcast/test_fill_is_latest_issue.py @@ -53,19 +53,19 @@ def test_fill_is_latest_issue(self): self.cur.execute(''' insert into covidcast values (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', - 123, 2, 5, 5, 5, NULL, 20200228, 0, 1), + 123, 2, 5, 5, 5, NULL, 20200228, 0, 1, False), (0, 'src', 'sig', 'day', 'state', 20200228, 'ca', - 123, 2, 0, 0, 0, NULL, 20200229, 1, 1), + 123, 2, 0, 0, 0, NULL, 20200229, 1, 1, False), (0, 'src', 'sig', 'day', 'state', 20200229, 'ca', - 123, 6, 0, 0, 0, NULL, 20200301, 1, 1), + 123, 6, 0, 0, 0, NULL, 20200301, 1, 1, False), (0, 'src', 'sig', 'day', 'state', 20200229, 'ca', - 123, 6, 9, 9, 9, NULL, 20200229, 0, 1), + 123, 6, 9, 9, 9, NULL, 20200229, 0, 1, False), (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', - 123, 5, 0, 0, 0, NULL, 20200303, 2, 1), + 123, 5, 0, 0, 0, NULL, 20200303, 2, 1, False), (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', - 123, 5, 5, 5, 5, NULL, 20200302, 1, 1), + 123, 5, 5, 5, 5, NULL, 20200302, 1, 1, False), (0, 'src', 'sig', 'day', 'state', 20200301, 'ca', - 123, 5, 9, 8, 7, NULL, 20200301, 0, 1) + 123, 5, 9, 8, 7, NULL, 20200301, 0, 1, False) ''') self.cnx.commit() @@ -76,19 +76,19 @@ def test_fill_is_latest_issue(self): result = list(self.cur) expected = [ (1, 'src', 'sig', 'day', 'state', 20200228, 'ca', - 123, 2, 5, 5, 5, None, 20200228, 0, bytearray(b'0')), + 123, 2, 5, 5, 5, None, 20200228, 0, bytearray(b'0'), bytearray(b'0')), (2, 'src', 'sig', 'day', 'state', 20200228, 'ca', - 123, 2, 0, 0, 0, None, 20200229, 1, bytearray(b'1')), + 123, 2, 0, 0, 0, None, 20200229, 1, bytearray(b'1'), bytearray(b'0')), (3, 'src', 'sig', 'day', 'state', 20200229, 'ca', - 123, 6, 0, 0, 0, None, 20200301, 1, bytearray(b'1')), + 123, 6, 0, 0, 0, None, 20200301, 1, bytearray(b'1'), bytearray(b'0')), (4, 'src', 'sig', 'day', 'state', 20200229, 'ca', - 123, 6, 9, 9, 9, None, 20200229, 0, bytearray(b'0')), + 123, 6, 9, 9, 9, None, 20200229, 0, bytearray(b'0'), bytearray(b'0')), (5, 'src', 'sig', 'day', 'state', 20200301, 'ca', - 123, 5, 0, 0, 0, 
None, 20200303, 2, bytearray(b'1')),
+        123, 5, 0, 0, 0, None, 20200303, 2, bytearray(b'1'), bytearray(b'0')),
       (6, 'src', 'sig', 'day', 'state', 20200301, 'ca',
-        123, 5, 5, 5, 5, None, 20200302, 1, bytearray(b'0')),
+        123, 5, 5, 5, 5, None, 20200302, 1, bytearray(b'0'), bytearray(b'0')),
       (7, 'src', 'sig', 'day', 'state', 20200301, 'ca',
-        123, 5, 9, 8, 7, None, 20200301, 0, bytearray(b'0'))
+        123, 5, 9, 8, 7, None, 20200301, 0, bytearray(b'0'), bytearray(b'0'))
     ]
     self.assertEqual(result, expected)

From 98154ca7a6c131c7689a71a49d5a63f0f576c604 Mon Sep 17 00:00:00 2001
From: Kathryn M Mazaitis
Date: Thu, 30 Jul 2020 13:39:00 -0400
Subject: [PATCH 15/15] Increase maximum signal length for COVIDcast to 64 characters

* Database schema
* Signal name length guard in CSV uploader
* Integration test verifies signal length 32 < n <= 64 succeeds, n > 64 fails

---
 .../covidcast/test_csv_uploading.py       | 30 +++++++++++++++++--
 src/acquisition/covidcast/csv_importer.py |  4 +--
 src/ddl/covidcast.sql                     |  5 ++--
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/integrations/acquisition/covidcast/test_csv_uploading.py b/integrations/acquisition/covidcast/test_csv_uploading.py
index 0d26a5d60..641722fea 100644
--- a/integrations/acquisition/covidcast/test_csv_uploading.py
+++ b/integrations/acquisition/covidcast/test_csv_uploading.py
@@ -76,7 +76,12 @@ def test_uploading(self):
       f.write('wa,30,0.03,300\n')
 
     # invalid
-    with open(source_receiving_dir + '/20200419_state_wip_really_long_name_that_will_get_truncated.csv', 'w') as f:
+    with open(source_receiving_dir + '/20200419_state_wip_really_long_name_that_will_be_accepted.csv', 'w') as f:
+      f.write('geo_id,val,se,sample_size\n')
+      f.write('pa,100,5.4,624\n')
+
+    # invalid
+    with open(source_receiving_dir + '/20200419_state_wip_really_long_name_that_will_get_truncated_lorem_ipsum_dolor_sit_amet.csv', 'w') as f:
       f.write('geo_id,val,se,sample_size\n')
       f.write('pa,100,5.4,624\n')
 
@@ -180,9 +185,30 @@ def apply_lag(expected_epidata):
       'message': 'success',
     })
 
+    # request CSV data from the API on the signal with name length 32 < n <= 64

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ ... @@
-      if len(signal) > 32:
-        print(' invalid signal name (32 char limit)',signal)
+      if len(signal) > 64:
+        print(' invalid signal name (64 char limit)',signal)
       yield (path, None)
       continue
 
diff --git a/src/ddl/covidcast.sql b/src/ddl/covidcast.sql
index 15690fb3a..2636a85fb 100644
--- a/src/ddl/covidcast.sql
+++ b/src/ddl/covidcast.sql
@@ -8,12 +8,13 @@ Delphi's COVID-19 surveillance streams.
 
 Data is public.
 
+
 +------------------------------+-------------+------+-----+---------+----------------+
 | Field                        | Type        | Null | Key | Default | Extra          |
 +------------------------------+-------------+------+-----+---------+----------------+
 | id                           | int(11)     | NO   | PRI | NULL    | auto_increment |
 | source                       | varchar(32) | NO   | MUL | NULL    |                |
-| signal                       | varchar(32) | NO   |     | NULL    |                |
+| signal                       | varchar(64) | NO   |     | NULL    |                |
 | time_type                    | varchar(12) | NO   |     | NULL    |                |
 | geo_type                     | varchar(12) | NO   |     | NULL    |                |
 | time_value                   | int(11)     | NO   |     | NULL    |                |
@@ -87,7 +88,7 @@ Data is public.
 CREATE TABLE `covidcast` (
   `id` int(11) NOT NULL AUTO_INCREMENT,
   `source` varchar(32) NOT NULL,
-  `signal` varchar(32) NOT NULL,
+  `signal` varchar(64) NOT NULL,
   `time_type` varchar(12) NOT NULL,
   `geo_type` varchar(12) NOT NULL,
   `time_value` int(11) NOT NULL,
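
As a closing illustration of the guard introduced in PATCH 15, here is a
minimal, self-contained sketch. The real check lives in csv_importer.py and
also reports the offending file path; the helper name below is hypothetical.

    def is_valid_signal_name(signal):
      """Return True if `signal` fits the widened varchar(64) column."""
      if len(signal) > 64:
        print(' invalid signal name (64 char limit)', signal)
        return False
      return True

    # names drawn from the integration test in PATCH 15
    assert is_valid_signal_name('wip_really_long_name_that_will_be_accepted')
    assert not is_valid_signal_name(
      'wip_really_long_name_that_will_get_truncated_lorem_ipsum_dolor_sit_amet')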