
Commit 5248a97

Merge branch 'dev' into ndefries/flusurv-new-endpoint
2 parents: 205c061 + c9ffdcd

28 files changed: +288 -182 lines

.bumpversion.cfg

+1 -1

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 4.1.13
+current_version = 4.1.15
 commit = False
 tag = False

.env.example

+3

@@ -4,3 +4,6 @@ FLASK_SECRET=abc
 #API_KEY_REQUIRED_STARTING_AT=2021-07-30
 API_KEY_ADMIN_PASSWORD=abc
 API_KEY_REGISTER_WEBHOOK_TOKEN=abc
+
+# Sentry
+# If setting a Sentry DSN, note that the URL should NOT be quoted!
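As a concrete illustration of that warning, a local `.env` entry might look like the sketch below. Both the `SENTRY_DSN` variable name and the DSN value are placeholders, not values from this commit (though `SENTRY_DSN` is the environment variable sentry-sdk reads by default):

    # placeholder DSN -- get the real one from the Sentry project's keys config.
    # Note the value is NOT quoted.
    SENTRY_DSN=https://examplePublicKey@o0.ingest.sentry.io/0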

dev/local/Makefile

+3

@@ -77,6 +77,7 @@ LOG_REDIS:=delphi_redis_instance_$(NOW).log
 WEB_CONTAINER_ID:=$(shell docker ps -q --filter 'name=delphi_web_epidata')
 DATABASE_CONTAINER_ID:=$(shell docker ps -q --filter 'name=delphi_database_epidata')
 REDIS_CONTAINER_ID:=$(shell docker ps -q --filter 'name=delphi_redis')
+ENV_FILE:=repos/delphi/delphi-epidata/.env

 M1=
 ifeq ($(shell uname -smp), Darwin arm64 arm)
@@ -104,8 +105,10 @@ web:
 	@# Run the web server
 	@# MODULE_NAME specifies the location of the `app` variable, the actual WSGI application object to run.
 	@# see https://github.com/tiangolo/meinheld-gunicorn-docker#module_name
+	@touch $(ENV_FILE)
 	@docker run --rm -p 127.0.0.1:10080:80 \
 		$(M1) \
+		--env-file $(ENV_FILE) \
 		--env "MODULE_NAME=delphi.epidata.server.main" \
 		--env "SQLALCHEMY_DATABASE_URI=$(sqlalchemy_uri)" \
 		--env "FLASK_SECRET=abc" --env "FLASK_PREFIX=/epidata" --env "LOG_DEBUG" \

dev/local/setup.cfg

+1 -1

@@ -1,6 +1,6 @@
 [metadata]
 name = Delphi Development
-version = 4.1.13
+version = 4.1.15

 [options]
 packages =

devops/Dockerfile

+1 -3

@@ -7,7 +7,6 @@ FROM tiangolo/meinheld-gunicorn:python3.8
 LABEL org.opencontainers.image.source=https://github.com/cmu-delphi/delphi-epidata

 COPY ./devops/gunicorn_conf.py /app
-COPY ./devops/start_wrapper.sh /
 RUN mkdir -p /app/delphi/epidata
 COPY ./src/server /app/delphi/epidata/server
 COPY ./src/common /app/delphi/epidata/common
@@ -18,7 +17,6 @@ COPY requirements.api.txt /app/requirements_also.txt
 RUN ln -s -f /usr/share/zoneinfo/America/New_York /etc/localtime \
     && rm -rf /app/delphi/epidata/__pycache__ \
     && chmod -R o+r /app/delphi/epidata \
-    && chmod 755 /start_wrapper.sh \
    && pip install --no-cache-dir -r /tmp/requirements.txt -r requirements_also.txt
 # the file /tmp/requirements.txt is created in the parent docker definition. (see:
 # https://github.com/tiangolo/meinheld-gunicorn-docker/blob/master/docker-images/python3.8.dockerfile#L5 )
@@ -28,4 +26,4 @@ RUN ln -s -f /usr/share/zoneinfo/America/New_York /etc/localtime \
 ENV PYTHONUNBUFFERED 1

 ENTRYPOINT [ "/entrypoint.sh" ]
-CMD [ "/start_wrapper.sh" ]
+CMD [ "/start.sh" ]

devops/start_wrapper.sh

-10
This file was deleted.

docs/epidata_development.md

+10

@@ -388,3 +388,13 @@ The command above maps two local directories into the container:
 - `/repos/delphi/delphi-epidata/src`: Just the source code, which forms the
   container's `delphi.epidata` python package.

+## instrumentation with Sentry
+
+Delphi uses [Sentry](https://sentry.io/welcome/) in production for debugging, APM, and other observability purposes. You can instrument your local environment if you want to take advantage of Sentry's features during the development process. In most cases this option is available to internal Delphi team members only.
+
+The bare minimum to set up instrumentation is to supply the DSN for the [epidata-api](https://cmu-delphi.sentry.io/projects/epidata-api/?project=4506123377442816) Sentry project to the application environment.
+
+- You can get the DSN from the Sentry [project's keys config](https://cmu-delphi.sentry.io/settings/projects/epidata-api/keys/), or by asking someone in the prodsys, DevOps, or sysadmin space.
+- Once you have the DSN, add it to your local `.env` file and rebuild your containers to start sending telemetry to Sentry.
+
+Additional internal documentation for Sentry can be found [here](https://bookstack.delphi.cmu.edu/books/systems-handbook/page/sentry).
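The server code that initializes Sentry is not part of this diff, but a minimal sketch of what wiring the DSN into a Flask app can look like is shown below. It assumes the `sentry-sdk[flask]` extra added in `requirements.api.txt` and a `SENTRY_DSN` environment variable (sentry-sdk also reads `SENTRY_DSN` automatically when `init()` is called without a `dsn` argument):

    import os

    import sentry_sdk
    from flask import Flask
    from sentry_sdk.integrations.flask import FlaskIntegration

    # Only initialize Sentry when a DSN is supplied, so local runs
    # without a DSN stay un-instrumented.
    dsn = os.environ.get("SENTRY_DSN")
    if dsn:
        sentry_sdk.init(
            dsn=dsn,
            integrations=[FlaskIntegration()],
            traces_sample_rate=1.0,  # sample all transactions for APM while developing
        )

    app = Flask(__name__)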

docs/symptom-survey/publications.md

+4

@@ -26,6 +26,10 @@ Pandemic"](https://www.pnas.org/topic/548) in *PNAS*:

 Research publications using the survey data include:

+- E. Tuzhilina, T. J. Hastie, D. J. McDonald, J. K. Tay & R. Tibshirani (2023).
+  [Smooth multi-period forecasting with application to prediction of COVID-19
+  cases](https://doi.org/10.1080/10618600.2023.2285337). *Journal of Computational
+  and Graphical Statistics*.
 - W. Dempsey (2023). [Addressing selection bias and measurement error in
   COVID-19 case count data using auxiliary information](https://doi.org/10.1214/23-AOAS1744).
   *Annals of Applied Statistics* 17 (4), 2903-2923.

integrations/acquisition/covid_hosp/state_daily/test_scenarios.py

+114 -54

@@ -47,62 +47,122 @@ def setUp(self):
     cur.execute('delete from api_user')
     cur.execute('insert into api_user(api_key, email) values("key", "email")')

-  @freeze_time("2021-03-16")
-  def test_acquire_dataset(self):
-    """Acquire a new dataset."""
+  def get_modified_dataset(self, critical_staffing_shortage_today_yes, reporting_cutoff_start):
+    """Get a simplified version of a test dataset.

-    # make sure the data does not yet exist
-    with self.subTest(name='no data yet'):
-      response = Epidata.covid_hosp('MA', Epidata.range(20200101, 20210101))
-      self.assertEqual(response['result'], -2, response)
+    Only WY data is modified. The issue date is specified in the metadata file.
+    """
+    df = self.test_utils.load_sample_dataset()
+    df_new = pd.DataFrame(df[df["state"] == "WY"], columns=df.columns).reset_index(drop=True)
+    df_new["critical_staffing_shortage_today_yes"] = critical_staffing_shortage_today_yes
+    df_new["reporting_cutoff_start"] = reporting_cutoff_start
+    return df_new

-    # acquire sample data into local database
-    # mock out network calls to external hosts
-    with self.subTest(name='first acquisition'), \
-      patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
-      patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset("dataset0.csv"),  # dataset for 3/13
-                                                          self.test_utils.load_sample_dataset("dataset0.csv"),  # first dataset for 3/15
-                                                          self.test_utils.load_sample_dataset()]  # second dataset for 3/15
-      ) as mock_fetch:
-      acquired = Update.run()
-      self.assertTrue(acquired)
-      self.assertEqual(mock_fetch_meta.call_count, 1)
-
-    # make sure the data now exists
-    with self.subTest(name='initial data checks'):
-      response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
-      self.assertEqual(response['result'], 1)
-      self.assertEqual(len(response['epidata']), 1)
-      row = response['epidata'][0]
-      self.assertEqual(row['state'], 'WY')
-      self.assertEqual(row['date'], 20201209)
-      self.assertEqual(row['issue'], 20210315)
-      self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
-      self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
-      actual = row['inpatient_bed_covid_utilization']
-      expected = 0.11729857819905214
-      self.assertAlmostEqual(actual, expected)
-      self.assertIsNone(row['critical_staffing_shortage_today_no'])
-
-      # expect 61 fields per row (63 database columns, except `id` and `record_type`)
-      self.assertEqual(len(row), 118)
-
-    with self.subTest(name='all date batches acquired'):
-      response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210313)
-      self.assertEqual(response['result'], 1)
-
-    # re-acquisition of the same dataset should be a no-op
-    with self.subTest(name='second acquisition'), \
-      patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
-      patch.object(Network, 'fetch_dataset', return_value=self.test_utils.load_sample_dataset()) as mock_fetch:
-      acquired = Update.run()
-      self.assertFalse(acquired)
+  def test_acquire_dataset(self):
+    """Acquire a new dataset."""

-    # make sure the data still exists
-    with self.subTest(name='final data checks'):
-      response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
-      self.assertEqual(response['result'], 1)
-      self.assertEqual(len(response['epidata']), 1)
+    with freeze_time("2021-03-15"):
+      # make sure the data does not yet exist
+      with self.subTest(name='no data yet'):
+        response = Epidata.covid_hosp('MA', Epidata.range(20200101, 20210101))
+        self.assertEqual(response['result'], -2, response)
+
+      # acquire sample data into local database
+      # mock out network calls to external hosts
+      # issues: 3/13, 3/15
+      with self.subTest(name='first acquisition'), \
+        patch.object(Network, 'fetch_metadata',
+                     return_value=self.test_utils.load_sample_metadata("metadata.csv")) as mock_fetch_meta, \
+        patch.object(Network, 'fetch_dataset', side_effect=[
+          self.test_utils.load_sample_dataset(),
+          self.test_utils.load_sample_dataset()
+        ]) as mock_fetch:
+        acquired = Update.run()
+        self.assertTrue(acquired)
+        self.assertEqual(mock_fetch_meta.call_count, 1)
+
+      # make sure the data now exists
+      with self.subTest(name='initial data checks'):
+        response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
+        self.assertEqual(response['result'], 1)
+        self.assertEqual(len(response['epidata']), 1)
+        row = response['epidata'][0]
+        self.assertEqual(row['state'], 'WY')
+        self.assertEqual(row['date'], 20201209)
+        self.assertEqual(row['issue'], 20210315)  # include today's data by default
+        self.assertEqual(row['critical_staffing_shortage_today_yes'], 8)
+        self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
+        self.assertIsNone(row['critical_staffing_shortage_today_no'])
+
+        # expect 61 fields per row (63 database columns, except `id` and `record_type`)
+        self.assertEqual(len(row), 118)
+
+      with self.subTest(name='all date batches acquired'):
+        response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210313)
+        self.assertEqual(response['result'], 1)
+
+      # re-acquisition of the same dataset should be a no-op
+      # issues: 3/13, 3/15
+      with self.subTest(name='second acquisition'), \
+        patch.object(Network, 'fetch_metadata',
+                     return_value=self.test_utils.load_sample_metadata("metadata.csv")) as mock_fetch_meta, \
+        patch.object(Network, 'fetch_dataset', side_effect=[
+          self.test_utils.load_sample_dataset(),
+          self.test_utils.load_sample_dataset()
+        ]) as mock_fetch:
+        acquired = Update.run()
+        self.assertFalse(acquired)
+
+        # make sure the data still exists
+        response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
+        self.assertEqual(response['result'], 1)
+        self.assertEqual(len(response['epidata']), 1)
+
+    with freeze_time("2021-03-16"):
+      # simulate issue posted after yesterday's run
+      with self.subTest(name='late issue posted'), \
+        patch.object(Network, 'fetch_metadata',
+                     return_value=self.test_utils.load_sample_metadata("metadata2.csv")) as mock_fetch_meta, \
+        patch.object(Network, 'fetch_dataset', side_effect=[
+          self.get_modified_dataset(critical_staffing_shortage_today_yes=9, reporting_cutoff_start="2020-12-09"),
+          self.get_modified_dataset(critical_staffing_shortage_today_yes=10, reporting_cutoff_start="2020-12-09"),
+          self.get_modified_dataset(critical_staffing_shortage_today_yes=11, reporting_cutoff_start="2020-12-10"),
+          self.get_modified_dataset(critical_staffing_shortage_today_yes=12, reporting_cutoff_start="2020-12-10"),
+        ]) as mock_fetch:
+        acquired = Update.run()
+        self.assertTrue(acquired)
+        self.assertEqual(mock_fetch_meta.call_count, 1)
+
+      # make sure everything was filed correctly
+      with self.subTest(name='late issue data checks'):
+        response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101))
+        self.assertEqual(response['result'], 1)
+        self.assertEqual(len(response['epidata']), 2)
+
+        # should have data from 03-15 00:00:01AM
+        row = response['epidata'][0]
+        self.assertEqual(row['state'], 'WY')
+        self.assertEqual(row['date'], 20201209)
+        self.assertEqual(row['issue'], 20210315)  # include today's data by default
+        self.assertEqual(row['critical_staffing_shortage_today_yes'], 10)
+        self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
+        self.assertIsNone(row['critical_staffing_shortage_today_no'])
+
+        # should have data from 03-16 00:00:01AM
+        row = response['epidata'][1]
+        self.assertEqual(row['state'], 'WY')
+        self.assertEqual(row['date'], 20201210)
+        self.assertEqual(row['issue'], 20210316)  # include today's data by default
+        self.assertEqual(row['critical_staffing_shortage_today_yes'], 12)
+        self.assertEqual(row['total_patients_hospitalized_confirmed_influenza_covid_coverage'], 56)
+        self.assertIsNone(row['critical_staffing_shortage_today_no'])
+
+        # expect 61 fields per row (63 database columns, except `id` and `record_type`)
+        self.assertEqual(len(row), 118)
+
+      with self.subTest(name='all date batches acquired'):
+        response = Epidata.covid_hosp('WY', Epidata.range(20200101, 20210101), issues=20210316)
+        self.assertEqual(response['result'], 1)


   @freeze_time("2021-03-16")
@@ -121,7 +181,7 @@ def test_acquire_specific_issue(self):
     self.assertEqual(pre_max_issue, pd.Timestamp('1900-01-01 00:00:00'))
     with self.subTest(name='first acquisition'), \
       patch.object(Network, 'fetch_metadata', return_value=self.test_utils.load_sample_metadata()) as mock_fetch_meta, \
-      patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset("dataset0.csv")]
+      patch.object(Network, 'fetch_dataset', side_effect=[self.test_utils.load_sample_dataset()]
      ) as mock_fetch:
       acquired = Utils.update_dataset(Database,
                                       Network,
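The restructured test leans on freezegun's `freeze_time` as a context manager rather than a method-level decorator, so a single test can observe two different "todays". A minimal standalone sketch of that pattern (not the repo's test code):

    from datetime import date

    from freezegun import freeze_time

    # freeze_time pins the clock for everything inside the block, letting one
    # test simulate acquisition runs on consecutive days.
    with freeze_time("2021-03-15"):
        assert date.today() == date(2021, 3, 15)  # "yesterday's" acquisition run

    with freeze_time("2021-03-16"):
        assert date.today() == date(2021, 3, 16)  # a late issue arrives the next day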

requirements.api.txt

+2 -2

@@ -5,16 +5,16 @@ Flask-Limiter==3.3.0
 jinja2==3.0.3
 more_itertools==8.4.0
 mysqlclient==2.1.1
-newrelic
 orjson==3.4.7
 pandas==1.2.3
 python-dotenv==0.15.0
 pyyaml
 redis==3.5.3
 requests==2.31.0
 scipy==1.10.0
+sentry-sdk[flask]
 SQLAlchemy==1.4.40
 structlog==22.1.0
 tenacity==7.0.0
 typing-extensions
-werkzeug==2.2.3
+werkzeug==2.3.8

requirements.dev.txt

+1 -1

@@ -1,4 +1,4 @@
-aiohttp==3.8.5
+aiohttp==3.9.0
 black>=20.8b1
 bump2version==1.0.1
 covidcast==0.1.5

src/acquisition/covid_hosp/common/database.py

+13 -6

@@ -184,15 +184,16 @@ def nan_safe_dtype(dtype, value):
     for csv_name in self.key_columns:
       dataframe.loc[:, csv_name] = dataframe[csv_name].map(self.columns_and_types[csv_name].dtype)

-    num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields)
-    value_placeholders = ', '.join(['%s'] * num_columns)
-    columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields)
-    sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \
-          f'VALUES ({value_placeholders})'
+    col_names = [f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields]
+    value_placeholders = ', '.join(['%s'] * (2 + len(col_names)))  # extra 2 for `id` and `self.publication_col_name` cols
+    columnstring = ', '.join(col_names)
+    sql = f'REPLACE INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columnstring}) VALUES ({value_placeholders})'
     id_and_publication_date = (0, publication_date)
+    num_values = len(dataframe.index)
     if logger:
-      logger.info('updating values', count=len(dataframe.index))
+      logger.info('updating values', count=num_values)
     n = 0
+    rows_affected = 0
     many_values = []
     with self.new_cursor() as cursor:
       for index, row in dataframe.iterrows():
@@ -208,6 +209,7 @@ def nan_safe_dtype(dtype, value):
         if n % 5_000 == 0:
           try:
             cursor.executemany(sql, many_values)
+            rows_affected += cursor.rowcount
             many_values = []
           except Exception as e:
             if logger:
@@ -216,6 +218,11 @@ def nan_safe_dtype(dtype, value):
     # insert final batch
     if many_values:
       cursor.executemany(sql, many_values)
+      rows_affected += cursor.rowcount
+    if logger:
+      # NOTE: REPLACE INTO marks 2 rows affected for a "replace" (one for a delete and one for a re-insert)
+      # which allows us to count rows which were updated
+      logger.info('rows affected', total=rows_affected, updated=rows_affected-num_values)

     # deal with non/seldomly updated columns used like a fk table (if this database needs it)
     if hasattr(self, 'AGGREGATE_KEY_COLS'):
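To make the accounting in that NOTE concrete: MySQL reports 1 affected row for a fresh insert via `REPLACE INTO` and 2 for a replacement (the delete plus the re-insert), so subtracting the batch size from the total recovers the number of replaced rows. A small worked example with hypothetical numbers (not taken from the repo's tests):

    # Hypothetical batch of 5,000 rows written with REPLACE INTO.
    num_values = 5_000     # rows sent in the batch
    rows_affected = 5_800  # hypothetical total reported by cursor.rowcount

    # Each fresh insert contributes 1 and each replacement contributes 2, so
    #   rows_affected = inserts + 2 * replacements, with inserts + replacements = num_values.
    replacements = rows_affected - num_values
    inserts = num_values - replacements
    print(inserts, replacements)  # -> 4200 800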
