Skip to content

Commit 09cd133

Browse files
authored
Merge pull request #1628 from cmu-delphi/release/indicators_v0.3.14_utils_v0.3.4
Release covidcast-indicators 0.3.14
2 parents 9060a48 + 769ee8e commit 09cd133

32 files changed

+30371
-128
lines changed

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 0.3.13
2+
current_version = 0.3.14
33
commit = True
44
message = chore: bump covidcast-indicators to {new_version}
55
tag = False

ansible/templates/facebook-params-prod.json.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
"start_date": "2021-08-16",
5858
"static_dir": "./static",
5959
"weights_in_dir": "./fb-incoming",
60+
"weekly_weights_in_dir": "./fb-incoming-weekly",
6061
"weights_out_dir": "./fb-outgoing",
6162
"experimental_weights_out_dir": "./exp-fb-outgoing"
6263
}

dsew_community_profile/delphi_dsew_community_profile/pull.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ def as_cached_filename(params, config):
408408
def fetch_listing(params):
409409
"""Generate the list of report files to process."""
410410
export_start_date = params['indicator'].get(
411-
'export_start_date', datetime.datetime.fromtimestamp(0).date()
411+
'export_start_date', datetime.datetime.utcfromtimestamp(0).date()
412412
)
413413

414414
listing = requests.get(DOWNLOAD_LISTING).json()['metadata']['attachments']

dsew_community_profile/tests/test_pull.py

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def _set_df_dtypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame:
3939
df[k] = df[k].astype(v)
4040
return df
4141

42+
4243
class TestPull:
4344
def test_DatasetTimes(self):
4445
examples = [
@@ -158,31 +159,35 @@ def test_Dataset_parse_sheet(self):
158159
# TODO
159160
pass
160161

161-
@patch('requests.get')
162-
@patch('os.path.exists')
163-
def test_fetch_listing(self, mock_listing, mock_exists):
162+
def test_fetch_listing(self):
164163
inst = namedtuple("attachment", "assetId filename publish cache")
165164
instances = list(chain(*[
166165
[
167-
inst(f"{i}", f"2021010{i}.xlsx", date(2021, 1, i), f"{i}---2021010{i}.xlsx"),
168-
inst(f"p{i}", f"2021010{i}.pdf", date(2021, 1, i), f"p{i}---2021010{i}.pdf"),
166+
inst(f"{i}", f"2021010{i}.xlsx", date(2021, 1, i), f"2021010{i}--{i}.xlsx"),
167+
inst(f"p{i}", f"2021010{i}.pdf", date(2021, 1, i), f"2021010{i}--p{i}.pdf"),
169168
]
170169
for i in [1, 2, 3, 4, 5]
171170
]))
172171

173-
mock_listing.return_value = Mock()
174-
mock_listing.return_value.json = Mock(
175-
return_value = {
176-
'metadata': {
177-
'attachments': [
178-
{"assetId": i.assetId, "filename": i.filename}
179-
for i in instances
180-
]
181-
}
182-
}
183-
)
184-
185-
mock_exists.reset_mock(return_value=False)
172+
# Solution from https://stackoverflow.com/questions/15753390/
173+
#how-can-i-mock-requests-and-the-response
174+
def mocked_requests_get(*args, **kwargs):
175+
class MockResponse:
176+
def __init__(self, json_data):
177+
self.json_data = json_data
178+
179+
def json(self):
180+
return self.json_data
181+
182+
return MockResponse({
183+
'metadata': {
184+
'attachments': [
185+
{"assetId": i.assetId, "filename": i.filename}
186+
for i in instances
187+
]
188+
}
189+
}
190+
)
186191

187192
def as_listing(instance):
188193
return {
@@ -192,15 +197,20 @@ def as_listing(instance):
192197
"publish_date": instance.publish
193198
}
194199
ex = example(
195-
{'indicator':{'reports':'new'}},
200+
{'indicator':{'reports':'new', 'input_cache':''}},
196201
[
197202
as_listing(instance)
198203
for i, instance in filter(lambda x: x[0]%2 == 0, enumerate(instances))
199204
]
200205
)
201206

202-
for actual, expected in zip(fetch_listing(ex.given), ex.expected):
203-
assert actual == expected
207+
with patch('requests.get', side_effect=mocked_requests_get):
208+
with patch('os.path.exists', return_value=False):
209+
for actual, expected in zip(fetch_listing(ex.given), ex.expected):
210+
assert actual == expected
211+
212+
with patch('os.path.exists', return_value=True):
213+
assert fetch_listing(ex.given) == []
204214

205215
def test_nation_from_state(self):
206216
geomapper = GeoMapper()

facebook/Makefile

Lines changed: 78 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@ TODAY:=$(shell date +"%Y-%m-%d")
55
YESTERDAY:=$(shell date --date "$(TODAY) -1 day" +"%Y-%m-%d")
66
ONEWEEK:=$(shell date --date "$(TODAY) -7 day" +"%Y-%m-%d")
77
THREEWEEK:=$(shell date --date "$(TODAY) -21 day" +"%Y-%m-%d")
8+
LAST_SATURDAY:=$(shell date -d "last Saturday" +"%Y-%m-%d")
9+
LAST_SUNDAY:=$(shell date -d "$(LAST_SATURDAY) -6 day" +"%Y-%m-%d")
10+
TUESDAY:=$(shell date -d "$(LAST_SATURDAY) +3 day" +"%Y-%m-%d")
811

912
MESSAGES:="messages/$(TODAY).messages"
1013

1114
PYTHON:=env/bin/python
1215
QUALTRICS=$(shell $(PYTHON) -m delphi_utils get input_dir)
1316
WEIGHTS=$(shell $(PYTHON) -m delphi_utils get weights_in_dir)
17+
WEEKLY_WEIGHTS=$(shell $(PYTHON) -m delphi_utils get weekly_weights_in_dir)
1418
CIDS=$(shell $(PYTHON) -m delphi_utils get weights_out_dir)
1519
CIDS_EXP=$(shell $(PYTHON) -m delphi_utils get experimental_weights_out_dir)
1620
INDIVIDUAL=$(shell $(PYTHON) -m delphi_utils get individual_dir)
@@ -24,6 +28,7 @@ DELPHI_SURVEY_EMAIL_USER=$(shell $(PYTHON) -m delphi_utils get delphi_survey_ema
2428
SFTP_OPTIONS=$(shell $(PYTHON) -m delphi_utils get sftp_options)
2529

2630
MAX_WEIGHTED=ls -1 $(WEIGHTS) | grep dap | tail -1 | sed 's/_.*//;s/-//g;'
31+
MAX_WEEKLY_WEIGHTED=ls -1 $(WEEKLY_WEIGHTS) | grep map | tail -1 | sed 's/_.*//;s/-//g;'
2732

2833
ANTIJOIN:="antijoin.cids.sorted.txt"
2934
ANTIJOIN_EXP:="antijoin.experimental.cids.sorted.txt"
@@ -42,24 +47,35 @@ else
4247
SFTP_POST:=sshpass -p $(DELPHI_SURVEY_SFTP_PASSWORD) sftp $(SFTP_OPTIONS) -b <(echo -e "$${BATCH}") -P 2222 $(DELPHI_SURVEY_SFTP_USER)
4348
endif
4449

50+
ifneq ("$(wildcard params.json)","")
51+
ifeq ($(WEIGHTS),$(WEEKLY_WEIGHTS))
52+
$(error "'weights_in_dir' and 'weekly_weights_in_dir' must be different.")
53+
endif
54+
endif
55+
4556
default:
4657
@echo No default implemented yet
4758

4859
scratch:
4960
mkdir scratch
5061
rm -rf scratch/*
5162

52-
tidy: receiving
53-
rm -rf tidy/$(RECEIVING)
54-
rm -rf tidy/$(INDIVIDUAL)
55-
rm -f tidy/params.json
56-
mkdir -p tidy tidy/$(RECEIVING) tidy/$(INDIVIDUAL)
57-
cp params.json tidy/
58-
mv $(RECEIVING)/*.csv tidy/$(RECEIVING)
59-
mv $(INDIVIDUAL)/*.csv* tidy/$(INDIVIDUAL)
60-
mv $(INDIVIDUAL_RACEETH)/*.csv* tidy/$(INDIVIDUAL_RACEETH)
61-
tar -czf scratch/tidy-`date +"%Y-%m-%d-%H%M%S"`.tgz --exclude='tidy-*.tgz' tidy
62-
mv scratch/*.tgz tidy/
63+
$(INDIVIDUAL) $(INDIVIDUAL_RACEETH):
64+
mkdir $@
65+
66+
tidy_%: receiving
67+
rm -rf $@/$(RECEIVING)
68+
rm -rf $@/$(INDIVIDUAL)
69+
rm -rf $@/$(INDIVIDUAL_RACEETH)
70+
rm -f $@/params.json
71+
mkdir -p $@ $@/$(RECEIVING) $@/$(INDIVIDUAL) $@/$(INDIVIDUAL_RACEETH)
72+
cp params.json $@/
73+
# Check for _any_ matching files using https://stackoverflow.com/a/6364244/14401472
74+
if compgen -G "$(RECEIVING)/*.csv" > /dev/null; then mv $(RECEIVING)/*.csv $@/$(RECEIVING); fi
75+
mv $(INDIVIDUAL)/*.csv* $@/$(INDIVIDUAL)
76+
mv $(INDIVIDUAL_RACEETH)/*.csv* $@/$(INDIVIDUAL_RACEETH)
77+
tar -czf scratch/$@-`date +"%Y-%m-%d-%H%M%S"`.tgz --exclude='tidy*-*.tgz' --exclude='*.done' $@
78+
mv scratch/*.tgz $@/
6379

6480
clean:
6581
rm -f $(RECEIVING)/*.csv $(INDIVIDUAL)/*.csv $(INDIVIDUAL_RACEETH)/*.csv $(CIDS)/*.csv $(CIDS_EXP)/*.csv
@@ -114,6 +130,17 @@ params.json: $(TODAY)
114130
output cids,individual,covidalert,archive,community \
115131
start_date $(YESTERDAY)
116132

133+
params.weekly-weights.json: $(TODAY)
134+
PAT=`grep fb-survey params.json | awk 'BEGIN{FS="\""}{print $$2}' | sed 's/ /_/g;s/^/-e /'`; \
135+
$(PYTHON) -m delphi_utils set \
136+
debug false \
137+
produce_individual_raceeth true \
138+
end_date $(LAST_SATURDAY) \
139+
input <(find $(QUALTRICS) -maxdepth 1 -newer $< -type f -name "*.csv" | sort | grep $${PAT} | tr '\n' ',' | sed 's_$(QUALTRICS)/__g;s/,$$//' ) \
140+
parallel true \
141+
output individual \
142+
start_date $(LAST_SUNDAY)
143+
117144
$(WEIGHTS): $(TODAY)
118145
[ -f $(WEIGHTS) ] || mkdir -p $(WEIGHTS)
119146
cd "$(WEIGHTS)"; \
@@ -133,6 +160,40 @@ $(WEIGHTS): $(TODAY)
133160
echo "WARNING: $${MSG}" >> $(MESSAGES); \
134161
fi
135162

163+
$(WEEKLY_WEIGHTS): $(TODAY)
164+
# This runs every day as a dependency of `pipeline`. A pipeline run is triggered when new weekly weights files are available.
165+
[ -f $(WEEKLY_WEIGHTS) ] || mkdir -p $(WEEKLY_WEIGHTS)
166+
cd "$(WEEKLY_WEIGHTS)"; \
167+
BATCH="cd fb-interchange/cmu_respondent_ww_weights\nls -1"; \
168+
NEW=`LC_ALL=C comm -23 <(sshpass -p $(DELPHI_SURVEY_SFTP_PASSWORD) sftp $(SFTP_OPTIONS) -b <(echo -e "$${BATCH}") -P 2222 $(DELPHI_SURVEY_SFTP_USER) | grep "^202" | LC_ALL=C sort) <(ls -1 | LC_ALL=C sort)`; \
169+
echo "New weekly weights files:"; \
170+
echo $${NEW}; \
171+
for f in $${NEW}; do \
172+
BATCH="$${BATCH}\nget $$f"; \
173+
done; \
174+
sshpass -p $(DELPHI_SURVEY_SFTP_PASSWORD) sftp $(SFTP_OPTIONS) -b <(echo -e "$${BATCH}") -P 2222 $(DELPHI_SURVEY_SFTP_USER) || exit 90; \
175+
cd -; \
176+
touch -d $(YESTERDAY) $(WEEKLY_WEIGHTS); \
177+
EXPECTED_WEEKLY_WEIGHTED=`date --date='$(LAST_SUNDAY)' +'%Y%m%d'`; \
178+
MIN_NEW_WEEKLY_WEIGHTED=`grep map <<< $${NEW} | head -1 | sed 's/_.*//;s/-//g;'`; \
179+
if [[ `wc -w <<< $${NEW}` -gt 0 ]] && [[ $$MIN_NEW_WEEKLY_WEIGHTED -ne $$EXPECTED_WEEKLY_WEIGHTED ]]; then \
180+
MSG="Expected new weekly weights files to start on: $$EXPECTED_WEEKLY_WEIGHTED; Actual new files starts on: $$MIN_NEW_WEEKLY_WEIGHTED"; \
181+
echo "WARNING: $${MSG}" >> $(MESSAGES); \
182+
fi; \
183+
MAX_WEEKLY_WEIGHTED=`$(MAX_WEEKLY_WEIGHTED)`; \
184+
if [[ `date --date='$(TODAY)' +'%Y%m%d'` -gt `date --date='$(TUESDAY)' +'%Y%m%d'` ]] && [[ $$MAX_WEEKLY_WEIGHTED -lt $$EXPECTED_WEEKLY_WEIGHTED ]]; then \
185+
MSG="Weekly weights are old; Expected most recent weekly weights file to start on: $$EXPECTED_WEEKLY_WEIGHTED; Actual most recent file starts on: $$MAX_WEEKLY_WEIGHTED"; \
186+
echo "WARNING: $${MSG}" >> $(MESSAGES); \
187+
fi; \
188+
if [[ ! -f tidy_weekly/$(LAST_SUNDAY)-weekly-weights.done ]] && [[ $$MAX_WEEKLY_WEIGHTED -eq $$EXPECTED_WEEKLY_WEIGHTED ]]; then \
189+
if [ -f params.json ]; then cp params.json params.daily.json; fi; \
190+
$(MAKE) weekly-weights-pipeline; \
191+
if [ -f params.daily.json ]; then \
192+
cp params.daily.json params.json; \
193+
rm -f params.daily.json; \
194+
fi; \
195+
fi
196+
136197
dev: delphiFacebook_1.0.tar.gz
137198
R CMD INSTALL delphiFacebook_1.0.tar.gz
138199

@@ -146,13 +207,18 @@ run-R: $(CIDS) $(CIDS_EXP)
146207
grep "scheduled core" tmp ; \
147208
[ "$$?" -eq 1 ]
148209

149-
pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-experimental-cids post-individual post-individual-raceeth post-done tidy
210+
pipeline: scratch init-qualtrics params.json $(WEIGHTS) run-R post-cids post-experimental-cids post-individual post-individual-raceeth post-done tidy_daily $(WEEKLY_WEIGHTS)
150211
grep $(TODAY) params.json
151212
[ -f $(YESTERDAY) ] && rm $(YESTERDAY) || true
152213
touch $@
153214
echo "SUCCESS: $(DRY_MESSAGE)pipeline complete" >> $(MESSAGES)
154215
chmod o+w $(MESSAGES)
155216

217+
weekly-weights-pipeline: scratch init-qualtrics params.weekly-weights.json run-R post-individual post-individual-raceeth tidy_weekly
218+
touch $@
219+
echo "SUCCESS: $(DRY_MESSAGE)completed weekly weights pipeline" >> $(MESSAGES)
220+
touch tidy_weekly/$(LAST_SUNDAY)-weekly-weights.done
221+
156222
coverage:
157223
Rscript -e 'covr::package_coverage("delphiFacebook")'
158224

facebook/delphiFacebook/NAMESPACE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export(add_weights)
34
export(apply_privacy_censoring)
45
export(assert)
56
export(ceiling_epiweek)
@@ -21,7 +22,6 @@ export(get_range_prev_full_period)
2122
export(get_range_prev_full_week)
2223
export(get_sparse_filenames)
2324
export(jeffreys_se)
24-
export(join_weights)
2525
export(load_archive)
2626
export(load_response_one)
2727
export(load_responses_all)

facebook/delphiFacebook/R/contingency_run.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ run_contingency_tables_one_period <- function(params, aggregations)
144144
return()
145145
}
146146

147-
data_agg <- join_weights(data_agg, params, weights = "full")$df
147+
data_agg <- add_weights(data_agg, params, weights = "full")$df
148148
msg_df("response data to aggregate", data_agg)
149149

150150
produce_aggregates(data_agg, aggregations, cw_list, params)

facebook/delphiFacebook/R/responses.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -779,10 +779,10 @@ filter_complete_responses <- function(data_full, params)
779779
data_full <- select(data_full, -.data$zip5)
780780

781781
# 9 includes StartDatetime, EndDatetime, Date, token, wave, geo_id,
782-
# UserLanguage + two questions (ignore raceethnicity, module, and
783-
# w12_assignment fields which may or may not exist, depending on params and
782+
# UserLanguage + two questions (ignore raceethnicity, module,
783+
# w12_assignment, and weekly weights fields which may or may not exist, depending on params and
784784
# survey version)
785-
ignore_cols <- c("raceethnicity", "w12_assignment", "module")
785+
ignore_cols <- c("raceethnicity", "w12_assignment", "module", "weight_wf", "weight_wp")
786786
valid_row_filter <- rowSums( !is.na(data_full[, !(names(data_full) %in% ignore_cols)]) ) >= 9
787787
data_full <- data_full[valid_row_filter, ]
788788

facebook/delphiFacebook/R/run.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ run_facebook <- function(params)
2727
# create data that will be aggregated for covidcast
2828
data_agg <- create_data_for_aggregation(input_data)
2929
data_agg <- filter_data_for_aggregation(data_agg, params, lead_days = 12)
30-
weight_result <- join_weights(data_agg, params, weights = "step1")
30+
weight_result <- add_weights(data_agg, params, weights = "step1")
3131
data_agg <- weight_result$df
3232
latest_weight_date_step1 <- weight_result$weight_date
3333
msg_df("response data to aggregate", data_agg)
@@ -36,10 +36,10 @@ run_facebook <- function(params)
3636
is.na(latest_weight_date_step1), as.Date(params$end_date), latest_weight_date_step1
3737
)
3838

39-
# create "complete" data that will be shared with research partners
39+
# create "complete" data (microdata) that will be shared with research partners
4040
data_full <- create_complete_responses(input_data, cw_list$county, params)
4141
data_full <- filter_complete_responses(data_full, params)
42-
data_full <- join_weights(data_full, params, weights = "full")$df
42+
data_full <- add_weights(data_full, params, weights = "full", add_weekly_weights = TRUE)$df
4343
msg_df("full data to share with research partners", data_full)
4444

4545
# create module-complete data used to create CID lists separately for each module

0 commit comments

Comments (0)