County analysis

First, we’ll look at counties.

Getting data

We fetch DSEW CPR vaccination/booster rates from the past week and total number of people vaccinated/boosted from locally-generated files.

files <- list.files("~/Documents/covidcast-indicators/dsew_community_profile/receiving", full.names = TRUE)
filenames <- list.files("~/Documents/covidcast-indicators/dsew_community_profile/receiving", full.names = FALSE)
names(files) <- filenames
dfs <- list()
for (index in seq_along(files)) {
  file_path <- files[index]
  filename <- names(file_path)
  filename_parts <- str_split(filename, "_")[[1]]
  time_value = filename_parts[1]
  
  signal_name <- paste(filename_parts[3:length(filename_parts)], collapse="_")
  signal_name <- substr(signal_name, 1, nchar(signal_name)-4)
  
  geo_level <- filename_parts[2]
  
  df <- read_csv(file_path, col_types = cols(geo_id = col_character())) %>%
    mutate(time_value = as.Date(time_value, "%Y%m%d")) %>%
    rename(value = val, geo_value = geo_id)
  
  if ("se" %in% names(df)) {
    df <- select(df, -se)
  }
  if ("sample_size" %in% names(df)) {
    df <- select(df, -sample_size)
  }
  df$stderr <- NA_integer_
  df$sample_size <- NA_integer_
  
  signal_type <- paste(geo_level, signal_name)
  
  if (signal_type %in% names(dfs)) {
    dfs[[signal_type]] <- bind_rows(df, dfs[[signal_type]])
  } else {
    dfs[[signal_type]] <- df
  }
}
for (signal_type in names(dfs)) {
  signal_name <- str_split(signal_type, " ")[[1]][2]
  geo_type <- str_split(signal_type, " ")[[1]][1]
  
  dfs[[signal_type]] <- covidcast::as.covidcast_signal(dfs[[signal_type]], signal = signal_name, geo_type = geo_type, data_source = "local")
}
# Fetch the following sources and signals from local files


sources = rep("local", length(names(dfs)))
signals = names(dfs)
names = lapply(names(dfs), function(name) {str_split(name, " ")[[1]][2]} ) %>% unlist()

# Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day 
# trailing average)
start_day = "2021-11-01"
end_day = NULL
df_cases = covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
                            start_day, end_day)

Correlations sliced by time

Here we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by time. That is, for each day, we compute the correlation between each signal and COVID-19 case incidence rates, over all
counties (with at least 500 cumulative cases).

signals_by_geo <- signals[startsWith(signals, "county")]
names_by_geo <- names[startsWith(signals, "county")]

# Consider only counties with at least 500 cumulative cases
case_num = 500
geo_values = bind_rows(lapply(signals_by_geo, function(signame) { dfs[[signame]] })) %>%
  filter(value >= case_num) %>% distinct(geo_value) %>% pull()
df_cor = vector("list", length(signals_by_geo))
for (i in 1:length(signals_by_geo)) {
  signame <- signals_by_geo[i]
  df_cor[[i]] = covidcast_cor(dfs[[signame]] %>% 
                                filter(geo_value %in% geo_values), 
                              df_cases %>% 
                                filter(geo_value %in% geo_values), 
                              by = "time_value", method = "spearman")
  df_cor[[i]]$signal = names_by_geo[i]
}
df = do.call(rbind, df_cor)
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over all counties with at least %i cumulative cases",
                          case_num), x = "Date", y = "Correlation") +
  theme(legend.position = "bottom", legend.title = element_blank())

Correlations sliced by county

Now we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by county. That is, for each county (with at least 500 cumulative cases), we compute the correlation between each signal and COVID-19 case incidence rates, over all time.

df_cor = vector("list", length(signals_by_geo))
for (i in 1:length(signals_by_geo)) {
  signame <- signals_by_geo[i]
  df_cor[[i]] = covidcast_cor(dfs[[signame]] %>% 
                                filter(geo_value %in% geo_values), 
                              df_cases %>% 
                                filter(geo_value %in% geo_values), 
                              by = "geo_value", method = "spearman")
  df_cor[[i]]$signal = names_by_geo[i]
}
df = do.call(rbind, df_cor)
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over all counties with at least %i cumulative cases",
                          case_num), x = "Date", y = "Correlation") +
  theme(legend.position = "bottom", legend.title = element_blank())

We can also look at choropleth maps to get a geographic sense of the correlation distribution for each signal.

# Set some fields, then plot choropleth maps using covidcast functionality
for (i in 1:length(signals_by_geo)) {
  df_cor[[i]]$time_value = start_day
  df_cor[[i]]$issue = start_day
  df_cor[[i]]$data_source = "local"
  attributes(df_cor[[i]])$metadata$geo_type = "county"
  class(df_cor[[i]]) = c("covidcast_signal", "data.frame")
  
  print(plot(df_cor[[i]], range = c(-1, 1), choro_col = cm.colors(10),
             title = sprintf("Correlations for %s", names_by_geo[i])))
}

Metro area analysis

Next, we’ll look at metro areas.

Getting data

We fetch reference data from our API, from April 15 through to the current day.

# Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day
# trailing average)
df_cases = covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
                            start_day, end_day, geo_type = "msa")

Correlations sliced by time

Here we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by time. That is, for each day, we compute the correlation between each signal and COVID-19 case incidence rates, over all metro areas (with at least 500 cumulative cases).

signals_by_geo <- signals[startsWith(signals, "msa")]
names_by_geo <- names[startsWith(signals, "msa")]

# Consider only counties with at least 500 cumulative cases
case_num = 500
geo_values = bind_rows(lapply(signals_by_geo, function(signame) { dfs[[signame]] })) %>%
  filter(value >= case_num) %>% distinct(geo_value) %>% pull()
df_cor = vector("list", length(signals_by_geo))
for (i in 1:length(signals_by_geo)) {
  signame <- signals_by_geo[i]
  df_cor[[i]] = covidcast_cor(dfs[[signame]] %>% 
                                filter(geo_value %in% geo_values), 
                              df_cases %>% 
                                filter(geo_value %in% geo_values), 
                              by = "time_value", method = "spearman")
  df_cor[[i]]$signal = names_by_geo[i]
}
df = do.call(rbind, df_cor)
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over metro areas with at least %i cumulative cases",
                          case_num), x = "Date", y = "Correlation") +
  theme(legend.position = "bottom", legend.title = element_blank())

Correlations sliced by metro area

Now we look at Spearman (rank) correlations between our signals and COVID-19 case incidence rates, sliced by metro area. That is, for each metro area (with at least 500 cumulative cases), we compute the correlation between each signal and COVID-19 case incidence rates, over all time.

df_cor = vector("list", length(signals_by_geo))
for (i in 1:length(signals_by_geo)) {
  signame <- signals_by_geo[i]
  df_cor[[i]] = covidcast_cor(dfs[[signame]] %>% 
                                filter(geo_value %in% geo_values), 
                              df_cases %>% 
                                filter(geo_value %in% geo_values), 
                              by = "geo_value", method = "spearman")
  df_cor[[i]]$signal = names_by_geo[i]
}
df = do.call(rbind, df_cor)
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over metro areas with at least %i cumulative cases",
                          case_num), x = "Date", y = "Correlation") +
  theme(legend.position = "bottom", legend.title = element_blank())

We can also look at choropleth maps to get a geographic sense of the correlation distribution for each signal.

# Set some fields, then plot choropleth maps using covidcast functionality
for (i in 1:length(signals_by_geo)) {
  df_cor[[i]]$time_value = start_day
  df_cor[[i]]$issue = start_day
  df_cor[[i]]$data_source = "local"
  attributes(df_cor[[i]])$metadata$geo_type = "msa"
  class(df_cor[[i]]) = c("covidcast_signal", "data.frame")
  
  print(plot(df_cor[[i]], range = c(-1, 1), choro_col = cm.colors(10),
             title = sprintf("Correlations for %s", names_by_geo[i])))
}

State analysis

# Fetch USAFacts confirmed case incidence proportion (smoothed with 7-day
# trailing average)
df_cases = covidcast_signal("usa-facts", "confirmed_7dav_incidence_prop",
                            start_day, end_day, geo_type = "state")

Correlations sliced by time

signals_by_geo <- signals[startsWith(signals, "state")]
names_by_geo <- names[startsWith(signals, "state")]

# Consider only counties with at least 500 cumulative cases
case_num = 500
geo_values = bind_rows(lapply(signals_by_geo, function(signame) { dfs[[signame]] })) %>%
  filter(value >= case_num) %>% distinct(geo_value) %>% pull()
df_cor = vector("list", length(signals_by_geo))
for (i in 1:length(signals_by_geo)) {
  signame <- signals_by_geo[i]
  df_cor[[i]] = covidcast_cor(dfs[[signame]] %>% 
                                filter(geo_value %in% geo_values), 
                              df_cases %>% 
                                filter(geo_value %in% geo_values), 
                              by = "time_value", method = "spearman")
  df_cor[[i]]$signal = names_by_geo[i]
}
df = do.call(rbind, df_cor)
ggplot(df, aes(x = time_value, y = value)) +
  geom_line(aes(color = signal)) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over states with at least %i cumulative cases",
                          case_num), x = "Date", y = "Correlation") +
  theme(legend.position = "bottom", legend.title = element_blank())

Correlations sliced by metro area

df_cor = vector("list", length(signals_by_geo))
for (i in 1:length(signals_by_geo)) {
  signame <- signals_by_geo[i]
  df_cor[[i]] = covidcast_cor(dfs[[signame]] %>% 
                                filter(geo_value %in% geo_values), 
                              df_cases %>% 
                                filter(geo_value %in% geo_values), 
                              by = "geo_value", method = "spearman")
  df_cor[[i]]$signal = names_by_geo[i]
}
df = do.call(rbind, df_cor)
ggplot(df, aes(value)) +
  geom_density(aes(color = signal, fill = signal), alpha = 0.4) +
  guides(color = guide_legend(nrow = 2)) +
  labs(title = "Correlation between signals and case rates",
       subtitle = sprintf("Over states with at least %i cumulative cases",
                          case_num), x = "Date", y = "Correlation") +
  theme(legend.position = "bottom", legend.title = element_blank())

We can also look at choropleth maps to get a geographic sense of the correlation distribution for each signal.

# Set some fields, then plot choropleth maps using covidcast functionality
for (i in 1:length(signals_by_geo)) {
  df_cor[[i]]$time_value = start_day
  df_cor[[i]]$issue = start_day
  df_cor[[i]]$data_source = "local"
  attributes(df_cor[[i]])$metadata$geo_type = "state"
  class(df_cor[[i]]) = c("covidcast_signal", "data.frame")
  
  print(plot(df_cor[[i]], range = c(-1, 1), choro_col = cm.colors(10),
             title = sprintf("Correlations for %s", names_by_geo[i])))
}