diff --git a/.Rbuildignore b/.Rbuildignore
index 7007d2e22..aeec9a2cb 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -9,3 +9,4 @@
 ^docs$
 ^pkgdown$
 ^musings$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
index f51ff8a26..e4c939dd4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -42,3 +42,6 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.2.0
+Depends: 
+    R (>= 2.10)
+LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
index b632f8922..e3d6d3365 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -20,6 +20,7 @@ export(df_mat_mul)
 export(epi_keys)
 export(epi_recipe)
 export(get_precision)
+export(get_test_data)
 export(grab_names)
 export(knn_iteraive_ar_args_list)
 export(knn_iteraive_ar_forecaster)
diff --git a/R/data.R b/R/data.R
new file mode 100644
index 000000000..9c93506a3
--- /dev/null
+++ b/R/data.R
@@ -0,0 +1,35 @@
+#' Subset of JHU daily state cases and deaths
+#'
+#' This data source of confirmed COVID-19 cases and deaths
+#' is based on reports made available by the Center for
+#' Systems Science and Engineering at Johns Hopkins University.
+#' This example data ranges from Dec 31, 2020 to Dec 31, 2021,
+#' and includes all states.
+#'
+#' @format A tibble with 20,496 rows and 4 variables:
+#' \describe{
+#'   \item{geo_value}{the geographic value associated with each row
+#'       of measurements.}
+#'   \item{time_value}{the time value associated with each row of measurements.}
+#'   \item{case_rate}{7-day average signal of number of new
+#'       confirmed COVID-19 cases per 100,000 population, daily}
+#'   \item{death_rate}{7-day average signal of number of new confirmed
+#'       deaths due to COVID-19 per 100,000 population, daily}
+#' }
+#' @source This object contains a modified part of the
+#'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
+#'   as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
+#'   This data set is licensed under the terms of the
+#'   \href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+#'   by the Johns Hopkins University on behalf of its Center for Systems Science
+#'   in Engineering. Copyright Johns Hopkins University 2020.
+#'
+#' Modifications:
+#' * \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
+#'   These signals are taken directly from the JHU CSSE
+#'   \href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
+#'   without changes. The 7-day average signals are computed by Delphi by
+#'   calculating moving averages of the preceding 7 days, so the signal for
+#'   June 7 is the average of the underlying data for June 1 through 7,
+#'   inclusive.
+"case_death_rate_subset"
diff --git a/R/get_test_data.R b/R/get_test_data.R
new file mode 100644
index 000000000..69ec92d1f
--- /dev/null
+++ b/R/get_test_data.R
@@ -0,0 +1,56 @@
+#' Get test data for prediction based on longest lag period
+#'
+#' Based on the longest lag period in the recipe,
+#' `get_test_data()` creates a tibble in [epiprocess::epi_df]
+#' format with columns `geo_value`, `time_value`
+#' and other variables in the original dataset,
+#' which will be used to create test data.
+#'
+#' @param recipe A recipe object. The step will be added to the
+#'  sequence of operations for this recipe.
+#' @param x A data frame, tibble, or epi_df data set.
+#'
+#' @return A tibble with columns `geo_value`, `time_value`
+#' and other variables in the original dataset.
+#' @examples
+#' # create recipe
+#'  rec <- epi_recipe(case_death_rate_subset) %>%
+#'   step_epi_ahead(death_rate, ahead = 7) %>%
+#'   step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+#'   step_epi_lag(case_rate, lag = c(0, 7, 14))
+#'  get_test_data(recipe = rec, x = case_death_rate_subset)
+#' @export
+
+get_test_data <- function(recipe, x){
+  # TO-DO: SOME CHECKS OF THE DATASET
+  if (any(!(c('geo_value','time_value') %in% colnames(x)))) {
+    rlang::abort("`geo_value`, `time_value` does not exist in data")
+  }
+  ## CHECK if it is epi_df?
+
+
+  # initialize vector to hold max lags for each variable
+  max_lags <- c()
+  for(i in c(1:length(recipe$steps))){
+    if("lag" %in% names(recipe$steps[[i]])){
+      max_lags <- append(max_lags, max(recipe$steps[[i]]$lag))
+    }
+  }
+
+  # CHECK: Return NA if insufficient training data
+  if (dplyr::n_distinct(x$time_value)< max(max_lags)) {
+    stop("insufficient training data")
+  }
+
+  test_data <- x %>%
+    dplyr::filter(
+      dplyr::if_any(
+        .cols = recipe$term_info$variable[which(recipe$var_info$role == 'raw')],
+        .fns = ~ !is.na(.x)
+      )
+    ) %>%
+    dplyr::group_by(geo_value) %>%
+    dplyr::slice_tail(n = max(max_lags) + 1)
+
+  return(test_data)
+}
diff --git a/data-raw/case_death_rate_subset.R b/data-raw/case_death_rate_subset.R
new file mode 100644
index 000000000..11f552f4d
--- /dev/null
+++ b/data-raw/case_death_rate_subset.R
@@ -0,0 +1,32 @@
+library(tidyverse)
+library(covidcast)
+library(delphi.epidata)
+library(epiprocess)
+
+x <- covidcast(
+  data_source = "jhu-csse",
+  signals = "confirmed_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20201231, 20211231),
+  geo_values = "*"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, case_rate = value)
+
+y <- covidcast(
+  data_source = "jhu-csse",
+  signals = "deaths_7dav_incidence_prop",
+  time_type = "day",
+  geo_type = "state",
+  time_values = epirange(20201231, 20211231),
+  geo_values = "*"
+) %>%
+  fetch_tbl() %>%
+  select(geo_value, time_value, death_rate = value)
+
+case_death_rate_subset <- x %>%
+  full_join(y, by = c("geo_value", "time_value")) %>%
+  as_epi_df()
+
+usethis::use_data(case_death_rate_subset, overwrite = TRUE)
diff --git a/data/case_death_rate_subset.rda b/data/case_death_rate_subset.rda
new file mode 100644
index 000000000..2e5ced29e
Binary files /dev/null and b/data/case_death_rate_subset.rda differ
diff --git a/man/case_death_rate_subset.Rd b/man/case_death_rate_subset.Rd
new file mode 100644
index 000000000..119c8ee26
--- /dev/null
+++ b/man/case_death_rate_subset.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data.R
+\docType{data}
+\name{case_death_rate_subset}
+\alias{case_death_rate_subset}
+\title{Subset of JHU daily state cases and deaths}
+\format{
+A tibble with 20,496 rows and 4 variables:
+\describe{
+\item{geo_value}{the geographic value associated with each row
+of measurements.}
+\item{time_value}{the time value associated with each row of measurements.}
+\item{case_rate}{7-day average signal of number of new
+confirmed COVID-19 cases per 100,000 population, daily}
+\item{death_rate}{7-day average signal of number of new confirmed
+deaths due to COVID-19 per 100,000 population, daily}
+}
+}
+\source{
+This object contains a modified part of the
+\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University}
+as \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{republished in the COVIDcast Epidata API}.
+This data set is licensed under the terms of the
+\href{https://creativecommons.org/licenses/by/4.0/}{Creative Commons Attribution 4.0 International license}
+by the Johns Hopkins University on behalf of its Center for Systems Science
+in Engineering. Copyright Johns Hopkins University 2020.
+
+Modifications:
+\itemize{
+\item \href{https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/jhu-csse.html}{From the COVIDcast Epidata API}:
+These signals are taken directly from the JHU CSSE
+\href{https://github.com/CSSEGISandData/COVID-19}{COVID-19 GitHub repository}
+without changes. The 7-day average signals are computed by Delphi by
+calculating moving averages of the preceding 7 days, so the signal for
+June 7 is the average of the underlying data for June 1 through 7,
+inclusive.
+}
+}
+\usage{
+case_death_rate_subset
+}
+\description{
+This data source of confirmed COVID-19 cases and deaths
+is based on reports made available by the Center for
+Systems Science and Engineering at Johns Hopkins University.
+This example data ranges from Dec 31, 2020 to Dec 31, 2021,
+and includes all states.
+}
+\keyword{datasets}
diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd
new file mode 100644
index 000000000..b71bad8f6
--- /dev/null
+++ b/man/get_test_data.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_test_data.R
+\name{get_test_data}
+\alias{get_test_data}
+\title{Get test data for prediction based on longest lag period}
+\usage{
+get_test_data(recipe, x)
+}
+\arguments{
+\item{recipe}{A recipe object. The step will be added to the
+sequence of operations for this recipe.}
+
+\item{x}{A data frame, tibble, or epi_df data set.}
+}
+\value{
+A tibble with columns \code{geo_value}, \code{time_value}
+and other variables in the original dataset.
+}
+\description{
+Based on the longest lag period in the recipe,
+\code{get_test_data()} creates a tibble in \link[epiprocess:epi_df]{epiprocess::epi_df}
+format with columns \code{geo_value}, \code{time_value}
+and other variables in the original dataset,
+which will be used to create test data.
+}
+\examples{
+# create recipe
+ rec <- epi_recipe(case_death_rate_subset) \%>\%
+  step_epi_ahead(death_rate, ahead = 7) \%>\%
+  step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\%
+  step_epi_lag(case_rate, lag = c(0, 7, 14))
+ get_test_data(recipe = rec, x = case_death_rate_subset)
+}
diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R
new file mode 100644
index 000000000..e063523a1
--- /dev/null
+++ b/tests/testthat/test-get_test_data.R
@@ -0,0 +1,37 @@
+test_that("return expected number of rows", {
+  r <- epi_recipe(case_death_rate_subset) %>%
+    step_epi_ahead(death_rate, ahead = 7) %>%
+    step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>%
+    step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  test <- get_test_data(recipe = r, x = case_death_rate_subset)
+
+  expect_equal(nrow(test),
+               dplyr::n_distinct(case_death_rate_subset$geo_value)* 29)
+})
+
+
+test_that("expect insufficient training data error", {
+  r <- epi_recipe(case_death_rate_subset) %>%
+    step_epi_ahead(death_rate, ahead = 7) %>%
+    step_epi_lag(death_rate, lag = c(0, 367)) %>%
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  expect_error(get_test_data(recipe = r, x = case_death_rate_subset))
+})
+
+test_that("expect error that geo_value or time_value does not exist", {
+  r <-  epi_recipe(case_death_rate_subset) %>%
+    step_epi_ahead(death_rate, ahead = 7) %>%
+    step_epi_lag(death_rate, lag = c(0, 7, 14)) %>%
+    step_epi_lag(case_rate, lag = c(0, 7, 14)) %>%
+    step_naomit(all_predictors()) %>%
+    step_naomit(all_outcomes(), skip = TRUE)
+
+  wrong_epi_df <- case_death_rate_subset %>% dplyr::select(-geo_value)
+
+  expect_error(get_test_data(recipe = r, x = wrong_epi_df))
+})