
Commit 9649099

feat: add some s3 cleaning utility functions
1 parent 747da48 commit 9649099


1 file changed: +44 -1 lines changed


R/aux_data_utils.R

Lines changed: 44 additions & 1 deletion
@@ -653,7 +653,50 @@ process_nhsn_data <- function(raw_nhsn_data) {
 # for filenames of the form nhsn_data_2024-11-19_16-29-43.191649.rds
 get_version_timestamp <- function(filename) ymd_hms(str_match(filename, "[0-9]{4}-..-.._..-..-..\\.[^.^_]*"))
 
-#' all in one function to get and cache a nhsn archive from raw files
+#' Remove duplicate files from S3
+#'
+#' Removes duplicate files from S3 by keeping only the earliest-timestamped file for each ETag.
+#' You can modify keep_df if this doesn't suit your needs.
+#'
+#' @param bucket The name of the S3 bucket.
+#' @param prefix The prefix of the files to remove duplicates from.
+#' @param dry_run If TRUE (the default), only report which files would be deleted; if FALSE, delete them.
+#' @param .progress Whether to show a progress bar.
+delete_duplicates_from_s3_by_etag <- function(bucket, prefix, dry_run = TRUE, .progress = TRUE) {
+  # Get a list of all new dataset snapshots from S3
+  files_df <- aws.s3::get_bucket_df(bucket = bucket, prefix = prefix) %>% as_tibble()
+
+  # Create a list of all the files to keep by keeping the earliest timestamp file for each ETag
+  keep_df <- files_df %>%
+    group_by(ETag) %>%
+    slice_min(LastModified) %>%
+    ungroup()
+  delete_df <- files_df %>%
+    anti_join(keep_df, by = "Key")
+  if (nrow(delete_df) > 0) {
+    if (dry_run) {
+      cli::cli_alert_info("Would delete {nrow(delete_df)} files from {bucket} with prefix {prefix}")
+      print(delete_df)
+      return(invisible(delete_df))
+    } else {
+      delete_files_from_s3(bucket = bucket, keys = delete_df$Key, .progress = .progress)
+    }
+  }
+}
+
+#' Delete files from S3
+#'
+#' Faster than deleting one key at a time with aws.s3::delete_object when there are many files to delete (thousands).
+#'
+#' @param bucket The name of the S3 bucket.
+#' @param keys The keys of the files to delete, as a character vector.
+#' @param batch_size The number of files to delete in each batch.
+#' @param .progress Whether to show a progress bar.
+delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRUE) {
+  split(keys, ceiling(seq_along(keys) / batch_size)) %>%
+    purrr::walk(~ aws.s3::delete_object(bucket = bucket, object = .x), .progress = .progress)
+}
+
 #' @description
 #' This takes in all of the raw data files for the nhsn data, creates a
 #' quasi-archive (it keeps one example per version-day, rather than one per
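
For orientation, a usage sketch of the new deduplication helper (not part of the commit). The bucket name and prefix below are hypothetical, and AWS credentials are assumed to already be configured in the environment:

# Hypothetical bucket and prefix, for illustration only.
bucket <- "my-bucket"
prefix <- "nhsn_data_"

# Dry run (the default): reports which duplicate snapshots would be removed,
# prints them, and returns those rows invisibly without deleting anything.
candidates <- delete_duplicates_from_s3_by_etag(bucket, prefix)

# After checking the dry-run output, actually delete the duplicates.
delete_duplicates_from_s3_by_etag(bucket, prefix, dry_run = FALSE)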

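delete_files_from_s3 can also be called directly with an explicit key vector; a sketch with hypothetical keys, reusing the hypothetical bucket above:

# Keys are split into batches of batch_size (500 by default), and each batch
# is passed to a single aws.s3::delete_object call.
stale_keys <- c("nhsn_data_old_1.rds", "nhsn_data_old_2.rds")  # hypothetical keys
delete_files_from_s3(bucket = "my-bucket", keys = stale_keys, batch_size = 500)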