@@ -653,7 +653,50 @@ process_nhsn_data <- function(raw_nhsn_data) {
# for filenames of the form nhsn_data_2024-11-19_16-29-43.191649.rds
get_version_timestamp <- function(filename) ymd_hms(str_match(filename, "[0-9]{4}-..-.._..-..-..\\.[^.^_]*"))

- #' all in one function to get and cache a nhsn archive from raw files
+ #' Remove duplicate files from S3
+ #'
+ #' Removes duplicate files from S3, keeping only the earliest-timestamped file for each ETag.
+ #' You can modify keep_df if this doesn't suit your needs.
+ #'
+ #' @param bucket The name of the S3 bucket.
+ #' @param prefix The prefix of the files to deduplicate.
+ #' @param dry_run If TRUE (the default), only report which files would be deleted; if FALSE, actually delete them.
+ #' @param .progress Whether to show a progress bar.
+ delete_duplicates_from_s3_by_etag <- function(bucket, prefix, dry_run = TRUE, .progress = TRUE) {
+   # Get a list of all dataset snapshots in the bucket under this prefix
+   files_df <- aws.s3::get_bucket_df(bucket = bucket, prefix = prefix) %>% as_tibble()
+
+   # Keep only the earliest-timestamped file for each ETag
+   keep_df <- files_df %>%
+     group_by(ETag) %>%
+     slice_min(LastModified) %>%
+     ungroup()
+   delete_df <- files_df %>%
+     anti_join(keep_df, by = "Key")
+   if (nrow(delete_df) > 0) {
+     if (dry_run) {
+       cli::cli_alert_info("Would delete {nrow(delete_df)} files from {bucket} with prefix {prefix}")
+       print(delete_df)
+       return(invisible(delete_df))
+     } else {
+       delete_files_from_s3(bucket = bucket, keys = delete_df$Key, .progress = .progress)
+     }
+   }
+ }
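+
+ # Illustrative usage sketch; the bucket name and prefix here are hypothetical,
+ # not from this change. Run once with the default dry_run = TRUE to preview
+ # the would-be deletions, then re-run with dry_run = FALSE to delete them.
+ #
+ # dupes <- delete_duplicates_from_s3_by_etag(
+ #   bucket = "example-bucket",
+ #   prefix = "nhsn_data_",
+ #   dry_run = TRUE
+ # )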
+
+ #' Delete files from S3
+ #'
+ #' Faster than deleting files one at a time with aws.s3::delete_object when there
+ #' are many files to delete (thousands).
+ #'
+ #' @param bucket The name of the S3 bucket.
+ #' @param keys The keys of the files to delete, as a character vector.
+ #' @param batch_size The number of files to delete in each batch.
+ #' @param .progress Whether to show a progress bar.
+ delete_files_from_s3 <- function(bucket, keys, batch_size = 500, .progress = TRUE) {
+   split(keys, ceiling(seq_along(keys) / batch_size)) %>%
+     purrr::walk(~ aws.s3::delete_object(bucket = bucket, object = .x), .progress = .progress)
+ }
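+
+ # Usage sketch, assuming aws.s3::delete_object accepts a character vector of
+ # keys and issues one multi-object delete per batch; S3 caps that API at 1000
+ # keys per request, so batch_size = 500 stays well under the limit.
+ #
+ # delete_files_from_s3("example-bucket", keys = dupes$Key)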
+
#' @description
#' This takes in all of the raw data files for the nhsn data, creates a
#' quasi-archive (it keeps one example per version-day, rather than one per