1
1
"""
2
2
Utilities for diffing and archiving covidcast export CSVs.
3
+
3
4
Aims to simplify the creation of issues for new and backfilled value for indicators.
4
5
Also handles archiving of export CSVs to some backend (git, S3 etc.) before replacing them.
5
6
@@ -52,6 +53,7 @@ def diff_export_csv(
52
53
) -> Tuple [pd .DataFrame , pd .DataFrame , pd .DataFrame ]:
53
54
"""
54
55
Find differences in exported covidcast CSVs, using geo_id as the index.
56
+
55
57
Treats NA == NA as True.
56
58
57
59
Parameters
@@ -68,7 +70,6 @@ def diff_export_csv(
68
70
changed_df is the pd.DataFrame of common rows from after_csv with changed values.
69
71
added_df is the pd.DataFrame of added rows from after_csv.
70
72
"""
71
-
72
73
export_csv_dtypes = {"geo_id" : str , "val" : float ,
73
74
"se" : float , "sample_size" : float }
74
75
@@ -99,7 +100,7 @@ def run_module(archive_type: str,
99
100
cache_dir : str ,
100
101
export_dir : str ,
101
102
** kwargs ):
102
- """Builds and runs an ArchiveDiffer.
103
+ """Build and run an ArchiveDiffer.
103
104
104
105
Parameters
105
106
----------
@@ -132,13 +133,11 @@ def run_module(archive_type: str,
132
133
133
134
134
135
class ArchiveDiffer :
135
- """
136
- Base class for performing diffing and archiving of exported covidcast CSVs
137
- """
136
+ """Base class for performing diffing and archiving of exported covidcast CSVs."""
138
137
139
138
def __init__ (self , cache_dir : str , export_dir : str ):
140
139
"""
141
- Initialize an ArchiveDiffer
140
+ Initialize an ArchiveDiffer.
142
141
143
142
Parameters
144
143
----------
@@ -157,15 +156,17 @@ def __init__(self, cache_dir: str, export_dir: str):
157
156
158
157
def update_cache (self ):
159
158
"""
160
- For making sure cache_dir is updated correctly from a backend.
159
+ Make sure cache_dir is updated correctly from a backend.
160
+
161
161
To be implemented by specific archiving backends.
162
162
Should set self._cache_updated = True after verifying cache is updated.
163
163
"""
164
164
raise NotImplementedError
165
165
166
166
def diff_exports (self ) -> Tuple [Files , FileDiffMap , Files ]:
167
167
"""
168
- Finds diffs across and within CSV files, from cache_dir to export_dir.
168
+ Find diffs across and within CSV files, from cache_dir to export_dir.
169
+
169
170
Should be called after update_cache() succeeds. Only works on *.csv files,
170
171
ignores every other file.
171
172
@@ -223,7 +224,8 @@ def diff_exports(self) -> Tuple[Files, FileDiffMap, Files]:
223
224
224
225
def archive_exports (self , exported_files : Files ) -> Tuple [Files , Files ]:
225
226
"""
226
- Handles actual archiving of files, depending on specific backend.
227
+ Handle actual archiving of files, depending on specific backend.
228
+
227
229
To be implemented by specific archiving backends.
228
230
229
231
Parameters
@@ -241,6 +243,8 @@ def archive_exports(self, exported_files: Files) -> Tuple[Files, Files]:
241
243
242
244
def filter_exports (self , common_diffs : FileDiffMap ):
243
245
"""
246
+ Filter export directory to only contain relevant files.
247
+
244
248
Filters down the export_dir to only contain:
245
249
1) New files, 2) Changed files, filtered-down to the ADDED and CHANGED rows only.
246
250
Should be called after archive_exports() so we archive the raw exports before
@@ -269,7 +273,7 @@ def filter_exports(self, common_diffs: FileDiffMap):
269
273
replace (diff_file , exported_file )
270
274
271
275
def run (self ):
272
- """Runs the differ and archives the changed and new files."""
276
+ """Run the differ and archive the changed and new files."""
273
277
self .update_cache ()
274
278
275
279
# Diff exports, and make incremental versions
@@ -293,7 +297,8 @@ def run(self):
293
297
294
298
class S3ArchiveDiffer (ArchiveDiffer ):
295
299
"""
296
- AWS S3 backend for archving
300
+ AWS S3 backend for archiving.
301
+
297
302
Archives CSV files into a S3 bucket, with keys "{indicator_prefix}/{csv_file_name}".
298
303
Ideally, versioning should be enabled in this bucket to track versions of each CSV file.
299
304
"""
@@ -306,6 +311,7 @@ def __init__(
306
311
):
307
312
"""
308
313
Initialize a S3ArchiveDiffer.
314
+
309
315
See this link for possible aws_credentials kwargs:
310
316
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session
311
317
@@ -330,9 +336,7 @@ def __init__(
330
336
self .indicator_prefix = indicator_prefix
331
337
332
338
def update_cache (self ):
333
- """
334
- For making sure cache_dir is updated with all latest files from the S3 bucket.
335
- """
339
+ """Make sure cache_dir is updated with all latest files from the S3 bucket."""
336
340
# List all indicator-related objects from S3
337
341
archive_objects = self .bucket .objects .filter (
338
342
Prefix = self .indicator_prefix ).all ()
@@ -358,7 +362,7 @@ def archive_exports(self, # pylint: disable=arguments-differ
358
362
update_s3 : bool = True
359
363
) -> Tuple [Files , Files ]:
360
364
"""
361
- Handles actual archiving of files to the S3 bucket.
365
+ Handle actual archiving of files to the S3 bucket.
362
366
363
367
Parameters
364
368
----------
@@ -398,7 +402,8 @@ def archive_exports(self, # pylint: disable=arguments-differ
398
402
399
403
class GitArchiveDiffer (ArchiveDiffer ):
400
404
"""
401
- Local git repo backend for archiving
405
+ Local git repo backend for archiving.
406
+
402
407
Archives CSV files into a local git repo as commits.
403
408
Assumes that a git repository is already set up.
404
409
"""
@@ -446,7 +451,8 @@ def __init__(
446
451
447
452
def get_branch (self , branch_name : Optional [str ] = None ) -> Head :
448
453
"""
449
- Retrieves a Head object representing a branch of specified name.
454
+ Retrieve a Head object representing a branch of specified name.
455
+
450
456
Creates the branch from the current active branch if does not exist yet.
451
457
452
458
Parameters
@@ -469,6 +475,8 @@ def get_branch(self, branch_name: Optional[str] = None) -> Head:
469
475
@contextmanager
470
476
def archiving_branch (self ):
471
477
"""
478
+ Context manager for checking out a branch.
479
+
472
480
Useful for checking out self.branch within a context, then switching back
473
481
to original branch when finished.
474
482
"""
@@ -482,8 +490,9 @@ def archiving_branch(self):
482
490
483
491
def update_cache (self ):
484
492
"""
493
+ Check if cache_dir is clean: has everything nicely committed if override_dirty=False.
494
+
485
495
Since we are using a local git repo, assumes there is nothing to update from.
486
- Checks if cache_dir is clean: has everything nice committed if override_dirty=False
487
496
"""
488
497
# Make sure cache directory is clean: has everything nicely committed
489
498
if not self .override_dirty :
@@ -495,14 +504,16 @@ def update_cache(self):
495
504
496
505
def diff_exports (self ) -> Tuple [Files , FileDiffMap , Files ]:
497
506
"""
498
- Same as base class diff_exports, but in context of specified branch
507
+ Find diffs across and within CSV files, from cache_dir to export_dir.
508
+
509
+ Same as base class diff_exports, but in context of specified branch.
499
510
"""
500
511
with self .archiving_branch ():
501
512
return super ().diff_exports ()
502
513
503
514
def archive_exports (self , exported_files : Files ) -> Tuple [Files , Files ]:
504
515
"""
505
- Handles actual archiving of files to the local git repo.
516
+ Handle actual archiving of files to the local git repo.
506
517
507
518
Parameters
508
519
----------
0 commit comments