13
13
"""Utilities to support workflow."""
14
14
from __future__ import absolute_import
15
15
16
+ from pathlib import Path
16
17
from typing import List , Sequence , Union
17
18
import hashlib
19
+ from _hashlib import HASH as Hash
18
20
from urllib .parse import unquote , urlparse
19
21
20
22
from sagemaker .workflow .entities import (
23
25
)
24
26
from sagemaker .workflow .step_collections import StepCollection
25
27
28
BUF_SIZE = 64 * 1024  # number of bytes read per chunk when hashing files (64 KiB)
29
+
26
30
27
31
def list_to_request (entities : Sequence [Union [Entity , StepCollection ]]) -> List [RequestType ]:
28
32
"""Get the request structure for list of entities.
@@ -49,15 +53,80 @@ def hash_file(path: str) -> str:
49
53
Returns:
50
54
str: The MD5 hash of the file.
51
55
"""
52
- BUF_SIZE = 65536 # read in 64KiB chunks
56
+ return _hash_file (path , hashlib .md5 ()).hexdigest ()
57
+
58
+
59
def hash_files_or_dirs(paths: List[str]) -> str:
    """Compute a single MD5 hex digest over a list of files and/or directories.

    The paths are processed in sorted order, so the order of the input list
    does not influence the result. The digest changes when:
        * the set of input paths changes
        * files or nested directories are added to any input directory
        * a nested file or directory of an input directory is renamed
        * the content of any file changes

    Args:
        paths: List of file or directory paths.
    Returns:
        str: The MD5 hex digest of the given files and directories.
    """
    ordered = list(paths)
    ordered.sort()

    digest = hashlib.md5()
    for entry in ordered:
        # Each entry feeds its path and its contents into the running digest.
        digest = _hash_file_or_dir(entry, digest)
    return digest.hexdigest()
77
+
78
+
79
def _hash_file_or_dir(path: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with a path and the contents found at that path.

    The path string itself is fed into the hash first, followed by the
    contents of the file or directory it points to. A path that is neither an
    existing file nor an existing directory contributes only its path string.

    Args:
        path: Path of a file or directory. A string starting with ``file://``
            is converted to a local filesystem path before it is hashed.
        md5: Hash object to update.
    Returns:
        Hash: The updated hash object.
    """
    if isinstance(path, str) and path.lower().startswith("file://"):
        path = unquote(urlparse(path).path)
    # str() leaves string input unchanged (so existing digests are stable)
    # while also accepting Path objects, which have no ``encode`` method.
    md5.update(str(path).encode())
    target = Path(path)
    if target.is_dir():
        md5 = _hash_dir(path, md5)
    elif target.is_file():
        md5 = _hash_file(path, md5)
    return md5
95
+
96
+
97
def _hash_dir(directory: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with the names and contents of a directory tree.

    Entries are visited in sorted order. For every entry its name is hashed;
    files additionally contribute their contents and sub-directories are
    hashed recursively. Entries that are neither a file nor a directory
    contribute only their name.

    Args:
        directory: Path of the directory.
        md5: Hash object to update.
    Returns:
        Hash: The updated hash object.
    Raises:
        NotADirectoryError: If ``directory`` is not an existing directory.
    """
    root = Path(directory)
    # Explicit check instead of ``assert``: asserts are stripped under ``-O``.
    if not root.is_dir():
        raise NotADirectoryError("Not a directory: {}".format(directory))
    for entry in sorted(root.iterdir()):
        md5.update(entry.name.encode())
        if entry.is_file():
            md5 = _hash_file(entry, md5)
        elif entry.is_dir():
            md5 = _hash_dir(entry, md5)
    return md5
113
+
114
+
115
def _hash_file(file: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with the contents of a file.

    Args:
        file: Path of the file. A string starting with ``file://`` is
            converted to a local filesystem path first.
        md5: Hash object to update.
    Returns:
        Hash: The updated hash object.
    Raises:
        FileNotFoundError: If ``file`` does not point to an existing regular file.
    """
    if isinstance(file, str) and file.lower().startswith("file://"):
        file = unquote(urlparse(file).path)
    # Explicit check instead of ``assert``: asserts are stripped under ``-O``.
    if not Path(file).is_file():
        raise FileNotFoundError("Not a file: {}".format(file))
    with open(file, "rb") as f:
        # Read in fixed-size chunks so large files are never fully loaded into
        # memory; iteration stops at the empty bytes object returned at EOF.
        for chunk in iter(lambda: f.read(BUF_SIZE), b""):
            md5.update(chunk)
    return md5
0 commit comments