13
13
"""Utilities to support workflow."""
14
14
from __future__ import absolute_import
15
15
16
+ from pathlib import Path
16
17
from typing import List , Sequence , Union
17
18
import hashlib
19
+ from _hashlib import HASH as Hash
18
20
from urllib .parse import unquote , urlparse
19
21
20
22
from sagemaker .workflow .entities import (
23
25
)
24
26
from sagemaker .workflow .step_collections import StepCollection
25
27
28
+ BUF_SIZE = 65536 # 64KiB
29
+
26
30
27
31
def list_to_request (entities : Sequence [Union [Entity , StepCollection ]]) -> List [RequestType ]:
28
32
"""Get the request structure for list of entities.
@@ -49,15 +53,77 @@ def hash_file(path: str) -> str:
49
53
Returns:
50
54
str: The MD5 hash of the file.
51
55
"""
52
- BUF_SIZE = 65536 # read in 64KiB chunks
56
+ return _hash_file (path , hashlib .md5 ()).hexdigest ()
57
+
58
+
59
def hash_files_or_dirs(paths: List[str]) -> str:
    """Get the MD5 hash of the contents of a list of files or directories.

    The paths are processed in sorted order, so the order in which they
    appear in ``paths`` does not influence the result.

    Hash is changed if:
       * input list is changed
       * new nested directories/files are added to any directory in the input list
       * nested directory/file names are changed for any of the inputted directories
       * content of files is edited

    Args:
        paths: List of file or directory paths.

    Returns:
        str: The MD5 hash of the list of files or directories.
    """
    md5 = hashlib.md5()
    # A single running digest is threaded through every path so the final
    # value reflects all inputs together.
    for path in sorted(paths):
        md5 = _hash_file_or_dir(path, md5)
    return md5.hexdigest()
77
+
78
+
79
def _hash_file_or_dir(path: str, md5: Hash) -> Hash:
    """Update the given hash with the path string and the contents behind it.

    A ``file://`` URL is first converted to a plain, unquoted filesystem
    path. The resulting path string is always fed into the hash; after that
    the directory or file contents are added. A path that is neither an
    existing directory nor an existing file contributes only its path string.

    Args:
        path: Path of a file or directory, optionally given as a ``file://`` URL.
        md5: Hash object to update.

    Returns:
        Hash: The same hash object, updated with the path and its contents.
    """
    if isinstance(path, str) and path.lower().startswith("file://"):
        path = unquote(urlparse(path).path)
    # The path itself is part of the digest, not only what it points to.
    md5.update(path.encode())
    target = Path(path)
    if target.is_dir():
        md5 = _hash_dir(path, md5)
    elif target.is_file():
        md5 = _hash_file(path, md5)
    return md5
94
+
95
+
96
def _hash_dir(directory: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with the entry names and contents of a directory.

    Entries are visited in sorted order. For each entry its name is added to
    the hash, then files contribute their contents and sub-directories are
    processed recursively.

    Args:
        directory: Path of the directory.
        md5: Hash object to update.

    Returns:
        Hash: The same hash object, updated with the directory's contents.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    root = Path(directory)
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not root.is_dir():
        raise ValueError(str(directory) + " is not a valid directory")
    for entry in sorted(root.iterdir()):
        md5.update(entry.name.encode())
        if entry.is_file():
            md5 = _hash_file(entry, md5)
        elif entry.is_dir():
            md5 = _hash_dir(entry, md5)
    return md5
111
+
112
+
113
def _hash_file(file: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with the contents of a file.

    A ``file://`` URL given as a string is first converted to a plain,
    unquoted filesystem path. The file is read in chunks of ``BUF_SIZE``
    bytes.

    Args:
        file: Path of the file, optionally given as a ``file://`` URL string.
        md5: Hash object to update.

    Returns:
        Hash: The same hash object, updated with the file's contents.

    Raises:
        ValueError: If ``file`` is not an existing file.
    """
    if isinstance(file, str) and file.lower().startswith("file://"):
        file = unquote(urlparse(file).path)
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not Path(file).is_file():
        raise ValueError(str(file) + " is not a valid file")
    with open(file, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(BUF_SIZE), b""):
            md5.update(chunk)
    return md5
0 commit comments