13
13
"""Utilities to support workflow."""
14
14
from __future__ import absolute_import
15
15
16
+ from pathlib import Path
16
17
from typing import List , Sequence , Union
17
18
import hashlib
19
+ from _hashlib import HASH as Hash
18
20
from urllib .parse import unquote , urlparse
19
21
20
22
from sagemaker .workflow .entities import (
23
25
)
24
26
from sagemaker .workflow .step_collections import StepCollection
25
27
28
+ BUF_SIZE = 65536 # 64KiB
29
+
26
30
27
31
def list_to_request (entities : Sequence [Union [Entity , StepCollection ]]) -> List [RequestType ]:
28
32
"""Get the request structure for list of entities.
@@ -49,15 +53,75 @@ def hash_file(path: str) -> str:
49
53
Returns:
50
54
str: The MD5 hash of the file.
51
55
"""
52
- BUF_SIZE = 65536 # read in 64KiB chunks
56
+ return _hash_file (path , hashlib .md5 ()).hexdigest ()
57
+
58
+
59
def hash_files_or_dirs(paths: List[str]) -> str:
    """Compute one MD5 digest covering a list of files and/or directories.

    The paths are processed in sorted order, and every path is folded into
    the same running hash. The resulting digest changes when:

    * the input list is changed
    * new nested directories/files are added to any directory in the list
    * nested directory/file names are changed for any listed directory
    * the content of any file is edited

    Args:
        paths: List of file or directory paths.

    Returns:
        str: The MD5 hex digest of the list of files or directories.
    """
    digest = hashlib.md5()
    ordered_paths = sorted(paths)
    for entry in ordered_paths:
        # The helper hands back the hash object to keep accumulating into.
        digest = _hash_file_or_dir(entry, digest)
    return digest.hexdigest()
75
+
76
+
77
def _hash_file_or_dir(path: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with a path and whatever is stored at that path.

    The path text itself is hashed first, then the directory tree or file
    contents found there. A path that is neither an existing directory nor an
    existing file contributes only its path text.

    Args:
        path: Path of a file or directory. A string starting with
            ``file://`` (case-insensitive) is converted to the URL-decoded
            path component before use.
        md5: Hash object to update.

    Returns:
        Hash: The hash object after it has been updated for ``path``.
    """
    if isinstance(path, str) and path.lower().startswith("file://"):
        path = unquote(urlparse(path).path)
    # str() leaves string inputs byte-for-byte unchanged while also accepting
    # pathlib.Path objects, which have no encode() method of their own.
    md5.update(str(path).encode())
    target = Path(path)
    if target.is_dir():
        md5 = _hash_dir(path, md5)
    elif target.is_file():
        md5 = _hash_file(path, md5)
    return md5
92
+
93
+
94
def _hash_dir(directory: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with the contents of a directory, recursively.

    Entries are visited in sorted order. For each entry its name is hashed,
    followed by the file contents (for files) or the nested tree (for
    directories). Entries that are neither contribute only their name.

    Args:
        directory: Path of the directory.
        md5: Hash object to update.

    Returns:
        Hash: The hash object after it has been updated for the directory.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    root = Path(directory)
    # Explicit check instead of ``assert`` so validation survives ``python -O``.
    if not root.is_dir():
        raise ValueError(str(directory) + " is not a valid directory")
    for entry in sorted(root.iterdir()):
        md5.update(entry.name.encode())
        if entry.is_file():
            md5 = _hash_file(entry, md5)
        elif entry.is_dir():
            md5 = _hash_dir(entry, md5)
    return md5
109
+
110
+
111
def _hash_file(file: Union[str, Path], md5: Hash) -> Hash:
    """Update the given hash with the contents of a file.

    Args:
        file: Path of the file. A string starting with ``file://``
            (case-insensitive) is converted to the URL-decoded path
            component before use.
        md5: Hash object to update.

    Returns:
        Hash: The hash object after it has been updated with the file bytes.

    Raises:
        ValueError: If ``file`` is not an existing regular file.
    """
    if isinstance(file, str) and file.lower().startswith("file://"):
        file = unquote(urlparse(file).path)
    # Explicit check instead of ``assert`` so validation survives ``python -O``.
    if not Path(file).is_file():
        raise ValueError(str(file) + " is not a valid file")
    with open(file, "rb") as f:
        # Read in BUF_SIZE chunks until read() returns b"" at end of file.
        for chunk in iter(lambda: f.read(BUF_SIZE), b""):
            md5.update(chunk)
    return md5
0 commit comments