From ea7e9a95838927a64610700ae9f2ee434e3ca140 Mon Sep 17 00:00:00 2001 From: jeniyat Date: Wed, 23 Feb 2022 13:25:30 -0700 Subject: [PATCH 1/7] feature: Data Serializer capabile of supporting image audio files --- src/sagemaker/serializers.py | 34 ++++++++++++++++++++++- tests/data/cuteCat.raw | Bin 0 -> 6581 bytes tests/unit/sagemaker/test_serializers.py | 24 ++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tests/data/cuteCat.raw diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 727912a33c..5d8392ad4a 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -18,7 +18,7 @@ import csv import io import json - +import os import numpy as np from six import with_metaclass @@ -357,3 +357,35 @@ def serialize(self, data): return data.read() raise ValueError("Unable to handle input format: %s" % type(data)) + + +class DataSerializer(SimpleBaseSerializer): + """Serialize data in any file by extracting raw bytes from the file.""" + + def __init__(self, content_type="file-path/raw-bytes"): + """Initialize a ``DataSerializer`` instance. + + Args: + content_type (str): The MIME type to signal to the inference endpoint when sending + request data (default: "file-path/raw-bytes"). + """ + super(DataSerializer, self).__init__(content_type=content_type) + + def serialize(self, data): + """Serialize file of various formats to a raw bytes. + + Args: + data (object): Data to be serialized. The data can be a string, + representing file-path or the raw bytes from a file. + Returns: + raw-bytes: The data serialized as a raw-bytes from the input. + """ + if isinstance(data, str): + if not os.path.exists(data): + raise ValueError(f"{data} is not a valid file path.") + image = open(data, "rb") + return image.read() + if isinstance(data, bytes): + return data + + raise ValueError(f"Object of type {type(data)} is not Data serializable.") diff --git a/tests/data/cuteCat.raw b/tests/data/cuteCat.raw new file mode 100644 index 0000000000000000000000000000000000000000..6e89b9d78f59ed0c751e8600b849cb3d869d6503 GIT binary patch literal 6581 zcmZ`+cQjnxyFO#|G6qpc?`^6d*E-LB*LrT}ZdU=QmYSv-00IF3=3|I zWDpn>$^d6#<%0iz5cvNBw>wbI z|2dEXPykK=s=U`vTJglmU9uCnF=pXyDy3~c4bc<4IlwYp%&qCv{%Isp&cn~scxi+N zh-^G+X&qypjUDU)^XjY@Zhwlcme$a=RJ~sTBc#Zq#!{Z`tbrVr7!c;EDloKC;Yh0s zPKf`K>!$4@i3m}LzO0g#|E23pGDS{2ltoU0Q`;L|D~{8!nuDd+cx}j5L9E8xn_lgd zLRZ`T#tlN6UYh~7pvQrAFJIzm!$a$Ig}a8YDA7Zq$;hewE@!RWKKspn{vC}rpsC$Z zNYQ)i3zWExeN%V{Eu?HMCyi;WWD}vzWYvG>Vp0sB;7uW2CM*AEExGrJ)9jX>dbQxT z;Z{jsb{f%R6XJTMCyCs4xHE;9n9cJr19h05df!3WpF;jA>`zMh60<5qe#kALO6ER( zMrQuCuT#34oTB-Z@PO*xLNbK8EbfPqRKm-M^b_GzB`-f+p7cE)k#4ZopqkPr2_0D;!%i9M0gIkqnAhLQS&8erjq^xi*fF=j zm+|gHm@BOEnW%)A-bSwWOSgP0AwbjMMa2qvm4cX1Vv)spwk*d>x9?%`%hDXbAF`6E zB%TVNX?mg!%w>bBJHpj+MbBH=)aM5YexaF3J%@{OK0@o`U`!}w=IAPWQjeNcMh)hb zgQ9U)O=3-0D3f_7{E{jAAcyX8q8ThlB8eepO-TV7aWOZ`J8)*eJze z$5tU0ffoP9zFg;yi}4D8>*sv%g7`$>0t`evI|nJx&BG4)+peIWbUx;TK6ZZYM%SRh z(Vl^i-)xPNPN-<>jFz-akhfQ%RgU*q1P8H&r=~IeJsZyy3mGY%1p2A9y0|rx;J?Zc z0>7`Ra;(Xjs{X(!Y6Ru<&Ogid0l7Gt1}TBcJYqM46;HT%ru8$Hi^%X+iww&uPD5=L z$KVn)uUu`y*S7VCYko_JR2_mQZ9??f`fnaLyf%vq>4sjWDTwE;L?OU){!jw;xGfzr z_tuqVVE8rv1MIs(xI$Ron8W_WOtTjdoayOMI+c*Ftbs8}AIcC?^U z34<_Yl_eTAK=Y2MEW7MfW))Y1C!=!k230M}jfwS4ovD+ZM5^4da~uwdgG<=?pt|fb zBjWbn>c{+xjLih!_Zc`pwiK2sh@%IO@(MUmVByP{;(czml6q%J?!g>Ls{gX3-k5%~ zDp~3rT1z8WVruF!iny z)YuAMXS@aGpYGKL?ZGV9yeB4aY*~ZM_(v-zy3SaV`V+FezNW@E9MexsmXn-vrb)0V zhkqo#=V~nU@X1V0JGbWjZ;N}B4sCuWw#!Ib_Sz4qaR5gaL3-ufdVsBXw0sR)xn z#kJss5_7+GsYl{MgJy&eUk9zB@-BV24OwyU_p#alL4-yD_x;-Rqu8ihz-WJ{Cd^~^ z-0bnpFcW)VNOYESLV$z|5(38+19NRs-XC*QXqOvFwYm=wL7W*B;n%G6k)WlU&T;V7 zfHQ-m-GKDs=ws6IRtcgAZw8HYS-aVUu19TdVqiwdz^ePFS3foNsm)P>H85_?rJggu z@@0JvV_hZR8rC@zpoD+j#?pBgu{7E9sbDHhN{yrY1>E{ITF#(kfFrCY#WRP^gBRJq!r z4Eg9#zO?sHmEObLUvgSybQ9mx@V;a9f_9y<;jAv;nUI+iJ8qL_PX!MUZ=FnLgzkGNO2y}U&*@)r zKc9P}TW^QkN>7D+M|ai4mQ+=av;4hsRyl@mQejx~o1CP|TeYUWJr8Tmvqc2dwoYy> z)Ai{eyc}2I{aS3;kgcX|BgW98df%2w>f4~l^mfQE;auqi8%>ts8{m>Wk7Z#{Dj~IU z25@#JBf}w+#Nqq|13@vLQeeu{gam%JtZzP3t{W99@YQDMX(>jfvVfhA_8RkIdjl--1vRtLKR)i z78~tizg!+#9ZTB1qsPIS-kMgKxfe~GBd6BOejyA#ACq)&pcZ# zD!b7n>%xjJBj+(Bdm>*T?LP(PzqKXq4c>-pW4XDkI5lSZ7`6a?JUdz^IjTNs{Nhan zvEi6>q~8o$sJf~3QL_|5d1bOZ)9z%ASxK&At;3Aj7^$Br3aZF0E5Z083sPvYVshU% zZTUdT%4#HNUu8m4qU{y24#c`7BAcr{Lg)o^@Xeam3U&FywtgZ;bw*@94L6?(i7m1A z*IPd~?wPVVyl)s|AoeXVu0!9VZp3r`XxnLOO0$E}EktEU&cO2R10g^{2^RjGpo%jm zxX*L_9|ghTL~XvY5=HuHg9e|kW_YDErd&adQjOzRee3w-Fv~(i(^6aE>E`^_BOval zYz{)E=iK4v)4BUJDL9U(Qfs&egRR^^%-l?O92eSgj9hc*dUxNh^H~ZGXN| zL?Ox|7Q_CX!3xt$G>xRw9ajRX^vp@n->`h;Xh}Z)^L7v zjAL@%6qu+j=Djgue@ef57uHh?{A5!}6^;x6?v0E_HF` z1<{j2!@)Rdse8B??FcTxVkUtK2~#c5L_5O4_imD*wMj6{hj`?@!;u;3;$_x9rNtM+ zt1G5WZ1Pwr$jqAvP0$3O%{8zh%pC`Tj0H0OBo$06!7v(((J@@6&Zwxq>sbX*V1tVBP*+3 z;8+_eHMS9;U};HPhvVMgnW1XSM%%Eewq4o69Hu#!GP1a0ym)e2Nb>g#lvwHRv_)*+ z1jHvA?d2L+KkLb)h2>Gf?laRSLIYAcWo}qn3i#zVwtliu9_Yfve63hMu4IG_{Ry_p z`4u>)r5Nmqb4@nO2P{@-<&dg=smc za!i{OYtfd-CrGqRs-ji{pM5^-D3fllF1mUvVD(qjNWjOJsws0Ru|wALjB}~r1d$?k z(@OE3)f%%WP1I>yqTG0b)e(AG|Iv|eF^fQjh~Xe;DAVDtrr2cra)Ep+SE<6aaDpE7 zY~!kvVji$dNobmz!NaQr)T0?5!E)B7FB`%$3I#IeS=EirYht zo|oxc_{dv5#9Sews?Vdq8XIq)NEce?rI=!oXDG3NhuY@Vxshw^FC|m!E_j<34lD1( zqDMmt^-VpM`t6*OpQ1!pYIez>l!2@9;4l6bfPn+Bw+O9_2FB0)A$mVx_>0r8}Zhr49_&xXGkAVpf_dnC5FPsFRini~l z^?9{%Yr8uLIs-W7dO=AeiVp#b-s2sG-aw|7e3;>Yi8yI%>oK$15o(>Whmi7_!Xx<$ zw#3|E+Zk0yCP+);nEx^sL+|?ee4WTt0AI9Eb$L9j_ei_ih#)g52l=+#tl0P4Q;pC? zDe`r&V<_9#tJ0~bgQW2@eIf65dFaXE^)yIO83Uq?o)0^$f6AHcIR5NUf#@H>3iOi-TCM|Z<(RVqc|%G`>)EyP{u&QyF@3jJ&5j74C`PFVUdv(4!?{%3c&triXx#bzE&mnZv2R~z0j z0s47Hlgi%NH#3G+@gKB^P<-3~bCYcoWJiu}o1xmaBKI>$Pn)@IXkK8NwwJ)jo~XX< z+O{O&DsOgneUYS>T^DaPSiinLK4%g8J#=E3?e$Ze$R{RcpN5HW zQyr(~Ssbo<&xQ!zLVy$m*G+wQI!v-3Th;;SeLqvGeiqItv*!PYoB+=)gd^jCV)G#> zLD<4-n86``3tQR_EcnA_f#MQ&g(}qjX7S?QKh@ljfn{}$oy*fF15r(_UH2oqs7NnJ zKA*J5q%ylsHq|WT5$hz!Rr9UhRUkYL!^j*2NSl{na7QP5B0RW3H~TYrL60_(VYAlm zgp7~zYk8*8Id*e3jmydpZYS?YTVB(K8=Zb-H2Or6DU-Pdu)^SolP2?LMX$HSmUY_E zF}>?FK^*&JYMELU>x1&4Rv&|dUsf|;Hh)v%A7DuH**w1Dm-XPo3V=o|Y8d6|I74U_ zTQnrrqNPApsVe5#XF^G@H`ott7}qE1|J2SSS5cLO#?oi$2srwkpDjjhus3_RVS<-(9h39?Dyg$Kculw#*D!M5x zr>#8^tATEO{oSr5p7M0tK#;Po$TN9z^dWWm&el?R!6VD&*N~o{J9!EM}aDetn0DWnviiVr}mKM`qZSApp(nK>uk*kh_dqDjN2X zy-d~QfL5P$`cK6IMGXs#1`kPSqA!M7N=szMYaxy|)8wo_I+#0EeOI(50$nU?RPewfxKab;ng z9cfv<(D(fcQf8e(@;E=>qN`A<10_Qxut~LSg~d9jz%2k}cHS0w$u+oQE2R_D+=CaM z?)^;fS}Px>NOI={*QTMp#`xDB!5elVX?uRQ?kdt9-@XTeOBe0)OSNAnjXFm7J!0%p zRUQB+`V4Dh2j2jFpG6$Q^2TSYCj%u#0O59)xCs@?0CKX{!Y(#tA@dMb-&bl;Z;eL)XN2KvkRt2;*tGt{*ha(NTu4E^3nS^ z>HP4WsBPQb-+>iS30>KXCUq!(uz8yV>Kt5lU@9nG(4~Hb*(Ayj4+JW|oY155~DzDeY2vy*hx2I9LSe8t+f!1`M-q{awFH>>-qtsVV!_>XHv>U_L6vi!zj(7+* zGqC3UUruoUP#qh~+NO(sibOs5eG5!c4~9WcpZW)U3Y9j(@*-RbeL#hY)C)Sflg@rJ zD6h?yd34_{YWo&g<6ava@ZJy@{4sfOpp^Z7FVtpXYKZZJhroS=cZ$FIy7%7Zf%Jpj z7J5~Upj^ciXsXE4rMLNT=Cyh~{cnPz?YAttL-Q`Jg!$Hk*A4<2UP8_fW7pkO=2(nr zWS(DS4c{m4y9Jt_RSg*XN$fb|`16FHoS&QTS-Gp$yAQv-ShPn_9g+8jm9qwfEdRFn zBY;K=c+XCjrD!H-eI4r?m`#`1c!w)H>*Pk)^*;1kD-(9~lKlGd(u)B(g%VB__GJoN zI27J8p?9r6R5Nlj_-h(DE9j4Lmgr_RupxLjw?Mqb67BE1y!*ot)LsQu*;9H*-KytP zpt3kuu+5^<3u!#^f8sc0U3_l=!Dbq>R;GS)lpoa%!42rAw_gb6hV73yHI*cJyc6e} z{zyXS#u>aX^6>q3ll45Ml01sh8g;#RqLU)( zJ-?dy>m7<_^lC}HoO?skan<;Zb?A?0&+~`kyMfBSDU?Rzrm?@*3Exs0O|AS%d5M+j jKTXe>;IWY(cKoW%vUPU+DrQPa()!zHd1qM_Zs-3CsRR03 literal 0 HcmV?d00001 diff --git a/tests/unit/sagemaker/test_serializers.py b/tests/unit/sagemaker/test_serializers.py index d2e4b7ce46..6b70c600ca 100644 --- a/tests/unit/sagemaker/test_serializers.py +++ b/tests/unit/sagemaker/test_serializers.py @@ -28,6 +28,7 @@ SparseMatrixSerializer, JSONLinesSerializer, LibSVMSerializer, + DataSerializer, ) from tests.unit import DATA_DIR @@ -331,3 +332,26 @@ def test_libsvm_serializer_file_like(libsvm_serializer): libsvm_file.seek(0) result = libsvm_serializer.serialize(libsvm_file) assert result == validation_data + + +@pytest.fixture +def data_serializer(): + return DataSerializer() + + +def test_data_serializer_raw(data_serializer): + input_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.jpg") + with open(input_image_file_path, "rb") as image: + input_image = image.read() + input_image_data = data_serializer.serialize(input_image) + validation_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.raw") + validation_image_data = open(validation_image_file_path, "rb").read() + assert input_image_data == validation_image_data + + +def test_data_serializer_file_like(data_serializer): + input_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.jpg") + validation_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.raw") + input_image_data = data_serializer.serialize(input_image_file_path) + validation_image_data = open(validation_image_file_path, "rb").read() + assert input_image_data == validation_image_data From ecb480058303344f6489dd257ca663cad7b9c093 Mon Sep 17 00:00:00 2001 From: jeniyat Date: Wed, 23 Feb 2022 19:32:11 -0700 Subject: [PATCH 2/7] added exception handling --- src/sagemaker/serializers.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 5d8392ad4a..59b8134876 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -18,7 +18,6 @@ import csv import io import json -import os import numpy as np from six import with_metaclass @@ -381,10 +380,13 @@ def serialize(self, data): raw-bytes: The data serialized as a raw-bytes from the input. """ if isinstance(data, str): - if not os.path.exists(data): + try: + dataFile = open(data, "rb") + except Exception: raise ValueError(f"{data} is not a valid file path.") - image = open(data, "rb") - return image.read() + dataFileInfo = dataFile.read() + dataFile.close() + return dataFileInfo if isinstance(data, bytes): return data From 1cb9de619bd9b90980b28f14db013efd8307d977 Mon Sep 17 00:00:00 2001 From: jeniyat Date: Wed, 23 Feb 2022 19:36:47 -0700 Subject: [PATCH 3/7] updated doc string --- src/sagemaker/serializers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 59b8134876..07edaf6df3 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -371,7 +371,7 @@ def __init__(self, content_type="file-path/raw-bytes"): super(DataSerializer, self).__init__(content_type=content_type) def serialize(self, data): - """Serialize file of various formats to a raw bytes. + """Serialize file data to a raw bytes. Args: data (object): Data to be serialized. The data can be a string, @@ -383,7 +383,7 @@ def serialize(self, data): try: dataFile = open(data, "rb") except Exception: - raise ValueError(f"{data} is not a valid file path.") + raise ValueError(f"{data} is not a valid file-path.") dataFileInfo = dataFile.read() dataFile.close() return dataFileInfo From e2936b20a0ffee71b4effca9ae6267ca05144888 Mon Sep 17 00:00:00 2001 From: jeniyat Date: Thu, 24 Feb 2022 12:05:03 -0700 Subject: [PATCH 4/7] update exception handling --- src/sagemaker/serializers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 07edaf6df3..9a8da7ca89 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -382,10 +382,10 @@ def serialize(self, data): if isinstance(data, str): try: dataFile = open(data, "rb") - except Exception: - raise ValueError(f"{data} is not a valid file-path.") - dataFileInfo = dataFile.read() - dataFile.close() + dataFileInfo = dataFile.read() + dataFile.close() + except Exception as e: + raise ValueError(f"Could not open/read file: {data}. {e.message}") return dataFileInfo if isinstance(data, bytes): return data From 7e4893c3fe694b11bbed28d919f18c596e9339a5 Mon Sep 17 00:00:00 2001 From: jeniyat Date: Thu, 24 Feb 2022 12:14:16 -0700 Subject: [PATCH 5/7] update exception handling --- src/sagemaker/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 9a8da7ca89..3285ad1092 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -385,7 +385,7 @@ def serialize(self, data): dataFileInfo = dataFile.read() dataFile.close() except Exception as e: - raise ValueError(f"Could not open/read file: {data}. {e.message}") + raise ValueError(f"Could not open/read file: {data}. {e}") return dataFileInfo if isinstance(data, bytes): return data From 8368584e2e4a2d1995d60f3bfe9dde7ec6194920 Mon Sep 17 00:00:00 2001 From: jeniyat Date: Thu, 24 Feb 2022 12:24:16 -0700 Subject: [PATCH 6/7] updated doc string --- src/sagemaker/serializers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 3285ad1092..f1e1b1eb30 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -374,10 +374,10 @@ def serialize(self, data): """Serialize file data to a raw bytes. Args: - data (object): Data to be serialized. The data can be a string, + data (object): Data to be serialized. The data can be a string representing file-path or the raw bytes from a file. Returns: - raw-bytes: The data serialized as a raw-bytes from the input. + raw-bytes: The data serialized as raw-bytes from the input. """ if isinstance(data, str): try: From bde9af2a14ad53e9dfb37268f477b254085ca84e Mon Sep 17 00:00:00 2001 From: jeniyat Date: Thu, 24 Feb 2022 20:42:30 -0700 Subject: [PATCH 7/7] udpated try/except --- src/sagemaker/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index f1e1b1eb30..a5cc51239c 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -384,9 +384,9 @@ def serialize(self, data): dataFile = open(data, "rb") dataFileInfo = dataFile.read() dataFile.close() + return dataFileInfo except Exception as e: raise ValueError(f"Could not open/read file: {data}. {e}") - return dataFileInfo if isinstance(data, bytes): return data