
Commit 1e1218e

fix: remove json transformation
1 parent: ce9e75c

File tree: 5 files changed (+55, -45 lines)

aws_lambda_powertools/utilities/streaming/transformations/base.py

+2 -1

```diff
@@ -16,7 +16,8 @@ def __init__(self, *args, **kwargs):
     @abstractmethod
     def transform(self, input_stream: IO[bytes]) -> T:
         """
-        Transform the data from input_stream into something that implements IO[bytes].
+        Transforms the data from input_stream into an implementation of IO[bytes].
+
         This allows you to return your own object while still conforming to a protocol
         that allows transformations to be nested.
         """
```

aws_lambda_powertools/utilities/streaming/transformations/json.py

-39
This file was deleted.

docs/utilities/streaming.md

+11 -5

````diff
@@ -9,13 +9,13 @@ The streaming utility handles streaming data from AWS for processing data sets b
 
 * Simple interface to stream data from S3, even when the data is larger than memory
 * Read your S3 file using the patterns you already know to deal with files in Python
-* Includes common transformations to data stored in S3, like Gzip and Json deserialization
+* Includes common transformations to data stored in S3, like Gzip and CSV deserialization
 * Build your own data transformation and add it to the pipeline
 
 ## Background
 
 Processing S3 files inside your Lambda function presents challenges when the file is bigger than the allocated
-amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, JSON strings, etc).
+amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, CSV, zip files, etc).
 
 This utility makes it easy to process data coming from S3 files, while applying data transformations transparently
 to the data stream.
@@ -87,14 +87,20 @@ For instance, if you want to unzip an S3 file compressed using `LZMA` you could
 --8<-- "examples/streaming/src/s3_transform_lzma.py"
 ```
 
+Or, if you want to load a `TSV` file, you can just change the delimiter on the `CSV` transform:
+
+```python hl_lines="12"
+--8<-- "examples/streaming/src/s3_transform_tsv.py"
+```
+
 ### Building your own data transformation
 
 You can build your own custom data transformation by extending the `BaseTransform` class.
-The `transform` method receives an `io.RawIOBase` object, and you are responsible for returning an object that is also
-a `io.RawIOBase`.
+The `transform` method receives an `IO[bytes]` object, and you are responsible for returning an object that is also
+an `IO[bytes]`.
 
 ```python hl_lines="9 37 38"
---8<-- "aws_lambda_powertools/utilities/streaming/transformations/json.py"
+--8<-- "examples/streaming/src/s3_json_transform.py"
 ```
 
 ## Testing your code
````
examples/streaming/src/s3_json_transform.py

+29

```diff
@@ -0,0 +1,29 @@
+import io
+from typing import IO, Optional
+
+import ijson
+
+from aws_lambda_powertools.utilities.streaming.transformations import BaseTransform
+
+
+# Using io.RawIOBase gets us default implementations of many of the common IO methods
+class JsonDeserializer(io.RawIOBase):
+    def __init__(self, input_stream: IO[bytes]):
+        self.input = ijson.items(input_stream, "", multiple_values=True)
+
+    def read(self, size: int = -1) -> Optional[bytes]:
+        raise NotImplementedError(f"{__name__} does not implement read")
+
+    def readline(self, size: Optional[int] = None) -> bytes:
+        raise NotImplementedError(f"{__name__} does not implement readline")
+
+    def read_object(self) -> dict:
+        return self.input.__next__()
+
+    def __next__(self):
+        return self.read_object()
+
+
+class JsonTransform(BaseTransform):
+    def transform(self, input_stream: IO[bytes]) -> JsonDeserializer:
+        return JsonDeserializer(input_stream=input_stream)
```
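For orientation, here is a short usage sketch of the `JsonTransform` defined above, mirroring the TSV example in this commit; it assumes `S3Object.transform` accepts a custom transform instance, and the bucket/key values are placeholders:

```python
from aws_lambda_powertools.utilities.streaming.s3_object import S3Object

# JsonDeserializer yields whole deserialized JSON objects via __next__,
# so the transformed stream can be consumed with a plain for loop.
s3 = S3Object(bucket="example-bucket", key="data.json")  # placeholder bucket/key
data = s3.transform(JsonTransform())
for obj in data:  # each obj is one JSON document (multiple_values=True)
    print(obj)
```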
examples/streaming/src/s3_transform_tsv.py

+13

```diff
@@ -0,0 +1,13 @@
+from typing import Dict
+
+from aws_lambda_powertools.utilities.streaming.s3_object import S3Object
+from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform
+from aws_lambda_powertools.utilities.typing import LambdaContext
+
+
+def lambda_handler(event: Dict[str, str], context: LambdaContext):
+    s3 = S3Object(bucket=event["bucket"], key=event["key"])
+
+    tsv_stream = s3.transform(CsvTransform(delimiter="\t"))
+    for obj in tsv_stream:
+        print(obj)
```
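One note on the example above: since `delimiter` reaches the underlying CSV reader, other reader options can plausibly be tuned the same way, assuming `CsvTransform` forwards its keyword arguments to `csv.DictReader` (an assumption here, not confirmed by this commit):

```python
# Hypothetical variation, assuming CsvTransform passes keyword arguments
# through to csv.DictReader: name the columns of a headerless TSV file.
tsv_stream = s3.transform(CsvTransform(delimiter="\t", fieldnames=["id", "name", "value"]))
```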
