From a8d84674f9526e0956315cb46a892d2195a0a31c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Tue, 15 Nov 2022 15:22:39 +0100 Subject: [PATCH 01/43] feat: add s3 streaming utility --- .../utilities/streaming/__init__.py | 0 .../utilities/streaming/s3.py | 241 ++++++++++++++++++ .../streaming/transformations/__init__.py | 6 + .../streaming/transformations/base.py | 14 + .../streaming/transformations/gzip.py | 9 + .../streaming/transformations/json.py | 38 +++ .../streaming/transformations/zip.py | 11 + docs/utilities/streaming.md | 102 ++++++++ examples/streaming/src/s3_basic_stream.py | 10 + .../src/s3_basic_stream_with_version.py | 10 + examples/streaming/src/s3_transform.py | 15 ++ examples/streaming/src/s3_transform_common.py | 10 + .../streaming/src/s3_transform_in_place.py | 15 ++ examples/streaming/src/s3_transform_lzma.py | 15 ++ mkdocs.yml | 1 + 15 files changed, 497 insertions(+) create mode 100644 aws_lambda_powertools/utilities/streaming/__init__.py create mode 100644 aws_lambda_powertools/utilities/streaming/s3.py create mode 100644 aws_lambda_powertools/utilities/streaming/transformations/__init__.py create mode 100644 aws_lambda_powertools/utilities/streaming/transformations/base.py create mode 100644 aws_lambda_powertools/utilities/streaming/transformations/gzip.py create mode 100644 aws_lambda_powertools/utilities/streaming/transformations/json.py create mode 100644 aws_lambda_powertools/utilities/streaming/transformations/zip.py create mode 100644 docs/utilities/streaming.md create mode 100644 examples/streaming/src/s3_basic_stream.py create mode 100644 examples/streaming/src/s3_basic_stream_with_version.py create mode 100644 examples/streaming/src/s3_transform.py create mode 100644 examples/streaming/src/s3_transform_common.py create mode 100644 examples/streaming/src/s3_transform_in_place.py create mode 100644 examples/streaming/src/s3_transform_lzma.py diff --git a/aws_lambda_powertools/utilities/streaming/__init__.py b/aws_lambda_powertools/utilities/streaming/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/aws_lambda_powertools/utilities/streaming/s3.py b/aws_lambda_powertools/utilities/streaming/s3.py new file mode 100644 index 00000000000..4d1d9d5eaeb --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/s3.py @@ -0,0 +1,241 @@ +import io +import logging +import typing +from typing import TYPE_CHECKING, List, Literal, Optional, Sequence, Union, overload + +import boto3 +from botocore.response import StreamingBody + +from aws_lambda_powertools.utilities.streaming.transformations.base import ( + BaseTransform, + T, +) +from aws_lambda_powertools.utilities.streaming.transformations.gzip import GzipTransform +from aws_lambda_powertools.utilities.streaming.transformations.json import JsonTransform + +if TYPE_CHECKING: + from mypy_boto3_s3.service_resource import Object, S3ServiceResource + +logger = logging.getLogger(__name__) + + +class _S3Proxy(io.RawIOBase): + def __init__( + self, bucket: str, key: str, version_id: Optional[str] = None, boto3_s3_resource=Optional["S3ServiceResource"] + ): + self.bucket = bucket + self.key = key + self.version_id = version_id + + self._position = 0 + self._size: Optional[int] = None + + self._s3_object: Optional["Object"] = None + self._s3_resource: Optional["S3ServiceResource"] = boto3_s3_resource + self._raw_stream: Optional[StreamingBody] = None + + @property + def s3_resource(self) -> "S3ServiceResource": + if self._s3_resource is None: + self._s3_resource = 
boto3.resource("s3") + return self._s3_resource + + @property + def s3_object(self) -> "Object": + if self._s3_object is None: + if self.version_id is not None: + self._s3_object = self.s3_resource.ObjectVersion( + bucket_name=self.bucket, object_key=self.key, id=self.version_id + ).Object() + else: + self._s3_object = self.s3_resource.Object(bucket_name=self.bucket, key=self.key) + + return self._s3_object + + @property + def size(self) -> int: + if self._size is None: + self._size = self.s3_object.content_length + return self._size + + @property + def raw_stream(self) -> StreamingBody: + if self._raw_stream is None: + range_header = "bytes=%d-" % self._position + logging.debug(f"Starting new stream at {range_header}...") + self._raw_stream = self.s3_object.get(Range=range_header)["Body"] + + return self._raw_stream + + def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: + current_position = self._position + + if whence == io.SEEK_SET: + self._position = offset + elif whence == io.SEEK_CUR: + self._position += offset + elif whence == io.SEEK_END: + self._position = self.size + offset + else: + raise ValueError(f"invalid whence ({whence}, should be {io.SEEK_SET}, {io.SEEK_CUR}, {io.SEEK_END})") + + # If we changed the position in the stream, we should invalidate the existing stream + # and open a new one on the next read + if current_position != self._position and self._raw_stream is not None: + self._raw_stream.close() + self._raw_stream = None + + return self._position + + def seekable(self) -> bool: + return True + + def readable(self) -> bool: + return True + + def writable(self) -> bool: + return False + + def read(self, size: Optional[int] = -1) -> Optional[bytes]: + size = None if size == -1 else size + data = self.raw_stream.read(size) + if data is not None: + self._position += len(data) + return data + + def readline(self, size: Optional[int] = None) -> bytes: + data = self.raw_stream.readline(size) + self._position += len(data) + return data + + @property + def closed(self) -> bool: + return self.raw_stream.closed + + def __next__(self): + return self.raw_stream.__next__() + + def __iter__(self): + return self.raw_stream.__iter__() + + +class S3Object(io.RawIOBase): + def __init__( + self, + bucket: str, + key: str, + version_id: Optional[str] = None, + boto3_s3_resource: Optional["S3ServiceResource"] = None, + gunzip: Optional[bool] = False, + json: Optional[bool] = False, + ): + self.bucket = bucket + self.key = key + self.version_id = version_id + self.raw_stream = _S3Proxy(bucket=bucket, key=key, version_id=version_id, boto3_s3_resource=boto3_s3_resource) + + self._transformed_stream: Optional[io.RawIOBase] = None + self._data_transformations: List[BaseTransform] = [] + if gunzip: + self._data_transformations.append(GzipTransform()) + if json: + self._data_transformations.append(JsonTransform()) + + @property + def size(self) -> int: + return self.raw_stream.size + + @property + def transformed_stream(self) -> io.RawIOBase: + if self._transformed_stream is None: + # Apply all the transformations + transformed_stream = self.raw_stream + for transformation in self._data_transformations: + transformed_stream = transformation.transform(transformed_stream) + + self._transformed_stream = transformed_stream + + return self._transformed_stream + + def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: + return self.raw_stream.seek(offset, whence) + + def seekable(self) -> bool: + return self.raw_stream.seekable() + + def readable(self) -> bool: + return 
self.raw_stream.readable() + + def writable(self) -> bool: + return self.raw_stream.writable() + + def tell(self) -> int: + return self.raw_stream.tell() + + @property + def closed(self) -> bool: + return self.raw_stream.closed + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def close(self): + self.raw_stream.close() + + # Also close transformed stream if there are any transformations + if self.raw_stream != self._transformed_stream and self._transformed_stream is not None: + self._transformed_stream.close() + + def read(self, size: int = -1) -> Optional[bytes]: + return self.transformed_stream.read(size) + + def readline(self, size: Optional[int] = None) -> bytes: + return self.transformed_stream.readline(size) + + def __next__(self): + return self.transformed_stream.__next__() + + def __iter__(self): + return self.transformed_stream.__iter__() + + @overload + def transform( + self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[True] + ) -> T: + pass + + @overload + def transform( + self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[False] + ) -> None: + pass + + @overload + def transform(self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]]) -> T: + pass + + def transform( + self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Optional[bool] = False + ) -> Optional[T]: + if self.tell() != 0: + raise ValueError(f"Cannot add transformations to a read object. Already read {self.tell()} bytes") + + # Make transformations always be a sequence to make mypy happy + if not isinstance(transformations, Sequence): + transformations = [transformations] + + if in_place: + self._data_transformations.extend(transformations) + + # Invalidate transformed stream + self._transformed_stream = None + return None + else: + # Tell MyPy that raw_stream actually implements T (bound to io.RawIOBase) + stream = typing.cast(T, self.raw_stream) + for transformation in transformations: + stream = transformation.transform(stream) + return stream diff --git a/aws_lambda_powertools/utilities/streaming/transformations/__init__.py b/aws_lambda_powertools/utilities/streaming/transformations/__init__.py new file mode 100644 index 00000000000..8c82731a2ab --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/transformations/__init__.py @@ -0,0 +1,6 @@ +from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform +from aws_lambda_powertools.utilities.streaming.transformations.gzip import GzipTransform +from aws_lambda_powertools.utilities.streaming.transformations.json import JsonTransform +from aws_lambda_powertools.utilities.streaming.transformations.zip import ZipTransform + +__all__ = ["BaseTransform", "GzipTransform", "JsonTransform", "ZipTransform"] diff --git a/aws_lambda_powertools/utilities/streaming/transformations/base.py b/aws_lambda_powertools/utilities/streaming/transformations/base.py new file mode 100644 index 00000000000..fb098bb3bf6 --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/transformations/base.py @@ -0,0 +1,14 @@ +import io +from abc import abstractmethod +from typing import Generic, TypeVar + +T = TypeVar("T", bound=io.RawIOBase) + + +class BaseTransform(Generic[T]): + def __init__(self, **kwargs): + self.kwargs = kwargs + + @abstractmethod + def transform(self, input_stream: io.RawIOBase) -> T: + pass diff --git 
a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py new file mode 100644 index 00000000000..66b1d67d31d --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py @@ -0,0 +1,9 @@ +import io +from gzip import GzipFile + +from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform + + +class GzipTransform(BaseTransform): + def transform(self, input_stream: io.RawIOBase) -> GzipFile: + return GzipFile(fileobj=input_stream, mode="rb", **self.kwargs) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/json.py b/aws_lambda_powertools/utilities/streaming/transformations/json.py new file mode 100644 index 00000000000..775bd6036fd --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/transformations/json.py @@ -0,0 +1,38 @@ +import io +import json +from json import JSONDecodeError +from typing import Optional + +from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform + + +class JsonDeserializer(io.RawIOBase): + def __init__(self, input_stream: io.RawIOBase): + self.input = input_stream + + def read(self, size: int = -1) -> Optional[bytes]: + return self.input.read(size) + + def readline(self, size: Optional[int] = None) -> bytes: + return self.input.readline(size) + + def read_object(self) -> dict: + obj: dict = {} + + while not self.input.closed: + line = self.input.__next__() + try: + obj = json.loads(line) + except JSONDecodeError: + continue + break + + return obj + + def __next__(self): + return self.read_object() + + +class JsonTransform(BaseTransform): + def transform(self, input_stream: io.RawIOBase) -> JsonDeserializer: + return JsonDeserializer(input_stream=input_stream) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/zip.py b/aws_lambda_powertools/utilities/streaming/transformations/zip.py new file mode 100644 index 00000000000..15de2d301e5 --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/transformations/zip.py @@ -0,0 +1,11 @@ +import io +import typing +from zipfile import ZipFile + +from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform + + +class ZipTransform(BaseTransform): + def transform(self, input_stream: io.RawIOBase) -> ZipFile: + input_as_io = typing.cast(typing.IO[bytes], input_stream) + return ZipFile(input_as_io, mode="r", **self.kwargs) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md new file mode 100644 index 00000000000..8f2aaab6c4b --- /dev/null +++ b/docs/utilities/streaming.md @@ -0,0 +1,102 @@ +--- +title: Streaming +description: Utility +--- + +The streaming utility handles streaming data from AWS for processing data sets bigger than the available memory. + +## Key Features + +* Simple interface to stream data from S3, even when the data is larger than memory +* Read your S3 file using the patterns you already know to deal with files in Python +* Includes common transformations to data stored in S3, like Gzip and Json deserialization +* Build your own data transformation and add it to the pipeline + +## Background + +Processing S3 files inside your Lambda function presents challenges when the file is bigger than the allocated +amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, JSON strings, etc). + +This utility makes it easy to process data coming from S3 files, while applying data transformations transparently +to the data stream. 
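
To make the memory point concrete, here is a minimal, illustrative sketch (the bucket and key are placeholder values, and the import path follows the examples added in this change): a plain `get_object(...).read()` call buffers the whole object before you can touch it, whereas `S3Object` only fetches bytes as they are consumed.

```python
import boto3

from aws_lambda_powertools.utilities.streaming.s3 import S3Object

# Buffers the entire object in memory before any processing can start
data = boto3.client("s3").get_object(Bucket="example-bucket", Key="large-file.txt")["Body"].read()

# Streams the same object instead, fetching bytes only as each line is consumed
s3 = S3Object(bucket="example-bucket", key="large-file.txt")
for line in s3:
    print(line)
```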
+ +## Getting started + +To stream an S3 file, you need the bucket name, the key and optionally a version ID. + +=== "Non-versioned bucket" + + ```python hl_lines="8 9" + --8<-- "examples/streaming/src/s3_basic_stream.py" + ``` + +=== "Versioned bucket" + + ```python hl_lines="8 9" + --8<-- "examples/streaming/src/s3_basic_stream_with_version.py" + ``` + +The code above will stream the contents from S3 as fast as possible, using minimal memory. + +### Data transformations + +The utility has some built-in data transformations to help deal with common scenarios while streaming data from S3. + +| Name | Description | +|---------------------------------|---------------------------------------------------------------------------------------------| +| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | +| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | +| **JSON** | Parses each line as a JSON object, returning matched objects | + +Common options like gunzipping a stream and parsing data as JSON can be enabled directly on the constructor: + +```python hl_lines="8" +--8<-- "examples/streaming/src/s3_transform_common.py" +``` + +Additionally, you can transform the data in place, or return a new object that encapsulates the transformation. +Multiple transformations are applied in order. + +=== "Returning a new object" + + ```python hl_lines="13" + --8<-- "examples/streaming/src/s3_transform.py" + ``` + +=== "Transform in-place" + + ```python hl_lines="13" + --8<-- "examples/streaming/src/s3_transform_in_place.py" + ``` + +## Advanced + +### Custom options for data transformations + +Each data transformation class accepts additional options to customize the transformation. + +| Name | Description | +|----------|----------------------------------------------------------------------------------------------------------------| +| **Gzip** | All the options from the [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | +| **Zip** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | +| **JSON** | No additional options are supported at the moment | + +For instance, if you want to unzip an S3 file compressed using `LZMA` you could pass that option in the constructor: + +```python hl_lines="12" +--8<-- "examples/streaming/src/s3_transform_lzma.py" +``` + +### Building your own data transformation + +You can build your own custom data transformation by extending the `BaseTransform` class. +The `transform` method receives an `io.RawIOBase` object, and you are responsible for returning an object that is also +a `io.RawIOBase`. 
+ +```python hl_lines="9 37 38" +--8<-- "aws_lambda_powertools/utilities/streaming/transformations/json.py" +``` + +## Testing your code + +TODO diff --git a/examples/streaming/src/s3_basic_stream.py b/examples/streaming/src/s3_basic_stream.py new file mode 100644 index 00000000000..5d1914479e0 --- /dev/null +++ b/examples/streaming/src/s3_basic_stream.py @@ -0,0 +1,10 @@ +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"]) + for line in s3: + print(line) diff --git a/examples/streaming/src/s3_basic_stream_with_version.py b/examples/streaming/src/s3_basic_stream_with_version.py new file mode 100644 index 00000000000..0be64ca8e6b --- /dev/null +++ b/examples/streaming/src/s3_basic_stream_with_version.py @@ -0,0 +1,10 @@ +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"], version_id=event["version_id"]) + for line in s3: + print(line) diff --git a/examples/streaming/src/s3_transform.py b/examples/streaming/src/s3_transform.py new file mode 100644 index 00000000000..d45e16dfd4d --- /dev/null +++ b/examples/streaming/src/s3_transform.py @@ -0,0 +1,15 @@ +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import ( + GzipTransform, + JsonTransform, +) +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"]) + data = s3.transform([GzipTransform(), JsonTransform()]) + for line in data: + print(line) # returns a dict diff --git a/examples/streaming/src/s3_transform_common.py b/examples/streaming/src/s3_transform_common.py new file mode 100644 index 00000000000..fc04423c040 --- /dev/null +++ b/examples/streaming/src/s3_transform_common.py @@ -0,0 +1,10 @@ +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"], gunzip=True) + for line in s3: + print(line) diff --git a/examples/streaming/src/s3_transform_in_place.py b/examples/streaming/src/s3_transform_in_place.py new file mode 100644 index 00000000000..ead1a20c240 --- /dev/null +++ b/examples/streaming/src/s3_transform_in_place.py @@ -0,0 +1,15 @@ +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import ( + GzipTransform, + JsonTransform, +) +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"]) + s3.transform([GzipTransform(), JsonTransform()], in_place=True) + for line in s3: + print(line) # returns a dict diff --git a/examples/streaming/src/s3_transform_lzma.py b/examples/streaming/src/s3_transform_lzma.py new file mode 100644 index 00000000000..0b172b17228 --- 
/dev/null +++ b/examples/streaming/src/s3_transform_lzma.py @@ -0,0 +1,15 @@ +import zipfile +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"]) + + zf = s3.transform(ZipTransform(compression=zipfile.ZIP_LZMA)) + + print(zf.nameslist()) + zf.extract(zf.namelist()[0], "/tmp") diff --git a/mkdocs.yml b/mkdocs.yml index 65e053ae27c..e72465f5736 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,6 +31,7 @@ nav: - utilities/feature_flags.md - utilities/jmespath_functions.md - CloudFormation Custom Resources: https://github.com/aws-cloudformation/custom-resource-helper" target="_blank + - utilities/streaming.md theme: name: material From 1fddf1e85255b82f574300a3eabbf77cdc99d1f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Tue, 15 Nov 2022 15:27:16 +0100 Subject: [PATCH 02/43] chore: add streaming utility area and label --- .github/boring-cyborg.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/boring-cyborg.yml b/.github/boring-cyborg.yml index 325cd7c4ebc..7764a1d73d7 100644 --- a/.github/boring-cyborg.yml +++ b/.github/boring-cyborg.yml @@ -42,6 +42,8 @@ labelPRBasedOnFilePath: typing: - aws_lambda_powertools/utilities/typing/* - mypy.ini + streaming: + - aws_lambda_powertools/utilities/streaming/* commons: - aws_lambda_powertools/shared/* From 28c6003dbfaec8c6da57b95235c32aa382db4ddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Tue, 15 Nov 2022 15:30:31 +0100 Subject: [PATCH 03/43] fix: use Literal from typing_extensions --- aws_lambda_powertools/utilities/streaming/s3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aws_lambda_powertools/utilities/streaming/s3.py b/aws_lambda_powertools/utilities/streaming/s3.py index 4d1d9d5eaeb..9331d10399f 100644 --- a/aws_lambda_powertools/utilities/streaming/s3.py +++ b/aws_lambda_powertools/utilities/streaming/s3.py @@ -1,10 +1,11 @@ import io import logging import typing -from typing import TYPE_CHECKING, List, Literal, Optional, Sequence, Union, overload +from typing import TYPE_CHECKING, List, Optional, Sequence, Union, overload import boto3 from botocore.response import StreamingBody +from typing_extensions import Literal from aws_lambda_powertools.utilities.streaming.transformations.base import ( BaseTransform, From fc5bdbb8dbb70fbbb7a7fdc3574ea3fd99c68ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 16 Nov 2022 16:58:58 +0100 Subject: [PATCH 04/43] fix: re-wrote implementation and added documentation --- .../utilities/streaming/__init__.py | 3 + .../utilities/streaming/_s3_seekable_io.py | 180 +++++++++++++ .../utilities/streaming/s3.py | 242 ------------------ .../utilities/streaming/s3_object.py | 235 +++++++++++++++++ .../streaming/transformations/__init__.py | 3 +- .../streaming/transformations/base.py | 19 +- .../streaming/transformations/csv.py | 16 ++ .../streaming/transformations/gzip.py | 4 +- .../streaming/transformations/json.py | 7 +- .../streaming/transformations/zip.py | 8 +- docs/utilities/streaming.md | 14 +- examples/streaming/src/s3_basic_stream.py | 2 +- .../src/s3_basic_stream_with_version.py | 2 +- examples/streaming/src/s3_transform.py | 6 +- examples/streaming/src/s3_transform_common.py 
| 2 +- .../streaming/src/s3_transform_in_place.py | 6 +- examples/streaming/src/s3_transform_lzma.py | 2 +- 17 files changed, 476 insertions(+), 275 deletions(-) create mode 100644 aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py delete mode 100644 aws_lambda_powertools/utilities/streaming/s3.py create mode 100644 aws_lambda_powertools/utilities/streaming/s3_object.py create mode 100644 aws_lambda_powertools/utilities/streaming/transformations/csv.py diff --git a/aws_lambda_powertools/utilities/streaming/__init__.py b/aws_lambda_powertools/utilities/streaming/__init__.py index e69de29bb2d..8c326b99400 100644 --- a/aws_lambda_powertools/utilities/streaming/__init__.py +++ b/aws_lambda_powertools/utilities/streaming/__init__.py @@ -0,0 +1,3 @@ +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object + +__all__ = ["S3Object"] diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py new file mode 100644 index 00000000000..fc4fb8777b8 --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -0,0 +1,180 @@ +import io +import logging +from typing import IO, TYPE_CHECKING, AnyStr, Iterable, List, Optional + +import boto3 +from botocore.response import StreamingBody + +if TYPE_CHECKING: + from mypy_boto3_s3 import S3ServiceResource + from mypy_boto3_s3.service_resource import Object + +logger = logging.getLogger(__name__) + + +class _S3SeekableIO(IO[bytes]): + """ + _S3SeekableIO wraps boto3.StreamingBody to allow for seeking. Seeking is achieved by closing the + existing connection and re-opening a new one, passing the correct HTTP Range header. + + Parameters + ---------- + bucket: str + The S3 bucket + key: str + The S3 key + version_id: str, optional + A version ID of the object, when the S3 bucket is versioned + boto3_s3_resource: S3ServiceResource, optional + An optional boto3 S3 resource. If missing, a new one will be created. + """ + + def __init__( + self, bucket: str, key: str, version_id: Optional[str] = None, boto3_s3_resource=Optional["S3ServiceResource"] + ): + self.bucket = bucket + self.key = key + self.version_id = version_id + + # Holds the current position in the stream + self._position = 0 + + # Caches the size of the object + self._size: Optional[int] = None + + self._s3_object: Optional["Object"] = None + self._s3_resource: Optional["S3ServiceResource"] = boto3_s3_resource + self._raw_stream: Optional[StreamingBody] = None + + @property + def s3_resource(self) -> "S3ServiceResource": + """ + Returns a boto3 S3ServiceResource + """ + if self._s3_resource is None: + self._s3_resource = boto3.resource("s3") + return self._s3_resource + + @property + def s3_object(self) -> "Object": + """ + Returns a boto3 S3Object + """ + if self._s3_object is None: + if self.version_id is not None: + self._s3_object = self.s3_resource.ObjectVersion( + bucket_name=self.bucket, object_key=self.key, id=self.version_id + ).Object() + else: + self._s3_object = self.s3_resource.Object(bucket_name=self.bucket, key=self.key) + + return self._s3_object + + @property + def size(self) -> int: + """ + Retrieves the size of the S3 object + """ + if self._size is None: + self._size = self.s3_object.content_length + return self._size + + @property + def raw_stream(self) -> StreamingBody: + """ + Returns the boto3 StreamingBody, starting the stream from the seeked position. 
+ """ + if self._raw_stream is None: + range_header = "bytes=%d-" % self._position + logging.debug(f"Starting new stream at {range_header}...") + self._raw_stream = self.s3_object.get(Range=range_header)["Body"] + + return self._raw_stream + + def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: + """ + Seeks the current object, invalidating the underlying stream if the position changes. + """ + current_position = self._position + + if whence == io.SEEK_SET: + self._position = offset + elif whence == io.SEEK_CUR: + self._position += offset + elif whence == io.SEEK_END: + self._position = self.size + offset + else: + raise ValueError(f"invalid whence ({whence}, should be {io.SEEK_SET}, {io.SEEK_CUR}, {io.SEEK_END})") + + # If we changed the position in the stream, we should invalidate the existing stream + # and open a new one on the next read + if current_position != self._position and self._raw_stream is not None: + self._raw_stream.close() + self._raw_stream = None + + return self._position + + def seekable(self) -> bool: + return True + + def readable(self) -> bool: + return True + + def writable(self) -> bool: + return False + + def tell(self) -> int: + return self._position + + def read(self, size: Optional[int] = -1) -> bytes: + size = None if size == -1 else size + data = self.raw_stream.read(size) + if data is not None: + self._position += len(data) + return data + + def readline(self, size: Optional[int] = None) -> bytes: + data = self.raw_stream.readline(size) + self._position += len(data) + return data + + def readlines(self, hint: int = -1) -> List[bytes]: + # boto3's StreamingResponse doesn't implement the "hint" parameter + return self.raw_stream.readlines() + + @property + def closed(self) -> bool: + return self.raw_stream.closed + + def __next__(self): + return self.raw_stream.__next__() + + def __iter__(self): + return self.raw_stream.__iter__() + + def __enter__(self): + return self + + def __exit__(self, **kwargs): + self.close() + + def close(self) -> None: + self.raw_stream.close() + + def fileno(self) -> int: + raise NotImplementedError() + + def flush(self) -> None: + raise NotImplementedError() + + def isatty(self) -> bool: + return False + + def truncate(self, size: Optional[int] = 0) -> int: + raise NotImplementedError() + + def write(self, data: AnyStr) -> int: + raise NotImplementedError() + + def writelines(self, lines: Iterable[AnyStr]) -> None: + raise NotImplementedError() diff --git a/aws_lambda_powertools/utilities/streaming/s3.py b/aws_lambda_powertools/utilities/streaming/s3.py deleted file mode 100644 index 9331d10399f..00000000000 --- a/aws_lambda_powertools/utilities/streaming/s3.py +++ /dev/null @@ -1,242 +0,0 @@ -import io -import logging -import typing -from typing import TYPE_CHECKING, List, Optional, Sequence, Union, overload - -import boto3 -from botocore.response import StreamingBody -from typing_extensions import Literal - -from aws_lambda_powertools.utilities.streaming.transformations.base import ( - BaseTransform, - T, -) -from aws_lambda_powertools.utilities.streaming.transformations.gzip import GzipTransform -from aws_lambda_powertools.utilities.streaming.transformations.json import JsonTransform - -if TYPE_CHECKING: - from mypy_boto3_s3.service_resource import Object, S3ServiceResource - -logger = logging.getLogger(__name__) - - -class _S3Proxy(io.RawIOBase): - def __init__( - self, bucket: str, key: str, version_id: Optional[str] = None, boto3_s3_resource=Optional["S3ServiceResource"] - ): - self.bucket = bucket - self.key = 
key - self.version_id = version_id - - self._position = 0 - self._size: Optional[int] = None - - self._s3_object: Optional["Object"] = None - self._s3_resource: Optional["S3ServiceResource"] = boto3_s3_resource - self._raw_stream: Optional[StreamingBody] = None - - @property - def s3_resource(self) -> "S3ServiceResource": - if self._s3_resource is None: - self._s3_resource = boto3.resource("s3") - return self._s3_resource - - @property - def s3_object(self) -> "Object": - if self._s3_object is None: - if self.version_id is not None: - self._s3_object = self.s3_resource.ObjectVersion( - bucket_name=self.bucket, object_key=self.key, id=self.version_id - ).Object() - else: - self._s3_object = self.s3_resource.Object(bucket_name=self.bucket, key=self.key) - - return self._s3_object - - @property - def size(self) -> int: - if self._size is None: - self._size = self.s3_object.content_length - return self._size - - @property - def raw_stream(self) -> StreamingBody: - if self._raw_stream is None: - range_header = "bytes=%d-" % self._position - logging.debug(f"Starting new stream at {range_header}...") - self._raw_stream = self.s3_object.get(Range=range_header)["Body"] - - return self._raw_stream - - def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: - current_position = self._position - - if whence == io.SEEK_SET: - self._position = offset - elif whence == io.SEEK_CUR: - self._position += offset - elif whence == io.SEEK_END: - self._position = self.size + offset - else: - raise ValueError(f"invalid whence ({whence}, should be {io.SEEK_SET}, {io.SEEK_CUR}, {io.SEEK_END})") - - # If we changed the position in the stream, we should invalidate the existing stream - # and open a new one on the next read - if current_position != self._position and self._raw_stream is not None: - self._raw_stream.close() - self._raw_stream = None - - return self._position - - def seekable(self) -> bool: - return True - - def readable(self) -> bool: - return True - - def writable(self) -> bool: - return False - - def read(self, size: Optional[int] = -1) -> Optional[bytes]: - size = None if size == -1 else size - data = self.raw_stream.read(size) - if data is not None: - self._position += len(data) - return data - - def readline(self, size: Optional[int] = None) -> bytes: - data = self.raw_stream.readline(size) - self._position += len(data) - return data - - @property - def closed(self) -> bool: - return self.raw_stream.closed - - def __next__(self): - return self.raw_stream.__next__() - - def __iter__(self): - return self.raw_stream.__iter__() - - -class S3Object(io.RawIOBase): - def __init__( - self, - bucket: str, - key: str, - version_id: Optional[str] = None, - boto3_s3_resource: Optional["S3ServiceResource"] = None, - gunzip: Optional[bool] = False, - json: Optional[bool] = False, - ): - self.bucket = bucket - self.key = key - self.version_id = version_id - self.raw_stream = _S3Proxy(bucket=bucket, key=key, version_id=version_id, boto3_s3_resource=boto3_s3_resource) - - self._transformed_stream: Optional[io.RawIOBase] = None - self._data_transformations: List[BaseTransform] = [] - if gunzip: - self._data_transformations.append(GzipTransform()) - if json: - self._data_transformations.append(JsonTransform()) - - @property - def size(self) -> int: - return self.raw_stream.size - - @property - def transformed_stream(self) -> io.RawIOBase: - if self._transformed_stream is None: - # Apply all the transformations - transformed_stream = self.raw_stream - for transformation in self._data_transformations: - 
transformed_stream = transformation.transform(transformed_stream) - - self._transformed_stream = transformed_stream - - return self._transformed_stream - - def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: - return self.raw_stream.seek(offset, whence) - - def seekable(self) -> bool: - return self.raw_stream.seekable() - - def readable(self) -> bool: - return self.raw_stream.readable() - - def writable(self) -> bool: - return self.raw_stream.writable() - - def tell(self) -> int: - return self.raw_stream.tell() - - @property - def closed(self) -> bool: - return self.raw_stream.closed - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def close(self): - self.raw_stream.close() - - # Also close transformed stream if there are any transformations - if self.raw_stream != self._transformed_stream and self._transformed_stream is not None: - self._transformed_stream.close() - - def read(self, size: int = -1) -> Optional[bytes]: - return self.transformed_stream.read(size) - - def readline(self, size: Optional[int] = None) -> bytes: - return self.transformed_stream.readline(size) - - def __next__(self): - return self.transformed_stream.__next__() - - def __iter__(self): - return self.transformed_stream.__iter__() - - @overload - def transform( - self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[True] - ) -> T: - pass - - @overload - def transform( - self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[False] - ) -> None: - pass - - @overload - def transform(self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]]) -> T: - pass - - def transform( - self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Optional[bool] = False - ) -> Optional[T]: - if self.tell() != 0: - raise ValueError(f"Cannot add transformations to a read object. Already read {self.tell()} bytes") - - # Make transformations always be a sequence to make mypy happy - if not isinstance(transformations, Sequence): - transformations = [transformations] - - if in_place: - self._data_transformations.extend(transformations) - - # Invalidate transformed stream - self._transformed_stream = None - return None - else: - # Tell MyPy that raw_stream actually implements T (bound to io.RawIOBase) - stream = typing.cast(T, self.raw_stream) - for transformation in transformations: - stream = transformation.transform(stream) - return stream diff --git a/aws_lambda_powertools/utilities/streaming/s3_object.py b/aws_lambda_powertools/utilities/streaming/s3_object.py new file mode 100644 index 00000000000..a44162c80f9 --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/s3_object.py @@ -0,0 +1,235 @@ +import io +from typing import ( + IO, + TYPE_CHECKING, + AnyStr, + Iterable, + List, + Optional, + Sequence, + Union, + cast, + overload, +) + +from typing_extensions import Literal + +from aws_lambda_powertools.utilities.streaming._s3_seekable_io import _S3SeekableIO +from aws_lambda_powertools.utilities.streaming.transformations import ( + CsvTransform, + GzipTransform, +) +from aws_lambda_powertools.utilities.streaming.transformations.base import ( + BaseTransform, + T, +) + +if TYPE_CHECKING: + from mypy_boto3_s3 import S3ServiceResource + + +class S3Object(IO[bytes]): + """ + Seekable streamable S3 Object reader. + + S3Object implements the IO[bytes], backed by a seekable s3 streaming. 
+ + Parameters + ---------- + bucket: str + The S3 bucket + key: str + The S3 key + version_id: str, optional + A version ID of the object, when the S3 bucket is versioned + boto3_s3_resource: S3ServiceResource, optional + An optional boto3 S3 resource. If missing, a new one will be created. + gunzip: bool, optional + Enables the Gunzip data transformation + csv: bool, optional + Enables the CSV data transformation + + Example + ------- + + ** Reads a line from an S3, loading as little data as necessary + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> + >>> line: bytes = S3Object(bucket="bucket", key="key").readline() + >>> + >>> print(line) + + """ + + def __init__( + self, + bucket: str, + key: str, + version_id: Optional[str] = None, + boto3_s3_resource: Optional["S3ServiceResource"] = None, + gunzip: Optional[bool] = False, + csv: Optional[bool] = False, + ): + self.bucket = bucket + self.key = key + self.version_id = version_id + + # The underlying seekable IO, where all the magic happens + self.raw_stream = _S3SeekableIO( + bucket=bucket, key=key, version_id=version_id, boto3_s3_resource=boto3_s3_resource + ) + + # Stores the list of data transformations + self._data_transformations: List[BaseTransform] = [] + if gunzip: + self._data_transformations.append(GzipTransform()) + if csv: + self._data_transformations.append(CsvTransform()) + + # Stores the cached transformed stream + self._transformed_stream: Optional[IO[bytes]] = None + + @property + def size(self) -> int: + """ + Retrieves the size of the underlying S3 object + """ + return self.raw_stream.size + + @property + def transformed_stream(self) -> IO[bytes]: + """ + Returns a IO[bytes] stream with all the data transformations applied in order + """ + if self._transformed_stream is None: + # Apply all the transformations + transformed_stream = self.raw_stream + for transformation in self._data_transformations: + transformed_stream = transformation.transform(transformed_stream) + + self._transformed_stream = transformed_stream + + return self._transformed_stream + + @overload + def transform( + self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[True] + ) -> T: + pass + + @overload + def transform( + self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[False] + ) -> None: + pass + + @overload + def transform(self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]]) -> T: + pass + + def transform( + self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Optional[bool] = False + ) -> Optional[T]: + """ + Applies one or more data transformations to the stream. + + Parameters + ---------- + transformations: BaseTransform[T] | Sequence[BaseTransform[T]] + One or more transformations to apply. Transformations are applied in the same order as they are declared. + in_place: bool, optional + Transforms the stream in place, instead of returning a new stream object. Defaults to false. + + Returns + ------- + T[bound=IO[bytes]], optional + If in_place is False, returns an IO[bytes] object representing the transformed stream + """ + if self.tell() != 0: + raise ValueError(f"Cannot add transformations to a read object. 
Already read {self.tell()} bytes") + + # Make transformations always be a sequence to make mypy happy + if not isinstance(transformations, Sequence): + transformations = [transformations] + + if in_place: + self._data_transformations.extend(transformations) + + # Invalidate any existing transformed stream. + # It will be created again next time it's accessed. + self._transformed_stream = None + return None + else: + # Tell mypy that raw_stream actually implements T (bound to IO[bytes]) + stream = cast(T, self.raw_stream) + for transformation in transformations: + stream = transformation.transform(stream) + return stream + + # From this point on, we're just implementing all the standard methods on the IO[bytes] type. + # There's no magic here, just delegating all the calls to our transformed_stream. + def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: + return self.transformed_stream.seek(offset, whence) + + def seekable(self) -> bool: + return self.transformed_stream.seekable() + + def readable(self) -> bool: + return self.transformed_stream.readable() + + def writable(self) -> bool: + return self.transformed_stream.writable() + + def tell(self) -> int: + return self.transformed_stream.tell() + + @property + def closed(self) -> bool: + return self.transformed_stream.closed + + def __enter__(self): + return self + + def __exit__(self, *args): + self.close() + + def close(self): + self.raw_stream.close() + + # Also close transformed stream if there are any transformations + if self.raw_stream != self._transformed_stream and self._transformed_stream is not None: + self._transformed_stream.close() + + def read(self, size: int = -1) -> bytes: + return self.transformed_stream.read(size) + + def readline(self, size: Optional[int] = -1) -> bytes: + return self.transformed_stream.readline() + + def readlines(self, hint: int = -1) -> List[bytes]: + return self.transformed_stream.readlines(hint) + + def __next__(self): + return self.transformed_stream.__next__() + + def __iter__(self): + return self.transformed_stream.__iter__() + + def fileno(self) -> int: + raise NotImplementedError() + + def flush(self) -> None: + raise NotImplementedError() + + def isatty(self) -> bool: + return False + + def truncate(self, size: Optional[int] = 0) -> int: + raise NotImplementedError() + + def write(self, data: AnyStr) -> int: + raise NotImplementedError() + + def writelines(self, lines: Iterable[AnyStr]) -> None: + raise NotImplementedError() diff --git a/aws_lambda_powertools/utilities/streaming/transformations/__init__.py b/aws_lambda_powertools/utilities/streaming/transformations/__init__.py index 8c82731a2ab..044dadfe877 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/__init__.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/__init__.py @@ -1,6 +1,7 @@ from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform +from aws_lambda_powertools.utilities.streaming.transformations.csv import CsvTransform from aws_lambda_powertools.utilities.streaming.transformations.gzip import GzipTransform from aws_lambda_powertools.utilities.streaming.transformations.json import JsonTransform from aws_lambda_powertools.utilities.streaming.transformations.zip import ZipTransform -__all__ = ["BaseTransform", "GzipTransform", "JsonTransform", "ZipTransform"] +__all__ = ["BaseTransform", "GzipTransform", "JsonTransform", "ZipTransform", "CsvTransform"] diff --git a/aws_lambda_powertools/utilities/streaming/transformations/base.py 
b/aws_lambda_powertools/utilities/streaming/transformations/base.py index fb098bb3bf6..d55deb637fd 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/base.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/base.py @@ -1,14 +1,23 @@ -import io from abc import abstractmethod -from typing import Generic, TypeVar +from typing import IO, Generic, TypeVar -T = TypeVar("T", bound=io.RawIOBase) +T = TypeVar("T", bound=IO[bytes]) class BaseTransform(Generic[T]): - def __init__(self, **kwargs): + """ + BaseTransform is the base class all data transformations need to implement. + """ + + def __init__(self, *args, **kwargs): + self.args = args self.kwargs = kwargs @abstractmethod - def transform(self, input_stream: io.RawIOBase) -> T: + def transform(self, input_stream: IO[bytes]) -> T: + """ + Transform the data from input_stream into something that implements IO[bytes]. + This allows you to return your own object while still conforming to a protocol + that allows transformations to be nested. + """ pass diff --git a/aws_lambda_powertools/utilities/streaming/transformations/csv.py b/aws_lambda_powertools/utilities/streaming/transformations/csv.py new file mode 100644 index 00000000000..2bc81bea6e3 --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/transformations/csv.py @@ -0,0 +1,16 @@ +import csv +import io +from csv import DictReader +from typing import IO + +from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform + + +class CsvTransform(BaseTransform): + def transform(self, input_stream: IO[bytes]) -> DictReader: + encoding = self.kwargs.get("encoding", "utf-8") + newline = self.kwargs.get("newline") + + # csv module needs an Iterator[str], so we wrap the underlying stream into a TextIO + iterator = io.TextIOWrapper(input_stream, encoding=encoding, newline=newline) + return csv.DictReader(iterator, *self.args, **self.kwargs) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py index 66b1d67d31d..859c80b83c9 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py @@ -1,9 +1,9 @@ -import io from gzip import GzipFile +from typing import IO from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform class GzipTransform(BaseTransform): - def transform(self, input_stream: io.RawIOBase) -> GzipFile: + def transform(self, input_stream: IO[bytes]) -> GzipFile: return GzipFile(fileobj=input_stream, mode="rb", **self.kwargs) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/json.py b/aws_lambda_powertools/utilities/streaming/transformations/json.py index 775bd6036fd..bd2aa2dcd24 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/json.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/json.py @@ -1,19 +1,20 @@ import io import json from json import JSONDecodeError -from typing import Optional +from typing import IO, Optional from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform class JsonDeserializer(io.RawIOBase): - def __init__(self, input_stream: io.RawIOBase): + def __init__(self, input_stream: IO[bytes]): self.input = input_stream def read(self, size: int = -1) -> Optional[bytes]: return self.input.read(size) def readline(self, size: Optional[int] = None) -> bytes: + size = -1 if size is None else size return 
self.input.readline(size) def read_object(self) -> dict: @@ -34,5 +35,5 @@ def __next__(self): class JsonTransform(BaseTransform): - def transform(self, input_stream: io.RawIOBase) -> JsonDeserializer: + def transform(self, input_stream: IO[bytes]) -> JsonDeserializer: return JsonDeserializer(input_stream=input_stream) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/zip.py b/aws_lambda_powertools/utilities/streaming/transformations/zip.py index 15de2d301e5..9e8b52e8be0 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/zip.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/zip.py @@ -1,11 +1,9 @@ -import io -import typing +from typing import IO from zipfile import ZipFile from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform class ZipTransform(BaseTransform): - def transform(self, input_stream: io.RawIOBase) -> ZipFile: - input_as_io = typing.cast(typing.IO[bytes], input_stream) - return ZipFile(input_as_io, mode="r", **self.kwargs) + def transform(self, input_stream: IO[bytes]) -> ZipFile: + return ZipFile(input_stream, mode="r", **self.kwargs) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 8f2aaab6c4b..e5ffdc1efae 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -42,13 +42,13 @@ The code above will stream the contents from S3 as fast as possible, using minim The utility has some built-in data transformations to help deal with common scenarios while streaming data from S3. -| Name | Description | -|---------------------------------|---------------------------------------------------------------------------------------------| -| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | -| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | -| **JSON** | Parses each line as a JSON object, returning matched objects | +| Name | Description | +|----------|--------------------------------------------------------------------------------------------------| +| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | +| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | +| **CSV** | Parses each line as a CSV object, returning dictionary objects | -Common options like gunzipping a stream and parsing data as JSON can be enabled directly on the constructor: +Common options like gunzipping a stream and parsing data as CSV can be enabled directly on the constructor: ```python hl_lines="8" --8<-- "examples/streaming/src/s3_transform_common.py" @@ -79,7 +79,7 @@ Each data transformation class accepts additional options to customize the trans |----------|----------------------------------------------------------------------------------------------------------------| | **Gzip** | All the options from the [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | | **Zip** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | -| **JSON** | No additional options are supported at the moment | +| **CSV** | All the options from the [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | For instance, if you want to unzip an S3 file compressed using `LZMA` you could pass that option in the constructor: diff --git 
a/examples/streaming/src/s3_basic_stream.py b/examples/streaming/src/s3_basic_stream.py index 5d1914479e0..b8adb8ed683 100644 --- a/examples/streaming/src/s3_basic_stream.py +++ b/examples/streaming/src/s3_basic_stream.py @@ -1,6 +1,6 @@ from typing import Dict -from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object from aws_lambda_powertools.utilities.typing import LambdaContext diff --git a/examples/streaming/src/s3_basic_stream_with_version.py b/examples/streaming/src/s3_basic_stream_with_version.py index 0be64ca8e6b..78a93e51c38 100644 --- a/examples/streaming/src/s3_basic_stream_with_version.py +++ b/examples/streaming/src/s3_basic_stream_with_version.py @@ -1,6 +1,6 @@ from typing import Dict -from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object from aws_lambda_powertools.utilities.typing import LambdaContext diff --git a/examples/streaming/src/s3_transform.py b/examples/streaming/src/s3_transform.py index d45e16dfd4d..0f5130bac96 100644 --- a/examples/streaming/src/s3_transform.py +++ b/examples/streaming/src/s3_transform.py @@ -1,15 +1,15 @@ from typing import Dict -from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object from aws_lambda_powertools.utilities.streaming.transformations import ( + CsvTransform, GzipTransform, - JsonTransform, ) from aws_lambda_powertools.utilities.typing import LambdaContext def lambda_handler(event: Dict[str, str], context: LambdaContext): s3 = S3Object(bucket=event["bucket"], key=event["key"]) - data = s3.transform([GzipTransform(), JsonTransform()]) + data = s3.transform([GzipTransform(), CsvTransform()]) for line in data: print(line) # returns a dict diff --git a/examples/streaming/src/s3_transform_common.py b/examples/streaming/src/s3_transform_common.py index fc04423c040..e5ea7239ecd 100644 --- a/examples/streaming/src/s3_transform_common.py +++ b/examples/streaming/src/s3_transform_common.py @@ -1,6 +1,6 @@ from typing import Dict -from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object from aws_lambda_powertools.utilities.typing import LambdaContext diff --git a/examples/streaming/src/s3_transform_in_place.py b/examples/streaming/src/s3_transform_in_place.py index ead1a20c240..3ad4ce4b2a2 100644 --- a/examples/streaming/src/s3_transform_in_place.py +++ b/examples/streaming/src/s3_transform_in_place.py @@ -1,15 +1,15 @@ from typing import Dict -from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object from aws_lambda_powertools.utilities.streaming.transformations import ( + CsvTransform, GzipTransform, - JsonTransform, ) from aws_lambda_powertools.utilities.typing import LambdaContext def lambda_handler(event: Dict[str, str], context: LambdaContext): s3 = S3Object(bucket=event["bucket"], key=event["key"]) - s3.transform([GzipTransform(), JsonTransform()], in_place=True) + s3.transform([GzipTransform(), CsvTransform()], in_place=True) for line in s3: print(line) # returns a dict diff --git a/examples/streaming/src/s3_transform_lzma.py b/examples/streaming/src/s3_transform_lzma.py index 0b172b17228..01cb3b22161 100644 --- a/examples/streaming/src/s3_transform_lzma.py +++ b/examples/streaming/src/s3_transform_lzma.py @@ -1,7 +1,7 @@ 
import zipfile from typing import Dict -from aws_lambda_powertools.utilities.streaming.s3 import S3Object +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform from aws_lambda_powertools.utilities.typing import LambdaContext From c11de5c8d9eece135923d605066e6f72e0f205ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 16 Nov 2022 21:37:13 +0100 Subject: [PATCH 05/43] chore: add initial tests for s3_seekable_io --- .../utilities/streaming/_s3_seekable_io.py | 13 +- tests/functional/streaming/__init__.py | 0 tests/functional/streaming/test_s3object.py | 201 ++++++++++++++++++ 3 files changed, 211 insertions(+), 3 deletions(-) create mode 100644 tests/functional/streaming/__init__.py create mode 100644 tests/functional/streaming/test_s3object.py diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index fc4fb8777b8..1a933bd7577 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -39,6 +39,9 @@ def __init__( # Holds the current position in the stream self._position = 0 + # Stores the closed state of the stream + self._closed: bool = False + # Caches the size of the object self._size: Optional[int] = None @@ -88,6 +91,7 @@ def raw_stream(self) -> StreamingBody: range_header = "bytes=%d-" % self._position logging.debug(f"Starting new stream at {range_header}...") self._raw_stream = self.s3_object.get(Range=range_header)["Body"] + self._closed = False return self._raw_stream @@ -140,11 +144,13 @@ def readline(self, size: Optional[int] = None) -> bytes: def readlines(self, hint: int = -1) -> List[bytes]: # boto3's StreamingResponse doesn't implement the "hint" parameter - return self.raw_stream.readlines() + data = self.raw_stream.readlines() + self._position += sum(len(line) for line in data) + return data @property def closed(self) -> bool: - return self.raw_stream.closed + return self._closed def __next__(self): return self.raw_stream.__next__() @@ -155,11 +161,12 @@ def __iter__(self): def __enter__(self): return self - def __exit__(self, **kwargs): + def __exit__(self, *kwargs): self.close() def close(self) -> None: self.raw_stream.close() + self._closed = True def fileno(self) -> int: raise NotImplementedError() diff --git a/tests/functional/streaming/__init__.py b/tests/functional/streaming/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/functional/streaming/test_s3object.py b/tests/functional/streaming/test_s3object.py new file mode 100644 index 00000000000..4b3ed59355f --- /dev/null +++ b/tests/functional/streaming/test_s3object.py @@ -0,0 +1,201 @@ +import io + +import boto3 +import pytest +from botocore import stub +from botocore.response import StreamingBody + +from aws_lambda_powertools.utilities.streaming._s3_seekable_io import _S3SeekableIO + + +@pytest.fixture +def s3_resource(): + return boto3.resource("s3") + + +@pytest.fixture +def s3_seekable_obj(s3_resource): + return _S3SeekableIO(bucket="bucket", key="key", boto3_s3_resource=s3_resource) + + +@pytest.fixture +def s3_resource_stub(s3_resource): + s3_stub = stub.Stubber(s3_resource.meta.client) + s3_stub.activate() + return s3_stub + + +def test_seekable(s3_seekable_obj): + assert s3_seekable_obj.seekable() is True + + +def test_readable(s3_seekable_obj): + assert s3_seekable_obj.readable() is 
True + + +def test_writeable(s3_seekable_obj): + assert s3_seekable_obj.writable() is False + + +def test_tell_is_zero(s3_seekable_obj): + assert s3_seekable_obj.tell() == 0 + + +def test_seek_set_changes_position(s3_seekable_obj): + assert s3_seekable_obj.seek(300, io.SEEK_SET) == 300 + assert s3_seekable_obj.tell() == 300 + + +def test_seek_cur_changes_position(s3_seekable_obj): + assert s3_seekable_obj.seek(200, io.SEEK_CUR) == 200 + assert s3_seekable_obj.seek(100, io.SEEK_CUR) == 300 + assert s3_seekable_obj.tell() == 300 + + +def test_seek_end(s3_seekable_obj, s3_resource_stub): + s3_resource_stub.add_response("head_object", {"ContentLength": 1000}) + + assert s3_seekable_obj.seek(0, io.SEEK_END) == 1000 + assert s3_seekable_obj.tell() == 1000 + + +def test_size(s3_seekable_obj, s3_resource_stub): + s3_resource_stub.add_response("head_object", {"ContentLength": 1000}) + + assert s3_seekable_obj.size == 1000 + + +def test_raw_stream_fetches_with_range_header(s3_seekable_obj, s3_resource_stub): + s3_resource_stub.add_response( + "get_object", + {"Body": ""}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + assert s3_seekable_obj.raw_stream is not None + + +def test_raw_stream_fetches_with_range_header_after_seek(s3_seekable_obj, s3_resource_stub): + s3_seekable_obj.seek(100, io.SEEK_SET) + + s3_resource_stub.add_response( + "get_object", + {"Body": ""}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=100-"}, + ) + + assert s3_seekable_obj.raw_stream is not None + + +def test_read(s3_seekable_obj, s3_resource_stub): + payload = b"hello world" + streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + + s3_resource_stub.add_response( + "get_object", + {"Body": streaming_body}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + assert s3_seekable_obj.read(5) == b"hello" + assert s3_seekable_obj.read(1) == b" " + assert s3_seekable_obj.read(10) == b"world" + assert s3_seekable_obj.tell() == len(payload) + + +def test_readline(s3_seekable_obj, s3_resource_stub): + payload = b"hello world\nworld hello" + streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + + s3_resource_stub.add_response( + "get_object", + {"Body": streaming_body}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + assert s3_seekable_obj.readline() == b"hello world\n" + assert s3_seekable_obj.readline() == b"world hello" + assert s3_seekable_obj.tell() == len(payload) + + +def test_readlines(s3_seekable_obj, s3_resource_stub): + payload = b"hello world\nworld hello" + streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + + s3_resource_stub.add_response( + "get_object", + {"Body": streaming_body}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + assert s3_seekable_obj.readlines() == [b"hello world\n", b"world hello"] + assert s3_seekable_obj.tell() == len(payload) + + +def test_closed(s3_seekable_obj, s3_resource_stub): + payload = b"test" + streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + + s3_resource_stub.add_response( + "get_object", + {"Body": streaming_body}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + s3_seekable_obj.close() + assert s3_seekable_obj.closed is True + + +def test_next(s3_seekable_obj, 
s3_resource_stub): + payload = b"test" + streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + + s3_resource_stub.add_response( + "get_object", + {"Body": streaming_body}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + assert next(s3_seekable_obj) == b"test" + with pytest.raises(StopIteration): + next(s3_seekable_obj) + + +def test_context_manager(s3_seekable_obj, s3_resource_stub): + payload = b"test" + streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + + s3_resource_stub.add_response( + "get_object", + {"Body": streaming_body}, + {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, + ) + + with s3_seekable_obj as f: + assert f.read(4) == b"test" + + assert s3_seekable_obj.closed is True + + +def test_fileno(s3_seekable_obj): + with pytest.raises(NotImplementedError): + s3_seekable_obj.fileno() + + +def test_flush(s3_seekable_obj): + with pytest.raises(NotImplementedError): + s3_seekable_obj.flush() + + +def test_isatty(s3_seekable_obj): + assert s3_seekable_obj.isatty() is False + + +def test_truncate(s3_seekable_obj): + with pytest.raises(NotImplementedError): + s3_seekable_obj.truncate() + + +def test_write(s3_seekable_obj): + with pytest.raises(NotImplementedError): + s3_seekable_obj.write(b"data") From 8dc8b84844b7f96347549613bcf178fe2caaca9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 16 Nov 2022 21:37:55 +0100 Subject: [PATCH 06/43] fix: renamed file --- .../streaming/{test_s3object.py => test_s3_seekable_io.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/functional/streaming/{test_s3object.py => test_s3_seekable_io.py} (100%) diff --git a/tests/functional/streaming/test_s3object.py b/tests/functional/streaming/test_s3_seekable_io.py similarity index 100% rename from tests/functional/streaming/test_s3object.py rename to tests/functional/streaming/test_s3_seekable_io.py From 77aba9b1d948c08a7ea0a9f0892a1f3759b6abd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Thu, 17 Nov 2022 10:55:26 +0100 Subject: [PATCH 07/43] fix: remove json transformation --- .../streaming/transformations/base.py | 3 +- .../streaming/transformations/json.py | 39 ------------------- docs/utilities/streaming.md | 16 +++++--- examples/streaming/src/s3_json_transform.py | 29 ++++++++++++++ examples/streaming/src/s3_transform_tsv.py | 13 +++++++ 5 files changed, 55 insertions(+), 45 deletions(-) delete mode 100644 aws_lambda_powertools/utilities/streaming/transformations/json.py create mode 100644 examples/streaming/src/s3_json_transform.py create mode 100644 examples/streaming/src/s3_transform_tsv.py diff --git a/aws_lambda_powertools/utilities/streaming/transformations/base.py b/aws_lambda_powertools/utilities/streaming/transformations/base.py index d55deb637fd..06d9aaa02aa 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/base.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/base.py @@ -16,7 +16,8 @@ def __init__(self, *args, **kwargs): @abstractmethod def transform(self, input_stream: IO[bytes]) -> T: """ - Transform the data from input_stream into something that implements IO[bytes]. + Transforms the data from input_stream into an implementation of IO[bytes]. + This allows you to return your own object while still conforming to a protocol that allows transformations to be nested. 
""" diff --git a/aws_lambda_powertools/utilities/streaming/transformations/json.py b/aws_lambda_powertools/utilities/streaming/transformations/json.py deleted file mode 100644 index bd2aa2dcd24..00000000000 --- a/aws_lambda_powertools/utilities/streaming/transformations/json.py +++ /dev/null @@ -1,39 +0,0 @@ -import io -import json -from json import JSONDecodeError -from typing import IO, Optional - -from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform - - -class JsonDeserializer(io.RawIOBase): - def __init__(self, input_stream: IO[bytes]): - self.input = input_stream - - def read(self, size: int = -1) -> Optional[bytes]: - return self.input.read(size) - - def readline(self, size: Optional[int] = None) -> bytes: - size = -1 if size is None else size - return self.input.readline(size) - - def read_object(self) -> dict: - obj: dict = {} - - while not self.input.closed: - line = self.input.__next__() - try: - obj = json.loads(line) - except JSONDecodeError: - continue - break - - return obj - - def __next__(self): - return self.read_object() - - -class JsonTransform(BaseTransform): - def transform(self, input_stream: IO[bytes]) -> JsonDeserializer: - return JsonDeserializer(input_stream=input_stream) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index e5ffdc1efae..8624a00ebe7 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -9,13 +9,13 @@ The streaming utility handles streaming data from AWS for processing data sets b * Simple interface to stream data from S3, even when the data is larger than memory * Read your S3 file using the patterns you already know to deal with files in Python -* Includes common transformations to data stored in S3, like Gzip and Json deserialization +* Includes common transformations to data stored in S3, like Gzip and CSV deserialization * Build your own data transformation and add it to the pipeline ## Background Processing S3 files inside your Lambda function presents challenges when the file is bigger than the allocated -amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, JSON strings, etc). +amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, CSV, zip files, etc). This utility makes it easy to process data coming from S3 files, while applying data transformations transparently to the data stream. @@ -87,14 +87,20 @@ For instance, if you want to unzip an S3 file compressed using `LZMA` you could --8<-- "examples/streaming/src/s3_transform_lzma.py" ``` +Or, if you want to load a `TSV` file, you can just change the delimiter on the `CSV` transform: + +```python hl_lines="12" +--8<-- "examples/streaming/src/s3_transform_tsv.py" +``` + ### Building your own data transformation You can build your own custom data transformation by extending the `BaseTransform` class. -The `transform` method receives an `io.RawIOBase` object, and you are responsible for returning an object that is also -a `io.RawIOBase`. +The `transform` method receives an `IO[bytes]` object, and you are responsible for returning an object that is also +a `IO[bytes]`. 
```python hl_lines="9 37 38" ---8<-- "aws_lambda_powertools/utilities/streaming/transformations/json.py" +--8<-- "examples/streaming/src/s3_json_transform.py" ``` ## Testing your code diff --git a/examples/streaming/src/s3_json_transform.py b/examples/streaming/src/s3_json_transform.py new file mode 100644 index 00000000000..30c31b0f32c --- /dev/null +++ b/examples/streaming/src/s3_json_transform.py @@ -0,0 +1,29 @@ +import io +from typing import IO, Optional + +import ijson + +from aws_lambda_powertools.utilities.streaming.transformations import BaseTransform + + +# Using io.RawIOBase gets us default implementations of many of the common IO methods +class JsonDeserializer(io.RawIOBase): + def __init__(self, input_stream: IO[bytes]): + self.input = ijson.items(input_stream, "", multiple_values=True) + + def read(self, size: int = -1) -> Optional[bytes]: + raise NotImplementedError(f"{__name__} does not implement read") + + def readline(self, size: Optional[int] = None) -> bytes: + raise NotImplementedError(f"{__name__} does not implement readline") + + def read_object(self) -> dict: + return self.input.__next__() + + def __next__(self): + return self.read_object() + + +class JsonTransform(BaseTransform): + def transform(self, input_stream: IO[bytes]) -> JsonDeserializer: + return JsonDeserializer(input_stream=input_stream) diff --git a/examples/streaming/src/s3_transform_tsv.py b/examples/streaming/src/s3_transform_tsv.py new file mode 100644 index 00000000000..d76f751652a --- /dev/null +++ b/examples/streaming/src/s3_transform_tsv.py @@ -0,0 +1,13 @@ +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform +from aws_lambda_powertools.utilities.typing import LambdaContext + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"]) + + tsv_stream = s3.transform(CsvTransform(delimiter="\t")) + for obj in tsv_stream: + print(obj) From e29d46ddbac7dc6151679460502c49a059b16e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Thu, 17 Nov 2022 10:56:53 +0100 Subject: [PATCH 08/43] fix: remove json transformation from import --- .../utilities/streaming/transformations/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/__init__.py b/aws_lambda_powertools/utilities/streaming/transformations/__init__.py index 044dadfe877..04e4a076a73 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/__init__.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/__init__.py @@ -1,7 +1,6 @@ from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform from aws_lambda_powertools.utilities.streaming.transformations.csv import CsvTransform from aws_lambda_powertools.utilities.streaming.transformations.gzip import GzipTransform -from aws_lambda_powertools.utilities.streaming.transformations.json import JsonTransform from aws_lambda_powertools.utilities.streaming.transformations.zip import ZipTransform -__all__ = ["BaseTransform", "GzipTransform", "JsonTransform", "ZipTransform", "CsvTransform"] +__all__ = ["BaseTransform", "GzipTransform", "ZipTransform", "CsvTransform"] From be99144b9a14eefb24d7fa239ebc2ab03a5a2e28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Thu, 17 Nov 2022 11:18:56 +0100 Subject: [PATCH 09/43] chore: add ijson as a 
dev dependency --- mypy.ini | 3 + poetry.lock | 360 +++++++++++++++++++++++++++++++++++-------------- pyproject.toml | 1 + 3 files changed, 264 insertions(+), 100 deletions(-) diff --git a/mypy.ini b/mypy.ini index 4da15d3898a..03545a2cc9f 100644 --- a/mypy.ini +++ b/mypy.ini @@ -46,3 +46,6 @@ ignore_missing_imports = True [mypy-snappy] ignore_missing_imports = True + +[mypy-ijson] +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 88343d8bbf3..7246cf056d6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -14,7 +14,7 @@ tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy [[package]] name = "aws-cdk-asset-awscli-v1" -version = "2.2.15" +version = "2.2.17" description = "A library that contains the AWS CLI for use in Lambda Layers" category = "dev" optional = false @@ -40,7 +40,7 @@ typeguard = ">=2.13.3,<2.14.0" [[package]] name = "aws-cdk-asset-node-proxy-agent-v5" -version = "2.0.21" +version = "2.0.23" description = "@aws-cdk/asset-node-proxy-agent-v5" category = "dev" optional = false @@ -53,32 +53,32 @@ typeguard = ">=2.13.3,<2.14.0" [[package]] name = "aws-cdk-aws-apigatewayv2-alpha" -version = "2.47.0a0" +version = "2.51.1a0" description = "The CDK Construct Library for AWS::APIGatewayv2" category = "dev" optional = false python-versions = "~=3.7" [package.dependencies] -aws-cdk-lib = ">=2.47.0,<3.0.0" +aws-cdk-lib = ">=2.51.1,<3.0.0" constructs = ">=10.0.0,<11.0.0" -jsii = ">=1.69.0,<2.0.0" +jsii = ">=1.71.0,<2.0.0" publication = ">=0.0.3" typeguard = ">=2.13.3,<2.14.0" [[package]] name = "aws-cdk-aws-apigatewayv2-integrations-alpha" -version = "2.47.0a0" +version = "2.51.1a0" description = "Integrations for AWS APIGateway V2" category = "dev" optional = false python-versions = "~=3.7" [package.dependencies] -"aws-cdk.aws-apigatewayv2-alpha" = "2.47.0.a0" -aws-cdk-lib = ">=2.47.0,<3.0.0" +"aws-cdk.aws-apigatewayv2-alpha" = "2.51.1.a0" +aws-cdk-lib = ">=2.51.1,<3.0.0" constructs = ">=10.0.0,<11.0.0" -jsii = ">=1.69.0,<2.0.0" +jsii = ">=1.71.0,<2.0.0" publication = ">=0.0.3" typeguard = ">=2.13.3,<2.14.0" @@ -101,7 +101,7 @@ typeguard = ">=2.13.3,<2.14.0" [[package]] name = "aws-sam-translator" -version = "1.53.0" +version = "1.54.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" category = "dev" optional = false @@ -112,7 +112,7 @@ boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" [package.extras] -dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.0.0)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "mypy (==0.971)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.9.0,<2.10.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml (>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)", "types-PyYAML (>=5.4,<6.0)", "types-jsonschema (>=3.2,<4.0)"] +dev = ["black (==20.8b1)", "boto3 (>=1.23,<2)", "boto3-stubs[appconfig,serverlessrepo] (>=1.19.5,<2.0.0)", "click (>=7.1,<8.0)", "coverage (>=5.3,<6.0)", "dateparser (>=0.7,<1.0)", "docopt (>=0.6.2,<0.7.0)", "flake8 (>=3.8.4,<3.9.0)", "mypy (==0.971)", "parameterized (>=0.7.4,<0.8.0)", "pylint (>=2.15.0,<2.16.0)", "pytest (>=6.2.5,<6.3.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-rerunfailures (>=9.1.1,<9.2.0)", "pytest-xdist (>=2.5,<3.0)", "pyyaml 
(>=5.4,<6.0)", "requests (>=2.24.0,<2.25.0)", "ruamel.yaml (==0.17.21)", "tenacity (>=7.0.0,<7.1.0)", "tox (>=3.24,<4.0)", "types-PyYAML (>=5.4,<6.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" @@ -158,6 +158,9 @@ click = ">=8.0.0" mypy-extensions = ">=0.4.3" pathspec = ">=0.9.0" platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} +typed-ast = {version = ">=1.4.2", markers = "python_version < \"3.8\" and implementation_name == \"cpython\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -167,14 +170,14 @@ uvloop = ["uvloop (>=0.15.2)"] [[package]] name = "boto3" -version = "1.25.0" +version = "1.26.16" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.28.0,<1.29.0" +botocore = ">=1.29.16,<1.30.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -183,7 +186,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.28.0" +version = "1.29.16" description = "Low-level, data-driven core of boto 3." category = "main" optional = false @@ -207,6 +210,8 @@ python-versions = ">=3.7" [package.dependencies] attrs = ">=20" +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +typing_extensions = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "certifi" @@ -263,6 +268,7 @@ python-versions = ">=3.7" [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "colorama" @@ -274,14 +280,14 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7 [[package]] name = "constructs" -version = "10.1.139" +version = "10.1.169" description = "A programming model for software-defined state" category = "dev" optional = false python-versions = "~=3.7" [package.dependencies] -jsii = ">=1.70.0,<2.0.0" +jsii = ">=1.71.0,<2.0.0" publication = ">=0.0.3" typeguard = ">=2.13.3,<2.14.0" @@ -293,6 +299,9 @@ category = "dev" optional = false python-versions = ">=3.7" +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + [package.extras] toml = ["tomli"] @@ -312,6 +321,17 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "exceptiongroup" +version = "1.0.4" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "execnet" version = "1.9.0" @@ -355,13 +375,14 @@ optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} mccabe = ">=0.6.0,<0.7.0" pycodestyle = ">=2.7.0,<2.8.0" pyflakes = ">=2.3.0,<2.4.0" [[package]] name = "flake8-black" -version = "0.3.3" +version = "0.3.5" description = "flake8 plugin to call black as a code style validator" category = "dev" optional = false @@ -369,9 +390,12 @@ python-versions = ">=3.7" [package.dependencies] black = ">=22.1.0" -flake8 = ">=3.0.0" +flake8 = ">=3" tomli = "*" +[package.extras] +develop = ["build", "twine"] + [[package]] name = "flake8-bugbear" version = "22.10.27" @@ -411,6 +435,7 @@ python-versions = ">=3.7" 
[package.dependencies] flake8 = ">=3.0,<3.2.0 || >3.2.0" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "flake8-debugger" @@ -436,6 +461,7 @@ python-versions = ">=3.7,<4.0" attrs = "*" eradicate = ">=2.0,<3.0" flake8 = ">=3.5,<6" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} [[package]] name = "flake8-fixme" @@ -477,11 +503,11 @@ dev = ["flake8", "markdown", "twine", "wheel"] [[package]] name = "gitdb" -version = "4.0.9" +version = "4.0.10" description = "Git Object Database" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] smmap = ">=3.0.1,<6" @@ -496,6 +522,7 @@ python-versions = ">=3.7" [package.dependencies] gitdb = ">=4.0.1,<5" +typing-extensions = {version = ">=3.7.4.3", markers = "python_version < \"3.8\""} [[package]] name = "idna" @@ -505,6 +532,14 @@ category = "dev" optional = false python-versions = ">=3.5" +[[package]] +name = "ijson" +version = "3.1.4" +description = "Iterative JSON parser with standard Python iterator interfaces" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "importlib-metadata" version = "4.13.0" @@ -514,6 +549,7 @@ optional = false python-versions = ">=3.7" [package.dependencies] +typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} zipp = ">=0.5" [package.extras] @@ -613,6 +649,9 @@ category = "dev" optional = false python-versions = ">=2.7" +[package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} + [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] @@ -636,6 +675,7 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} pyrsistent = ">=0.14.0" setuptools = "*" six = ">=1.11.0" @@ -657,13 +697,14 @@ six = "*" [[package]] name = "mako" -version = "1.2.3" +version = "1.2.4" description = "A super-fast templating language that borrows the best ideas from the existing templating languages." category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] +importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} MarkupSafe = ">=0.9.2" [package.extras] @@ -693,6 +734,9 @@ category = "dev" optional = false python-versions = ">=3.6" +[package.dependencies] +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} + [package.extras] testing = ["coverage", "pyyaml"] @@ -740,7 +784,7 @@ test = ["coverage", "flake8 (>=3.0)", "shtab"] [[package]] name = "mkdocs" -version = "1.4.1" +version = "1.4.2" description = "Project documentation with Markdown." 
category = "dev" optional = false @@ -750,12 +794,14 @@ python-versions = ">=3.7" click = ">=7.0" colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} ghp-import = ">=1.0" +importlib-metadata = {version = ">=4.3", markers = "python_version < \"3.10\""} jinja2 = ">=2.11.1" markdown = ">=3.2.1,<3.4" mergedeep = ">=1.3.4" packaging = ">=20.5" pyyaml = ">=5.1" pyyaml-env-tag = ">=0.1" +typing-extensions = {version = ">=3.10", markers = "python_version < \"3.8\""} watchdog = ">=2.0" [package.extras] @@ -794,7 +840,7 @@ requests = ">=2.26" [[package]] name = "mkdocs-material-extensions" -version = "1.1" +version = "1.1.1" description = "Extension pack for Python Markdown and MkDocs Material." category = "dev" optional = false @@ -810,6 +856,8 @@ python-versions = ">=3.7" [package.dependencies] mypy-extensions = ">=0.4.3" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typed-ast = {version = ">=1.4.0,<2", markers = "python_version < \"3.8\""} typing-extensions = ">=3.10" [package.extras] @@ -974,7 +1022,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pathspec" -version = "0.10.1" +version = "0.10.2" description = "Utility library for gitignore style pattern matching of file paths." category = "dev" optional = false @@ -1002,15 +1050,15 @@ markdown = ">=3.0" [[package]] name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +version = "2.5.4" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "dev" optional = false python-versions = ">=3.7" [package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] +docs = ["furo (>=2022.9.29)", "proselint (>=0.13)", "sphinx (>=5.3)", "sphinx-autodoc-typehints (>=1.19.4)"] +test = ["appdirs (==1.4.4)", "pytest (>=7.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] [[package]] name = "pluggy" @@ -1020,6 +1068,9 @@ category = "dev" optional = false python-versions = ">=3.6" +[package.dependencies] +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} + [package.extras] dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] @@ -1042,8 +1093,8 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "py-cpuinfo" -version = "8.0.0" -description = "Get CPU info with pure Python 2 & 3" +version = "9.0.0" +description = "Get CPU info with pure Python" category = "dev" optional = false python-versions = "*" @@ -1092,7 +1143,7 @@ plugins = ["importlib-metadata"] [[package]] name = "pymdown-extensions" -version = "9.7" +version = "9.9" description = "Extension pack for Python Markdown." 
category = "dev" optional = false @@ -1114,7 +1165,7 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pyrsistent" -version = "0.18.1" +version = "0.19.2" description = "Persistent/Functional/Immutable data structures" category = "dev" optional = false @@ -1131,9 +1182,12 @@ python-versions = ">=3.7" [package.dependencies] attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] @@ -1148,6 +1202,7 @@ python-versions = ">=3.7" [package.dependencies] pytest = ">=6.1.0" +typing-extensions = {version = ">=3.7.2", markers = "python_version < \"3.8\""} [package.extras] testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"] @@ -1324,7 +1379,7 @@ pbr = "*" [[package]] name = "setuptools" -version = "65.5.0" +version = "65.6.3" description = "Easily download, build, install, upgrade, and uninstall Python packages" category = "dev" optional = false @@ -1332,7 +1387,7 @@ python-versions = ">=3.7" [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -1360,6 +1415,7 @@ optional = false python-versions = ">=3.6" [package.dependencies] +importlib-metadata = {version = ">=1.7.0", markers = "python_version < \"3.8\""} pbr = ">=2.0.0,<2.1.0 || >2.1.0" [[package]] @@ -1370,6 +1426,14 @@ category = "dev" optional = false python-versions = ">=3.7" +[[package]] +name = "typed-ast" +version = "1.5.4" +description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "typeguard" version = "2.13.3" @@ -1395,7 +1459,7 @@ types-urllib3 = "<1.27" [[package]] name = "types-urllib3" -version = "1.26.25.1" +version = "1.26.25.4" description = "Typing stubs for urllib3" 
category = "dev" optional = false @@ -1411,11 +1475,11 @@ python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.12" +version = "1.26.13" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] @@ -1486,8 +1550,8 @@ validation = ["fastjsonschema"] [metadata] lock-version = "1.1" -python-versions = "~3.11" -content-hash = "c70c8de71364d2e49b538fcc5fba03cf8ff8b956ee2760cf9406ce5ddc4e9689" +python-versions = "^3.7.4" +content-hash = "502dbe48f1ed8a7e4d4c755adf02f0641eef20ab6eecffe19a50cafcca0f237d" [metadata.files] attrs = [ @@ -1495,33 +1559,33 @@ attrs = [ {file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"}, ] aws-cdk-asset-awscli-v1 = [ - {file = "aws-cdk.asset-awscli-v1-2.2.15.tar.gz", hash = "sha256:91b618cd8e1f9698a71d08b92e1cd23ac3f822140745c7c7d86e7f4df91ea90a"}, - {file = "aws_cdk.asset_awscli_v1-2.2.15-py3-none-any.whl", hash = "sha256:5e612f7583f8635f7bca16319a5441fe9c568b30605ed86ff7c18664b6a522c7"}, + {file = "aws-cdk.asset-awscli-v1-2.2.17.tar.gz", hash = "sha256:6d67c37d2989e1dfb3c376fdb773121af3879e363838991b47a5778f4ff1f114"}, + {file = "aws_cdk.asset_awscli_v1-2.2.17-py3-none-any.whl", hash = "sha256:8e71c77817d1c2315af0b17756384a7542737d621b2046a686eb310be5c17cc9"}, ] aws-cdk-asset-kubectl-v20 = [ {file = "aws-cdk.asset-kubectl-v20-2.1.1.tar.gz", hash = "sha256:9834cdb150c5590aea4e5eba6de2a89b4c60617451181c524810c5a75154565c"}, {file = "aws_cdk.asset_kubectl_v20-2.1.1-py3-none-any.whl", hash = "sha256:a2fad1a5a35a94a465efe60859f91e45dacc33261fb9bbf1cf9bbc6e2f70e9d6"}, ] aws-cdk-asset-node-proxy-agent-v5 = [ - {file = "aws-cdk.asset-node-proxy-agent-v5-2.0.21.tar.gz", hash = "sha256:1874b4861a286edd0d56514f1b1f287deded99a5dd645cb9899f853bbf4b93e0"}, - {file = "aws_cdk.asset_node_proxy_agent_v5-2.0.21-py3-none-any.whl", hash = "sha256:2101bc5da21239a42a4e8f052b4e32c5223e704f71b0de133f12d1fcebfdcd20"}, + {file = "aws-cdk.asset-node-proxy-agent-v5-2.0.23.tar.gz", hash = "sha256:98728eba7747af889b803462c26f46a831fd24abba0bb04b267aea5a4a9af738"}, + {file = "aws_cdk.asset_node_proxy_agent_v5-2.0.23-py3-none-any.whl", hash = "sha256:36b7cf42b0b940681c6e1c651f09332fbd779906985dc22828e7334efdff9216"}, ] aws-cdk-aws-apigatewayv2-alpha = [ - {file = "aws-cdk.aws-apigatewayv2-alpha-2.47.0a0.tar.gz", hash = "sha256:b1e32e046bd5ae224c8a962850215ac98e9639c453337fafeffb7a5618d66063"}, - {file = "aws_cdk.aws_apigatewayv2_alpha-2.47.0a0-py3-none-any.whl", hash = "sha256:f2d41b944d7781b9565135e832a416c54e2c1e52f31fefdc7b8b323142814033"}, + {file = "aws-cdk.aws-apigatewayv2-alpha-2.51.1a0.tar.gz", hash = "sha256:d8c1a914bb2b08c0e8999a3cc30e8b8c2bebeb580b253f32b025ee567f02fe3d"}, + {file = "aws_cdk.aws_apigatewayv2_alpha-2.51.1a0-py3-none-any.whl", hash = "sha256:23e14a68769e7fdf4f3c7f9bc56293a14d48b580074a9f327d432aa2ee81362c"}, ] aws-cdk-aws-apigatewayv2-integrations-alpha = [ - {file = "aws-cdk.aws-apigatewayv2-integrations-alpha-2.47.0a0.tar.gz", hash = "sha256:c86f0291c8cac2b8db0391bf52f626e541a82a3c7cbb4a03cd8d5014882862cc"}, - {file = "aws_cdk.aws_apigatewayv2_integrations_alpha-2.47.0a0-py3-none-any.whl", hash = "sha256:4633c4e020400c7ee5790652f099f02d4f84bab24fff013250e2b41a4ffca1b0"}, + 
{file = "aws-cdk.aws-apigatewayv2-integrations-alpha-2.51.1a0.tar.gz", hash = "sha256:ab7e5d4c7a0037f82115b51609aae283617f3692db41e4b3cc66051ef9f4cd3b"}, + {file = "aws_cdk.aws_apigatewayv2_integrations_alpha-2.51.1a0-py3-none-any.whl", hash = "sha256:ee6c7bb37afef99d3b56a8800f341fe521537406c1e69e46a1aaee9edbd6f6a8"}, ] aws-cdk-lib = [ {file = "aws-cdk-lib-2.51.1.tar.gz", hash = "sha256:35b66c2ed34490470d1917ee61011fbe053ad22cd1521ee34128f2db78bd2a8a"}, {file = "aws_cdk_lib-2.51.1-py3-none-any.whl", hash = "sha256:721f5477bad042162f5257fd70fae62a1c0fe499b6cbb71c1f27ea17ba988e96"}, ] aws-sam-translator = [ - {file = "aws-sam-translator-1.53.0.tar.gz", hash = "sha256:392ed4f5fb08f72cb68a8800f0bc278d2a3b6609bd1ac66bfcdeaaa94cdc18e5"}, - {file = "aws_sam_translator-1.53.0-py2-none-any.whl", hash = "sha256:85252646cf123642d08442137b60445e69e30bfd2f8b663b1202b20ab3782b10"}, - {file = "aws_sam_translator-1.53.0-py3-none-any.whl", hash = "sha256:84d780ad82f1a176e2f5d4c397749d1e71214cc97ee7cccd50f823fd7c7e7cdf"}, + {file = "aws-sam-translator-1.54.0.tar.gz", hash = "sha256:a3ae79f1f2d430f5ade4d245165d5612414233f540b471d170f1aab95c3713a6"}, + {file = "aws_sam_translator-1.54.0-py2-none-any.whl", hash = "sha256:1bb4abb197e6de3f935425e65f67d14f47eb620d984e9de963b666cc9deb66e4"}, + {file = "aws_sam_translator-1.54.0-py3-none-any.whl", hash = "sha256:10d6771ebbe9107a0ddb756ccffd68ba81d885ef2eace80358a098566e6abaf1"}, ] aws-xray-sdk = [ {file = "aws-xray-sdk-2.11.0.tar.gz", hash = "sha256:78835fc841f03e550858f18a9973eab8618f47f22d2f59edf130578fa545a867"}, @@ -1555,12 +1619,12 @@ black = [ {file = "black-22.10.0.tar.gz", hash = "sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1"}, ] boto3 = [ - {file = "boto3-1.25.0-py3-none-any.whl", hash = "sha256:81139cc9da154a1672c7dd92da1678cae0ea1601a3e1f0394c6cd010eab1acb6"}, - {file = "boto3-1.25.0.tar.gz", hash = "sha256:170eab4a87592741933b6f8a02c3a6a8664162ef33bb12a2c2b4d431490d9ac2"}, + {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"}, + {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"}, ] botocore = [ - {file = "botocore-1.28.0-py3-none-any.whl", hash = "sha256:5bc426647da9f7739b73b1ffb5fce37fb3691c3c66dd772bf541dc19f8da2f43"}, - {file = "botocore-1.28.0.tar.gz", hash = "sha256:75a4082543e2c1b005ccde90af87d0969003db06c3fcbe8a7854ddaa8d68fafb"}, + {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"}, + {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"}, ] cattrs = [ {file = "cattrs-22.2.0-py3-none-any.whl", hash = "sha256:bc12b1f0d000b9f9bee83335887d532a1d3e99a833d1bf0882151c97d3e68c21"}, @@ -1591,8 +1655,8 @@ colorama = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] constructs = [ - {file = "constructs-10.1.139-py3-none-any.whl", hash = "sha256:e809549fc4f1cad8ee57eb5a8a31d163f6321374c024a91e04bbb1f5520d53c7"}, - {file = "constructs-10.1.139.tar.gz", hash = "sha256:df319296e2efe699662323dc2b1dcb154439457aaaa036b8c7409975bfc5b43a"}, + {file = "constructs-10.1.169-py3-none-any.whl", hash = "sha256:e36cc48c564e9432f76c2e533c3dcdd052167ba34545daab5084a2ce2163f123"}, + {file = "constructs-10.1.169.tar.gz", hash = "sha256:ff4e69bf78affb5a797b0653bcd2b007e375b54d0e5c20f163e37be8fec565d5"}, ] 
coverage = [ {file = "coverage-6.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef8674b0ee8cc11e2d574e3e2998aea5df5ab242e012286824ea3c6970580e53"}, @@ -1654,6 +1718,10 @@ eradicate = [ {file = "eradicate-2.1.0-py3-none-any.whl", hash = "sha256:8bfaca181db9227dc88bdbce4d051a9627604c2243e7d85324f6d6ce0fd08bb2"}, {file = "eradicate-2.1.0.tar.gz", hash = "sha256:aac7384ab25b1bf21c4c012de9b4bf8398945a14c98c911545b2ea50ab558014"}, ] +exceptiongroup = [ + {file = "exceptiongroup-1.0.4-py3-none-any.whl", hash = "sha256:542adf9dea4055530d6e1279602fa5cb11dab2395fa650b8674eaec35fc4a828"}, + {file = "exceptiongroup-1.0.4.tar.gz", hash = "sha256:bd14967b79cd9bdb54d97323216f8fdf533e278df937aa2a90089e7d6e06e5ec"}, +] execnet = [ {file = "execnet-1.9.0-py2.py3-none-any.whl", hash = "sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"}, {file = "execnet-1.9.0.tar.gz", hash = "sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5"}, @@ -1671,8 +1739,8 @@ flake8 = [ {file = "flake8-3.9.2.tar.gz", hash = "sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b"}, ] flake8-black = [ - {file = "flake8-black-0.3.3.tar.gz", hash = "sha256:8211f5e20e954cb57c709acccf2f3281ce27016d4c4b989c3e51f878bb7ce12a"}, - {file = "flake8_black-0.3.3-py3-none-any.whl", hash = "sha256:7d667d0059fd1aa468de1669d77cc934b7f1feeac258d57bdae69a8e73c4cd90"}, + {file = "flake8-black-0.3.5.tar.gz", hash = "sha256:9e93252b1314a8eb3c2f55dec54a07239e502b12f57567f2c105f2202714b15e"}, + {file = "flake8_black-0.3.5-py3-none-any.whl", hash = "sha256:4948a579fdddd98fbf935fd94255dfcfce560c4ddc1ceee08e3f12d6114c8619"}, ] flake8-bugbear = [ {file = "flake8-bugbear-22.10.27.tar.gz", hash = "sha256:a6708608965c9e0de5fff13904fed82e0ba21ac929fe4896459226a797e11cd5"}, @@ -1710,8 +1778,8 @@ ghp-import = [ {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, ] gitdb = [ - {file = "gitdb-4.0.9-py3-none-any.whl", hash = "sha256:8033ad4e853066ba6ca92050b9df2f89301b8fc8bf7e9324d412a63f8bf1a8fd"}, - {file = "gitdb-4.0.9.tar.gz", hash = "sha256:bac2fd45c0a1c9cf619e63a90d62bdc63892ef92387424b855792a6cabe789aa"}, + {file = "gitdb-4.0.10-py3-none-any.whl", hash = "sha256:c286cf298426064079ed96a9e4a9d39e7f3e9bf15ba60701e95f5492f28415c7"}, + {file = "gitdb-4.0.10.tar.gz", hash = "sha256:6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a"}, ] gitpython = [ {file = "GitPython-3.1.29-py3-none-any.whl", hash = "sha256:41eea0deec2deea139b459ac03656f0dd28fc4a3387240ec1d3c259a2c47850f"}, @@ -1721,6 +1789,70 @@ idna = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] +ijson = [ + {file = "ijson-3.1.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:6c1a777096be5f75ffebb335c6d2ebc0e489b231496b7f2ca903aa061fe7d381"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:475fc25c3d2a86230b85777cae9580398b42eed422506bf0b6aacfa936f7bfcd"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:f587699b5a759e30accf733e37950cc06c4118b72e3e146edcea77dded467426"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:339b2b4c7bbd64849dd69ef94ee21e29dcd92c831f47a281fdd48122bb2a715a"}, + {file = "ijson-3.1.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = 
"sha256:446ef8980504da0af8d20d3cb6452c4dc3d8aa5fd788098985e899b913191fe6"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:3997a2fdb28bc04b9ab0555db5f3b33ed28d91e9d42a3bf2c1842d4990beb158"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:fa10a1d88473303ec97aae23169d77c5b92657b7fb189f9c584974c00a79f383"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:9a5bf5b9d8f2ceaca131ee21fc7875d0f34b95762f4f32e4d65109ca46472147"}, + {file = "ijson-3.1.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:81cc8cee590c8a70cca3c9aefae06dd7cb8e9f75f3a7dc12b340c2e332d33a2a"}, + {file = "ijson-3.1.4-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4ea5fc50ba158f72943d5174fbc29ebefe72a2adac051c814c87438dc475cf78"}, + {file = "ijson-3.1.4-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:3b98861a4280cf09d267986cefa46c3bd80af887eae02aba07488d80eb798afa"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:068c692efba9692406b86736dcc6803e4a0b6280d7f0b7534bff3faec677ff38"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:86884ac06ac69cea6d89ab7b84683b3b4159c4013e4a20276d3fc630fe9b7588"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:41e5886ff6fade26f10b87edad723d2db14dcbb1178717790993fcbbb8ccd333"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:24b58933bf777d03dc1caa3006112ec7f9e6f6db6ffe1f5f5bd233cb1281f719"}, + {file = "ijson-3.1.4-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:13f80aad0b84d100fb6a88ced24bade21dc6ddeaf2bba3294b58728463194f50"}, + {file = "ijson-3.1.4-cp35-cp35m-win32.whl", hash = "sha256:fa9a25d0bd32f9515e18a3611690f1de12cb7d1320bd93e9da835936b41ad3ff"}, + {file = "ijson-3.1.4-cp35-cp35m-win_amd64.whl", hash = "sha256:c4c1bf98aaab4c8f60d238edf9bcd07c896cfcc51c2ca84d03da22aad88957c5"}, + {file = "ijson-3.1.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f0f2a87c423e8767368aa055310024fa28727f4454463714fef22230c9717f64"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:15507de59d74d21501b2a076d9c49abf927eb58a51a01b8f28a0a0565db0a99f"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:2e6bd6ad95ab40c858592b905e2bbb4fe79bbff415b69a4923dafe841ffadcb4"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:68e295bb12610d086990cedc89fb8b59b7c85740d66e9515aed062649605d0bf"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:3bb461352c0f0f2ec460a4b19400a665b8a5a3a2da663a32093df1699642ee3f"}, + {file = "ijson-3.1.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:f91c75edd6cf1a66f02425bafc59a22ec29bc0adcbc06f4bfd694d92f424ceb3"}, + {file = "ijson-3.1.4-cp36-cp36m-win32.whl", hash = "sha256:4c53cc72f79a4c32d5fc22efb85aa22f248e8f4f992707a84bdc896cc0b1ecf9"}, + {file = "ijson-3.1.4-cp36-cp36m-win_amd64.whl", hash = "sha256:ac9098470c1ff6e5c23ec0946818bc102bfeeeea474554c8d081dc934be20988"}, + {file = "ijson-3.1.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dcd6f04df44b1945b859318010234651317db2c4232f75e3933f8bb41c4fa055"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:5a2f40c053c837591636dc1afb79d85e90b9a9d65f3d9963aae31d1eb11bfed2"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:f50337e3b8e72ec68441b573c2848f108a8976a57465c859b227ebd2a2342901"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2010_i686.whl", hash = 
"sha256:454918f908abbed3c50a0a05c14b20658ab711b155e4f890900e6f60746dd7cc"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:387c2ec434cc1bc7dc9bd33ec0b70d95d443cc1e5934005f26addc2284a437ab"}, + {file = "ijson-3.1.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:179ed6fd42e121d252b43a18833df2de08378fac7bce380974ef6f5e522afefa"}, + {file = "ijson-3.1.4-cp37-cp37m-win32.whl", hash = "sha256:26a6a550b270df04e3f442e2bf0870c9362db4912f0e7bdfd300f30ea43115a2"}, + {file = "ijson-3.1.4-cp37-cp37m-win_amd64.whl", hash = "sha256:ff8cf7507d9d8939264068c2cff0a23f99703fa2f31eb3cb45a9a52798843586"}, + {file = "ijson-3.1.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:09c9d7913c88a6059cd054ff854958f34d757402b639cf212ffbec201a705a0d"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:702ba9a732116d659a5e950ee176be6a2e075998ef1bcde11cbf79a77ed0f717"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:667841591521158770adc90793c2bdbb47c94fe28888cb802104b8bbd61f3d51"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:df641dd07b38c63eecd4f454db7b27aa5201193df160f06b48111ba97ab62504"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:9348e7d507eb40b52b12eecff3d50934fcc3d2a15a2f54ec1127a36063b9ba8f"}, + {file = "ijson-3.1.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:93455902fdc33ba9485c7fae63ac95d96e0ab8942224a357113174bbeaff92e9"}, + {file = "ijson-3.1.4-cp38-cp38-win32.whl", hash = "sha256:5b725f2e984ce70d464b195f206fa44bebbd744da24139b61fec72de77c03a16"}, + {file = "ijson-3.1.4-cp38-cp38-win_amd64.whl", hash = "sha256:a5965c315fbb2dc9769dfdf046eb07daf48ae20b637da95ec8d62b629be09df4"}, + {file = "ijson-3.1.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b8ee7dbb07cec9ba29d60cfe4954b3cc70adb5f85bba1f72225364b59c1cf82b"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d9e01c55d501e9c3d686b6ee3af351c9c0c8c3e45c5576bd5601bee3e1300b09"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:297f26f27a04cd0d0a2f865d154090c48ea11b239cabe0a17a6c65f0314bd1ca"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:9239973100338a4138d09d7a4602bd289861e553d597cd67390c33bfc452253e"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:2a64c66a08f56ed45a805691c2fd2e1caef00edd6ccf4c4e5eff02cd94ad8364"}, + {file = "ijson-3.1.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d17fd199f0d0a4ab6e0d541b4eec1b68b5bd5bb5d8104521e22243015b51049b"}, + {file = "ijson-3.1.4-cp39-cp39-win32.whl", hash = "sha256:70ee3c8fa0eba18c80c5911639c01a8de4089a4361bad2862a9949e25ec9b1c8"}, + {file = "ijson-3.1.4-cp39-cp39-win_amd64.whl", hash = "sha256:6bf2b64304321705d03fa5e403ec3f36fa5bb27bf661849ad62e0a3a49bc23e3"}, + {file = "ijson-3.1.4-pp27-pypy_73-macosx_10_9_x86_64.whl", hash = "sha256:5d7e3fcc3b6de76a9dba1e9fc6ca23dad18f0fa6b4e6499415e16b684b2e9af1"}, + {file = "ijson-3.1.4-pp27-pypy_73-manylinux1_x86_64.whl", hash = "sha256:a72eb0359ebff94754f7a2f00a6efe4c57716f860fc040c606dedcb40f49f233"}, + {file = "ijson-3.1.4-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:28fc168f5faf5759fdfa2a63f85f1f7a148bbae98f34404a6ba19f3d08e89e87"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2844d4a38d27583897ed73f7946e205b16926b4cab2525d1ce17e8b08064c706"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-manylinux1_x86_64.whl", hash = 
"sha256:252defd1f139b5fb8c764d78d5e3a6df81543d9878c58992a89b261369ea97a7"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:15d5356b4d090c699f382c8eb6a2bcd5992a8c8e8b88c88bc6e54f686018328a"}, + {file = "ijson-3.1.4-pp36-pypy36_pp73-win32.whl", hash = "sha256:6774ec0a39647eea70d35fb76accabe3d71002a8701c0545b9120230c182b75b"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f11da15ec04cc83ff0f817a65a3392e169be8d111ba81f24d6e09236597bb28c"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-manylinux1_x86_64.whl", hash = "sha256:ee13ceeed9b6cf81b3b8197ef15595fc43fd54276842ed63840ddd49db0603da"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-manylinux2010_x86_64.whl", hash = "sha256:97e4df67235fae40d6195711223520d2c5bf1f7f5087c2963fcde44d72ebf448"}, + {file = "ijson-3.1.4-pp37-pypy37_pp73-win32.whl", hash = "sha256:3d10eee52428f43f7da28763bb79f3d90bbbeea1accb15de01e40a00885b6e89"}, + {file = "ijson-3.1.4.tar.gz", hash = "sha256:1d1003ae3c6115ec9b587d29dd136860a81a23c7626b682e2b5b12c9fd30e4ea"}, +] importlib-metadata = [ {file = "importlib_metadata-4.13.0-py3-none-any.whl", hash = "sha256:8a8a81bcf996e74fee46f0d16bd3eaa382a7eb20fd82445c3ad11f4090334116"}, {file = "importlib_metadata-4.13.0.tar.gz", hash = "sha256:dd0173e8f150d6815e098fd354f6414b0f079af4644ddfe90c71e2fc6174346d"}, @@ -1769,8 +1901,8 @@ junit-xml = [ {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] mako = [ - {file = "Mako-1.2.3-py3-none-any.whl", hash = "sha256:c413a086e38cd885088d5e165305ee8eed04e8b3f8f62df343480da0a385735f"}, - {file = "Mako-1.2.3.tar.gz", hash = "sha256:7fde96466fcfeedb0eed94f187f20b23d85e4cb41444be0e542e2c8c65c396cd"}, + {file = "Mako-1.2.4-py3-none-any.whl", hash = "sha256:c97c79c018b9165ac9922ae4f32da095ffd3c4e6872b45eded42926deea46818"}, + {file = "Mako-1.2.4.tar.gz", hash = "sha256:d60a3903dc3bb01a18ad6a89cdbe2e4eadc69c0bc8ef1e3773ba53d44c3f7a34"}, ] mando = [ {file = "mando-0.6.4-py2.py3-none-any.whl", hash = "sha256:4ce09faec7e5192ffc3c57830e26acba0fd6cd11e1ee81af0d4df0657463bd1c"}, @@ -1835,8 +1967,8 @@ mike = [ {file = "mike-1.1.2.tar.gz", hash = "sha256:56c3f1794c2d0b5fdccfa9b9487beb013ca813de2e3ad0744724e9d34d40b77b"}, ] mkdocs = [ - {file = "mkdocs-1.4.1-py3-none-any.whl", hash = "sha256:2b7845c2775396214cd408753e4cfb01af3cfed36acc141a84bce2ceec9d705d"}, - {file = "mkdocs-1.4.1.tar.gz", hash = "sha256:07ed90be4062e4ef732bbac2623097b9dca35c67b562c38cfd0bfbc7151758c1"}, + {file = "mkdocs-1.4.2-py3-none-any.whl", hash = "sha256:c8856a832c1e56702577023cd64cc5f84948280c1c0fcc6af4cd39006ea6aa8c"}, + {file = "mkdocs-1.4.2.tar.gz", hash = "sha256:8947af423a6d0facf41ea1195b8e1e8c85ad94ac95ae307fe11232e0424b11c5"}, ] mkdocs-git-revision-date-plugin = [ {file = "mkdocs_git_revision_date_plugin-0.3.2-py3-none-any.whl", hash = "sha256:2e67956cb01823dd2418e2833f3623dee8604cdf223bddd005fe36226a56f6ef"}, @@ -1846,8 +1978,8 @@ mkdocs-material = [ {file = "mkdocs_material-8.5.10.tar.gz", hash = "sha256:7623608f746c6d9ff68a8ef01f13eddf32fa2cae5e15badb251f26d1196bc8f1"}, ] mkdocs-material-extensions = [ - {file = "mkdocs_material_extensions-1.1-py3-none-any.whl", hash = "sha256:bcc2e5fc70c0ec50e59703ee6e639d87c7e664c0c441c014ea84461a90f1e902"}, - {file = "mkdocs_material_extensions-1.1.tar.gz", hash = "sha256:96ca979dae66d65c2099eefe189b49d5ac62f76afb59c38e069ffc7cf3c131ec"}, + {file = "mkdocs_material_extensions-1.1.1-py3-none-any.whl", hash = 
"sha256:e41d9f38e4798b6617ad98ca8f7f1157b1e4385ac1459ca1e4ea219b556df945"}, + {file = "mkdocs_material_extensions-1.1.1.tar.gz", hash = "sha256:9c003da71e2cc2493d910237448c672e00cefc800d3d6ae93d2fc69979e3bd93"}, ] mypy = [ {file = "mypy-0.982-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5085e6f442003fa915aeb0a46d4da58128da69325d8213b4b35cc7054090aed5"}, @@ -1932,8 +2064,8 @@ packaging = [ {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] pathspec = [ - {file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"}, - {file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"}, + {file = "pathspec-0.10.2-py3-none-any.whl", hash = "sha256:88c2606f2c1e818b978540f73ecc908e13999c6c3a383daf3705652ae79807a5"}, + {file = "pathspec-0.10.2.tar.gz", hash = "sha256:8f6bf73e5758fd365ef5d58ce09ac7c27d2833a8d7da51712eac6e27e35141b0"}, ] pbr = [ {file = "pbr-5.11.0-py2.py3-none-any.whl", hash = "sha256:db2317ff07c84c4c63648c9064a79fe9d9f5c7ce85a9099d4b6258b3db83225a"}, @@ -1944,8 +2076,8 @@ pdoc3 = [ {file = "pdoc3-0.10.0.tar.gz", hash = "sha256:5f22e7bcb969006738e1aa4219c75a32f34c2d62d46dc9d2fb2d3e0b0287e4b7"}, ] platformdirs = [ - {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, - {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, + {file = "platformdirs-2.5.4-py3-none-any.whl", hash = "sha256:af0276409f9a02373d540bf8480021a048711d572745aef4b7842dad245eba10"}, + {file = "platformdirs-2.5.4.tar.gz", hash = "sha256:1006647646d80f16130f052404c6b901e80ee4ed6bef6792e1f238a8969106f7"}, ] pluggy = [ {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, @@ -1960,7 +2092,8 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] py-cpuinfo = [ - {file = "py-cpuinfo-8.0.0.tar.gz", hash = "sha256:5f269be0e08e33fd959de96b34cd4aeeeacac014dd8305f70eb28d06de2345c5"}, + {file = "py-cpuinfo-9.0.0.tar.gz", hash = "sha256:3cdbbf3fac90dc6f118bfd64384f309edeadd902d7c8fb17f02ffa1fc3f49690"}, + {file = "py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5"}, ] pycodestyle = [ {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, @@ -2013,35 +2146,36 @@ pygments = [ {file = "Pygments-2.13.0.tar.gz", hash = "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1"}, ] pymdown-extensions = [ - {file = "pymdown_extensions-9.7-py3-none-any.whl", hash = "sha256:767d07d9dead0f52f5135545c01f4ed627f9a7918ee86c646d893e24c59db87d"}, - {file = "pymdown_extensions-9.7.tar.gz", hash = "sha256:651b0107bc9ee790aedea3673cb88832c0af27d2569cf45c2de06f1d65292e96"}, + {file = "pymdown_extensions-9.9-py3-none-any.whl", hash = "sha256:ac698c15265680db5eb13cd4342abfcde2079ac01e5486028f47a1b41547b859"}, + {file = "pymdown_extensions-9.9.tar.gz", hash = "sha256:0f8fb7b74a37a61cc34e90b2c91865458b713ec774894ffad64353a5fce85cfc"}, ] pyparsing = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = 
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, ] pyrsistent = [ - {file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"}, - {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"}, - {file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ed6784ceac462a7d6fcb7e9b663e93b9a6fb373b7f43594f9ff68875788e01e"}, - {file = "pyrsistent-0.18.1-cp310-cp310-win32.whl", hash = "sha256:e4f3149fd5eb9b285d6bfb54d2e5173f6a116fe19172686797c056672689daf6"}, - {file = "pyrsistent-0.18.1-cp310-cp310-win_amd64.whl", hash = "sha256:636ce2dc235046ccd3d8c56a7ad54e99d5c1cd0ef07d9ae847306c91d11b5fec"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e92a52c166426efbe0d1ec1332ee9119b6d32fc1f0bbfd55d5c1088070e7fc1b"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7a096646eab884bf8bed965bad63ea327e0d0c38989fc83c5ea7b8a87037bfc"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cdfd2c361b8a8e5d9499b9082b501c452ade8bbf42aef97ea04854f4a3f43b22"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-win32.whl", hash = "sha256:7ec335fc998faa4febe75cc5268a9eac0478b3f681602c1f27befaf2a1abe1d8"}, - {file = "pyrsistent-0.18.1-cp37-cp37m-win_amd64.whl", hash = "sha256:6455fc599df93d1f60e1c5c4fe471499f08d190d57eca040c0ea182301321286"}, - {file = "pyrsistent-0.18.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fd8da6d0124efa2f67d86fa70c851022f87c98e205f0594e1fae044e7119a5a6"}, - {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bfe2388663fd18bd8ce7db2c91c7400bf3e1a9e8bd7d63bf7e77d39051b85ec"}, - {file = "pyrsistent-0.18.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0e3e1fcc45199df76053026a51cc59ab2ea3fc7c094c6627e93b7b44cdae2c8c"}, - {file = "pyrsistent-0.18.1-cp38-cp38-win32.whl", hash = "sha256:b568f35ad53a7b07ed9b1b2bae09eb15cdd671a5ba5d2c66caee40dbf91c68ca"}, - {file = "pyrsistent-0.18.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1b96547410f76078eaf66d282ddca2e4baae8964364abb4f4dcdde855cd123a"}, - {file = "pyrsistent-0.18.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f87cc2863ef33c709e237d4b5f4502a62a00fab450c9e020892e8e2ede5847f5"}, - {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6bc66318fb7ee012071b2792024564973ecc80e9522842eb4e17743604b5e045"}, - {file = "pyrsistent-0.18.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:914474c9f1d93080338ace89cb2acee74f4f666fb0424896fcfb8d86058bf17c"}, - {file = "pyrsistent-0.18.1-cp39-cp39-win32.whl", hash = "sha256:1b34eedd6812bf4d33814fca1b66005805d3640ce53140ab8bbb1e2651b0d9bc"}, - {file = "pyrsistent-0.18.1-cp39-cp39-win_amd64.whl", hash = "sha256:e24a828f57e0c337c8d8bb9f6b12f09dfdf0273da25fda9e314f0b684b415a07"}, - {file = "pyrsistent-0.18.1.tar.gz", hash = "sha256:d4d61f8b993a7255ba714df3aca52700f8125289f84f704cf80916517c46eb96"}, + {file = "pyrsistent-0.19.2-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:d6982b5a0237e1b7d876b60265564648a69b14017f3b5f908c5be2de3f9abb7a"}, + {file = "pyrsistent-0.19.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:187d5730b0507d9285a96fca9716310d572e5464cadd19f22b63a6976254d77a"}, + {file = "pyrsistent-0.19.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:055ab45d5911d7cae397dc418808d8802fb95262751872c841c170b0dbf51eed"}, + {file = "pyrsistent-0.19.2-cp310-cp310-win32.whl", hash = "sha256:456cb30ca8bff00596519f2c53e42c245c09e1a4543945703acd4312949bfd41"}, + {file = "pyrsistent-0.19.2-cp310-cp310-win_amd64.whl", hash = "sha256:b39725209e06759217d1ac5fcdb510e98670af9e37223985f330b611f62e7425"}, + {file = "pyrsistent-0.19.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2aede922a488861de0ad00c7630a6e2d57e8023e4be72d9d7147a9fcd2d30712"}, + {file = "pyrsistent-0.19.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:879b4c2f4d41585c42df4d7654ddffff1239dc4065bc88b745f0341828b83e78"}, + {file = "pyrsistent-0.19.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c43bec251bbd10e3cb58ced80609c5c1eb238da9ca78b964aea410fb820d00d6"}, + {file = "pyrsistent-0.19.2-cp37-cp37m-win32.whl", hash = "sha256:d690b18ac4b3e3cab73b0b7aa7dbe65978a172ff94970ff98d82f2031f8971c2"}, + {file = "pyrsistent-0.19.2-cp37-cp37m-win_amd64.whl", hash = "sha256:3ba4134a3ff0fc7ad225b6b457d1309f4698108fb6b35532d015dca8f5abed73"}, + {file = "pyrsistent-0.19.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a178209e2df710e3f142cbd05313ba0c5ebed0a55d78d9945ac7a4e09d923308"}, + {file = "pyrsistent-0.19.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e371b844cec09d8dc424d940e54bba8f67a03ebea20ff7b7b0d56f526c71d584"}, + {file = "pyrsistent-0.19.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:111156137b2e71f3a9936baf27cb322e8024dac3dc54ec7fb9f0bcf3249e68bb"}, + {file = "pyrsistent-0.19.2-cp38-cp38-win32.whl", hash = "sha256:e5d8f84d81e3729c3b506657dddfe46e8ba9c330bf1858ee33108f8bb2adb38a"}, + {file = "pyrsistent-0.19.2-cp38-cp38-win_amd64.whl", hash = "sha256:9cd3e9978d12b5d99cbdc727a3022da0430ad007dacf33d0bf554b96427f33ab"}, + {file = "pyrsistent-0.19.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f1258f4e6c42ad0b20f9cfcc3ada5bd6b83374516cd01c0960e3cb75fdca6770"}, + {file = "pyrsistent-0.19.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21455e2b16000440e896ab99e8304617151981ed40c29e9507ef1c2e4314ee95"}, + {file = "pyrsistent-0.19.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bfd880614c6237243ff53a0539f1cb26987a6dc8ac6e66e0c5a40617296a045e"}, + {file = "pyrsistent-0.19.2-cp39-cp39-win32.whl", hash = "sha256:71d332b0320642b3261e9fee47ab9e65872c2bd90260e5d225dabeed93cbd42b"}, + {file = "pyrsistent-0.19.2-cp39-cp39-win_amd64.whl", hash = "sha256:dec3eac7549869365fe263831f576c8457f6c833937c68542d08fde73457d291"}, + {file = "pyrsistent-0.19.2-py3-none-any.whl", hash = "sha256:ea6b79a02a28550c98b6ca9c35b9f492beaa54d7c5c9e9949555893c8a9234d0"}, + {file = "pyrsistent-0.19.2.tar.gz", hash = "sha256:bfa0351be89c9fcbcb8c9879b826f4353be10f58f8a677efab0c017bf7137ec2"}, ] pytest = [ {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, @@ -2188,8 +2322,8 @@ sarif-om = [ 
{file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] setuptools = [ - {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, - {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, + {file = "setuptools-65.6.3-py3-none-any.whl", hash = "sha256:57f6f22bde4e042978bcd50176fdb381d7c21a9efa4041202288d3737a0c6a54"}, + {file = "setuptools-65.6.3.tar.gz", hash = "sha256:a7620757bf984b58deaf32fc8a4577a9bbc0850cf92c20e1ce41c38c19e5fb75"}, ] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -2207,6 +2341,32 @@ tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +typed-ast = [ + {file = "typed_ast-1.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:669dd0c4167f6f2cd9f57041e03c3c2ebf9063d0757dc89f79ba1daa2bfca9d4"}, + {file = "typed_ast-1.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:211260621ab1cd7324e0798d6be953d00b74e0428382991adfddb352252f1d62"}, + {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:267e3f78697a6c00c689c03db4876dd1efdfea2f251a5ad6555e82a26847b4ac"}, + {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c542eeda69212fa10a7ada75e668876fdec5f856cd3d06829e6aa64ad17c8dfe"}, + {file = "typed_ast-1.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:a9916d2bb8865f973824fb47436fa45e1ebf2efd920f2b9f99342cb7fab93f72"}, + {file = "typed_ast-1.5.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:79b1e0869db7c830ba6a981d58711c88b6677506e648496b1f64ac7d15633aec"}, + {file = "typed_ast-1.5.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a94d55d142c9265f4ea46fab70977a1944ecae359ae867397757d836ea5a3f47"}, + {file = "typed_ast-1.5.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:183afdf0ec5b1b211724dfef3d2cad2d767cbefac291f24d69b00546c1837fb6"}, + {file = "typed_ast-1.5.4-cp36-cp36m-win_amd64.whl", hash = "sha256:639c5f0b21776605dd6c9dbe592d5228f021404dafd377e2b7ac046b0349b1a1"}, + {file = "typed_ast-1.5.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cf4afcfac006ece570e32d6fa90ab74a17245b83dfd6655a6f68568098345ff6"}, + {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed855bbe3eb3715fca349c80174cfcfd699c2f9de574d40527b8429acae23a66"}, + {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6778e1b2f81dfc7bc58e4b259363b83d2e509a65198e85d5700dfae4c6c8ff1c"}, + {file = "typed_ast-1.5.4-cp37-cp37m-win_amd64.whl", hash = "sha256:0261195c2062caf107831e92a76764c81227dae162c4f75192c0d489faf751a2"}, + {file = "typed_ast-1.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2efae9db7a8c05ad5547d522e7dbe62c83d838d3906a3716d1478b6c1d61388d"}, + {file = "typed_ast-1.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7d5d014b7daa8b0bf2eaef684295acae12b036d79f54178b92a2b6a56f92278f"}, + {file = 
"typed_ast-1.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:370788a63915e82fd6f212865a596a0fefcbb7d408bbbb13dea723d971ed8bdc"}, + {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e964b4ff86550a7a7d56345c7864b18f403f5bd7380edf44a3c1fb4ee7ac6c6"}, + {file = "typed_ast-1.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:683407d92dc953c8a7347119596f0b0e6c55eb98ebebd9b23437501b28dcbb8e"}, + {file = "typed_ast-1.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4879da6c9b73443f97e731b617184a596ac1235fe91f98d279a7af36c796da35"}, + {file = "typed_ast-1.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e123d878ba170397916557d31c8f589951e353cc95fb7f24f6bb69adc1a8a97"}, + {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebd9d7f80ccf7a82ac5f88c521115cc55d84e35bf8b446fcd7836eb6b98929a3"}, + {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98f80dee3c03455e92796b58b98ff6ca0b2a6f652120c263efdba4d6c5e58f72"}, + {file = "typed_ast-1.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:0fdbcf2fef0ca421a3f5912555804296f0b0960f0418c440f5d6d3abb549f3e1"}, + {file = "typed_ast-1.5.4.tar.gz", hash = "sha256:39e21ceb7388e4bb37f4c679d72707ed46c2fbf2a5609b8b8ebc4b067d977df2"}, +] typeguard = [ {file = "typeguard-2.13.3-py3-none-any.whl", hash = "sha256:5e3e3be01e887e7eafae5af63d1f36c849aaa94e3a0112097312aabfa16284f1"}, {file = "typeguard-2.13.3.tar.gz", hash = "sha256:00edaa8da3a133674796cf5ea87d9f4b4c367d77476e185e80251cc13dfbb8c4"}, @@ -2216,16 +2376,16 @@ types-requests = [ {file = "types_requests-2.28.11.5-py3-none-any.whl", hash = "sha256:091d4a5a33c1b4f20d8b1b952aa8fa27a6e767c44c3cf65e56580df0b05fd8a9"}, ] types-urllib3 = [ - {file = "types-urllib3-1.26.25.1.tar.gz", hash = "sha256:a948584944b2412c9a74b9cf64f6c48caf8652cb88b38361316f6d15d8a184cd"}, - {file = "types_urllib3-1.26.25.1-py3-none-any.whl", hash = "sha256:f6422596cc9ee5fdf68f9d547f541096a20c2dcfd587e37c804c9ea720bf5cb2"}, + {file = "types-urllib3-1.26.25.4.tar.gz", hash = "sha256:eec5556428eec862b1ac578fb69aab3877995a99ffec9e5a12cf7fbd0cc9daee"}, + {file = "types_urllib3-1.26.25.4-py3-none-any.whl", hash = "sha256:ed6b9e8a8be488796f72306889a06a3fc3cb1aa99af02ab8afb50144d7317e49"}, ] typing-extensions = [ {file = "typing_extensions-4.4.0-py3-none-any.whl", hash = "sha256:16fa4864408f655d35ec496218b85f79b3437c829e93320c7c9215ccfd92489e"}, {file = "typing_extensions-4.4.0.tar.gz", hash = "sha256:1511434bb92bf8dd198c12b1cc812e800d4181cfcb867674e0f8279cc93087aa"}, ] urllib3 = [ - {file = "urllib3-1.26.12-py2.py3-none-any.whl", hash = "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997"}, - {file = "urllib3-1.26.12.tar.gz", hash = "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e"}, + {file = "urllib3-1.26.13-py2.py3-none-any.whl", hash = "sha256:47cc05d99aaa09c9e72ed5809b60e7ba354e64b59c9c173ac3018642d8bb41fc"}, + {file = "urllib3-1.26.13.tar.gz", hash = "sha256:c083dd0dce68dbfbe1129d5271cb90f9447dea7d52097c6e0126120c521ddea8"}, ] verspec = [ {file = "verspec-0.1.0-py3-none-any.whl", hash = "sha256:741877d5633cc9464c45a469ae2a31e801e6dbbaa85b9675d481cda100f11c31"}, diff --git a/pyproject.toml b/pyproject.toml index 12190d1a5ba..864f04c2c63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ filelock = "^3.8.0" checksumdir = "^1.2.0" 
mypy-boto3-appconfigdata = "^1.26.0" importlib-metadata = "^4.13" +ijson = "^3.1.4" [tool.poetry.extras] parser = ["pydantic"] From 822f597d8ae93c6d10a80a5e4a51658a69cbe94e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Thu, 17 Nov 2022 11:54:43 +0100 Subject: [PATCH 10/43] chore: add s3 objects tests --- tests/functional/streaming/test_s3_object.py | 66 ++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 tests/functional/streaming/test_s3_object.py diff --git a/tests/functional/streaming/test_s3_object.py b/tests/functional/streaming/test_s3_object.py new file mode 100644 index 00000000000..d2d18ea010f --- /dev/null +++ b/tests/functional/streaming/test_s3_object.py @@ -0,0 +1,66 @@ +import io +from csv import DictReader +from gzip import GzipFile + +import boto3 +import pytest +from botocore import stub +from botocore.response import StreamingBody + +from aws_lambda_powertools.utilities.streaming import S3Object +from aws_lambda_powertools.utilities.streaming._s3_seekable_io import _S3SeekableIO +from aws_lambda_powertools.utilities.streaming.transformations import GzipTransform + + +def test_s3_basic_stream(): + obj = S3Object(bucket="bucket", key="key") + assert type(obj.transformed_stream) is _S3SeekableIO + + +def test_s3_gzip_stream(): + obj = S3Object(bucket="bucket", key="key", gunzip=True) + assert type(obj.transformed_stream) is GzipFile + + +def test_s3_csv_stream(): + obj = S3Object(bucket="bucket", key="key", csv=True) + assert type(obj.transformed_stream) is DictReader + + +def test_s3_gzip_csv_stream(): + obj = S3Object(bucket="bucket", key="key", gunzip=True, csv=True) + assert type(obj.transformed_stream) is DictReader + + +def test_s3_transform(): + obj = S3Object(bucket="bucket", key="key") + + new_obj = obj.transform(GzipTransform()) + assert type(new_obj) is GzipFile + + +def test_s3_transform_in_palce(): + obj = S3Object(bucket="bucket", key="key") + + new_obj = obj.transform(GzipTransform(), in_place=True) + assert new_obj is None + + +def test_s3_transform_after_read(): + # GIVEN a S3 Object with a "hello world" payload + payload = b"hello world" + + s3_resource = boto3.resource("s3") + s3_stub = stub.Stubber(s3_resource.meta.client) + s3_stub.add_response( + "get_object", {"Body": StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} + ) + s3_stub.activate() + obj = S3Object(bucket="bucket", key="key", boto3_s3_resource=s3_resource) + + # WHEN you read some part of the object and then apply a transformation + assert obj.read(5) == b"hello" + + # THEN it raises ValueError + with pytest.raises(ValueError): + obj.transform(GzipTransform()) From b70fb6254d7737803a0cb6d22b9b946561e16f54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Thu, 17 Nov 2022 12:00:27 +0100 Subject: [PATCH 11/43] chore: update docs --- docs/utilities/streaming.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 8624a00ebe7..038eef10f7d 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -22,6 +22,8 @@ to the data stream. ## Getting started +### Streaming from a S3 object + To stream an S3 file, you need the bucket name, the key and optionally a version ID. 
=== "Non-versioned bucket" @@ -54,7 +56,7 @@ Common options like gunzipping a stream and parsing data as CSV can be enabled d --8<-- "examples/streaming/src/s3_transform_common.py" ``` -Additionally, you can transform the data in place, or return a new object that encapsulates the transformation. +Additionally, you can return a new object that encapsulates the transformation, or transform the data in place, Multiple transformations are applied in order. === "Returning a new object" @@ -99,7 +101,7 @@ You can build your own custom data transformation by extending the `BaseTransfor The `transform` method receives an `IO[bytes]` object, and you are responsible for returning an object that is also a `IO[bytes]`. -```python hl_lines="9 37 38" +```python hl_lines="10 12 27-29" --8<-- "examples/streaming/src/s3_json_transform.py" ``` From 320c6a9a86b8bbcb27882c780fc5dce132d0be18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Fri, 18 Nov 2022 16:23:50 +0100 Subject: [PATCH 12/43] chore: add e2e tests --- tests/e2e/streaming/__init__.py | 0 tests/e2e/streaming/assets/csv.txt | 2 + tests/e2e/streaming/assets/csv.txt.gz | Bin 0 -> 43 bytes tests/e2e/streaming/assets/fileset.zip | Bin 0 -> 328 bytes tests/e2e/streaming/assets/fileset.zip.lzma | Bin 0 -> 296 bytes tests/e2e/streaming/assets/plain.txt | 1 + tests/e2e/streaming/assets/plain.txt.gz | Bin 0 -> 42 bytes tests/e2e/streaming/conftest.py | 19 ++++ .../streaming/handlers/s3_object_handler.py | 70 +++++++++++++ tests/e2e/streaming/infrastructure.py | 52 +++++++++ tests/e2e/streaming/test_s3_object.py | 99 ++++++++++++++++++ 11 files changed, 243 insertions(+) create mode 100644 tests/e2e/streaming/__init__.py create mode 100644 tests/e2e/streaming/assets/csv.txt create mode 100644 tests/e2e/streaming/assets/csv.txt.gz create mode 100644 tests/e2e/streaming/assets/fileset.zip create mode 100644 tests/e2e/streaming/assets/fileset.zip.lzma create mode 100644 tests/e2e/streaming/assets/plain.txt create mode 100644 tests/e2e/streaming/assets/plain.txt.gz create mode 100644 tests/e2e/streaming/conftest.py create mode 100644 tests/e2e/streaming/handlers/s3_object_handler.py create mode 100644 tests/e2e/streaming/infrastructure.py create mode 100644 tests/e2e/streaming/test_s3_object.py diff --git a/tests/e2e/streaming/__init__.py b/tests/e2e/streaming/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/e2e/streaming/assets/csv.txt b/tests/e2e/streaming/assets/csv.txt new file mode 100644 index 00000000000..cfc6a96e553 --- /dev/null +++ b/tests/e2e/streaming/assets/csv.txt @@ -0,0 +1,2 @@ +name,value +hello,world diff --git a/tests/e2e/streaming/assets/csv.txt.gz b/tests/e2e/streaming/assets/csv.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..3455e090a8f2b48a528716a2a65dac5303df435d GIT binary patch literal 43 zcmV+`0M!2b0szD7jeQpY004|c B5=#I8 literal 0 HcmV?d00001 diff --git a/tests/e2e/streaming/assets/fileset.zip b/tests/e2e/streaming/assets/fileset.zip new file mode 100644 index 0000000000000000000000000000000000000000..095b8aed8a8d4e5658fdb4788aa529f87c3ac532 GIT binary patch literal 328 zcmWIWW@h1H0D)@}MWH_XLzeIZ*&xiyAj4p&S5i?D8p6rIY#dphYy!lk72FJrEMFNJ z7+6Grib66nixq$;*<8bK_mTUjA)MRO;zQJ79cHp&lZ6x5+Wqd*SDFp8B8WFHd{P6N_&KpX}D Drd~Y+ literal 0 HcmV?d00001 diff --git a/tests/e2e/streaming/assets/fileset.zip.lzma b/tests/e2e/streaming/assets/fileset.zip.lzma new file mode 100644 index 0000000000000000000000000000000000000000..964c0b33553197fe402d929e57bf3eda52d8d308 GIT 
binary patch literal 296 zcmWIWW@h1HW&na~5k;Xs`$LxS1KA+V3dDwbB^4zh8JWcjK$Mo5ld53I6#&<91E{0_ zO=Ul_4kHX5MnD~mO!myUY*7K)0|E*SOBz8m*iJ461rVEokwJo?;dfc7>F*ni4d#tb uW+0j?z!5|v8;j`_WMhqhCV?G>Y%If@!V2uhva*4!W&*->Ksp7)VE_QI%s}e^ literal 0 HcmV?d00001 diff --git a/tests/e2e/streaming/assets/plain.txt b/tests/e2e/streaming/assets/plain.txt new file mode 100644 index 00000000000..3b18e512dba --- /dev/null +++ b/tests/e2e/streaming/assets/plain.txt @@ -0,0 +1 @@ +hello world diff --git a/tests/e2e/streaming/assets/plain.txt.gz b/tests/e2e/streaming/assets/plain.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..cff6007d0f7a438420f7b8575fd5bde4c9419d57 GIT binary patch literal 42 ycmb2|=HTFtEKg=&F33sD%+o8WC}BA5arWfNaE 0: + if transform_in_place: + obj.transform(transformations, in_place=True) + else: + obj = obj.transform(transformations) + + if transform_zip or transform_zip_lzma: + response["manifest"] = obj.namelist() + response["body"] = obj.read(obj.namelist()[1]).rstrip() # extracts the second file on the zip + elif transform_csv or csv: + response["body"] = obj.__next__() + elif transform_gzip or gunzip: + response["body"] = obj.readline().rstrip() + + return response diff --git a/tests/e2e/streaming/infrastructure.py b/tests/e2e/streaming/infrastructure.py new file mode 100644 index 00000000000..919dfcd2abd --- /dev/null +++ b/tests/e2e/streaming/infrastructure.py @@ -0,0 +1,52 @@ +from pathlib import Path + +from aws_cdk import CfnOutput, RemovalPolicy +from aws_cdk import aws_s3 as s3 +from aws_cdk import aws_s3_deployment as s3deploy + +from tests.e2e.utils.infrastructure import BaseInfrastructure + + +class StreamingStack(BaseInfrastructure): + def create_resources(self): + functions = self.create_lambda_functions() + + regular_bucket = s3.Bucket( + self.stack, + "S3Bucket", + removal_policy=RemovalPolicy.DESTROY, + auto_delete_objects=True, + block_public_access=s3.BlockPublicAccess.BLOCK_ALL, + ) + self.create_s3_deployment(regular_bucket) + + for function in functions.values(): + regular_bucket.grant_read(function) + + CfnOutput(self.stack, "RegularBucket", value=regular_bucket.bucket_name) + + versioned_bucket = s3.Bucket( + self.stack, + "S3VersionedBucket", + versioned=True, + removal_policy=RemovalPolicy.DESTROY, + auto_delete_objects=True, + block_public_access=s3.BlockPublicAccess.BLOCK_ALL, + ) + self.create_s3_deployment(versioned_bucket) + + for function in functions.values(): + versioned_bucket.grant_read(function) + + CfnOutput(self.stack, "VersionedBucket", value=versioned_bucket.bucket_name) + + def create_s3_deployment(self, bucket: s3.IBucket): + current_dir = Path(__file__).parent.resolve() + sources = [s3deploy.Source.asset(str(current_dir / "assets"))] + + s3deploy.BucketDeployment( + self.stack, + f"Deployment{bucket.node.id}", + sources=sources, + destination_bucket=bucket, + ) diff --git a/tests/e2e/streaming/test_s3_object.py b/tests/e2e/streaming/test_s3_object.py new file mode 100644 index 00000000000..2c63dd9cc0a --- /dev/null +++ b/tests/e2e/streaming/test_s3_object.py @@ -0,0 +1,99 @@ +import json + +import pytest + +from tests.e2e.utils import data_fetcher + + +@pytest.fixture +def regular_bucket_name(infrastructure: dict) -> str: + return infrastructure.get("RegularBucket", "") + + +@pytest.fixture +def s3_object_handler_fn_arn(infrastructure: dict) -> str: + return infrastructure.get("S3ObjectHandler", "") + + +def get_lambda_result_payload(s3_object_handler_fn_arn: str, payload: dict) -> dict: + handler_result, _ = 
data_fetcher.get_lambda_response( + lambda_arn=s3_object_handler_fn_arn, payload=json.dumps(payload) + ) + + return json.loads(handler_result["Payload"].read()) + + +def test_s3_object_size(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "plain.txt"} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("size") == 12 + + +def test_s3_object_csv_constructor(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "csv.txt", "csv": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == {"name": "hello", "value": "world"} + + +def test_s3_object_csv_transform(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == {"name": "hello", "value": "world"} + + +def test_s3_object_csv_transform_in_place(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True, "in_place": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == {"name": "hello", "value": "world"} + + +def test_s3_object_csv_gzip_constructor(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "csv.txt.gz", "csv": True, "gunzip": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == {"name": "hello", "value": "world"} + + +def test_s3_object_gzip_constructor(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "gunzip": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == "hello world" + + +def test_s3_object_gzip_transform(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "transform_gzip": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == "hello world" + + +def test_s3_object_gzip_transform_in_place(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "transform_gzip": True, "in_place": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == "hello world" + + +def test_s3_object_zip_transform(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "fileset.zip", "transform_zip": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("manifest") == ["1.txt", "2.txt"] + assert result.get("body") == "This is file 2" + + +def test_s3_object_zip_transform_in_place(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "fileset.zip", "transform_zip": True, "in_place": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("manifest") == ["1.txt", "2.txt"] + assert result.get("body") == "This is file 2" + + +def test_s3_object_zip_lzma_transform(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "fileset.zip.lzma", "transform_zip_lzma": True} + result = 
get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("manifest") == ["1.txt", "2.txt"] + assert result.get("body") == "This is file 2" + + +def test_s3_object_zip_lzma_transform_in_place(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "fileset.zip.lzma", "transform_zip_lzma": True, "in_place": True} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("manifest") == ["1.txt", "2.txt"] + assert result.get("body") == "This is file 2" From 4c867a34d3da693e78738704e199d2d1fe067313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Fri, 18 Nov 2022 16:30:44 +0100 Subject: [PATCH 13/43] fix: bug when using encoding and newline on CSV transform --- .../utilities/streaming/transformations/csv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/csv.py b/aws_lambda_powertools/utilities/streaming/transformations/csv.py index 2bc81bea6e3..e2ad66a7a51 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/csv.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/csv.py @@ -8,8 +8,8 @@ class CsvTransform(BaseTransform): def transform(self, input_stream: IO[bytes]) -> DictReader: - encoding = self.kwargs.get("encoding", "utf-8") - newline = self.kwargs.get("newline") + encoding = self.kwargs.pop("encoding", "utf-8") + newline = self.kwargs.pop("newline", None) # csv module needs an Iterator[str], so we wrap the underlying stream into a TextIO iterator = io.TextIOWrapper(input_stream, encoding=encoding, newline=newline) From c66e045b0e068d16153a97b645f741309da90597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 09:48:44 +0100 Subject: [PATCH 14/43] chore(docs): moved s3 streaming under streaming in docs --- docs/utilities/{streaming.md => streaming/s3.md} | 2 +- mkdocs.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) rename docs/utilities/{streaming.md => streaming/s3.md} (99%) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming/s3.md similarity index 99% rename from docs/utilities/streaming.md rename to docs/utilities/streaming/s3.md index 038eef10f7d..5518af77d6c 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming/s3.md @@ -1,5 +1,5 @@ --- -title: Streaming +title: S3 description: Utility --- diff --git a/mkdocs.yml b/mkdocs.yml index e72465f5736..2731f5a8880 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,7 +31,8 @@ nav: - utilities/feature_flags.md - utilities/jmespath_functions.md - CloudFormation Custom Resources: https://github.com/aws-cloudformation/custom-resource-helper" target="_blank - - utilities/streaming.md + - Streaming: + - utilities/streaming/s3.md theme: name: material From 97ffd8a9b4eeeb45b7856d692c155db8fe58b463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 11:03:38 +0100 Subject: [PATCH 15/43] Revert "chore(docs): moved s3 streaming under streaming in docs" This reverts commit c1b2520145b59ad6a908b4d0f18b13eec1159663. 
--- docs/utilities/{streaming/s3.md => streaming.md} | 2 +- mkdocs.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) rename docs/utilities/{streaming/s3.md => streaming.md} (99%) diff --git a/docs/utilities/streaming/s3.md b/docs/utilities/streaming.md similarity index 99% rename from docs/utilities/streaming/s3.md rename to docs/utilities/streaming.md index 5518af77d6c..038eef10f7d 100644 --- a/docs/utilities/streaming/s3.md +++ b/docs/utilities/streaming.md @@ -1,5 +1,5 @@ --- -title: S3 +title: Streaming description: Utility --- diff --git a/mkdocs.yml b/mkdocs.yml index 2731f5a8880..e72465f5736 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,8 +31,7 @@ nav: - utilities/feature_flags.md - utilities/jmespath_functions.md - CloudFormation Custom Resources: https://github.com/aws-cloudformation/custom-resource-helper" target="_blank - - Streaming: - - utilities/streaming/s3.md + - utilities/streaming.md theme: name: material From 3cf13e52b7e48b70dd709408593bd93750e60d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 14:02:09 +0100 Subject: [PATCH 16/43] chore: added compat StreamingBody for older botocore --- .../utilities/streaming/_s3_seekable_io.py | 10 +- .../utilities/streaming/compat.py | 157 ++++++++++++++++++ mypy.ini | 6 + .../streaming/handlers/s3_object_handler.py | 2 + tests/e2e/streaming/test_s3_object.py | 1 + .../streaming/test_s3_seekable_io.py | 14 +- 6 files changed, 180 insertions(+), 10 deletions(-) create mode 100644 aws_lambda_powertools/utilities/streaming/compat.py diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index 1a933bd7577..ef29ab70f93 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -3,7 +3,11 @@ from typing import IO, TYPE_CHECKING, AnyStr, Iterable, List, Optional import boto3 -from botocore.response import StreamingBody +from botocore import response + +from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody + +response.StreamingBody = PowertoolsStreamingBody if TYPE_CHECKING: from mypy_boto3_s3 import S3ServiceResource @@ -47,7 +51,7 @@ def __init__( self._s3_object: Optional["Object"] = None self._s3_resource: Optional["S3ServiceResource"] = boto3_s3_resource - self._raw_stream: Optional[StreamingBody] = None + self._raw_stream: Optional[PowertoolsStreamingBody] = None @property def s3_resource(self) -> "S3ServiceResource": @@ -83,7 +87,7 @@ def size(self) -> int: return self._size @property - def raw_stream(self) -> StreamingBody: + def raw_stream(self) -> PowertoolsStreamingBody: """ Returns the boto3 StreamingBody, starting the stream from the seeked position. 
""" diff --git a/aws_lambda_powertools/utilities/streaming/compat.py b/aws_lambda_powertools/utilities/streaming/compat.py new file mode 100644 index 00000000000..89ef6e8094c --- /dev/null +++ b/aws_lambda_powertools/utilities/streaming/compat.py @@ -0,0 +1,157 @@ +import logging +from io import IOBase +from typing import Optional + +from botocore.compat import set_socket_timeout +from botocore.exceptions import ( + IncompleteReadError, + ReadTimeoutError, + ResponseStreamingError, +) +from urllib3.exceptions import ProtocolError as URLLib3ProtocolError +from urllib3.exceptions import ReadTimeoutError as URLLib3ReadTimeoutError + +logger = logging.getLogger(__name__) + + +class PowertoolsStreamingBody(IOBase): + """Wrapper class for a HTTP response body. + + # noqa: E501 + Currently, the same as https://github.com/boto/botocore/blob/b9c540905a6c9bf7184d251407555577bf1fa026/botocore/response.py + We created this because the version of StreamingBody included with the Lambda Runtime is too old, and + doesn't support many of the standard IO methods (like readline). + + As soon as the version of botocore included with the Lambda runtime is equal or greater than 1.29.13, we can drop + this file completely. See https://docs.aws.amazon.com/lambda/latest/dg/lambda-python.html. + + This provides a few additional conveniences that do not exist + in the urllib3 model: + * Set the timeout on the socket (i.e read() timeouts) + * Auto validation of content length, if the amount of bytes + we read does not match the content length, an exception + is raised. + """ + + _DEFAULT_CHUNK_SIZE = 1024 + + def __init__(self, raw_stream, content_length): + self._raw_stream = raw_stream + self._content_length = content_length + self._amount_read = 0 + + def __del__(self): + # Extending destructor in order to preserve the underlying raw_stream. + # The ability to add custom cleanup logic introduced in Python3.4+. + # https://www.python.org/dev/peps/pep-0442/ + pass + + def set_socket_timeout(self, timeout): + """Set the timeout seconds on the socket.""" + # The problem we're trying to solve is to prevent .read() calls from + # hanging. This can happen in rare cases. What we'd like to ideally + # do is set a timeout on the .read() call so that callers can retry + # the request. + # Unfortunately, this isn't currently possible in requests. + # See: https://github.com/kennethreitz/requests/issues/1803 + # So what we're going to do is reach into the guts of the stream and + # grab the socket object, which we can set the timeout on. We're + # putting in a check here so in case this interface goes away, we'll + # know. + try: + set_socket_timeout(self._raw_stream, timeout) + except AttributeError: + logger.error( + "Cannot access the socket object of " + "a streaming response. It's possible " + "the interface has changed.", + exc_info=True, + ) + raise + + def readable(self): + try: + return self._raw_stream.readable() + except AttributeError: + return False + + def read(self, amt=None): + """Read at most amt bytes from the stream. + If the amt argument is omitted, read all data. 
+ """ + try: + chunk = self._raw_stream.read(amt) + except URLLib3ReadTimeoutError as e: + raise ReadTimeoutError(endpoint_url=e.url, error=e) + except URLLib3ProtocolError as e: + raise ResponseStreamingError(error=e) + self._amount_read += len(chunk) + if amt is None or (not chunk and amt > 0): + # If the server sends empty contents or + # we ask to read all of the contents, then we know + # we need to verify the content length. + self._verify_content_length() + return chunk + + def readlines(self, hint: Optional[int] = -1): + return self._raw_stream.readlines(hint) + + def __iter__(self): + """Return an iterator to yield 1k chunks from the raw stream.""" + return self.iter_chunks(self._DEFAULT_CHUNK_SIZE) + + def __next__(self): + """Return the next 1k chunk from the raw stream.""" + current_chunk = self.read(self._DEFAULT_CHUNK_SIZE) + if current_chunk: + return current_chunk + raise StopIteration() + + def __enter__(self): + return self._raw_stream + + def __exit__(self, *args): + self._raw_stream.close() + + next = __next__ # noqa: A003, VNE003 + + def iter_lines(self, chunk_size=_DEFAULT_CHUNK_SIZE, keepends=False): + """Return an iterator to yield lines from the raw stream. + This is achieved by reading chunk of bytes (of size chunk_size) at a + time from the raw stream, and then yielding lines from there. + """ + pending = b"" + for chunk in self.iter_chunks(chunk_size): + lines = (pending + chunk).splitlines(True) + for line in lines[:-1]: + yield line.splitlines(keepends)[0] + pending = lines[-1] + if pending: + yield pending.splitlines(keepends)[0] + + def iter_chunks(self, chunk_size=_DEFAULT_CHUNK_SIZE): + """Return an iterator to yield chunks of chunk_size bytes from the raw + stream. + """ + while True: + current_chunk = self.read(chunk_size) + if current_chunk == b"": + break + yield current_chunk + + def _verify_content_length(self): + # See: https://github.com/kennethreitz/requests/issues/1855 + # Basically, our http library doesn't do this for us, so we have + # to do this ourself. 
+ if self._content_length is not None and self._amount_read != int(self._content_length): + raise IncompleteReadError( + actual_bytes=self._amount_read, + expected_bytes=int(self._content_length), + ) + + def tell(self): + return self._raw_stream.tell() + + def close(self): + """Close the underlying http response stream.""" + self._raw_stream.close() diff --git a/mypy.ini b/mypy.ini index 03545a2cc9f..6ab4cb0de32 100644 --- a/mypy.ini +++ b/mypy.ini @@ -20,6 +20,9 @@ ignore_missing_imports=True [mypy-boto3] ignore_missing_imports = True +[mypy-botocore] +ignore_missing_imports = True + [mypy-botocore.response] ignore_missing_imports = True @@ -29,6 +32,9 @@ ignore_missing_imports = True [mypy-botocore.config] ignore_missing_imports = True +[mypy-botocore.compat] +ignore_missing_imports = True + [mypy-botocore.exceptions] ignore_missing_imports = True diff --git a/tests/e2e/streaming/handlers/s3_object_handler.py b/tests/e2e/streaming/handlers/s3_object_handler.py index 123d1fcd5b9..95ed7b443dc 100644 --- a/tests/e2e/streaming/handlers/s3_object_handler.py +++ b/tests/e2e/streaming/handlers/s3_object_handler.py @@ -66,5 +66,7 @@ def lambda_handler(event, context): response["body"] = obj.__next__() elif transform_gzip or gunzip: response["body"] = obj.readline().rstrip() + else: + response["body"] = obj.readline().rstrip() return response diff --git a/tests/e2e/streaming/test_s3_object.py b/tests/e2e/streaming/test_s3_object.py index 2c63dd9cc0a..a13a4e6c0fc 100644 --- a/tests/e2e/streaming/test_s3_object.py +++ b/tests/e2e/streaming/test_s3_object.py @@ -27,6 +27,7 @@ def test_s3_object_size(s3_object_handler_fn_arn, regular_bucket_name): payload = {"bucket": regular_bucket_name, "key": "plain.txt"} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("size") == 12 + assert result.get("body") == "hello world" def test_s3_object_csv_constructor(s3_object_handler_fn_arn, regular_bucket_name): diff --git a/tests/functional/streaming/test_s3_seekable_io.py b/tests/functional/streaming/test_s3_seekable_io.py index 4b3ed59355f..372a680b31a 100644 --- a/tests/functional/streaming/test_s3_seekable_io.py +++ b/tests/functional/streaming/test_s3_seekable_io.py @@ -3,9 +3,9 @@ import boto3 import pytest from botocore import stub -from botocore.response import StreamingBody from aws_lambda_powertools.utilities.streaming._s3_seekable_io import _S3SeekableIO +from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody @pytest.fixture @@ -89,7 +89,7 @@ def test_raw_stream_fetches_with_range_header_after_seek(s3_seekable_obj, s3_res def test_read(s3_seekable_obj, s3_resource_stub): payload = b"hello world" - streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) s3_resource_stub.add_response( "get_object", @@ -105,7 +105,7 @@ def test_read(s3_seekable_obj, s3_resource_stub): def test_readline(s3_seekable_obj, s3_resource_stub): payload = b"hello world\nworld hello" - streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) s3_resource_stub.add_response( "get_object", @@ -120,7 +120,7 @@ def test_readline(s3_seekable_obj, s3_resource_stub): def test_readlines(s3_seekable_obj, s3_resource_stub): payload = b"hello world\nworld hello" - streaming_body = 
StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) s3_resource_stub.add_response( "get_object", @@ -134,7 +134,7 @@ def test_readlines(s3_seekable_obj, s3_resource_stub): def test_closed(s3_seekable_obj, s3_resource_stub): payload = b"test" - streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) s3_resource_stub.add_response( "get_object", @@ -148,7 +148,7 @@ def test_closed(s3_seekable_obj, s3_resource_stub): def test_next(s3_seekable_obj, s3_resource_stub): payload = b"test" - streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) s3_resource_stub.add_response( "get_object", @@ -163,7 +163,7 @@ def test_next(s3_seekable_obj, s3_resource_stub): def test_context_manager(s3_seekable_obj, s3_resource_stub): payload = b"test" - streaming_body = StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) + streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) s3_resource_stub.add_response( "get_object", From 49fc7b8f8de8cd6251fc2b77d75a25444e73d701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 14:29:24 +0100 Subject: [PATCH 17/43] fix: boto3 import order --- aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index ef29ab70f93..989e9f78a92 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -2,13 +2,15 @@ import logging from typing import IO, TYPE_CHECKING, AnyStr, Iterable, List, Optional -import boto3 from botocore import response from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody response.StreamingBody = PowertoolsStreamingBody +# We need to import boto3 after monkey patching StreamingBody, otherwise the patch is not applied +import boto3 # noqa: E402 + if TYPE_CHECKING: from mypy_boto3_s3 import S3ServiceResource from mypy_boto3_s3.service_resource import Object From cda826d92f6adf212b9b885f9ce50897cb76a160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 15:11:18 +0100 Subject: [PATCH 18/43] chore: use s3 client instead of s3 resource --- .../utilities/streaming/_s3_seekable_io.py | 52 ++++++++----------- .../utilities/streaming/s3_object.py | 12 ++--- tests/functional/streaming/test_s3_object.py | 6 +-- .../streaming/test_s3_seekable_io.py | 52 +++++++++---------- 4 files changed, 55 insertions(+), 67 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index 989e9f78a92..99619851e07 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -1,6 +1,6 @@ import io import logging -from typing import IO, TYPE_CHECKING, AnyStr, Iterable, List, Optional +from typing import IO, TYPE_CHECKING, Any, AnyStr, Dict, Iterable, List, Optional from botocore 
import response @@ -12,8 +12,7 @@ import boto3 # noqa: E402 if TYPE_CHECKING: - from mypy_boto3_s3 import S3ServiceResource - from mypy_boto3_s3.service_resource import Object + from mypy_boto3_s3 import Client logger = logging.getLogger(__name__) @@ -31,16 +30,19 @@ class _S3SeekableIO(IO[bytes]): The S3 key version_id: str, optional A version ID of the object, when the S3 bucket is versioned - boto3_s3_resource: S3ServiceResource, optional - An optional boto3 S3 resource. If missing, a new one will be created. + boto3_s3_client: boto3 S3 Client, optional + An optional boto3 S3 client. If missing, a new one will be created. """ def __init__( - self, bucket: str, key: str, version_id: Optional[str] = None, boto3_s3_resource=Optional["S3ServiceResource"] + self, + bucket: str, + key: str, + version_id: Optional[str] = None, + boto3_s3_client=Optional["Client"], ): self.bucket = bucket self.key = key - self.version_id = version_id # Holds the current position in the stream self._position = 0 @@ -51,33 +53,21 @@ def __init__( # Caches the size of the object self._size: Optional[int] = None - self._s3_object: Optional["Object"] = None - self._s3_resource: Optional["S3ServiceResource"] = boto3_s3_resource - self._raw_stream: Optional[PowertoolsStreamingBody] = None + self._s3_kwargs: Dict[str, Any] = {"Bucket": bucket, "Key": key} + if version_id is not None: + self._s3_kwargs["VersionId"] = version_id - @property - def s3_resource(self) -> "S3ServiceResource": - """ - Returns a boto3 S3ServiceResource - """ - if self._s3_resource is None: - self._s3_resource = boto3.resource("s3") - return self._s3_resource + self._s3_client: Optional["Client"] = boto3_s3_client + self._raw_stream: Optional[PowertoolsStreamingBody] = None @property - def s3_object(self) -> "Object": + def s3_client(self) -> "Client": """ - Returns a boto3 S3Object + Returns a boto3 S3 client """ - if self._s3_object is None: - if self.version_id is not None: - self._s3_object = self.s3_resource.ObjectVersion( - bucket_name=self.bucket, object_key=self.key, id=self.version_id - ).Object() - else: - self._s3_object = self.s3_resource.Object(bucket_name=self.bucket, key=self.key) - - return self._s3_object + if self._s3_client is None: + self._s3_client = boto3.client("s3") + return self._s3_client @property def size(self) -> int: @@ -85,7 +75,7 @@ def size(self) -> int: Retrieves the size of the S3 object """ if self._size is None: - self._size = self.s3_object.content_length + self._size = self.s3_client.head_object(**self._s3_kwargs).get("ContentLength", 0) return self._size @property @@ -96,7 +86,7 @@ def raw_stream(self) -> PowertoolsStreamingBody: if self._raw_stream is None: range_header = "bytes=%d-" % self._position logging.debug(f"Starting new stream at {range_header}...") - self._raw_stream = self.s3_object.get(Range=range_header)["Body"] + self._raw_stream = self.s3_client.get_object(**self._s3_kwargs, Range=range_header).get("Body") self._closed = False return self._raw_stream diff --git a/aws_lambda_powertools/utilities/streaming/s3_object.py b/aws_lambda_powertools/utilities/streaming/s3_object.py index a44162c80f9..ac2c3f6d864 100644 --- a/aws_lambda_powertools/utilities/streaming/s3_object.py +++ b/aws_lambda_powertools/utilities/streaming/s3_object.py @@ -25,7 +25,7 @@ ) if TYPE_CHECKING: - from mypy_boto3_s3 import S3ServiceResource + from mypy_boto3_s3 import Client class S3Object(IO[bytes]): @@ -42,8 +42,8 @@ class S3Object(IO[bytes]): The S3 key version_id: str, optional A version ID of the object, when the 
S3 bucket is versioned - boto3_s3_resource: S3ServiceResource, optional - An optional boto3 S3 resource. If missing, a new one will be created. + boto3_s3_client: S3Client, optional + An optional boto3 S3 client. If missing, a new one will be created. gunzip: bool, optional Enables the Gunzip data transformation csv: bool, optional @@ -67,7 +67,7 @@ def __init__( bucket: str, key: str, version_id: Optional[str] = None, - boto3_s3_resource: Optional["S3ServiceResource"] = None, + boto3_s3_client: Optional["Client"] = None, gunzip: Optional[bool] = False, csv: Optional[bool] = False, ): @@ -76,9 +76,7 @@ def __init__( self.version_id = version_id # The underlying seekable IO, where all the magic happens - self.raw_stream = _S3SeekableIO( - bucket=bucket, key=key, version_id=version_id, boto3_s3_resource=boto3_s3_resource - ) + self.raw_stream = _S3SeekableIO(bucket=bucket, key=key, version_id=version_id, boto3_s3_client=boto3_s3_client) # Stores the list of data transformations self._data_transformations: List[BaseTransform] = [] diff --git a/tests/functional/streaming/test_s3_object.py b/tests/functional/streaming/test_s3_object.py index d2d18ea010f..90a4c6f3398 100644 --- a/tests/functional/streaming/test_s3_object.py +++ b/tests/functional/streaming/test_s3_object.py @@ -50,13 +50,13 @@ def test_s3_transform_after_read(): # GIVEN a S3 Object with a "hello world" payload payload = b"hello world" - s3_resource = boto3.resource("s3") - s3_stub = stub.Stubber(s3_resource.meta.client) + s3_client = boto3.client("s3") + s3_stub = stub.Stubber(s3_client) s3_stub.add_response( "get_object", {"Body": StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} ) s3_stub.activate() - obj = S3Object(bucket="bucket", key="key", boto3_s3_resource=s3_resource) + obj = S3Object(bucket="bucket", key="key", boto3_s3_client=s3_client) # WHEN you read some part of the object and then apply a transformation assert obj.read(5) == b"hello" diff --git a/tests/functional/streaming/test_s3_seekable_io.py b/tests/functional/streaming/test_s3_seekable_io.py index 372a680b31a..30069376e2c 100644 --- a/tests/functional/streaming/test_s3_seekable_io.py +++ b/tests/functional/streaming/test_s3_seekable_io.py @@ -9,18 +9,18 @@ @pytest.fixture -def s3_resource(): - return boto3.resource("s3") +def s3_client(): + return boto3.client("s3") @pytest.fixture -def s3_seekable_obj(s3_resource): - return _S3SeekableIO(bucket="bucket", key="key", boto3_s3_resource=s3_resource) +def s3_seekable_obj(s3_client): + return _S3SeekableIO(bucket="bucket", key="key", boto3_s3_client=s3_client) @pytest.fixture -def s3_resource_stub(s3_resource): - s3_stub = stub.Stubber(s3_resource.meta.client) +def s3_client_stub(s3_client): + s3_stub = stub.Stubber(s3_client) s3_stub.activate() return s3_stub @@ -52,21 +52,21 @@ def test_seek_cur_changes_position(s3_seekable_obj): assert s3_seekable_obj.tell() == 300 -def test_seek_end(s3_seekable_obj, s3_resource_stub): - s3_resource_stub.add_response("head_object", {"ContentLength": 1000}) +def test_seek_end(s3_seekable_obj, s3_client_stub): + s3_client_stub.add_response("head_object", {"ContentLength": 1000}) assert s3_seekable_obj.seek(0, io.SEEK_END) == 1000 assert s3_seekable_obj.tell() == 1000 -def test_size(s3_seekable_obj, s3_resource_stub): - s3_resource_stub.add_response("head_object", {"ContentLength": 1000}) +def test_size(s3_seekable_obj, s3_client_stub): + s3_client_stub.add_response("head_object", {"ContentLength": 1000}) assert s3_seekable_obj.size == 1000 -def 
test_raw_stream_fetches_with_range_header(s3_seekable_obj, s3_resource_stub): - s3_resource_stub.add_response( +def test_raw_stream_fetches_with_range_header(s3_seekable_obj, s3_client_stub): + s3_client_stub.add_response( "get_object", {"Body": ""}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, @@ -75,10 +75,10 @@ def test_raw_stream_fetches_with_range_header(s3_seekable_obj, s3_resource_stub) assert s3_seekable_obj.raw_stream is not None -def test_raw_stream_fetches_with_range_header_after_seek(s3_seekable_obj, s3_resource_stub): +def test_raw_stream_fetches_with_range_header_after_seek(s3_seekable_obj, s3_client_stub): s3_seekable_obj.seek(100, io.SEEK_SET) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": ""}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=100-"}, @@ -87,11 +87,11 @@ def test_raw_stream_fetches_with_range_header_after_seek(s3_seekable_obj, s3_res assert s3_seekable_obj.raw_stream is not None -def test_read(s3_seekable_obj, s3_resource_stub): +def test_read(s3_seekable_obj, s3_client_stub): payload = b"hello world" streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": streaming_body}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, @@ -103,11 +103,11 @@ def test_read(s3_seekable_obj, s3_resource_stub): assert s3_seekable_obj.tell() == len(payload) -def test_readline(s3_seekable_obj, s3_resource_stub): +def test_readline(s3_seekable_obj, s3_client_stub): payload = b"hello world\nworld hello" streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": streaming_body}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, @@ -118,11 +118,11 @@ def test_readline(s3_seekable_obj, s3_resource_stub): assert s3_seekable_obj.tell() == len(payload) -def test_readlines(s3_seekable_obj, s3_resource_stub): +def test_readlines(s3_seekable_obj, s3_client_stub): payload = b"hello world\nworld hello" streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": streaming_body}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, @@ -132,11 +132,11 @@ def test_readlines(s3_seekable_obj, s3_resource_stub): assert s3_seekable_obj.tell() == len(payload) -def test_closed(s3_seekable_obj, s3_resource_stub): +def test_closed(s3_seekable_obj, s3_client_stub): payload = b"test" streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": streaming_body}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, @@ -146,11 +146,11 @@ def test_closed(s3_seekable_obj, s3_resource_stub): assert s3_seekable_obj.closed is True -def test_next(s3_seekable_obj, s3_resource_stub): +def test_next(s3_seekable_obj, s3_client_stub): payload = b"test" streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": streaming_body}, {"Bucket": 
s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, @@ -161,11 +161,11 @@ def test_next(s3_seekable_obj, s3_resource_stub): next(s3_seekable_obj) -def test_context_manager(s3_seekable_obj, s3_resource_stub): +def test_context_manager(s3_seekable_obj, s3_client_stub): payload = b"test" streaming_body = PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload)) - s3_resource_stub.add_response( + s3_client_stub.add_response( "get_object", {"Body": streaming_body}, {"Bucket": s3_seekable_obj.bucket, "Key": s3_seekable_obj.key, "Range": "bytes=0-"}, From dbdef1e75d1290dbeb605b3665efe5a194daf43e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 15:18:08 +0100 Subject: [PATCH 19/43] fix: replace Union with | --- .../utilities/streaming/s3_object.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/s3_object.py b/aws_lambda_powertools/utilities/streaming/s3_object.py index ac2c3f6d864..bea4949ef06 100644 --- a/aws_lambda_powertools/utilities/streaming/s3_object.py +++ b/aws_lambda_powertools/utilities/streaming/s3_object.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import io from typing import ( IO, @@ -7,7 +9,6 @@ List, Optional, Sequence, - Union, cast, overload, ) @@ -30,9 +31,9 @@ class S3Object(IO[bytes]): """ - Seekable streamable S3 Object reader. + Seekable and streamable S3 Object reader. - S3Object implements the IO[bytes], backed by a seekable s3 streaming. + S3Object implements the IO[bytes], backed by a seekable S3 streaming. Parameters ---------- @@ -111,23 +112,21 @@ def transformed_stream(self) -> IO[bytes]: return self._transformed_stream @overload - def transform( - self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[True] - ) -> T: + def transform(self, transformations: BaseTransform[T] | Sequence[BaseTransform[T]], in_place: Literal[True]) -> T: pass @overload def transform( - self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Literal[False] + self, transformations: BaseTransform[T] | Sequence[BaseTransform[T]], in_place: Literal[False] ) -> None: pass @overload - def transform(self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]]) -> T: + def transform(self, transformations: BaseTransform[T] | Sequence[BaseTransform[T]]) -> T: pass def transform( - self, transformations: Union[BaseTransform[T], Sequence[BaseTransform[T]]], in_place: Optional[bool] = False + self, transformations: BaseTransform[T] | Sequence[BaseTransform[T]], in_place: Optional[bool] = False ) -> Optional[T]: """ Applies one or more data transformations to the stream. 
From 0f43797048dbc3518a1a8a668c498b625d9f0c05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 15:23:08 +0100 Subject: [PATCH 20/43] fix: add descriptions on NotImplementedErrors --- .../utilities/streaming/_s3_seekable_io.py | 10 +++++----- aws_lambda_powertools/utilities/streaming/s3_object.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index 99619851e07..4612d20db3c 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -165,19 +165,19 @@ def close(self) -> None: self._closed = True def fileno(self) -> int: - raise NotImplementedError() + raise NotImplementedError("this stream is not backed by a file descriptor") def flush(self) -> None: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") def isatty(self) -> bool: return False def truncate(self, size: Optional[int] = 0) -> int: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") def write(self, data: AnyStr) -> int: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") def writelines(self, lines: Iterable[AnyStr]) -> None: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") diff --git a/aws_lambda_powertools/utilities/streaming/s3_object.py b/aws_lambda_powertools/utilities/streaming/s3_object.py index bea4949ef06..19b0db92ea3 100644 --- a/aws_lambda_powertools/utilities/streaming/s3_object.py +++ b/aws_lambda_powertools/utilities/streaming/s3_object.py @@ -214,19 +214,19 @@ def __iter__(self): return self.transformed_stream.__iter__() def fileno(self) -> int: - raise NotImplementedError() + raise NotImplementedError("this stream is not backed by a file descriptor") def flush(self) -> None: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") def isatty(self) -> bool: return False def truncate(self, size: Optional[int] = 0) -> int: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") def write(self, data: AnyStr) -> int: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") def writelines(self, lines: Iterable[AnyStr]) -> None: - raise NotImplementedError() + raise NotImplementedError("this stream is not writable") From c42395aaee1c65fb3517bc75e26381ddf8aebea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 15:39:29 +0100 Subject: [PATCH 21/43] chore(docs): added programming documentation to the data transformations --- .../streaming/transformations/csv.py | 40 +++++++++++++++++++ .../streaming/transformations/gzip.py | 19 +++++++++ .../streaming/transformations/zip.py | 35 ++++++++++++++++ 3 files changed, 94 insertions(+) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/csv.py b/aws_lambda_powertools/utilities/streaming/transformations/csv.py index e2ad66a7a51..8bad3c37b4f 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/csv.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/csv.py @@ -7,6 +7,46 @@ class CsvTransform(BaseTransform): + """ + CSV data transform. 
+ + Returns a csv.DictReader that reads data from the input stream: + https://docs.python.org/3/library/csv.html#csv.DictReader + + Example + ------- + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> csv_reader = s3object.transform(CsvTransform()) + >>> for row in csv_reader: + >>> print(row) + + Since the underlying stream of bytes needs to be converted into a stream of characters (Iterator[str]), + we wrap the input into a io.TextIOWrapper. This means you have control over the text encoding + and line termination options. + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> csv_reader = s3object.transform(CsvTransform(encoding="utf-8", newline="\\r\\n")) + >>> for row in csv_reader: + >>> print(row) + + Additional options passed on the constructor, will be pased to the csv.DictReader constructor. + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> csv_reader = s3object.transform(CsvTransform(dialect="excel")) + >>> for row in csv_reader: + >>> print(row) + """ + def transform(self, input_stream: IO[bytes]) -> DictReader: encoding = self.kwargs.pop("encoding", "utf-8") newline = self.kwargs.pop("newline", None) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py index 859c80b83c9..db859e308af 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py @@ -5,5 +5,24 @@ class GzipTransform(BaseTransform): + """ + Gzip data transform. + + Returns a gzip.GzipFile instead that reads data from the input stream: + https://docs.python.org/3/library/gzip.html#gzip.GzipFile + + Example + ------- + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import GzipTransform + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> reader = s3object.transform(GzipTransform()) + >>> for line in reader: + >>> print(line) + + """ + def transform(self, input_stream: IO[bytes]) -> GzipFile: return GzipFile(fileobj=input_stream, mode="rb", **self.kwargs) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/zip.py b/aws_lambda_powertools/utilities/streaming/transformations/zip.py index 9e8b52e8be0..525e0f95c50 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/zip.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/zip.py @@ -5,5 +5,40 @@ class ZipTransform(BaseTransform): + """ + Zip data transform. + + Returns a zip.ZipFile that reads data from the input stream: + https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile + + Currently, it's not possible to pipe the zip file stream into another data transformation, + since a Zip file contains multiple files, and not a single stream. 
+ + Example + ------- + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> zip_reader = s3object.transform(ZipTransform()) + >>> for file in zip_reader.namelist(): + >>> print(file) + >>> zip_reader.extract(file) + + Additional options passed on the constructor, will be pased to the csv.DictReader constructor. + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform + >>> import zipfile + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> zip_reader = s3object.transform(ZipTransform(compression=zipfile.ZIP_LZMA)) + >>> for file in zip_reader.namelist(): + >>> print(file) + >>> zip_reader.extract(file) + + """ + def transform(self, input_stream: IO[bytes]) -> ZipFile: return ZipFile(input_stream, mode="r", **self.kwargs) From 47b8345e74bc0391517ac0ab80b7043869b3e3f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 15:45:42 +0100 Subject: [PATCH 22/43] fix: typo --- tests/functional/streaming/test_s3_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/streaming/test_s3_object.py b/tests/functional/streaming/test_s3_object.py index 90a4c6f3398..8c5f5591ca0 100644 --- a/tests/functional/streaming/test_s3_object.py +++ b/tests/functional/streaming/test_s3_object.py @@ -39,7 +39,7 @@ def test_s3_transform(): assert type(new_obj) is GzipFile -def test_s3_transform_in_palce(): +def test_s3_transform_in_place(): obj = S3Object(bucket="bucket", key="key") new_obj = obj.transform(GzipTransform(), in_place=True) From 4c1ddd806c60be57ae8524f1ffdbfbb2ac403b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Mon, 21 Nov 2022 16:48:22 +0100 Subject: [PATCH 23/43] chore: add versioned bucket tests --- .../streaming/handlers/s3_object_handler.py | 64 +++++++++------- tests/e2e/streaming/test_s3_object.py | 75 +++++++++++++++++++ tests/functional/streaming/test_s3_object.py | 3 +- 3 files changed, 114 insertions(+), 28 deletions(-) diff --git a/tests/e2e/streaming/handlers/s3_object_handler.py b/tests/e2e/streaming/handlers/s3_object_handler.py index 95ed7b443dc..d0bcc7830a2 100644 --- a/tests/e2e/streaming/handlers/s3_object_handler.py +++ b/tests/e2e/streaming/handlers/s3_object_handler.py @@ -1,5 +1,7 @@ import zipfile +import botocore.exceptions + from aws_lambda_powertools.utilities.streaming import S3Object from aws_lambda_powertools.utilities.streaming.transformations import ( CsvTransform, @@ -40,33 +42,41 @@ def lambda_handler(event, context): transform_zip_lzma = event.get("transform_zip_lzma", False) transform_in_place = event.get("transform_in_place", False) - obj = S3Object(bucket=bucket, key=key, version_id=version_id, gunzip=gunzip, csv=csv) - response = {"size": obj.size} - - transformations = [] - if transform_gzip: - transformations.append(GzipTransform()) - if transform_zip: - transformations.append(ZipTransform()) - if transform_csv: - transformations.append(CsvTransform()) - if transform_zip_lzma: - transformations.append(ZipTransform(compression=zipfile.ZIP_LZMA)) - - if len(transformations) > 0: - if transform_in_place: - obj.transform(transformations, in_place=True) + response = {} + + try: + obj = S3Object(bucket=bucket, key=key, version_id=version_id, gunzip=gunzip, csv=csv) + response["size"] = 
obj.size + + transformations = [] + if transform_gzip: + transformations.append(GzipTransform()) + if transform_zip: + transformations.append(ZipTransform()) + if transform_csv: + transformations.append(CsvTransform()) + if transform_zip_lzma: + transformations.append(ZipTransform(compression=zipfile.ZIP_LZMA)) + + if len(transformations) > 0: + if transform_in_place: + obj.transform(transformations, in_place=True) + else: + obj = obj.transform(transformations) + + if transform_zip or transform_zip_lzma: + response["manifest"] = obj.namelist() + response["body"] = obj.read(obj.namelist()[1]).rstrip() # extracts the second file on the zip + elif transform_csv or csv: + response["body"] = obj.__next__() + elif transform_gzip or gunzip: + response["body"] = obj.readline().rstrip() + else: + response["body"] = obj.readline().rstrip() + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + response["error"] = "Not found" else: - obj = obj.transform(transformations) - - if transform_zip or transform_zip_lzma: - response["manifest"] = obj.namelist() - response["body"] = obj.read(obj.namelist()[1]).rstrip() # extracts the second file on the zip - elif transform_csv or csv: - response["body"] = obj.__next__() - elif transform_gzip or gunzip: - response["body"] = obj.readline().rstrip() - else: - response["body"] = obj.readline().rstrip() + raise return response diff --git a/tests/e2e/streaming/test_s3_object.py b/tests/e2e/streaming/test_s3_object.py index a13a4e6c0fc..3020d86337e 100644 --- a/tests/e2e/streaming/test_s3_object.py +++ b/tests/e2e/streaming/test_s3_object.py @@ -1,5 +1,6 @@ import json +import boto3 import pytest from tests.e2e.utils import data_fetcher @@ -10,11 +11,30 @@ def regular_bucket_name(infrastructure: dict) -> str: return infrastructure.get("RegularBucket", "") +@pytest.fixture +def versioned_bucket_name(infrastructure: dict) -> str: + return infrastructure.get("VersionedBucket", "") + + @pytest.fixture def s3_object_handler_fn_arn(infrastructure: dict) -> str: return infrastructure.get("S3ObjectHandler", "") +def get_object_version(bucket, key) -> str: + s3 = boto3.client("s3") + versions = s3.list_object_versions(Bucket=bucket) + + for version in versions["Versions"]: + version_id = version["VersionId"] + version_key = version["Key"] + + if version_key == key: + return version_id + + raise ValueError(f"Cannot find versioned {key} inside {bucket}") + + def get_lambda_result_payload(s3_object_handler_fn_arn: str, payload: dict) -> dict: handler_result, _ = data_fetcher.get_lambda_response( lambda_arn=s3_object_handler_fn_arn, payload=json.dumps(payload) @@ -30,12 +50,42 @@ def test_s3_object_size(s3_object_handler_fn_arn, regular_bucket_name): assert result.get("body") == "hello world" +def test_s3_versioned_object_size(s3_object_handler_fn_arn, versioned_bucket_name): + key = "plain.txt" + payload = { + "bucket": versioned_bucket_name, + "key": key, + "version_id": get_object_version(versioned_bucket_name, key), + } + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("size") == 12 + assert result.get("body") == "hello world" + + +def test_s3_object_non_existent(s3_object_handler_fn_arn, regular_bucket_name): + payload = {"bucket": regular_bucket_name, "key": "NOTEXISTENT.txt"} + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("error") == "Not found" + + def test_s3_object_csv_constructor(s3_object_handler_fn_arn, regular_bucket_name): payload = {"bucket": 
regular_bucket_name, "key": "csv.txt", "csv": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} +def test_s3_versioned_object_csv_constructor(s3_object_handler_fn_arn, versioned_bucket_name): + key = "csv.txt" + payload = { + "bucket": versioned_bucket_name, + "key": key, + "version_id": get_object_version(versioned_bucket_name, key), + "csv": True, + } + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == {"name": "hello", "value": "world"} + + def test_s3_object_csv_transform(s3_object_handler_fn_arn, regular_bucket_name): payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) @@ -54,12 +104,37 @@ def test_s3_object_csv_gzip_constructor(s3_object_handler_fn_arn, regular_bucket assert result.get("body") == {"name": "hello", "value": "world"} +def test_s3_versioned_object_csv_gzip_constructor(s3_object_handler_fn_arn, versioned_bucket_name): + key = "csv.txt.gz" + payload = { + "bucket": versioned_bucket_name, + "key": key, + "version_id": get_object_version(versioned_bucket_name, key), + "csv": True, + "gunzip": True, + } + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == {"name": "hello", "value": "world"} + + def test_s3_object_gzip_constructor(s3_object_handler_fn_arn, regular_bucket_name): payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "gunzip": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == "hello world" +def test_s3_versioned_object_gzip_constructor(s3_object_handler_fn_arn, versioned_bucket_name): + key = "plain.txt.gz" + payload = { + "bucket": versioned_bucket_name, + "key": key, + "version_id": get_object_version(versioned_bucket_name, key), + "gunzip": True, + } + result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) + assert result.get("body") == "hello world" + + def test_s3_object_gzip_transform(s3_object_handler_fn_arn, regular_bucket_name): payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "transform_gzip": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) diff --git a/tests/functional/streaming/test_s3_object.py b/tests/functional/streaming/test_s3_object.py index 8c5f5591ca0..c74f9c8fe71 100644 --- a/tests/functional/streaming/test_s3_object.py +++ b/tests/functional/streaming/test_s3_object.py @@ -62,5 +62,6 @@ def test_s3_transform_after_read(): assert obj.read(5) == b"hello" # THEN it raises ValueError - with pytest.raises(ValueError): + with pytest.raises(ValueError) as exc: obj.transform(GzipTransform()) + assert str(exc.value) == "Cannot add transformations to a read object. 
Already read 5 bytes" From e405c92e9adae7545be3556226687c0994bc22d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Tue, 22 Nov 2022 11:04:43 +0100 Subject: [PATCH 24/43] fix: monkey patch botocore response correctly --- .../utilities/streaming/_s3_seekable_io.py | 13 +++---- .../utilities/streaming/compat.py | 35 +++++++++++++++++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index 4612d20db3c..1c7595cb556 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -2,14 +2,15 @@ import logging from typing import IO, TYPE_CHECKING, Any, AnyStr, Dict, Iterable, List, Optional -from botocore import response +import boto3 +from botocore import endpoint -from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody +from aws_lambda_powertools.utilities.streaming.compat import ( + PowertoolsStreamingBody, + convert_to_response_dict, +) -response.StreamingBody = PowertoolsStreamingBody - -# We need to import boto3 after monkey patching StreamingBody, otherwise the patch is not applied -import boto3 # noqa: E402 +endpoint.convert_to_response_dict = convert_to_response_dict if TYPE_CHECKING: from mypy_boto3_s3 import Client diff --git a/aws_lambda_powertools/utilities/streaming/compat.py b/aws_lambda_powertools/utilities/streaming/compat.py index 89ef6e8094c..8d29c825a43 100644 --- a/aws_lambda_powertools/utilities/streaming/compat.py +++ b/aws_lambda_powertools/utilities/streaming/compat.py @@ -155,3 +155,38 @@ def tell(self): def close(self): """Close the underlying http response stream.""" self._raw_stream.close() + + +def convert_to_response_dict(http_response, operation_model): + """Convert an HTTP response object to a request dict. + + This converts the requests library's HTTP response object to + a dictionary. + + :type http_response: botocore.vendored.requests.model.Response + :param http_response: The HTTP response from an AWS service request. 
+ + :rtype: dict + :return: A response dictionary which will contain the following keys: + * headers (dict) + * status_code (int) + * body (string or file-like object) + + """ + response_dict = { + "headers": http_response.headers, + "status_code": http_response.status_code, + "context": { + "operation_name": operation_model.name, + }, + } + if response_dict["status_code"] >= 300: + response_dict["body"] = http_response.content + elif operation_model.has_event_stream_output: + response_dict["body"] = http_response.raw + elif operation_model.has_streaming_output: + length = response_dict["headers"].get("content-length") + response_dict["body"] = PowertoolsStreamingBody(http_response.raw, length) + else: + response_dict["body"] = http_response.content + return response_dict From 95f70364570ed1e68891ba207483320c4406d7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Tue, 22 Nov 2022 11:53:47 +0100 Subject: [PATCH 25/43] fix: only patch StreamingBody if botocore is bellow 1.29.13 --- .../utilities/streaming/_s3_seekable_io.py | 8 +- .../utilities/streaming/compat.py | 358 +++++++++--------- 2 files changed, 185 insertions(+), 181 deletions(-) diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index 1c7595cb556..41b1eca80ab 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -3,14 +3,8 @@ from typing import IO, TYPE_CHECKING, Any, AnyStr, Dict, Iterable, List, Optional import boto3 -from botocore import endpoint -from aws_lambda_powertools.utilities.streaming.compat import ( - PowertoolsStreamingBody, - convert_to_response_dict, -) - -endpoint.convert_to_response_dict = convert_to_response_dict +from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody if TYPE_CHECKING: from mypy_boto3_s3 import Client diff --git a/aws_lambda_powertools/utilities/streaming/compat.py b/aws_lambda_powertools/utilities/streaming/compat.py index 8d29c825a43..63c8a98ff97 100644 --- a/aws_lambda_powertools/utilities/streaming/compat.py +++ b/aws_lambda_powertools/utilities/streaming/compat.py @@ -1,7 +1,18 @@ +""" +Currently, the same as https://github.com/boto/botocore/blob/b9c540905a6c9/botocore/response.py +We created this because the version of StreamingBody included with the Lambda Runtime is too old, and +doesn't support many of the standard IO methods (like readline). + +As soon as the version of botocore included with the Lambda runtime is equal or greater than 1.29.13, we can drop +this file completely. See https://docs.aws.amazon.com/lambda/latest/dg/lambda-python.html. +""" import logging +from distutils.version import StrictVersion from io import IOBase from typing import Optional +import botocore +from botocore import endpoint from botocore.compat import set_socket_timeout from botocore.exceptions import ( IncompleteReadError, @@ -13,180 +24,179 @@ logger = logging.getLogger(__name__) +if StrictVersion(botocore.__version__) < StrictVersion("1.29.13"): # noqa: C901 -class PowertoolsStreamingBody(IOBase): - """Wrapper class for a HTTP response body. - - # noqa: E501 - Currently, the same as https://github.com/boto/botocore/blob/b9c540905a6c9bf7184d251407555577bf1fa026/botocore/response.py - We created this because the version of StreamingBody included with the Lambda Runtime is too old, and - doesn't support many of the standard IO methods (like readline). 
- - As soon as the version of botocore included with the Lambda runtime is equal or greater than 1.29.13, we can drop - this file completely. See https://docs.aws.amazon.com/lambda/latest/dg/lambda-python.html. - - This provides a few additional conveniences that do not exist - in the urllib3 model: - * Set the timeout on the socket (i.e read() timeouts) - * Auto validation of content length, if the amount of bytes - we read does not match the content length, an exception - is raised. - """ - - _DEFAULT_CHUNK_SIZE = 1024 - - def __init__(self, raw_stream, content_length): - self._raw_stream = raw_stream - self._content_length = content_length - self._amount_read = 0 - - def __del__(self): - # Extending destructor in order to preserve the underlying raw_stream. - # The ability to add custom cleanup logic introduced in Python3.4+. - # https://www.python.org/dev/peps/pep-0442/ - pass - - def set_socket_timeout(self, timeout): - """Set the timeout seconds on the socket.""" - # The problem we're trying to solve is to prevent .read() calls from - # hanging. This can happen in rare cases. What we'd like to ideally - # do is set a timeout on the .read() call so that callers can retry - # the request. - # Unfortunately, this isn't currently possible in requests. - # See: https://github.com/kennethreitz/requests/issues/1803 - # So what we're going to do is reach into the guts of the stream and - # grab the socket object, which we can set the timeout on. We're - # putting in a check here so in case this interface goes away, we'll - # know. - try: - set_socket_timeout(self._raw_stream, timeout) - except AttributeError: - logger.error( - "Cannot access the socket object of " - "a streaming response. It's possible " - "the interface has changed.", - exc_info=True, - ) - raise - - def readable(self): - try: - return self._raw_stream.readable() - except AttributeError: - return False - - def read(self, amt=None): - """Read at most amt bytes from the stream. - If the amt argument is omitted, read all data. - """ - try: - chunk = self._raw_stream.read(amt) - except URLLib3ReadTimeoutError as e: - raise ReadTimeoutError(endpoint_url=e.url, error=e) - except URLLib3ProtocolError as e: - raise ResponseStreamingError(error=e) - self._amount_read += len(chunk) - if amt is None or (not chunk and amt > 0): - # If the server sends empty contents or - # we ask to read all of the contents, then we know - # we need to verify the content length. - self._verify_content_length() - return chunk - - def readlines(self, hint: Optional[int] = -1): - return self._raw_stream.readlines(hint) - - def __iter__(self): - """Return an iterator to yield 1k chunks from the raw stream.""" - return self.iter_chunks(self._DEFAULT_CHUNK_SIZE) - - def __next__(self): - """Return the next 1k chunk from the raw stream.""" - current_chunk = self.read(self._DEFAULT_CHUNK_SIZE) - if current_chunk: - return current_chunk - raise StopIteration() - - def __enter__(self): - return self._raw_stream - - def __exit__(self, *args): - self._raw_stream.close() - - next = __next__ # noqa: A003, VNE003 - - def iter_lines(self, chunk_size=_DEFAULT_CHUNK_SIZE, keepends=False): - """Return an iterator to yield lines from the raw stream. - This is achieved by reading chunk of bytes (of size chunk_size) at a - time from the raw stream, and then yielding lines from there. + class PowertoolsStreamingBody(IOBase): + """Wrapper class for a HTTP response body. 
+ + This provides a few additional conveniences that do not exist + in the urllib3 model: + * Set the timeout on the socket (i.e read() timeouts) + * Auto validation of content length, if the amount of bytes + we read does not match the content length, an exception + is raised. """ - pending = b"" - for chunk in self.iter_chunks(chunk_size): - lines = (pending + chunk).splitlines(True) - for line in lines[:-1]: - yield line.splitlines(keepends)[0] - pending = lines[-1] - if pending: - yield pending.splitlines(keepends)[0] - - def iter_chunks(self, chunk_size=_DEFAULT_CHUNK_SIZE): - """Return an iterator to yield chunks of chunk_size bytes from the raw - stream. + + _DEFAULT_CHUNK_SIZE = 1024 + + def __init__(self, raw_stream, content_length): + self._raw_stream = raw_stream + self._content_length = content_length + self._amount_read = 0 + + def __del__(self): + # Extending destructor in order to preserve the underlying raw_stream. + # The ability to add custom cleanup logic introduced in Python3.4+. + # https://www.python.org/dev/peps/pep-0442/ + pass + + def set_socket_timeout(self, timeout): + """Set the timeout seconds on the socket.""" + # The problem we're trying to solve is to prevent .read() calls from + # hanging. This can happen in rare cases. What we'd like to ideally + # do is set a timeout on the .read() call so that callers can retry + # the request. + # Unfortunately, this isn't currently possible in requests. + # See: https://github.com/kennethreitz/requests/issues/1803 + # So what we're going to do is reach into the guts of the stream and + # grab the socket object, which we can set the timeout on. We're + # putting in a check here so in case this interface goes away, we'll + # know. + try: + set_socket_timeout(self._raw_stream, timeout) + except AttributeError: + logger.error( + "Cannot access the socket object of " + "a streaming response. It's possible " + "the interface has changed.", + exc_info=True, + ) + raise + + def readable(self): + try: + return self._raw_stream.readable() + except AttributeError: + return False + + def read(self, amt=None): + """Read at most amt bytes from the stream. + If the amt argument is omitted, read all data. + """ + try: + chunk = self._raw_stream.read(amt) + except URLLib3ReadTimeoutError as e: + raise ReadTimeoutError(endpoint_url=e.url, error=e) + except URLLib3ProtocolError as e: + raise ResponseStreamingError(error=e) + self._amount_read += len(chunk) + if amt is None or (not chunk and amt > 0): + # If the server sends empty contents or + # we ask to read all of the contents, then we know + # we need to verify the content length. + self._verify_content_length() + return chunk + + def readlines(self, hint: Optional[int] = -1): + return self._raw_stream.readlines(hint) + + def __iter__(self): + """Return an iterator to yield 1k chunks from the raw stream.""" + return self.iter_chunks(self._DEFAULT_CHUNK_SIZE) + + def __next__(self): + """Return the next 1k chunk from the raw stream.""" + current_chunk = self.read(self._DEFAULT_CHUNK_SIZE) + if current_chunk: + return current_chunk + raise StopIteration() + + def __enter__(self): + return self._raw_stream + + def __exit__(self, *args): + self._raw_stream.close() + + next = __next__ # noqa: A003, VNE003 + + def iter_lines(self, chunk_size=_DEFAULT_CHUNK_SIZE, keepends=False): + """Return an iterator to yield lines from the raw stream. + This is achieved by reading chunk of bytes (of size chunk_size) at a + time from the raw stream, and then yielding lines from there. 
+ """ + pending = b"" + for chunk in self.iter_chunks(chunk_size): + lines = (pending + chunk).splitlines(True) + for line in lines[:-1]: + yield line.splitlines(keepends)[0] + pending = lines[-1] + if pending: + yield pending.splitlines(keepends)[0] + + def iter_chunks(self, chunk_size=_DEFAULT_CHUNK_SIZE): + """Return an iterator to yield chunks of chunk_size bytes from the raw + stream. + """ + while True: + current_chunk = self.read(chunk_size) + if current_chunk == b"": + break + yield current_chunk + + def _verify_content_length(self): + # See: https://github.com/kennethreitz/requests/issues/1855 + # Basically, our http library doesn't do this for us, so we have + # to do this ourself. + if self._content_length is not None and self._amount_read != int(self._content_length): + raise IncompleteReadError( + actual_bytes=self._amount_read, + expected_bytes=int(self._content_length), + ) + + def tell(self): + return self._raw_stream.tell() + + def close(self): + """Close the underlying http response stream.""" + self._raw_stream.close() + + def convert_to_response_dict(http_response, operation_model): + """Convert an HTTP response object to a request dict. + + This converts the requests library's HTTP response object to + a dictionary. + + :type http_response: botocore.vendored.requests.model.Response + :param http_response: The HTTP response from an AWS service request. + + :rtype: dict + :return: A response dictionary which will contain the following keys: + * headers (dict) + * status_code (int) + * body (string or file-like object) + """ - while True: - current_chunk = self.read(chunk_size) - if current_chunk == b"": - break - yield current_chunk - - def _verify_content_length(self): - # See: https://github.com/kennethreitz/requests/issues/1855 - # Basically, our http library doesn't do this for us, so we have - # to do this ourself. - if self._content_length is not None and self._amount_read != int(self._content_length): - raise IncompleteReadError( - actual_bytes=self._amount_read, - expected_bytes=int(self._content_length), - ) - - def tell(self): - return self._raw_stream.tell() - - def close(self): - """Close the underlying http response stream.""" - self._raw_stream.close() - - -def convert_to_response_dict(http_response, operation_model): - """Convert an HTTP response object to a request dict. - - This converts the requests library's HTTP response object to - a dictionary. - - :type http_response: botocore.vendored.requests.model.Response - :param http_response: The HTTP response from an AWS service request. 
- - :rtype: dict - :return: A response dictionary which will contain the following keys: - * headers (dict) - * status_code (int) - * body (string or file-like object) - - """ - response_dict = { - "headers": http_response.headers, - "status_code": http_response.status_code, - "context": { - "operation_name": operation_model.name, - }, - } - if response_dict["status_code"] >= 300: - response_dict["body"] = http_response.content - elif operation_model.has_event_stream_output: - response_dict["body"] = http_response.raw - elif operation_model.has_streaming_output: - length = response_dict["headers"].get("content-length") - response_dict["body"] = PowertoolsStreamingBody(http_response.raw, length) - else: - response_dict["body"] = http_response.content - return response_dict + response_dict = { + "headers": http_response.headers, + "status_code": http_response.status_code, + "context": { + "operation_name": operation_model.name, + }, + } + if response_dict["status_code"] >= 300: + response_dict["body"] = http_response.content + elif operation_model.has_event_stream_output: + response_dict["body"] = http_response.raw + elif operation_model.has_streaming_output: + length = response_dict["headers"].get("content-length") + response_dict["body"] = PowertoolsStreamingBody(http_response.raw, length) + else: + response_dict["body"] = http_response.content + return response_dict + + endpoint.convert_to_response_dict = convert_to_response_dict +else: + from botocore.response import StreamingBody + + # Expose PowertoolsStreamingBody as StreamingBody + vars()["PowertoolsStreamingBody"] = StreamingBody From fa4dec25376845b7de03c9e1331aae7ede3b2b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Tue, 22 Nov 2022 13:56:51 +0100 Subject: [PATCH 26/43] chore: improve docs --- docs/utilities/streaming.md | 64 +++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 038eef10f7d..d52d2caddfd 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -8,8 +8,8 @@ The streaming utility handles streaming data from AWS for processing data sets b ## Key Features * Simple interface to stream data from S3, even when the data is larger than memory -* Read your S3 file using the patterns you already know to deal with files in Python -* Includes common transformations to data stored in S3, like Gzip and CSV deserialization +* Read your S3 file using the patterns you already know when dealing with files in Python +* Includes common transformations to data stored in S3, like gunzip and CSV deserialization * Build your own data transformation and add it to the pipeline ## Background @@ -17,7 +17,7 @@ The streaming utility handles streaming data from AWS for processing data sets b Processing S3 files inside your Lambda function presents challenges when the file is bigger than the allocated amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, CSV, zip files, etc). -This utility makes it easy to process data coming from S3 files, while applying data transformations transparently +This utility makes it easy to process data coming from S3 files, while transparently applying data transformations to the data stream. 
## Getting started @@ -42,22 +42,24 @@ The code above will stream the contents from S3 as fast as possible, using minim ### Data transformations -The utility has some built-in data transformations to help deal with common scenarios while streaming data from S3. +The utility has some built-in data transformations to help dealing with common scenarios while streaming data from S3. -| Name | Description | -|----------|--------------------------------------------------------------------------------------------------| -| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | -| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | -| **CSV** | Parses each line as a CSV object, returning dictionary objects | +| Name | Description | Class name | +|----------|--------------------------------------------------------------------------------------------------|---------------| +| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | GzipTransform | +| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | ZipTransform | +| **CSV** | Parses each line as a CSV object, returning dictionary objects | CsvTransform | Common options like gunzipping a stream and parsing data as CSV can be enabled directly on the constructor: -```python hl_lines="8" ---8<-- "examples/streaming/src/s3_transform_common.py" -``` +=== "Enabling inflation of gzip data" -Additionally, you can return a new object that encapsulates the transformation, or transform the data in place, -Multiple transformations are applied in order. + ```python hl_lines="8" + --8<-- "examples/streaming/src/s3_transform_common.py" + ``` + +Additionally, you can return a new object that encapsulates a transformation, or transform the data in place, by calling +the `transform` method. Multiple transformations are applied in order. === "Returning a new object" @@ -77,23 +79,27 @@ Multiple transformations are applied in order. Each data transformation class accepts additional options to customize the transformation. 
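The table that follows maps each transformation to the constructor those options are forwarded to. As a rough illustration of the idea (bucket, key, and the tab-separated layout are assumptions for this sketch), a gzipped TSV file could be parsed by forwarding `delimiter` to the underlying `csv.DictReader`:

```python
from aws_lambda_powertools.utilities.streaming import S3Object
from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform, GzipTransform

# Placeholder object: a gzip-compressed, tab-separated file.
s3 = S3Object(bucket="example-bucket", key="report.tsv.gz")

# Transformations are applied in order: gunzip first, then parse each line as TSV.
rows = s3.transform([GzipTransform(), CsvTransform(delimiter="\t")])
for row in rows:
    print(row)  # each row is a dict keyed by the header line
```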
-| Name | Description | -|----------|----------------------------------------------------------------------------------------------------------------| -| **Gzip** | All the options from the [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | -| **Zip** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | -| **CSV** | All the options from the [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | +| Name | Description | +|-------------------|----------------------------------------------------------------------------------------------------------------| +| **GzipTransform** | All the options from the [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | +| **ZipTransform** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | +| **CsvTransform** | All the options from the [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | For instance, if you want to unzip an S3 file compressed using `LZMA` you could pass that option in the constructor: -```python hl_lines="12" ---8<-- "examples/streaming/src/s3_transform_lzma.py" -``` +=== "Unzipping LZMA data" + + ```python hl_lines="12" + --8<-- "examples/streaming/src/s3_transform_lzma.py" + ``` Or, if you want to load a `TSV` file, you can just change the delimiter on the `CSV` transform: -```python hl_lines="12" ---8<-- "examples/streaming/src/s3_transform_tsv.py" -``` +=== "Loading TSV data" + + ```python hl_lines="11" + --8<-- "examples/streaming/src/s3_transform_tsv.py" + ``` ### Building your own data transformation @@ -101,9 +107,11 @@ You can build your own custom data transformation by extending the `BaseTransfor The `transform` method receives an `IO[bytes]` object, and you are responsible for returning an object that is also a `IO[bytes]`. 
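As a sketch of that contract (`Bz2Transform` is a made-up name, not something the utility ships), a bzip2 transformation can mirror how the built-in gzip transformation wraps the input stream:

```python
from bz2 import BZ2File
from typing import IO

from aws_lambda_powertools.utilities.streaming import S3Object
from aws_lambda_powertools.utilities.streaming.transformations.base import BaseTransform


class Bz2Transform(BaseTransform):
    """Hypothetical transformation that decompresses a bzip2-compressed stream."""

    def transform(self, input_stream: IO[bytes]) -> BZ2File:
        # BZ2File pulls bytes from the wrapped stream lazily, so nothing is fully buffered.
        return BZ2File(input_stream, mode="rb")


# Placeholder object: any bzip2-compressed text file in S3.
s3 = S3Object(bucket="example-bucket", key="data.txt.bz2")
for line in s3.transform(Bz2Transform()):
    print(line)
```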
-```python hl_lines="10 12 27-29" ---8<-- "examples/streaming/src/s3_json_transform.py" -``` +=== "Custom JSON transform" + + ```python hl_lines="10 12 27-29" + --8<-- "examples/streaming/src/s3_json_transform.py" + ``` ## Testing your code From 3bfbb5c6769dd1654d94590d832c02949a3b8365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 23 Nov 2022 15:01:26 +0100 Subject: [PATCH 27/43] chore: apply review feedback --- .../utilities/streaming/_s3_seekable_io.py | 42 +++++++------ .../utilities/streaming/compat.py | 1 + .../utilities/streaming/s3_object.py | 62 +++++++++++++++---- .../streaming/transformations/base.py | 10 ++- .../streaming/transformations/csv.py | 8 +-- .../streaming/transformations/gzip.py | 2 +- .../streaming/transformations/zip.py | 20 ++++-- docs/utilities/streaming.md | 12 +++- examples/streaming/src/s3_transform_common.py | 2 +- .../streaming/src/s3_transform_zipfile.py | 8 +++ .../streaming/handlers/s3_object_handler.py | 10 +-- tests/e2e/streaming/test_s3_object.py | 22 +++---- tests/functional/streaming/test_s3_object.py | 8 +-- .../streaming/test_s3_seekable_io.py | 2 +- 14 files changed, 142 insertions(+), 67 deletions(-) create mode 100644 examples/streaming/src/s3_transform_zipfile.py diff --git a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py index 41b1eca80ab..54d8459b659 100644 --- a/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py +++ b/aws_lambda_powertools/utilities/streaming/_s3_seekable_io.py @@ -1,6 +1,6 @@ import io import logging -from typing import IO, TYPE_CHECKING, Any, AnyStr, Dict, Iterable, List, Optional +from typing import IO, TYPE_CHECKING, AnyStr, Iterable, List, Optional import boto3 @@ -25,16 +25,14 @@ class _S3SeekableIO(IO[bytes]): The S3 key version_id: str, optional A version ID of the object, when the S3 bucket is versioned - boto3_s3_client: boto3 S3 Client, optional + boto3_client: boto3 S3 Client, optional An optional boto3 S3 client. If missing, a new one will be created. 
+ sdk_options: dict, optional + Dictionary of options that will be passed to the S3 Client get_object API call """ def __init__( - self, - bucket: str, - key: str, - version_id: Optional[str] = None, - boto3_s3_client=Optional["Client"], + self, bucket: str, key: str, version_id: Optional[str] = None, boto3_client=Optional["Client"], **sdk_options ): self.bucket = bucket self.key = key @@ -48,13 +46,15 @@ def __init__( # Caches the size of the object self._size: Optional[int] = None - self._s3_kwargs: Dict[str, Any] = {"Bucket": bucket, "Key": key} - if version_id is not None: - self._s3_kwargs["VersionId"] = version_id - - self._s3_client: Optional["Client"] = boto3_s3_client + self._s3_client: Optional["Client"] = boto3_client self._raw_stream: Optional[PowertoolsStreamingBody] = None + self._sdk_options = sdk_options + self._sdk_options["Bucket"] = bucket + self._sdk_options["Key"] = key + if version_id is not None: + self._sdk_options["VersionId"] = version_id + @property def s3_client(self) -> "Client": """ @@ -70,7 +70,8 @@ def size(self) -> int: Retrieves the size of the S3 object """ if self._size is None: - self._size = self.s3_client.head_object(**self._s3_kwargs).get("ContentLength", 0) + logger.debug("Getting size of S3 object") + self._size = self.s3_client.head_object(**self._sdk_options).get("ContentLength", 0) return self._size @property @@ -79,9 +80,9 @@ def raw_stream(self) -> PowertoolsStreamingBody: Returns the boto3 StreamingBody, starting the stream from the seeked position. """ if self._raw_stream is None: - range_header = "bytes=%d-" % self._position - logging.debug(f"Starting new stream at {range_header}...") - self._raw_stream = self.s3_client.get_object(**self._s3_kwargs, Range=range_header).get("Body") + range_header = f"bytes={self._position}-" + logger.debug(f"Starting new stream at {range_header}") + self._raw_stream = self.s3_client.get_object(Range=range_header, **self._sdk_options).get("Body") self._closed = False return self._raw_stream @@ -101,8 +102,13 @@ def seek(self, offset: int, whence: int = io.SEEK_SET) -> int: else: raise ValueError(f"invalid whence ({whence}, should be {io.SEEK_SET}, {io.SEEK_CUR}, {io.SEEK_END})") - # If we changed the position in the stream, we should invalidate the existing stream - # and open a new one on the next read + # Invalidate the existing stream, so a new one will be open on the next IO operation. + # + # Some consumers of this class might call seek multiple times, without affecting the net position. + # zipfile.ZipFile does this often. If we just blindly invalidated the stream, we would have to re-open + # an S3 HTTP connection just to continue reading on the same position as before, which would be inefficient. 
+ # + # So we only invalidate it if there's a net position change after seeking, and we have an existing S3 connection if current_position != self._position and self._raw_stream is not None: self._raw_stream.close() self._raw_stream = None diff --git a/aws_lambda_powertools/utilities/streaming/compat.py b/aws_lambda_powertools/utilities/streaming/compat.py index 63c8a98ff97..383edb8b70a 100644 --- a/aws_lambda_powertools/utilities/streaming/compat.py +++ b/aws_lambda_powertools/utilities/streaming/compat.py @@ -194,6 +194,7 @@ def convert_to_response_dict(http_response, operation_model): response_dict["body"] = http_response.content return response_dict + # monkey patch boto3 endpoint.convert_to_response_dict = convert_to_response_dict else: from botocore.response import StreamingBody diff --git a/aws_lambda_powertools/utilities/streaming/s3_object.py b/aws_lambda_powertools/utilities/streaming/s3_object.py index 19b0db92ea3..b64d5d38161 100644 --- a/aws_lambda_powertools/utilities/streaming/s3_object.py +++ b/aws_lambda_powertools/utilities/streaming/s3_object.py @@ -29,6 +29,7 @@ from mypy_boto3_s3 import Client +# Maintenance: almost all this logic should be moved to a base class class S3Object(IO[bytes]): """ Seekable and streamable S3 Object reader. @@ -43,24 +44,25 @@ class S3Object(IO[bytes]): The S3 key version_id: str, optional A version ID of the object, when the S3 bucket is versioned - boto3_s3_client: S3Client, optional + boto3_client: S3Client, optional An optional boto3 S3 client. If missing, a new one will be created. - gunzip: bool, optional + is_gzip: bool, optional Enables the Gunzip data transformation - csv: bool, optional + is_csv: bool, optional Enables the CSV data transformation + sdk_options: dict, optional + Dictionary of options that will be passed to the S3 Client get_object API call Example ------- - ** Reads a line from an S3, loading as little data as necessary + **Reads a line from an S3, loading as little data as necessary:** >>> from aws_lambda_powertools.utilities.streaming import S3Object >>> >>> line: bytes = S3Object(bucket="bucket", key="key").readline() >>> >>> print(line) - """ def __init__( @@ -68,22 +70,25 @@ def __init__( bucket: str, key: str, version_id: Optional[str] = None, - boto3_s3_client: Optional["Client"] = None, - gunzip: Optional[bool] = False, - csv: Optional[bool] = False, + boto3_client: Optional["Client"] = None, + is_gzip: Optional[bool] = False, + is_csv: Optional[bool] = False, + **sdk_options, ): self.bucket = bucket self.key = key self.version_id = version_id # The underlying seekable IO, where all the magic happens - self.raw_stream = _S3SeekableIO(bucket=bucket, key=key, version_id=version_id, boto3_s3_client=boto3_s3_client) + self.raw_stream = _S3SeekableIO( + bucket=bucket, key=key, version_id=version_id, boto3_client=boto3_client, **sdk_options + ) # Stores the list of data transformations self._data_transformations: List[BaseTransform] = [] - if gunzip: + if is_gzip: self._data_transformations.append(GzipTransform()) - if csv: + if is_csv: self._data_transformations.append(CsvTransform()) # Stores the cached transformed stream @@ -102,8 +107,16 @@ def transformed_stream(self) -> IO[bytes]: Returns a IO[bytes] stream with all the data transformations applied in order """ if self._transformed_stream is None: - # Apply all the transformations + # Create a stream which is the result of applying all the data transformations + + # To start with, our transformed stream is the same as our raw seekable stream. 
+ # This means that if there are no data transformations to be applied, IO is just + # delegated directly to the raw_stream. transformed_stream = self.raw_stream + + # Now we apply each transformation in order + # e.g: when self._data_transformations is [transform_1, transform_2], then + # transformed_stream is the equivalent of doing transform_2(transform_1(...(raw_stream))) for transformation in self._data_transformations: transformed_stream = transformation.transform(transformed_stream) @@ -143,6 +156,12 @@ def transform( T[bound=IO[bytes]], optional If in_place is False, returns an IO[bytes] object representing the transformed stream """ + # Once we start reading the stream, we should not change the data transformation. + # This would be a programming error: + # + # >>> s3object.transform(GzipTransform(), in_place=True) + # >>> s3object.readline() + # >>> s3object.transform(CsvTransform(), in_place=True) if self.tell() != 0: raise ValueError(f"Cannot add transformations to a read object. Already read {self.tell()} bytes") @@ -150,6 +169,15 @@ def transform( if not isinstance(transformations, Sequence): transformations = [transformations] + # Scenario 1: user wants to transform the stream in place. + # In this case, we store the transformations and invalidate any existing transformed stream. + # This way, the transformed_stream is re-created on the next IO operation. + # This can happen when the user calls .transform multiple times before they start reading data + # + # >>> s3object.transform(GzipTransform(), in_place=True) + # >>> s3object.seek(0, io.SEEK_SET) <- this creates a transformed stream + # >>> s3object.transform(CsvTransform(), in_place=True) <- need to re-create transformed stream + # >>> s3object.read... if in_place: self._data_transformations.extend(transformations) @@ -192,9 +220,17 @@ def __exit__(self, *args): self.close() def close(self): + # Scenario 1: S3Object = SeekableIO, because there are no data transformations applied + # In this scenario, we can only close the raw_stream. If we tried to also close the transformed_stream we would + # get an error, since they are the same object, and we can't close the same stream twice. self.raw_stream.close() - # Also close transformed stream if there are any transformations + # Scenario 2: S3Object -> [Transformations] -> SeekableIO, because there are data transformations applied + # In this scenario, we also need to close the transformed_stream if it exists. The reason we test for + # existence is that the user might want to close the object without reading data from it. Example: + # + # >>> s3object = S3Object(...., is_gzip=True) + # >>> s3object.close() <- transformed_stream doesn't exist yet at this point if self.raw_stream != self._transformed_stream and self._transformed_stream is not None: self._transformed_stream.close() diff --git a/aws_lambda_powertools/utilities/streaming/transformations/base.py b/aws_lambda_powertools/utilities/streaming/transformations/base.py index 06d9aaa02aa..9eb20e2c622 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/base.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/base.py @@ -7,11 +7,15 @@ class BaseTransform(Generic[T]): """ BaseTransform is the base class all data transformations need to implement. + + Parameters + ---------- + transform_options: dict, optional + Dictionary of options that can be passed to the underlying transformation to customize the behavior. 
""" - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs + def __init__(self, **transform_options): + self.transform_options = transform_options @abstractmethod def transform(self, input_stream: IO[bytes]) -> T: diff --git a/aws_lambda_powertools/utilities/streaming/transformations/csv.py b/aws_lambda_powertools/utilities/streaming/transformations/csv.py index 8bad3c37b4f..c8654ac4db2 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/csv.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/csv.py @@ -25,7 +25,7 @@ class CsvTransform(BaseTransform): >>> print(row) Since the underlying stream of bytes needs to be converted into a stream of characters (Iterator[str]), - we wrap the input into a io.TextIOWrapper. This means you have control over the text encoding + we wrap the input into an io.TextIOWrapper. This means you have control over the text encoding and line termination options. >>> from aws_lambda_powertools.utilities.streaming import S3Object @@ -48,9 +48,9 @@ class CsvTransform(BaseTransform): """ def transform(self, input_stream: IO[bytes]) -> DictReader: - encoding = self.kwargs.pop("encoding", "utf-8") - newline = self.kwargs.pop("newline", None) + encoding = self.transform_options.pop("encoding", "utf-8") + newline = self.transform_options.pop("newline", None) # csv module needs an Iterator[str], so we wrap the underlying stream into a TextIO iterator = io.TextIOWrapper(input_stream, encoding=encoding, newline=newline) - return csv.DictReader(iterator, *self.args, **self.kwargs) + return csv.DictReader(iterator, **self.transform_options) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py index db859e308af..83e22e1e408 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/gzip.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/gzip.py @@ -25,4 +25,4 @@ class GzipTransform(BaseTransform): """ def transform(self, input_stream: IO[bytes]) -> GzipFile: - return GzipFile(fileobj=input_stream, mode="rb", **self.kwargs) + return GzipFile(fileobj=input_stream, mode="rb", **self.transform_options) diff --git a/aws_lambda_powertools/utilities/streaming/transformations/zip.py b/aws_lambda_powertools/utilities/streaming/transformations/zip.py index 525e0f95c50..ffce75ee250 100644 --- a/aws_lambda_powertools/utilities/streaming/transformations/zip.py +++ b/aws_lambda_powertools/utilities/streaming/transformations/zip.py @@ -11,9 +11,6 @@ class ZipTransform(BaseTransform): Returns a zip.ZipFile that reads data from the input stream: https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile - Currently, it's not possible to pipe the zip file stream into another data transformation, - since a Zip file contains multiple files, and not a single stream. - Example ------- @@ -26,7 +23,7 @@ class ZipTransform(BaseTransform): >>> print(file) >>> zip_reader.extract(file) - Additional options passed on the constructor, will be pased to the csv.DictReader constructor. + Additional options passed on the constructor, will be pased to the is_csv.DictReader constructor. 
>>> from aws_lambda_powertools.utilities.streaming import S3Object >>> from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform @@ -38,7 +35,20 @@ class ZipTransform(BaseTransform): >>> print(file) >>> zip_reader.extract(file) + Currently, it's not possible to pipe the Zip file stream into another data transformation, + since a Zip file contains multiple files, and not a single stream. However, you can still + open a specific file as a stream, reading only the necessary bytes to extract it: + + >>> from aws_lambda_powertools.utilities.streaming import S3Object + >>> from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform + >>> import zipfile + >>> + >>> s3object = S3Object(bucket="bucket", key="key") + >>> zip_reader = s3object.transform(ZipTransform()) + >>> with zip_reader.open("filename.txt") as f: + >>> for line in f: + >>> print(line) """ def transform(self, input_stream: IO[bytes]) -> ZipFile: - return ZipFile(input_stream, mode="r", **self.kwargs) + return ZipFile(input_stream, mode="r", **self.transform_options) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index d52d2caddfd..9e7e4a55f89 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -50,7 +50,7 @@ The utility has some built-in data transformations to help dealing with common s | **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | ZipTransform | | **CSV** | Parses each line as a CSV object, returning dictionary objects | CsvTransform | -Common options like gunzipping a stream and parsing data as CSV can be enabled directly on the constructor: +Common options like processing a gzipped stream or parsing data as CSV can be enabled directly on the constructor: === "Enabling inflation of gzip data" @@ -73,6 +73,16 @@ the `transform` method. Multiple transformations are applied in order. --8<-- "examples/streaming/src/s3_transform_in_place.py" ``` +???+ note "Handling ZIP files with ZipTransformation" + + Currently, it's not possible to pipe the `ZipTransformation` into another data transformation, + since a Zip file contains multiple files, and not a single stream. 
However, you can still + open a specific file as a stream, reading only the necessary bytes to extract it: + + ```python hl_lines="6" + --8<-- "examples/streaming/src/s3_transform_zipfile.py" + ``` + ## Advanced ### Custom options for data transformations diff --git a/examples/streaming/src/s3_transform_common.py b/examples/streaming/src/s3_transform_common.py index e5ea7239ecd..a63ed737a52 100644 --- a/examples/streaming/src/s3_transform_common.py +++ b/examples/streaming/src/s3_transform_common.py @@ -5,6 +5,6 @@ def lambda_handler(event: Dict[str, str], context: LambdaContext): - s3 = S3Object(bucket=event["bucket"], key=event["key"], gunzip=True) + s3 = S3Object(bucket=event["bucket"], key=event["key"], is_gzip=True) for line in s3: print(line) diff --git a/examples/streaming/src/s3_transform_zipfile.py b/examples/streaming/src/s3_transform_zipfile.py new file mode 100644 index 00000000000..276be7333a3 --- /dev/null +++ b/examples/streaming/src/s3_transform_zipfile.py @@ -0,0 +1,8 @@ +from aws_lambda_powertools.utilities.streaming import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import ZipTransform + +s3object = S3Object(bucket="bucket", key="key") +zip_reader = s3object.transform(ZipTransform()) +with zip_reader.open("filename.txt") as f: + for line in f: + print(line) diff --git a/tests/e2e/streaming/handlers/s3_object_handler.py b/tests/e2e/streaming/handlers/s3_object_handler.py index d0bcc7830a2..98bda22c2bb 100644 --- a/tests/e2e/streaming/handlers/s3_object_handler.py +++ b/tests/e2e/streaming/handlers/s3_object_handler.py @@ -16,8 +16,8 @@ key: str version_id: str, optional -gunzip: bool, optional -csv: bool, optional +is_gzip: bool, optional +is_csv: bool, optional transform_gunzip: bool, optional transform_csv: bool, optional @@ -33,8 +33,8 @@ def lambda_handler(event, context): key = event.get("key") version_id = event.get("version_id", None) - gunzip = event.get("gunzip", False) - csv = event.get("csv", False) + gunzip = event.get("is_gzip", False) + csv = event.get("is_csv", False) transform_gzip = event.get("transform_gzip", False) transform_csv = event.get("transform_csv", False) @@ -45,7 +45,7 @@ def lambda_handler(event, context): response = {} try: - obj = S3Object(bucket=bucket, key=key, version_id=version_id, gunzip=gunzip, csv=csv) + obj = S3Object(bucket=bucket, key=key, version_id=version_id, is_gzip=gunzip, is_csv=csv) response["size"] = obj.size transformations = [] diff --git a/tests/e2e/streaming/test_s3_object.py b/tests/e2e/streaming/test_s3_object.py index 3020d86337e..f0879ac67bd 100644 --- a/tests/e2e/streaming/test_s3_object.py +++ b/tests/e2e/streaming/test_s3_object.py @@ -69,56 +69,56 @@ def test_s3_object_non_existent(s3_object_handler_fn_arn, regular_bucket_name): def test_s3_object_csv_constructor(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "csv.txt", "csv": True} + payload = {"bucket": regular_bucket_name, "key": "is_csv.txt", "is_csv": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_versioned_object_csv_constructor(s3_object_handler_fn_arn, versioned_bucket_name): - key = "csv.txt" + key = "is_csv.txt" payload = { "bucket": versioned_bucket_name, "key": key, "version_id": get_object_version(versioned_bucket_name, key), - "csv": True, + "is_csv": True, } result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == 
{"name": "hello", "value": "world"} def test_s3_object_csv_transform(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True} + payload = {"bucket": regular_bucket_name, "key": "is_csv.txt", "transform_csv": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_object_csv_transform_in_place(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True, "in_place": True} + payload = {"bucket": regular_bucket_name, "key": "is_csv.txt", "transform_csv": True, "in_place": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_object_csv_gzip_constructor(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "csv.txt.gz", "csv": True, "gunzip": True} + payload = {"bucket": regular_bucket_name, "key": "is_csv.txt.gz", "is_csv": True, "is_gzip": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_versioned_object_csv_gzip_constructor(s3_object_handler_fn_arn, versioned_bucket_name): - key = "csv.txt.gz" + key = "is_csv.txt.gz" payload = { "bucket": versioned_bucket_name, "key": key, "version_id": get_object_version(versioned_bucket_name, key), - "csv": True, - "gunzip": True, + "is_csv": True, + "is_gzip": True, } result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_object_gzip_constructor(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "gunzip": True} + payload = {"bucket": regular_bucket_name, "key": "plain.txt.gz", "is_gzip": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == "hello world" @@ -129,7 +129,7 @@ def test_s3_versioned_object_gzip_constructor(s3_object_handler_fn_arn, versione "bucket": versioned_bucket_name, "key": key, "version_id": get_object_version(versioned_bucket_name, key), - "gunzip": True, + "is_gzip": True, } result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == "hello world" diff --git a/tests/functional/streaming/test_s3_object.py b/tests/functional/streaming/test_s3_object.py index c74f9c8fe71..7514ab11545 100644 --- a/tests/functional/streaming/test_s3_object.py +++ b/tests/functional/streaming/test_s3_object.py @@ -18,17 +18,17 @@ def test_s3_basic_stream(): def test_s3_gzip_stream(): - obj = S3Object(bucket="bucket", key="key", gunzip=True) + obj = S3Object(bucket="bucket", key="key", is_gzip=True) assert type(obj.transformed_stream) is GzipFile def test_s3_csv_stream(): - obj = S3Object(bucket="bucket", key="key", csv=True) + obj = S3Object(bucket="bucket", key="key", is_csv=True) assert type(obj.transformed_stream) is DictReader def test_s3_gzip_csv_stream(): - obj = S3Object(bucket="bucket", key="key", gunzip=True, csv=True) + obj = S3Object(bucket="bucket", key="key", is_gzip=True, is_csv=True) assert type(obj.transformed_stream) is DictReader @@ -56,7 +56,7 @@ def test_s3_transform_after_read(): "get_object", {"Body": StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} ) s3_stub.activate() - obj = 
S3Object(bucket="bucket", key="key", boto3_s3_client=s3_client) + obj = S3Object(bucket="bucket", key="key", boto3_client=s3_client) # WHEN you read some part of the object and then apply a transformation assert obj.read(5) == b"hello" diff --git a/tests/functional/streaming/test_s3_seekable_io.py b/tests/functional/streaming/test_s3_seekable_io.py index 30069376e2c..5cf1b0d9ab3 100644 --- a/tests/functional/streaming/test_s3_seekable_io.py +++ b/tests/functional/streaming/test_s3_seekable_io.py @@ -15,7 +15,7 @@ def s3_client(): @pytest.fixture def s3_seekable_obj(s3_client): - return _S3SeekableIO(bucket="bucket", key="key", boto3_s3_client=s3_client) + return _S3SeekableIO(bucket="bucket", key="key", boto3_client=s3_client) @pytest.fixture From 758b50f5b98801be23a2c4f69d6971c4f6111c57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 23 Nov 2022 15:02:46 +0100 Subject: [PATCH 28/43] fix: typos --- tests/e2e/streaming/test_s3_object.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/e2e/streaming/test_s3_object.py b/tests/e2e/streaming/test_s3_object.py index f0879ac67bd..1e2fe1a0222 100644 --- a/tests/e2e/streaming/test_s3_object.py +++ b/tests/e2e/streaming/test_s3_object.py @@ -69,13 +69,13 @@ def test_s3_object_non_existent(s3_object_handler_fn_arn, regular_bucket_name): def test_s3_object_csv_constructor(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "is_csv.txt", "is_csv": True} + payload = {"bucket": regular_bucket_name, "key": "csv.txt", "is_csv": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_versioned_object_csv_constructor(s3_object_handler_fn_arn, versioned_bucket_name): - key = "is_csv.txt" + key = "csv.txt" payload = { "bucket": versioned_bucket_name, "key": key, @@ -87,25 +87,25 @@ def test_s3_versioned_object_csv_constructor(s3_object_handler_fn_arn, versioned def test_s3_object_csv_transform(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "is_csv.txt", "transform_csv": True} + payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_object_csv_transform_in_place(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "is_csv.txt", "transform_csv": True, "in_place": True} + payload = {"bucket": regular_bucket_name, "key": "csv.txt", "transform_csv": True, "in_place": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_object_csv_gzip_constructor(s3_object_handler_fn_arn, regular_bucket_name): - payload = {"bucket": regular_bucket_name, "key": "is_csv.txt.gz", "is_csv": True, "is_gzip": True} + payload = {"bucket": regular_bucket_name, "key": "csv.txt.gz", "is_csv": True, "is_gzip": True} result = get_lambda_result_payload(s3_object_handler_fn_arn, payload) assert result.get("body") == {"name": "hello", "value": "world"} def test_s3_versioned_object_csv_gzip_constructor(s3_object_handler_fn_arn, versioned_bucket_name): - key = "is_csv.txt.gz" + key = "csv.txt.gz" payload = { "bucket": versioned_bucket_name, "key": key, From f232810ef6b4edff0abafd48eca110b966df9734 Mon Sep 17 00:00:00 2001 From: 
heitorlessa Date: Wed, 23 Nov 2022 16:18:23 +0100 Subject: [PATCH 29/43] docs(streaming): copywriting for intro and key features --- docs/utilities/streaming.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 9e7e4a55f89..7e96e658b5c 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -3,13 +3,12 @@ title: Streaming description: Utility --- -The streaming utility handles streaming data from AWS for processing data sets bigger than the available memory. +The streaming utility handles datasets larger than the available memory as streaming data. ## Key Features -* Simple interface to stream data from S3, even when the data is larger than memory -* Read your S3 file using the patterns you already know when dealing with files in Python -* Includes common transformations to data stored in S3, like gunzip and CSV deserialization +* Stream Amazon S3 objects with a file-like interface with minimal memory consumption +* Built-in popular data transformations to decompress and deserialize (gzip, CSV, and ZIP) * Build your own data transformation and add it to the pipeline ## Background @@ -45,7 +44,7 @@ The code above will stream the contents from S3 as fast as possible, using minim The utility has some built-in data transformations to help dealing with common scenarios while streaming data from S3. | Name | Description | Class name | -|----------|--------------------------------------------------------------------------------------------------|---------------| +| -------- | ------------------------------------------------------------------------------------------------ | ------------- | | **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | GzipTransform | | **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | ZipTransform | | **CSV** | Parses each line as a CSV object, returning dictionary objects | CsvTransform | @@ -90,7 +89,7 @@ the `transform` method. Multiple transformations are applied in order. Each data transformation class accepts additional options to customize the transformation. 
| Name | Description | -|-------------------|----------------------------------------------------------------------------------------------------------------| +| ----------------- | -------------------------------------------------------------------------------------------------------------- | | **GzipTransform** | All the options from the [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | | **ZipTransform** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | | **CsvTransform** | All the options from the [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | From fb2909f66fdbe404082269c4b54dba7f11530223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 23 Nov 2022 17:12:48 +0100 Subject: [PATCH 30/43] chore: pushed an example of seeking a CSV file --- .../utilities/streaming/s3_object.py | 9 ------- examples/streaming/src/s3_csv_stream_seek.py | 25 +++++++++++++++++++ 2 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 examples/streaming/src/s3_csv_stream_seek.py diff --git a/aws_lambda_powertools/utilities/streaming/s3_object.py b/aws_lambda_powertools/utilities/streaming/s3_object.py index b64d5d38161..c61d352037a 100644 --- a/aws_lambda_powertools/utilities/streaming/s3_object.py +++ b/aws_lambda_powertools/utilities/streaming/s3_object.py @@ -156,15 +156,6 @@ def transform( T[bound=IO[bytes]], optional If in_place is False, returns an IO[bytes] object representing the transformed stream """ - # Once we start reading the stream, we should not change the data transformation. - # This would be a programming error: - # - # >>> s3object.transform(GzipTransform(), in_place=True) - # >>> s3object.readline() - # >>> s3object.transform(CsvTransform(), in_place=True) - if self.tell() != 0: - raise ValueError(f"Cannot add transformations to a read object. 
Already read {self.tell()} bytes") - # Make transformations always be a sequence to make mypy happy if not isinstance(transformations, Sequence): transformations = [transformations] diff --git a/examples/streaming/src/s3_csv_stream_seek.py b/examples/streaming/src/s3_csv_stream_seek.py new file mode 100644 index 00000000000..5d740e069f2 --- /dev/null +++ b/examples/streaming/src/s3_csv_stream_seek.py @@ -0,0 +1,25 @@ +import io +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform +from aws_lambda_powertools.utilities.typing import LambdaContext + +""" +Assuming the CSV files contains rows after the header always has 8 bytes + 1 byte newline: + +21.3,5,+ +23.4,4,+ +21.3,0,- +""" + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + s3 = S3Object(bucket=event["bucket"], key=event["key"]) + + # Jump 100 lines of 9 bytes each (8 bytes of data + 1 byte newline) + s3.seek(100 * 9, io.SEEK_SET) + + s3.transform(CsvTransform(), in_place=True) + for obj in s3: + print(obj) From f31459743be65af44f57db16e0a243343d20fd16 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Wed, 23 Nov 2022 17:24:22 +0100 Subject: [PATCH 31/43] docs(streaming): copywriting background; split up data transformations --- docs/utilities/streaming.md | 44 +++++++++++-------- examples/streaming/src/s3_transform_common.py | 2 +- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 7e96e658b5c..174519a7fe4 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -13,17 +13,17 @@ The streaming utility handles datasets larger than the available memory as strea ## Background -Processing S3 files inside your Lambda function presents challenges when the file is bigger than the allocated -amount of memory. Your data may also be stored using a set of encapsulation layers (gzip, CSV, zip files, etc). +Within Lambda, processing S3 objects larger than the allocated amount of memory can lead to out of memory or timeout situations. For cost efficiency, your S3 objects may be encoded and compressed in various formats (_gzip, CSV, zip files, etc_), increasing the amount of non-business logic and reliability risks. -This utility makes it easy to process data coming from S3 files, while transparently applying data transformations -to the data stream. +Streaming utility makes this process easier by fetching parts of your data as you consume it, and transparently applying data transformations to the data stream. This allows you to process one, a few, or all rows of your large dataset while consuming a few MBs only. ## Getting started ### Streaming from a S3 object -To stream an S3 file, you need the bucket name, the key and optionally a version ID. +With `S3Object`, you'll need the bucket, object key, and optionally a version ID to stream its content. + +We will fetch parts of your data from S3 as you process each line, consuming only the absolute minimal amount of memory. === "Non-versioned bucket" @@ -37,28 +37,24 @@ To stream an S3 file, you need the bucket name, the key and optionally a version --8<-- "examples/streaming/src/s3_basic_stream_with_version.py" ``` -The code above will stream the contents from S3 as fast as possible, using minimal memory. - ### Data transformations -The utility has some built-in data transformations to help dealing with common scenarios while streaming data from S3. +!!! 
tip "Think of data transformations like a data processing pipeline - apply one or more in order." -| Name | Description | Class name | -| -------- | ------------------------------------------------------------------------------------------------ | ------------- | -| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | GzipTransform | -| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | ZipTransform | -| **CSV** | Parses each line as a CSV object, returning dictionary objects | CsvTransform | +As data is streamed, you can apply transformations to your data like decompressing gzip content and deserializing a CSV into a dictionary. -Common options like processing a gzipped stream or parsing data as CSV can be enabled directly on the constructor: +For popular data transformations like CSV or Gzip, you can quickly enable it at the constructor level: -=== "Enabling inflation of gzip data" +=== "Decompressing and deserializing CSV" ```python hl_lines="8" --8<-- "examples/streaming/src/s3_transform_common.py" ``` -Additionally, you can return a new object that encapsulates a transformation, or transform the data in place, by calling -the `transform` method. Multiple transformations are applied in order. +Alternatively, you can apply transformations later via the `transform` method. By default, it will return the transformed stream you can use to read its contents. If you prefer in-place modifications, use `in_place=True`. + +???+ question "When is this useful?" + In scenarios where you might have a reusable logic to apply common transformations. This might be a function or a class that receives an instance of `S3Object`. === "Returning a new object" @@ -68,6 +64,8 @@ the `transform` method. Multiple transformations are applied in order. === "Transform in-place" + Note that when using `in_place=True`, there is no return (`None`). + ```python hl_lines="13" --8<-- "examples/streaming/src/s3_transform_in_place.py" ``` @@ -82,6 +80,16 @@ the `transform` method. Multiple transformations are applied in order. --8<-- "examples/streaming/src/s3_transform_zipfile.py" ``` +#### Built-in data transformations + +We provide popular built-in transformations that you can apply against your streaming data. 
+ +| Name | Description | Class name | +| -------- | ------------------------------------------------------------------------------------------------ | ------------- | +| **Gzip** | Gunzips the stream of data using the [gzip library](https://docs.python.org/3/library/gzip.html) | GzipTransform | +| **Zip** | Exposes the stream as a [ZipFile object](https://docs.python.org/3/library/zipfile.html) | ZipTransform | +| **CSV** | Parses each CSV line as a CSV object, returning dictionary objects | CsvTransform | + ## Advanced ### Custom options for data transformations @@ -94,7 +102,7 @@ Each data transformation class accepts additional options to customize the trans | **ZipTransform** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | | **CsvTransform** | All the options from the [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | -For instance, if you want to unzip an S3 file compressed using `LZMA` you could pass that option in the constructor: +For instance, if you want to unzip an S3 object compressed using `LZMA` you could pass that option in the constructor: === "Unzipping LZMA data" diff --git a/examples/streaming/src/s3_transform_common.py b/examples/streaming/src/s3_transform_common.py index a63ed737a52..b7cc570f98f 100644 --- a/examples/streaming/src/s3_transform_common.py +++ b/examples/streaming/src/s3_transform_common.py @@ -5,6 +5,6 @@ def lambda_handler(event: Dict[str, str], context: LambdaContext): - s3 = S3Object(bucket=event["bucket"], key=event["key"], is_gzip=True) + s3 = S3Object(bucket=event["bucket"], key=event["key"], is_gzip=True, is_csv=True) for line in s3: print(line) From a54277326005d79b2600dddcfa33fe82df14d4d2 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Wed, 23 Nov 2022 17:38:29 +0100 Subject: [PATCH 32/43] docs(streaming): split up zip files to make limitation more explicit --- docs/utilities/streaming.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 174519a7fe4..36ef5430693 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -70,15 +70,16 @@ Alternatively, you can apply transformations later via the `transform` method. B --8<-- "examples/streaming/src/s3_transform_in_place.py" ``` -???+ note "Handling ZIP files with ZipTransformation" +#### Handling ZIP files - Currently, it's not possible to pipe the `ZipTransformation` into another data transformation, - since a Zip file contains multiple files, and not a single stream. However, you can still - open a specific file as a stream, reading only the necessary bytes to extract it: +!!! warning "`ZipTransform` doesn't support combining other transformations." + This is because a Zip file contains multiple files while transformations apply to a single stream. 
- ```python hl_lines="6" - --8<-- "examples/streaming/src/s3_transform_zipfile.py" - ``` +That said, you can still open a specific file as a stream, reading only the necessary bytes to extract it: + +```python hl_lines="6" title="Reading an individual file in the zip as a stream" +--8<-- "examples/streaming/src/s3_transform_zipfile.py" +``` #### Built-in data transformations From c2a64e81f46dbabdfed32d6169f2ff1ea69b926e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 23 Nov 2022 17:40:59 +0100 Subject: [PATCH 33/43] chore: add testing examples to streaming docs --- docs/utilities/streaming.md | 30 +++++++++++++- .../streaming/src/test_s3_pipeline_result.py | 17 ++++++++ .../src/test_s3_transform_isolated.py | 25 ++++++++++++ .../streaming/src/test_s3_transform_mocked.py | 39 +++++++++++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 examples/streaming/src/test_s3_pipeline_result.py create mode 100644 examples/streaming/src/test_s3_transform_isolated.py create mode 100644 examples/streaming/src/test_s3_transform_mocked.py diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 36ef5430693..c19433779ef 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -133,4 +133,32 @@ a `IO[bytes]`. ## Testing your code -TODO +### Testing that you transformation is applied + +Test that your transformation pipeline is returning the correct object: + +=== "Testing the data pipeline returned object" + + ```python hl_lines="14 17" + --8<-- "examples/streaming/src/test_s3_pipeline_result.py" + ``` + +### Testing that your transformation is working in isolation + +Create an input payload using `io.BytesIO` and assert the response of the transformation: + +=== "Testing transformation in isolation" + + ```python hl_lines="23-25" + --8<-- "examples/streaming/src/test_s3_transform_isolated.py" + ``` + +### Testing that your transformation is working with S3 data + +Use `botocore.stub` to stub the `get_object` response from S3: + +=== "Testing transformation with mocked S3 data" + + ```python hl_lines="32-34 37" + --8<-- "examples/streaming/src/test_s3_transform_mocked.py" + ``` diff --git a/examples/streaming/src/test_s3_pipeline_result.py b/examples/streaming/src/test_s3_pipeline_result.py new file mode 100644 index 00000000000..be157ecfff7 --- /dev/null +++ b/examples/streaming/src/test_s3_pipeline_result.py @@ -0,0 +1,17 @@ +from csv import DictReader + +from aws_lambda_powertools.utilities.streaming import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import ( + CsvTransform, + GzipTransform, +) + + +def test_s3_pipeline_result(): + obj = S3Object(bucket="bucket", key="key") + + # Apply your transformations + obj.transform([GzipTransform(), CsvTransform()], in_place=True) + + # Check the object at the end of the pipeline is a csv.DictReader (and not a gzip.GzipFile) + assert type(obj.transformed_stream) is DictReader diff --git a/examples/streaming/src/test_s3_transform_isolated.py b/examples/streaming/src/test_s3_transform_isolated.py new file mode 100644 index 00000000000..b151eabf031 --- /dev/null +++ b/examples/streaming/src/test_s3_transform_isolated.py @@ -0,0 +1,25 @@ +import io +from typing import IO, Optional + +from aws_lambda_powertools.utilities.streaming.transformations import BaseTransform + + +class UpperIO(io.RawIOBase): + def __init__(self, input_stream: IO[bytes], encoding: str): + self.encoding = encoding + self.input_stream = io.TextIOWrapper(input_stream, 
encoding=encoding) + + def read(self, size: int = -1) -> Optional[bytes]: + data = self.input_stream.read(size) + return data.upper().encode(self.encoding) + + +class UpperTransform(BaseTransform): + def transform(self, input_stream: IO[bytes]) -> UpperIO: + return UpperIO(input_stream=input_stream, encoding="utf-8") + + +def test_s3_pipeline_result(): + stream = io.BytesIO(b"hello world") + stream = UpperTransform().transform(stream) + assert stream.read() == b"HELLO WORLD" diff --git a/examples/streaming/src/test_s3_transform_mocked.py b/examples/streaming/src/test_s3_transform_mocked.py new file mode 100644 index 00000000000..e2f923e7a52 --- /dev/null +++ b/examples/streaming/src/test_s3_transform_mocked.py @@ -0,0 +1,39 @@ +import io +from typing import IO, Optional + +import boto3 +from botocore import stub + +from aws_lambda_powertools.utilities.streaming import S3Object +from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody +from aws_lambda_powertools.utilities.streaming.transformations import BaseTransform + + +class UpperIO(io.RawIOBase): + def __init__(self, input_stream: IO[bytes], encoding: str): + self.encoding = encoding + self.input_stream = io.TextIOWrapper(input_stream, encoding=encoding) + + def read(self, size: int = -1) -> Optional[bytes]: + data = self.input_stream.read(size) + return data.upper().encode(self.encoding) + + +class UpperTransform(BaseTransform): + def transform(self, input_stream: IO[bytes]) -> UpperIO: + return UpperIO(input_stream=input_stream, encoding="utf-8") + + +def test_s3_pipeline_result(): + payload = b"hello world" + + s3_client = boto3.client("s3") + s3_stub = stub.Stubber(s3_client) + s3_stub.add_response( + "get_object", {"Body": PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} + ) + s3_stub.activate() + + obj = S3Object(bucket="bucket", key="key", boto3_client=s3_client) + uobj = obj.transform(UpperTransform()) + assert uobj.read() == b"HELLO WORLD" From 639f38e55ef853882fb039cb8904b3b6b2187aea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BAben=20Fonseca?= Date: Wed, 23 Nov 2022 17:42:52 +0100 Subject: [PATCH 34/43] fix: remove test --- tests/functional/streaming/test_s3_object.py | 27 -------------------- 1 file changed, 27 deletions(-) diff --git a/tests/functional/streaming/test_s3_object.py b/tests/functional/streaming/test_s3_object.py index 7514ab11545..e2b482bb732 100644 --- a/tests/functional/streaming/test_s3_object.py +++ b/tests/functional/streaming/test_s3_object.py @@ -1,12 +1,6 @@ -import io from csv import DictReader from gzip import GzipFile -import boto3 -import pytest -from botocore import stub -from botocore.response import StreamingBody - from aws_lambda_powertools.utilities.streaming import S3Object from aws_lambda_powertools.utilities.streaming._s3_seekable_io import _S3SeekableIO from aws_lambda_powertools.utilities.streaming.transformations import GzipTransform @@ -44,24 +38,3 @@ def test_s3_transform_in_place(): new_obj = obj.transform(GzipTransform(), in_place=True) assert new_obj is None - - -def test_s3_transform_after_read(): - # GIVEN a S3 Object with a "hello world" payload - payload = b"hello world" - - s3_client = boto3.client("s3") - s3_stub = stub.Stubber(s3_client) - s3_stub.add_response( - "get_object", {"Body": StreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} - ) - s3_stub.activate() - obj = S3Object(bucket="bucket", key="key", boto3_client=s3_client) - - # WHEN you read some part of the object and then 
apply a transformation - assert obj.read(5) == b"hello" - - # THEN it raises ValueError - with pytest.raises(ValueError) as exc: - obj.transform(GzipTransform()) - assert str(exc.value) == "Cannot add transformations to a read object. Already read 5 bytes" From f496d41516ecb149e190e5a84c0c0beb0cebeb8d Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Wed, 23 Nov 2022 17:45:58 +0100 Subject: [PATCH 35/43] docs(streaming): copywriting on custom transform options --- docs/utilities/streaming.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index c19433779ef..6ba491455d9 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -95,15 +95,15 @@ We provide popular built-in transformations that you can apply against your stre ### Custom options for data transformations -Each data transformation class accepts additional options to customize the transformation. +We will propagate additional options to the underlying implementation for each transform class. -| Name | Description | -| ----------------- | -------------------------------------------------------------------------------------------------------------- | -| **GzipTransform** | All the options from the [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | -| **ZipTransform** | All the options from the [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | -| **CsvTransform** | All the options from the [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | +| Name | Available options | +| ----------------- | ------------------------------------------------------------------------------------- | +| **GzipTransform** | [GzipFile constructor](https://docs.python.org/3/library/gzip.html#gzip.GzipFile) | +| **ZipTransform** | [ZipFile constructor](https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile) | +| **CsvTransform** | [DictReader constructor](https://docs.python.org/3/library/csv.html#csv.DictReader) | -For instance, if you want to unzip an S3 object compressed using `LZMA` you could pass that option in the constructor: +For instance, take `ZipTransform`. You can use the `compression` parameter if you want to unzip an S3 object compressed with `LZMA`. 
=== "Unzipping LZMA data" @@ -111,9 +111,9 @@ For instance, if you want to unzip an S3 object compressed using `LZMA` you coul --8<-- "examples/streaming/src/s3_transform_lzma.py" ``` -Or, if you want to load a `TSV` file, you can just change the delimiter on the `CSV` transform: +Or, if you want to load a tab-separated file (TSV), you can use the `delimiter` parameter in the `CsvTransform`: -=== "Loading TSV data" +=== "Deserializing tab-separated data values" ```python hl_lines="11" --8<-- "examples/streaming/src/s3_transform_tsv.py" From 7c0cc3ab37a1ba6da4d46ee50a996593b2bc5e5e Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Wed, 23 Nov 2022 17:49:07 +0100 Subject: [PATCH 36/43] chore: line-editing on byot --- docs/utilities/streaming.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 6ba491455d9..16dc7db90d7 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -121,9 +121,7 @@ Or, if you want to load a tab-separated file (TSV), you can use the `delimiter` ### Building your own data transformation -You can build your own custom data transformation by extending the `BaseTransform` class. -The `transform` method receives an `IO[bytes]` object, and you are responsible for returning an object that is also -a `IO[bytes]`. +You can build your own custom data transformation by extending the `BaseTransform` class. The `transform` method receives an `IO[bytes]` object, and you are responsible for returning an `IO[bytes]` object. === "Custom JSON transform" From a8e48c6bbcc00da9e3f629b1502db2fa9f16aff5 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Wed, 23 Nov 2022 18:21:27 +0100 Subject: [PATCH 37/43] docs(streaming): combine testing your code to be more realistic --- docs/utilities/streaming.md | 28 ++++--------- .../streaming/src/assert_transformation.py | 35 +++++++++++++++++ ...ted.py => assert_transformation_module.py} | 6 --- .../streaming/src/test_s3_pipeline_result.py | 17 -------- .../streaming/src/test_s3_transform_mocked.py | 39 ------------------- 5 files changed, 42 insertions(+), 83 deletions(-) create mode 100644 examples/streaming/src/assert_transformation.py rename examples/streaming/src/{test_s3_transform_isolated.py => assert_transformation_module.py} (79%) delete mode 100644 examples/streaming/src/test_s3_pipeline_result.py delete mode 100644 examples/streaming/src/test_s3_transform_mocked.py diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 16dc7db90d7..e2afd0508ae 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -131,32 +131,18 @@ You can build your own custom data transformation by extending the `BaseTransfor ## Testing your code -### Testing that you transformation is applied - -Test that your transformation pipeline is returning the correct object: - -=== "Testing the data pipeline returned object" - - ```python hl_lines="14 17" - --8<-- "examples/streaming/src/test_s3_pipeline_result.py" - ``` - -### Testing that your transformation is working in isolation +### Asserting data transformations Create an input payload using `io.BytesIO` and assert the response of the transformation: -=== "Testing transformation in isolation" +=== "assert_transformation.py" - ```python hl_lines="23-25" - --8<-- "examples/streaming/src/test_s3_transform_isolated.py" + ```python hl_lines="3 13 15 23-28 31-32" + --8<-- "examples/streaming/src/assert_transformation.py" ``` -### Testing that your transformation is working with S3 
data - -Use `botocore.stub` to stub the `get_object` response from S3: - -=== "Testing transformation with mocked S3 data" +=== "assert_transformation_module.py" - ```python hl_lines="32-34 37" - --8<-- "examples/streaming/src/test_s3_transform_mocked.py" + ```python hl_lines="16" + --8<-- "examples/streaming/src/assert_transformation_module.py" ``` diff --git a/examples/streaming/src/assert_transformation.py b/examples/streaming/src/assert_transformation.py new file mode 100644 index 00000000000..fe96509a10c --- /dev/null +++ b/examples/streaming/src/assert_transformation.py @@ -0,0 +1,35 @@ +import io + +import boto3 +from assert_transformation_module import UpperTransform +from botocore import stub + +from aws_lambda_powertools.utilities.streaming import S3Object +from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody + + +def test_upper_transform(): + # GIVEN + data_stream = io.BytesIO(b"hello world") + # WHEN + data_stream = UpperTransform().transform(data_stream) + # THEN + assert data_stream.read() == b"HELLO WORLD" + + +def test_s3_object_with_upper_transform(): + # GIVEN + payload = b"hello world" + s3_client = boto3.client("s3") + s3_stub = stub.Stubber(s3_client) + s3_stub.add_response( + "get_object", {"Body": PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} + ) + s3_stub.activate() + + # WHEN + data_stream = S3Object(bucket="bucket", key="key", boto3_client=s3_client) + data_stream.transform(UpperTransform(), in_place=True) + + # THEN + assert data_stream.read() == b"HELLO WORLD" diff --git a/examples/streaming/src/test_s3_transform_isolated.py b/examples/streaming/src/assert_transformation_module.py similarity index 79% rename from examples/streaming/src/test_s3_transform_isolated.py rename to examples/streaming/src/assert_transformation_module.py index b151eabf031..eac11abd4af 100644 --- a/examples/streaming/src/test_s3_transform_isolated.py +++ b/examples/streaming/src/assert_transformation_module.py @@ -17,9 +17,3 @@ def read(self, size: int = -1) -> Optional[bytes]: class UpperTransform(BaseTransform): def transform(self, input_stream: IO[bytes]) -> UpperIO: return UpperIO(input_stream=input_stream, encoding="utf-8") - - -def test_s3_pipeline_result(): - stream = io.BytesIO(b"hello world") - stream = UpperTransform().transform(stream) - assert stream.read() == b"HELLO WORLD" diff --git a/examples/streaming/src/test_s3_pipeline_result.py b/examples/streaming/src/test_s3_pipeline_result.py deleted file mode 100644 index be157ecfff7..00000000000 --- a/examples/streaming/src/test_s3_pipeline_result.py +++ /dev/null @@ -1,17 +0,0 @@ -from csv import DictReader - -from aws_lambda_powertools.utilities.streaming import S3Object -from aws_lambda_powertools.utilities.streaming.transformations import ( - CsvTransform, - GzipTransform, -) - - -def test_s3_pipeline_result(): - obj = S3Object(bucket="bucket", key="key") - - # Apply your transformations - obj.transform([GzipTransform(), CsvTransform()], in_place=True) - - # Check the object at the end of the pipeline is a csv.DictReader (and not a gzip.GzipFile) - assert type(obj.transformed_stream) is DictReader diff --git a/examples/streaming/src/test_s3_transform_mocked.py b/examples/streaming/src/test_s3_transform_mocked.py deleted file mode 100644 index e2f923e7a52..00000000000 --- a/examples/streaming/src/test_s3_transform_mocked.py +++ /dev/null @@ -1,39 +0,0 @@ -import io -from typing import IO, Optional - -import boto3 -from botocore import stub - -from 
aws_lambda_powertools.utilities.streaming import S3Object -from aws_lambda_powertools.utilities.streaming.compat import PowertoolsStreamingBody -from aws_lambda_powertools.utilities.streaming.transformations import BaseTransform - - -class UpperIO(io.RawIOBase): - def __init__(self, input_stream: IO[bytes], encoding: str): - self.encoding = encoding - self.input_stream = io.TextIOWrapper(input_stream, encoding=encoding) - - def read(self, size: int = -1) -> Optional[bytes]: - data = self.input_stream.read(size) - return data.upper().encode(self.encoding) - - -class UpperTransform(BaseTransform): - def transform(self, input_stream: IO[bytes]) -> UpperIO: - return UpperIO(input_stream=input_stream, encoding="utf-8") - - -def test_s3_pipeline_result(): - payload = b"hello world" - - s3_client = boto3.client("s3") - s3_stub = stub.Stubber(s3_client) - s3_stub.add_response( - "get_object", {"Body": PowertoolsStreamingBody(raw_stream=io.BytesIO(payload), content_length=len(payload))} - ) - s3_stub.activate() - - obj = S3Object(bucket="bucket", key="key", boto3_client=s3_client) - uobj = obj.transform(UpperTransform()) - assert uobj.read() == b"HELLO WORLD" From 6ed058a1a77a89c2ce7e6bf729bfd1619a090d42 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Wed, 23 Nov 2022 18:25:54 +0100 Subject: [PATCH 38/43] docs: add streaming as a new utility in features table --- docs/index.md | 1 + mkdocs.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 3c26992221e..df91cb027d4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -691,6 +691,7 @@ Core utilities such as Tracing, Logging, Metrics, and Event Handler will be avai | [**Parser**](./utilities/parser.md) | Data parsing and deep validation using Pydantic | | [**Idempotency**](./utilities/idempotency.md) | Idempotent Lambda handler | | [**Feature Flags**](./utilities/feature_flags.md) | A simple rule engine to evaluate when one or multiple features should be enabled depending on the input | +| [**Streaming**](./utilities/streaming.md) | Streams datasets larger than the available memory as streaming data. 
| ## Environment variables diff --git a/mkdocs.yml b/mkdocs.yml index e72465f5736..c4fcb0d7c51 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,7 +20,6 @@ nav: - core/event_handler/api_gateway.md - core/event_handler/appsync.md - Utilities: - - utilities/middleware_factory.md - utilities/parameters.md - utilities/batch.md - utilities/typing.md @@ -29,9 +28,10 @@ nav: - utilities/parser.md - utilities/idempotency.md - utilities/feature_flags.md + - utilities/streaming.md + - utilities/middleware_factory.md - utilities/jmespath_functions.md - CloudFormation Custom Resources: https://github.com/aws-cloudformation/custom-resource-helper" target="_blank - - utilities/streaming.md theme: name: material From 60d3d865e61a2861850fc4b591679913ee460804 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Thu, 24 Nov 2022 07:48:02 +0100 Subject: [PATCH 39/43] docs(index): add feature in project homepage --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 37eda06b5b4..49d53747a83 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ A suite of Python utilities for AWS Lambda functions to ease adopting best pract ![hero-image](https://user-images.githubusercontent.com/3340292/198254617-d0fdb672-86a6-4988-8a40-adf437135e0a.png) - ## Features * **[Tracing](https://awslabs.github.io/aws-lambda-powertools-python/latest/core/tracer/)** - Decorators and utilities to trace Lambda function handlers, and both synchronous and asynchronous functions @@ -32,6 +31,7 @@ A suite of Python utilities for AWS Lambda functions to ease adopting best pract * **[Parser](https://awslabs.github.io/aws-lambda-powertools-python/latest/utilities/parser/)** - Data parsing and deep validation using Pydantic * **[Idempotency](https://awslabs.github.io/aws-lambda-powertools-python/latest/utilities/idempotency/)** - Convert your Lambda functions into idempotent operations which are safe to retry * **[Feature Flags](https://awslabs.github.io/aws-lambda-powertools-python/latest/utilities/feature_flags/)** - A simple rule engine to evaluate when one or multiple features should be enabled depending on the input +* **[Streaming](https://awslabs.github.io/aws-lambda-powertools-python/latest/utilities/streaming/)** - Streams datasets larger than the available memory as streaming data. ### Installation From 8586c2a8ec57f591d95967dbdf8f273661677cf0 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Thu, 24 Nov 2022 07:59:37 +0100 Subject: [PATCH 40/43] docs(limitations): add new section to call out known issues --- docs/utilities/streaming.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index e2afd0508ae..b23efdeb40b 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -146,3 +146,11 @@ Create an input payload using `io.BytesIO` and assert the response of the transf ```python hl_lines="16" --8<-- "examples/streaming/src/assert_transformation_module.py" ``` + +## Known limitations + +### AWS X-Ray segment size limit + +We make multiple API calls to S3 as you read chunks from your S3 object. If your function is decorated with [Tracer](./../core/tracer.md), you can easily hit [AWS X-Ray 64K segment size](https://docs.aws.amazon.com/general/latest/gr/xray.html#limits_xray) when processing large files. + +!!! tip "Use tracer decorators in parts where you don't read your `S3Object` instead." 
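One way to follow the tip above, sketched minimally and not part of this patch: leave the code that actually iterates over `S3Object` undecorated, and apply `@tracer.capture_method` only to logic that never touches the stream. The `build_summary` helper and the event fields are illustrative names, and a CSV object is assumed.

```python
from typing import Dict

from aws_lambda_powertools import Tracer
from aws_lambda_powertools.utilities.streaming.s3_object import S3Object
from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform
from aws_lambda_powertools.utilities.typing import LambdaContext

tracer = Tracer()


@tracer.capture_method
def build_summary(row_count: int) -> Dict[str, int]:
    # Traced business logic that never touches the S3 stream
    return {"row_count": row_count}


def lambda_handler(event: Dict[str, str], context: LambdaContext) -> Dict[str, int]:
    # Intentionally left untraced: this is the part that reads the object from S3
    s3 = S3Object(bucket=event["bucket"], key=event["key"])
    s3.transform(CsvTransform(), in_place=True)

    row_count = sum(1 for _ in s3)  # consume the stream without buffering it in memory
    return build_summary(row_count)
```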
From 23f672a165b336877b09d3094fdfc20ea23d6a6d Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Thu, 24 Nov 2022 09:37:36 +0100 Subject: [PATCH 41/43] docs(streaming): add reading ahead and backwards section --- docs/utilities/streaming.md | 36 +++++++++++++++++++ examples/streaming/src/non_uniform_sample.csv | 4 +++ .../src/s3_csv_stream_non_uniform_seek.py | 26 ++++++++++++++ examples/streaming/src/s3_csv_stream_seek.py | 20 ++++++++--- examples/streaming/src/uniform_sample.csv | 4 +++ 5 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 examples/streaming/src/non_uniform_sample.csv create mode 100644 examples/streaming/src/s3_csv_stream_non_uniform_seek.py create mode 100644 examples/streaming/src/uniform_sample.csv diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index b23efdeb40b..80ef65602bf 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -93,6 +93,42 @@ We provide popular built-in transformations that you can apply against your stre ## Advanced +### Reading ahead or backwards + +`S3Object` implements [Python I/O interface](https://docs.python.org/3/tutorial/inputoutput.html){target="_blank"}. This means you can use `seek` to start reading contents of your file from any particular position, saving you processing time. + +#### Reading backwards + +For example, let's imagine you have a large CSV file, each row has a non-uniform size (bytes), and you want to read and process the last row only. + +```csv title="non_uniform_sample.csv" +--8<-- "examples/streaming/src/non_uniform_sample.csv" +``` + +You found out the last row has exactly 30 bytes. We can use `seek()` to skip to the end of the file, read 30 bytes, then transform to CSV. + +```python title="Reading only the last CSV row" hl_lines="16 18" +--8<-- "examples/streaming/src/s3_csv_stream_non_uniform_seek.py" +``` + +#### Reading ahead + +!!! question "What if we want to jump the first N rows?" + +You can also solve with `seek`, but let's take a large uniform CSV file to make this easier to grasp. + +```csv title="uniform_sample.csv" +--8<-- "examples/streaming/src/uniform_sample.csv" +``` + +You found out that each row has 8 bytes, the header line has 22 bytes, and every new line has 1 byte. + +You want to skip the first 100 lines. + +```python hl_lines="28 31" title="Skipping the first 100 rows" +--8<-- "examples/streaming/src/s3_csv_stream_seek.py" +``` + ### Custom options for data transformations We will propagate additional options to the underlying implementation for each transform class. 
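The offset arithmetic in the section added above can be checked locally with nothing but the standard library. A minimal sketch, not part of this patch, reusing the same rows that `non_uniform_sample.csv` (added below) contains; against S3, the equivalent `seek` call is the one in `s3_csv_stream_non_uniform_seek.py`.

```python
import csv
import io

# Same rows as non_uniform_sample.csv, ending with a trailing newline
data = (
    b"id,name,location\n"
    b"1,Ruben Fonseca, Denmark\n"
    b"2,Heitor Lessa, Netherlands\n"
    b"3,Leandro Damascena, Portugal\n"
)

stream = io.BytesIO(data)

# The last row plus its newline is exactly 30 bytes, so jump 30 bytes back from the end
stream.seek(-30, io.SEEK_END)

reader = csv.DictReader(
    io.TextIOWrapper(stream, encoding="utf-8", newline=""),
    fieldnames=["id", "name", "location"],
)
print(next(reader)["location"])  # " Portugal" (leading space comes from the source data)
```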
diff --git a/examples/streaming/src/non_uniform_sample.csv b/examples/streaming/src/non_uniform_sample.csv new file mode 100644 index 00000000000..9ea3b7e2039 --- /dev/null +++ b/examples/streaming/src/non_uniform_sample.csv @@ -0,0 +1,4 @@ +id,name,location +1,Ruben Fonseca, Denmark +2,Heitor Lessa, Netherlands +3,Leandro Damascena, Portugal diff --git a/examples/streaming/src/s3_csv_stream_non_uniform_seek.py b/examples/streaming/src/s3_csv_stream_non_uniform_seek.py new file mode 100644 index 00000000000..a31e7c8ca6f --- /dev/null +++ b/examples/streaming/src/s3_csv_stream_non_uniform_seek.py @@ -0,0 +1,26 @@ +import io +from typing import Dict + +from aws_lambda_powertools.utilities.streaming.s3_object import S3Object +from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform +from aws_lambda_powertools.utilities.typing import LambdaContext + +LAST_ROW_SIZE = 30 +CSV_HEADERS = ["id", "name", "location"] + + +def lambda_handler(event: Dict[str, str], context: LambdaContext): + sample_csv = S3Object(bucket=event["bucket"], key="sample.csv") + + # Jump to the end of the file + sample_csv.seek(0, io.SEEK_END) + # From the current position, jump exactly 30 bytes + sample_csv.seek(sample_csv.tell() - LAST_ROW_SIZE, io.SEEK_SET) + + # Transform portion of data into CSV with our headers + sample_csv.transform(CsvTransform(fieldnames=CSV_HEADERS), in_place=True) + + # We will only read the last portion of the file from S3 + # as we're only interested in the last 'location' from our dataset + for last_row in sample_csv: + print(last_row["location"]) diff --git a/examples/streaming/src/s3_csv_stream_seek.py b/examples/streaming/src/s3_csv_stream_seek.py index 5d740e069f2..c0f5eca8b3e 100644 --- a/examples/streaming/src/s3_csv_stream_seek.py +++ b/examples/streaming/src/s3_csv_stream_seek.py @@ -8,18 +8,28 @@ """ Assuming the CSV files contains rows after the header always has 8 bytes + 1 byte newline: +reading,position,type 21.3,5,+ 23.4,4,+ 21.3,0,- +... 
""" +CSV_HEADERS = ["reading", "position", "type"] +ROW_SIZE = 8 + 1 # 1 byte newline +HEADER_SIZE = 22 + 1 # 1 byte newline +LINES_TO_JUMP = 100 + def lambda_handler(event: Dict[str, str], context: LambdaContext): - s3 = S3Object(bucket=event["bucket"], key=event["key"]) + sample_csv = S3Object(bucket=event["bucket"], key=event["key"]) + + # Skip the header line + sample_csv.seek(HEADER_SIZE, io.SEEK_SET) # Jump 100 lines of 9 bytes each (8 bytes of data + 1 byte newline) - s3.seek(100 * 9, io.SEEK_SET) + sample_csv.seek(LINES_TO_JUMP * ROW_SIZE, io.SEEK_SET) - s3.transform(CsvTransform(), in_place=True) - for obj in s3: - print(obj) + sample_csv.transform(CsvTransform(), in_place=True) + for row in sample_csv: + print(row["reading"]) diff --git a/examples/streaming/src/uniform_sample.csv b/examples/streaming/src/uniform_sample.csv new file mode 100644 index 00000000000..59fe4c990a5 --- /dev/null +++ b/examples/streaming/src/uniform_sample.csv @@ -0,0 +1,4 @@ +reading,position,type +21.3,5,+ +23.4,4,+ +21.3,0,- \ No newline at end of file From 28e2040eac017b58c6e563d48f9516143562c483 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Thu, 24 Nov 2022 10:04:55 +0100 Subject: [PATCH 42/43] docs: fix seek positioning and byte size --- docs/utilities/streaming.md | 4 ++-- examples/streaming/src/s3_csv_stream_non_uniform_seek.py | 6 ++---- examples/streaming/src/s3_csv_stream_seek.py | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 80ef65602bf..2924b8f66f1 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -107,7 +107,7 @@ For example, let's imagine you have a large CSV file, each row has a non-uniform You found out the last row has exactly 30 bytes. We can use `seek()` to skip to the end of the file, read 30 bytes, then transform to CSV. -```python title="Reading only the last CSV row" hl_lines="16 18" +```python title="Reading only the last CSV row" hl_lines="16 19" --8<-- "examples/streaming/src/s3_csv_stream_non_uniform_seek.py" ``` @@ -121,7 +121,7 @@ You can also solve with `seek`, but let's take a large uniform CSV file to make --8<-- "examples/streaming/src/uniform_sample.csv" ``` -You found out that each row has 8 bytes, the header line has 22 bytes, and every new line has 1 byte. +You found out that each row has 8 bytes, the header line has 21 bytes, and every new line has 1 byte. You want to skip the first 100 lines. 
diff --git a/examples/streaming/src/s3_csv_stream_non_uniform_seek.py b/examples/streaming/src/s3_csv_stream_non_uniform_seek.py index a31e7c8ca6f..55454fd2a6e 100644 --- a/examples/streaming/src/s3_csv_stream_non_uniform_seek.py +++ b/examples/streaming/src/s3_csv_stream_non_uniform_seek.py @@ -12,10 +12,8 @@ def lambda_handler(event: Dict[str, str], context: LambdaContext): sample_csv = S3Object(bucket=event["bucket"], key="sample.csv") - # Jump to the end of the file - sample_csv.seek(0, io.SEEK_END) - # From the current position, jump exactly 30 bytes - sample_csv.seek(sample_csv.tell() - LAST_ROW_SIZE, io.SEEK_SET) + # From the end of the file, jump exactly 30 bytes backwards + sample_csv.seek(-LAST_ROW_SIZE, io.SEEK_END) # Transform portion of data into CSV with our headers sample_csv.transform(CsvTransform(fieldnames=CSV_HEADERS), in_place=True) diff --git a/examples/streaming/src/s3_csv_stream_seek.py b/examples/streaming/src/s3_csv_stream_seek.py index c0f5eca8b3e..6b48b9df13a 100644 --- a/examples/streaming/src/s3_csv_stream_seek.py +++ b/examples/streaming/src/s3_csv_stream_seek.py @@ -17,7 +17,7 @@ CSV_HEADERS = ["reading", "position", "type"] ROW_SIZE = 8 + 1 # 1 byte newline -HEADER_SIZE = 22 + 1 # 1 byte newline +HEADER_SIZE = 21 + 1 # 1 byte newline LINES_TO_JUMP = 100 @@ -28,7 +28,7 @@ def lambda_handler(event: Dict[str, str], context: LambdaContext): sample_csv.seek(HEADER_SIZE, io.SEEK_SET) # Jump 100 lines of 9 bytes each (8 bytes of data + 1 byte newline) - sample_csv.seek(LINES_TO_JUMP * ROW_SIZE, io.SEEK_SET) + sample_csv.seek(LINES_TO_JUMP * ROW_SIZE, io.SEEK_CUR) sample_csv.transform(CsvTransform(), in_place=True) for row in sample_csv: From 793dc7005f24b8e5118b31cfaf1f82ed03a500f0 Mon Sep 17 00:00:00 2001 From: heitorlessa Date: Thu, 24 Nov 2022 10:14:23 +0100 Subject: [PATCH 43/43] docs(streaming): fix semantic wording on read ahead vs skipping --- docs/utilities/streaming.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/utilities/streaming.md b/docs/utilities/streaming.md index 2924b8f66f1..eebe4a32800 100644 --- a/docs/utilities/streaming.md +++ b/docs/utilities/streaming.md @@ -93,7 +93,7 @@ We provide popular built-in transformations that you can apply against your stre ## Advanced -### Reading ahead or backwards +### Skipping or reading backwards `S3Object` implements [Python I/O interface](https://docs.python.org/3/tutorial/inputoutput.html){target="_blank"}. This means you can use `seek` to start reading contents of your file from any particular position, saving you processing time. @@ -111,7 +111,7 @@ You found out the last row has exactly 30 bytes. We can use `seek()` to skip to --8<-- "examples/streaming/src/s3_csv_stream_non_uniform_seek.py" ``` -#### Reading ahead +#### Skipping !!! question "What if we want to jump the first N rows?"
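The row and header sizes that the last two commits rely on (and correct) are cheap to verify locally before hard-coding them into a `seek`. A minimal standard-library sketch, not part of the patch series, using the rows from `uniform_sample.csv`:

```python
# Header and data rows from uniform_sample.csv
header = b"reading,position,type\n"
row = b"21.3,5,+\n"

assert len(header) == 21 + 1  # 21 bytes of data + 1-byte newline
assert len(row) == 8 + 1  # 8 bytes of data + 1-byte newline


def offset_after_skipping(n_rows: int) -> int:
    # Position reached by seek(HEADER_SIZE, io.SEEK_SET) followed by
    # seek(n_rows * ROW_SIZE, io.SEEK_CUR) in s3_csv_stream_seek.py
    return len(header) + n_rows * len(row)


print(offset_after_skipping(100))  # 922
```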