Commit 23f672a

heitorlessa authored and rubenfonseca committed

docs(streaming): add reading ahead and backwards section

1 parent 8586c2a commit 23f672a

File tree

5 files changed

+85
-5
lines changed


docs/utilities/streaming.md

+36
@@ -93,6 +93,42 @@ We provide popular built-in transformations that you can apply against your streaming data

## Advanced

### Reading ahead or backwards

`S3Object` implements the [Python I/O interface](https://docs.python.org/3/tutorial/inputoutput.html){target="_blank"}. This means you can use `seek` to start reading the contents of your file from any given position, saving processing time.
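Because `seek` follows standard Python I/O semantics, you can try the positioning rules locally with `io.BytesIO` as a stand-in for `S3Object` (a minimal sketch; the sample bytes are illustrative):

```python
import io

# Any seekable binary stream follows the same seek()/tell() rules as S3Object
data = b"id,name,location\n1,Ruben Fonseca, Denmark\n"
buf = io.BytesIO(data)

buf.seek(0, io.SEEK_END)  # jump to the end of the stream
size = buf.tell()         # the absolute position here equals the total size in bytes

buf.seek(0, io.SEEK_SET)  # absolute positioning: back to the beginning
header = buf.readline()   # reads up to and including the first newline
print(size, header)
```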
#### Reading backwards

For example, imagine you have a large CSV file where each row has a non-uniform size in bytes, and you want to read and process only the last row.

```csv title="non_uniform_sample.csv"
--8<-- "examples/streaming/src/non_uniform_sample.csv"
```

Suppose you found out the last row has exactly 30 bytes. We can use `seek()` to jump to the end of the file, go back 30 bytes, then transform that portion to CSV.

```python title="Reading only the last CSV row" hl_lines="16 18"
--8<-- "examples/streaming/src/s3_csv_stream_non_uniform_seek.py"
```
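The same technique can be sketched locally, with `io.BytesIO` standing in for `S3Object` and `csv.DictReader` for `CsvTransform` (the 30-byte size matches the last row of the sample above, newline included):

```python
import csv
import io

CSV_HEADERS = ["id", "name", "location"]
LAST_ROW_SIZE = 30  # size in bytes of the last row, newline included

data = (
    b"id,name,location\n"
    b"1,Ruben Fonseca, Denmark\n"
    b"2,Heitor Lessa, Netherlands\n"
    b"3,Leandro Damascena, Portugal\n"
)
buf = io.BytesIO(data)

buf.seek(0, io.SEEK_END)                           # jump to the end of the file
buf.seek(buf.tell() - LAST_ROW_SIZE, io.SEEK_SET)  # go back exactly 30 bytes

# Parse only the remaining bytes as CSV
reader = csv.DictReader(io.TextIOWrapper(buf, encoding="utf-8"), fieldnames=CSV_HEADERS)
for last_row in reader:
    print(last_row["location"].strip())
```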
#### Reading ahead

!!! question "What if we want to skip the first N rows?"

You can also solve this with `seek`, but let's use a large uniform CSV file to make it easier to grasp.

```csv title="uniform_sample.csv"
--8<-- "examples/streaming/src/uniform_sample.csv"
```

Each data row has 8 bytes, the header line has 21 bytes, and every line ends with a 1-byte newline.

You want to skip the first 100 data rows.

```python hl_lines="28 31" title="Skipping the first 100 rows"
--8<-- "examples/streaming/src/s3_csv_stream_seek.py"
```
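A local sketch of the offset arithmetic (`io.BytesIO` in place of `S3Object`; row sizes match the uniform sample, and skipping 3 of 5 rows stands in for the 100 in the example above):

```python
import csv
import io

data = b"reading,position,type\n" + b"21.3,5,+\n" * 5  # 5 uniform data rows
ROW_SIZE = 8 + 1                     # 8 bytes of data + 1 byte newline
HEADER_SIZE = data.index(b"\n") + 1  # header text plus its newline (22 here)
rows_to_skip = 3

buf = io.BytesIO(data)
# One absolute seek past the header and the first N data rows
buf.seek(HEADER_SIZE + rows_to_skip * ROW_SIZE, io.SEEK_SET)

reader = csv.DictReader(
    io.TextIOWrapper(buf, encoding="utf-8"),
    fieldnames=["reading", "position", "type"],
)
remaining = [row["reading"] for row in reader]
print(remaining)
```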
### Custom options for data transformations

We will propagate additional options to the underlying implementation for each transform class.
examples/streaming/src/non_uniform_sample.csv

+4

@@ -0,0 +1,4 @@
id,name,location
1,Ruben Fonseca, Denmark
2,Heitor Lessa, Netherlands
3,Leandro Damascena, Portugal
examples/streaming/src/s3_csv_stream_non_uniform_seek.py

+26

@@ -0,0 +1,26 @@
import io
from typing import Dict

from aws_lambda_powertools.utilities.streaming.s3_object import S3Object
from aws_lambda_powertools.utilities.streaming.transformations import CsvTransform
from aws_lambda_powertools.utilities.typing import LambdaContext

LAST_ROW_SIZE = 30
CSV_HEADERS = ["id", "name", "location"]


def lambda_handler(event: Dict[str, str], context: LambdaContext):
    sample_csv = S3Object(bucket=event["bucket"], key="sample.csv")

    # Jump to the end of the file
    sample_csv.seek(0, io.SEEK_END)
    # Move back exactly 30 bytes from the end of the file
    sample_csv.seek(sample_csv.tell() - LAST_ROW_SIZE, io.SEEK_SET)

    # Transform the remaining portion of data into CSV with our headers
    sample_csv.transform(CsvTransform(fieldnames=CSV_HEADERS), in_place=True)

    # We will only read the last portion of the file from S3,
    # as we're only interested in the last 'location' of our dataset
    for last_row in sample_csv:
        print(last_row["location"])

examples/streaming/src/s3_csv_stream_seek.py

+15-5
@@ -8,18 +8,28 @@
 """
 Assuming every CSV row after the header always has 8 bytes of data + a 1-byte newline:
+reading,position,type
 21.3,5,+
 23.4,4,+
 21.3,0,-
+...
 """

+CSV_HEADERS = ["reading", "position", "type"]
+ROW_SIZE = 8 + 1  # 8 bytes of data + 1 byte newline
+HEADER_SIZE = 21 + 1  # 21 bytes of header text + 1 byte newline
+LINES_TO_JUMP = 100
+

 def lambda_handler(event: Dict[str, str], context: LambdaContext):
-    s3 = S3Object(bucket=event["bucket"], key=event["key"])
+    sample_csv = S3Object(bucket=event["bucket"], key=event["key"])
+
+    # Skip the header line
+    sample_csv.seek(HEADER_SIZE, io.SEEK_SET)

     # Jump 100 lines of 9 bytes each (8 bytes of data + 1 byte newline)
-    s3.seek(100 * 9, io.SEEK_SET)
+    sample_csv.seek(LINES_TO_JUMP * ROW_SIZE, io.SEEK_CUR)

-    s3.transform(CsvTransform(), in_place=True)
-    for obj in s3:
-        print(obj)
+    sample_csv.transform(CsvTransform(), in_place=True)
+    for row in sample_csv:
+        print(row["reading"])
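Note that `seek(offset, io.SEEK_SET)` positions from the start of the stream, so two consecutive absolute seeks do not accumulate; after skipping the header, the jump over N rows should be relative (`io.SEEK_CUR`) or folded into a single absolute offset. A minimal illustration with `io.BytesIO` (sizes are illustrative):

```python
import io

HEADER_SIZE, ROW_SIZE, N = 22, 9, 5
buf = io.BytesIO(b"x" * 100)

buf.seek(HEADER_SIZE, io.SEEK_SET)
buf.seek(N * ROW_SIZE, io.SEEK_SET)  # absolute: the header offset is discarded
absolute_pos = buf.tell()

buf.seek(HEADER_SIZE, io.SEEK_SET)
buf.seek(N * ROW_SIZE, io.SEEK_CUR)  # relative: the two offsets compose
relative_pos = buf.tell()

print(absolute_pos, relative_pos)
```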
examples/streaming/src/uniform_sample.csv

+4

@@ -0,0 +1,4 @@
reading,position,type
21.3,5,+
23.4,4,+
21.3,0,-

0 commit comments