aws
Lines changed: 146 additions & 61 deletions
diff --git a/awswrangler/timestream.py b/awswrangler/timestream.py
--- a/awswrangler/timestream.py
+++ b/awswrangler/timestream.py
@@ -10,7 +10,7 @@
 import pandas as pd
 from botocore.config import Config
 
-from awswrangler import _data_types, _utils
+from awswrangler import _data_types, _utils, exceptions
 
 _logger: logging.Logger = logging.getLogger(__name__)
 
@@ -27,61 +27,111 @@ def _df2list(df: pd.DataFrame) -> List[List[Any]]:
     return parameters
 
 
+def _format_timestamp(timestamp: Union[int, datetime]) -> str:
+    """Render a time value as a string of rounded milliseconds.
+
+    A ``datetime`` is converted with ``timestamp() * 1_000``; an ``int`` is
+    divided by ``1_000_000``. Both results are rounded to the nearest integer
+    before being turned into a string.
+
+    Raises
+    ------
+    exceptions.InvalidArgumentType
+        If the value is neither an ``int`` nor a ``datetime``.
+    """
+    if isinstance(timestamp, datetime):
+        millis = round(timestamp.timestamp() * 1_000)
+    elif isinstance(timestamp, int):
+        # NOTE(review): presumably integer inputs are epoch nanoseconds - confirm against the caller.
+        millis = round(timestamp / 1_000_000)
+    else:
+        raise exceptions.InvalidArgumentType("`time_col` must be of type timestamp.")
+    return str(millis)
+
+
 def _format_measure(measure_name: str, measure_value: Any, measure_type: str) -> Dict[str, str]:
+    """Build one measure entry with ``Name``, ``Value`` and ``Type`` keys.
+
+    When ``measure_type`` is ``"TIMESTAMP"`` the value is rendered through
+    ``_format_timestamp``; any other value is passed through ``str``.
+    """
     return {
         "Name": measure_name,
-        "Value": str(round(measure_value.timestamp() * 1_000) if measure_type == "TIMESTAMP" else measure_value),
+        "Value": _format_timestamp(measure_value) if measure_type == "TIMESTAMP" else str(measure_value),
         "Type": measure_type,
     }
 
 
+def _sanitize_common_attributes(
+    common_attributes: Optional[Dict[str, Any]],
+    version: int,
+    measure_name: Optional[str],
+) -> Dict[str, Any]:
+    """Return a validated copy of ``common_attributes`` with defaults filled in.
+
+    The caller's dictionary is never modified. Defaults are applied to a shallow
+    copy, so reusing one dictionary across calls cannot carry a previous call's
+    ``Version``, ``TimeUnit`` or ``MeasureName`` into the next one.
+
+    Parameters
+    ----------
+    common_attributes : Optional[Dict[str, Any]]
+        Attributes shared by all records. ``None`` or an empty dictionary means
+        none were supplied.
+    version : int
+        Used as ``Version`` unless ``common_attributes`` already contains one.
+    measure_name : Optional[str]
+        Used as ``MeasureName`` unless ``common_attributes`` already contains one.
+
+    Returns
+    -------
+    Dict[str, Any]
+        A new dictionary holding the sanitized attributes.
+
+    Raises
+    ------
+    exceptions.InvalidArgumentCombination
+        If ``MeasureValue`` is present without ``MeasureValueType``, or if no
+        measure name is available from either ``measure_name`` or
+        ``common_attributes``.
+    """
+    # Shallow copy is enough: only top-level keys are added or replaced below.
+    sanitized: Dict[str, Any] = dict(common_attributes) if common_attributes else {}
+
+    # Values in common_attributes take precedence
+    sanitized.setdefault("Version", version)
+
+    if "Time" not in sanitized:
+        # Without a shared Time, _write_batch formats each record's Time in
+        # milliseconds, so TimeUnit is pinned to MILLISECONDS to match.
+        sanitized["TimeUnit"] = "MILLISECONDS"
+
+    if "MeasureValue" in sanitized and "MeasureValueType" not in sanitized:
+        raise exceptions.InvalidArgumentCombination(
+            "MeasureValueType must be supplied alongside MeasureValue in common_attributes."
+        )
+
+    if measure_name:
+        sanitized.setdefault("MeasureName", measure_name)
+    elif "MeasureName" not in sanitized:
+        raise exceptions.InvalidArgumentCombination(
+            "MeasureName must be supplied with the `measure_name` argument or in common_attributes."
+        )
+    return sanitized
+
+
 def _write_batch(
+    timestream_client: boto3.client,
     database: str,
     table: str,
-    cols_names: List[str],
-    measure_cols_names: List[str],
+    common_attributes: Dict[str, Any],
+    cols_names: List[Optional[str]],
+    measure_cols: List[Optional[str]],
     measure_types: List[str],
-    version: int,
+    dimension_cols: List[Optional[str]],
     batch: List[Any],
-    timestream_client: boto3.client,
-    measure_name: Optional[str] = None,
 ) -> List[Dict[str, str]]:
-    try:
-        time_loc = 0
-        measure_cols_loc = 1
-        dimensions_cols_loc = 1 + len(measure_cols_names)
-        records: List[Dict[str, Any]] = []
-        for rec in batch:
-            record: Dict[str, Any] = {
-                "Dimensions": [
-                    {"Name": name, "DimensionValueType": "VARCHAR", "Value": str(value)}
-                    for name, value in zip(cols_names[dimensions_cols_loc:], rec[dimensions_cols_loc:])
-                ],
-                "Time": str(round(rec[time_loc].timestamp() * 1_000)),
-                "TimeUnit": "MILLISECONDS",
-                "Version": version,
-            }
-            if len(measure_cols_names) == 1:
-                measure_value = rec[measure_cols_loc]
-                if pd.isnull(measure_value):
-                    continue
-                record["MeasureName"] = measure_name if measure_name else measure_cols_names[0]
-                record["MeasureValueType"] = measure_types[0]
-                record["MeasureValue"] = str(measure_value)
-            else:
-                record["MeasureName"] = measure_name if measure_name else measure_cols_names[0]
-                record["MeasureValueType"] = "MULTI"
-                record["MeasureValues"] = [
-                    _format_measure(measure_name, measure_value, measure_value_type)
-                    for measure_name, measure_value, measure_value_type in zip(
-                        measure_cols_names, rec[measure_cols_loc:dimensions_cols_loc], measure_types
-                    )
-                    if not pd.isnull(measure_value)
-                ]
-                if len(record["MeasureValues"]) == 0:
-                    continue
+    records: List[Dict[str, Any]] = []
+    scalar = bool(len(measure_cols) == 1 and "MeasureValues" not in common_attributes)
+    time_loc = 0
+    measure_cols_loc = 1 if cols_names[0] else 0
+    dimensions_cols_loc = 1 if len(measure_cols) == 1 else 1 + len(measure_cols)
+    if all(cols_names):
+        # Time and Measures are supplied in the data frame
+        dimensions_cols_loc = 1 + len(measure_cols)
+    elif all(v is None for v in cols_names[:2]):
+        # Time and Measures are supplied in common_attributes
+        dimensions_cols_loc = 0
+
+    for row in batch:
+        record: Dict[str, Any] = {}
+        if "Time" not in common_attributes:
+            record["Time"] = _format_timestamp(row[time_loc])
+        if scalar and "MeasureValue" not in common_attributes:
+            measure_value = row[measure_cols_loc]
+            if pd.isnull(measure_value):
+                continue
+            record["MeasureValue"] = str(measure_value)
+        elif not scalar and "MeasureValues" not in common_attributes:
+            record["MeasureValues"] = [
+                _format_measure(measure_name, measure_value, measure_value_type)  # type: ignore[arg-type]
+                for measure_name, measure_value, measure_value_type in zip(
+                    measure_cols, row[measure_cols_loc:dimensions_cols_loc], measure_types
+                )
+                if not pd.isnull(measure_value)
+            ]
+            if len(record["MeasureValues"]) == 0:
+                continue
+        if "MeasureValueType" not in common_attributes:
+            record["MeasureValueType"] = measure_types[0] if scalar else "MULTI"
+        # Dimensions can be specified in both common_attributes and the data frame
+        dimensions = (
+            [
+                {"Name": name, "DimensionValueType": "VARCHAR", "Value": str(value)}
+                for name, value in zip(dimension_cols, row[dimensions_cols_loc:])
+            ]
+            if all(dimension_cols)
+            else []
+        )
+        if dimensions:
+            record["Dimensions"] = dimensions
+        if record:
             records.append(record)
-        if len(records) > 0:
+
+    try:
+        if records:
             _utils.try_it(
                 f=timestream_client.write_records,
                 ex=(
@@ -91,6 +141,7 @@ def _write_batch(
                 max_num_tries=5,
                 DatabaseName=database,
                 TableName=table,
+                CommonAttributes=common_attributes,
                 Records=records,
             )
     except timestream_client.exceptions.RejectedRecordsException as ex:
@@ -192,12 +243,13 @@ def write(
     df: pd.DataFrame,
     database: str,
     table: str,
-    time_col: str,
-    measure_col: Union[str, List[str]],
-    dimensions_cols: List[str],
+    time_col: Optional[str] = None,
+    measure_col: Union[str, List[Optional[str]], None] = None,
+    dimensions_cols: Optional[List[Optional[str]]] = None,
     version: int = 1,
     num_threads: int = 32,
     measure_name: Optional[str] = None,
+    common_attributes: Optional[Dict[str, Any]] = None,
     boto3_session: Optional[boto3.Session] = None,
 ) -> List[Dict[str, str]]:
     """Store a Pandas DataFrame into a Amazon Timestream table.
@@ -206,6 +258,16 @@ def write(
     this function will not throw a Python exception.
     Instead it will return the rejection information.
 
+    Note
+    ----
+    Values in `common_attributes` take precedence over all other arguments and data frame values.
+    Dimension attributes are merged with attributes in record objects.
+    Example: common_attributes = {"Dimensions": [{"Name": "device_id", "Value": "12345"}], "MeasureValueType": "DOUBLE"}.
+
+    Note
+    ----
+    If the `time_col` column is supplied it must be of type timestamp. `TimeUnit` is set to MILLISECONDS by default.
+
     Parameters
     ----------
     df: pandas.DataFrame
@@ -214,18 +276,21 @@ def write(
         Amazon Timestream database name.
     table : str
         Amazon Timestream table name.
-    time_col : str
+    time_col : Optional[str]
         DataFrame column name to be used as time. MUST be a timestamp column.
-    measure_col : Union[str, List[str]]
+    measure_col : Union[str, List[str], None]
         DataFrame column name(s) to be used as measure.
-    dimensions_cols : List[str]
+    dimensions_cols : Optional[List[str]]
         List of DataFrame column names to be used as dimensions.
     version : int
         Version number used for upserts.
         Documentation https://docs.aws.amazon.com/timestream/latest/developerguide/API_WriteRecords.html.
     measure_name : Optional[str]
         Name that represents the data attribute of the time series.
         Overrides ``measure_col`` if specified.
+    common_attributes : Optional[Dict[str, Any]]
+        Dictionary of attributes that is shared across all records in the request.
+        Using common attributes can optimize the cost of writes by reducing the size of request payloads.
     num_threads : str
         Number of thread to be used for concurrent writing.
     boto3_session : boto3.Session(), optional
@@ -279,30 +344,50 @@ def write(
         session=boto3_session,
         botocore_config=Config(read_timeout=20, max_pool_connections=5000, retries={"max_attempts": 10}),
     )
-    measure_cols_names = measure_col if isinstance(measure_col, list) else [measure_col]
-    _logger.debug("measure_cols_names: %s", measure_cols_names)
 
-    measure_types: List[str] = [
-        _data_types.timestream_type_from_pandas(df[[measure_col_name]]) for measure_col_name in measure_cols_names
+    measure_cols = measure_col if isinstance(measure_col, list) else [measure_col]
+    measure_types = [
+        _data_types.timestream_type_from_pandas(df[[measure_col_name]])
+        for measure_col_name in measure_cols
+        if measure_col_name
     ]
-    _logger.debug("measure_types: %s", measure_types)
-    cols_names: List[str] = [time_col] + measure_cols_names + dimensions_cols
-    _logger.debug("cols_names: %s", cols_names)
-    batches: List[List[Any]] = _utils.chunkify(lst=_df2list(df=df[cols_names]), max_length=100)
-    _logger.debug("len(batches): %s", len(batches))
+    dimensions_cols = dimensions_cols if dimensions_cols else [dimensions_cols]  # type: ignore[list-item]
+    cols_names: List[Optional[str]] = [time_col] + measure_cols + dimensions_cols
+    measure_name = measure_name if measure_name else measure_cols[0]
+    common_attributes = _sanitize_common_attributes(common_attributes, version, measure_name)
+
+    _logger.debug(
+        "common_attributes: %s\n, cols_names: %s\n, measure_types: %s",
+        common_attributes,
+        cols_names,
+        measure_types,
+    )
+
+    # User can supply arguments in one of two ways:
+    # 1. With the `common_attributes` dictionary which takes precedence
+    # 2. With data frame columns
+    # However, the data frame cannot be completely empty.
+    # So if all values in `cols_names` are None, an exception is raised.
+    if any(cols_names):
+        batches: List[List[Any]] = _utils.chunkify(lst=_df2list(df=df[[c for c in cols_names if c]]), max_length=100)
+    else:
+        raise exceptions.InvalidArgumentCombination(
+            "At least one of `time_col`, `measure_col` or `dimensions_cols` must be specified."
+        )
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
         res: List[List[Any]] = list(
             executor.map(
                 _write_batch,
+                itertools.repeat(timestream_client),
                 itertools.repeat(database),
                 itertools.repeat(table),
+                itertools.repeat(common_attributes),
                 itertools.repeat(cols_names),
-                itertools.repeat(measure_cols_names),
+                itertools.repeat(measure_cols),
                 itertools.repeat(measure_types),
-                itertools.repeat(version),
+                itertools.repeat(dimensions_cols),
                 batches,
-                itertools.repeat(timestream_client),
-                itertools.repeat(measure_name),
             )
         )
         return [item for sublist in res for item in sublist]