
Commit ae3fc5a

Add SerDe options for Pandas.to_csv()
1 parent 0d2a0af commit ae3fc5a

5 files changed: +241 additions, -135 deletions

awswrangler/athena.py

Lines changed: 3 additions & 2 deletions
@@ -29,8 +29,8 @@ def _type_athena2pandas(dtype):
             return "float64"
         elif dtype == "boolean":
             return "bool"
-        elif dtype in ["string", "char", "varchar", "array", "row", "map"]:
-            return "object"
+        elif dtype in ["string", "char", "varchar"]:
+            return "str"
         elif dtype == "timestamp":
             return "datetime64"
         elif dtype == "date":
@@ -53,6 +53,7 @@ def get_query_dtype(self, query_execution_id):
             else:
                 dtype[col_name] = ptype
         logger.debug(f"dtype: {dtype}")
+        logger.debug(f"parse_timestamps: {parse_timestamps}")
         logger.debug(f"parse_dates: {parse_dates}")
         return dtype, parse_timestamps, parse_dates

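Note on awswrangler/athena.py: string-like Athena types ("string", "char", "varchar") now map to "str" instead of the generic "object", and the complex types ("array", "row", "map") are no longer mapped. A minimal sketch of how such a dtype map is typically handed to pandas; the column names and local path are hypothetical (internally read_sql_athena reads the CSV result object from S3):

import pandas as pd

# Hypothetical dtype map as _type_athena2pandas would now produce it:
# string-like Athena columns come back as "str", numeric ones keep their width.
dtype = {"customer_id": "Int64", "name": "str", "score": "float64"}
parse_timestamps = ["created_at"]

df = pd.read_csv("query_results.csv",      # placeholder for the S3 result object
                 dtype=dtype,
                 parse_dates=parse_timestamps)
print(df.dtypes)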
awswrangler/exceptions.py

Lines changed: 8 additions & 0 deletions
@@ -64,3 +64,11 @@ class QueryCancelled(Exception):
 
 class QueryFailed(Exception):
     pass
+
+
+class InvalidSerDe(Exception):
+    pass
+
+
+class ApiError(Exception):
+    pass

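Note on awswrangler/exceptions.py: InvalidSerDe is raised when an unknown SerDe name is passed (before anything is written), and ApiError surfaces errors returned by Glue's batch_create_partition. A small usage sketch; the Session entry point, bucket, and database names are assumptions for illustration:

import pandas as pd
import awswrangler
from awswrangler.exceptions import InvalidSerDe

df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"]})
session = awswrangler.Session()

try:
    session.pandas.to_csv(dataframe=df,
                          path="s3://my-bucket/my-table/",  # hypothetical bucket
                          database="my_database",           # hypothetical database
                          table="my_table",
                          serde="SomeOtherSerDe")           # not in VALID_CSV_SERDES
except InvalidSerDe as error:
    print(f"Rejected before any write happened: {error}")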
awswrangler/glue.py

Lines changed: 82 additions & 38 deletions
@@ -5,7 +5,7 @@
 
 import pyarrow
 
-from awswrangler.exceptions import UnsupportedType, UnsupportedFileFormat
+from awswrangler.exceptions import UnsupportedType, UnsupportedFileFormat, InvalidSerDe, ApiError
 
 logger = logging.getLogger(__name__)
 
@@ -155,12 +155,11 @@ def metadata_to_glue(self,
         if partition_cols:
             partitions_tuples = Glue._parse_partitions_tuples(
                 objects_paths=objects_paths, partition_cols=partition_cols)
-            self.add_partitions(
-                database=database,
-                table=table,
-                partition_paths=partitions_tuples,
-                file_format=file_format,
-            )
+            self.add_partitions(database=database,
+                                table=table,
+                                partition_paths=partitions_tuples,
+                                file_format=file_format,
+                                extra_args=extra_args)
 
     def delete_table_if_exists(self, database, table):
         try:
@@ -184,7 +183,8 @@ def create_table(self,
                      partition_cols_schema=None,
                      extra_args=None):
         if file_format == "parquet":
-            table_input = Glue.parquet_table_definition(table, partition_cols_schema, schema, path)
+            table_input = Glue.parquet_table_definition(
+                table, partition_cols_schema, schema, path)
         elif file_format == "csv":
             table_input = Glue.csv_table_definition(table,
                                                     partition_cols_schema,
@@ -196,25 +196,31 @@
         self._client_glue.create_table(DatabaseName=database,
                                        TableInput=table_input)
 
-    def add_partitions(self, database, table, partition_paths, file_format):
+    def add_partitions(self, database, table, partition_paths, file_format,
+                       extra_args):
         if not partition_paths:
             return None
         partitions = list()
         for partition in partition_paths:
             if file_format == "parquet":
-                partition_def = Glue.parquet_partition_definition(partition)
+                partition_def = Glue.parquet_partition_definition(
+                    partition=partition)
             elif file_format == "csv":
-                partition_def = Glue.csv_partition_definition(partition)
+                partition_def = Glue.csv_partition_definition(
+                    partition=partition, extra_args=extra_args)
             else:
                 raise UnsupportedFileFormat(file_format)
             partitions.append(partition_def)
         pages_num = int(ceil(len(partitions) / 100.0))
         for _ in range(pages_num):
             page = partitions[:100]
             del partitions[:100]
-            self._client_glue.batch_create_partition(DatabaseName=database,
-                                                     TableName=table,
-                                                     PartitionInputList=page)
+            res = self._client_glue.batch_create_partition(
+                DatabaseName=database,
+                TableName=table,
+                PartitionInputList=page)
+            if len(res["Errors"]) > 0:
+                raise ApiError(f"{res['Errors'][0]}")
 
     def get_connection_details(self, name):
         return self._client_glue.get_connection(
@@ -223,18 +229,25 @@ def get_connection_details(self, name):
     @staticmethod
     def _extract_pyarrow_schema(dataframe, preserve_index):
         cols = []
+        cols_dtypes = {}
         schema = []
+
         for name, dtype in dataframe.dtypes.to_dict().items():
             dtype = str(dtype)
             if str(dtype) == "Int64":
-                schema.append((name, "int64"))
+                cols_dtypes[name] = "int64"
             else:
                 cols.append(name)
 
-        # Convert pyarrow.Schema to list of tuples (e.g. [(name1, type1), (name2, type2)...])
-        schema += [(str(x.name), str(x.type))
-                   for x in pyarrow.Schema.from_pandas(
-                       df=dataframe[cols], preserve_index=preserve_index)]
+        for field in pyarrow.Schema.from_pandas(df=dataframe[cols],
+                                                preserve_index=preserve_index):
+            name = str(field.name)
+            dtype = str(field.type)
+            cols_dtypes[name] = dtype
+            if name not in dataframe.columns:
+                schema.append((name, dtype))
+
+        schema += [(name, cols_dtypes[name]) for name in dataframe.columns]
         logger.debug(f"schema: {schema}")
         return schema
 

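Note on _extract_pyarrow_schema: the rewrite tracks every column's pyarrow type in cols_dtypes and emits index fields (when preserved) first, followed by the dataframe columns in their original order, with pandas nullable Int64 columns reported as int64. An illustrative sketch calling the private helper directly (for illustration only):

import pandas as pd
from awswrangler.glue import Glue

df = pd.DataFrame({
    "name": ["foo", "bar"],
    "value": [1.5, 2.0],
    "id": pd.array([1, 2], dtype="Int64"),   # pandas nullable integer
})

schema = Glue._extract_pyarrow_schema(dataframe=df, preserve_index=False)
print(schema)
# Expected along the lines of:
# [('name', 'string'), ('value', 'double'), ('id', 'int64')]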
@@ -256,7 +269,8 @@ def _build_schema(dataframe, partition_cols, preserve_index):
             else:
                 schema_built.append((name, athena_type))
 
-        partition_cols_schema_built = [(name, partition_cols_types[name]) for name in partition_cols]
+        partition_cols_schema_built = [(name, partition_cols_types[name])
+                                       for name in partition_cols]
 
         logger.debug(f"schema_built:\n{schema_built}")
         logger.debug(
@@ -270,17 +284,40 @@ def _parse_table_name(path):
         return path.rpartition("/")[2]
 
     @staticmethod
-    def csv_table_definition(table, partition_cols_schema, schema, path, extra_args):
-        sep = extra_args["sep"] if "sep" in extra_args else ","
+    def csv_table_definition(table, partition_cols_schema, schema, path,
+                             extra_args):
         if not partition_cols_schema:
             partition_cols_schema = []
+        sep = extra_args["sep"] if "sep" in extra_args else ","
+        serde = extra_args.get("serde")
+        if serde == "OpenCSVSerDe":
+            serde_fullname = "org.apache.hadoop.hive.serde2.OpenCSVSerde"
+            param = {
+                "separatorChar": sep,
+                "quoteChar": "\"",
+                "escapeChar": "\\",
+            }
+            refined_par_schema = [(name, "string")
+                                  for name, dtype in partition_cols_schema]
+            refined_schema = [(name, "string") for name, dtype in schema]
+        elif serde == "LazySimpleSerDe":
+            serde_fullname = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"
+            param = {"field.delim": sep, "escape.delim": "\\"}
+            dtypes_allowed = ["int", "bigint", "float", "double"]
+            refined_par_schema = [(name, dtype) if dtype in dtypes_allowed else
+                                  (name, "string")
+                                  for name, dtype in partition_cols_schema]
+            refined_schema = [(name, dtype) if dtype in dtypes_allowed else
+                              (name, "string") for name, dtype in schema]
+        else:
+            raise InvalidSerDe(f"{serde} in not in the valid SerDe list.")
         return {
             "Name":
             table,
             "PartitionKeys": [{
                 "Name": x[0],
                 "Type": x[1]
-            } for x in partition_cols_schema],
+            } for x in refined_par_schema],
             "TableType":
             "EXTERNAL_TABLE",
             "Parameters": {
@@ -295,54 +332,61 @@ def csv_table_definition(table, partition_cols_schema, schema, path, extra_args)
                 "Columns": [{
                     "Name": x[0],
                     "Type": x[1]
-                } for x in schema],
+                } for x in refined_schema],
                 "Location": path,
                 "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
                 "OutputFormat":
                 "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
                 "Compressed": False,
                 "NumberOfBuckets": -1,
                 "SerdeInfo": {
-                    "Parameters": {
-                        "field.delim": sep
-                    },
-                    "SerializationLibrary":
-                    "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
+                    "Parameters": param,
+                    "SerializationLibrary": serde_fullname,
                 },
                 "StoredAsSubDirectories": False,
                 "SortColumns": [],
                 "Parameters": {
                     "classification": "csv",
                     "compressionType": "none",
                     "typeOfData": "file",
-                    "delimiter": ",",
+                    "delimiter": sep,
                     "columnsOrdered": "true",
                     "areColumnsQuoted": "false",
                 },
             },
         }
 
     @staticmethod
-    def csv_partition_definition(partition):
+    def csv_partition_definition(partition, extra_args):
+        sep = extra_args["sep"] if "sep" in extra_args else ","
+        serde = extra_args.get("serde")
+        if serde == "OpenCSVSerDe":
+            serde_fullname = "org.apache.hadoop.hive.serde2.OpenCSVSerde"
+            param = {
+                "separatorChar": sep,
+                "quoteChar": "\"",
+                "escapeChar": "\\",
+            }
+        elif serde == "LazySimpleSerDe":
+            serde_fullname = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"
+            param = {"field.delim": sep, "escape.delim": "\\"}
+        else:
+            raise InvalidSerDe(f"{serde} in not in the valid SerDe list.")
         return {
             "StorageDescriptor": {
                 "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
                 "Location": partition[0],
                 "SerdeInfo": {
-                    "Parameters": {
-                        "field.delim": ","
-                    },
-                    "SerializationLibrary":
-                    "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
+                    "Parameters": param,
+                    "SerializationLibrary": serde_fullname,
                 },
                 "StoredAsSubDirectories": False,
             },
             "Values": partition[1],
         }
 
     @staticmethod
-    def parquet_table_definition(table, partition_cols_schema,
-                                 schema, path):
+    def parquet_table_definition(table, partition_cols_schema, schema, path):
         if not partition_cols_schema:
             partition_cols_schema = []
         return {

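Note on the Glue definitions: the SerDe choice now travels through extra_args into both the table and partition definitions. A quick sketch of what the static csv_partition_definition helper emits for each option; the partition location and values are made up:

from awswrangler.glue import Glue

partition = ("s3://my-bucket/my-table/year=2019/", ["2019"])  # hypothetical partition

open_csv = Glue.csv_partition_definition(
    partition=partition,
    extra_args={"sep": ",", "serde": "OpenCSVSerDe"})
lazy = Glue.csv_partition_definition(
    partition=partition,
    extra_args={"sep": "|", "serde": "LazySimpleSerDe"})

# OpenCSVSerDe quotes every field; LazySimpleSerDe keeps a plain field delimiter.
print(open_csv["StorageDescriptor"]["SerdeInfo"])
# {'Parameters': {'separatorChar': ',', 'quoteChar': '"', 'escapeChar': '\\'},
#  'SerializationLibrary': 'org.apache.hadoop.hive.serde2.OpenCSVSerde'}
print(lazy["StorageDescriptor"]["SerdeInfo"])
# {'Parameters': {'field.delim': '|', 'escape.delim': '\\'},
#  'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'}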
awswrangler/pandas.py

Lines changed: 25 additions & 6 deletions
@@ -11,7 +11,7 @@
 from pyarrow import parquet
 
 from awswrangler.exceptions import UnsupportedWriteMode, UnsupportedFileFormat,\
-    AthenaQueryError, EmptyS3Object, LineTerminatorNotFound, EmptyDataframe
+    AthenaQueryError, EmptyS3Object, LineTerminatorNotFound, EmptyDataframe, InvalidSerDe
 from awswrangler.utils import calculate_bounders
 from awswrangler import s3
 
@@ -26,6 +26,9 @@ def _get_bounders(dataframe, num_partitions):
 
 
 class Pandas:
+
+    VALID_CSV_SERDES = ["OpenCSVSerDe", "LazySimpleSerDe"]
+
     def __init__(self, session):
         self._session = session
 
@@ -427,15 +430,17 @@ def read_sql_athena(self,
                                 parse_dates=parse_timestamps,
                                 quoting=csv.QUOTE_ALL,
                                 max_result_size=max_result_size)
-            for col in parse_dates:
-                ret[col] = ret[col].dt.date
+            if len(ret.index) > 0:
+                for col in parse_dates:
+                    ret[col] = ret[col].dt.date
             return ret
 
     def to_csv(
             self,
             dataframe,
             path,
             sep=",",
+            serde="OpenCSVSerDe",
             database=None,
             table=None,
             partition_cols=None,
@@ -451,6 +456,7 @@ def to_csv(
         :param dataframe: Pandas Dataframe
         :param path: AWS S3 path (E.g. s3://bucket-name/folder_name/
         :param sep: Same as pandas.to_csv()
+        :param serde: SerDe library name (e.g. OpenCSVSerDe, LazySimpleSerDe)
         :param database: AWS Glue Database name
         :param table: AWS Glue table name
         :param partition_cols: List of columns names that will be partitions on S3
@@ -460,7 +466,11 @@ def to_csv(
         :param procs_io_bound: Number of cores used for I/O bound tasks
         :return: List of objects written on S3
         """
-        extra_args = {"sep": sep}
+        if serde not in Pandas.VALID_CSV_SERDES:
+            raise InvalidSerDe(
+                f"{serde} in not in the valid SerDe list ({Pandas.VALID_CSV_SERDES})"
+            )
+        extra_args = {"sep": sep, "serde": serde}
         return self.to_s3(dataframe=dataframe,
                           path=path,
                           file_format="csv",
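The final hunk below adjusts the CSV writer itself: OpenCSVSerDe gets csv.QUOTE_ALL, LazySimpleSerDe gets csv.QUOTE_NONE, and both use "\\" as the escape character. A standalone sketch of the payloads those settings produce with plain pandas (no S3 involved, values made up):

import csv
import pandas as pd

df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"]})

# OpenCSVSerDe path: quote every field.
open_csv = df.to_csv(None, header=False, index=False,
                     quoting=csv.QUOTE_ALL, escapechar="\\")
# '"1","foo"\n"2","bar"\n'

# LazySimpleSerDe path: no quoting at all.
lazy = df.to_csv(None, header=False, index=False,
                 quoting=csv.QUOTE_NONE, escapechar="\\")
# '1,foo\n2,bar\n'

print(open_csv, lazy, sep="")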
@@ -745,8 +755,17 @@ def write_csv_dataframe(dataframe,
                             fs,
                             extra_args=None):
         csv_extra_args = {}
-        if "sep" in extra_args:
-            csv_extra_args["sep"] = extra_args["sep"]
+        sep = extra_args.get("sep")
+        if sep is not None:
+            csv_extra_args["sep"] = sep
+        serde = extra_args.get("serde")
+        if serde is not None:
+            if serde == "OpenCSVSerDe":
+                csv_extra_args["quoting"] = csv.QUOTE_ALL
+                csv_extra_args["escapechar"] = "\\"
+            elif serde == "LazySimpleSerDe":
+                csv_extra_args["quoting"] = csv.QUOTE_NONE
+                csv_extra_args["escapechar"] = "\\"
         csv_buffer = bytes(
             dataframe.to_csv(None,
                              header=False,

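Taken together, Pandas.to_csv() now accepts a serde argument (defaulting to "OpenCSVSerDe") and threads it through extra_args to the Glue definitions and the CSV writer. A usage sketch assuming the library's Session entry point; bucket, database, and table names are placeholders:

import pandas as pd
import awswrangler

df = pd.DataFrame({"id": [1, 2], "name": ["foo", "bar"]})
session = awswrangler.Session()

# Default OpenCSVSerDe: every field quoted, all catalog columns typed as string.
session.pandas.to_csv(dataframe=df,
                      path="s3://my-bucket/opencsv-table/",  # placeholder
                      database="my_database",                # placeholder
                      table="opencsv_table")

# LazySimpleSerDe: no quoting, "\\" escape, and int/bigint/float/double
# columns keep their types in the Glue catalog.
session.pandas.to_csv(dataframe=df,
                      path="s3://my-bucket/lazy-table/",     # placeholder
                      sep="|",
                      serde="LazySimpleSerDe",
                      database="my_database",
                      table="lazy_table")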