Commit 278b7c9

Improving Spark.read_csv tests
1 parent: 65904ce

File tree

2 files changed: +34 -8 lines changed

awswrangler/redshift.py

Lines changed: 4 additions & 2 deletions

@@ -341,9 +341,11 @@ def _get_redshift_schema(dataframe, dataframe_type, preserve_index=False):
             dtype = str(dataframe.index.dtype)
             redshift_type = Redshift._type_pandas2redshift(dtype)
             schema_built.append((name, redshift_type))
-        for col, dtype in dataframe.dtypes:
+        for col in dataframe.columns:
+            name = str(col)
+            dtype = str(dataframe[name].dtype)
             redshift_type = Redshift._type_pandas2redshift(dtype)
-            schema_built.append((col, redshift_type))
+            schema_built.append((name, redshift_type))
     elif dataframe_type == "spark":
         for name, dtype in dataframe.dtypes:
             redshift_type = Redshift._type_spark2redshift(dtype)
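
Why the redshift.py hunk is needed: pandas exposes DataFrame.dtypes as a
Series keyed by column name, and iterating a Series yields its values (the
dtype objects), not (column, dtype) pairs, so the old tuple-unpacking loop
raises a TypeError. A minimal sketch, using plain pandas and hypothetical
sample data (not part of the commit), reproducing the failure and the
column-based pattern the commit switches to:

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})

# Old pattern -- broken: iterating a Series yields dtype objects,
# which cannot be unpacked into (col, dtype):
# for col, dtype in df.dtypes:  # TypeError: dtype object is not iterable
#     ...

# New pattern from the commit: walk the columns, look up each dtype,
# and coerce the column name to str before building the schema.
schema_built = []
for col in df.columns:
    name = str(col)
    dtype = str(df[name].dtype)
    schema_built.append((name, dtype))

print(schema_built)  # [('id', 'int64'), ('name', 'object')]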

testing/test_awswrangler/test_spark.py

Lines changed: 30 additions & 6 deletions

@@ -39,9 +39,33 @@ def bucket(session, cloudformation_outputs):
     session.s3.delete_objects(path=f"s3://{bucket}/")


-def test_read_csv(session, bucket):
-    boto3.client("s3").upload_file("data_samples/small.csv", bucket,
-                                   "data_samples/small.csv")
-    path = f"s3://{bucket}/data_samples/small.csv"
-    dataframe = session.spark.read_csv(path=path)
-    assert dataframe.count() == 100
+@pytest.mark.parametrize(
+    "sample_name",
+    ["nano", "micro", "small"],
+)
+def test_read_csv(session, bucket, sample_name):
+    path = f"data_samples/{sample_name}.csv"
+    if sample_name == "micro":
+        schema = "id SMALLINT, name STRING, value FLOAT, date TIMESTAMP"
+        timestamp_format = "yyyy-MM-dd"
+    elif sample_name == "small":
+        schema = "id BIGINT, name STRING, date DATE"
+        timestamp_format = "dd-MM-yy"
+    elif sample_name == "nano":
+        schema = "id INTEGER, name STRING, value DOUBLE, date TIMESTAMP, time TIMESTAMP"
+        timestamp_format = "yyyy-MM-dd"
+    dataframe = session.spark.read_csv(path=path,
+                                       schema=schema,
+                                       timestampFormat=timestamp_format,
+                                       dateFormat=timestamp_format,
+                                       header=True)
+
+    boto3.client("s3").upload_file(path, bucket, path)
+    path2 = f"s3://{bucket}/{path}"
+    dataframe2 = session.spark.read_csv(path=path2,
+                                        schema=schema,
+                                        timestampFormat=timestamp_format,
+                                        dateFormat=timestamp_format,
+                                        header=True)
+    assert dataframe.count() == dataframe2.count()
+    assert len(list(dataframe.columns)) == len(list(dataframe2.columns))
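
For context on the options the test exercises: Spark's CSV reader accepts a
DDL-formatted string as the schema, along with timestampFormat, dateFormat,
and header options, which is presumably what session.spark.read_csv forwards
to under the hood. A hedged sketch in plain PySpark (the wrapper-free
equivalent, assuming the small.csv layout implied by the test):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# DDL schema strings like the ones built in the test are accepted
# directly by DataFrameReader.csv.
df = spark.read.csv("data_samples/small.csv",
                    schema="id BIGINT, name STRING, date DATE",
                    timestampFormat="dd-MM-yy",
                    dateFormat="dd-MM-yy",
                    header=True)
df.printSchema()

Because of @pytest.mark.parametrize, the rewritten test expands into three
cases (nano, micro, small), each reading the sample both from the local path
and from S3 and asserting the two DataFrames agree on row and column counts.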
