
IO: Fix parquet read from s3 directory #33632


Merged: 27 commits, Apr 26, 2020
Changes from 8 commits
Commits (27)
aa94fe7
parquet init
alimcmaster1 Apr 18, 2020
a30c71a
Doc Str
alimcmaster1 Apr 18, 2020
b2747eb
Simplify read
alimcmaster1 Apr 18, 2020
a51757a
Fix writer with partition
alimcmaster1 Apr 18, 2020
968f3b6
Test case
alimcmaster1 Apr 18, 2020
789f4ca
Clean up test case
alimcmaster1 Apr 18, 2020
040763e
Add whatsnew
alimcmaster1 Apr 18, 2020
40f5889
Clean ups
alimcmaster1 Apr 18, 2020
753d647
Clean ups
alimcmaster1 Apr 18, 2020
e4dcdc3
Update whatsnew
alimcmaster1 Apr 18, 2020
bb21431
Add skip if no
alimcmaster1 Apr 18, 2020
fb38932
Fix import
alimcmaster1 Apr 18, 2020
c29befd
Removed fixed xfail
alimcmaster1 Apr 18, 2020
4f78fc5
remove import
alimcmaster1 Apr 18, 2020
4b2828b
Merge master
alimcmaster1 Apr 21, 2020
463c2ea
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 21, 2020
dabfe58
Add further test case
alimcmaster1 Apr 21, 2020
dea95f3
Update parquet.py
alimcmaster1 Apr 25, 2020
ae76e42
Update parquet.py
alimcmaster1 Apr 25, 2020
4b48326
Add whatsnew 2
alimcmaster1 Apr 26, 2020
211c36e
Rename var
alimcmaster1 Apr 26, 2020
4897a32
Improve get_fs_for_path docstring
alimcmaster1 Apr 26, 2020
bba4040
Merge remote-tracking branch 'origin/mcmali-parquet' into mcmali-parquet
alimcmaster1 Apr 26, 2020
5bc6327
Add doc example
alimcmaster1 Apr 26, 2020
ca89c21
Make whatsnew clearer
alimcmaster1 Apr 26, 2020
0df818e
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 26, 2020
2a1a85c
Merge remote-tracking branch 'upstream/master' into mcmali-parquet
alimcmaster1 Apr 26, 2020
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.0.3.rst
@@ -23,6 +23,9 @@ Fixed regressions
Bug fixes
~~~~~~~~~

**I/O**
- :func:`read_parquet` now supports an s3 directory (:issue:`26388`)

Contributors
~~~~~~~~~~~~

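For context, a minimal usage sketch of what this whatsnew entry enables: reading a partitioned parquet directory from s3 as a single DataFrame. The bucket and prefix below are hypothetical, and pyarrow plus s3fs with valid credentials are assumed.

import pandas as pd

# Hypothetical bucket/prefix; the directory typically holds several parquet
# files, e.g. written with partition_cols or by Spark/pyarrow.
df = pd.read_parquet("s3://my-bucket/path/to/parquet_dir")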
19 changes: 18 additions & 1 deletion pandas/io/common.py
@@ -158,6 +158,23 @@ def urlopen(*args, **kwargs):
return urllib.request.urlopen(*args, **kwargs)


def get_fs_for_path(filepath):
Contributor:
can you type this (and the return annotation)?

Member Author:
Left the return type off for now since it includes optional dependencies,
e.g. Union[s3fs.S3FileSystem, gcsfs.GCSFileSystem, None].

Can add imports to the TYPE_CHECKING block at the top if that's appropriate?

"""
Get appropriate filesystem given a filepath.
Support s3fs, gcs and local disk fs
Contributor:
can you make this a full docstring with Parameters / Returns?

Member Author:
Sure, done :)

"""
if is_s3_url(filepath):
from pandas.io import s3

return s3.get_fs()
elif is_gcs_url(filepath):
from pandas.io import gcs

return gcs.get_fs()
else:
return None


def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
@@ -192,7 +209,7 @@ def get_filepath_or_buffer(
compression = "gzip"
reader = BytesIO(req.read())
req.close()
return reader, encoding, compression, True
return reader, encoding, compression, True, None

if is_s3_url(filepath_or_buffer):
from pandas.io import s3
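A rough sketch of how this helper dispatches on the URL scheme (internal API, shown for illustration only; s3fs and gcsfs are optional dependencies):

from pandas.io.common import get_fs_for_path

get_fs_for_path("s3://pandas-test/parquet_dir")   # s3fs.S3FileSystem instance
get_fs_for_path("gcs://some-bucket/parquet_dir")  # gcsfs.GCSFileSystem instance
get_fs_for_path("/tmp/local/parquet_dir")         # None: local paths need no special filesystem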
6 changes: 5 additions & 1 deletion pandas/io/gcs.py
@@ -6,13 +6,17 @@
)


def get_fs():
return gcsfs.GCSFileSystem()


def get_filepath_or_buffer(
filepath_or_buffer, encoding=None, compression=None, mode=None
):

if mode is None:
mode = "rb"

fs = gcsfs.GCSFileSystem()
fs = get_fs()
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
return filepath_or_buffer, None, compression, True
28 changes: 15 additions & 13 deletions pandas/io/parquet.py
@@ -8,7 +8,12 @@

from pandas import DataFrame, get_option

from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
from pandas.io.common import (
get_filepath_or_buffer,
is_gcs_url,
is_s3_url,
get_fs_for_path,
)


def get_engine(engine: str) -> "BaseImpl":
@@ -92,8 +97,7 @@ def write(
**kwargs,
):
self.validate_dataframe(df)
path, _, _, _ = get_filepath_or_buffer(path, mode="wb")

file_obj, _, _, _ = get_filepath_or_buffer(path, mode="wb")
Member Author:
@jorisvandenbossche think we can clean up the write method here to get rid of get_filepath_or_buffer, similar to what I've done below for read. Will address in a different PR.

from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
if index is not None:
from_pandas_kwargs["preserve_index"] = index
@@ -108,18 +112,16 @@ def write(
**kwargs,
)
else:
self.api.parquet.write_table(table, path, compression=compression, **kwargs)
self.api.parquet.write_table(
table, file_obj, compression=compression, **kwargs
)

def read(self, path, columns=None, **kwargs):
path, _, _, should_close = get_filepath_or_buffer(path)

kwargs["use_pandas_metadata"] = True
result = self.api.parquet.read_table(
path, columns=columns, **kwargs
).to_pandas()
if should_close:
path.close()

parquet_ds = self.api.parquet.ParquetDataset(


@alimcmaster1

This change breaks clients that pass a file-like object for path. ParquetDataset doesn't provide the same file-like object handling that the original get_filepath_or_buffer did.

Here's the call stack I'm seeing:

.tox/test/lib/python3.7/site-packages/pandas/io/parquet.py:315: in read_parquet
    return impl.read(path, columns=columns, **kwargs)
.tox/test/lib/python3.7/site-packages/pandas/io/parquet.py:131: in read
    path, filesystem=get_fs_for_path(path), **kwargs
.tox/test/lib/python3.7/site-packages/pyarrow/parquet.py:1162: in __init__
    self.paths = _parse_uri(path_or_paths)
.tox/test/lib/python3.7/site-packages/pyarrow/parquet.py:47: in _parse_uri
    path = _stringify_path(path)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _


I filed bug report #34467
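A minimal sketch of the failure mode described in this comment, assuming the pyarrow engine: string paths still work, but a file-like object now reaches pyarrow's _stringify_path and fails.

import io

import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
buf = io.BytesIO()
df.to_parquet(buf)
buf.seek(0)

# Previously this round-tripped fine; with ParquetDataset the buffer cannot be
# converted to a string path and an error is raised (tracked in #34467).
pd.read_parquet(buf)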

path, filesystem=get_fs_for_path(path), **kwargs
Member:
Is this filesystem=get_fs_for_path(path) needed? What happens if you just pass the path? (which I assume has e.g. an s3://.. in it?)

Member Author:
pyarrow seems to only allow a file path as opposed to a dir path. Removing the filesystem arg here throws:

            for path in path_or_paths:
                if not fs.isfile(path):
                    raise IOError('Passed non-file path: {0}'
>                                 .format(path))
E                   OSError: Passed non-file path: s3://pandas-test/parquet_dir

../../../.conda/envs/pandas-dev/lib/python3.7/site-packages/pyarrow/parquet.py:1229: OSError

To repro, see the test case test_s3_roundtrip_for_dir I wrote below.

Member:
Ah, OK. I see now in pyarrow that apparently string URIs with "s3://..." are not supported (while "hdfs://" is supported). That's something we should fix on the pyarrow side as well. But of course until then this is fine.

)
kwargs["columns"] = columns
result = parquet_ds.read_pandas(**kwargs).to_pandas()
return result


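For readers following this thread, this is roughly what the new read path does for an s3 directory. It is a sketch only, using a hypothetical bucket and calling the internal helper directly.

import pyarrow.parquet as pq

from pandas.io.common import get_fs_for_path

path = "s3://pandas-test/parquet_dir"

# Passing the filesystem explicitly lets ParquetDataset treat the s3 prefix as
# a directory of parquet files instead of rejecting it as a non-file path.
dataset = pq.ParquetDataset(path, filesystem=get_fs_for_path(path))
df = dataset.read_pandas().to_pandas()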
8 changes: 6 additions & 2 deletions pandas/io/s3.py
@@ -16,6 +16,10 @@ def _strip_schema(url):
return result.netloc + result.path


def get_fs():
return s3fs.S3FileSystem(anon=False)


def get_file_and_filesystem(
filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None
) -> Tuple[IO, Any]:
@@ -24,7 +28,7 @@ def get_file_and_filesystem(
if mode is None:
mode = "rb"

fs = s3fs.S3FileSystem(anon=False)
fs = get_fs()
try:
file = fs.open(_strip_schema(filepath_or_buffer), mode)
except (FileNotFoundError, NoCredentialsError):
@@ -34,7 +38,7 @@ def get_file_and_filesystem(
# aren't valid for that bucket.
# A NoCredentialsError is raised if you don't have creds
# for that bucket.
fs = s3fs.S3FileSystem(anon=True)
fs = get_fs()
file = fs.open(_strip_schema(filepath_or_buffer), mode)
return file, fs

31 changes: 30 additions & 1 deletion pandas/tests/io/test_parquet.py
@@ -20,6 +20,7 @@
read_parquet,
to_parquet,
)
from pandas.io.s3 import get_fs as get_s3_fs

try:
import pyarrow # noqa
@@ -131,6 +132,7 @@ def check_round_trip(
read_kwargs=None,
expected=None,
check_names=True,
check_like=False,
repeat=2,
):
"""Verify parquet serializer and deserializer produce the same results.
@@ -150,6 +152,8 @@
Expected deserialization result, otherwise will be equal to `df`
check_names: list of str, optional
Closed set of column names to be compared
check_like: bool, optional
If True, ignore the order of index & columns.
repeat: int, optional
How many times to repeat the test
"""
@@ -169,7 +173,9 @@ def compare(repeat):
with catch_warnings(record=True):
actual = read_parquet(path, **read_kwargs)

tm.assert_frame_equal(expected, actual, check_names=check_names)
tm.assert_frame_equal(
expected, actual, check_names=check_names, check_like=check_like
)

if path is None:
with tm.ensure_clean() as path:
@@ -537,6 +543,29 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
# GH #19134
check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")

def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa):
# GH #26388
# https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
# As per pyarrow partitioned columns become 'categorical' dtypes
# and are added to back of dataframe on read

partition_col = "A"
expected_df = df_compat.copy()
expected_df[partition_col] = expected_df[partition_col].astype("category")
check_round_trip(
df_compat,
pa,
expected=expected_df,
path="s3://pandas-test/parquet_dir",
write_kwargs={
"partition_cols": partition_col,
"compression": None,
"filesystem": get_s3_fs(),
},
check_like=True,
repeat=1,
)

def test_partition_cols_supported(self, pa, df_full):
# GH #23283
partition_cols = ["bool", "int"]
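A hedged local illustration of what this test asserts: after a partitioned round trip the partition column comes back as a categorical appended at the end, which is why check_like=True is needed. The column names and target directory below are made up.

import pandas as pd

df = pd.DataFrame({"A": [1, 1, 2], "B": ["foo", "bar", "baz"]})
df.to_parquet("parquet_dir", partition_cols=["A"], compression=None)

result = pd.read_parquet("parquet_dir")
# result.columns is now Index(['B', 'A']) and result['A'].dtype is 'category'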