Skip to content

Commit 57d056a

Browse files
REG: Fix read_parquet from file-like objects (pandas-dev#34500)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 08febd2 commit 57d056a

File tree

3 files changed

+32
-5
lines changed

3 files changed

+32
-5
lines changed

pandas/io/parquet.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -122,11 +122,20 @@ def write(
122122
file_obj_or_path.close()
123123

124124
def read(self, path, columns=None, **kwargs):
125-
parquet_ds = self.api.parquet.ParquetDataset(
126-
path, filesystem=get_fs_for_path(path), **kwargs
127-
)
128-
kwargs["columns"] = columns
129-
result = parquet_ds.read_pandas(**kwargs).to_pandas()
125+
fs = get_fs_for_path(path)
126+
should_close = None
127+
# Avoid calling get_filepath_or_buffer for s3/gcs URLs since
128+
# it returns an S3File which doesn't support dir reads in arrow
129+
if not fs:
130+
path, _, _, should_close = get_filepath_or_buffer(path)
131+
132+
kwargs["use_pandas_metadata"] = True
133+
result = self.api.parquet.read_table(
134+
path, columns=columns, filesystem=fs, **kwargs
135+
).to_pandas()
136+
if should_close:
137+
path.close()
138+
130139
return result
131140

132141

2.11 KB
Binary file not shown.

pandas/tests/io/test_parquet.py

+18
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
""" test parquet compat """
22
import datetime
33
from distutils.version import LooseVersion
4+
from io import BytesIO
45
import os
56
from warnings import catch_warnings
67

@@ -567,6 +568,23 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
567568
repeat=1,
568569
)
569570

571+
@tm.network
572+
@td.skip_if_no("pyarrow")
573+
def test_parquet_read_from_url(self, df_compat):
574+
url = (
575+
"https://raw.githubusercontent.com/pandas-dev/pandas/"
576+
"master/pandas/tests/io/data/parquet/simple.parquet"
577+
)
578+
df = pd.read_parquet(url)
579+
tm.assert_frame_equal(df, df_compat)
580+
581+
@td.skip_if_no("pyarrow")
582+
def test_read_file_like_obj_support(self, df_compat):
583+
buffer = BytesIO()
584+
df_compat.to_parquet(buffer)
585+
df_from_buf = pd.read_parquet(buffer)
586+
tm.assert_frame_equal(df_compat, df_from_buf)
587+
570588
def test_partition_cols_supported(self, pa, df_full):
571589
# GH #23283
572590
partition_cols = ["bool", "int"]

0 commit comments

Comments
 (0)