Skip to content

Commit 5943291

Browse files
hoffmann authored and No-Stream committed
restrict columns to read for pandas.read_parquet (pandas-dev#18155)
1 parent 5541736 commit 5943291

File tree

4 files changed

+32
-8
lines changed

4 files changed

+32
-8
lines changed

doc/source/io.rst

+10
Original file line numberDiff line numberDiff line change
@@ -4538,6 +4538,16 @@ Read from a parquet file.
45384538
45394539
result.dtypes
45404540
4541+
Read only certain columns of a parquet file.
4542+
4543+
.. ipython:: python
4544+
4545+
result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b'])
4546+
result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b'])
4547+
4548+
result.dtypes
4549+
4550+
45414551
.. ipython:: python
45424552
:suppress:
45434553

doc/source/whatsnew/v0.21.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ I/O
8282
- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
8383
- Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`)
8484
- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
85+
- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
8586

8687
Plotting
8788
^^^^^^^^

pandas/io/parquet.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
7676
table, path, compression=compression,
7777
coerce_timestamps=coerce_timestamps, **kwargs)
7878

79-
def read(self, path):
79+
def read(self, path, columns=None):
8080
path, _, _ = get_filepath_or_buffer(path)
81-
return self.api.parquet.read_table(path).to_pandas()
81+
return self.api.parquet.read_table(path, columns=columns).to_pandas()
8282

8383

8484
class FastParquetImpl(object):
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
115115
self.api.write(path, df,
116116
compression=compression, **kwargs)
117117

118-
def read(self, path):
118+
def read(self, path, columns=None):
119119
path, _, _ = get_filepath_or_buffer(path)
120-
return self.api.ParquetFile(path).to_pandas()
120+
return self.api.ParquetFile(path).to_pandas(columns=columns)
121121

122122

123123
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
178178
return impl.write(df, path, compression=compression)
179179

180180

181-
def read_parquet(path, engine='auto', **kwargs):
181+
def read_parquet(path, engine='auto', columns=None, **kwargs):
182182
"""
183183
Load a parquet object from the file path, returning a DataFrame.
184184
@@ -188,6 +188,10 @@ def read_parquet(path, engine='auto', **kwargs):
188188
----------
189189
path : string
190190
File path
191+
columns : list, default None
192+
If not None, only these columns will be read from the file.
193+
194+
.. versionadded:: 0.21.1
191195
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
192196
Parquet reader library to use. If 'auto', then the option
193197
'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +205,4 @@ def read_parquet(path, engine='auto', **kwargs):
201205
"""
202206

203207
impl = get_engine(engine)
204-
return impl.read(path)
208+
return impl.read(path, columns=columns)

pandas/tests/io/test_parquet.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -192,15 +192,15 @@ def check_round_trip(self, df, engine, expected=None, **kwargs):
192192

193193
with tm.ensure_clean() as path:
194194
df.to_parquet(path, engine, **kwargs)
195-
result = read_parquet(path, engine)
195+
result = read_parquet(path, engine, **kwargs)
196196

197197
if expected is None:
198198
expected = df
199199
tm.assert_frame_equal(result, expected)
200200

201201
# repeat
202202
to_parquet(df, path, engine, **kwargs)
203-
result = pd.read_parquet(path, engine)
203+
result = pd.read_parquet(path, engine, **kwargs)
204204

205205
if expected is None:
206206
expected = df
@@ -282,6 +282,15 @@ def test_compression(self, engine, compression):
282282
df = pd.DataFrame({'A': [1, 2, 3]})
283283
self.check_round_trip(df, engine, compression=compression)
284284

285+
def test_read_columns(self, engine):
286+
# GH18154
287+
df = pd.DataFrame({'string': list('abc'),
288+
'int': list(range(1, 4))})
289+
290+
expected = pd.DataFrame({'string': list('abc')})
291+
self.check_round_trip(df, engine, expected=expected,
292+
compression=None, columns=["string"])
293+
285294

286295
class TestParquetPyArrow(Base):
287296

0 commit comments

Comments (0)