
restrict columns to read for pandas.read_parquet #18155


Merged: 13 commits, Nov 8, 2017
14 changes: 8 additions & 6 deletions pandas/io/parquet.py
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy',
             table, path, compression=compression,
             coerce_timestamps=coerce_timestamps, **kwargs)

-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path).to_pandas()
+        return self.api.parquet.read_table(path, columns).to_pandas()
Review comment (Contributor): pass columns as a kwarg to read_table and to_pandas
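For clarity, a sketch of what that suggestion looks like in each engine's read method (keyword form; the final committed code may differ):

    # pyarrow engine: pass columns by keyword to read_table
    return self.api.parquet.read_table(path, columns=columns).to_pandas()

    # fastparquet engine: pass columns by keyword to to_pandas
    return self.api.ParquetFile(path).to_pandas(columns=columns)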



class FastParquetImpl(object):
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs):
             self.api.write(path, df,
                            compression=compression, **kwargs)

-    def read(self, path):
+    def read(self, path, columns=None):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas()
+        return self.api.ParquetFile(path).to_pandas(columns)


def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     return impl.write(df, path, compression=compression)


-def read_parquet(path, engine='auto', **kwargs):
+def read_parquet(path, engine='auto', columns=None, **kwargs):
     """
     Load a parquet object from the file path, returning a DataFrame.
@@ -188,6 +188,8 @@ def read_parquet(path, engine='auto', **kwargs):
     ----------
     path : string
         File path
+    columns: list
Review comment (Member): Write out what the default is too, i.e. "list, default None"
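That is, the parameter entry would read (a sketch):

    columns : list, default None
        If not None, only these columns will be read from the file.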

+        If not None, only these columns will be read from the file.
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet reader library to use. If 'auto', then the option
         'io.parquet.engine' is used. If 'auto', then the first
@@ -201,4 +203,4 @@
     """

     impl = get_engine(engine)
-    return impl.read(path)
+    return impl.read(path, columns)
Review comment (Contributor): same as above: pass columns as a kwarg here as well
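For illustration, a minimal usage sketch of the new keyword once the change lands (the file name is hypothetical):

    import pandas as pd

    # read only the 'string' column instead of the whole file
    df = pd.read_parquet('data.parquet', columns=['string'])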

11 changes: 11 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -282,6 +282,17 @@ def test_compression(self, engine, compression):
         df = pd.DataFrame({'A': [1, 2, 3]})
         self.check_round_trip(df, engine, compression=compression)

+    def test_read_columns(self, engine, fp):
+        df = pd.DataFrame({'string': list('abc'),
Review comment (Member): Reference issue number above.

+                           'int': list(range(1, 4))})
+
+        with tm.ensure_clean() as path:
Review comment (Contributor): you don't need the fp argument here: engine cycles through both engines. Use check_round_trip; pass in the expected (and the columns kwarg).

+            df.to_parquet(path, engine, compression=None)
+            result = read_parquet(path, engine, columns=["string"])
+
+            expected = pd.DataFrame({'string': list('abc')})
+            tm.assert_frame_equal(result, expected)
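Following the review comment above, a sketch of the test reworked around check_round_trip; whether the helper accepts an expected frame and forwards read kwargs such as columns is an assumption here:

    def test_read_columns(self, engine):
        # GH xxxxx  <- placeholder: reference the issue number, per review
        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # helper writes df, reads it back with columns=..., and
        # compares the result against expected (assumed behavior)
        expected = pd.DataFrame({'string': list('abc')})
        self.check_round_trip(df, engine, expected=expected,
                              columns=['string'])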


class TestParquetPyArrow(Base):
