-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
restrict columns to read for pandas.read_parquet #18155
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
8c247c2
d00d222
c1449f5
22663e8
f31e6a2
f91f5f8
21c5f5e
ef30f39
54fc1c9
e5336b6
d6baa9d
7f6e7f6
4b22c88
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -109,7 +109,7 @@ I/O | |
^^^ | ||
|
||
- :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) | ||
- | ||
- :func:`read_parquet` now allows specifying the columns to read from a parquet file (:issue:`18154`) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can put this on 0.21.1. |
||
- | ||
|
||
Plotting | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy', | |
table, path, compression=compression, | ||
coerce_timestamps=coerce_timestamps, **kwargs) | ||
|
||
def read(self, path): | ||
def read(self, path, columns=None): | ||
path, _, _ = get_filepath_or_buffer(path) | ||
return self.api.parquet.read_table(path).to_pandas() | ||
return self.api.parquet.read_table(path, columns).to_pandas() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pass columns as a kwarg to read_table and to_pandas. |
||
|
||
|
||
class FastParquetImpl(object): | ||
|
@@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs): | |
self.api.write(path, df, | ||
compression=compression, **kwargs) | ||
|
||
def read(self, path): | ||
def read(self, path, columns=None): | ||
path, _, _ = get_filepath_or_buffer(path) | ||
return self.api.ParquetFile(path).to_pandas() | ||
return self.api.ParquetFile(path).to_pandas(columns) | ||
|
||
|
||
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): | ||
|
@@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): | |
return impl.write(df, path, compression=compression) | ||
|
||
|
||
def read_parquet(path, engine='auto', **kwargs): | ||
def read_parquet(path, engine='auto', columns=None, **kwargs): | ||
""" | ||
Load a parquet object from the file path, returning a DataFrame. | ||
|
||
|
@@ -188,6 +188,8 @@ def read_parquet(path, engine='auto', **kwargs): | |
---------- | ||
path : string | ||
File path | ||
columns : list, default None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a versionadded tag. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. |
||
If not None, only these columns will be read from the file. | ||
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' | ||
Parquet reader library to use. If 'auto', then the option | ||
'io.parquet.engine' is used. If 'auto', then the first | ||
|
@@ -201,4 +203,4 @@ def read_parquet(path, engine='auto', **kwargs): | |
""" | ||
|
||
impl = get_engine(engine) | ||
return impl.read(path) | ||
return impl.read(path, columns) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -282,6 +282,18 @@ def test_compression(self, engine, compression): | |
df = pd.DataFrame({'A': [1, 2, 3]}) | ||
self.check_round_trip(df, engine, compression=compression) | ||
|
||
def test_read_columns(self, engine, fp): | ||
# GH18154 | ||
df = pd.DataFrame({'string': list('abc'), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reference issue number above. |
||
'int': list(range(1, 4))}) | ||
|
||
with tm.ensure_clean() as path: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You don’t need the fp argument here: engine cycles through both engines. |
||
df.to_parquet(path, engine, compression=None) | ||
result = read_parquet(path, engine, columns=["string"]) | ||
|
||
expected = pd.DataFrame({'string': list('abc')}) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
class TestParquetPyArrow(Base): | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a small example in the docs in io.rst as well?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done 21c5f5e