From 3995d5a07bd6aaac76ad442c5c8788141aba1a22 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 18:58:17 -0500
Subject: [PATCH 01/11] Added check for string in partition_cols

---
 pandas/io/parquet.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6fc70e9f4a737..685a9a637cbca 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -244,6 +244,8 @@ def to_parquet(
     kwargs
         Additional keyword arguments passed to the engine
     """
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
     impl = get_engine(engine)
     return impl.write(
         df,

From 93c461321c9a99c92d7b9ec70681f50e39810a80 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 19:19:08 -0500
Subject: [PATCH 02/11] Added unit tests for PyArrow and FastParquet where
 partition_cols is a string

---
 pandas/tests/io/test_parquet.py | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d634859e72d7b..f58ae25003b99 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -330,7 +330,7 @@ def test_write_index(self, engine):
         # non-default index
         for index in indexes:
             df.index = index
-            check_round_trip(df, engine, check_names=check_names)
+            check_round_trip(df, engine, check_names=chetest_partition_cols_supportedck_names)
 
         # index with meta-data
         df.index = [0, 1, 2]
@@ -416,7 +416,7 @@ def test_basic_subset_columns(self, pa, df_full):
         # GH18628
         df = df_full
 
-        # additional supported types for pyarrow
+        # additional supported types for pyarrowtest_partition_cols_supported
         df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
 
         check_round_trip(
@@ -473,6 +473,18 @@ def test_partition_cols_supported(self, pa, df_full):
             assert len(dataset.partitions.partition_names) == 2
             assert dataset.partitions.partition_names == set(partition_cols)
 
+    def test_partition_cols_string(self, pa, df_full):
+        # GH #23283
+        partition_cols = 'bool'
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols, compression=None)
+            import pyarrow.parquet as pq
+
+            dataset = pq.ParquetDataset(path, validate_schema=False)
+            assert len(dataset.partitions.partition_names) == 1
+            assert dataset.partitions.partition_names == set([partition_cols])
+
     def test_empty_dataframe(self, pa):
         # GH #27339
         df = pd.DataFrame()
@@ -543,6 +555,23 @@ def test_partition_cols_supported(self, fp, df_full):
             actual_partition_cols = fastparquet.ParquetFile(path, False).cats
             assert len(actual_partition_cols) == 2
 
+    def test_partition_cols_string(self, fp, df_full):
+        # GH #23283
+        partition_cols = 'bool'
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(
+                path,
+                engine="fastparquet",
+                partition_cols=partition_cols,
+                compression=None,
+            )
+            assert os.path.exists(path)
+            import fastparquet  # noqa: F811
+
+            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+            assert len(actual_partition_cols) == 1
+
     def test_partition_on_supported(self, fp, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
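The isinstance check in PATCH 01 makes a bare column name behave exactly like a
one-element list. A minimal usage sketch of the behavior the series enables,
assuming a pandas build with this change applied, an installed parquet engine
(pyarrow or fastparquet), and illustrative output paths:

    import pandas as pd

    df = pd.DataFrame({"bool": [True, False, True], "value": [1, 2, 3]})

    # With the new check, these two calls produce the same partitioned dataset:
    df.to_parquet("out_from_string", partition_cols="bool", compression=None)
    df.to_parquet("out_from_list", partition_cols=["bool"], compression=None)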
From afd10c467ecf41f673646267ecf5c0e62d16dfed Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 19:55:48 -0500
Subject: [PATCH 03/11] Added docstring to to_parquet function

---
 pandas/core/frame.py | 7 ++++---
 pandas/io/parquet.py | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 603a615c1f8cb..627acea4951f9 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2156,9 +2156,10 @@ def to_parquet(
 
             .. versionadded:: 0.24.0
 
-        partition_cols : list, optional, default None
-            Column names by which to partition the dataset
-            Columns are partitioned in the order they are given
+        partition_cols : list or string, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            A string is interpreted as a single column name.
 
             .. versionadded:: 0.24.0
 
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 685a9a637cbca..c01163cfd237c 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -235,9 +235,10 @@ def to_parquet(
 
        .. versionadded:: 0.24.0
 
-    partition_cols : list, optional, default None
-        Column names by which to partition the dataset
-        Columns are partitioned in the order they are given
+    partition_cols : list or string, optional, default None
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        A string is interpreted as a single column name.
 
        .. versionadded:: 0.24.0
 

From ec927c36760ae0165ea23581c52ca858b69a464c Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 20:22:20 -0500
Subject: [PATCH 04/11] Deleted mistakenly added code

---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index f58ae25003b99..830e93104c110 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -330,7 +330,7 @@ def test_write_index(self, engine):
         # non-default index
         for index in indexes:
             df.index = index
-            check_round_trip(df, engine, check_names=chetest_partition_cols_supportedck_names)
+            check_round_trip(df, engine, check_names=check_names)
 
         # index with meta-data
         df.index = [0, 1, 2]
@@ -416,7 +416,7 @@ def test_basic_subset_columns(self, pa, df_full):
         # GH18628
         df = df_full
 
-        # additional supported types for pyarrowtest_partition_cols_supported
+        # additional supported types for pyarrow
         df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
 
         check_round_trip(

From 209eb7e8202b6a0ef9b5f1e2be6bef7850924634 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sat, 17 Aug 2019 20:41:15 -0500
Subject: [PATCH 05/11] format with black pandas

---
 pandas/tests/io/test_parquet.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 830e93104c110..06e8c63ff0aed 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -475,7 +475,7 @@ def test_partition_cols_supported(self, pa, df_full):
 
     def test_partition_cols_string(self, pa, df_full):
         # GH #23283
-        partition_cols = 'bool'
+        partition_cols = "bool"
         df = df_full
         with tm.ensure_clean_dir() as path:
             df.to_parquet(path, partition_cols=partition_cols, compression=None)
@@ -557,7 +557,7 @@ def test_partition_cols_supported(self, fp, df_full):
 
     def test_partition_cols_string(self, fp, df_full):
         # GH #23283
-        partition_cols = 'bool'
+        partition_cols = "bool"
         df = df_full
         with tm.ensure_clean_dir() as path:
             df.to_parquet(
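For context on what the new tests assert: both engines write one directory per
value of the partition column (hive-style layout), so partitioning on the
single column "bool" yields one subdirectory per boolean value. A small sketch,
reusing the hypothetical "out_from_string" path written in the earlier example:

    import os

    # Hive-style layout: one directory per value of the partition column,
    # e.g. ['bool=False', 'bool=True'] (exact value formatting can vary by
    # engine and version)
    print(sorted(os.listdir("out_from_string")))

This is why pyarrow's dataset.partitions.partition_names is the one-element
set {"bool"} and fastparquet's ParquetFile(path, False).cats has length 1.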
From 270e2c5a4175c75bf884c84f7ec91b87e1ac7ab5 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sun, 18 Aug 2019 10:09:07 -0500
Subject: [PATCH 06/11] Fix issue from flake8

---
 pandas/tests/io/test_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 06e8c63ff0aed..961b1020515ee 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -483,7 +483,7 @@ def test_partition_cols_string(self, pa, df_full):
 
             dataset = pq.ParquetDataset(path, validate_schema=False)
             assert len(dataset.partitions.partition_names) == 1
-            assert dataset.partitions.partition_names == set([partition_cols])
+            assert dataset.partitions.partition_names == set((partition_cols))
 
     def test_empty_dataframe(self, pa):
         # GH #27339

From d200305c576a419e36e49e3ac5a47a4f8a6ed3dc Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sun, 18 Aug 2019 11:00:56 -0500
Subject: [PATCH 07/11] Fix issue with earlier commit

---
 pandas/tests/io/test_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 961b1020515ee..52f6fbe0bb6ac 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -483,7 +483,7 @@ def test_partition_cols_string(self, pa, df_full):
 
             dataset = pq.ParquetDataset(path, validate_schema=False)
             assert len(dataset.partitions.partition_names) == 1
-            assert dataset.partitions.partition_names == set((partition_cols))
+            assert dataset.partitions.partition_names == [partition_cols]
 
     def test_empty_dataframe(self, pa):
         # GH #27339

From 84ae85b32ec7b3beff35b50b768b80f41e2d8313 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sun, 18 Aug 2019 11:12:17 -0500
Subject: [PATCH 08/11] Add partition_cols_list variable

---
 pandas/tests/io/test_parquet.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 52f6fbe0bb6ac..0b2d3a07980fa 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -476,6 +476,7 @@ def test_partition_cols_supported(self, pa, df_full):
     def test_partition_cols_string(self, pa, df_full):
         # GH #23283
         partition_cols = "bool"
+        partition_cols_list = [partition_cols]
         df = df_full
         with tm.ensure_clean_dir() as path:
             df.to_parquet(path, partition_cols=partition_cols, compression=None)
@@ -483,7 +484,7 @@ def test_partition_cols_string(self, pa, df_full):
 
             dataset = pq.ParquetDataset(path, validate_schema=False)
             assert len(dataset.partitions.partition_names) == 1
-            assert dataset.partitions.partition_names == [partition_cols]
+            assert dataset.partitions.partition_names == set(partition_cols_list)
 
     def test_empty_dataframe(self, pa):
         # GH #27339
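Patches 06 through 08 circle a small Python pitfall that is worth spelling out:
parentheses without a comma do not create a tuple, so set((partition_cols)) is
just set(partition_cols), which iterates a string into its characters; and a
set never compares equal to a list. A short pure-Python illustration:

    partition_cols = "bool"

    assert set((partition_cols)) == {"b", "o", "l"}  # parens are not a tuple
    assert set((partition_cols,)) == {"bool"}        # a one-tuple would have worked
    assert set([partition_cols]) == {"bool"}         # the original, correct form
    assert {"bool"} != ["bool"]                      # set vs. list: never equal

    # The flake8 complaint in PATCH 06 was presumably a set-literal suggestion
    # (flake8-comprehensions style): {partition_cols} instead of
    # set([partition_cols]), which PATCH 08 effectively restores via
    # set(partition_cols_list).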
From 22075e7ff0e26b47b80f1947ac5fe7e5012fe0b6 Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Sun, 18 Aug 2019 11:28:20 -0500
Subject: [PATCH 09/11] Add whatsnew entry

---
 doc/source/whatsnew/v0.25.1.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 34b149a6b8261..ef6a47340378a 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -105,6 +105,7 @@ I/O
 
 - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
 - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
+- String support for parameter partition_cols in :func:`pandas.to_parquet` (:issue:`27117`)
 -
 
 Plotting

From 1d4c3d03d62a5e9dfdcfedb90c80e419e722392e Mon Sep 17 00:00:00 2001
From: Dalynn Hatch
Date: Tue, 20 Aug 2019 18:04:40 -0500
Subject: [PATCH 10/11] Move to 1.0.0 whatsnew and add a versionchanged to
 docstring

---
 doc/source/whatsnew/v0.25.1.rst | 3 +--
 doc/source/whatsnew/v1.0.0.rst  | 2 +-
 pandas/core/frame.py            | 5 +++++
 pandas/io/parquet.py            | 5 +++++
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index ef6a47340378a..9213de7293e83 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -105,8 +105,7 @@ I/O
 
 - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
 - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
-- String support for parameter partition_cols in :func:`pandas.to_parquet` (:issue:`27117`)
--
+-
 
 Plotting
 ^^^^^^^^
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0be4ebc627b30..e504a29748b38 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -29,7 +29,7 @@ Enhancements
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
--
+- String support for parameter partition_cols in :func:`pandas.to_parquet` (:issue:`27117`)
 -
 
 .. _whatsnew_1000.api_breaking:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 627acea4951f9..ca63e7452b873 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2167,6 +2167,11 @@ def to_parquet(
             Additional arguments passed to the parquet library. See
             :ref:`pandas io <io.parquet>` for more details.
 
+            .. versionchanged:: 1.0.0
+
+            partition_cols
+                Added ability to pass in a string for a single column name
+
         See Also
         --------
         read_parquet : Read a parquet file.
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index c01163cfd237c..acf97e4b7a161 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -244,6 +244,11 @@ def to_parquet(
 
     kwargs
         Additional keyword arguments passed to the engine
+
+    .. versionchanged:: 1.0.0
+
+       partition_cols
+           Added ability to pass in a string for a single column name
     """
     if isinstance(partition_cols, str):
         partition_cols = [partition_cols]

From c30029e2c7b90acdaf7fc479fa063694a6eb0e4a Mon Sep 17 00:00:00 2001
From: DalynnHatch <37126026+DalynnHatch@users.noreply.github.com>
Date: Tue, 27 Aug 2019 10:56:18 -0500
Subject: [PATCH 11/11] Update doc/source/whatsnew/v0.25.1.rst

Co-Authored-By: Joris Van den Bossche
---
 doc/source/whatsnew/v0.25.1.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
index 9213de7293e83..34b149a6b8261 100644
--- a/doc/source/whatsnew/v0.25.1.rst
+++ b/doc/source/whatsnew/v0.25.1.rst
@@ -105,7 +105,7 @@ I/O
 
 - Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`)
 - Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`)
--
+-
 
 Plotting
 ^^^^^^^^
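Taken together, the series lets a single column name round-trip end to end. A
hedged sketch of the final behavior, assuming the pyarrow engine and an
illustrative output path (with pyarrow, the partition column typically comes
back dictionary-encoded, i.e. as a categorical):

    import pandas as pd

    df = pd.DataFrame({"bool": [True, False], "value": [1, 2]})
    df.to_parquet("partitioned_out", partition_cols="bool", compression=None)

    # Reading the directory reconstructs the frame, partition column included
    result = pd.read_parquet("partitioned_out")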