From 61888b20bd9a0eedafefd3c2196879e83d0dc1f8 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Tue, 20 Aug 2019 00:30:21 +0700 Subject: [PATCH 01/14] TST: Update pyarrow tests to test parquet faithful roundtrip for Categorical support Fixes #27955 --- pandas/tests/io/test_parquet.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d634859e72d7b..938dfa7945fca 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1,5 +1,6 @@ """ test parquet compat """ import datetime +from distutils.version import LooseVersion import os from warnings import catch_warnings @@ -166,6 +167,7 @@ def compare(repeat): df.to_parquet(path, **write_kwargs) with catch_warnings(record=True): actual = read_parquet(path, **read_kwargs) + tm.assert_frame_equal(expected, actual, check_names=check_names) if path is None: @@ -453,9 +455,12 @@ def test_categorical(self, pa): # supported in >= 0.7.0 df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) - # de-serialized as object - expected = df.assign(a=df.a.astype(object)) - check_round_trip(df, pa, expected=expected) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"): + check_round_trip(df, pa) + else: + # de-serialized as object for pyarrow < 0.15 + expected = df.assign(a=df.a.astype(object)) + check_round_trip(df, pa, expected=expected) def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134 From 00282934a1a395eee53b1c8c115fecdcb23f02c0 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Tue, 20 Aug 2019 00:59:32 +0700 Subject: [PATCH 02/14] DOC: Update parquet docs regarding Categorical support Fixes #27955 --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5d7a268631778..93b2ed290f930 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4700,7 
+4700,7 @@ Several caveats. indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. * Index level names, if specified, must be strings. -* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. +* Categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as ``object`` dtype. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. From fdd2e3f7b027189aff833e7caa4d29cd41eb8418 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Tue, 20 Aug 2019 01:12:50 +0700 Subject: [PATCH 03/14] DOC: Add pyarrow test to test parquet faithful roundtrip for Categorical support Fixes #27955 --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0be4ebc627b30..10807489fff1f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -88,7 +88,7 @@ Categorical ^^^^^^^^^^^ - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) -- +- Added test to assert roundtripping to parquet with :func:`to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) - From ab9b082afcb6731973ea8985faab2cc7e975e738 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Wed, 21 Aug 2019 00:50:21 +0700 Subject: [PATCH 04/14] DOC: Add categorical column in example DataFrame Closes #27955 --- doc/source/user_guide/io.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 93b2ed290f930..9a3bf2fa751f6 100644 --- a/doc/source/user_guide/io.rst +++ 
b/doc/source/user_guide/io.rst @@ -4724,7 +4724,8 @@ See the documentation for `pyarrow <https://arrow.apache.org/docs/python/>`__ an 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, tz='US/Eastern')}) + 'g': pd.date_range('20130101', periods=3, tz='US/Eastern'), + 'h': pd.Categorical(list('abc'))}) df df.dtypes From cfc53ae517c9d99ed825e69b4b5484e4a10fc36d Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Wed, 21 Aug 2019 00:52:46 +0700 Subject: [PATCH 05/14] TST: Add test for null, out-of-order values, and unobserved category Closes #27955 --- pandas/tests/io/test_parquet.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 938dfa7945fca..21db24d73b0c0 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -453,7 +453,12 @@ def test_unsupported(self, pa): def test_categorical(self, pa): # supported in >= 0.7.0 - df = pd.DataFrame({"a": pd.Categorical(list("abc"))}) + df = pd.DataFrame() + df["a"] = pd.Categorical(list("abcdef")) + + # test for null, out-of-order values, and unobserved category + dtype = pd.CategoricalDtype(["foo", "bar", "baz"]) + df["b"] = pd.Categorical.from_codes(codes=[1, 0, 0, 1, -1, 1], dtype=dtype) if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"): check_round_trip(df, pa) From aebfae72e8b0da613e6e61b08084053c2f8f640e Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Wed, 21 Aug 2019 01:49:02 +0700 Subject: [PATCH 06/14] TST: Convert column b in expected to object for pyarrow < 0.15 Closes #27955 --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 21db24d73b0c0..f1dd98ce5d88f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -464,7 +464,7 @@ def test_categorical(self, pa): 
check_round_trip(df, pa) else: # de-serialized as object for pyarrow < 0.15 - expected = df.assign(a=df.a.astype(object)) + expected = df.assign(a=df.a.astype(object), b=df.b.astype(object)) check_round_trip(df, pa, expected=expected) def test_s3_roundtrip(self, df_compat, s3_resource, pa): From 791f1645d4114d6d7aadd09fafc5a7d419151048 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Sun, 25 Aug 2019 10:09:42 +0700 Subject: [PATCH 07/14] TST: Add test for ordered flag Closes #27955 --- pandas/tests/io/test_parquet.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f1dd98ce5d88f..26056ef8da046 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -460,11 +460,18 @@ def test_categorical(self, pa): dtype = pd.CategoricalDtype(["foo", "bar", "baz"]) df["b"] = pd.Categorical.from_codes(codes=[1, 0, 0, 1, -1, 1], dtype=dtype) + # test for ordered flag + df["c"] = pd.Categorical( + ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True + ) + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"): check_round_trip(df, pa) else: # de-serialized as object for pyarrow < 0.15 - expected = df.assign(a=df.a.astype(object), b=df.b.astype(object)) + expected = df.assign( + a=df.a.astype(object), b=df.b.astype(object), c=df.c.astype(object) + ) check_round_trip(df, pa, expected=expected) def test_s3_roundtrip(self, df_compat, s3_resource, pa): From 6c6b09dc7b0792c9896b8399ddf7a8800e40abd9 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Sun, 25 Aug 2019 11:15:37 +0700 Subject: [PATCH 08/14] TST: Add more rows for ordered flag test Closes #27955 --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 26056ef8da046..b6bb0373b93aa 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -462,7 
+462,7 @@ def test_categorical(self, pa): # test for ordered flag df["c"] = pd.Categorical( - ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True + ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True ) if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"): From 1b0a444d9e5f1df047c77ebf47a191d161748871 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Tue, 27 Aug 2019 22:45:53 +0700 Subject: [PATCH 09/14] DOC: Change to_parquet to DataFrame.to_parquet Closes #27955 --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 87edb5b154dd7..a6685da652505 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -88,7 +88,7 @@ Categorical ^^^^^^^^^^^ - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) -- Added test to assert roundtripping to parquet with :func:`to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) +- Added test to assert roundtripping to parquet with :func:`DataFrame.to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) - From bf32a802ae8ea38d103e656f61cad7ad3650bc65 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Tue, 27 Aug 2019 22:48:36 +0700 Subject: [PATCH 10/14] TST: Replace from_codes Closes #27955 --- pandas/tests/io/test_parquet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b6bb0373b93aa..e4e208069a432 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -457,8 +457,10 @@ def test_categorical(self, pa): df["a"] = pd.Categorical(list("abcdef")) # test for null, out-of-order values, and unobserved category - dtype = pd.CategoricalDtype(["foo", "bar", 
"baz"]) - df["b"] = pd.Categorical.from_codes(codes=[1, 0, 0, 1, -1, 1], dtype=dtype) + df["b"] = pd.Categorical( + ["bar", "foo", "foo", "bar", None, "bar"], + dtype=pd.CategoricalDtype(["foo", "bar", "baz"]), + ) # test for ordered flag df["c"] = pd.Categorical( From 159b27085bf8c6b8dc3c8936cfa3037a7d6e29ad Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Tue, 27 Aug 2019 22:50:21 +0700 Subject: [PATCH 11/14] TST: Simplify expected conversion to object Closes #27955 --- pandas/tests/io/test_parquet.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e4e208069a432..d4f2a5bb3deaf 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -471,9 +471,7 @@ def test_categorical(self, pa): check_round_trip(df, pa) else: # de-serialized as object for pyarrow < 0.15 - expected = df.assign( - a=df.a.astype(object), b=df.b.astype(object), c=df.c.astype(object) - ) + expected = df.astype(object) check_round_trip(df, pa, expected=expected) def test_s3_roundtrip(self, df_compat, s3_resource, pa): From 421107133044d9bc7f43a3b65c95550d6a74f05f Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Thu, 29 Aug 2019 17:27:28 +0700 Subject: [PATCH 12/14] DOC: Update doc to clarify the differences between pyarrow and fastparquet Closes #27955 --- doc/source/user_guide/io.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 09181be93181b..cbad7fa490449 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4702,7 +4702,8 @@ Several caveats. indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. * Index level names, if specified, must be strings. 
-* Categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as ``object`` dtype. +* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. +* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. From a7c414d1084d8bf1672c286cba2d4ee7c517e8a4 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Thu, 29 Aug 2019 18:32:56 +0700 Subject: [PATCH 13/14] DOC: Remove whitespace --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cbad7fa490449..8a7e840db9c72 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4702,7 +4702,7 @@ Several caveats. indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. * Index level names, if specified, must be strings. -* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. +* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message on an attempt at serialization. 
From bd08e1668818a416a73c7dfb3d084659038c8ab9 Mon Sep 17 00:00:00 2001 From: Galuh Sahid Date: Sat, 5 Oct 2019 00:48:06 +0700 Subject: [PATCH 14/14] DOC: Add ordered categorical --- doc/source/user_guide/io.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fade28f52e673..ee097c1f4d5e8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4736,7 +4736,8 @@ See the documentation for `pyarrow <https://arrow.apache.org/docs/python/>`__ an 'e': [True, False, True], 'f': pd.date_range('20130101', periods=3), 'g': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'h': pd.Categorical(list('abc'))}) + 'h': pd.Categorical(list('abc')), + 'i': pd.Categorical(list('abc'), ordered=True)}) df df.dtypes