From 61888b20bd9a0eedafefd3c2196879e83d0dc1f8 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Tue, 20 Aug 2019 00:30:21 +0700
Subject: [PATCH 01/14] TST: Update pyarrow tests to test parquet faithful
 roundtrip for Categorical support

Fixes #27955
---
 pandas/tests/io/test_parquet.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index d634859e72d7b..938dfa7945fca 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -1,5 +1,6 @@
 """ test parquet compat """
 import datetime
+from distutils.version import LooseVersion
 import os
 from warnings import catch_warnings
 
@@ -166,6 +167,7 @@ def compare(repeat):
             df.to_parquet(path, **write_kwargs)
             with catch_warnings(record=True):
                 actual = read_parquet(path, **read_kwargs)
+
             tm.assert_frame_equal(expected, actual, check_names=check_names)
 
     if path is None:
@@ -453,9 +455,12 @@ def test_categorical(self, pa):
         # supported in >= 0.7.0
         df = pd.DataFrame({"a": pd.Categorical(list("abc"))})
 
-        # de-serialized as object
-        expected = df.assign(a=df.a.astype(object))
-        check_round_trip(df, pa, expected=expected)
+        if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"):
+            check_round_trip(df, pa)
+        else:
+            # de-serialized as object for pyarrow < 0.15
+            expected = df.assign(a=df.a.astype(object))
+            check_round_trip(df, pa, expected=expected)
 
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134

From 00282934a1a395eee53b1c8c115fecdcb23f02c0 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Tue, 20 Aug 2019 00:59:32 +0700
Subject: [PATCH 02/14] DOC: Update parquet docs regarding Categorical support

Fixes #27955
---
 doc/source/user_guide/io.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 5d7a268631778..93b2ed290f930 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4700,7 +4700,7 @@ Several caveats.
   indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
   force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
 * Index level names, if specified, must be strings.
-* Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
+* Categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as ``object`` dtype.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
 

From fdd2e3f7b027189aff833e7caa4d29cd41eb8418 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Tue, 20 Aug 2019 01:12:50 +0700
Subject: [PATCH 03/14] DOC: Add whatsnew entry for parquet faithful roundtrip
 test for Categorical support

Fixes #27955
---
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 0be4ebc627b30..10807489fff1f 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -88,7 +88,7 @@ Categorical
 ^^^^^^^^^^^
 
 - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`)
--
+- Added test to assert roundtripping to parquet with :func:`to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`)
 -
 
 

From ab9b082afcb6731973ea8985faab2cc7e975e738 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Wed, 21 Aug 2019 00:50:21 +0700
Subject: [PATCH 04/14] DOC: Add categorical column in example DataFrame

Closes #27955
---
 doc/source/user_guide/io.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 93b2ed290f930..9a3bf2fa751f6 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4724,7 +4724,8 @@ See the documentation for `pyarrow <https://arrow.apache.org/docs/python/>`__ an
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.date_range('20130101', periods=3),
-                      'g': pd.date_range('20130101', periods=3, tz='US/Eastern')})
+                      'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                      'h': pd.Categorical(list('abc'))})
 
    df
    df.dtypes

From cfc53ae517c9d99ed825e69b4b5484e4a10fc36d Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Wed, 21 Aug 2019 00:52:46 +0700
Subject: [PATCH 05/14] TST: Add test for null, out-of-order values, and
 unobserved category

Closes #27955
---
 pandas/tests/io/test_parquet.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 938dfa7945fca..21db24d73b0c0 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -453,7 +453,12 @@ def test_unsupported(self, pa):
     def test_categorical(self, pa):
 
         # supported in >= 0.7.0
-        df = pd.DataFrame({"a": pd.Categorical(list("abc"))})
+        df = pd.DataFrame()
+        df["a"] = pd.Categorical(list("abcdef"))
+
+        # test for null, out-of-order values, and unobserved category
+        dtype = pd.CategoricalDtype(["foo", "bar", "baz"])
+        df["b"] = pd.Categorical.from_codes(codes=[1, 0, 0, 1, -1, 1], dtype=dtype)
 
         if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"):
             check_round_trip(df, pa)

From aebfae72e8b0da613e6e61b08084053c2f8f640e Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Wed, 21 Aug 2019 01:49:02 +0700
Subject: [PATCH 06/14] TST: Convert column b in expected to object for pyarrow
 < 0.15

Closes #27955
---
 pandas/tests/io/test_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 21db24d73b0c0..f1dd98ce5d88f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -464,7 +464,7 @@ def test_categorical(self, pa):
             check_round_trip(df, pa)
         else:
             # de-serialized as object for pyarrow < 0.15
-            expected = df.assign(a=df.a.astype(object))
+            expected = df.assign(a=df.a.astype(object), b=df.b.astype(object))
             check_round_trip(df, pa, expected=expected)
 
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):

From 791f1645d4114d6d7aadd09fafc5a7d419151048 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Sun, 25 Aug 2019 10:09:42 +0700
Subject: [PATCH 07/14] TST: Add test for ordered flag

Closes #27955
---
 pandas/tests/io/test_parquet.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index f1dd98ce5d88f..26056ef8da046 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -460,11 +460,18 @@ def test_categorical(self, pa):
         dtype = pd.CategoricalDtype(["foo", "bar", "baz"])
         df["b"] = pd.Categorical.from_codes(codes=[1, 0, 0, 1, -1, 1], dtype=dtype)
 
+        # test for ordered flag
+        df["c"] = pd.Categorical(
+            ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True
+        )
+
         if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"):
             check_round_trip(df, pa)
         else:
             # de-serialized as object for pyarrow < 0.15
-            expected = df.assign(a=df.a.astype(object), b=df.b.astype(object))
+            expected = df.assign(
+                a=df.a.astype(object), b=df.b.astype(object), c=df.c.astype(object)
+            )
             check_round_trip(df, pa, expected=expected)
 
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):

From 6c6b09dc7b0792c9896b8399ddf7a8800e40abd9 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Sun, 25 Aug 2019 11:15:37 +0700
Subject: [PATCH 08/14] TST: Add more rows for ordered flag test

Closes #27955
---
 pandas/tests/io/test_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 26056ef8da046..b6bb0373b93aa 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -462,7 +462,7 @@ def test_categorical(self, pa):
 
         # test for ordered flag
         df["c"] = pd.Categorical(
-            ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True
+            ["a", "b", "c", "a", "c", "b"], categories=["b", "c", "d"], ordered=True
         )
 
         if LooseVersion(pyarrow.__version__) >= LooseVersion("0.15.0"):

From 1b0a444d9e5f1df047c77ebf47a191d161748871 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Tue, 27 Aug 2019 22:45:53 +0700
Subject: [PATCH 09/14] DOC: Change to_parquet to DataFrame.to_parquet

Closes #27955
---
 doc/source/whatsnew/v1.0.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 87edb5b154dd7..a6685da652505 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -88,7 +88,7 @@ Categorical
 ^^^^^^^^^^^
 
 - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`)
-- Added test to assert roundtripping to parquet with :func:`to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`)
+- Added test to assert that roundtripping to parquet with :meth:`DataFrame.to_parquet` and :func:`read_parquet` preserves Categorical dtypes for string types (:issue:`27955`)
 -
 
 

From bf32a802ae8ea38d103e656f61cad7ad3650bc65 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Tue, 27 Aug 2019 22:48:36 +0700
Subject: [PATCH 10/14] TST: Replace from_codes

Closes #27955
---
 pandas/tests/io/test_parquet.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index b6bb0373b93aa..e4e208069a432 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -457,8 +457,10 @@ def test_categorical(self, pa):
         df["a"] = pd.Categorical(list("abcdef"))
 
         # test for null, out-of-order values, and unobserved category
-        dtype = pd.CategoricalDtype(["foo", "bar", "baz"])
-        df["b"] = pd.Categorical.from_codes(codes=[1, 0, 0, 1, -1, 1], dtype=dtype)
+        df["b"] = pd.Categorical(
+            ["bar", "foo", "foo", "bar", None, "bar"],
+            dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
+        )
 
         # test for ordered flag
         df["c"] = pd.Categorical(

From 159b27085bf8c6b8dc3c8936cfa3037a7d6e29ad Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Tue, 27 Aug 2019 22:50:21 +0700
Subject: [PATCH 11/14] TST: Simplify expected conversion to object

Closes #27955
---
 pandas/tests/io/test_parquet.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index e4e208069a432..d4f2a5bb3deaf 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -471,9 +471,7 @@ def test_categorical(self, pa):
             check_round_trip(df, pa)
         else:
             # de-serialized as object for pyarrow < 0.15
-            expected = df.assign(
-                a=df.a.astype(object), b=df.b.astype(object), c=df.c.astype(object)
-            )
+            expected = df.astype(object)
             check_round_trip(df, pa, expected=expected)
 
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):

From 421107133044d9bc7f43a3b65c95550d6a74f05f Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Thu, 29 Aug 2019 17:27:28 +0700
Subject: [PATCH 12/14] DOC: Update doc to clarify the differences between
 pyarrow and fastparquet

Closes #27955
---
 doc/source/user_guide/io.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 09181be93181b..cbad7fa490449 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4702,7 +4702,8 @@ Several caveats.
   indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
   force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
 * Index level names, if specified, must be strings.
-* Categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as ``object`` dtype.
+* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. 
+* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
 

From a7c414d1084d8bf1672c286cba2d4ee7c517e8a4 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Thu, 29 Aug 2019 18:32:56 +0700
Subject: [PATCH 13/14] DOC: Remove whitespace

---
 doc/source/user_guide/io.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index cbad7fa490449..8a7e840db9c72 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4702,7 +4702,7 @@ Several caveats.
   indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
   force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
 * Index level names, if specified, must be strings.
-* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. 
+* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
 * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.

From bd08e1668818a416a73c7dfb3d084659038c8ab9 Mon Sep 17 00:00:00 2001
From: Galuh Sahid <galuh.tunggadewi@gmail.com>
Date: Sat, 5 Oct 2019 00:48:06 +0700
Subject: [PATCH 14/14] DOC: Add ordered categorical

---
 doc/source/user_guide/io.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index fade28f52e673..ee097c1f4d5e8 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -4736,7 +4736,8 @@ See the documentation for `pyarrow <https://arrow.apache.org/docs/python/>`__ an
                       'e': [True, False, True],
                       'f': pd.date_range('20130101', periods=3),
                       'g': pd.date_range('20130101', periods=3, tz='US/Eastern'),
-                      'h': pd.Categorical(list('abc'))})
+                      'h': pd.Categorical(list('abc')),
+                      'i': pd.Categorical(list('abc'), ordered=True)})
 
    df
    df.dtypes