Skip to content

Commit c53d4e0

Browse files
itholicvpolet
authored and
vpolet
committed
[SPARK-43568][SPARK-43633][PS] Support Categorical APIs for pandas 2
### What changes were proposed in this pull request?

This PR proposes to support `Categorical` APIs for [pandas 2](https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html), and match the behavior.

### Why are the changes needed?

To support pandas API on Spark with pandas 2.0.0 and above.

### Does this PR introduce _any_ user-facing change?

The behavior is matched with pandas 2.0.0 and above.

e.g.

```diff
>>> psser
0    1
1    2
2    3
3    1
4    2
5    3
Name: a, dtype: category
Categories (3, int64): [1, 2, 3]

>>> psser.cat.remove_categories([1, 2, 3])
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
Name: a, dtype: category
- Categories (0, object): []
+ Categories (0, int64): []
```

### How was this patch tested?

Enabling the existing tests.

Closes apache#42273 from itholic/pandas_categorical.

Authored-by: itholic <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
1 parent a6cbe6c commit c53d4e0

File tree

3 files changed

+24
-81
lines changed

3 files changed

+24
-81
lines changed

python/docs/source/migration_guide/pyspark_upgrade.rst

+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ Upgrading from PySpark 3.5 to 4.0
3030
* In Spark 4.0, ``DataFrame.mad`` has been removed from pandas API on Spark.
3131
* In Spark 4.0, ``Series.mad`` has been removed from pandas API on Spark.
3232
* In Spark 4.0, ``na_sentinel`` parameter from ``Index.factorize`` and ``Series.factorize`` has been removed from pandas API on Spark, use ``use_na_sentinel`` instead.
33+
* In Spark 4.0, ``inplace`` parameter from ``Categorical.add_categories``, ``Categorical.remove_categories``, ``Categorical.set_categories``, ``Categorical.rename_categories``, ``Categorical.reorder_categories``, ``Categorical.as_ordered``, ``Categorical.as_unordered`` has been removed from pandas API on Spark.
3334

3435

3536
Upgrading from PySpark 3.3 to 3.4

python/pyspark/pandas/categorical.py

+21-45
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# limitations under the License.
1616
#
1717
from typing import Any, Callable, List, Optional, Union, TYPE_CHECKING, cast
18-
import warnings
1918

2019
import pandas as pd
2120
from pandas.api.types import ( # type: ignore[attr-defined]
@@ -250,14 +249,11 @@ def add_categories(self, new_categories: Union[pd.Index, Any, List]) -> Optional
250249
)
251250
return DataFrame(internal)._psser_for(self._data._column_label).copy()
252251

253-
def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]:
252+
def _set_ordered(self, *, ordered: bool) -> Optional["ps.Series"]:
254253
from pyspark.pandas.frame import DataFrame
255254

256255
if self.ordered == ordered:
257-
if inplace:
258-
return None
259-
else:
260-
return self._data.copy()
256+
return self._data.copy()
261257
else:
262258
internal = self._data._psdf._internal.with_new_spark_column(
263259
self._data._column_label,
@@ -266,24 +262,12 @@ def _set_ordered(self, *, ordered: bool, inplace: bool) -> Optional["ps.Series"]
266262
dtype=CategoricalDtype(categories=self.categories, ordered=ordered)
267263
),
268264
)
269-
if inplace:
270-
self._data._psdf._update_internal_frame(internal)
271-
return None
272-
else:
273-
return DataFrame(internal)._psser_for(self._data._column_label).copy()
265+
return DataFrame(internal)._psser_for(self._data._column_label).copy()
274266

275-
def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
267+
def as_ordered(self) -> Optional["ps.Series"]:
276268
"""
277269
Set the Categorical to be ordered.
278270
279-
Parameters
280-
----------
281-
inplace : bool, default False
282-
Whether or not to set the ordered attribute in-place or return
283-
a copy of this categorical with ordered set to True.
284-
285-
.. deprecated:: 3.4.0
286-
287271
Returns
288272
-------
289273
Series or None
@@ -312,26 +296,12 @@ def as_ordered(self, inplace: bool = False) -> Optional["ps.Series"]:
312296
dtype: category
313297
Categories (3, object): ['a' < 'b' < 'c']
314298
"""
315-
if inplace:
316-
warnings.warn(
317-
"The `inplace` parameter in as_ordered is deprecated "
318-
"and will be removed in a future version.",
319-
FutureWarning,
320-
)
321-
return self._set_ordered(ordered=True, inplace=inplace)
299+
return self._set_ordered(ordered=True)
322300

323-
def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
301+
def as_unordered(self) -> Optional["ps.Series"]:
324302
"""
325303
Set the Categorical to be unordered.
326304
327-
Parameters
328-
----------
329-
inplace : bool, default False
330-
Whether or not to set the ordered attribute in-place or return
331-
a copy of this categorical with ordered set to False.
332-
333-
.. deprecated:: 3.4.0
334-
335305
Returns
336306
-------
337307
Series or None
@@ -360,13 +330,7 @@ def as_unordered(self, inplace: bool = False) -> Optional["ps.Series"]:
360330
dtype: category
361331
Categories (3, object): ['a', 'b', 'c']
362332
"""
363-
if inplace:
364-
warnings.warn(
365-
"The `inplace` parameter in as_unordered is deprecated "
366-
"and will be removed in a future version.",
367-
FutureWarning,
368-
)
369-
return self._set_ordered(ordered=False, inplace=inplace)
333+
return self._set_ordered(ordered=False)
370334

371335
def remove_categories(self, removals: Union[pd.Index, Any, List]) -> Optional["ps.Series"]:
372336
"""
@@ -441,8 +405,13 @@ def remove_categories(self, removals: Union[pd.Index, Any, List]) -> Optional["p
441405
if len(categories) == 0:
442406
return self._data.copy()
443407
else:
408+
data = [cat for cat in self.categories.sort_values() if cat not in categories]
409+
if len(data) == 0:
410+
# We should keep the original dtype even when removing all categories.
411+
data = pd.Index(data, dtype=self.categories.dtype) # type: ignore[assignment]
444412
dtype = CategoricalDtype(
445-
[cat for cat in self.categories if cat not in categories], ordered=self.ordered
413+
categories=data,
414+
ordered=self.ordered,
446415
)
447416
return self._data.astype(dtype)
448417

@@ -488,7 +457,14 @@ def remove_unused_categories(self) -> Optional["ps.Series"]:
488457
"""
489458
categories = set(self._data.drop_duplicates()._to_pandas())
490459
removals = [cat for cat in self.categories if cat not in categories]
491-
return self.remove_categories(removals=removals)
460+
categories = [cat for cat in removals if cat is not None] # type: ignore[assignment]
461+
if len(categories) == 0:
462+
return self._data.copy()
463+
else:
464+
dtype = CategoricalDtype(
465+
[cat for cat in self.categories if cat not in categories], ordered=self.ordered
466+
)
467+
return self._data.astype(dtype)
492468

493469
def rename_categories(
494470
self, new_categories: Union[list, dict, Callable]

python/pyspark/pandas/tests/test_categorical.py

+2-36
Original file line numberDiff line numberDiff line change
@@ -65,21 +65,14 @@ def test_categorical_series(self):
6565
with self.assertRaisesRegex(ValueError, "Cannot call CategoricalAccessor on type int64"):
6666
ps.Series([1, 2, 3]).cat
6767

68-
@unittest.skipIf(
69-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
70-
"TODO(SPARK-43566): Enable CategoricalTests.test_categories_setter for pandas 2.0.0.",
71-
)
7268
def test_categories_setter(self):
7369
pdf, psdf = self.df_pair
7470

7571
pser = pdf.a
7672
psser = psdf.a
7773

78-
pser.cat.categories = ["z", "y", "x"]
79-
psser.cat.categories = ["z", "y", "x"]
80-
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
81-
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
82-
pser = pser.astype(CategoricalDtype(categories=["x", "y", "z"]))
74+
pser = pser.cat.rename_categories(["z", "y", "x"])
75+
psser = psser.cat.rename_categories(["z", "y", "x"])
8376

8477
self.assert_eq(pser, psser)
8578
self.assert_eq(pdf, psdf)
@@ -103,10 +96,6 @@ def test_add_categories(self):
10396
self.assertRaises(ValueError, lambda: psser.cat.add_categories(4))
10497
self.assertRaises(ValueError, lambda: psser.cat.add_categories([5, 5]))
10598

106-
@unittest.skipIf(
107-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
108-
"TODO(SPARK-43605): Enable CategoricalTests.test_remove_categories for pandas 2.0.0.",
109-
)
11099
def test_remove_categories(self):
111100
pdf, psdf = self.df_pair
112101

@@ -168,10 +157,6 @@ def test_reorder_categories(self):
168157
self.assertRaises(TypeError, lambda: psser.cat.reorder_categories(1))
169158
self.assertRaises(TypeError, lambda: psdf.b.cat.reorder_categories("abcd"))
170159

171-
@unittest.skipIf(
172-
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
173-
"TODO(SPARK-43565): Enable CategoricalTests.test_as_ordered_unordered for pandas 2.0.0.",
174-
)
175160
def test_as_ordered_unordered(self):
176161
pdf, psdf = self.df_pair
177162

@@ -181,28 +166,9 @@ def test_as_ordered_unordered(self):
181166
# as_ordered
182167
self.assert_eq(pser.cat.as_ordered(), psser.cat.as_ordered())
183168

184-
pser.cat.as_ordered(inplace=True)
185-
psser.cat.as_ordered(inplace=True)
186-
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
187-
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
188-
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=True))
189-
190-
self.assert_eq(pser, psser)
191-
self.assert_eq(pdf, psdf)
192-
193169
# as_unordered
194170
self.assert_eq(pser.cat.as_unordered(), psser.cat.as_unordered())
195171

196-
pser.cat.as_unordered(inplace=True)
197-
psser.cat.as_unordered(inplace=True)
198-
if LooseVersion(pd.__version__) >= LooseVersion("1.3"):
199-
# Bug in pandas 1.3. dtype is not updated properly with `inplace` argument.
200-
pser = pser.astype(CategoricalDtype(categories=[1, 2, 3], ordered=False))
201-
pdf.a = pser
202-
203-
self.assert_eq(pser, psser)
204-
self.assert_eq(pdf, psdf)
205-
206172
def test_astype(self):
207173
pser = pd.Series(["a", "b", "c"])
208174
psser = ps.from_pandas(pser)

0 commit comments

Comments
 (0)