Skip to content

Commit 7563d06

Browse files
jbrockmendel authored and JulianWgs committed
ENH: Categorical.empty (pandas-dev#40602)
1 parent 8e0ca2d commit 7563d06

File tree

8 files changed

+164
-1
lines changed

8 files changed

+164
-1
lines changed

pandas/core/arrays/_mixins.py

+23
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
F,
1616
PositionalIndexer2D,
1717
Shape,
18+
type_t,
1819
)
1920
from pandas.compat.numpy import function as nv
2021
from pandas.errors import AbstractMethodError
@@ -28,6 +29,7 @@
2829
)
2930

3031
from pandas.core.dtypes.common import is_dtype_equal
32+
from pandas.core.dtypes.dtypes import ExtensionDtype
3133
from pandas.core.dtypes.missing import array_equivalent
3234

3335
from pandas.core import missing
@@ -465,3 +467,24 @@ def value_counts(self, dropna: bool = True):
465467
index_arr = self._from_backing_data(np.asarray(result.index._data))
466468
index = Index(index_arr, name=result.index.name)
467469
return Series(result._values, index=index, name=result.name)
470+
471+
# ------------------------------------------------------------------------
472+
# numpy-like methods
473+
474+
@classmethod
def _empty(
    cls: type_t[NDArrayBackedExtensionArrayT], shape: Shape, dtype: ExtensionDtype
) -> NDArrayBackedExtensionArrayT:
    """
    Build an uninitialized array, in the spirit of np.empty(shape, dtype=dtype).

    Parameters
    ----------
    shape : tuple[int]
        Shape of the array to allocate.
    dtype : ExtensionDtype
        Dtype the returned array should carry.

    Returns
    -------
    NDArrayBackedExtensionArrayT
        New array wrapping a freshly allocated ``np.empty`` buffer; its
        contents are whatever ``np.empty`` happened to return.
    """
    # Naive way of discovering the backing ndarray dtype: construct a
    # zero-length instance and read the dtype off of its ``_ndarray``.
    template = cls._from_sequence([], dtype=dtype)
    backing_dtype = template._ndarray.dtype
    return template._from_backing_data(np.empty(shape, dtype=backing_dtype))

pandas/core/arrays/base.py

+15
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,21 @@ def delete(self: ExtensionArrayT, loc) -> ExtensionArrayT:
13041304
indexer = np.delete(np.arange(len(self)), loc)
13051305
return self.take(indexer)
13061306

1307+
@classmethod
def _empty(cls, shape: Shape, dtype: ExtensionDtype):
    """
    Create an ExtensionArray with the given shape and dtype.

    The generic strategy is to build a zero-length array of ``dtype`` and
    then ``take`` from it with an indexer made up entirely of -1 and
    ``allow_fill=True``.

    Raises
    ------
    NotImplementedError
        If the array produced this way is not an instance of ``cls`` or
        does not carry the requested ``dtype``.
    """
    seed = cls._from_sequence([], dtype=dtype)

    # A read-only view of a single -1 broadcast to ``shape``; no per-element
    # index buffer is materialized.
    fill_indexer = np.broadcast_to(np.intp(-1), shape)
    result = seed.take(fill_indexer, allow_fill=True)

    wrong_type = not isinstance(result, cls)
    if wrong_type or dtype != result.dtype:
        raise NotImplementedError(
            f"Default 'empty' implementation is invalid for dtype='{dtype}'"
        )
    return result
1321+
13071322

13081323
class ExtensionOpsMixin:
13091324
"""

pandas/core/arrays/categorical.py

+26
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
NpDtype,
3131
Ordered,
3232
Scalar,
33+
Shape,
34+
type_t,
3335
)
3436
from pandas.compat.numpy import function as nv
3537
from pandas.util._decorators import (
@@ -1525,6 +1527,30 @@ def value_counts(self, dropna: bool = True):
15251527

15261528
return Series(count, index=CategoricalIndex(ix), dtype="int64")
15271529

1530+
# error: Argument 2 of "_empty" is incompatible with supertype
1531+
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
1532+
# "ExtensionDtype"
1533+
@classmethod
def _empty(  # type: ignore[override]
    cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
) -> Categorical:
    """
    Analogous to np.empty(shape, dtype=dtype)

    Unlike ``np.empty``, the backing codes are always initialized to a
    value that is valid for ``dtype``.

    Parameters
    ----------
    shape : tuple[int]
    dtype : CategoricalDtype

    Returns
    -------
    Categorical
        Array of the requested shape. Every code is 0 when the dtype has at
        least one category, and -1 (missing) when it has none.
    """
    arr = cls._from_sequence([], dtype=dtype)

    # We cannot use np.empty here: the resulting ndarray may contain codes
    # not supported by this dtype, in which case repr(result) could segfault.
    # For the same reason 0 is only usable when at least one category exists;
    # valid codes lie in [-1, len(categories) - 1], so with no categories the
    # only valid code is -1.
    fill_code = 0 if len(arr.categories) > 0 else -1
    backing = np.full(shape, fill_code, dtype=arr._ndarray.dtype)

    return arr._from_backing_data(backing)
1553+
15281554
def _internal_get_values(self):
15291555
"""
15301556
Return the values.

pandas/core/arrays/string_.py

+6
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,12 @@ def _from_sequence_of_strings(
254254
):
255255
return cls._from_sequence(strings, dtype=dtype, copy=copy)
256256

257+
@classmethod
def _empty(cls, shape, dtype) -> StringArray:
    """
    Create a StringArray of the given shape with every element set to NA.

    Parameters
    ----------
    shape : tuple[int]
    dtype : dtype passed through to ``astype`` on the constructed array
    """
    na_filled = np.empty(shape, dtype=object)
    # Overwrite the uninitialized object slots before wrapping the buffer.
    na_filled[:] = libmissing.NA
    string_arr = cls(na_filled)
    return string_arr.astype(dtype, copy=False)
262+
257263
def __arrow_array__(self, type=None):
258264
"""
259265
Convert myself into a pyarrow Array.
+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""
2+
Tests for subclasses of NDArrayBackedExtensionArray
3+
"""
4+
import numpy as np
5+
6+
from pandas import (
7+
CategoricalIndex,
8+
date_range,
9+
)
10+
from pandas.core.arrays import (
11+
Categorical,
12+
DatetimeArray,
13+
PandasArray,
14+
TimedeltaArray,
15+
)
16+
17+
18+
class TestEmpty:
    """Checks of ``_empty`` on the ndarray-backed array classes."""

    def test_empty_categorical(self):
        three_cat_dtype = CategoricalIndex(["a", "b", "c"], ordered=True).dtype
        short_shape = (4,)

        # case with int8 codes
        cat = Categorical._empty(short_shape, dtype=three_cat_dtype)
        assert isinstance(cat, Categorical)
        assert cat.shape == short_shape
        assert cat._ndarray.dtype == np.int8

        # case where repr would segfault if we didn't override base implementation
        cat = Categorical._empty((4096,), dtype=three_cat_dtype)
        assert isinstance(cat, Categorical)
        assert cat.shape == (4096,)
        assert cat._ndarray.dtype == np.int8
        repr(cat)

        # case with int16 codes
        many_cat_dtype = CategoricalIndex(list(range(512)) * 4, ordered=False).dtype
        cat = Categorical._empty(short_shape, dtype=many_cat_dtype)
        assert isinstance(cat, Categorical)
        assert cat.shape == short_shape
        assert cat._ndarray.dtype == np.int16

    def test_empty_dt64tz(self):
        tz_dtype = date_range("2016-01-01", periods=2, tz="Asia/Tokyo").dtype
        zero_len = (0,)

        dta = DatetimeArray._empty(zero_len, dtype=tz_dtype)
        assert dta.dtype == tz_dtype
        assert isinstance(dta, DatetimeArray)
        assert dta.shape == zero_len

    def test_empty_dt64(self):
        two_dim = (3, 9)
        dta = DatetimeArray._empty(two_dim, dtype="datetime64[ns]")
        assert isinstance(dta, DatetimeArray)
        assert dta.shape == two_dim

    def test_empty_td64(self):
        two_dim = (3, 9)
        tda = TimedeltaArray._empty(two_dim, dtype="m8[ns]")
        assert isinstance(tda, TimedeltaArray)
        assert tda.shape == two_dim

    def test_empty_pandas_array(self):
        # Borrow the dtype from a small concrete PandasArray.
        pa_dtype = PandasArray(np.array([1, 2])).dtype
        two_dim = (3, 9)

        parr = PandasArray._empty(two_dim, dtype=pa_dtype)
        assert isinstance(parr, PandasArray)
        assert parr.dtype == pa_dtype
        assert parr.shape == two_dim

pandas/tests/extension/arrow/test_bool.py

+4
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
8282
def test_construct_empty_dataframe(self, dtype):
8383
super().test_construct_empty_dataframe(dtype)
8484

85+
@pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
def test_empty(self, dtype):
    # Defer to the shared base-class test; it is marked as an expected
    # failure for this array type for the reason given in the xfail marker.
    super().test_empty(dtype)
88+
8589

8690
class TestReduce(base.BaseNoReduceTests):
8791
def test_reduce_series_boolean(self):

pandas/tests/extension/base/constructors.py

+7
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,10 @@ def test_construct_empty_dataframe(self, dtype):
122122
{"a": pd.array([], dtype=dtype)}, index=pd.Index([], dtype="object")
123123
)
124124
self.assert_frame_equal(result, expected)
125+
126+
def test_empty(self, dtype):
    """``_empty`` returns the dtype's own array type carrying that dtype."""
    array_type = dtype.construct_array_type()
    empty_arr = array_type._empty((4,), dtype=dtype)

    assert isinstance(empty_arr, array_type)
    assert empty_arr.dtype == dtype

pandas/tests/extension/test_categorical.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,14 @@ def test_contains(self, data, data_missing):
117117

118118

119119
class TestConstructors(base.BaseConstructorsTests):
120-
pass
120+
def test_empty(self, dtype):
    """``_empty`` returns a Categorical; its dtype is compared to an empty one."""
    array_type = dtype.construct_array_type()
    empty_arr = array_type._empty((4,), dtype=dtype)

    assert isinstance(empty_arr, array_type)
    # The dtype handed in is not initialized, so the result's dtype will not
    # match it; compare against a dtype with an empty category list instead.
    assert empty_arr.dtype == CategoricalDtype([])
121128

122129

123130
class TestReshaping(base.BaseReshapingTests):

0 commit comments

Comments
 (0)