Skip to content

Commit 636dd01

Browse files
TomAugspurger authored and
jreback committed
REGR: NA-values in ctors with string dtype (#21366)
1 parent 93be27d commit 636dd01

File tree

9 files changed

+112
-15
lines changed

9 files changed

+112
-15
lines changed

doc/source/whatsnew/v0.23.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ and bug fixes. We recommend that all users upgrade to this version.
1010
:local:
1111
:backlinks: none
1212

13-
1413
.. _whatsnew_0231.fixed_regressions:
1514

1615
Fixed Regressions
@@ -29,6 +28,7 @@ Fixed Regressions
2928
- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
3029
- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`)
3130
- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`)
31+
- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`)
3232
- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing
3333
values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`)
3434

pandas/conftest.py

+11
Original file line numberDiff line numberDiff line change
@@ -159,3 +159,14 @@ def tz_aware_fixture(request):
159159
Fixture for trying explicit timezones: {0}
160160
"""
161161
return request.param
162+
163+
164+
@pytest.fixture(params=[str, 'str', 'U'])
def string_dtype(request):
    """Yield each supported spelling of the string dtype, one per test run.

    The fixture is parametrized over:

    * str
    * 'str'
    * 'U'
    """
    spelling = request.param
    return spelling

pandas/core/dtypes/cast.py

+42
Original file line numberDiff line numberDiff line change
@@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values):
12271227
result = np.empty(len(values), dtype='object')
12281228
result[:] = values
12291229
return result
1230+
1231+
1232+
def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
    """
    Construct a new ndarray, coercing `values` to `dtype`, preserving NA.

    Parameters
    ----------
    values : Sequence
    dtype : numpy.dtype or any dtype specifier accepted by ``np.dtype``, optional
        For example ``np.dtype('str')``, ``str``, ``'str'`` or ``'U'``.
    copy : bool, default False
        Note that copies may still be made with ``copy=False`` if casting
        is required.

    Returns
    -------
    arr : ndarray[dtype]
        When `dtype` is a string kind ("U" or "S") the result has object
        dtype, so that the original NA values can be stored unchanged.

    Examples
    --------
    >>> np.array([1.0, 2.0, None], dtype='str')
    array(['1.0', '2.0', 'None'], dtype='<U4')

    >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str')
    array(['1.0', '2.0', None], dtype=object)
    """
    if dtype is not None:
        # Normalize aliases such as ``str`` / 'str' / 'U' so that ``.kind``
        # is available below. np.array performs the same conversion
        # internally, so invalid specifiers still raise TypeError.
        dtype = np.dtype(dtype)

    if copy:
        subarr = np.array(values, dtype=dtype, copy=True)
    else:
        # np.asarray copies only when required. Passing ``copy=False`` to
        # np.array raises on NumPy >= 2 whenever a copy cannot be avoided.
        subarr = np.asarray(values, dtype=dtype)

    if dtype is not None and dtype.kind in ("U", "S"):
        # GH-21083
        # We can't just return np.array(subarr, dtype='str') since
        # NumPy will convert the non-string objects into strings,
        # including NA values. So we have to go
        # string -> object -> update NA, which requires an
        # additional pass over the data.
        na_values = isna(values)
        subarr2 = subarr.astype(object)
        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
        subarr = subarr2

    return subarr

pandas/core/series.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
maybe_convert_platform,
4141
maybe_cast_to_datetime, maybe_castable,
4242
construct_1d_arraylike_from_scalar,
43+
construct_1d_ndarray_preserving_na,
4344
construct_1d_object_array_from_listlike)
4445
from pandas.core.dtypes.missing import (
4546
isna,
@@ -4074,7 +4075,8 @@ def _try_cast(arr, take_fast_path):
40744075
isinstance(subarr, np.ndarray))):
40754076
subarr = construct_1d_object_array_from_listlike(subarr)
40764077
elif not is_extension_type(subarr):
4077-
subarr = np.array(subarr, dtype=dtype, copy=copy)
4078+
subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
4079+
copy=copy)
40784080
except (ValueError, TypeError):
40794081
if is_categorical_dtype(dtype):
40804082
# We *do* allow casting to categorical, since we know

pandas/tests/dtypes/test_cast.py

+13
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
maybe_convert_scalar,
2424
find_common_type,
2525
construct_1d_object_array_from_listlike,
26+
construct_1d_ndarray_preserving_na,
2627
construct_1d_arraylike_from_scalar)
2728
from pandas.core.dtypes.dtypes import (
2829
CategoricalDtype,
@@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self):
440441
tm.assert_categorical_equal(result, expected,
441442
check_category_order=True,
442443
check_dtype=True)
444+
445+
446+
@pytest.mark.parametrize('values, dtype, expected', [
    ([1, 2, 3], None, np.array([1, 2, 3])),
    (np.array([1, 2, 3]), None, np.array([1, 2, 3])),
    (['1', '2', None], None, np.array(['1', '2', None])),
    (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])),
    ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])),
])
def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
    """Check the constructed array equals `expected` for each input/dtype pair."""
    actual = construct_1d_ndarray_preserving_na(values, dtype=dtype)
    tm.assert_numpy_array_equal(actual, expected)

pandas/tests/frame/test_constructors.py

+11
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self):
151151
assert a.dtype == df.a.dtype
152152
assert b.dtype == df.b.dtype
153153

154+
def test_constructor_dtype_str_na_values(self, string_dtype):
    """NA entries passed to DataFrame with a string dtype stay NA."""
    # https://github.com/pandas-dev/pandas/issues/21083
    frame_with_none = DataFrame({'A': ['x', None]}, dtype=string_dtype)
    expected_mask = DataFrame({"A": [False, True]})
    tm.assert_frame_equal(frame_with_none.isna(), expected_mask)
    # The original None object is kept rather than the string 'None'.
    assert frame_with_none.iloc[1, 0] is None

    frame_with_nan = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
    assert np.isnan(frame_with_nan.iloc[1, 0])
164+
154165
def test_constructor_rec(self):
155166
rec = self.frame.to_records(index=False)
156167

pandas/tests/frame/test_dtypes.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self):
794794

795795
@pytest.mark.parametrize('input_vals', [
796796
([1, 2]),
797-
([1.0, 2.0, np.nan]),
798797
(['1', '2']),
799798
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
800799
(list(pd.date_range('1/1/2011', periods=2, freq='H',
801800
tz='US/Eastern'))),
802801
([pd.Interval(left=0, right=5)]),
803802
])
804-
def test_constructor_list_str(self, input_vals):
803+
def test_constructor_list_str(self, input_vals, string_dtype):
805804
# GH 16605
806805
# Ensure that data elements are converted to strings when
807806
# dtype is str, 'str', or 'U'
808807

809-
for dtype in ['str', str, 'U']:
810-
result = DataFrame({'A': input_vals}, dtype=dtype)
811-
expected = DataFrame({'A': input_vals}).astype({'A': dtype})
812-
assert_frame_equal(result, expected)
808+
result = DataFrame({'A': input_vals}, dtype=string_dtype)
809+
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
810+
assert_frame_equal(result, expected)
811+
812+
def test_constructor_list_str_na(self, string_dtype):
    """Floats become strings under a string dtype while None is kept."""
    expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
    result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
    assert_frame_equal(result, expected)
813817

814818

815819
class TestDataFrameDatetimeWithTZ(TestData):

pandas/tests/series/test_analytics.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1829,7 +1829,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):
18291829

18301830
data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]
18311831

1832-
s = Series(data, dtype=str)
1832+
s = Series(data, dtype=object).astype(str)
18331833
result = s.mode(dropna)
18341834
expected3 = Series(expected3, dtype=str)
18351835
tm.assert_series_equal(result, expected3)

pandas/tests/series/test_constructors.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self):
137137
result = pd.Series(index=['b', 'a', 'c'])
138138
assert result.index.tolist() == ['b', 'a', 'c']
139139

140+
def test_constructor_dtype_str_na_values(self, string_dtype):
    """NA entries passed to Series with a string dtype stay NA."""
    # https://github.com/pandas-dev/pandas/issues/21083
    ser_with_none = Series(['x', None], dtype=string_dtype)
    expected_mask = Series([False, True])
    tm.assert_series_equal(ser_with_none.isna(), expected_mask)
    # The original None object is kept rather than the string 'None'.
    assert ser_with_none.iloc[1] is None

    ser_with_nan = Series(['x', np.nan], dtype=string_dtype)
    assert np.isnan(ser_with_nan.iloc[1])
150+
140151
def test_constructor_series(self):
141152
index1 = ['d', 'b', 'a', 'c']
142153
index2 = sorted(index1)
@@ -164,22 +175,25 @@ def test_constructor_list_like(self):
164175

165176
@pytest.mark.parametrize('input_vals', [
166177
([1, 2]),
167-
([1.0, 2.0, np.nan]),
168178
(['1', '2']),
169179
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
170180
(list(pd.date_range('1/1/2011', periods=2, freq='H',
171181
tz='US/Eastern'))),
172182
([pd.Interval(left=0, right=5)]),
173183
])
174-
def test_constructor_list_str(self, input_vals):
184+
def test_constructor_list_str(self, input_vals, string_dtype):
175185
# GH 16605
176186
# Ensure that data elements from a list are converted to strings
177187
# when dtype is str, 'str', or 'U'
188+
result = Series(input_vals, dtype=string_dtype)
189+
expected = Series(input_vals).astype(string_dtype)
190+
assert_series_equal(result, expected)
178191

179-
for dtype in ['str', str, 'U']:
180-
result = Series(input_vals, dtype=dtype)
181-
expected = Series(input_vals).astype(dtype)
182-
assert_series_equal(result, expected)
192+
def test_constructor_list_str_na(self, string_dtype):
    """Floats become strings under a string dtype while NaN is kept."""
    expected = Series(['1.0', '2.0', np.nan], dtype=object)
    result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
    assert_series_equal(result, expected)
    # The last element must still be a real NaN, not the string 'nan'.
    assert np.isnan(result[2])
183197

184198
def test_constructor_generator(self):
185199
gen = (i for i in range(10))

0 commit comments

Comments
 (0)