Skip to content

REGR: NA-values in ctors with string dtype #21366

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 8, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ and bug fixes. We recommend that all users upgrade to this version.
:local:
:backlinks: none


.. _whatsnew_0231.fixed_regressions:

Fixed Regressions
Expand All @@ -29,6 +28,7 @@ Fixed Regressions
- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`)
- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`)
- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`)
- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing
values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`)

Expand Down
11 changes: 11 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,14 @@ def tz_aware_fixture(request):
Fixture for trying explicit timezones: {0}
"""
return request.param


@pytest.fixture(params=[str, 'str', 'U'])
def string_dtype(request):
"""Parametrized fixture for string dtypes.

* str
* 'str'
* 'U'
"""
return request.param
42 changes: 42 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values):
result = np.empty(len(values), dtype='object')
result[:] = values
return result


def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
"""
Construct a new ndarray, coercing `values` to `dtype`, preserving NA.

Parameters
----------
values : Sequence
dtype : numpy.dtype, optional
copy : bool, default False
Note that copies may still be made with ``copy=False`` if casting
is required.

Returns
-------
arr : ndarray[dtype]

Examples
--------
>>> np.array([1.0, 2.0, None], dtype='str')
array(['1.0', '2.0', 'None'], dtype='<U4')

>>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str')


"""
subarr = np.array(values, dtype=dtype, copy=copy)

if dtype is not None and dtype.kind in ("U", "S"):
# GH-21083
# We can't just return np.array(subarr, dtype='str') since
# NumPy will convert the non-string objects into strings
# Including NA values. Se we have to go
# string -> object -> update NA, which requires an
# additional pass over the data.
na_values = isna(values)
subarr2 = subarr.astype(object)
subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
subarr = subarr2

return subarr
4 changes: 3 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
maybe_convert_platform,
maybe_cast_to_datetime, maybe_castable,
construct_1d_arraylike_from_scalar,
construct_1d_ndarray_preserving_na,
construct_1d_object_array_from_listlike)
from pandas.core.dtypes.missing import (
isna,
Expand Down Expand Up @@ -4074,7 +4075,8 @@ def _try_cast(arr, take_fast_path):
isinstance(subarr, np.ndarray))):
subarr = construct_1d_object_array_from_listlike(subarr)
elif not is_extension_type(subarr):
subarr = np.array(subarr, dtype=dtype, copy=copy)
subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback refactored to this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k cool

copy=copy)
except (ValueError, TypeError):
if is_categorical_dtype(dtype):
# We *do* allow casting to categorical, since we know
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/dtypes/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
maybe_convert_scalar,
find_common_type,
construct_1d_object_array_from_listlike,
construct_1d_ndarray_preserving_na,
construct_1d_arraylike_from_scalar)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
Expand Down Expand Up @@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self):
tm.assert_categorical_equal(result, expected,
check_category_order=True,
check_dtype=True)


@pytest.mark.parametrize('values, dtype, expected', [
([1, 2, 3], None, np.array([1, 2, 3])),
(np.array([1, 2, 3]), None, np.array([1, 2, 3])),
(['1', '2', None], None, np.array(['1', '2', None])),
(['1', '2', None], np.dtype('str'), np.array(['1', '2', None])),
([1, 2, None], np.dtype('str'), np.array(['1', '2', None])),
])
def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
result = construct_1d_ndarray_preserving_na(values, dtype=dtype)
tm.assert_numpy_array_equal(result, expected)
11 changes: 11 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self):
assert a.dtype == df.a.dtype
assert b.dtype == df.b.dtype

def test_constructor_dtype_str_na_values(self, string_dtype):
# https://github.com/pandas-dev/pandas/issues/21083
df = DataFrame({'A': ['x', None]}, dtype=string_dtype)
result = df.isna()
expected = DataFrame({"A": [False, True]})
tm.assert_frame_equal(result, expected)
assert df.iloc[1, 0] is None

df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
assert np.isnan(df.iloc[1, 0])

def test_constructor_rec(self):
rec = self.frame.to_records(index=False)

Expand Down
16 changes: 10 additions & 6 deletions pandas/tests/frame/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self):

@pytest.mark.parametrize('input_vals', [
([1, 2]),
([1.0, 2.0, np.nan]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
def test_constructor_list_str(self, input_vals):
def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements are converted to strings when
# dtype is str, 'str', or 'U'

for dtype in ['str', str, 'U']:
result = DataFrame({'A': input_vals}, dtype=dtype)
expected = DataFrame({'A': input_vals}).astype({'A': dtype})
assert_frame_equal(result, expected)
result = DataFrame({'A': input_vals}, dtype=string_dtype)
expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
assert_frame_equal(result, expected)

def test_constructor_list_str_na(self, string_dtype):

result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
assert_frame_equal(result, expected)


class TestDataFrameDatetimeWithTZ(TestData):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1829,7 +1829,7 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3):

data = ['foo', 'bar', 'bar', np.nan, np.nan, np.nan]

s = Series(data, dtype=str)
s = Series(data, dtype=object).astype(str)
result = s.mode(dropna)
expected3 = Series(expected3, dtype=str)
tm.assert_series_equal(result, expected3)
Expand Down
26 changes: 20 additions & 6 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self):
result = pd.Series(index=['b', 'a', 'c'])
assert result.index.tolist() == ['b', 'a', 'c']

def test_constructor_dtype_str_na_values(self, string_dtype):
# https://github.com/pandas-dev/pandas/issues/21083
ser = Series(['x', None], dtype=string_dtype)
result = ser.isna()
expected = Series([False, True])
tm.assert_series_equal(result, expected)
assert ser.iloc[1] is None

ser = Series(['x', np.nan], dtype=string_dtype)
assert np.isnan(ser.iloc[1])

def test_constructor_series(self):
index1 = ['d', 'b', 'a', 'c']
index2 = sorted(index1)
Expand Down Expand Up @@ -164,22 +175,25 @@ def test_constructor_list_like(self):

@pytest.mark.parametrize('input_vals', [
([1, 2]),
([1.0, 2.0, np.nan]),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to remove this case, as it isn't correct. We aren't consistent between Series(... dtype=str) and .astype(str), which is maybe unfortunate...

In [4]: pd.Series([1, 2, None], dtype='str').tolist()
Out[4]: ['1', '2', None]

In [5]: pd.Series([1, 2, None], dtype='str').astype('str').tolist()
Out[5]: ['1', '2', 'None']

(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
def test_constructor_list_str(self, input_vals):
def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements from a list are converted to strings
# when dtype is str, 'str', or 'U'
result = Series(input_vals, dtype=string_dtype)
expected = Series(input_vals).astype(string_dtype)
assert_series_equal(result, expected)

for dtype in ['str', str, 'U']:
result = Series(input_vals, dtype=dtype)
expected = Series(input_vals).astype(dtype)
assert_series_equal(result, expected)
def test_constructor_list_str_na(self, string_dtype):
result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
expected = Series(['1.0', '2.0', np.nan], dtype=object)
assert_series_equal(result, expected)
assert np.isnan(result[2])

def test_constructor_generator(self):
gen = (i for i in range(10))
Expand Down