From 460e93a9013143e8c581952b5669bd75242580ac Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 22 Apr 2020 16:32:09 -0700 Subject: [PATCH 1/3] REF: implement test_astype files --- .../tests/indexes/categorical/test_astype.py | 66 +++++++++++++++ .../indexes/categorical/test_category.py | 61 +------------- pandas/tests/indexes/datetimes/test_astype.py | 44 ++++------ pandas/tests/indexes/numeric/test_astype.py | 81 +++++++++++++++++++ pandas/tests/indexes/test_numeric.py | 70 ---------------- pandas/tests/series/methods/test_astype.py | 25 ++++++ 6 files changed, 188 insertions(+), 159 deletions(-) create mode 100644 pandas/tests/indexes/categorical/test_astype.py create mode 100644 pandas/tests/indexes/numeric/test_astype.py create mode 100644 pandas/tests/series/methods/test_astype.py diff --git a/pandas/tests/indexes/categorical/test_astype.py b/pandas/tests/indexes/categorical/test_astype.py new file mode 100644 index 0000000000000..a4a0cb1978325 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_astype.py @@ -0,0 +1,66 @@ +import numpy as np +import pytest + +from pandas import Categorical, CategoricalDtype, CategoricalIndex, Index, IntervalIndex +import pandas._testing as tm + + +class TestAstype: + def test_astype(self): + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = ci.astype(object) + tm.assert_index_equal(result, Index(np.array(ci))) + + # this IS equal, but not the same class + assert result.equals(ci) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + # interval + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") + + ci = CategoricalIndex( + Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) + ) + + result = ci.astype("interval") + expected = ii.take([0, 1, -1]) + tm.assert_index_equal(result, expected) + + result = IntervalIndex(result.values) + tm.assert_index_equal(result, expected) + + 
@pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("dtype_ordered", [True, False]) + @pytest.mark.parametrize("index_ordered", [True, False]) + def test_astype_category(self, name, dtype_ordered, index_ordered): + # GH#18630 + index = CategoricalIndex( + list("aabbca"), categories=list("cab"), ordered=index_ordered + ) + if name: + index = index.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex( + index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered, + ) + tm.assert_index_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) + result = index.astype(dtype) + expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) + tm.assert_index_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' can't specify ordered, so only test once + result = index.astype("category") + expected = index + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 83fe21fd20bfe..9765c77c6b60c 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -3,10 +3,8 @@ from pandas._libs import index as libindex -from pandas.core.dtypes.dtypes import CategoricalDtype - import pandas as pd -from pandas import Categorical, IntervalIndex +from pandas import Categorical import pandas._testing as tm from pandas.core.indexes.api import CategoricalIndex, Index @@ -196,63 +194,6 @@ def test_delete(self): # Either depending on NumPy version ci.delete(10) - def test_astype(self): - - ci = self.create_index() - result = ci.astype(object) - tm.assert_index_equal(result, Index(np.array(ci))) - - # this IS equal, but not the same class - assert result.equals(ci) - assert isinstance(result, 
Index) - assert not isinstance(result, CategoricalIndex) - - # interval - ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], right=[2, 4], closed="right") - - ci = CategoricalIndex( - Categorical.from_codes([0, 1, -1], categories=ii, ordered=True) - ) - - result = ci.astype("interval") - expected = ii.take([0, 1, -1]) - tm.assert_index_equal(result, expected) - - result = IntervalIndex(result.values) - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize("name", [None, "foo"]) - @pytest.mark.parametrize("dtype_ordered", [True, False]) - @pytest.mark.parametrize("index_ordered", [True, False]) - def test_astype_category(self, name, dtype_ordered, index_ordered): - # GH 18630 - index = self.create_index(ordered=index_ordered) - if name: - index = index.rename(name) - - # standard categories - dtype = CategoricalDtype(ordered=dtype_ordered) - result = index.astype(dtype) - expected = CategoricalIndex( - index.tolist(), - name=name, - categories=index.categories, - ordered=dtype_ordered, - ) - tm.assert_index_equal(result, expected) - - # non-standard categories - dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) - result = index.astype(dtype) - expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) - tm.assert_index_equal(result, expected) - - if dtype_ordered is False: - # dtype='category' can't specify ordered, so only test once - result = index.astype("category") - expected = index - tm.assert_index_equal(result, expected) - @pytest.mark.parametrize( "data, non_lexsorted_data", [[[1, 2, 3], [9, 0, 1, 2, 3]], [list("abc"), list("fabcd")]], diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index 34169a670c169..7001dba442d16 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -12,7 +12,6 @@ Int64Index, NaT, PeriodIndex, - Series, Timestamp, date_range, ) @@ -65,37 +64,21 @@ def 
test_astype_with_tz(self): ) tm.assert_index_equal(result, expected) - # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex - result = pd.Series(pd.date_range("2012-01-01", periods=3)).astype(str) - expected = pd.Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) - tm.assert_series_equal(result, expected) - - result = Series(pd.date_range("2012-01-01", periods=3, tz="US/Eastern")).astype( - str - ) - expected = Series( - [ - "2012-01-01 00:00:00-05:00", - "2012-01-02 00:00:00-05:00", - "2012-01-03 00:00:00-05:00", - ], - dtype=object, - ) - tm.assert_series_equal(result, expected) - + def test_astype_tzaware_to_tzaware(self): # GH 18951: tz-aware to tz-aware idx = date_range("20170101", periods=4, tz="US/Pacific") result = idx.astype("datetime64[ns, US/Eastern]") expected = date_range("20170101 03:00:00", periods=4, tz="US/Eastern") tm.assert_index_equal(result, expected) + def test_astype_tznaive_to_tzaware(self): # GH 18951: tz-naive to tz-aware idx = date_range("20170101", periods=4) result = idx.astype("datetime64[ns, US/Eastern]") expected = date_range("20170101", periods=4, tz="US/Eastern") tm.assert_index_equal(result, expected) - def test_astype_str_compat(self): + def test_astype_str_nat(self): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) @@ -106,7 +89,8 @@ def test_astype_str_compat(self): def test_astype_str(self): # test astype string - #10442 - result = date_range("2012-01-01", periods=4, name="test_name").astype(str) + dti = date_range("2012-01-01", periods=4, name="test_name") + result = dti.astype(str) expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", @@ -114,10 +98,10 @@ def test_astype_str(self): ) tm.assert_index_equal(result, expected) + def test_astype_str_tz_and_name(self): # test astype string with tz and name - result = date_range( - "2012-01-01", periods=3, name="test_name", tz="US/Eastern" - ).astype(str) + dti = 
date_range("2012-01-01", periods=3, name="test_name", tz="US/Eastern") + result = dti.astype(str) expected = Index( [ "2012-01-01 00:00:00-05:00", @@ -129,10 +113,10 @@ def test_astype_str(self): ) tm.assert_index_equal(result, expected) + def test_astype_str_freq_and_name(self): # test astype string with freqH and name - result = date_range("1/1/2011", periods=3, freq="H", name="test_name").astype( - str - ) + dti = date_range("1/1/2011", periods=3, freq="H", name="test_name") + result = dti.astype(str) expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", @@ -140,10 +124,12 @@ def test_astype_str(self): ) tm.assert_index_equal(result, expected) + def test_astype_str_freq_and_tz(self): # test astype string with freqH and timezone - result = date_range( + dti = date_range( "3/6/2012 00:00", periods=2, freq="H", tz="Europe/London", name="test_name" - ).astype(str) + ) + result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], dtype=object, diff --git a/pandas/tests/indexes/numeric/test_astype.py b/pandas/tests/indexes/numeric/test_astype.py new file mode 100644 index 0000000000000..b24a4e0f9a38c --- /dev/null +++ b/pandas/tests/indexes/numeric/test_astype.py @@ -0,0 +1,81 @@ +import re + +import numpy as np +import pytest + +from pandas.core.dtypes.common import pandas_dtype + +from pandas import Float64Index, Index, Int64Index +import pandas._testing as tm + + +class TestAstype: + def test_astype(self): + + float_index = Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) + result = float_index.astype(object) + assert result.equals(float_index) + assert float_index.equals(result) + assert isinstance(result, Index) and not isinstance(result, Float64Index) + + # mixed int-float + i = Float64Index([1.5, 2, 3, 4, 5]) + i.name = "foo" + result = i.astype(object) + assert result.equals(i) + assert i.equals(result) + assert isinstance(result, Index) and not isinstance(result, 
Float64Index) + + # GH#12881 + # a float astype int + for dtype in ["int16", "int32", "int64"]: + i = Float64Index([0, 1, 2]) + result = i.astype(dtype) + expected = Int64Index([0, 1, 2]) + tm.assert_index_equal(result, expected) + + i = Float64Index([0, 1.1, 2]) + result = i.astype(dtype) + expected = Int64Index([0, 1, 2]) + tm.assert_index_equal(result, expected) + + for dtype in ["float32", "float64"]: + i = Float64Index([0, 1, 2]) + result = i.astype(dtype) + expected = i + tm.assert_index_equal(result, expected) + + i = Float64Index([0, 1.1, 2]) + result = i.astype(dtype) + expected = Index(i.values.astype(dtype)) + tm.assert_index_equal(result, expected) + + # invalid + for dtype in ["M8[ns]", "m8[ns]"]: + msg = ( + f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " + f"integer values are required for conversion" + ) + with pytest.raises(TypeError, match=re.escape(msg)): + i.astype(dtype) + + # GH#13149 + for dtype in ["int16", "int32", "int64"]: + i = Float64Index([0, 1.1, np.NAN]) + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(ValueError, match=msg): + i.astype(dtype) + + def test_cannot_cast_inf_to_int(self): + idx = Float64Index([1, 2, np.inf]) + + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(ValueError, match=msg): + idx.astype(int) + + def test_astype_from_object(self): + index = Index([1.0, np.nan, 0.2], dtype="object") + result = index.astype(float) + expected = Float64Index([1.0, np.nan, 0.2]) + assert result.dtype == expected.dtype + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 35a1cbd141c89..0965694e5b51d 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -1,5 +1,4 @@ from datetime import datetime, timedelta -import re import numpy as np import pytest @@ -9,7 +8,6 @@ import pandas as pd from pandas import Float64Index, 
Index, Int64Index, Series, UInt64Index import pandas._testing as tm -from pandas.api.types import pandas_dtype from pandas.tests.indexes.common import Base @@ -213,67 +211,6 @@ def test_constructor_explicit(self, mixed_index, float_index): mixed_index, Index([1.5, 2, 3, 4, 5], dtype=object), is_float_index=False ) - def test_astype(self, mixed_index, float_index): - - result = float_index.astype(object) - assert result.equals(float_index) - assert float_index.equals(result) - self.check_is_index(result) - - i = mixed_index.copy() - i.name = "foo" - result = i.astype(object) - assert result.equals(i) - assert i.equals(result) - self.check_is_index(result) - - # GH 12881 - # a float astype int - for dtype in ["int16", "int32", "int64"]: - i = Float64Index([0, 1, 2]) - result = i.astype(dtype) - expected = Int64Index([0, 1, 2]) - tm.assert_index_equal(result, expected) - - i = Float64Index([0, 1.1, 2]) - result = i.astype(dtype) - expected = Int64Index([0, 1, 2]) - tm.assert_index_equal(result, expected) - - for dtype in ["float32", "float64"]: - i = Float64Index([0, 1, 2]) - result = i.astype(dtype) - expected = i - tm.assert_index_equal(result, expected) - - i = Float64Index([0, 1.1, 2]) - result = i.astype(dtype) - expected = Index(i.values.astype(dtype)) - tm.assert_index_equal(result, expected) - - # invalid - for dtype in ["M8[ns]", "m8[ns]"]: - msg = ( - f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " - f"integer values are required for conversion" - ) - with pytest.raises(TypeError, match=re.escape(msg)): - i.astype(dtype) - - # GH 13149 - for dtype in ["int16", "int32", "int64"]: - i = Float64Index([0, 1.1, np.NAN]) - msg = r"Cannot convert non-finite values \(NA or inf\) to integer" - with pytest.raises(ValueError, match=msg): - i.astype(dtype) - - def test_cannot_cast_inf_to_int(self): - idx = pd.Float64Index([1, 2, np.inf]) - - msg = r"Cannot convert non-finite values \(NA or inf\) to integer" - with pytest.raises(ValueError, match=msg): - 
idx.astype(int) - def test_type_coercion_fail(self, any_int_dtype): # see gh-15832 msg = "Trying to coerce float values to integers" @@ -359,13 +296,6 @@ def test_nan_multiple_containment(self): i = Float64Index([1.0, 2.0]) tm.assert_numpy_array_equal(i.isin([np.nan]), np.array([False, False])) - def test_astype_from_object(self): - index = Index([1.0, np.nan, 0.2], dtype="object") - result = index.astype(float) - expected = Float64Index([1.0, np.nan, 0.2]) - assert result.dtype == expected.dtype - tm.assert_index_equal(result, expected) - def test_fillna_float64(self): # GH 11343 idx = Index([1.0, np.nan, 3.0], dtype=float, name="x") diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py new file mode 100644 index 0000000000000..9fdc4179de2e1 --- /dev/null +++ b/pandas/tests/series/methods/test_astype.py @@ -0,0 +1,25 @@ +from pandas import Series, date_range +import pandas._testing as tm + + +class TestAstype: + def test_astype_dt64_to_str(self): + # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex + dti = date_range("2012-01-01", periods=3) + result = Series(dti).astype(str) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + tm.assert_series_equal(result, expected) + + def test_astype_dt64tz_to_str(self): + # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex + dti_tz = date_range("2012-01-01", periods=3, tz="US/Eastern") + result = Series(dti_tz).astype(str) + expected = Series( + [ + "2012-01-01 00:00:00-05:00", + "2012-01-02 00:00:00-05:00", + "2012-01-03 00:00:00-05:00", + ], + dtype=object, + ) + tm.assert_series_equal(result, expected) From b108ea8afe930cd66e2e46b8a4e3ae2d601f52da Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 22 Apr 2020 17:41:43 -0700 Subject: [PATCH 2/3] implement test_astype --- pandas/tests/frame/methods/test_astype.py | 562 ++++++++++++++++++++++ pandas/tests/frame/test_dtypes.py | 546 +-------------------- 2 files 
changed, 564 insertions(+), 544 deletions(-) create mode 100644 pandas/tests/frame/methods/test_astype.py diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py new file mode 100644 index 0000000000000..dc779562b662d --- /dev/null +++ b/pandas/tests/frame/methods/test_astype.py @@ -0,0 +1,562 @@ +import re + +import numpy as np +import pytest + +from pandas import ( + Categorical, + CategoricalDtype, + DataFrame, + DatetimeTZDtype, + IntervalDtype, + NaT, + Series, + Timedelta, + Timestamp, + UInt64Index, + _np_version_under1p14, + concat, + date_range, + option_context, +) +import pandas._testing as tm +from pandas.core.arrays import integer_array + + +def _check_cast(df, v): + """ + Check if all dtypes of df are equal to v + """ + assert all(s.dtype.name == v for _, s in df.items()) + + +class TestAstype: + def test_astype_float(self, float_frame): + casted = float_frame.astype(int) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + casted = float_frame.astype(np.int32) + expected = DataFrame( + float_frame.values.astype(np.int32), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + float_frame["foo"] = "5" + casted = float_frame.astype(int) + expected = DataFrame( + float_frame.values.astype(int), + index=float_frame.index, + columns=float_frame.columns, + ) + tm.assert_frame_equal(casted, expected) + + def test_astype_mixed_float(self, mixed_float_frame): + # mixed casting + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32") + _check_cast(casted, "float32") + + casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") + _check_cast(casted, "float16") + + def test_astype_mixed_type(self, mixed_type_frame): + # mixed casting + mn = mixed_type_frame._get_numeric_data().copy() + mn["little_float"] = np.array(12345.0, 
dtype="float16") + mn["big_float"] = np.array(123456789101112.0, dtype="float64") + + casted = mn.astype("float64") + _check_cast(casted, "float64") + + casted = mn.astype("int64") + _check_cast(casted, "int64") + + casted = mn.reindex(columns=["little_float"]).astype("float16") + _check_cast(casted, "float16") + + casted = mn.astype("float32") + _check_cast(casted, "float32") + + casted = mn.astype("int32") + _check_cast(casted, "int32") + + # to object + casted = mn.astype("O") + _check_cast(casted, "object") + + def test_astype_with_exclude_string(self, float_frame): + df = float_frame.copy() + expected = float_frame.astype(int) + df["string"] = "foo" + casted = df.astype(int, errors="ignore") + + expected["string"] = "foo" + tm.assert_frame_equal(casted, expected) + + df = float_frame.copy() + expected = float_frame.astype(np.int32) + df["string"] = "foo" + casted = df.astype(np.int32, errors="ignore") + + expected["string"] = "foo" + tm.assert_frame_equal(casted, expected) + + def test_astype_with_view_float(self, float_frame): + + # this is the only real reason to do it this way + tf = np.round(float_frame).astype(np.int32) + casted = tf.astype(np.float32, copy=False) + + # TODO(wesm): verification? + tf = float_frame.astype(np.float64) + casted = tf.astype(np.int64, copy=False) # noqa + + def test_astype_with_view_mixed_float(self, mixed_float_frame): + + tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) + + casted = tf.astype(np.int64) + casted = tf.astype(np.float32) # noqa + + @pytest.mark.parametrize("dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("val", [np.nan, np.inf]) + def test_astype_cast_nan_inf_int(self, val, dtype): + # see GH#14265 + # + # Check NaN and inf --> raise error when converting to int. 
+ msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" + df = DataFrame([val]) + + with pytest.raises(ValueError, match=msg): + df.astype(dtype) + + def test_astype_str(self): + # see GH#9757 + a = Series(date_range("2010-01-04", periods=5)) + b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) + c = Series([Timedelta(x, unit="d") for x in range(5)]) + d = Series(range(5)) + e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + + df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) + + # Datetime-like + result = df.astype(str) + + expected = DataFrame( + { + "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), + "b": list(map(str, map(Timestamp, b._values))), + "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)), + "d": list(map(str, d._values)), + "e": list(map(str, e._values)), + } + ) + + tm.assert_frame_equal(result, expected) + + def test_astype_str_float(self): + # see GH#11302 + result = DataFrame([np.NaN]).astype(str) + expected = DataFrame(["nan"]) + + tm.assert_frame_equal(result, expected) + result = DataFrame([1.12345678901234567890]).astype(str) + + # < 1.14 truncates + # >= 1.14 preserves the full repr + val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" + expected = DataFrame([val]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype_class", [dict, Series]) + def test_astype_dict_like(self, dtype_class): + # GH7271 & GH16717 + a = Series(date_range("2010-01-04", periods=5)) + b = Series(range(5)) + c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + d = Series(["1.0", "2", "3.14", "4", "5.4"]) + df = DataFrame({"a": a, "b": b, "c": c, "d": d}) + original = df.copy(deep=True) + + # change type of a subset of columns + dt1 = dtype_class({"b": "str", "d": "float32"}) + result = df.astype(dt1) + expected = DataFrame( + { + "a": a, + "b": Series(["0", "1", "2", "3", "4"]), + "c": c, + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), + } + ) + 
tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, original) + + dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) + result = df.astype(dt2) + expected = DataFrame( + { + "a": a, + "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), + "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), + "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), + } + ) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, original) + + # change all columns + dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) + tm.assert_frame_equal(df.astype(dt3), df.astype(str)) + tm.assert_frame_equal(df, original) + + # error should be raised when using something other than column labels + # in the keys of the dtype dict + dt4 = dtype_class({"b": str, 2: str}) + dt5 = dtype_class({"e": str}) + msg = "Only a column name can be used for the key in a dtype mappings argument" + with pytest.raises(KeyError, match=msg): + df.astype(dt4) + with pytest.raises(KeyError, match=msg): + df.astype(dt5) + tm.assert_frame_equal(df, original) + + # if the dtypes provided are the same as the original dtypes, the + # resulting DataFrame should be the same as the original DataFrame + dt6 = dtype_class({col: df[col].dtype for col in df.columns}) + equiv = df.astype(dt6) + tm.assert_frame_equal(df, equiv) + tm.assert_frame_equal(df, original) + + # GH#16717 + # if dtypes provided is empty, the resulting DataFrame + # should be the same as the original DataFrame + dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object) + equiv = df.astype(dt7) + tm.assert_frame_equal(df, equiv) + tm.assert_frame_equal(df, original) + + def test_astype_duplicate_col(self): + a1 = Series([1, 2, 3, 4, 5], name="a") + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") + a2 = Series([0, 1, 2, 3, 4], name="a") + df = concat([a1, b, a2], axis=1) + + result = df.astype(str) + a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") + b_str = 
Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") + a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") + expected = concat([a1_str, b_str, a2_str], axis=1) + tm.assert_frame_equal(result, expected) + + result = df.astype({"a": "str"}) + expected = concat([a1_str, b, a2_str], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", + [ + "category", + CategoricalDtype(), + CategoricalDtype(ordered=True), + CategoricalDtype(ordered=False), + CategoricalDtype(categories=list("abcdef")), + CategoricalDtype(categories=list("edba"), ordered=False), + CategoricalDtype(categories=list("edcb"), ordered=True), + ], + ids=repr, + ) + def test_astype_categorical(self, dtype): + # GH#18099 + d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} + df = DataFrame(d) + result = df.astype(dtype) + expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) + def test_astype_categoricaldtype_class_raises(self, cls): + df = DataFrame({"A": ["a", "a", "b", "c"]}) + xpr = f"Expected an instance of {cls.__name__}" + with pytest.raises(TypeError, match=xpr): + df.astype({"A": cls}) + + with pytest.raises(TypeError, match=xpr): + df["A"].astype(cls) + + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) + def test_astype_extension_dtypes(self, dtype): + # GH#22578 + df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + + expected1 = DataFrame( + { + "a": integer_array([1, 3, 5], dtype=dtype), + "b": integer_array([2, 4, 6], dtype=dtype), + } + ) + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) + + df = DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) + df["b"] = df["b"].astype(dtype) + expected2 
= DataFrame( + {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)} + ) + tm.assert_frame_equal(df, expected2) + + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) + def test_astype_extension_dtypes_1d(self, dtype): + # GH#22578 + df = DataFrame({"a": [1.0, 2.0, 3.0]}) + + expected1 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + df = DataFrame({"a": [1.0, 2.0, 3.0]}) + df["a"] = df["a"].astype(dtype) + expected2 = DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) + tm.assert_frame_equal(df, expected2) + + tm.assert_frame_equal(df.astype(dtype), expected1) + tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) + + @pytest.mark.parametrize("dtype", ["category", "Int64"]) + def test_astype_extension_dtypes_duplicate_col(self, dtype): + # GH#24704 + a1 = Series([0, np.nan, 4], name="a") + a2 = Series([np.nan, 3, 5], name="a") + df = concat([a1, a2], axis=1) + + result = df.astype(dtype) + expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] + ) + def test_astype_column_metadata(self, dtype): + # GH#19920 + columns = UInt64Index([100, 200, 300], name="foo") + df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) + df = df.astype(dtype) + tm.assert_index_equal(df.columns, columns) + + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_from_datetimelike_to_object(self, dtype, unit): + # tests astype to object dtype + # GH#19223 / GH#12425 + dtype = f"{dtype}[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = 
DataFrame(arr) + result = df.astype(object) + assert (result.dtypes == object).all() + + if dtype.startswith("M8"): + assert result.iloc[0, 0] == Timestamp(1, unit=unit) + else: + assert result.iloc[0, 0] == Timedelta(1, unit=unit) + + @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) + @pytest.mark.parametrize("dtype", ["M8", "m8"]) + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): + # tests all units from numeric origination + # GH#19223 / GH#12425 + dtype = f"{dtype}[{unit}]" + arr = np.array([[1, 2, 3]], dtype=arr_dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def test_astype_to_datetime_unit(self, unit): + # tests all units from datetime origination + # GH#19223 + dtype = f"M8[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns"]) + def test_astype_to_timedelta_unit_ns(self, unit): + # preserve the timedelta conversion + # GH#19223 + dtype = f"m8[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(arr.astype(dtype)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) + def test_astype_to_timedelta_unit(self, unit): + # coerce to float + # GH#19223 + dtype = f"m8[{unit}]" + arr = np.array([[1, 2, 3]], dtype=dtype) + df = DataFrame(arr) + result = df.astype(dtype) + expected = DataFrame(df.values.astype(dtype).astype(float)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) + def 
test_astype_to_incorrect_datetimelike(self, unit): + # trying to astype a m to a M, or vice-versa + # GH#19224 + dtype = f"M8[{unit}]" + other = f"m8[{unit}]" + + df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) + msg = ( + fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " + fr"\[timedelta64\[{unit}\]\]" + ) + with pytest.raises(TypeError, match=msg): + df.astype(other) + + msg = ( + fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " + fr"\[datetime64\[{unit}\]\]" + ) + df = DataFrame(np.array([[1, 2, 3]], dtype=other)) + with pytest.raises(TypeError, match=msg): + df.astype(dtype) + + def test_astype_arg_for_errors(self): + # GH#14878 + + df = DataFrame([1, 2, 3]) + + msg = ( + "Expected value of kwarg 'errors' to be one of " + "['raise', 'ignore']. Supplied value is 'True'" + ) + with pytest.raises(ValueError, match=re.escape(msg)): + df.astype(np.float64, errors=True) + + df.astype(np.int8, errors="ignore") + + def test_astype_arg_for_errors_dictlist(self): + # GH#25905 + df = DataFrame( + [ + {"a": "1", "b": "16.5%", "c": "test"}, + {"a": "2.2", "b": "15.3", "c": "another_test"}, + ] + ) + expected = DataFrame( + [ + {"a": 1.0, "b": "16.5%", "c": "test"}, + {"a": 2.2, "b": "15.3", "c": "another_test"}, + ] + ) + type_dict = {"a": "float64", "b": "float64", "c": "object"} + + result = df.astype(dtype=type_dict, errors="ignore") + + tm.assert_frame_equal(result, expected) + + def test_astype_dt64tz(self, timezone_frame): + # astype + expected = np.array( + [ + [ + Timestamp("2013-01-01 00:00:00"), + Timestamp("2013-01-02 00:00:00"), + Timestamp("2013-01-03 00:00:00"), + ], + [ + Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), + NaT, + Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), + ], + [ + Timestamp("2013-01-01 00:00:00+0100", tz="CET"), + NaT, + Timestamp("2013-01-03 00:00:00+0100", tz="CET"), + ], + ], + dtype=object, + ).T + expected = DataFrame( + expected, + index=timezone_frame.index, + 
columns=timezone_frame.columns, + dtype=object, + ) + result = timezone_frame.astype(object) + tm.assert_frame_equal(result, expected) + + result = timezone_frame.astype("datetime64[ns]") + expected = DataFrame( + { + "A": date_range("20130101", periods=3), + "B": ( + date_range("20130101", periods=3, tz="US/Eastern") + .tz_convert("UTC") + .tz_localize(None) + ), + "C": ( + date_range("20130101", periods=3, tz="CET") + .tz_convert("UTC") + .tz_localize(None) + ), + } + ) + expected.iloc[1, 1] = NaT + expected.iloc[1, 2] = NaT + tm.assert_frame_equal(result, expected) + + def test_astype_dt64tz_to_str(self, timezone_frame): + # str formatting + result = timezone_frame.astype(str) + expected = DataFrame( + [ + [ + "2013-01-01", + "2013-01-01 00:00:00-05:00", + "2013-01-01 00:00:00+01:00", + ], + ["2013-01-02", "NaT", "NaT"], + [ + "2013-01-03", + "2013-01-03 00:00:00-05:00", + "2013-01-03 00:00:00+01:00", + ], + ], + columns=timezone_frame.columns, + ) + tm.assert_frame_equal(result, expected) + + with option_context("display.max_columns", 20): + result = str(timezone_frame) + assert ( + "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" + ) in result + assert ( + "1 2013-01-02 NaT NaT" + ) in result + assert ( + "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" + ) in result diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 27ebee4aaaccf..9d0c221923cda 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,26 +1,14 @@ from collections import OrderedDict from datetime import timedelta -import re import numpy as np import pytest -from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, IntervalDtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd -from pandas import ( - Categorical, - DataFrame, - Series, - Timedelta, - Timestamp, - _np_version_under1p14, - concat, - date_range, - option_context, -) +from pandas 
import DataFrame, Series, Timestamp, date_range, option_context import pandas._testing as tm -from pandas.core.arrays import integer_array def _check_cast(df, v): @@ -126,266 +114,6 @@ def test_dtypes_gh8722(self, float_string_frame): result = df.dtypes tm.assert_series_equal(result, Series({0: np.dtype("int64")})) - def test_astype_float(self, float_frame): - casted = float_frame.astype(int) - expected = DataFrame( - float_frame.values.astype(int), - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(casted, expected) - - casted = float_frame.astype(np.int32) - expected = DataFrame( - float_frame.values.astype(np.int32), - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(casted, expected) - - float_frame["foo"] = "5" - casted = float_frame.astype(int) - expected = DataFrame( - float_frame.values.astype(int), - index=float_frame.index, - columns=float_frame.columns, - ) - tm.assert_frame_equal(casted, expected) - - def test_astype_mixed_float(self, mixed_float_frame): - # mixed casting - casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float32") - _check_cast(casted, "float32") - - casted = mixed_float_frame.reindex(columns=["A", "B"]).astype("float16") - _check_cast(casted, "float16") - - def test_astype_mixed_type(self, mixed_type_frame): - # mixed casting - mn = mixed_type_frame._get_numeric_data().copy() - mn["little_float"] = np.array(12345.0, dtype="float16") - mn["big_float"] = np.array(123456789101112.0, dtype="float64") - - casted = mn.astype("float64") - _check_cast(casted, "float64") - - casted = mn.astype("int64") - _check_cast(casted, "int64") - - casted = mn.reindex(columns=["little_float"]).astype("float16") - _check_cast(casted, "float16") - - casted = mn.astype("float32") - _check_cast(casted, "float32") - - casted = mn.astype("int32") - _check_cast(casted, "int32") - - # to object - casted = mn.astype("O") - _check_cast(casted, "object") - - def 
test_astype_with_exclude_string(self, float_frame): - df = float_frame.copy() - expected = float_frame.astype(int) - df["string"] = "foo" - casted = df.astype(int, errors="ignore") - - expected["string"] = "foo" - tm.assert_frame_equal(casted, expected) - - df = float_frame.copy() - expected = float_frame.astype(np.int32) - df["string"] = "foo" - casted = df.astype(np.int32, errors="ignore") - - expected["string"] = "foo" - tm.assert_frame_equal(casted, expected) - - def test_astype_with_view_float(self, float_frame): - - # this is the only real reason to do it this way - tf = np.round(float_frame).astype(np.int32) - casted = tf.astype(np.float32, copy=False) - - # TODO(wesm): verification? - tf = float_frame.astype(np.float64) - casted = tf.astype(np.int64, copy=False) # noqa - - def test_astype_with_view_mixed_float(self, mixed_float_frame): - - tf = mixed_float_frame.reindex(columns=["A", "B", "C"]) - - casted = tf.astype(np.int64) - casted = tf.astype(np.float32) # noqa - - @pytest.mark.parametrize("dtype", [np.int32, np.int64]) - @pytest.mark.parametrize("val", [np.nan, np.inf]) - def test_astype_cast_nan_inf_int(self, val, dtype): - # see gh-14265 - # - # Check NaN and inf --> raise error when converting to int. 
- msg = "Cannot convert non-finite values \\(NA or inf\\) to integer" - df = DataFrame([val]) - - with pytest.raises(ValueError, match=msg): - df.astype(dtype) - - def test_astype_str(self): - # see gh-9757 - a = Series(date_range("2010-01-04", periods=5)) - b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern")) - c = Series([Timedelta(x, unit="d") for x in range(5)]) - d = Series(range(5)) - e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) - - df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e}) - - # Datetime-like - result = df.astype(str) - - expected = DataFrame( - { - "a": list(map(str, map(lambda x: Timestamp(x)._date_repr, a._values))), - "b": list(map(str, map(Timestamp, b._values))), - "c": list(map(lambda x: Timedelta(x)._repr_base(), c._values)), - "d": list(map(str, d._values)), - "e": list(map(str, e._values)), - } - ) - - tm.assert_frame_equal(result, expected) - - def test_astype_str_float(self): - # see gh-11302 - result = DataFrame([np.NaN]).astype(str) - expected = DataFrame(["nan"]) - - tm.assert_frame_equal(result, expected) - result = DataFrame([1.12345678901234567890]).astype(str) - - # < 1.14 truncates - # >= 1.14 preserves the full repr - val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" - expected = DataFrame([val]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("dtype_class", [dict, Series]) - def test_astype_dict_like(self, dtype_class): - # GH7271 & GH16717 - a = Series(date_range("2010-01-04", periods=5)) - b = Series(range(5)) - c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) - d = Series(["1.0", "2", "3.14", "4", "5.4"]) - df = DataFrame({"a": a, "b": b, "c": c, "d": d}) - original = df.copy(deep=True) - - # change type of a subset of columns - dt1 = dtype_class({"b": "str", "d": "float32"}) - result = df.astype(dt1) - expected = DataFrame( - { - "a": a, - "b": Series(["0", "1", "2", "3", "4"]), - "c": c, - "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), - } - ) - 
tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(df, original) - - dt2 = dtype_class({"b": np.float32, "c": "float32", "d": np.float64}) - result = df.astype(dt2) - expected = DataFrame( - { - "a": a, - "b": Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype="float32"), - "c": Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype="float32"), - "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float64"), - } - ) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(df, original) - - # change all columns - dt3 = dtype_class({"a": str, "b": str, "c": str, "d": str}) - tm.assert_frame_equal(df.astype(dt3), df.astype(str)) - tm.assert_frame_equal(df, original) - - # error should be raised when using something other than column labels - # in the keys of the dtype dict - dt4 = dtype_class({"b": str, 2: str}) - dt5 = dtype_class({"e": str}) - msg = "Only a column name can be used for the key in a dtype mappings argument" - with pytest.raises(KeyError, match=msg): - df.astype(dt4) - with pytest.raises(KeyError, match=msg): - df.astype(dt5) - tm.assert_frame_equal(df, original) - - # if the dtypes provided are the same as the original dtypes, the - # resulting DataFrame should be the same as the original DataFrame - dt6 = dtype_class({col: df[col].dtype for col in df.columns}) - equiv = df.astype(dt6) - tm.assert_frame_equal(df, equiv) - tm.assert_frame_equal(df, original) - - # GH 16717 - # if dtypes provided is empty, the resulting DataFrame - # should be the same as the original DataFrame - dt7 = dtype_class({}) if dtype_class is dict else dtype_class({}, dtype=object) - equiv = df.astype(dt7) - tm.assert_frame_equal(df, equiv) - tm.assert_frame_equal(df, original) - - def test_astype_duplicate_col(self): - a1 = Series([1, 2, 3, 4, 5], name="a") - b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name="b") - a2 = Series([0, 1, 2, 3, 4], name="a") - df = concat([a1, b, a2], axis=1) - - result = df.astype(str) - a1_str = Series(["1", "2", "3", "4", "5"], dtype="str", name="a") - b_str = 
Series(["0.1", "0.2", "0.4", "0.6", "0.8"], dtype=str, name="b") - a2_str = Series(["0", "1", "2", "3", "4"], dtype="str", name="a") - expected = concat([a1_str, b_str, a2_str], axis=1) - tm.assert_frame_equal(result, expected) - - result = df.astype({"a": "str"}) - expected = concat([a1_str, b, a2_str], axis=1) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "dtype", - [ - "category", - CategoricalDtype(), - CategoricalDtype(ordered=True), - CategoricalDtype(ordered=False), - CategoricalDtype(categories=list("abcdef")), - CategoricalDtype(categories=list("edba"), ordered=False), - CategoricalDtype(categories=list("edcb"), ordered=True), - ], - ids=repr, - ) - def test_astype_categorical(self, dtype): - # GH 18099 - d = {"A": list("abbc"), "B": list("bccd"), "C": list("cdde")} - df = DataFrame(d) - result = df.astype(dtype) - expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d}) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("cls", [CategoricalDtype, DatetimeTZDtype, IntervalDtype]) - def test_astype_categoricaldtype_class_raises(self, cls): - df = DataFrame({"A": ["a", "a", "b", "c"]}) - xpr = f"Expected an instance of {cls.__name__}" - with pytest.raises(TypeError, match=xpr): - df.astype({"A": cls}) - - with pytest.raises(TypeError, match=xpr): - df["A"].astype(cls) - def test_singlerow_slice_categoricaldtype_gives_series(self): # GH29521 df = pd.DataFrame({"x": pd.Categorical("a b c d e".split())}) @@ -395,158 +123,6 @@ def test_singlerow_slice_categoricaldtype_gives_series(self): tm.assert_series_equal(result, expected) - @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) - def test_astype_extension_dtypes(self, dtype): - # GH 22578 - df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) - - expected1 = pd.DataFrame( - { - "a": integer_array([1, 3, 5], dtype=dtype), - "b": integer_array([2, 4, 6], dtype=dtype), - } - ) - tm.assert_frame_equal(df.astype(dtype), 
expected1) - tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - tm.assert_frame_equal(df.astype(dtype).astype("float64"), df) - - df = pd.DataFrame([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], columns=["a", "b"]) - df["b"] = df["b"].astype(dtype) - expected2 = pd.DataFrame( - {"a": [1.0, 3.0, 5.0], "b": integer_array([2, 4, 6], dtype=dtype)} - ) - tm.assert_frame_equal(df, expected2) - - tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - - @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Int16"]) - def test_astype_extension_dtypes_1d(self, dtype): - # GH 22578 - df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) - - expected1 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) - tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - - df = pd.DataFrame({"a": [1.0, 2.0, 3.0]}) - df["a"] = df["a"].astype(dtype) - expected2 = pd.DataFrame({"a": integer_array([1, 2, 3], dtype=dtype)}) - tm.assert_frame_equal(df, expected2) - - tm.assert_frame_equal(df.astype(dtype), expected1) - tm.assert_frame_equal(df.astype("int64").astype(dtype), expected1) - - @pytest.mark.parametrize("dtype", ["category", "Int64"]) - def test_astype_extension_dtypes_duplicate_col(self, dtype): - # GH 24704 - a1 = Series([0, np.nan, 4], name="a") - a2 = Series([np.nan, 3, 5], name="a") - df = concat([a1, a2], axis=1) - - result = df.astype(dtype) - expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "dtype", [{100: "float64", 200: "uint64"}, "category", "float64"] - ) - def test_astype_column_metadata(self, dtype): - # GH 19920 - columns = pd.UInt64Index([100, 200, 300], name="foo") - df = DataFrame(np.arange(15).reshape(5, 3), columns=columns) - df = df.astype(dtype) - tm.assert_index_equal(df.columns, columns) - - @pytest.mark.parametrize("dtype", ["M8", 
"m8"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) - def test_astype_from_datetimelike_to_object(self, dtype, unit): - # tests astype to object dtype - # gh-19223 / gh-12425 - dtype = f"{dtype}[{unit}]" - arr = np.array([[1, 2, 3]], dtype=dtype) - df = DataFrame(arr) - result = df.astype(object) - assert (result.dtypes == object).all() - - if dtype.startswith("M8"): - assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit) - else: - assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit) - - @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64]) - @pytest.mark.parametrize("dtype", ["M8", "m8"]) - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) - def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): - # tests all units from numeric origination - # gh-19223 / gh-12425 - dtype = f"{dtype}[{unit}]" - arr = np.array([[1, 2, 3]], dtype=arr_dtype) - df = DataFrame(arr) - result = df.astype(dtype) - expected = DataFrame(arr.astype(dtype)) - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) - def test_astype_to_datetime_unit(self, unit): - # tests all units from datetime origination - # gh-19223 - dtype = f"M8[{unit}]" - arr = np.array([[1, 2, 3]], dtype=dtype) - df = DataFrame(arr) - result = df.astype(dtype) - expected = DataFrame(arr.astype(dtype)) - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("unit", ["ns"]) - def test_astype_to_timedelta_unit_ns(self, unit): - # preserver the timedelta conversion - # gh-19223 - dtype = f"m8[{unit}]" - arr = np.array([[1, 2, 3]], dtype=dtype) - df = DataFrame(arr) - result = df.astype(dtype) - expected = DataFrame(arr.astype(dtype)) - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) - def test_astype_to_timedelta_unit(self, unit): - # coerce to float - # gh-19223 - dtype = f"m8[{unit}]" - arr = 
np.array([[1, 2, 3]], dtype=dtype) - df = DataFrame(arr) - result = df.astype(dtype) - expected = DataFrame(df.values.astype(dtype).astype(float)) - - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) - def test_astype_to_incorrect_datetimelike(self, unit): - # trying to astype a m to a M, or vice-versa - # gh-19224 - dtype = f"M8[{unit}]" - other = f"m8[{unit}]" - - df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = ( - fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " - fr"\[timedelta64\[{unit}\]\]" - ) - with pytest.raises(TypeError, match=msg): - df.astype(other) - - msg = ( - fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " - fr"\[datetime64\[{unit}\]\]" - ) - df = DataFrame(np.array([[1, 2, 3]], dtype=other)) - with pytest.raises(TypeError, match=msg): - df.astype(dtype) - def test_timedeltas(self): df = DataFrame( dict( @@ -586,40 +162,6 @@ def test_timedeltas(self): ) tm.assert_series_equal(result, expected) - def test_arg_for_errors_in_astype(self): - # issue #14878 - - df = DataFrame([1, 2, 3]) - - msg = ( - "Expected value of kwarg 'errors' to be one of " - "['raise', 'ignore']. 
Supplied value is 'True'" - ) - with pytest.raises(ValueError, match=re.escape(msg)): - df.astype(np.float64, errors=True) - - df.astype(np.int8, errors="ignore") - - def test_arg_for_errors_in_astype_dictlist(self): - # GH-25905 - df = pd.DataFrame( - [ - {"a": "1", "b": "16.5%", "c": "test"}, - {"a": "2.2", "b": "15.3", "c": "another_test"}, - ] - ) - expected = pd.DataFrame( - [ - {"a": 1.0, "b": "16.5%", "c": "test"}, - {"a": 2.2, "b": "15.3", "c": "another_test"}, - ] - ) - type_dict = {"a": "float64", "b": "float64", "c": "object"} - - result = df.astype(dtype=type_dict, errors="ignore") - - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( "input_vals", [ @@ -785,87 +327,3 @@ def test_interleave(self, timezone_frame): dtype=object, ).T tm.assert_numpy_array_equal(result, expected) - - def test_astype(self, timezone_frame): - # astype - expected = np.array( - [ - [ - Timestamp("2013-01-01 00:00:00"), - Timestamp("2013-01-02 00:00:00"), - Timestamp("2013-01-03 00:00:00"), - ], - [ - Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"), - pd.NaT, - Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"), - ], - [ - Timestamp("2013-01-01 00:00:00+0100", tz="CET"), - pd.NaT, - Timestamp("2013-01-03 00:00:00+0100", tz="CET"), - ], - ], - dtype=object, - ).T - expected = DataFrame( - expected, - index=timezone_frame.index, - columns=timezone_frame.columns, - dtype=object, - ) - result = timezone_frame.astype(object) - tm.assert_frame_equal(result, expected) - - result = timezone_frame.astype("datetime64[ns]") - expected = DataFrame( - { - "A": date_range("20130101", periods=3), - "B": ( - date_range("20130101", periods=3, tz="US/Eastern") - .tz_convert("UTC") - .tz_localize(None) - ), - "C": ( - date_range("20130101", periods=3, tz="CET") - .tz_convert("UTC") - .tz_localize(None) - ), - } - ) - expected.iloc[1, 1] = pd.NaT - expected.iloc[1, 2] = pd.NaT - tm.assert_frame_equal(result, expected) - - def test_astype_str(self, timezone_frame): - # 
str formatting - result = timezone_frame.astype(str) - expected = DataFrame( - [ - [ - "2013-01-01", - "2013-01-01 00:00:00-05:00", - "2013-01-01 00:00:00+01:00", - ], - ["2013-01-02", "NaT", "NaT"], - [ - "2013-01-03", - "2013-01-03 00:00:00-05:00", - "2013-01-03 00:00:00+01:00", - ], - ], - columns=timezone_frame.columns, - ) - tm.assert_frame_equal(result, expected) - - with option_context("display.max_columns", 20): - result = str(timezone_frame) - assert ( - "0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00" - ) in result - assert ( - "1 2013-01-02 NaT NaT" - ) in result - assert ( - "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" - ) in result From 221835d7f49feb04eacb289b0fa62ccc84f82a9b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 22 Apr 2020 17:49:16 -0700 Subject: [PATCH 3/3] split_parametrize --- pandas/tests/indexes/numeric/test_astype.py | 86 +++++++++++---------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/pandas/tests/indexes/numeric/test_astype.py b/pandas/tests/indexes/numeric/test_astype.py index b24a4e0f9a38c..1771f4336df67 100644 --- a/pandas/tests/indexes/numeric/test_astype.py +++ b/pandas/tests/indexes/numeric/test_astype.py @@ -10,68 +10,70 @@ class TestAstype: - def test_astype(self): - + def test_astype_float64_to_object(self): float_index = Float64Index([0.0, 2.5, 5.0, 7.5, 10.0]) result = float_index.astype(object) assert result.equals(float_index) assert float_index.equals(result) assert isinstance(result, Index) and not isinstance(result, Float64Index) + def test_astype_float64_mixed_to_object(self): # mixed int-float - i = Float64Index([1.5, 2, 3, 4, 5]) - i.name = "foo" - result = i.astype(object) - assert result.equals(i) - assert i.equals(result) + idx = Float64Index([1.5, 2, 3, 4, 5]) + idx.name = "foo" + result = idx.astype(object) + assert result.equals(idx) + assert idx.equals(result) assert isinstance(result, Index) and not isinstance(result, Float64Index) + 
@pytest.mark.parametrize("dtype", ["int16", "int32", "int64"]) + def test_astype_float64_to_int_dtype(self, dtype): # GH#12881 # a float astype int - for dtype in ["int16", "int32", "int64"]: - i = Float64Index([0, 1, 2]) - result = i.astype(dtype) - expected = Int64Index([0, 1, 2]) - tm.assert_index_equal(result, expected) + idx = Float64Index([0, 1, 2]) + result = idx.astype(dtype) + expected = Int64Index([0, 1, 2]) + tm.assert_index_equal(result, expected) - i = Float64Index([0, 1.1, 2]) - result = i.astype(dtype) - expected = Int64Index([0, 1, 2]) - tm.assert_index_equal(result, expected) + idx = Float64Index([0, 1.1, 2]) + result = idx.astype(dtype) + expected = Int64Index([0, 1, 2]) + tm.assert_index_equal(result, expected) - for dtype in ["float32", "float64"]: - i = Float64Index([0, 1, 2]) - result = i.astype(dtype) - expected = i - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("dtype", ["float32", "float64"]) + def test_astype_float64_to_float_dtype(self, dtype): + # GH#12881 + # a float astype int + idx = Float64Index([0, 1, 2]) + result = idx.astype(dtype) + expected = idx + tm.assert_index_equal(result, expected) + + idx = Float64Index([0, 1.1, 2]) + result = idx.astype(dtype) + expected = Index(idx.values.astype(dtype)) + tm.assert_index_equal(result, expected) - i = Float64Index([0, 1.1, 2]) - result = i.astype(dtype) - expected = Index(i.values.astype(dtype)) - tm.assert_index_equal(result, expected) + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_cannot_cast_to_datetimelike(self, dtype): + idx = Float64Index([0, 1.1, 2]) - # invalid - for dtype in ["M8[ns]", "m8[ns]"]: - msg = ( - f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " - f"integer values are required for conversion" - ) - with pytest.raises(TypeError, match=re.escape(msg)): - i.astype(dtype) + msg = ( + f"Cannot convert Float64Index to dtype {pandas_dtype(dtype)}; " + f"integer values are required for conversion" + ) + with 
pytest.raises(TypeError, match=re.escape(msg)): + idx.astype(dtype) + @pytest.mark.parametrize("dtype", [int, "int16", "int32", "int64"]) + @pytest.mark.parametrize("non_finite", [np.inf, np.nan]) + def test_cannot_cast_inf_to_int(self, non_finite, dtype): # GH#13149 - for dtype in ["int16", "int32", "int64"]: - i = Float64Index([0, 1.1, np.NAN]) - msg = r"Cannot convert non-finite values \(NA or inf\) to integer" - with pytest.raises(ValueError, match=msg): - i.astype(dtype) - - def test_cannot_cast_inf_to_int(self): - idx = Float64Index([1, 2, np.inf]) + idx = Float64Index([1, 2, non_finite]) msg = r"Cannot convert non-finite values \(NA or inf\) to integer" with pytest.raises(ValueError, match=msg): - idx.astype(int) + idx.astype(dtype) def test_astype_from_object(self): index = Index([1.0, np.nan, 0.2], dtype="object")