From 3842ea9257c0130d36768c4e5c6882344c5f4156 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 17:54:35 -0800 Subject: [PATCH 1/3] BUG: DatetimeArray/TimedeltaArray from PandasArray GH#24615 --- pandas/core/arrays/datetimes.py | 9 ++-- pandas/core/arrays/timedeltas.py | 9 +++- pandas/tests/arrays/test_datetimelike.py | 55 ++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e28a1a2326d17..d572bde3078c0 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -69,7 +69,9 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range +from pandas.core.arrays.integer import IntegerArray import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.tseries.frequencies import get_period_alias from pandas.tseries.offsets import ( @@ -239,8 +241,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _freq = None def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): - if isinstance(values, (ABCSeries, ABCIndex)): - values = values._values + values = extract_array(values, extract_numpy=True) + if isinstance(values, IntegerArray): + values = values.to_numpy("int64", na_value=iNaT) inferred_freq = getattr(values, "_freq", None) @@ -266,7 +269,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): if not isinstance(values, np.ndarray): raise ValueError( f"Unexpected type '{type(values).__name__}'. 'values' must be " - "a DatetimeArray ndarray, or Series or Index containing one of those." + "a DatetimeArray, ndarray, or Series or Index containing one of those." ) if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c371e27eeceac..b75f11ca92200 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -63,6 +63,7 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import ( IntegerArray, + PandasArray, datetimelike as dtl, ) from pandas.core.arrays._ranges import generate_regular_range @@ -170,7 +171,9 @@ def dtype(self) -> np.dtype: _freq = None def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): - values = extract_array(values) + values = extract_array(values, extract_numpy=True) + if isinstance(values, IntegerArray): + values = values.to_numpy("int64", na_value=tslibs.iNaT) inferred_freq = getattr(values, "_freq", None) explicit_none = freq is None @@ -190,7 +193,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): if not isinstance(values, np.ndarray): msg = ( f"Unexpected type '{type(values).__name__}'. 'values' must be a " - "TimedeltaArray ndarray, or Series or Index containing one of those." + "TimedeltaArray, ndarray, or Series or Index containing one of those." ) raise ValueError(msg) if values.ndim not in [1, 2]: @@ -958,6 +961,8 @@ def sequence_to_td64ns( elif isinstance(data, TimedeltaArray): inferred_freq = data.freq data = data._ndarray + elif isinstance(data, PandasArray): + data = data.to_numpy() elif isinstance(data, IntegerArray): data = data.to_numpy("int64", na_value=tslibs.iNaT) elif is_categorical_dtype(data.dtype): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 87a095e1003c4..bffab6effa9fc 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -28,6 +28,8 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.datetimes import sequence_to_dt64ns +from pandas.core.arrays.timedeltas import sequence_to_td64ns # TODO: more freq variants @@ -224,7 +226,7 @@ def test_unbox_scalar(self): result = arr._unbox_scalar(pd.NaT) assert isinstance(result, expected) - msg = f"'value' should be a {self.dtype.__name__}." + msg = f"'value' should be a {self.scalar_type.__name__}." with pytest.raises(ValueError, match=msg): arr._unbox_scalar("foo") @@ -614,11 +616,21 @@ def test_median(self, arr1d): result = arr2.median(axis=1, skipna=False) tm.assert_equal(result, arr) + def test_from_integer_array(self): + arr = np.array([1, 2, 3], dtype=np.int64) + data = pd.array(arr, dtype="Int64") + + result = self.array_cls(data, dtype=self.example_dtype) + expected = self.array_cls(arr, dtype=self.example_dtype) + + tm.assert_extension_array_equal(result, expected) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex array_cls = DatetimeArray - dtype = Timestamp + scalar_type = Timestamp + example_dtype = "M8[ns]" @pytest.fixture def arr1d(self, tz_naive_fixture, freqstr): @@ -918,7 +930,8 @@ def test_strftime_nat(self): class TestTimedeltaArray(SharedTests): index_cls = TimedeltaIndex array_cls = TimedeltaArray - dtype = pd.Timedelta + scalar_type = pd.Timedelta + example_dtype = "m8[ns]" def test_from_tdi(self): tdi = TimedeltaIndex(["1 Day", "3 Hours"]) @@ -1037,7 +1050,8 @@ def test_take_fill_valid(self, timedelta_index): class TestPeriodArray(SharedTests): index_cls = PeriodIndex array_cls = PeriodArray - dtype = Period + scalar_type = Period + example_dtype = PeriodIndex([], freq="W").dtype @pytest.fixture def arr1d(self, period_index): @@ -1305,3 +1319,36 @@ def test_period_index_construction_from_strings(klass): result = PeriodIndex(data, freq="Q") expected = PeriodIndex([Period(s) for s in strings]) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) +def test_from_pandas_array(dtype): + # GH#24615 + data = np.array([1, 2, 3], dtype=dtype) + arr = PandasArray(data) + + cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] + + result = cls(arr) + expected = cls(data) + tm.assert_extension_array_equal(result, expected) + + result = cls._from_sequence(arr) + expected = cls._from_sequence(data) + tm.assert_extension_array_equal(result, expected) + + func = {"M8[ns]": sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype] + result = func(arr)[0] + expected = func(data)[0] + tm.assert_equal(result, expected) + + func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype] + result = func(arr).array + expected = func(data).array + tm.assert_equal(result, expected) + + # Let's check the Indexes while we're here + idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype] + result = idx_cls(arr) + expected = idx_cls(data) + tm.assert_index_equal(result, expected) From 2e8f791e4e1e2a858bb9166ff94bd4fdd5e3c2ea Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 15:33:51 -0800 Subject: [PATCH 2/3] BUG: constructing DTA/TDA from xarray/dask/pandasarray --- pandas/core/arrays/datetimes.py | 53 +++++++++--------- pandas/core/arrays/timedeltas.py | 32 ++++++----- pandas/tests/arrays/test_datetimelike.py | 69 +++++++++++++++++++++++- 3 files changed, 108 insertions(+), 46 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index e79248fbc96f1..73ffd52623c0a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -59,15 +59,14 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ( - ABCIndex, - ABCPandasArray, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.dtypes.missing import isna from pandas.core.algorithms import checked_add_with_arr -from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays import ( + ExtensionArray, + datetimelike as dtl, +) from pandas.core.arrays._ranges import generate_regular_range from pandas.core.arrays.integer import IntegerArray import pandas.core.common as com @@ -1978,6 +1977,9 @@ def sequence_to_dt64ns( dtype = _validate_dt64_dtype(dtype) tz = timezones.maybe_get_tz(tz) + # if dtype has an embedded tz, capture it + tz = validate_tz_from_dtype(dtype, tz) + if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: @@ -1985,23 +1987,19 @@ def sequence_to_dt64ns( data = list(data) data = np.asarray(data) copy = False - elif isinstance(data, ABCSeries): - data = data._values - if isinstance(data, ABCPandasArray): - data = data.to_numpy() - - if hasattr(data, "freq"): - # i.e. DatetimeArray/Index - inferred_freq = data.freq + elif isinstance(data, ABCMultiIndex): + raise TypeError("Cannot create a DatetimeArray from a MultiIndex.") + else: + data = extract_array(data, extract_numpy=True) - # if dtype has an embedded tz, capture it - tz = validate_tz_from_dtype(dtype, tz) + if isinstance(data, IntegerArray): + data = data.to_numpy("int64", na_value=iNaT) + elif not isinstance(data, (np.ndarray, ExtensionArray)): + # GH#24539 e.g. xarray, dask object + data = np.asarray(data) - if isinstance(data, ABCIndex): - if data.nlevels > 1: - # Without this check, data._data below is None - raise TypeError("Cannot create a DatetimeArray from a MultiIndex.") - data = data._data + if isinstance(data, DatetimeArray): + inferred_freq = data.freq # By this point we are assured to have either a numpy array or Index data, copy = maybe_convert_dtype(data, copy) @@ -2045,13 +2043,14 @@ def sequence_to_dt64ns( if is_datetime64tz_dtype(data_dtype): # DatetimeArray -> ndarray tz = _maybe_infer_tz(tz, data.tz) - result = data._data + result = data._ndarray elif is_datetime64_dtype(data_dtype): # tz-naive DatetimeArray or ndarray[datetime64] - data = getattr(data, "_data", data) + data = getattr(data, "_ndarray", data) if data.dtype != DT64NS_DTYPE: data = conversion.ensure_datetime64ns(data) + copy = False if tz is not None: # Convert tz-naive to UTC @@ -2088,11 +2087,11 @@ def sequence_to_dt64ns( def objects_to_datetime64ns( - data, - dayfirst, - yearfirst, + data: np.ndarray, + dayfirst: bool, + yearfirst: bool, utc=False, - errors="raise", + errors: str = "raise", require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False, diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b75f11ca92200..81c8e2e7e4fb7 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -53,17 +53,14 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype -from pandas.core.dtypes.generic import ( - ABCSeries, - ABCTimedeltaIndex, -) +from pandas.core.dtypes.generic import ABCMultiIndex from pandas.core.dtypes.missing import isna from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import ( + ExtensionArray, IntegerArray, - PandasArray, datetimelike as dtl, ) from pandas.core.arrays._ranges import generate_regular_range @@ -953,22 +950,23 @@ def sequence_to_td64ns( # i.e. generator data = list(data) data = np.array(data, copy=False) - elif isinstance(data, ABCSeries): - data = data._values - elif isinstance(data, ABCTimedeltaIndex): - inferred_freq = data.freq - data = data._data._ndarray - elif isinstance(data, TimedeltaArray): - inferred_freq = data.freq - data = data._ndarray - elif isinstance(data, PandasArray): - data = data.to_numpy() - elif isinstance(data, IntegerArray): - data = data.to_numpy("int64", na_value=tslibs.iNaT) + elif isinstance(data, ABCMultiIndex): + raise TypeError("Cannot create a DatetimeArray from a MultiIndex.") + else: + data = extract_array(data, extract_numpy=True) + + if isinstance(data, IntegerArray): + data = data.to_numpy("int64", na_value=iNaT) + elif not isinstance(data, (np.ndarray, ExtensionArray)): + # GH#24539 e.g. xarray, dask object + data = np.asarray(data) elif is_categorical_dtype(data.dtype): data = data.categories.take(data.codes, fill_value=NaT)._values copy = False + if isinstance(data, TimedeltaArray): + inferred_freq = data.freq + # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index bffab6effa9fc..2438f3364c632 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -13,6 +13,7 @@ Timestamp, ) from pandas.compat import np_version_under1p18 +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -618,10 +619,10 @@ def test_median(self, arr1d): def test_from_integer_array(self): arr = np.array([1, 2, 3], dtype=np.int64) - data = pd.array(arr, dtype="Int64") + expected = self.array_cls(arr, dtype=self.example_dtype) + data = pd.array(arr, dtype="Int64") result = self.array_cls(data, dtype=self.example_dtype) - expected = self.array_cls(arr, dtype=self.example_dtype) tm.assert_extension_array_equal(result, expected) @@ -1352,3 +1353,67 @@ def test_from_pandas_array(dtype): result = idx_cls(arr) expected = idx_cls(data) tm.assert_index_equal(result, expected) + + +@pytest.fixture( + params=[ + "memoryview", + "array", + pytest.param("dask", marks=td.skip_if_no("dask.array")), + pytest.param("xarray", marks=td.skip_if_no("xarray")), + ] +) +def array_likes(request): + # GH#24539 recognize e.g xarray, dask, ... + arr = np.array([1, 2, 3], dtype=np.int64) + + name = request.param + if name == "memoryview": + data = memoryview(arr) + elif name == "array": + # stdlib array + from array import array + + data = array("i", arr) + elif name == "dask": + import dask.array + + data = dask.array.array(arr) + elif name == "xarray": + import xarray as xr + + data = xr.DataArray(arr) + + return arr, data + + +@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) +def test_from_obscure_array(dtype, array_likes): + # GH#24539 recognize e.g xarray, dask, ... + # Note: we dont do this for PeriodArray bc _from_sequence won't accept + # an array of integers + # TODO: could check with arraylike of Period objects + arr, data = array_likes + + cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype] + + expected = cls(arr) + result = cls._from_sequence(data) + tm.assert_extension_array_equal(result, expected) + + func = {"M8[ns]": sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype] + result = func(arr)[0] + expected = func(data)[0] + tm.assert_equal(result, expected) + + # FIXME: dask and memoryview both break on these + # func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype] + # result = func(arr).array + # expected = func(data).array + # tm.assert_equal(result, expected) + + # Let's check the Indexes while we're here + idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype] + result = idx_cls(arr) + expected = idx_cls(data) + tm.assert_index_equal(result, expected) From 8e6e930dbc81de7214821c7f10de01ae62c9c651 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 19:07:54 -0800 Subject: [PATCH 3/3] mypy fixup --- pandas/core/arrays/datetimes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 73ffd52623c0a..e163cd1cb8e06 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2088,10 +2088,10 @@ def sequence_to_dt64ns( def objects_to_datetime64ns( data: np.ndarray, - dayfirst: bool, - yearfirst: bool, + dayfirst, + yearfirst, utc=False, - errors: str = "raise", + errors="raise", require_iso8601: bool = False, allow_object: bool = False, allow_mixed: bool = False,