Skip to content

BUG: constructing DTA/TDA from xarray/dask/pandasarray #40210

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 29 additions & 27 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,18 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
ABCIndex,
ABCPandasArray,
ABCSeries,
)
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas.core.dtypes.missing import isna

from pandas.core.algorithms import checked_add_with_arr
from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays import (
ExtensionArray,
datetimelike as dtl,
)
from pandas.core.arrays._ranges import generate_regular_range
from pandas.core.arrays.integer import IntegerArray
import pandas.core.common as com
from pandas.core.construction import extract_array

from pandas.tseries.frequencies import get_period_alias
from pandas.tseries.offsets import (
Expand Down Expand Up @@ -239,8 +240,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
_freq = None

def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
if isinstance(values, (ABCSeries, ABCIndex)):
values = values._values
values = extract_array(values, extract_numpy=True)
if isinstance(values, IntegerArray):
values = values.to_numpy("int64", na_value=iNaT)

inferred_freq = getattr(values, "_freq", None)

Expand All @@ -266,7 +268,7 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
if not isinstance(values, np.ndarray):
raise ValueError(
f"Unexpected type '{type(values).__name__}'. 'values' must be "
"a DatetimeArray ndarray, or Series or Index containing one of those."
"a DatetimeArray, ndarray, or Series or Index containing one of those."
)
if values.ndim not in [1, 2]:
raise ValueError("Only 1-dimensional input arrays are supported.")
Expand Down Expand Up @@ -1975,30 +1977,29 @@ def sequence_to_dt64ns(
dtype = _validate_dt64_dtype(dtype)
tz = timezones.maybe_get_tz(tz)

# if dtype has an embedded tz, capture it
tz = validate_tz_from_dtype(dtype, tz)

if not hasattr(data, "dtype"):
# e.g. list, tuple
if np.ndim(data) == 0:
# i.e. generator
data = list(data)
data = np.asarray(data)
copy = False
elif isinstance(data, ABCSeries):
data = data._values
if isinstance(data, ABCPandasArray):
data = data.to_numpy()

if hasattr(data, "freq"):
# i.e. DatetimeArray/Index
inferred_freq = data.freq
elif isinstance(data, ABCMultiIndex):
raise TypeError("Cannot create a DatetimeArray from a MultiIndex.")
else:
data = extract_array(data, extract_numpy=True)

# if dtype has an embedded tz, capture it
tz = validate_tz_from_dtype(dtype, tz)
if isinstance(data, IntegerArray):
data = data.to_numpy("int64", na_value=iNaT)
elif not isinstance(data, (np.ndarray, ExtensionArray)):
# GH#24539 e.g. xarray, dask object
data = np.asarray(data)

if isinstance(data, ABCIndex):
if data.nlevels > 1:
# Without this check, data._data below is None
raise TypeError("Cannot create a DatetimeArray from a MultiIndex.")
data = data._data
if isinstance(data, DatetimeArray):
inferred_freq = data.freq

# By this point we are assured to have either a numpy array or Index
data, copy = maybe_convert_dtype(data, copy)
Expand Down Expand Up @@ -2042,13 +2043,14 @@ def sequence_to_dt64ns(
if is_datetime64tz_dtype(data_dtype):
# DatetimeArray -> ndarray
tz = _maybe_infer_tz(tz, data.tz)
result = data._data
result = data._ndarray

elif is_datetime64_dtype(data_dtype):
# tz-naive DatetimeArray or ndarray[datetime64]
data = getattr(data, "_data", data)
data = getattr(data, "_ndarray", data)
if data.dtype != DT64NS_DTYPE:
data = conversion.ensure_datetime64ns(data)
copy = False

if tz is not None:
# Convert tz-naive to UTC
Expand Down Expand Up @@ -2085,7 +2087,7 @@ def sequence_to_dt64ns(


def objects_to_datetime64ns(
data,
data: np.ndarray,
dayfirst,
yearfirst,
utc=False,
Expand Down
35 changes: 19 additions & 16 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,13 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import (
ABCSeries,
ABCTimedeltaIndex,
)
from pandas.core.dtypes.generic import ABCMultiIndex
from pandas.core.dtypes.missing import isna

from pandas.core import nanops
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.arrays import (
ExtensionArray,
IntegerArray,
datetimelike as dtl,
)
Expand Down Expand Up @@ -170,7 +168,9 @@ def dtype(self) -> np.dtype:
_freq = None

def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
values = extract_array(values)
values = extract_array(values, extract_numpy=True)
if isinstance(values, IntegerArray):
values = values.to_numpy("int64", na_value=tslibs.iNaT)

inferred_freq = getattr(values, "_freq", None)
explicit_none = freq is None
Expand All @@ -190,7 +190,7 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
if not isinstance(values, np.ndarray):
msg = (
f"Unexpected type '{type(values).__name__}'. 'values' must be a "
"TimedeltaArray ndarray, or Series or Index containing one of those."
"TimedeltaArray, ndarray, or Series or Index containing one of those."
)
raise ValueError(msg)
if values.ndim not in [1, 2]:
Expand Down Expand Up @@ -950,20 +950,23 @@ def sequence_to_td64ns(
# i.e. generator
data = list(data)
data = np.array(data, copy=False)
elif isinstance(data, ABCSeries):
data = data._values
elif isinstance(data, ABCTimedeltaIndex):
inferred_freq = data.freq
data = data._data._ndarray
elif isinstance(data, TimedeltaArray):
inferred_freq = data.freq
data = data._ndarray
elif isinstance(data, IntegerArray):
data = data.to_numpy("int64", na_value=tslibs.iNaT)
elif isinstance(data, ABCMultiIndex):
raise TypeError("Cannot create a DatetimeArray from a MultiIndex.")
else:
data = extract_array(data, extract_numpy=True)

if isinstance(data, IntegerArray):
data = data.to_numpy("int64", na_value=iNaT)
elif not isinstance(data, (np.ndarray, ExtensionArray)):
# GH#24539 e.g. xarray, dask object
data = np.asarray(data)
elif is_categorical_dtype(data.dtype):
data = data.categories.take(data.codes, fill_value=NaT)._values
copy = False

if isinstance(data, TimedeltaArray):
inferred_freq = data.freq

# Convert whatever we have into timedelta64[ns] dtype
if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
# no need to make a copy, need to convert if string-dtyped
Expand Down
120 changes: 116 additions & 4 deletions pandas/tests/arrays/test_datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Timestamp,
)
from pandas.compat import np_version_under1p18
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Expand All @@ -28,6 +29,8 @@
PeriodArray,
TimedeltaArray,
)
from pandas.core.arrays.datetimes import sequence_to_dt64ns
from pandas.core.arrays.timedeltas import sequence_to_td64ns


# TODO: more freq variants
Expand Down Expand Up @@ -224,7 +227,7 @@ def test_unbox_scalar(self):
result = arr._unbox_scalar(NaT)
assert isinstance(result, expected)

msg = f"'value' should be a {self.dtype.__name__}."
msg = f"'value' should be a {self.scalar_type.__name__}."
with pytest.raises(ValueError, match=msg):
arr._unbox_scalar("foo")

Expand Down Expand Up @@ -614,11 +617,21 @@ def test_median(self, arr1d):
result = arr2.median(axis=1, skipna=False)
tm.assert_equal(result, arr)

def test_from_integer_array(self):
arr = np.array([1, 2, 3], dtype=np.int64)
expected = self.array_cls(arr, dtype=self.example_dtype)

data = pd.array(arr, dtype="Int64")
result = self.array_cls(data, dtype=self.example_dtype)

tm.assert_extension_array_equal(result, expected)


class TestDatetimeArray(SharedTests):
index_cls = DatetimeIndex
array_cls = DatetimeArray
dtype = Timestamp
scalar_type = Timestamp
example_dtype = "M8[ns]"

@pytest.fixture
def arr1d(self, tz_naive_fixture, freqstr):
Expand Down Expand Up @@ -918,7 +931,8 @@ def test_strftime_nat(self):
class TestTimedeltaArray(SharedTests):
index_cls = TimedeltaIndex
array_cls = TimedeltaArray
dtype = pd.Timedelta
scalar_type = pd.Timedelta
example_dtype = "m8[ns]"

def test_from_tdi(self):
tdi = TimedeltaIndex(["1 Day", "3 Hours"])
Expand Down Expand Up @@ -1037,7 +1051,8 @@ def test_take_fill_valid(self, timedelta_index):
class TestPeriodArray(SharedTests):
index_cls = PeriodIndex
array_cls = PeriodArray
dtype = Period
scalar_type = Period
example_dtype = PeriodIndex([], freq="W").dtype

@pytest.fixture
def arr1d(self, period_index):
Expand Down Expand Up @@ -1305,3 +1320,100 @@ def test_period_index_construction_from_strings(klass):
result = PeriodIndex(data, freq="Q")
expected = PeriodIndex([Period(s) for s in strings])
tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
def test_from_pandas_array(dtype):
    """
    Check that a PandasArray wrapping datetime64/timedelta64 values gives the
    same result as the bare ndarray through every construction path exercised
    below: the array constructor, ``_from_sequence``, the ``sequence_to_*``
    helper, the top-level ``to_datetime``/``to_timedelta`` and the Index
    constructor.

    GH#24615
    """
    raw = np.array([1, 2, 3], dtype=dtype)
    wrapped = PandasArray(raw)

    # One row per dtype: array class, low-level converter, top-level
    # converter, Index class.
    array_cls, to_ns, to_func, index_cls = {
        "M8[ns]": (
            DatetimeArray,
            sequence_to_dt64ns,
            pd.to_datetime,
            DatetimeIndex,
        ),
        "m8[ns]": (
            TimedeltaArray,
            sequence_to_td64ns,
            pd.to_timedelta,
            TimedeltaIndex,
        ),
    }[dtype]

    # In each comparison the first argument is built from the PandasArray
    # (the result) and the second from the plain ndarray (the expectation).
    tm.assert_extension_array_equal(array_cls(wrapped), array_cls(raw))

    tm.assert_extension_array_equal(
        array_cls._from_sequence(wrapped),
        array_cls._from_sequence(raw),
    )

    # The sequence_to_* helpers return a tuple; only the first element
    # is compared here.
    tm.assert_equal(to_ns(wrapped)[0], to_ns(raw)[0])

    tm.assert_equal(to_func(wrapped).array, to_func(raw).array)

    # Let's check the Indexes while we're here
    tm.assert_index_equal(index_cls(wrapped), index_cls(raw))


@pytest.fixture(
    params=[
        "memoryview",
        "array",
        pytest.param("dask", marks=td.skip_if_no("dask.array")),
        pytest.param("xarray", marks=td.skip_if_no("xarray")),
    ]
)
def array_likes(request):
    """
    Fixture yielding ``(arr, data)`` pairs where ``arr`` is an int64 ndarray
    and ``data`` is a non-pandas, non-ndarray container built from it.

    GH#24539 recognize e.g xarray, dask, ...
    """

    def _stdlib_array(values):
        # stdlib array
        from array import array

        return array("i", values)

    def _dask_array(values):
        # Imported lazily: dask is optional and this param is skipped
        # when it is not installed.
        import dask.array

        return dask.array.array(values)

    def _xarray_dataarray(values):
        # Imported lazily: xarray is optional and this param is skipped
        # when it is not installed.
        import xarray as xr

        return xr.DataArray(values)

    builders = {
        "memoryview": memoryview,
        "array": _stdlib_array,
        "dask": _dask_array,
        "xarray": _xarray_dataarray,
    }

    arr = np.array([1, 2, 3], dtype=np.int64)
    return arr, builders[request.param](arr)


@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
def test_from_obscure_array(dtype, array_likes):
    """
    Check that array-likes which are neither ndarrays nor pandas objects
    (see the ``array_likes`` fixture) are converted the same way as the
    equivalent ndarray.

    Throughout, ``expected`` is built from the reference ndarray ``arr`` and
    ``result`` from the array-like ``data`` under test, so that a failure
    report labels the two sides correctly.

    GH#24539 recognize e.g xarray, dask, ...
    """
    # Note: we don't do this for PeriodArray bc _from_sequence won't accept
    #  an array of integers
    # TODO: could check with arraylike of Period objects
    arr, data = array_likes

    cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype]

    expected = cls(arr)
    result = cls._from_sequence(data)
    tm.assert_extension_array_equal(result, expected)

    # The sequence_to_* helpers return a tuple; only the first element
    # is compared here.
    func = {"M8[ns]": sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype]
    result = func(data)[0]
    expected = func(arr)[0]
    tm.assert_equal(result, expected)

    # FIXME: dask and memoryview both break on these
    # func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype]
    # result = func(data).array
    # expected = func(arr).array
    # tm.assert_equal(result, expected)

    # Let's check the Indexes while we're here
    idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype]
    result = idx_cls(data)
    expected = idx_cls(arr)
    tm.assert_index_equal(result, expected)