Skip to content
forked from pydata/xarray

Commit 798f4d4

Browse files
agoodm, Illviljan, mathause, and dcherian
authored
Update contains_cftime_datetimes to avoid loading entire variable array (pydata#7494)
* Update contains_cftime_datetimes to avoid loading entire variable array
* Update whats-new.rst
* Convert arrays to variable instead for better control
* fix mypy?
* Update common.py
* Update xarray/core/common.py (Co-authored-by: Mathias Hauser <[email protected]>)
* Update common.py
* remove _variable_contains_cftime_datetimes
* Avoid creating variable.
* Add test
* minimize diff
* Update tests.
* address comment
* Fix test
* Fix whats-new
* Fix more tests
* More fixes
* fix iris tests

---------

Co-authored-by: Illviljan <[email protected]>
Co-authored-by: Mathias Hauser <[email protected]>
Co-authored-by: dcherian <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
1 parent 830ee6d commit 798f4d4

12 files changed

+63
-37
lines changed

doc/whats-new.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ Bug fixes
4444
- Fix matplotlib raising a UserWarning when plotting a scatter plot
4545
with an unfilled marker (:issue:`7313`, :pull:`7318`).
4646
By `Jimmy Westling <https://github.com/illviljan>`_.
47+
- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`).
48+
By `Alex Goodman <https://github.com/agoodm>`_ and `Deepak Cherian <https://github.com/dcherian>`_.
4749

4850
Documentation
4951
~~~~~~~~~~~~~

xarray/coding/calendar_ops.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def convert_calendar(
147147
from xarray.core.dataarray import DataArray
148148

149149
time = obj[dim]
150-
if not _contains_datetime_like_objects(time):
150+
if not _contains_datetime_like_objects(time.variable):
151151
raise ValueError(f"Coordinate {dim} must contain datetime objects.")
152152

153153
use_cftime = _should_cftime_be_used(time, calendar, use_cftime)
@@ -319,8 +319,8 @@ def interp_calendar(source, target, dim="time"):
319319
target = DataArray(target, dims=(dim,), name=dim)
320320

321321
if not _contains_datetime_like_objects(
322-
source[dim]
323-
) or not _contains_datetime_like_objects(target):
322+
source[dim].variable
323+
) or not _contains_datetime_like_objects(target.variable):
324324
raise ValueError(
325325
f"Both 'source.{dim}' and 'target' must contain datetime objects."
326326
)

xarray/coding/cftime_offsets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1267,7 +1267,7 @@ def date_range_like(source, calendar, use_cftime=None):
12671267
if not isinstance(source, (pd.DatetimeIndex, CFTimeIndex)) and (
12681268
isinstance(source, DataArray)
12691269
and (source.ndim != 1)
1270-
or not _contains_datetime_like_objects(source)
1270+
or not _contains_datetime_like_objects(source.variable)
12711271
):
12721272
raise ValueError(
12731273
"'source' must be a 1D array of datetime objects for inferring its range."

xarray/coding/frequencies.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,12 @@ def infer_freq(index):
7979
If there are fewer than three values or the index is not 1D.
8080
"""
8181
from xarray.core.dataarray import DataArray
82+
from xarray.core.variable import Variable
8283

8384
if isinstance(index, (DataArray, pd.Series)):
8485
if index.ndim != 1:
8586
raise ValueError("'index' must be 1D")
86-
elif not _contains_datetime_like_objects(DataArray(index)):
87+
elif not _contains_datetime_like_objects(Variable("dim", index)):
8788
raise ValueError("'index' must contain datetime-like objects")
8889
dtype = np.asarray(index).dtype
8990
if dtype == "datetime64[ns]":

xarray/core/accessor_dt.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@ def __new__(cls, obj: T_DataArray) -> CombinedDatetimelikeAccessor:
574574
# we need to choose which parent (datetime or timedelta) is
575575
# appropriate. Since we're checking the dtypes anyway, we'll just
576576
# do all the validation here.
577-
if not _contains_datetime_like_objects(obj):
577+
if not _contains_datetime_like_objects(obj.variable):
578578
raise TypeError(
579579
"'.dt' accessor only available for "
580580
"DataArray with datetime64 timedelta64 dtype or "

xarray/core/common.py

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import pandas as pd
1212

1313
from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops
14+
from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed
1415
from xarray.core.options import OPTIONS, _get_keep_attrs
1516
from xarray.core.pycompat import is_duck_dask_array
1617
from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar
@@ -40,6 +41,7 @@
4041
ScalarOrArray,
4142
SideOptions,
4243
T_DataWithCoords,
44+
T_Variable,
4345
)
4446
from xarray.core.variable import Variable
4547

@@ -1770,31 +1772,27 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool:
17701772
return np.issubdtype(dtype, np.timedelta64)
17711773

17721774

1773-
def _contains_cftime_datetimes(array) -> bool:
1774-
"""Check if an array contains cftime.datetime objects"""
1775+
def _contains_cftime_datetimes(array: Any) -> bool:
1776+
"""Check if an array inside a Variable contains cftime.datetime objects"""
17751777
if cftime is None:
17761778
return False
1777-
else:
1778-
if array.dtype == np.dtype("O") and array.size > 0:
1779-
sample = np.asarray(array).flat[0]
1780-
if is_duck_dask_array(sample):
1781-
sample = sample.compute()
1782-
if isinstance(sample, np.ndarray):
1783-
sample = sample.item()
1784-
return isinstance(sample, cftime.datetime)
1785-
else:
1786-
return False
17871779

1780+
if array.dtype == np.dtype("O") and array.size > 0:
1781+
first_idx = (0,) * array.ndim
1782+
if isinstance(array, ExplicitlyIndexed):
1783+
first_idx = BasicIndexer(first_idx)
1784+
sample = array[first_idx]
1785+
return isinstance(np.asarray(sample).item(), cftime.datetime)
1786+
1787+
return False
17881788

1789-
def contains_cftime_datetimes(var) -> bool:
1789+
1790+
def contains_cftime_datetimes(var: T_Variable) -> bool:
17901791
"""Check if an xarray.Variable contains cftime.datetime objects"""
1791-
if var.dtype == np.dtype("O") and var.size > 0:
1792-
return _contains_cftime_datetimes(var.data)
1793-
else:
1794-
return False
1792+
return _contains_cftime_datetimes(var._data)
17951793

17961794

1797-
def _contains_datetime_like_objects(var) -> bool:
1795+
def _contains_datetime_like_objects(var: T_Variable) -> bool:
17981796
"""Check if a variable contains datetime like objects (either
17991797
np.datetime64, np.timedelta64, or cftime.datetime)
18001798
"""

xarray/tests/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,18 @@ def __init__(self, array):
143143
self.array = array
144144

145145
def __getitem__(self, key):
146-
raise UnexpectedDataAccess("Tried accessing data")
146+
raise UnexpectedDataAccess("Tried accessing data.")
147+
148+
def __array__(self):
149+
raise UnexpectedDataAccess("Tried accessing data.")
150+
151+
152+
class FirstElementAccessibleArray(InaccessibleArray):
153+
def __getitem__(self, key):
154+
tuple_idxr = key.tuple
155+
if len(tuple_idxr) > 1:
156+
raise UnexpectedDataAccess("Tried accessing more than one element.")
157+
return self.array[tuple_idxr]
147158

148159

149160
class ReturnItem:

xarray/tests/test_accessor_dt.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,6 @@ def test_calendar_cftime(data) -> None:
418418
assert data.time.dt.calendar == expected
419419

420420

421-
@requires_cftime
422421
def test_calendar_datetime64_2d() -> None:
423422
data = xr.DataArray(np.zeros((4, 5), dtype="datetime64[ns]"), dims=("x", "y"))
424423
assert data.dt.calendar == "proleptic_gregorian"

xarray/tests/test_backends.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2560,7 +2560,7 @@ def test_open_zarr_use_cftime(self) -> None:
25602560
ds_a = xr.open_zarr(store_target, **self.version_kwargs)
25612561
assert_identical(ds, ds_a)
25622562
ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs)
2563-
assert xr.coding.times.contains_cftime_datetimes(ds_b.time)
2563+
assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable)
25642564

25652565
def test_write_read_select_write(self) -> None:
25662566
# Test for https://github.com/pydata/xarray/issues/4084

xarray/tests/test_coding.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,13 @@ def test_CFMaskCoder_missing_value() -> None:
6565
expected.attrs["missing_value"] = -9999
6666

6767
decoded = xr.decode_cf(expected.to_dataset())
68-
encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs)
68+
encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs)
6969

7070
assert_equal(encoded["tmpk"], expected.variable)
7171

7272
decoded.tmpk.encoding["_FillValue"] = -9940
7373
with pytest.raises(ValueError):
74-
encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs)
74+
encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs)
7575

7676

7777
@requires_dask

xarray/tests/test_coding_times.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from xarray.core.common import contains_cftime_datetimes
3232
from xarray.testing import assert_equal, assert_identical
3333
from xarray.tests import (
34+
FirstElementAccessibleArray,
3435
arm_xfail,
3536
assert_array_equal,
3637
assert_no_warnings,
@@ -787,35 +788,35 @@ def times_3d(times):
787788

788789
@requires_cftime
789790
def test_contains_cftime_datetimes_1d(data) -> None:
790-
assert contains_cftime_datetimes(data.time)
791+
assert contains_cftime_datetimes(data.time.variable)
791792

792793

793794
@requires_cftime
794795
@requires_dask
795796
def test_contains_cftime_datetimes_dask_1d(data) -> None:
796-
assert contains_cftime_datetimes(data.time.chunk())
797+
assert contains_cftime_datetimes(data.time.variable.chunk())
797798

798799

799800
@requires_cftime
800801
def test_contains_cftime_datetimes_3d(times_3d) -> None:
801-
assert contains_cftime_datetimes(times_3d)
802+
assert contains_cftime_datetimes(times_3d.variable)
802803

803804

804805
@requires_cftime
805806
@requires_dask
806807
def test_contains_cftime_datetimes_dask_3d(times_3d) -> None:
807-
assert contains_cftime_datetimes(times_3d.chunk())
808+
assert contains_cftime_datetimes(times_3d.variable.chunk())
808809

809810

810811
@pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])])
811812
def test_contains_cftime_datetimes_non_cftimes(non_cftime_data) -> None:
812-
assert not contains_cftime_datetimes(non_cftime_data)
813+
assert not contains_cftime_datetimes(non_cftime_data.variable)
813814

814815

815816
@requires_dask
816817
@pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])])
817818
def test_contains_cftime_datetimes_non_cftimes_dask(non_cftime_data) -> None:
818-
assert not contains_cftime_datetimes(non_cftime_data.chunk())
819+
assert not contains_cftime_datetimes(non_cftime_data.variable.chunk())
819820

820821

821822
@requires_cftime
@@ -1176,3 +1177,17 @@ def test_scalar_unit() -> None:
11761177
variable = Variable(("x", "y"), np.array([[0, 1], [2, 3]]), {"units": np.nan})
11771178
result = coding.times.CFDatetimeCoder().decode(variable)
11781179
assert np.isnan(result.attrs["units"])
1180+
1181+
1182+
@requires_cftime
1183+
def test_contains_cftime_lazy() -> None:
1184+
import cftime
1185+
1186+
from xarray.core.common import _contains_cftime_datetimes
1187+
1188+
times = np.array(
1189+
[cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)],
1190+
dtype=object,
1191+
)
1192+
array = FirstElementAccessibleArray(times)
1193+
assert _contains_cftime_datetimes(array)

xarray/tests/test_dataarray.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6219,7 +6219,7 @@ def test_to_and_from_iris(self) -> None:
62196219
original_coord = original.coords[orginal_key]
62206220
assert coord.var_name == original_coord.name
62216221
assert_array_equal(
6222-
coord.points, CFDatetimeCoder().encode(original_coord).values
6222+
coord.points, CFDatetimeCoder().encode(original_coord.variable).values
62236223
)
62246224
assert actual.coord_dims(coord) == original.get_axis_num(
62256225
original.coords[coord.var_name].dims
@@ -6295,7 +6295,7 @@ def test_to_and_from_iris_dask(self) -> None:
62956295
original_coord = original.coords[orginal_key]
62966296
assert coord.var_name == original_coord.name
62976297
assert_array_equal(
6298-
coord.points, CFDatetimeCoder().encode(original_coord).values
6298+
coord.points, CFDatetimeCoder().encode(original_coord.variable).values
62996299
)
63006300
assert actual.coord_dims(coord) == original.get_axis_num(
63016301
original.coords[coord.var_name].dims

0 commit comments

Comments
 (0)