Skip to content

ENH: make closed part of IntervalDtype #37933

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -694,8 +694,8 @@ def float_frame():
# ----------------------------------------------------------------
@pytest.fixture(
params=[
(Interval(left=0, right=5), IntervalDtype("int64")),
(Interval(left=0.1, right=0.5), IntervalDtype("float64")),
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
(Interval(left=0.1, right=0.5), IntervalDtype("float64", "right")),
(Period("2012-01", freq="M"), "period[M]"),
(Period("2012-02-01", freq="D"), "period[D]"),
(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/_arrow_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def __hash__(self):
def to_pandas_dtype(self):
import pandas as pd

return pd.IntervalDtype(self.subtype.to_pandas_dtype())
return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed)

# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
Expand Down
26 changes: 14 additions & 12 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@
>>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)])
<IntervalArray>
[(0, 1], (1, 5]]
Length: 2, closed: right, dtype: interval[int64]
Length: 2, closed: right, dtype: interval[int64, right]

It may also be constructed using one of the constructor
methods: :meth:`IntervalArray.from_arrays`,
Expand Down Expand Up @@ -222,10 +222,12 @@ def __new__(
def _simple_new(cls, data, closed="right"):
result = IntervalMixin.__new__(cls)

dtype = IntervalDtype(data.dtype, closed=closed)
result._dtype = dtype

result._combined = data
result._left = data[:, 0]
result._right = data[:, 1]
result._closed = closed
return result

@classmethod
Expand Down Expand Up @@ -283,7 +285,7 @@ def _from_factorized(cls, values, original):
>>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])
<IntervalArray>
[(0, 1], (1, 2], (2, 3]]
Length: 3, closed: right, dtype: interval[int64]
Length: 3, closed: right, dtype: interval[int64, right]
"""
),
)
Expand Down Expand Up @@ -352,7 +354,7 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
>>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3])
<IntervalArray>
[(0, 1], (1, 2], (2, 3]]
Length: 3, closed: right, dtype: interval[int64]
Length: 3, closed: right, dtype: interval[int64, right]
"""
),
)
Expand Down Expand Up @@ -415,7 +417,7 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
>>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)])
<IntervalArray>
[(0, 1], (1, 2]]
Length: 2, closed: right, dtype: interval[int64]
Length: 2, closed: right, dtype: interval[int64, right]
"""
),
)
Expand Down Expand Up @@ -480,7 +482,7 @@ def _validate(self):

@property
def dtype(self):
return IntervalDtype(self.left.dtype)
return self._dtype

@property
def nbytes(self) -> int:
Expand Down Expand Up @@ -1106,7 +1108,7 @@ def mid(self):
>>> intervals
<IntervalArray>
[(0, 1], (1, 3], (2, 4]]
Length: 3, closed: right, dtype: interval[int64]
Length: 3, closed: right, dtype: interval[int64, right]
"""
),
)
Expand Down Expand Up @@ -1135,7 +1137,7 @@ def closed(self):
Whether the intervals are closed on the left-side, right-side, both or
neither.
"""
return self._closed
return self.dtype.closed

_interval_shared_docs["set_closed"] = textwrap.dedent(
"""
Expand Down Expand Up @@ -1170,11 +1172,11 @@ def closed(self):
>>> index
<IntervalArray>
[(0, 1], (1, 2], (2, 3]]
Length: 3, closed: right, dtype: interval[int64]
Length: 3, closed: right, dtype: interval[int64, right]
>>> index.set_closed('both')
<IntervalArray>
[[0, 1], [1, 2], [2, 3]]
Length: 3, closed: both, dtype: interval[int64]
Length: 3, closed: both, dtype: interval[int64, both]
"""
),
)
Expand Down Expand Up @@ -1230,7 +1232,7 @@ def __array__(self, dtype=None) -> np.ndarray:
left = self._left
right = self._right
mask = self.isna()
closed = self._closed
closed = self.closed

result = np.empty(len(left), dtype=object)
for i in range(len(left)):
Expand Down Expand Up @@ -1369,7 +1371,7 @@ def repeat(self, repeats, axis=None):
>>> intervals
<IntervalArray>
[(0, 1], (1, 3], (2, 4]]
Length: 3, closed: right, dtype: interval[int64]
Length: 3, closed: right, dtype: interval[int64, right]
"""
),
)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj,
dtype = PeriodDtype(freq=val.freq)
elif lib.is_interval(val):
subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
dtype = IntervalDtype(subtype=subtype)
dtype = IntervalDtype(subtype=subtype, closed=val.closed)

return dtype, val

Expand Down
36 changes: 28 additions & 8 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -998,8 +998,8 @@ class IntervalDtype(PandasExtensionDtype):

Examples
--------
>>> pd.IntervalDtype(subtype='int64')
interval[int64]
>>> pd.IntervalDtype(subtype='int64', closed='both')
interval[int64, both]
"""

name = "interval"
Expand All @@ -1008,27 +1008,37 @@ class IntervalDtype(PandasExtensionDtype):
base = np.dtype("O")
num = 103
_metadata = ("subtype",)
_match = re.compile(r"(I|i)nterval\[(?P<subtype>.+)\]")
_match = re.compile(
r"(I|i)nterval\[(?P<subtype>[^,]+)(, (?P<closed>(right|left|both|neither)))?\]"
)
_cache: Dict[str_type, PandasExtensionDtype] = {}

def __new__(cls, subtype=None):
def __new__(cls, subtype=None, closed: Optional[str_type] = None):
from pandas.core.dtypes.common import is_string_dtype, pandas_dtype

if isinstance(subtype, IntervalDtype):
if closed is not None and closed != subtype.closed:
raise ValueError(
"dtype.closed and 'closed' do not match. "
"Try IntervalDtype(dtype.subtype, closed) instead."
)
return subtype
elif subtype is None:
# we are called as an empty constructor
# generally for pickle compat
u = object.__new__(cls)
u._subtype = None
u._closed = closed
return u
elif isinstance(subtype, str) and subtype.lower() == "interval":
subtype = None
else:
if isinstance(subtype, str):
m = cls._match.search(subtype)
if m is not None:
subtype = m.group("subtype")
gd = m.groupdict()
subtype = gd["subtype"]
closed = gd.get("closed", closed)

try:
subtype = pandas_dtype(subtype)
Expand All @@ -1043,14 +1053,20 @@ def __new__(cls, subtype=None):
)
raise TypeError(msg)

key = str(subtype) + str(closed)
try:
return cls._cache[str(subtype)]
return cls._cache[key]
except KeyError:
u = object.__new__(cls)
u._subtype = subtype
cls._cache[str(subtype)] = u
u._closed = closed
cls._cache[key] = u
return u

# Read-only accessor for the ``closed`` value stored on this dtype instance.
# ``_closed`` is assigned in ``__new__`` (from the ``closed`` argument or from
# the parsed dtype string) and in ``__setstate__``; it may be None when no
# value was supplied. No validation of the value is visible in this excerpt.
@property
def closed(self):
return self._closed

@property
def subtype(self):
"""
Expand Down Expand Up @@ -1100,7 +1116,7 @@ def type(self):
def __str__(self) -> str_type:
if self.subtype is None:
return "interval"
return f"interval[{self.subtype}]"
return f"interval[{self.subtype}, {self.closed}]"

def __hash__(self) -> int:
# make myself hashable
Expand All @@ -1114,6 +1130,8 @@ def __eq__(self, other: Any) -> bool:
elif self.subtype is None or other.subtype is None:
# None should match any subtype
return True
elif self.closed != other.closed:
return False
else:
from pandas.core.dtypes.common import is_dtype_equal

Expand All @@ -1124,6 +1142,8 @@ def __setstate__(self, state):
# PandasExtensionDtype superclass and uses the public properties to
# pickle -> need to set the settable private ones here (see GH26067)
self._subtype = state["subtype"]
# backward-compat older pickles won't have "closed" key
self._closed = state.pop("closed", None)

@classmethod
def is_dtype(cls, dtype: object) -> bool:
Expand Down
26 changes: 13 additions & 13 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def wrapped(self, other, sort=False):
>>> pd.interval_range(start=0, end=5)
IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
closed='right',
dtype='interval[int64]')
dtype='interval[int64, right]')

It may also be constructed using one of the constructor
methods: :meth:`IntervalIndex.from_arrays`,
Expand Down Expand Up @@ -248,7 +248,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None):
>>> pd.IntervalIndex.from_breaks([0, 1, 2, 3])
IntervalIndex([(0, 1], (1, 2], (2, 3]],
closed='right',
dtype='interval[int64]')
dtype='interval[int64, right]')
"""
),
)
Expand All @@ -274,7 +274,7 @@ def from_breaks(
>>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
IntervalIndex([(0, 1], (1, 2], (2, 3]],
closed='right',
dtype='interval[int64]')
dtype='interval[int64, right]')
"""
),
)
Expand Down Expand Up @@ -306,7 +306,7 @@ def from_arrays(
>>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)])
IntervalIndex([(0, 1], (1, 2]],
closed='right',
dtype='interval[int64]')
dtype='interval[int64, right]')
"""
),
)
Expand Down Expand Up @@ -448,7 +448,7 @@ def is_overlapping(self) -> bool:
>>> index
IntervalIndex([(0, 2], (1, 3], (4, 5]],
closed='right',
dtype='interval[int64]')
dtype='interval[int64, right]')
>>> index.is_overlapping
True

Expand All @@ -458,7 +458,7 @@ def is_overlapping(self) -> bool:
>>> index
IntervalIndex([[0, 1], [1, 2], [2, 3]],
closed='both',
dtype='interval[int64]')
dtype='interval[int64, both]')
>>> index.is_overlapping
True

Expand All @@ -468,7 +468,7 @@ def is_overlapping(self) -> bool:
>>> index
IntervalIndex([[0, 1), [1, 2), [2, 3)],
closed='left',
dtype='interval[int64]')
dtype='interval[int64, left]')
>>> index.is_overlapping
False
"""
Expand Down Expand Up @@ -1134,23 +1134,23 @@ def interval_range(

>>> pd.interval_range(start=0, end=5)
IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]],
closed='right', dtype='interval[int64]')
closed='right', dtype='interval[int64, right]')

Additionally, datetime-like input is also supported.

>>> pd.interval_range(start=pd.Timestamp('2017-01-01'),
... end=pd.Timestamp('2017-01-04'))
IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03],
(2017-01-03, 2017-01-04]],
closed='right', dtype='interval[datetime64[ns]]')
closed='right', dtype='interval[datetime64[ns], right]')

The ``freq`` parameter specifies the frequency between the left and right
endpoints of the individual intervals within the ``IntervalIndex``. For
numeric ``start`` and ``end``, the frequency must also be numeric.

>>> pd.interval_range(start=0, periods=4, freq=1.5)
IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
closed='right', dtype='interval[float64]')
closed='right', dtype='interval[float64, right]')

Similarly, for datetime-like ``start`` and ``end``, the frequency must be
convertible to a DateOffset.
Expand All @@ -1159,22 +1159,22 @@ def interval_range(
... periods=3, freq='MS')
IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01],
(2017-03-01, 2017-04-01]],
closed='right', dtype='interval[datetime64[ns]]')
closed='right', dtype='interval[datetime64[ns], right]')

Specify ``start``, ``end``, and ``periods``; the frequency is generated
automatically (linearly spaced).

>>> pd.interval_range(start=0, end=6, periods=4)
IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]],
closed='right',
dtype='interval[float64]')
dtype='interval[float64, right]')

The ``closed`` parameter specifies which endpoints of the individual
intervals within the ``IntervalIndex`` are closed.

>>> pd.interval_range(end=5, periods=4, closed='both')
IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]],
closed='both', dtype='interval[int64]')
closed='both', dtype='interval[int64, both]')
"""
start = maybe_box_datetimelike(start)
end = maybe_box_datetimelike(end)
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,12 @@ def cut(
>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
... # doctest: +ELLIPSIS
[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

>>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
... # doctest: +ELLIPSIS
([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ...
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
array([0.994, 3. , 5. , 7. ]))

Discovers the same bins, but assign them specific labels. Notice that
Expand Down Expand Up @@ -176,7 +176,7 @@ def cut(
d (7.333, 10.0]
e (7.333, 10.0]
dtype: category
Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ...
Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

Passing a Series as an input returns a Series with mapping value.
It is used to map numerically to intervals based on bins.
Expand Down Expand Up @@ -214,7 +214,7 @@ def cut(
>>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
>>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
[NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]
Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
"""
# NOTE: this binning code is changed a bit from histogram for var(x) == 0

Expand Down Expand Up @@ -336,7 +336,7 @@ def qcut(
>>> pd.qcut(range(5), 4)
... # doctest: +ELLIPSIS
[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ...
Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...

>>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
... # doctest: +SKIP
Expand Down
Loading