Skip to content

Parse IntervalArray and IntervalIndex from strings #41451

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
00c7e52
Added IntervalArray.from_strings() and IntervalIndex.from_strings() t…
erikmannerfelt May 13, 2021
d2625a5
Ran pre-commit program on modified files.
erikmannerfelt May 13, 2021
88f4f69
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt May 22, 2021
27ae2bf
Documented exception to mypy linting and fixed doctest.
erikmannerfelt May 22, 2021
e6e072a
Replaced str.find() calls with more robust regex matching
erikmannerfelt Jun 1, 2021
6c1b871
Added parsing for int64 and improved tests.
erikmannerfelt Jun 1, 2021
5fd1526
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 1, 2021
d646802
Moved error test to separate test function.
erikmannerfelt Jun 1, 2021
1355a17
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 7, 2021
ec8a4cb
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 8, 2021
ab76e68
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 9, 2021
162045a
* Simplified pandas imports
erikmannerfelt Jun 9, 2021
e702840
Merge branch 'master' into intervalindex_from_string
erikmannerfelt Jun 28, 2021
f8ab67c
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Aug 19, 2021
0137f3e
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Sep 15, 2021
a8cc3b2
Added support for all closed types. Improved tests.
erikmannerfelt Sep 16, 2021
cb2ec12
Merge branch 'intervalindex_from_string' of github.com:erikmannerfelt…
erikmannerfelt Sep 16, 2021
27a5a2e
rerun CI
erikmannerfelt Sep 16, 2021
40af75d
Renamed IntervalArray method and removed nested try/except clauses.
erikmannerfelt Sep 17, 2021
cffc164
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Oct 4, 2021
8b29251
Merge branch 'pandas-dev:main' into intervalindex_from_string
erikmannerfelt Jan 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
le,
lt,
)
import re
import textwrap
from typing import (
Callable,
Sequence,
TypeVar,
cast,
Expand Down Expand Up @@ -562,6 +564,117 @@ def from_tuples(

return cls.from_arrays(left, right, closed, copy=False, dtype=dtype)

# Shared docstring template for the ``from_strings`` constructors.
# ``%(klass)s`` and ``%(examples)s`` are substituted by each class that
# attaches this text to its own ``from_strings`` method.
_interval_shared_docs["from_strings"] = textwrap.dedent(
    """
Construct from string representations of the left and right bounds.

Each string must have the form ``"(left, right]"``, with the two bounds
separated by a comma followed by a space.

Parameters
----------
data : array-like (1-dimensional)
    Strings representing the Intervals to parse.
closed : {'left', 'right', 'both', 'neither'}, default 'right'
    Whether the intervals are closed on the left-side, right-side, both
    or neither.
dtype : dtype, optional
    If None, dtype will be inferred.

Returns
-------
%(klass)s

Raises
------
ValueError
    When a string cannot be parsed as an Interval.
    When the bounds of a string cannot be parsed as either int, float,
    Timestamp or Timedelta.

See Also
--------
interval_range : Function to create a fixed frequency IntervalIndex.
%(klass)s.from_breaks : Construct an %(klass)s from an array of
    splits.
%(klass)s.from_tuples : Construct an %(klass)s from an
    array-like of tuples.

%(examples)s\
"""
)

@classmethod
@Appender(
_interval_shared_docs["from_strings"]
% {
"klass": "IntervalIndex",
"examples": textwrap.dedent(
"""\
Examples
--------
>>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"])
IntervalIndex([(0, 1], (1, 2]],
dtype='interval[int64, right]')
"""
),
}
)
def from_strings(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this public? shouldn't this be _from_sequence_of_strings?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in 40af75d.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh, changing that name took me a little out of my depth... Apparently that name already has another meaning? Tests in pandas/tests/extension/test_interval.py that exercise EA types are now failing (I don't know what that means). What's the best way forward here?

cls: type[IntervalArrayT],
data: Sequence[str],
closed: str = "right",
dtype: Dtype | None = None,
) -> IntervalArrayT:
# These need to be imported here to avoid circular dependencies.
from pandas import (
to_datetime,
to_timedelta,
)

pattern = re.compile(r"\(.*,.*]")

left, right = [], []
for string in data:

# Try to match "(left, right]" where 'left' and 'right' are breaks.
breaks_match = pattern.match(string)

if breaks_match is None:
raise ValueError(
"Could not find opening '(' and closing ']' "
f"brackets in string: '{string}'"
)
# Try to split 'left' and 'right' based on a comma and a space.
breaks = breaks_match.string[1:-1].split(", ", 1)

if len(breaks) != 2:
raise ValueError(
f"Delimiter ', ' (comma + space) not found in string: {string}"
)

conversions: list[Callable] = [int, float, to_datetime, to_timedelta]
# Try to parse the breaks first as ints, then floats, then datetimes, then timedeltas.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I am not a big fan of this. Can we make the dtype strict? Forcing the user to pass e.g. interval[float] would completely solve this issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the problem here is that there IS ambiguity.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The case where this string-parsing functionality is relevant (and the only one I see for now) is reading from text files. The other parts of pandas infer types dynamically, so wouldn't it be best to do this by default here as well?

In 162045a, I added an optional dtype argument. If specified (not None), it will parse string representations first and then rely on IntervalArray.from_arrays() to (potentially) perform the more exact conversion. The IntervalArray.from_arrays() method unfortunately doesn't parse string representations of numeric or datetime-like values, so the IntervalArray.from_strings() method has to implement it in some way:

In [1]: import pandas as pd

In [2]: pd.arrays.IntervalArray.from_arrays(["0", "1", "2"], ["0", "1", "2"])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-2-677073bd0961> in <module>
----> 1 pd.arrays.IntervalArray.from_arrays(["0", "1", "2"], ["0", "1", "2"])

~/Projects/pandas/pandas/core/arrays/interval.py in from_arrays(cls, left, right, closed, copy, dtype)
    479         right = _maybe_convert_platform_interval(right)
    480
--> 481         return cls._simple_new(
    482             left, right, closed, copy=copy, dtype=dtype, verify_integrity=True
    483         )

~/Projects/pandas/pandas/core/arrays/interval.py in _simple_new(cls, left, right, closed, copy, dtype, verify_integrity)
    293                 "for IntervalArray"
    294             )
--> 295             raise TypeError(msg)
    296         elif isinstance(left, ABCPeriodIndex):
    297             msg = "Period dtypes are not supported, use a PeriodIndex instead"

TypeError: category, object, and string subtypes are not supported for IntervalArray

for i, conversion in enumerate(conversions):
# Check if all breaks can be parsed as integers.
if i == 0 and not all(b.isdigit() for b in breaks):
continue
try:
newleft, newright = map(conversion, breaks)
left.append(newleft)
right.append(newright)
break
except ValueError:
continue
else:
raise ValueError(
"Could not parse string as Interval of float, Timedelta "
f"or Timestamp: {string}"
)

# If dtype was not an IntervalDtype, try to parse it as such.
if dtype is not None and not isinstance(dtype, IntervalDtype):
dtype = IntervalDtype(subtype=dtype, closed=closed)

return cls.from_arrays(left, right, closed=closed, copy=False, dtype=dtype)

def _validate(self):
"""
Verify that the IntervalArray is valid.
Expand Down
29 changes: 29 additions & 0 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import (
Any,
Hashable,
Sequence,
)

import numpy as np
Expand Down Expand Up @@ -314,6 +315,34 @@ def from_tuples(
arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype)
return cls._simple_new(arr, name=name)

@classmethod
@Appender(
_interval_shared_docs["from_strings"]
% {
"klass": "IntervalIndex",
"examples": textwrap.dedent(
"""\
Examples
--------
>>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"])
IntervalIndex([(0, 1], (1, 2]],
dtype='interval[int64, right]')
"""
),
}
)
def from_strings(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are you adding public api here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By making it public, do you mean with the @Appender decorator or by not preceding the name with an underscore?

If the latter, how would one otherwise use this function?

cls,
data: Sequence[str],
closed: str = "right",
dtype: Dtype | None = None,
name: Hashable = None,
) -> IntervalIndex:
with rewrite_exception("IntervalArray", cls.__name__):
arr = IntervalArray.from_strings(data=data, dtype=dtype, closed=closed)

return cls._simple_new(arr, name=name)

# --------------------------------------------------------------------

@cache_readonly
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,57 @@ def test_pickle_round_trip_closed(self, closed):
tm.assert_index_equal(result, idx)


@pytest.mark.parametrize(
    "test_case",
    [
        (
            "float64",
            IntervalIndex.from_breaks([0.0, 0.5, 1.0]),
            ["(0.0, 0.5]", "(0.5, 1.0]"],
        ),
        ("int64", IntervalIndex.from_breaks([0, 5, 10]), ["(0, 5]", "(5, 10]"]),
        (
            "datetime64[ns]",
            IntervalIndex.from_breaks(
                [Timestamp(2015, 7, 1), Timestamp(2016, 8, 1), Timestamp(2018, 9, 1)]
            ),
            ["(2015-07-01, 2016-08-01]", "(2016-08-01, 2018-09-01]"],
        ),
    ],
)
def test_from_strings(test_case):
    """IntervalIndex.from_strings agrees with from_breaks for each subtype."""
    # See https://github.com/pandas-dev/pandas/pull/41451
    # Each case is (expected subtype, expected index, strings to parse).
    subtype, expected_index, interval_strings = test_case

    # Without a dtype argument the subtype has to be inferred from the strings.
    inferred = IntervalIndex.from_strings(interval_strings)
    assert np.array_equal(inferred, expected_index)
    assert inferred.left.dtype == subtype

    # Passing the subtype explicitly must give the same values and dtype.
    explicit = IntervalIndex.from_strings(interval_strings, dtype=np.dtype(subtype))
    assert np.array_equal(inferred, explicit)
    assert inferred.dtype == explicit.dtype


@pytest.mark.parametrize(
    "wrong_indices",
    [
        ("('hello', 'there']", r"Could not parse string as Interval"),
        ("(0.1,0.1)", r"Could not find opening '\(' and closing ']'"),
        ("(0.0,0.5]", r"Delimiter ', ' .* not found"),
    ],
)
def test_from_strings_errors(wrong_indices):
    """Malformed interval strings raise ValueError with the expected message."""
    # See https://github.com/pandas-dev/pandas/pull/41451
    # Each case is (malformed string, regex the error message must match).
    bad_string, expected_message = wrong_indices

    with pytest.raises(ValueError, match=expected_message):
        IntervalIndex.from_strings([bad_string])


def test_dir():
# GH#27571 dir(interval_index) should not raise
index = IntervalIndex.from_arrays([0, 1], [1, 2])
Expand Down