Skip to content

Parse IntervalArray and IntervalIndex from strings #41451

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
00c7e52
Added IntervalArray.from_strings() and IntervalIndex.from_strings() t…
erikmannerfelt May 13, 2021
d2625a5
Ran pre-commit program on modified files.
erikmannerfelt May 13, 2021
88f4f69
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt May 22, 2021
27ae2bf
Documented exception to mypy linting and fixed doctest.
erikmannerfelt May 22, 2021
e6e072a
Replaced str.find() calls with more robust regex matching
erikmannerfelt Jun 1, 2021
6c1b871
Added parsing for int64 and improved tests.
erikmannerfelt Jun 1, 2021
5fd1526
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 1, 2021
d646802
Moved error test to separate test function.
erikmannerfelt Jun 1, 2021
1355a17
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 7, 2021
ec8a4cb
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 8, 2021
ab76e68
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Jun 9, 2021
162045a
* Simplified pandas imports
erikmannerfelt Jun 9, 2021
e702840
Merge branch 'master' into intervalindex_from_string
erikmannerfelt Jun 28, 2021
f8ab67c
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Aug 19, 2021
0137f3e
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Sep 15, 2021
a8cc3b2
Added support for all closed types. Improved tests.
erikmannerfelt Sep 16, 2021
cb2ec12
Merge branch 'intervalindex_from_string' of github.com:erikmannerfelt…
erikmannerfelt Sep 16, 2021
27a5a2e
rerun CI
erikmannerfelt Sep 16, 2021
40af75d
Renamed IntervalArray method and removed nested try/except clauses.
erikmannerfelt Sep 17, 2021
cffc164
Merge branch 'master' of github.com:pandas-dev/pandas into intervalin…
erikmannerfelt Oct 4, 2021
8b29251
Merge branch 'pandas-dev:main' into intervalindex_from_string
erikmannerfelt Jan 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
le,
lt,
)
import re
import textwrap
from typing import (
Sequence,
Expand Down Expand Up @@ -574,6 +575,129 @@ def from_tuples(

return cls.from_arrays(left, right, closed, copy=False, dtype=dtype)

# Shared numpydoc template for the ``from_strings`` constructors. It is filled
# in with ``%(klass)s`` and ``%(examples)s`` by each class that uses it.
_interval_shared_docs["from_strings"] = textwrap.dedent(
    """
    Construct an %(klass)s from string representations of intervals.

    Parameters
    ----------
    data : array-like (1-dimensional)
        Strings representing the Intervals to parse, e.g. ``'(0, 1]'``.
        The left and right bounds must be separated by a comma followed
        by a space.
    closed : {'left', 'right', 'both', 'neither'}, default 'right'
        Whether the intervals are closed on the left-side, right-side, both
        or neither. The brackets of every string must match this value.
    dtype : dtype, optional
        If None, dtype will be inferred.

    Returns
    -------
    %(klass)s

    Raises
    ------
    ValueError
        When a string cannot be parsed as an Interval, or when its bounds
        cannot be parsed as numeric, Timestamp or Timedelta values.

    See Also
    --------
    interval_range : Function to create a fixed frequency IntervalIndex.
    %(klass)s.from_breaks : Construct an %(klass)s from an array of
        splits.
    %(klass)s.from_tuples : Construct an %(klass)s from an
        array-like of tuples.

    %(examples)s\
    """
)

@classmethod
@Appender(
    _interval_shared_docs["from_strings"]
    % {
        "klass": "IntervalArray",
        "examples": textwrap.dedent(
            """\
    Examples
    --------
    >>> pd.arrays.IntervalArray.from_strings(["(0, 1]", "(1, 2]"])
    <IntervalArray>
    [(0, 1], (1, 2]]
    Length: 2, dtype: interval[int64, right]
    """
        ),
    }
)
def from_strings(
    cls: type[IntervalArrayT],
    data: Sequence[str],
    closed: str = "right",
    dtype: Dtype | None = None,
) -> IntervalArrayT:
    # The user-facing docstring is attached by the ``Appender`` decorator
    # above, so this body is documented with comments only.
    #
    # NOTE(review): a reviewer asked whether this should be the private
    # ``_from_sequence_of_strings`` instead of public API; the author reported
    # that renaming it made pandas/tests/extension/test_interval.py fail.
    # Confirm the intended name and visibility before merging.
    from pandas import (
        to_datetime,
        to_numeric,
        to_timedelta,
    )

    # Each ``closed`` option is written with its own pair of brackets.
    brackets = {
        "right": ("(", "]"),
        "left": ("[", ")"),
        "both": ("[", "]"),
        "neither": ("(", ")"),
    }
    if closed not in brackets:
        # Fail with a descriptive ValueError instead of a bare KeyError.
        raise ValueError(f"closed must be one of {list(brackets)}, got {closed!r}")
    opening, closing = brackets[closed]

    # The whole string must be "<opening>left,right<closing>"; the text between
    # the brackets is captured so that it can be split into the two breaks.
    pattern = re.compile(re.escape(opening) + "(.*,.*)" + re.escape(closing))

    # Parsers are tried in this order; the first that accepts both breaks wins.
    parsers = (to_numeric, to_datetime, to_timedelta)

    left, right = [], []
    for string in data:
        # ``fullmatch`` rejects trailing text after the closing bracket, which
        # ``match`` would let through.
        breaks_match = pattern.fullmatch(string)
        if breaks_match is None:
            raise ValueError(
                f"Could not find opening '{opening}' "
                f"and closing '{closing}' "
                f"brackets in string: '{string}'"
            )

        # Split 'left' and 'right' on a comma followed by a space.
        breaks = breaks_match.group(1).split(", ", 1)
        if len(breaks) != 2:
            raise ValueError(
                f"Delimiter ', ' (comma + space) not found in string: {string}"
            )

        last_error = None
        for parser in parsers:
            try:
                newleft, newright = parser(breaks, errors="raise")
            except ValueError as err:
                last_error = err
            else:
                break
        else:
            # None of the parsers accepted the breaks.
            raise ValueError(
                "Could not parse string as numeric, Timedelta "
                f"or Timestamp Interval: {string}"
            ) from last_error

        left.append(newleft)
        right.append(newright)

    # If dtype was not an IntervalDtype, wrap it as the subtype of one.
    if dtype is not None and not isinstance(dtype, IntervalDtype):
        dtype = IntervalDtype(subtype=dtype, closed=closed)

    return cls.from_arrays(left, right, closed=closed, copy=False, dtype=dtype)

def _validate(self):
"""
Verify that the IntervalArray is valid.
Expand Down
29 changes: 29 additions & 0 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import (
Any,
Hashable,
Sequence,
)

import numpy as np
Expand Down Expand Up @@ -317,6 +318,34 @@ def from_tuples(
arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype)
return cls._simple_new(arr, name=name)

@classmethod
@Appender(
    _interval_shared_docs["from_strings"]
    % {
        "klass": "IntervalIndex",
        "examples": textwrap.dedent(
            """\
    Examples
    --------
    >>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"])
    IntervalIndex([(0, 1], (1, 2]],
    dtype='interval[int64, right]')
    """
        ),
    }
)
def from_strings(
    cls,
    data: Sequence[str],
    closed: str = "right",
    dtype: Dtype | None = None,
    name: Hashable = None,
) -> IntervalIndex:
    # The user-facing docstring is attached by the ``Appender`` decorator
    # above, so this body is documented with comments only.
    #
    # NOTE(review): a reviewer questioned adding this as public API; confirm
    # the intended visibility before merging.
    #
    # Parsing is delegated to IntervalArray; errors raised there are reworded
    # to mention this class's name instead of "IntervalArray".
    with rewrite_exception("IntervalArray", cls.__name__):
        parsed = IntervalArray.from_strings(data, closed=closed, dtype=dtype)

    # Wrap the parsed array in an index carrying the requested name.
    return cls._simple_new(parsed, name=name)

# --------------------------------------------------------------------

@cache_readonly
Expand Down
65 changes: 65 additions & 0 deletions pandas/tests/indexes/interval/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -927,6 +927,71 @@ def test_pickle_round_trip_closed(self, closed):
tm.assert_index_equal(result, idx)


@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"])
@pytest.mark.parametrize(
    "test_case",
    [
        (
            "float64",
            [0.0, 0.5, 1.0],
            ["0.0, 0.5", "0.5, 1.0"],
        ),
        ("int64", [0, 5, 10], ["0, 5", "5, 10"]),
        (
            "datetime64[ns]",
            [Timestamp(2015, 7, 1), Timestamp(2016, 8, 1), Timestamp(2018, 9, 1)],
            ["2015-07-01, 2016-08-01", "2016-08-01, 2018-09-01"],
        ),
    ],
)
def test_from_strings(closed, test_case):
    """Check IntervalIndex.from_strings with inferred and explicit dtypes."""
    # See https://github.com/pandas-dev/pandas/pull/41451
    subtype, breaks, bodies = test_case

    # Pick the bracket pair that spells the ``closed`` option under test.
    opening, closing = {
        "right": ("(", "]"),
        "left": ("[", ")"),
        "both": ("[", "]"),
        "neither": ("(", ")"),
    }[closed]
    interval_strings = [opening + body + closing for body in bodies]

    # The same breaks, built without string parsing, are the reference result.
    reference = IntervalIndex.from_breaks(breaks, closed=closed)

    # First let the dtype be inferred from the strings.
    inferred = IntervalIndex.from_strings(interval_strings, closed=closed)
    tm.assert_index_equal(inferred, reference, exact=True)

    # Then request the subtype explicitly and expect the same index.
    explicit = IntervalIndex.from_strings(
        interval_strings, dtype=np.dtype(subtype), closed=closed
    )
    tm.assert_index_equal(explicit, reference, exact=True)


@pytest.mark.parametrize(
    "wrong_indices",
    [
        ("('hello', 'there']", r"Could not parse string as numeric"),
        ("(0.1,0.1)", r"Could not find opening '\(' and closing ']'"),
        ("(0.0,0.5]", r"Delimiter ', ' .* not found"),
    ],
)
def test_from_strings_errors(wrong_indices):
    """Check that malformed strings raise ValueError with the expected message."""
    # See https://github.com/pandas-dev/pandas/pull/41451
    malformed, message_pattern = wrong_indices

    # ``closed`` is left at its default for every case.
    with pytest.raises(ValueError, match=message_pattern):
        IntervalIndex.from_strings([malformed])


def test_dir():
# GH#27571 dir(interval_index) should not raise
index = IntervalIndex.from_arrays([0, 1], [1, 2])
Expand Down