-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Parse IntervalArray and IntervalIndex from strings #41451
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
00c7e52
d2625a5
88f4f69
27ae2bf
e6e072a
6c1b871
5fd1526
d646802
1355a17
ec8a4cb
ab76e68
162045a
e702840
f8ab67c
0137f3e
a8cc3b2
cb2ec12
27a5a2e
40af75d
cffc164
8b29251
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,8 +5,10 @@ | |
le, | ||
lt, | ||
) | ||
import re | ||
import textwrap | ||
from typing import ( | ||
Callable, | ||
Sequence, | ||
TypeVar, | ||
cast, | ||
|
@@ -562,6 +564,117 @@ def from_tuples( | |
|
||
return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) | ||
|
||
_interval_shared_docs["from_strings"] = textwrap.dedent( | ||
""" | ||
Construct from string representations of the left and right bounds. | ||
|
||
Parameters | ||
---------- | ||
data : array-like (1-dimensional) | ||
Strings representing the Interval's to parse. | ||
closed : {'left', 'right', 'both', 'neither'}, default 'right' | ||
Whether the intervals are closed on the left-side, right-side, both | ||
or neither. | ||
dtype : dtype, optional | ||
If None, dtype will be inferred. | ||
|
||
Returns | ||
------- | ||
%(klass)s | ||
|
||
Raises | ||
------ | ||
ValueError | ||
When a string cannot be parsed as an Interval | ||
When the dtype of the string cannot be parsed as either float, | ||
Timestamp or Timedelta | ||
|
||
See Also | ||
-------- | ||
interval_range : Function to create a fixed frequency IntervalIndex. | ||
%(klass)s.from_breaks : Construct an %(klass)s from an array of | ||
splits. | ||
%(klass)s.from_tuples : Construct an %(klass)s from an | ||
array-like of tuples. | ||
|
||
%(examples)s\ | ||
""" | ||
) | ||
|
||
@classmethod | ||
@Appender( | ||
_interval_shared_docs["from_strings"] | ||
% { | ||
"klass": "IntervalIndex", | ||
"examples": textwrap.dedent( | ||
"""\ | ||
Examples | ||
-------- | ||
>>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"]) | ||
IntervalIndex([(0, 1], (1, 2]], | ||
dtype='interval[int64, right]') | ||
""" | ||
), | ||
} | ||
) | ||
def from_strings( | ||
cls: type[IntervalArrayT], | ||
data: Sequence[str], | ||
closed: str = "right", | ||
dtype: Dtype | None = None, | ||
) -> IntervalArrayT: | ||
# These need to be imported here to avoid circular dependencies. | ||
erikmannerfelt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
from pandas import ( | ||
erikmannerfelt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
to_datetime, | ||
to_timedelta, | ||
) | ||
|
||
pattern = re.compile(r"\(.*,.*]") | ||
|
||
left, right = [], [] | ||
for string in data: | ||
erikmannerfelt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Try to match "(left, right]" where 'left' and 'right' are breaks. | ||
breaks_match = pattern.match(string) | ||
|
||
if breaks_match is None: | ||
raise ValueError( | ||
"Could not find opening '(' and closing ']' " | ||
f"brackets in string: '{string}'" | ||
) | ||
# Try to split 'left' and 'right' based on a comma and a space. | ||
breaks = breaks_match.string[1:-1].split(", ", 1) | ||
erikmannerfelt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if len(breaks) != 2: | ||
raise ValueError( | ||
f"Delimiter ', ' (comma + space) not found in string: {string}" | ||
) | ||
|
||
conversions: list[Callable] = [int, float, to_datetime, to_timedelta] | ||
erikmannerfelt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Try to parse the breaks first as floats, then datetime, then timedelta. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm i am not a big fan of this, can we make the dtype strict e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the problem here is that there IS ambiguity. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The case that this string parsing functionality is relevant (and the only one I see for now) is reading from text files. The other parts of pandas infer types dynamically, so wouldn't it be best to do this by default here as well? In 162045a, I added an optional In [1]: import pandas as pd
In [2]: pd.arrays.IntervalArray.from_arrays(["0", "1", "2"], ["0", "1", "2"])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-2-677073bd0961> in <module>
----> 1 pd.arrays.IntervalArray.from_arrays(["0", "1", "2"], ["0", "1", "2"])
~/Projects/pandas/pandas/core/arrays/interval.py in from_arrays(cls, left, right, closed, copy, dtype)
479 right = _maybe_convert_platform_interval(right)
480
--> 481 return cls._simple_new(
482 left, right, closed, copy=copy, dtype=dtype, verify_integrity=True
483 )
~/Projects/pandas/pandas/core/arrays/interval.py in _simple_new(cls, left, right, closed, copy, dtype, verify_integrity)
293 "for IntervalArray"
294 )
--> 295 raise TypeError(msg)
296 elif isinstance(left, ABCPeriodIndex):
297 msg = "Period dtypes are not supported, use a PeriodIndex instead"
TypeError: category, object, and string subtypes are not supported for IntervalArray |
||
for i, conversion in enumerate(conversions): | ||
# Check if all breaks can be parsed as integers. | ||
if i == 0 and not all(b.isdigit() for b in breaks): | ||
continue | ||
try: | ||
newleft, newright = map(conversion, breaks) | ||
left.append(newleft) | ||
right.append(newright) | ||
break | ||
except ValueError: | ||
continue | ||
else: | ||
raise ValueError( | ||
"Could not parse string as Interval of float, Timedelta " | ||
f"or Timestamp: {string}" | ||
) | ||
|
||
# If dtype was not an IntervalDtype, try to parse it as such. | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if dtype is not None and not isinstance(dtype, IntervalDtype): | ||
dtype = IntervalDtype(subtype=dtype, closed=closed) | ||
|
||
return cls.from_arrays(left, right, closed=closed, copy=False, dtype=dtype) | ||
|
||
def _validate(self): | ||
""" | ||
Verify that the IntervalArray is valid. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
from typing import ( | ||
Any, | ||
Hashable, | ||
Sequence, | ||
) | ||
|
||
import numpy as np | ||
|
@@ -314,6 +315,34 @@ def from_tuples( | |
arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) | ||
return cls._simple_new(arr, name=name) | ||
|
||
@classmethod | ||
@Appender( | ||
_interval_shared_docs["from_strings"] | ||
% { | ||
"klass": "IntervalIndex", | ||
"examples": textwrap.dedent( | ||
"""\ | ||
Examples | ||
-------- | ||
>>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"]) | ||
IntervalIndex([(0, 1], (1, 2]], | ||
dtype='interval[int64, right]') | ||
""" | ||
), | ||
} | ||
) | ||
def from_strings( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why are you adding public api here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By making it public, do you mean with the If the latter, how would one otherwise use this function? |
||
cls, | ||
data: Sequence[str], | ||
closed: str = "right", | ||
dtype: Dtype | None = None, | ||
name: Hashable = None, | ||
) -> IntervalIndex: | ||
with rewrite_exception("IntervalArray", cls.__name__): | ||
arr = IntervalArray.from_strings(data=data, dtype=dtype, closed=closed) | ||
|
||
return cls._simple_new(arr, name=name) | ||
|
||
# -------------------------------------------------------------------- | ||
|
||
@cache_readonly | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this public? shouldn't this be
_from_sequence_of_strings
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed in 40af75d.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Uuuh, changing that name went out of my depth a little... Apparently there are more meanings to that name? Now tests are failing in
pandas/tests/extension/test_interval.py
with testing EA types (I don't know what that means). What's the best way forward here??