From 00c7e5250c3f887bf4b526488e1d25ba435b575e Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Thu, 13 May 2021 11:45:05 +0200 Subject: [PATCH 01/10] Added IntervalArray.from_strings() and IntervalIndex.from_strings() to parse string representations of Intervals --- pandas/core/arrays/interval.py | 90 +++++++++++++++++++ pandas/core/indexes/interval.py | 28 ++++++ .../tests/indexes/interval/test_interval.py | 50 +++++++++++ 3 files changed, 168 insertions(+) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index a99bf245a6073..200620759d400 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -562,6 +562,96 @@ def from_tuples( return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) + + _interval_shared_docs["from_strings"] = textwrap.dedent( + """ + Construct from string representations of the left and right bounds. + + Parameters + ---------- + data : array-like (1-dimensional) + Strings representing the Interval's to parse. + copy : bool, default False + Copy the data. + dtype : dtype, optional + If None, dtype will be inferred. + + Returns + ------- + %(klass)s + + Raises + ------ + ValueError + When a string cannot be parsed as an Interval + When the dtype of the string cannot be parsed as either float, Timestamp or Timedelta + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples. + + %(examples)s\ + """ + ) + @classmethod + @Appender( + _interval_shared_docs["from_strings"] + % { + "klass": "IntervalIndex", + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"]) + IntervalIndex([(0, 1], (1, 2]], + dtype='interval[int64, right]') + """ + ), + } + ) + def from_strings( + cls: type[IntervalArrayT], + data: Sequence[str], + ) -> IntervalArrayT: + # These need to be imported here to avoid circular dependencies. + from pandas.core.tools.timedeltas import to_timedelta + from pandas.core.tools.datetimes import to_datetime + + intervals: list[Interval] = [] + for string in data: + try: + # Find the first parenthesis and assume it is the start of the interval + start = string.index("(") + # Find the first closing square bracket and assume it is the end + end = string.rindex("]") + except ValueError: + raise ValueError(f"Could not find opening '(' and closing ']' brackets in string: '{string}'") + + # Extract that part and try to split based on a comma and a space. + breaks = string[start + 1:end].split(", ", 1) + + if len(breaks) != 2: + raise ValueError(f"Delimiter ', ' (comma + space) not found in string: {string}") + + # Try to parse the breaks first as floats, then datetime, then timedelta. + for conversion in [float, to_datetime, to_timedelta]: + try: + interval = Interval(*map(conversion, breaks)) + break + except ValueError: + continue + else: + raise ValueError(f"Could not parse string as Interval of float, Timedelta or Timestamp: {string}") + intervals.append(interval) + + + return cls(intervals) + + def _validate(self): """ Verify that the IntervalArray is valid. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index fc92a1b3afe53..acdf033bffb6d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -12,6 +12,7 @@ Any, Hashable, cast, + Sequence ) import numpy as np @@ -387,6 +388,33 @@ def from_tuples( arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) return cls._simple_new(arr, name=name) + + @classmethod + @Appender( + _interval_shared_docs["from_strings"] + % { + "klass": "IntervalIndex", + "examples": textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"]) + IntervalIndex([(0, 1], (1, 2]], + dtype='interval[int64, right]') + """ + ), + } + ) + def from_strings( + cls, + data: Sequence[str], + name: Hashable = None, + ) -> IntervalIndex: + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_strings(data=data) + + return cls._simple_new(arr, name=name) + # -------------------------------------------------------------------- @cache_readonly diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index cd61fcaa835a4..606716943c979 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -906,6 +906,56 @@ def test_pickle_round_trip_closed(self, closed): tm.assert_index_equal(result, idx) +def test_from_strings(): + """Test the IntervalIndex.from_strings class method.""" + # Create some float IntervalIndex string representations. + indices = [ + "(0.0, 0.5]", + "(0.5, 1.0]" + ] + + # Create some datetime-like string representations + datetime_indices = [ + "(2015-07-01, 2016-08-01]", + "(2016-08-01, 2018-09-01]" + ] + + + index = IntervalIndex.from_strings(indices) + dt_index = IntervalIndex.from_strings(datetime_indices) + + assert index[0].left == 0.0 + assert index[1].right == 1.0 + + assert dt_index[0].left.year == 2015 + assert dt_index[1].right.month == 9 + + # Create invalid interval indices (to make sure it fails correctly) + wrong_indices = [ + "('hello', 'there']", + "(0.1,0.1)", + "(0.0,0.5]", + ] + + # Make sure that the wrong indices raise the appropriate error + try: + IntervalIndex.from_strings([wrong_indices[0]]) + except ValueError as exception: + if "Could not parse string as Interval" not in str(exception): + raise exception + try: + IntervalIndex.from_strings([wrong_indices[1]]) + except ValueError as exception: + if "Could not find opening '('" not in str(exception): + raise exception + + try: + IntervalIndex.from_strings([wrong_indices[2]]) + except ValueError as exception: + if "Delimiter ', ' (comma + space) not found" not in str(exception): + raise exception + + def test_dir(): # GH#27571 dir(interval_index) should not raise index = IntervalIndex.from_arrays([0, 1], [1, 2]) From d2625a5f0dc0f35616b1435754db42587f40ce08 Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Thu, 13 May 2021 12:01:57 +0200 Subject: [PATCH 02/10] Ran pre-commit program on modified files. --- pandas/core/arrays/interval.py | 31 ++++++++++++------- pandas/core/indexes/interval.py | 11 +++---- .../tests/indexes/interval/test_interval.py | 17 +++------- 3 files changed, 29 insertions(+), 30 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 200620759d400..1c669a86f4c74 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -562,7 +562,6 @@ def from_tuples( return cls.from_arrays(left, right, closed, copy=False, dtype=dtype) - _interval_shared_docs["from_strings"] = textwrap.dedent( """ Construct from string representations of the left and right bounds. @@ -584,7 +583,8 @@ def from_tuples( ------ ValueError When a string cannot be parsed as an Interval - When the dtype of the string cannot be parsed as either float, Timestamp or Timedelta + When the dtype of the string cannot be parsed as either float, + Timestamp or Timedelta See Also -------- @@ -597,6 +597,7 @@ def from_tuples( %(examples)s\ """ ) + @classmethod @Appender( _interval_shared_docs["from_strings"] @@ -614,12 +615,12 @@ def from_tuples( } ) def from_strings( - cls: type[IntervalArrayT], - data: Sequence[str], - ) -> IntervalArrayT: + cls: type[IntervalArrayT], + data: Sequence[str], + ) -> IntervalArrayT: # These need to be imported here to avoid circular dependencies. - from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime + from pandas.core.tools.timedeltas import to_timedelta intervals: list[Interval] = [] for string in data: @@ -629,13 +630,18 @@ def from_strings( # Find the first closing square bracket and assume it is the end end = string.rindex("]") except ValueError: - raise ValueError(f"Could not find opening '(' and closing ']' brackets in string: '{string}'") + raise ValueError( + "Could not find opening '(' and closing ']' " + f"brackets in string: '{string}'" + ) # Extract that part and try to split based on a comma and a space. - breaks = string[start + 1:end].split(", ", 1) + breaks = string[start + 1 : end].split(", ", 1) if len(breaks) != 2: - raise ValueError(f"Delimiter ', ' (comma + space) not found in string: {string}") + raise ValueError( + f"Delimiter ', ' (comma + space) not found in string: {string}" + ) # Try to parse the breaks first as floats, then datetime, then timedelta. for conversion in [float, to_datetime, to_timedelta]: @@ -645,12 +651,13 @@ def from_strings( except ValueError: continue else: - raise ValueError(f"Could not parse string as Interval of float, Timedelta or Timestamp: {string}") + raise ValueError( + "Could not parse string as Interval of float, Timedelta " + f"or Timestamp: {string}" + ) intervals.append(interval) - return cls(intervals) - def _validate(self): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index acdf033bffb6d..6c1cd2b62970d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -11,8 +11,8 @@ TYPE_CHECKING, Any, Hashable, + Sequence, cast, - Sequence ) import numpy as np @@ -388,7 +388,6 @@ def from_tuples( arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) return cls._simple_new(arr, name=name) - @classmethod @Appender( _interval_shared_docs["from_strings"] @@ -406,10 +405,10 @@ def from_tuples( } ) def from_strings( - cls, - data: Sequence[str], - name: Hashable = None, - ) -> IntervalIndex: + cls, + data: Sequence[str], + name: Hashable = None, + ) -> IntervalIndex: with rewrite_exception("IntervalArray", cls.__name__): arr = IntervalArray.from_strings(data=data) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 606716943c979..acc6e4277e7ff 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -909,17 +909,10 @@ def test_pickle_round_trip_closed(self, closed): def test_from_strings(): """Test the IntervalIndex.from_strings class method.""" # Create some float IntervalIndex string representations. - indices = [ - "(0.0, 0.5]", - "(0.5, 1.0]" - ] + indices = ["(0.0, 0.5]", "(0.5, 1.0]"] # Create some datetime-like string representations - datetime_indices = [ - "(2015-07-01, 2016-08-01]", - "(2016-08-01, 2018-09-01]" - ] - + datetime_indices = ["(2015-07-01, 2016-08-01]", "(2016-08-01, 2018-09-01]"] index = IntervalIndex.from_strings(indices) dt_index = IntervalIndex.from_strings(datetime_indices) @@ -932,9 +925,9 @@ def test_from_strings(): # Create invalid interval indices (to make sure it fails correctly) wrong_indices = [ - "('hello', 'there']", - "(0.1,0.1)", - "(0.0,0.5]", + "('hello', 'there']", + "(0.1,0.1)", + "(0.0,0.5]", ] # Make sure that the wrong indices raise the appropriate error From 27ae2bf70553e84f6fac40460043bb8744b0c26c Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Sat, 22 May 2021 10:30:31 +0200 Subject: [PATCH 03/10] Documented exception to mypy linting and fixed doctest. --- pandas/core/arrays/interval.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1c669a86f4c74..5b72fff231036 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -7,6 +7,7 @@ ) import textwrap from typing import ( + Callable, Sequence, TypeVar, cast, @@ -607,9 +608,9 @@ def from_tuples( """\ Examples -------- - >>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"]) - IntervalIndex([(0, 1], (1, 2]], - dtype='interval[int64, right]') + >>> pd.IntervalIndex.from_strings(["(0.0, 1.0]", "(1.0, 2.0]"]) + IntervalIndex([(0.0, 1.0], (1.0, 2.0]], + dtype='interval[float64, right]') """ ), } @@ -643,8 +644,9 @@ def from_strings( f"Delimiter ', ' (comma + space) not found in string: {string}" ) + conversions: list[Callable] = [float, to_datetime, to_timedelta] # Try to parse the breaks first as floats, then datetime, then timedelta. - for conversion in [float, to_datetime, to_timedelta]: + for conversion in conversions: try: interval = Interval(*map(conversion, breaks)) break From e6e072adfe85a5a645d1f731a4ddd6d2f01520ce Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Tue, 1 Jun 2021 13:02:50 +0200 Subject: [PATCH 04/10] Replaced str.find() calls with more robust regex matching --- pandas/core/arrays/interval.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 5b72fff231036..e2296b974b2a4 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -5,6 +5,7 @@ le, lt, ) +import re import textwrap from typing import ( Callable, @@ -625,19 +626,18 @@ def from_strings( intervals: list[Interval] = [] for string in data: - try: - # Find the first parenthesis and assume it is the start of the interval - start = string.index("(") - # Find the first closing square bracket and assume it is the end - end = string.rindex("]") - except ValueError: + + # Try to match "(left, right]" where 'left' and 'right' are breaks. + breaks_match = re.match(r"\(.*,.*]", string) + # Raise ValueError if no match was found. + if breaks_match is None: raise ValueError( "Could not find opening '(' and closing ']' " f"brackets in string: '{string}'" ) # Extract that part and try to split based on a comma and a space. - breaks = string[start + 1 : end].split(", ", 1) + breaks = breaks_match.string[1:-1].split(", ", 1) if len(breaks) != 2: raise ValueError( From 6c1b871bef7bd5f20260fdd344b0c91d17876997 Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Tue, 1 Jun 2021 13:32:11 +0200 Subject: [PATCH 05/10] Added parsing for int64 and improved tests. --- pandas/core/arrays/interval.py | 15 +++-- .../tests/indexes/interval/test_interval.py | 60 ++++++++----------- 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index e2296b974b2a4..539d2524d213f 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -609,9 +609,9 @@ def from_tuples( """\ Examples -------- - >>> pd.IntervalIndex.from_strings(["(0.0, 1.0]", "(1.0, 2.0]"]) - IntervalIndex([(0.0, 1.0], (1.0, 2.0]], - dtype='interval[float64, right]') + >>> pd.IntervalIndex.from_strings(["(0, 1]", "(1, 2]"]) + IntervalIndex([(0, 1], (1, 2]], + dtype='interval[int64, right]') """ ), } @@ -636,7 +636,7 @@ def from_strings( f"brackets in string: '{string}'" ) - # Extract that part and try to split based on a comma and a space. + # Try to split 'left' and 'right' based on a comma and a space. breaks = breaks_match.string[1:-1].split(", ", 1) if len(breaks) != 2: @@ -644,9 +644,12 @@ def from_strings( f"Delimiter ', ' (comma + space) not found in string: {string}" ) - conversions: list[Callable] = [float, to_datetime, to_timedelta] + conversions: list[Callable] = [int, float, to_datetime, to_timedelta] # Try to parse the breaks first as floats, then datetime, then timedelta. - for conversion in conversions: + for i, conversion in enumerate(conversions): + # Check if all breaks can be parsed as integers. + if i == 0 and not all(b.isdigit() for b in breaks): + continue try: interval = Interval(*map(conversion, breaks)) break diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index acc6e4277e7ff..8993b9a16fc74 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -908,45 +908,35 @@ def test_pickle_round_trip_closed(self, closed): def test_from_strings(): """Test the IntervalIndex.from_strings class method.""" - # Create some float IntervalIndex string representations. - indices = ["(0.0, 0.5]", "(0.5, 1.0]"] - - # Create some datetime-like string representations - datetime_indices = ["(2015-07-01, 2016-08-01]", "(2016-08-01, 2018-09-01]"] - - index = IntervalIndex.from_strings(indices) - dt_index = IntervalIndex.from_strings(datetime_indices) - - assert index[0].left == 0.0 - assert index[1].right == 1.0 - - assert dt_index[0].left.year == 2015 - assert dt_index[1].right.month == 9 + # Create (expected, string_repr) tuples for test-cases. + test_cases = [ + ( + IntervalIndex.from_breaks([0.0, 0.5, 1.0]), + ["(0.0, 0.5]", "(0.5, 1.0]"], + ), + (IntervalIndex.from_breaks([0, 5, 10]), ["(0, 5]", "(5, 10]"]), + ( + IntervalIndex.from_breaks( + [Timestamp(2015, 7, 1), Timestamp(2016, 8, 1), Timestamp(2018, 9, 1)] + ), + ["(2015-07-01, 2016-08-01]", "(2016-08-01, 2018-09-01]"], + ), + ] + # Validate each test case. + for expected, string in test_cases: + parsed_index = IntervalIndex.from_strings(string) + assert np.array_equal(parsed_index, expected) # Create invalid interval indices (to make sure it fails correctly) wrong_indices = [ - "('hello', 'there']", - "(0.1,0.1)", - "(0.0,0.5]", + ("('hello', 'there']", r"Could not parse string as Interval"), + ("(0.1,0.1)", r"Could not find opening '\(' and closing ']'"), + ("(0.0,0.5]", r"Delimiter ', ' .* not found"), ] - - # Make sure that the wrong indices raise the appropriate error - try: - IntervalIndex.from_strings([wrong_indices[0]]) - except ValueError as exception: - if "Could not parse string as Interval" not in str(exception): - raise exception - try: - IntervalIndex.from_strings([wrong_indices[1]]) - except ValueError as exception: - if "Could not find opening '('" not in str(exception): - raise exception - - try: - IntervalIndex.from_strings([wrong_indices[2]]) - except ValueError as exception: - if "Delimiter ', ' (comma + space) not found" not in str(exception): - raise exception + # Validate that all cases raise ValueErrors with the correct message + for string, error in wrong_indices: + with pytest.raises(ValueError, match=error): + IntervalIndex.from_strings([string]) def test_dir(): From d646802726c712b8aec2386978c394908890d5d6 Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Tue, 1 Jun 2021 14:46:53 +0200 Subject: [PATCH 06/10] Moved error test to separate test function. --- pandas/tests/indexes/interval/test_interval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 8993b9a16fc74..98316a4941749 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -927,6 +927,9 @@ def test_from_strings(): parsed_index = IntervalIndex.from_strings(string) assert np.array_equal(parsed_index, expected) + +def test_from_strings_errors(): + """Validate the error messages from the IntervalIndex.from_strings method.""" # Create invalid interval indices (to make sure it fails correctly) wrong_indices = [ ("('hello', 'there']", r"Could not parse string as Interval"), From 162045ad95617d46751720c2d031c637fa40c85c Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Wed, 9 Jun 2021 10:57:49 +0200 Subject: [PATCH 07/10] * Simplified pandas imports * Added argument for different types of closed intervals * Added dtype argument to allow strict typing. * Changed tests to use pytest.mark.parametrize * Improved documentation. --- pandas/core/arrays/interval.py | 33 ++++++++---- pandas/core/indexes/interval.py | 4 +- .../tests/indexes/interval/test_interval.py | 53 ++++++++++++------- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 054a0b6a924d2..ba6c01531edab 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -571,8 +571,9 @@ def from_tuples( ---------- data : array-like (1-dimensional) Strings representing the Interval's to parse. - copy : bool, default False - Copy the data. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. dtype : dtype, optional If None, dtype will be inferred. @@ -618,23 +619,28 @@ def from_tuples( def from_strings( cls: type[IntervalArrayT], data: Sequence[str], + closed: str = "right", + dtype: Dtype | None = None, ) -> IntervalArrayT: # These need to be imported here to avoid circular dependencies. - from pandas.core.tools.datetimes import to_datetime - from pandas.core.tools.timedeltas import to_timedelta + from pandas import ( + to_datetime, + to_timedelta, + ) + + pattern = re.compile(r"\(.*,.*]") - intervals: list[Interval] = [] + left, right = [], [] for string in data: # Try to match "(left, right]" where 'left' and 'right' are breaks. - breaks_match = re.match(r"\(.*,.*]", string) - # Raise ValueError if no match was found. + breaks_match = pattern.match(string) + if breaks_match is None: raise ValueError( "Could not find opening '(' and closing ']' " f"brackets in string: '{string}'" ) - # Try to split 'left' and 'right' based on a comma and a space. breaks = breaks_match.string[1:-1].split(", ", 1) @@ -650,7 +656,9 @@ def from_strings( if i == 0 and not all(b.isdigit() for b in breaks): continue try: - interval = Interval(*map(conversion, breaks)) + newleft, newright = map(conversion, breaks) + left.append(newleft) + right.append(newright) break except ValueError: continue @@ -659,9 +667,12 @@ def from_strings( "Could not parse string as Interval of float, Timedelta " f"or Timestamp: {string}" ) - intervals.append(interval) - return cls(intervals) + # If dtype was not an IntervalDtype, try to parse it as such. + if dtype is not None and not isinstance(dtype, IntervalDtype): + dtype = IntervalDtype(subtype=dtype, closed=closed) + + return cls.from_arrays(left, right, closed=closed, copy=False, dtype=dtype) def _validate(self): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index fd63bd72990ce..d6d54326139c4 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -340,10 +340,12 @@ def from_tuples( def from_strings( cls, data: Sequence[str], + closed: str = "right", + dtype: Dtype | None = None, name: Hashable = None, ) -> IntervalIndex: with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_strings(data=data) + arr = IntervalArray.from_strings(data=data, dtype=dtype, closed=closed) return cls._simple_new(arr, name=name) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 98316a4941749..30a8d6672c22a 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -906,40 +906,55 @@ def test_pickle_round_trip_closed(self, closed): tm.assert_index_equal(result, idx) -def test_from_strings(): - """Test the IntervalIndex.from_strings class method.""" - # Create (expected, string_repr) tuples for test-cases. - test_cases = [ +@pytest.mark.parametrize( + "test_case", + [ ( + "float64", IntervalIndex.from_breaks([0.0, 0.5, 1.0]), ["(0.0, 0.5]", "(0.5, 1.0]"], ), - (IntervalIndex.from_breaks([0, 5, 10]), ["(0, 5]", "(5, 10]"]), + ("int64", IntervalIndex.from_breaks([0, 5, 10]), ["(0, 5]", "(5, 10]"]), ( + "datetime64[ns]", IntervalIndex.from_breaks( [Timestamp(2015, 7, 1), Timestamp(2016, 8, 1), Timestamp(2018, 9, 1)] ), ["(2015-07-01, 2016-08-01]", "(2016-08-01, 2018-09-01]"], ), - ] - # Validate each test case. - for expected, string in test_cases: - parsed_index = IntervalIndex.from_strings(string) - assert np.array_equal(parsed_index, expected) + ], +) +def test_from_strings(test_case): + """Test the IntervalIndex.from_strings class method.""" + # See https://github.com/pandas-dev/pandas/pull/41451 + dtype, expected, string = test_case + # Attempt to parse the type dynamically + parsed_index = IntervalIndex.from_strings(string) + assert np.array_equal(parsed_index, expected) + assert parsed_index.left.dtype == dtype -def test_from_strings_errors(): - """Validate the error messages from the IntervalIndex.from_strings method.""" - # Create invalid interval indices (to make sure it fails correctly) - wrong_indices = [ + # Parse it with a fixed dtype and assert that the result is correct. + parsed_index_static = IntervalIndex.from_strings(string, dtype=np.dtype(dtype)) + assert np.array_equal(parsed_index, parsed_index_static) + assert parsed_index.dtype == parsed_index_static.dtype + + +@pytest.mark.parametrize( + "wrong_indices", + [ ("('hello', 'there']", r"Could not parse string as Interval"), ("(0.1,0.1)", r"Could not find opening '\(' and closing ']'"), ("(0.0,0.5]", r"Delimiter ', ' .* not found"), - ] - # Validate that all cases raise ValueErrors with the correct message - for string, error in wrong_indices: - with pytest.raises(ValueError, match=error): - IntervalIndex.from_strings([string]) + ], +) +def test_from_strings_errors(wrong_indices): + """Validate the error messages from the IntervalIndex.from_strings method.""" + # See https://github.com/pandas-dev/pandas/pull/41451 + string, error = wrong_indices + + with pytest.raises(ValueError, match=error): + IntervalIndex.from_strings([string]) def test_dir(): From a8cc3b2caf5d3d93030eccf6a8c3759b804817ca Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Thu, 16 Sep 2021 11:00:40 +0200 Subject: [PATCH 08/10] Added support for all closed types. Improved tests. --- pandas/core/arrays/interval.py | 51 +++++++++++-------- .../tests/indexes/interval/test_interval.py | 50 +++++++++++------- 2 files changed, 63 insertions(+), 38 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index b78bd630fc4aa..737e44ee59121 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -8,7 +8,6 @@ import re import textwrap from typing import ( - Callable, Sequence, TypeVar, Union, @@ -635,13 +634,22 @@ def from_strings( closed: str = "right", dtype: Dtype | None = None, ) -> IntervalArrayT: - # These need to be imported here to avoid circular dependencies. from pandas import ( to_datetime, + to_numeric, to_timedelta, ) - pattern = re.compile(r"\(.*,.*]") + # The different closing brackets define which pattern to look for. + brackets = { + "right": ("(", "]"), + "left": ("[", ")"), + "both": ("[", "]"), + "neither": ("(", ")"), + } + pattern = re.compile( + "\\" + brackets[closed][0] + ".*,.*\\" + brackets[closed][1] + ) left, right = [], [] for string in data: @@ -651,7 +659,8 @@ def from_strings( if breaks_match is None: raise ValueError( - "Could not find opening '(' and closing ']' " + f"Could not find opening '{brackets[closed][0]}' " + f"and closing '{brackets[closed][1]}' " f"brackets in string: '{string}'" ) # Try to split 'left' and 'right' based on a comma and a space. @@ -662,24 +671,26 @@ def from_strings( f"Delimiter ', ' (comma + space) not found in string: {string}" ) - conversions: list[Callable] = [int, float, to_datetime, to_timedelta] - # Try to parse the breaks first as floats, then datetime, then timedelta. - for i, conversion in enumerate(conversions): - # Check if all breaks can be parsed as integers. - if i == 0 and not all(b.isdigit() for b in breaks): - continue + # Try different types of string parsers in succession + # First try to parse the breaks as numbers (int, float etc.) + try: + newleft, newright = to_numeric(breaks, errors="raise") + except ValueError: + # If that failed, try parsing as datetime try: - newleft, newright = map(conversion, breaks) - left.append(newleft) - right.append(newright) - break + newleft, newright = to_datetime(breaks, errors="raise") except ValueError: - continue - else: - raise ValueError( - "Could not parse string as Interval of float, Timedelta " - f"or Timestamp: {string}" - ) + # If that also failed, try as timedelta + try: + newleft, newright = to_timedelta(breaks, errors="raise") + except ValueError: + # Finally, if all fails, raise an exception + raise ValueError( + "Could not parse string as numeric, Timedelta " + f"or Timestamp Interval: {string}" + ) + left.append(newleft) + right.append(newright) # If dtype was not an IntervalDtype, try to parse it as such. if dtype is not None and not isinstance(dtype, IntervalDtype): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c865bc4503509..dc3a2ec0c2064 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -927,44 +927,58 @@ def test_pickle_round_trip_closed(self, closed): tm.assert_index_equal(result, idx) +@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) @pytest.mark.parametrize( "test_case", [ ( "float64", - IntervalIndex.from_breaks([0.0, 0.5, 1.0]), - ["(0.0, 0.5]", "(0.5, 1.0]"], + [0.0, 0.5, 1.0], + ["0.0, 0.5", "0.5, 1.0"], ), - ("int64", IntervalIndex.from_breaks([0, 5, 10]), ["(0, 5]", "(5, 10]"]), + ("int64", [0, 5, 10], ["0, 5", "5, 10"]), ( "datetime64[ns]", - IntervalIndex.from_breaks( - [Timestamp(2015, 7, 1), Timestamp(2016, 8, 1), Timestamp(2018, 9, 1)] - ), - ["(2015-07-01, 2016-08-01]", "(2016-08-01, 2018-09-01]"], + [Timestamp(2015, 7, 1), Timestamp(2016, 8, 1), Timestamp(2018, 9, 1)], + ["2015-07-01, 2016-08-01", "2016-08-01, 2018-09-01"], ), ], ) -def test_from_strings(test_case): +def test_from_strings(closed, test_case): """Test the IntervalIndex.from_strings class method.""" # See https://github.com/pandas-dev/pandas/pull/41451 - dtype, expected, string = test_case - - # Attempt to parse the type dynamically - parsed_index = IntervalIndex.from_strings(string) - assert np.array_equal(parsed_index, expected) - assert parsed_index.left.dtype == dtype + dtype, expected, strings = test_case + + brackets = { + "right": ("(", "]"), + "left": ("[", ")"), + "both": ("[", "]"), + "neither": ("(", ")"), + } + # Assign the brackets associated to the closed type to be tested + interval_strings = [brackets[closed][0] + s + brackets[closed][1] for s in strings] + + # Attempt to infer the type dynamically + tm.assert_index_equal( + IntervalIndex.from_strings(interval_strings, closed=closed), + IntervalIndex.from_breaks(expected, closed=closed), + exact=True, + ) # Parse it with a fixed dtype and assert that the result is correct. - parsed_index_static = IntervalIndex.from_strings(string, dtype=np.dtype(dtype)) - assert np.array_equal(parsed_index, parsed_index_static) - assert parsed_index.dtype == parsed_index_static.dtype + tm.assert_index_equal( + IntervalIndex.from_strings( + interval_strings, dtype=np.dtype(dtype), closed=closed + ), + IntervalIndex.from_breaks(expected, closed=closed), + exact=True, + ) @pytest.mark.parametrize( "wrong_indices", [ - ("('hello', 'there']", r"Could not parse string as Interval"), + ("('hello', 'there']", r"Could not parse string as numeric"), ("(0.1,0.1)", r"Could not find opening '\(' and closing ']'"), ("(0.0,0.5]", r"Delimiter ', ' .* not found"), ], From 27a5a2e0303ba7dab46da1ff7166cff02c121195 Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Thu, 16 Sep 2021 12:57:43 +0200 Subject: [PATCH 09/10] rerun CI From 40af75dbe02e93e74ae8e6171f1d89b5e6767b5f Mon Sep 17 00:00:00 2001 From: Erik Mannerfelt <33550973+erikmannerfelt@users.noreply.github.com> Date: Fri, 17 Sep 2021 08:49:11 +0200 Subject: [PATCH 10/10] Renamed IntervalArray method and removed nested try/except clauses. --- pandas/core/arrays/interval.py | 64 ++++++++++++++++++++------------- pandas/core/indexes/interval.py | 4 ++- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 737e44ee59121..0671c4e73a25c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -628,18 +628,12 @@ def from_tuples( ), } ) - def from_strings( + def _from_sequence_of_strings( cls: type[IntervalArrayT], data: Sequence[str], closed: str = "right", dtype: Dtype | None = None, ) -> IntervalArrayT: - from pandas import ( - to_datetime, - to_numeric, - to_timedelta, - ) - # The different closing brackets define which pattern to look for. brackets = { "right": ("(", "]"), @@ -671,24 +665,7 @@ def from_strings( f"Delimiter ', ' (comma + space) not found in string: {string}" ) - # Try different types of string parsers in succession - # First try to parse the breaks as numbers (int, float etc.) - try: - newleft, newright = to_numeric(breaks, errors="raise") - except ValueError: - # If that failed, try parsing as datetime - try: - newleft, newright = to_datetime(breaks, errors="raise") - except ValueError: - # If that also failed, try as timedelta - try: - newleft, newright = to_timedelta(breaks, errors="raise") - except ValueError: - # Finally, if all fails, raise an exception - raise ValueError( - "Could not parse string as numeric, Timedelta " - f"or Timestamp Interval: {string}" - ) + newleft, newright = _parse_breaks(breaks) left.append(newleft) right.append(newright) @@ -1817,3 +1794,40 @@ def _maybe_convert_platform_interval(values) -> ArrayLike: if not hasattr(values, "dtype"): return np.asarray(values) return values + + +def _parse_breaks(breaks: list[str]) -> ArrayLike: + """ + Parse string representations of interval breaks. + + The succession to try is: + 1. Numeric (float, int, etc) + 2. Timestamp + 3. Timedelta + + If none work, a ValueError is raised. + + Parameters + ---------- + breaks : A list of strings to parse. + + Returns + ------- + The parsed breaks + """ + from pandas import ( + to_datetime, + to_numeric, + to_timedelta, + ) + + for parser in [to_numeric, to_datetime, to_timedelta]: + try: + return parser(breaks, errors="raise") + except ValueError: + continue + else: + raise ValueError( + "Could not parse string as numeric, Timedelta " + f"or Timestamp Interval: {', '.join(breaks)}" + ) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 2058b8926109c..42134c83458a2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -342,7 +342,9 @@ def from_strings( name: Hashable = None, ) -> IntervalIndex: with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_strings(data=data, dtype=dtype, closed=closed) + arr = IntervalArray._from_sequence_of_strings( + data=data, dtype=dtype, closed=closed + ) return cls._simple_new(arr, name=name)