Skip to content

Commit 03835f8

Browse files
committed
BUG: boundary_slice assumes sorted indexes
Changes `boundary_slice` to handle the case where the index is not sorted, label-based indexing (`loc`) is used, and the start or stop label is missing from the index. See pandas-dev/pandas#8613 for details on the pandas side.
1 parent 081c1b8 commit 03835f8

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

dask/dataframe/methods.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ def boundary_slice(df, start, stop, right_boundary=True, left_boundary=True,
5555
2 20
5656
2 30
5757
"""
58+
# Pandas treats missing keys differently for label-slicing
59+
# on monotonic vs. non-monotonic indexes
60+
# If the index is monotonic, `df.loc[start:stop]` is fine.
61+
# If it's not, `df.loc[start:stop]` raises when `start` is missing
62+
if kind == 'loc' and not df.index.is_monotonic:
63+
df = df.sort_index()
5864
result = getattr(df, kind)[start:stop]
5965
if not right_boundary:
6066
right_index = result.index.get_slice_bound(stop, 'left', kind)

dask/dataframe/tests/test_dataframe.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2720,3 +2720,16 @@ def test_to_datetime():
27202720

27212721
assert_eq(pd.to_datetime(s, infer_datetime_format=True),
27222722
dd.to_datetime(ds, infer_datetime_format=True))
2723+
2724+
2725+
def test_slice_on_filtered_boundary():
2726+
# https://github.com/dask/dask/issues/2211
2727+
x = np.arange(10)
2728+
x[[5, 6]] -= 2
2729+
df = pd.DataFrame({"A": x, "B": np.arange(len(x))})
2730+
pdf = df.set_index("A").query("B > 0")
2731+
ddf = dd.from_pandas(df, 1).set_index("A").query("B > 0")
2732+
2733+
result = dd.concat([ddf, ddf.rename(columns={"B": "C"})], axis=1)
2734+
expected = pd.concat([pdf, pdf.rename(columns={"B": "C"})], axis=1)
2735+
assert_eq(result, expected)

0 commit comments

Comments
 (0)