From 56cb6881a559442432d89c6650281dd96322f941 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Mon, 3 Jul 2023 12:13:46 +0100 Subject: [PATCH 1/2] add DataFrame.unique_indices --- .../dataframe_api/dataframe_object.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index ee8855bc..d893671c 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -740,6 +740,27 @@ def is_nan(self) -> DataFrame: """ ... + def unique_indices(self, keys: Sequence[str], *, skip_nulls: bool = True) -> Column[int]: + """ + Return indices corresponding to unique values across in selected columns. + + Returns + ------- + Column[int] + Indices corresponding to unique values. + + Notes + ----- + There are no ordering guarantees. In particular, if there are multiple + indices corresponding to the same unique value(s), there is no guarantee + about which one will appear in the result. + If the original column(s) contain multiple `'NaN'` values, then + only a single index corresponding to those values should be returned. + Likewise for null values (if ``skip_nulls=False``). + To get the unique values, you can do ``df.get_rows(df.unique_indices(keys))``. + """ + ... + def fill_nan(self, value: float | 'null', /) -> DataFrame: """ Fill ``nan`` values with the given fill value. From 7fc2e1b9a9781181c122d5c8f8906f8a0615ec2d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 7 Jul 2023 17:04:34 +0100 Subject: [PATCH 2/2] typo + minor reword --- spec/API_specification/dataframe_api/column_object.py | 2 +- spec/API_specification/dataframe_api/dataframe_object.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 10ba9a35..d45a543f 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -622,7 +622,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column[int]: indices corresponding to the same unique value, there is no guarantee about which one will appear in the result. If the original Column contains multiple `'NaN'` values, then - only a single index corresponding to those values should be returned. + only a single index corresponding to those values will be returned. Likewise for null values (if ``skip_nulls=False``). To get the unique values, you can do ``col.get_rows(col.unique_indices())``. """ diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index fab8c622..fb0948e8 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -760,7 +760,7 @@ def is_nan(self) -> DataFrame: def unique_indices(self, keys: Sequence[str], *, skip_nulls: bool = True) -> Column[int]: """ - Return indices corresponding to unique values across in selected columns. + Return indices corresponding to unique values across selected columns. Returns ------- @@ -773,7 +773,7 @@ def unique_indices(self, keys: Sequence[str], *, skip_nulls: bool = True) -> Col indices corresponding to the same unique value(s), there is no guarantee about which one will appear in the result. If the original column(s) contain multiple `'NaN'` values, then - only a single index corresponding to those values should be returned. + only a single index corresponding to those values will be returned. Likewise for null values (if ``skip_nulls=False``). To get the unique values, you can do ``df.get_rows(df.unique_indices(keys))``. """