Skip to content

Commit 41eb73d

Browse files
author
MarcoGorelli
committed
Merge remote-tracking branch 'upstream/main' into column-getitem-and-len
2 parents 2796b0d + c1b1ab1 commit 41eb73d

16 files changed

+239
-270
lines changed

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
sphinx==4.3.0
2-
sphinx-material==0.0.30
1+
sphinx==6.2.1
2+
sphinx-material==0.0.35
33
myst-parser
44
sphinx_markdown_tables
55
sphinx_copybutton
6-
docutils<0.18
6+
docutils==0.19
77
sphinx-math-dollar

spec/API_specification/column_object.rst

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,9 @@ Column object
44
=============
55

66
A conforming implementation of the dataframe API standard must provide and
7-
support a column object having the following attributes and methods.
7+
support a column object having the following methods, attributes, and
8+
behavior.
89

9-
-------------------------------------------------
10-
11-
Methods
12-
-------
13-
TODO
14-
15-
..
16-
NOTE: please keep the methods in alphabetical order
17-
18-
.. currentmodule:: dataframe_api
19-
20-
.. autosummary::
21-
:toctree: generated
22-
:template: property.rst
10+
.. currentmodule:: dataframe_api
2311

12+
.. autoclass:: Column

spec/API_specification/dataframe_api/column_object.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,47 @@
11
from __future__ import annotations
22

3-
from typing import NoReturn
3+
from typing import NoReturn, Sequence
4+
5+
from ._types import dtype
6+
7+
8+
__all__ = ['Column']
9+
410

511
class Column:
12+
"""
13+
Column object
14+
15+
Note that this column object is not meant to be instantiated directly by
16+
users of the library implementing the dataframe API standard. Rather, use
17+
constructor functions or retrieve a column from an already-created dataframe object.
18+
19+
"""
20+
@classmethod
21+
def from_sequence(cls, sequence: Sequence[object], dtype: dtype) -> Column:
22+
"""
23+
Construct Column from sequence of elements.
24+
25+
Parameters
26+
----------
27+
sequence : Sequence[object]
28+
Sequence of elements. Each element must be of the specified
29+
``dtype``, the corresponding Python builtin scalar type, or
30+
coercible to that Python scalar type.
31+
dtype : dtype
32+
Dtype of result. Must be specified.
33+
34+
Returns
35+
-------
36+
Column
37+
"""
38+
...
39+
640
def __len__(self) -> int:
741
"""
842
Return the number of rows.
943
"""
10-
44+
1145
def __getitem__(self, row: int) -> object:
1246
"""
1347
Get the element at row index `row`.
@@ -33,9 +67,5 @@ def get_rows(self, indices: Column[int]) -> Column:
3367
----------
3468
indices : Column[int]
3569
Positions of rows to select.
36-
37-
Returns
38-
-------
39-
Column
4070
"""
41-
...
71+
...

spec/API_specification/dataframe_api/dataframe_object.py

Lines changed: 102 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
2-
from typing import Sequence, Union, TYPE_CHECKING, NoReturn
2+
3+
from typing import Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn
4+
35

46
if TYPE_CHECKING:
57
from .column_object import Column
@@ -11,6 +13,47 @@
1113

1214

1315
class DataFrame:
16+
"""
17+
DataFrame object
18+
19+
Note that this dataframe object is not meant to be instantiated directly by
20+
users of the library implementing the dataframe API standard. Rather, use
21+
constructor functions or an already-created dataframe object.
22+
23+
**Python operator support**
24+
25+
All arithmetic operators defined by the Python language, except for
26+
``__matmul__``, ``__neg__`` and ``__pos__``, must be supported for
27+
numerical data types.
28+
29+
All comparison operators defined by the Python language must be supported
30+
by the dataframe object for all data types for which those comparisons are
31+
supported by the builtin scalar types corresponding to a data type.
32+
33+
In-place operators must not be supported. All operations on the dataframe
34+
object are out-of-place.
35+
36+
**Methods and Attributes**
37+
38+
"""
39+
40+
@classmethod
41+
def from_dict(cls, data: Mapping[str, Column]) -> DataFrame:
42+
"""
43+
Construct DataFrame from map of column names to Columns.
44+
45+
Parameters
46+
----------
47+
data : Mapping[str, Column]
48+
Each Column must be of the type that corresponds to the DataFrame's library.
49+
For example, a ``LibraryXDataFrame`` can only be built from
50+
``LibraryXColumn`` instances.
51+
52+
Returns
53+
-------
54+
DataFrame
55+
"""
56+
1457
@property
1558
def dataframe(self) -> object:
1659
"""
@@ -88,24 +131,18 @@ def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame:
88131
"""
89132
...
90133

91-
def get_rows(self, indices: Sequence[int]) -> DataFrame:
134+
def get_rows(self, indices: "Column[int]") -> DataFrame:
92135
"""
93136
Select a subset of rows, similar to `ndarray.take`.
94137
95138
Parameters
96139
----------
97-
indices : Sequence[int]
140+
indices : Column[int]
98141
Positions of rows to select.
99142
100143
Returns
101144
-------
102145
DataFrame
103-
104-
Notes
105-
-----
106-
Some discussion participants prefer a stricter type Column[int] for
107-
indices in order to make it easier to implement in a performant manner
108-
on GPUs.
109146
"""
110147
...
111148

@@ -204,6 +241,47 @@ def get_column_names(self) -> Sequence[str]:
204241
"""
205242
...
206243

244+
def sorted_indices(
245+
self,
246+
keys: Sequence[str],
247+
*,
248+
ascending: Sequence[bool] | bool = True,
249+
nulls_position: Literal['first', 'last'] = 'last',
250+
) -> Column[int]:
251+
"""
252+
Return the row numbers which would sort the DataFrame according to the given columns.
253+
254+
If you need to sort the DataFrame, you can simply do::
255+
256+
df.get_rows(df.sorted_indices(keys))
257+
258+
Parameters
259+
----------
260+
keys : Sequence[str]
261+
Names of columns to sort by.
262+
ascending : Sequence[bool] or bool
263+
If `True`, sort by all keys in ascending order.
264+
If `False`, sort by all keys in descending order.
265+
If a sequence, it must be the same length as `keys`,
266+
and determines the direction with which to use each
267+
key to sort by.
268+
nulls_position : {'first', 'last'}
269+
Whether null values should be placed at the beginning
270+
or at the end of the result.
271+
Note that the position of NaNs is unspecified and may
272+
vary based on the implementation.
273+
274+
Returns
275+
-------
276+
Column[int]
277+
278+
Raises
279+
------
280+
ValueError
281+
If `keys` and `ascending` are sequences of different lengths.
282+
"""
283+
...
284+
207285
def __eq__(self, other: DataFrame | Scalar) -> DataFrame:
208286
"""
209287
Compare for equality.
@@ -465,7 +543,7 @@ def __iter__(self) -> NoReturn:
465543
"""
466544
raise NotImplementedError("'__iter__' is intentionally not implemented.")
467545

468-
def any(self, skipna: bool = True) -> DataFrame:
546+
def any(self, skip_nulls: bool = True) -> DataFrame:
469547
"""
470548
Reduction returns a 1-row DataFrame.
471549
@@ -476,7 +554,7 @@ def any(self, skipna: bool = True) -> DataFrame:
476554
"""
477555
...
478556

479-
def all(self, skipna: bool = True) -> DataFrame:
557+
def all(self, skip_nulls: bool = True) -> DataFrame:
480558
"""
481559
Reduction returns a 1-row DataFrame.
482560
@@ -515,49 +593,49 @@ def all_rowwise(self, skipna: bool = True) -> Column:
515593
"""
516594
...
517595

518-
def min(self, skipna: bool = True) -> DataFrame:
596+
def min(self, skip_nulls: bool = True) -> DataFrame:
519597
"""
520598
Reduction returns a 1-row DataFrame.
521599
"""
522600
...
523601

524-
def max(self, skipna: bool = True) -> DataFrame:
602+
def max(self, skip_nulls: bool = True) -> DataFrame:
525603
"""
526604
Reduction returns a 1-row DataFrame.
527605
"""
528606
...
529607

530-
def sum(self, skipna: bool = True) -> DataFrame:
608+
def sum(self, skip_nulls: bool = True) -> DataFrame:
531609
"""
532610
Reduction returns a 1-row DataFrame.
533611
"""
534612
...
535613

536-
def prod(self, skipna: bool = True) -> DataFrame:
614+
def prod(self, skip_nulls: bool = True) -> DataFrame:
537615
"""
538616
Reduction returns a 1-row DataFrame.
539617
"""
540618
...
541619

542-
def median(self, skipna: bool = True) -> DataFrame:
620+
def median(self, skip_nulls: bool = True) -> DataFrame:
543621
"""
544622
Reduction returns a 1-row DataFrame.
545623
"""
546624
...
547625

548-
def mean(self, skipna: bool = True) -> DataFrame:
626+
def mean(self, skip_nulls: bool = True) -> DataFrame:
549627
"""
550628
Reduction returns a 1-row DataFrame.
551629
"""
552630
...
553631

554-
def std(self, skipna: bool = True) -> DataFrame:
632+
def std(self, skip_nulls: bool = True) -> DataFrame:
555633
"""
556634
Reduction returns a 1-row DataFrame.
557635
"""
558636
...
559637

560-
def var(self, skipna: bool = True) -> DataFrame:
638+
def var(self, skip_nulls: bool = True) -> DataFrame:
561639
"""
562640
Reduction returns a 1-row DataFrame.
563641
"""
@@ -578,12 +656,14 @@ def isnull(self) -> DataFrame:
578656
Notes
579657
-----
580658
Does *not* include NaN-like entries.
659+
May optionally include 'NaT' values (if present in an implementation),
660+
but note that the Standard makes no guarantees about them.
581661
"""
582662
...
583663

584664
def isnan(self) -> DataFrame:
585665
"""
586-
Check for nan-like entries.
666+
Check for nan entries.
587667
588668
Returns
589669
-------
@@ -595,7 +675,8 @@ def isnan(self) -> DataFrame:
595675
596676
Notes
597677
-----
598-
Includes anything with NaN-like semantics, e.g. np.datetime64("NaT").
678+
This only checks for 'NaN'.
599679
Does *not* include 'missing' or 'null' entries.
680+
In particular, does not check for `np.timedelta64('NaT')`.
600681
"""
601682
...

spec/API_specification/dataframe_api/groupby_object.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,47 @@
44
from .dataframe_object import DataFrame
55

66

7+
__all__ = ['GroupBy']
8+
9+
710
class GroupBy:
8-
def any(self, skipna: bool = True) -> "DataFrame":
11+
"""
12+
GroupBy object.
13+
14+
Note that this class is not meant to be constructed by users.
15+
It is returned from `DataFrame.groupby`.
16+
17+
**Methods**
18+
19+
"""
20+
def any(self, skip_nulls: bool = True) -> "DataFrame":
921
...
1022

11-
def all(self, skipna: bool = True) -> "DataFrame":
23+
def all(self, skip_nulls: bool = True) -> "DataFrame":
1224
...
1325

14-
def min(self, skipna: bool = True) -> "DataFrame":
26+
def min(self, skip_nulls: bool = True) -> "DataFrame":
1527
...
1628

17-
def max(self, skipna: bool = True) -> "DataFrame":
29+
def max(self, skip_nulls: bool = True) -> "DataFrame":
1830
...
1931

20-
def sum(self, skipna: bool = True) -> "DataFrame":
32+
def sum(self, skip_nulls: bool = True) -> "DataFrame":
2133
...
2234

23-
def prod(self, skipna: bool = True) -> "DataFrame":
35+
def prod(self, skip_nulls: bool = True) -> "DataFrame":
2436
...
2537

26-
def median(self, skipna: bool = True) -> "DataFrame":
38+
def median(self, skip_nulls: bool = True) -> "DataFrame":
2739
...
2840

29-
def mean(self, skipna: bool = True) -> "DataFrame":
41+
def mean(self, skip_nulls: bool = True) -> "DataFrame":
3042
...
3143

32-
def std(self, skipna: bool = True) -> "DataFrame":
44+
def std(self, skip_nulls: bool = True) -> "DataFrame":
3345
...
3446

35-
def var(self, skipna: bool = True) -> "DataFrame":
47+
def var(self, skip_nulls: bool = True) -> "DataFrame":
3648
...
3749

3850
def size(self) -> "DataFrame":

0 commit comments

Comments
 (0)