Skip to content

Commit d0a5ec8

Browse files
Upgrade pandas to 1.2 (#7375)
Fixes: #7367, #7446 This PR upgrades pandas to `1.2.2` in `cudf`. Changes include: - [x] Bumping up `pandas` version. - [x] Fixing `isin` behavior which now takes types into account: pandas-dev/pandas#38781 - [x] `CategoricalColumn.__setitem__` will now not allow setting of values that are not in existing categories. - [x] Introduced `cudf.core._compat.PANDAS_GE_120` variable to create backward compatibility. - [x] Updated usages of `pd.core.tools.datetimes._guess_datetime_format` to `pd.core.tools.datetimes.guess_datetime_format` - [x] Introduced `std` & `median` in `DateTimeColumn`. - [x] Fixed incorrect handling of passing `StringMethods` as an input to methods in string APIs. - [x] Fixed a typo in calling `is_valid` of `Scalar`. - [x] Removed unnecessary special handling in `TimeDeltaColumn.sum` logic for empty inputs. - [x] Introduced passing `dtype='float64'` wherever there is an empty series being created since pandas will soon be defaulting to `object` dtype if no type is passed and we don't have a perfectly resembling `object` dtype as that of pandas. - [x] Fixed deprecation warnings of `Index.__or__` and `Index.__xor__` by replacing with `union` & `symmetric_difference` APIs. - [x] Introduced mapping of our `float32` & `float64` dtypes to pandas nullable dtypes `Float32Dtype` & `Float64Dtype` when `nullable=True` in `to_pandas`. - [x] With the introduction of nullable float dtypes, there is an issue in creating a `MultiIndex` from a dataframe: pandas-dev/pandas#39984, so introduced a workaround in our `MultiIndex.__repr__` code. - [x] Removed usages of `check_less_precise` in our code-base as this is deprecated and is replaced with `rtol` & `atol`. Retained its usages in our testing APIs for backward compatibility. - [x] Removed a good number of `xfail` cases which are actually passing right now because of resolved issues in both `pandas` & `cudf`. - [x] Did some miscellaneous code-cleanup in pytests. 
- [x] Fixed pytests that will fail when run in parallel due to access to shared pytest params being manipulated in-place. - [x] Followed a standard import pattern across pytest files: some files do `from pandas import Series` and some do `from cudf.core import Series`. So removed both patterns and now do only a simple `import cudf` & `import pandas as pd` to avoid confusion while debugging test failures across multiple files. (Made this change in all pytest files which I had to touch as part of the pandas upgrade; we can make similar changes in the future for the files which we touch). - [x] Fix issue with assigning `np.nan` values to a `CategoricalColumn` and fix related `__repr__` code: #7446 Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Keith Kraus (@kkraus14) - AJ Schmidt (@ajschmidt8) URL: #7375
1 parent a0589c6 commit d0a5ec8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+2043
-1488
lines changed

conda/environments/cudf_dev_cuda10.1.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dependencies:
1717
- python>=3.6,<3.8
1818
- numba>=0.49.0,!=0.51.0
1919
- numpy
20-
- pandas>=1.0,<1.2.0dev0
20+
- pandas>=1.0,<1.3.0dev0
2121
- pyarrow=1.0.1
2222
- fastavro>=0.22.9
2323
- notebook>=0.5.0

conda/environments/cudf_dev_cuda10.2.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dependencies:
1717
- python>=3.6,<3.8
1818
- numba>=0.49,!=0.51.0
1919
- numpy
20-
- pandas>=1.0,<1.2.0dev0
20+
- pandas>=1.0,<1.3.0dev0
2121
- pyarrow=1.0.1
2222
- fastavro>=0.22.9
2323
- notebook>=0.5.0

conda/environments/cudf_dev_cuda11.0.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ dependencies:
1717
- python>=3.6,<3.8
1818
- numba>=0.49,!=0.51.0
1919
- numpy
20-
- pandas>=1.0,<1.2.0dev0
20+
- pandas>=1.0,<1.3.0dev0
2121
- pyarrow=1.0.1
2222
- fastavro>=0.22.9
2323
- notebook>=0.5.0

conda/recipes/cudf/meta.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018, NVIDIA CORPORATION.
1+
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
22

33
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
44
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -35,7 +35,7 @@ requirements:
3535
- protobuf
3636
- python
3737
- typing_extensions
38-
- pandas >=1.0,<1.2.0dev0
38+
- pandas >=1.0,<1.3.0dev0
3939
- cupy >7.1.0,<9.0.0a0
4040
- numba >=0.49.0
4141
- numpy
@@ -45,6 +45,7 @@ requirements:
4545
- fsspec>=0.6.0
4646
- {{ pin_compatible('cudatoolkit', max_pin='x.x') }}
4747
- nvtx >=0.2.1
48+
- packaging
4849
- cachetools
4950

5051
test:

python/cudf/cudf/core/_compat.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
# Copyright (c) 2020, NVIDIA CORPORATION.
1+
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
22

33
import pandas as pd
44
from packaging import version
55

66
PANDAS_VERSION = version.parse(pd.__version__)
77
PANDAS_GE_100 = PANDAS_VERSION >= version.parse("1.0")
88
PANDAS_GE_110 = PANDAS_VERSION >= version.parse("1.1")
9+
PANDAS_GE_120 = PANDAS_VERSION >= version.parse("1.2")

python/cudf/cudf/core/column/categorical.py

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
Dict,
1010
Mapping,
1111
Optional,
12+
Sequence,
1213
Tuple,
1314
Union,
1415
cast,
@@ -867,6 +868,15 @@ def set_base_data(self, value):
867868
else:
868869
super().set_base_data(value)
869870

871+
def _process_values_for_isin(
872+
self, values: Sequence
873+
) -> Tuple[ColumnBase, ColumnBase]:
874+
lhs = self
875+
# We need to convert values to same type as self,
876+
# hence passing dtype=self.dtype
877+
rhs = cudf.core.column.as_column(values, dtype=self.dtype)
878+
return lhs, rhs
879+
870880
def set_base_mask(self, value: Optional[Buffer]):
871881
super().set_base_mask(value)
872882
self._codes = None
@@ -936,6 +946,21 @@ def unary_operator(self, unaryop: str):
936946
)
937947

938948
def __setitem__(self, key, value):
949+
if cudf.utils.dtypes.is_scalar(
950+
value
951+
) and cudf._lib.scalar._is_null_host_scalar(value):
952+
to_add_categories = 0
953+
else:
954+
to_add_categories = len(
955+
cudf.Index(value).difference(self.categories)
956+
)
957+
958+
if to_add_categories > 0:
959+
raise ValueError(
960+
"Cannot setitem on a Categorical with a new "
961+
"category, set the categories first"
962+
)
963+
939964
if cudf.utils.dtypes.is_scalar(value):
940965
value = self._encode(value) if value is not None else value
941966
else:
@@ -1046,11 +1071,24 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
10461071
def to_pandas(
10471072
self, index: ColumnLike = None, nullable: bool = False, **kwargs
10481073
) -> pd.Series:
1049-
signed_dtype = min_signed_type(len(self.categories))
1050-
codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array()
1051-
categories = self.categories.to_pandas()
1074+
1075+
if self.categories.dtype.kind == "f":
1076+
new_mask = bools_to_mask(self.notnull())
1077+
col = column.build_categorical_column(
1078+
categories=self.categories,
1079+
codes=column.as_column(self.codes, dtype=self.codes.dtype),
1080+
mask=new_mask,
1081+
ordered=self.dtype.ordered,
1082+
size=self.codes.size,
1083+
)
1084+
else:
1085+
col = self
1086+
1087+
signed_dtype = min_signed_type(len(col.categories))
1088+
codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
1089+
categories = col.categories.dropna(drop_nan=True).to_pandas()
10521090
data = pd.Categorical.from_codes(
1053-
codes, categories=categories, ordered=self.ordered
1091+
codes, categories=categories, ordered=col.ordered
10541092
)
10551093
return pd.Series(data, index=index)
10561094

@@ -1180,6 +1218,38 @@ def find_and_replace(
11801218
ordered=self.dtype.ordered,
11811219
)
11821220

1221+
def isnull(self) -> ColumnBase:
1222+
"""
1223+
Identify missing values in a CategoricalColumn.
1224+
"""
1225+
result = libcudf.unary.is_null(self)
1226+
1227+
if self.categories.dtype.kind == "f":
1228+
# Need to consider `np.nan` values incase
1229+
# of an underlying float column
1230+
categories = libcudf.unary.is_nan(self.categories)
1231+
if categories.any():
1232+
code = self._encode(np.nan)
1233+
result = result | (self.codes == cudf.Scalar(code))
1234+
1235+
return result
1236+
1237+
def notnull(self) -> ColumnBase:
1238+
"""
1239+
Identify non-missing values in a CategoricalColumn.
1240+
"""
1241+
result = libcudf.unary.is_valid(self)
1242+
1243+
if self.categories.dtype.kind == "f":
1244+
# Need to consider `np.nan` values incase
1245+
# of an underlying float column
1246+
categories = libcudf.unary.is_nan(self.categories)
1247+
if categories.any():
1248+
code = self._encode(np.nan)
1249+
result = result & (self.codes != cudf.Scalar(code))
1250+
1251+
return result
1252+
11831253
def fillna(
11841254
self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
11851255
) -> CategoricalColumn:
@@ -1204,6 +1274,12 @@ def fillna(
12041274
raise ValueError(err_msg) from err
12051275
else:
12061276
fill_value = column.as_column(fill_value, nan_as_null=False)
1277+
if isinstance(fill_value, CategoricalColumn):
1278+
if self.dtype != fill_value.dtype:
1279+
raise ValueError(
1280+
"Cannot set a Categorical with another, "
1281+
"without identical categories"
1282+
)
12071283
# TODO: only required if fill_value has a subset of the
12081284
# categories:
12091285
fill_value = fill_value.cat()._set_categories(

python/cudf/cudf/core/column/column.py

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
2+
23
from __future__ import annotations
34

45
import builtins
@@ -49,12 +50,12 @@
4950
get_time_unit,
5051
is_categorical_dtype,
5152
is_decimal_dtype,
53+
is_interval_dtype,
5254
is_list_dtype,
5355
is_numerical_dtype,
5456
is_scalar,
5557
is_string_dtype,
5658
is_struct_dtype,
57-
is_interval_dtype,
5859
min_signed_type,
5960
min_unsigned_type,
6061
np_to_pa_dtype,
@@ -848,55 +849,65 @@ def isin(self, values: Sequence) -> ColumnBase:
848849
-------
849850
result: Column
850851
Column of booleans indicating if each element is in values.
851-
Raises
852-
-------
853-
TypeError
854-
If values is a string
855852
"""
856-
if is_scalar(values):
857-
raise TypeError(
858-
"only list-like objects are allowed to be passed "
859-
f"to isin(), you passed a [{type(values).__name__}]"
860-
)
861-
862853
lhs = self
863854
rhs = None
864855

865856
try:
866-
# We need to convert values to same type as self,
867-
# hence passing dtype=self.dtype
868-
rhs = as_column(values, dtype=self.dtype)
869-
870-
# Short-circuit if rhs is all null.
871-
if lhs.null_count == 0 and (rhs.null_count == len(rhs)):
872-
return full(len(self), False, dtype="bool")
857+
lhs, rhs = self._process_values_for_isin(values)
858+
res = lhs._isin_earlystop(rhs)
859+
if res is not None:
860+
return res
873861
except ValueError:
874862
# pandas functionally returns all False when cleansing via
875863
# typecasting fails
876864
return full(len(self), False, dtype="bool")
877865

878-
# If categorical, combine categories first
879-
if is_categorical_dtype(lhs):
880-
lhs_cats = lhs.cat().categories._values
881-
rhs_cats = rhs.cat().categories._values
882-
883-
if not np.issubdtype(rhs_cats.dtype, lhs_cats.dtype):
884-
# If they're not the same dtype, short-circuit if the values
885-
# list doesn't have any nulls. If it does have nulls, make
886-
# the values list a Categorical with a single null
887-
if not rhs.has_nulls:
888-
return full(len(self), False, dtype="bool")
889-
rhs = as_column(pd.Categorical.from_codes([-1], categories=[]))
890-
rhs = rhs.cat().set_categories(lhs_cats).astype(self.dtype)
891-
892-
ldf = cudf.DataFrame({"x": lhs, "orig_order": arange(len(lhs))})
866+
res = lhs._obtain_isin_result(rhs)
867+
868+
return res
869+
870+
def _process_values_for_isin(
871+
self, values: Sequence
872+
) -> Tuple[ColumnBase, ColumnBase]:
873+
"""
874+
Helper function for `isin` which pre-process `values` based on `self`.
875+
"""
876+
lhs = self
877+
rhs = as_column(values, nan_as_null=False)
878+
if lhs.null_count == len(lhs):
879+
lhs = lhs.astype(rhs.dtype)
880+
elif rhs.null_count == len(rhs):
881+
rhs = rhs.astype(lhs.dtype)
882+
return lhs, rhs
883+
884+
def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
885+
"""
886+
Helper function for `isin` which determines possibility of
887+
early-stopping or not.
888+
"""
889+
if self.dtype != rhs.dtype:
890+
if self.null_count and rhs.null_count:
891+
return self.isna()
892+
else:
893+
return cudf.core.column.full(len(self), False, dtype="bool")
894+
elif self.null_count == 0 and (rhs.null_count == len(rhs)):
895+
return cudf.core.column.full(len(self), False, dtype="bool")
896+
else:
897+
return None
898+
899+
def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
900+
"""
901+
Helper function for `isin` which merges `self` & `rhs`
902+
to determine what values of `rhs` exist in `self`.
903+
"""
904+
ldf = cudf.DataFrame({"x": self, "orig_order": arange(len(self))})
893905
rdf = cudf.DataFrame(
894906
{"x": rhs, "bool": full(len(rhs), True, dtype="bool")}
895907
)
896908
res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order")
897909
res = res.drop_duplicates(subset="orig_order", ignore_index=True)
898910
res = res._data["bool"].fillna(False)
899-
900911
return res
901912

902913
def as_mask(self) -> Buffer:
@@ -1052,14 +1063,14 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:
10521063

10531064
# columns include null index in factorization; remove:
10541065
if self.has_nulls:
1055-
cats = cats.dropna()
1066+
cats = cats._column.dropna(drop_nan=False)
10561067
min_type = min_unsigned_type(len(cats), 8)
10571068
labels = labels - 1
10581069
if np.dtype(min_type).itemsize < labels.dtype.itemsize:
10591070
labels = labels.astype(min_type)
10601071

10611072
return build_categorical_column(
1062-
categories=cats._column,
1073+
categories=cats,
10631074
codes=labels._column,
10641075
mask=self.mask,
10651076
ordered=ordered,
@@ -1250,7 +1261,7 @@ def sum(
12501261
def product(
12511262
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
12521263
):
1253-
raise TypeError(f"cannot perform prod with type {self.dtype}")
1264+
raise TypeError(f"cannot perform product with type {self.dtype}")
12541265

12551266
def mean(self, skipna: bool = None, dtype: Dtype = None):
12561267
raise TypeError(f"cannot perform mean with type {self.dtype}")
@@ -1262,7 +1273,7 @@ def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
12621273
raise TypeError(f"cannot perform var with type {self.dtype}")
12631274

12641275
def kurtosis(self, skipna: bool = None):
1265-
raise TypeError(f"cannot perform kurt with type {self.dtype}")
1276+
raise TypeError(f"cannot perform kurtosis with type {self.dtype}")
12661277

12671278
def skew(self, skipna: bool = None):
12681279
raise TypeError(f"cannot perform skew with type {self.dtype}")
@@ -2066,9 +2077,11 @@ def _construct_array(
20662077
arbitrary = cupy.asarray(arbitrary, dtype=dtype)
20672078
except (TypeError, ValueError):
20682079
native_dtype = dtype
2069-
if dtype is None and pd.api.types.infer_dtype(arbitrary) in (
2070-
"mixed",
2071-
"mixed-integer",
2080+
if (
2081+
dtype is None
2082+
and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
2083+
and pd.api.types.infer_dtype(arbitrary)
2084+
in ("mixed", "mixed-integer",)
20722085
):
20732086
native_dtype = "object"
20742087
arbitrary = np.asarray(

0 commit comments

Comments
 (0)