From d9629d6945f4d624290510bdcad76812d262590f Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 May 2021 13:44:11 -0700 Subject: [PATCH 1/4] REF: more explicit dtypes in strings.accessor --- pandas/core/series.py | 2 +- pandas/core/strings/accessor.py | 38 ++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f45a2adbdec7..96bf1858de39c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3980,7 +3980,7 @@ def explode(self, ignore_index: bool = False) -> Series: else: index = self.index.repeat(counts) - return self._constructor(values, index=index, name=self.name) + return self._constructor(values, index=index, name=self.name, dtype=object) def unstack(self, level=-1, fill_value=None) -> DataFrame: """ diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7643019ff8c55..d6a70e7a4fcde 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,7 +13,10 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import FrameOrSeriesUnion +from pandas._typing import ( + DtypeObj, + FrameOrSeriesUnion, +) from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -34,6 +37,7 @@ from pandas.core.dtypes.missing import isna from pandas.core.base import NoNewAttributesMixin +from pandas.core.construction import extract_array if TYPE_CHECKING: from pandas import Index @@ -122,7 +126,7 @@ def _map_and_wrap(name, docstring): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result) + return self._wrap_result(result, returns_bool=True) wrapper.__doc__ = docstring return wrapper @@ -209,8 +213,8 @@ def _validate(data): # see _libs/lib.pyx for list of inferred types allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal + data = extract_array(data, extract_numpy=True) + values = getattr(data, "categories", data) # categorical / normal inferred_dtype = lib.infer_dtype(values, skipna=True) @@ -242,6 +246,7 @@ def _wrap_result( expand: bool | None = None, fill_value=np.nan, returns_string=True, + returns_bool: bool = False, ): from pandas import ( Index, @@ -319,11 +324,18 @@ def cons_row(x): else: index = self._orig.index # This is a mess. - dtype: str | None - if self._is_string and returns_string: - dtype = self._orig.dtype + dtype: DtypeObj | str | None + if self._is_string: + if returns_bool: + dtype = "boolean" + elif returns_string: + dtype = self._orig.dtype + else: + dtype = result.dtype + elif returns_bool: + dtype = result.dtype # i.e. bool else: - dtype = None + dtype = getattr(result, "dtype", None) if expand: cons = self._orig._constructor_expanddim @@ -331,7 +343,7 @@ def cons_row(x): else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index) + result = cons(result, name=name, index=index, dtype=dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -369,7 +381,7 @@ def _get_series_list(self, others): if isinstance(others, ABCSeries): return [others] elif isinstance(others, ABCIndex): - return [Series(others._values, index=idx)] + return [Series(others._values, index=idx, dtype=others.dtype)] elif isinstance(others, ABCDataFrame): return [others[x] for x in others] elif isinstance(others, np.ndarray) and others.ndim == 2: @@ -547,7 +559,7 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): sep = "" if isinstance(self._orig, ABCIndex): - data = Series(self._orig, index=self._orig) + data = Series(self._orig, index=self._orig, dtype=self._orig.dtype) else: # Series data = self._orig @@ -2145,7 +2157,7 @@ def startswith(self, pat, na=None): dtype: bool """ result = self._data.array._str_startswith(pat, na=na) - return self._wrap_result(result, returns_string=False) + return self._wrap_result(result, returns_string=False, returns_bool=True) @forbid_nonstring_types(["bytes"]) def endswith(self, pat, na=None): @@ -2202,7 +2214,7 @@ def endswith(self, pat, na=None): dtype: bool """ result = self._data.array._str_endswith(pat, na=na) - return self._wrap_result(result, returns_string=False) + return self._wrap_result(result, returns_string=False, returns_bool=True) @forbid_nonstring_types(["bytes"]) def findall(self, pat, flags=0): From 4204274322602e0ec8909c76a6a288370e1f8497 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 29 May 2021 19:01:51 -0700 Subject: [PATCH 2/4] revert explode edit --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 96bf1858de39c..2f45a2adbdec7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3980,7 +3980,7 @@ def explode(self, ignore_index: bool = False) -> Series: else: index = self.index.repeat(counts) - return self._constructor(values, index=index, name=self.name, dtype=object) + return self._constructor(values, index=index, name=self.name) def unstack(self, level=-1, fill_value=None) -> DataFrame: """ From 234697013018cef6c27491d6a508e3af728114e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Jun 2021 14:00:53 -0700 Subject: [PATCH 3/4] fix test_numpy tests --- pandas/core/strings/accessor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index d6a70e7a4fcde..69fb4f6b61224 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -37,7 +37,6 @@ from pandas.core.dtypes.missing import isna from pandas.core.base import NoNewAttributesMixin -from pandas.core.construction import extract_array if TYPE_CHECKING: from pandas import Index @@ -213,7 +212,11 @@ def _validate(data): # see _libs/lib.pyx for list of inferred types allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - data = extract_array(data, extract_numpy=True) + # TODO: avoid kludge for tests.extension.test_numpy + from pandas.core.internals.managers import _extract_array + + data = _extract_array(data) + values = getattr(data, "categories", data) # categorical / normal inferred_dtype = lib.infer_dtype(values, skipna=True) From 8ed20e6dafce124ae63307a0c45b1962051c5f21 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jun 2021 14:20:54 -0700 Subject: [PATCH 4/4] remove returns_bool --- pandas/core/strings/accessor.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 69fb4f6b61224..29d37599b0785 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -125,7 +125,7 @@ def _map_and_wrap(name, docstring): @forbid_nonstring_types(["bytes"], name=name) def wrapper(self): result = getattr(self._data.array, f"_str_{name}")() - return self._wrap_result(result, returns_bool=True) + return self._wrap_result(result) wrapper.__doc__ = docstring return wrapper @@ -328,17 +328,16 @@ def cons_row(x): index = self._orig.index # This is a mess. dtype: DtypeObj | str | None + vdtype = getattr(result, "dtype", None) if self._is_string: - if returns_bool: - dtype = "boolean" + if is_bool_dtype(vdtype): + dtype = result.dtype elif returns_string: dtype = self._orig.dtype else: - dtype = result.dtype - elif returns_bool: - dtype = result.dtype # i.e. bool + dtype = vdtype else: - dtype = getattr(result, "dtype", None) + dtype = vdtype if expand: cons = self._orig._constructor_expanddim @@ -2160,7 +2159,7 @@ def startswith(self, pat, na=None): dtype: bool """ result = self._data.array._str_startswith(pat, na=na) - return self._wrap_result(result, returns_string=False, returns_bool=True) + return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) def endswith(self, pat, na=None): @@ -2217,7 +2216,7 @@ def endswith(self, pat, na=None): dtype: bool """ result = self._data.array._str_endswith(pat, na=na) - return self._wrap_result(result, returns_string=False, returns_bool=True) + return self._wrap_result(result, returns_string=False) @forbid_nonstring_types(["bytes"]) def findall(self, pat, flags=0):