|
3 | 3 | from collections.abc import Callable # noqa: PDF001
|
4 | 4 | import re
|
5 | 5 | from typing import (
|
6 |
| - TYPE_CHECKING, |
7 | 6 | Any,
|
8 | 7 | Union,
|
9 | 8 | overload,
|
|
28 | 27 | from pandas.compat import (
|
29 | 28 | pa_version_under1p01,
|
30 | 29 | pa_version_under2p0,
|
31 |
| - pa_version_under3p0, |
32 | 30 | pa_version_under4p0,
|
33 | 31 | )
|
34 | 32 | from pandas.util._decorators import doc
|
|
77 | 75 | }
|
78 | 76 |
|
79 | 77 |
|
80 |
| -if TYPE_CHECKING: |
81 |
| - from pandas import Series |
82 |
| - |
83 | 78 | ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
|
84 | 79 |
|
85 | 80 |
|
@@ -140,6 +135,8 @@ class ArrowStringArray(
|
140 | 135 | Length: 4, dtype: string
|
141 | 136 | """
|
142 | 137 |
|
| 138 | + _pa_dtype = pa.string() |
| 139 | + |
143 | 140 | def __init__(self, values) -> None:
|
144 | 141 | self._dtype = StringDtype(storage="pyarrow")
|
145 | 142 | if isinstance(values, pa.Array):
|
@@ -170,11 +167,11 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
|
170 | 167 | na_values = scalars._mask
|
171 | 168 | result = scalars._data
|
172 | 169 | result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
|
173 |
| - return cls(pa.array(result, mask=na_values, type=pa.string())) |
| 170 | + return cls(pa.array(result, mask=na_values, type=cls._pa_dtype)) |
174 | 171 |
|
175 | 172 | # convert non-na-likes to str
|
176 | 173 | result = lib.ensure_string_array(scalars, copy=copy)
|
177 |
| - return cls(pa.array(result, type=pa.string(), from_pandas=True)) |
| 174 | + return cls(pa.array(result, type=cls._pa_dtype, from_pandas=True)) |
178 | 175 |
|
179 | 176 | @classmethod
|
180 | 177 | def _from_sequence_of_strings(
|
@@ -269,7 +266,7 @@ def __getitem__(
|
269 | 266 |
|
270 | 267 | if isinstance(item, np.ndarray):
|
271 | 268 | if not len(item):
|
272 |
| - return type(self)(pa.chunked_array([], type=pa.string())) |
| 269 | + return type(self)(pa.chunked_array([], type=self._pa_dtype)) |
273 | 270 | elif is_integer_dtype(item.dtype):
|
274 | 271 | return self.take(item)
|
275 | 272 | elif is_bool_dtype(item.dtype):
|
@@ -455,70 +452,6 @@ def take(
|
455 | 452 | indices_array[indices_array < 0] += len(self._data)
|
456 | 453 | return type(self)(self._data.take(indices_array))
|
457 | 454 |
|
458 |
| - def isin(self, values): |
459 |
| - if pa_version_under2p0: |
460 |
| - return super().isin(values) |
461 |
| - |
462 |
| - value_set = [ |
463 |
| - pa_scalar.as_py() |
464 |
| - for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values] |
465 |
| - if pa_scalar.type in (pa.string(), pa.null()) |
466 |
| - ] |
467 |
| - |
468 |
| - # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True |
469 |
| - # for null values, so we short-circuit to return all False array. |
470 |
| - if not len(value_set): |
471 |
| - return np.zeros(len(self), dtype=bool) |
472 |
| - |
473 |
| - kwargs = {} |
474 |
| - if pa_version_under3p0: |
475 |
| - # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises |
476 |
| - # with unexpected keyword argument in pyarrow 3.0.0+ |
477 |
| - kwargs["skip_null"] = True |
478 |
| - |
479 |
| - result = pc.is_in(self._data, value_set=pa.array(value_set), **kwargs) |
480 |
| - # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls |
481 |
| - # to False |
482 |
| - return np.array(result, dtype=np.bool_) |
483 |
| - |
484 |
| - def value_counts(self, dropna: bool = True) -> Series: |
485 |
| - """ |
486 |
| - Return a Series containing counts of each unique value. |
487 |
| -
|
488 |
| - Parameters |
489 |
| - ---------- |
490 |
| - dropna : bool, default True |
491 |
| - Don't include counts of missing values. |
492 |
| -
|
493 |
| - Returns |
494 |
| - ------- |
495 |
| - counts : Series |
496 |
| -
|
497 |
| - See Also |
498 |
| - -------- |
499 |
| - Series.value_counts |
500 |
| - """ |
501 |
| - from pandas import ( |
502 |
| - Index, |
503 |
| - Series, |
504 |
| - ) |
505 |
| - |
506 |
| - vc = self._data.value_counts() |
507 |
| - |
508 |
| - values = vc.field(0) |
509 |
| - counts = vc.field(1) |
510 |
| - if dropna and self._data.null_count > 0: |
511 |
| - mask = values.is_valid() |
512 |
| - values = values.filter(mask) |
513 |
| - counts = counts.filter(mask) |
514 |
| - |
515 |
| - # No missing values so we can adhere to the interface and return a numpy array. |
516 |
| - counts = np.array(counts) |
517 |
| - |
518 |
| - index = Index(type(self)(values)) |
519 |
| - |
520 |
| - return Series(counts, index=index).astype("Int64") |
521 |
| - |
522 | 455 | def astype(self, dtype, copy: bool = True):
|
523 | 456 | dtype = pandas_dtype(dtype)
|
524 | 457 |
|
@@ -590,7 +523,7 @@ def _str_map(
|
590 | 523 | result = lib.map_infer_mask(
|
591 | 524 | arr, f, mask.view("uint8"), convert=False, na_value=na_value
|
592 | 525 | )
|
593 |
| - result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) |
| 526 | + result = pa.array(result, mask=mask, type=self._pa_dtype, from_pandas=True) |
594 | 527 | return type(self)(result)
|
595 | 528 | else:
|
596 | 529 | # This is when the result type is object. We reach this when
|
|
0 commit comments