Skip to content

Commit 5eab8f9

Browse files
committed
Merge remote-tracking branch 'upstream/master' into 41423-doc-series-copy
2 parents ac82b6b + 671cf86 commit 5eab8f9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+1096
-941
lines changed

.github/workflows/database.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ jobs:
7070
- uses: conda-incubator/setup-miniconda@v2
7171
with:
7272
activate-environment: pandas-dev
73-
channel-priority: strict
73+
channel-priority: flexible
7474
environment-file: ${{ matrix.ENV_FILE }}
7575
use-only-tar-bz2: true
7676

.pre-commit-config.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ repos:
2121
- repo: https://github.com/pre-commit/pre-commit-hooks
2222
rev: v3.4.0
2323
hooks:
24+
- id: debug-statements
2425
- id: end-of-file-fixer
2526
exclude: \.txt$
2627
- id: trailing-whitespace

asv_bench/benchmarks/strings.py

+45-37
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,19 @@
1111
from .pandas_vb_common import tm
1212

1313

14+
class Dtypes:
    """
    Base benchmark parametrized over the string dtypes under test.

    Subclasses reuse ``params``/``param_names`` and call ``setup`` to get
    ``self.s``, a Series of 10 ** 5 strings with the requested dtype.
    """

    param_names = ["dtype"]
    params = ["str", "string", "arrow_string"]

    def setup(self, dtype):
        # Unused import (hence the noqa); presumably kept for its import-time
        # side effects so that "arrow_string" can be resolved -- TODO confirm.
        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401

        try:
            values = tm.makeStringIndex(10 ** 5)
            series = Series(values, dtype=dtype)
        except ImportError:
            # An ImportError while building the Series is reported as
            # NotImplementedError instead (asv documents this as "skip").
            raise NotImplementedError
        self.s = series
25+
26+
1427
class Construction:
1528

1629
params = ["str", "string"]
@@ -49,18 +62,7 @@ def peakmem_cat_frame_construction(self, dtype):
4962
DataFrame(self.frame_cat_arr, dtype=dtype)
5063

5164

52-
class Methods:
53-
params = ["str", "string", "arrow_string"]
54-
param_names = ["dtype"]
55-
56-
def setup(self, dtype):
57-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
58-
59-
try:
60-
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
61-
except ImportError:
62-
raise NotImplementedError
63-
65+
class Methods(Dtypes):
6466
def time_center(self, dtype):
6567
self.s.str.center(100)
6668

@@ -83,6 +85,9 @@ def time_find(self, dtype):
8385
def time_rfind(self, dtype):
8486
self.s.str.rfind("[A-Z]+")
8587

88+
def time_fullmatch(self, dtype):
89+
self.s.str.fullmatch("A")
90+
8691
def time_get(self, dtype):
8792
self.s.str.get(0)
8893

@@ -211,35 +216,26 @@ def time_cat(self, other_cols, sep, na_rep, na_frac):
211216
self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep)
212217

213218

214-
class Contains:
219+
class Contains(Dtypes):
215220

216-
params = (["str", "string", "arrow_string"], [True, False])
221+
params = (Dtypes.params, [True, False])
217222
param_names = ["dtype", "regex"]
218223

219224
def setup(self, dtype, regex):
220-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
221-
222-
try:
223-
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype)
224-
except ImportError:
225-
raise NotImplementedError
225+
super().setup(dtype)
226226

227227
def time_contains(self, dtype, regex):
228228
self.s.str.contains("A", regex=regex)
229229

230230

231-
class Split:
231+
class Split(Dtypes):
232232

233-
params = (["str", "string", "arrow_string"], [True, False])
233+
params = (Dtypes.params, [True, False])
234234
param_names = ["dtype", "expand"]
235235

236236
def setup(self, dtype, expand):
237-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
238-
239-
try:
240-
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("--")
241-
except ImportError:
242-
raise NotImplementedError
237+
super().setup(dtype)
238+
self.s = self.s.str.join("--")
243239

244240
def time_split(self, dtype, expand):
245241
self.s.str.split("--", expand=expand)
@@ -248,17 +244,23 @@ def time_rsplit(self, dtype, expand):
248244
self.s.str.rsplit("--", expand=expand)
249245

250246

251-
class Dummies:
252-
params = ["str", "string", "arrow_string"]
253-
param_names = ["dtype"]
247+
class Extract(Dtypes):
254248

255-
def setup(self, dtype):
256-
from pandas.core.arrays.string_arrow import ArrowStringDtype # noqa: F401
249+
params = (Dtypes.params, [True, False])
250+
param_names = ["dtype", "expand"]
257251

258-
try:
259-
self.s = Series(tm.makeStringIndex(10 ** 5), dtype=dtype).str.join("|")
260-
except ImportError:
261-
raise NotImplementedError
252+
def setup(self, dtype, expand):
253+
super().setup(dtype)
254+
255+
def time_extract_single_group(self, dtype, expand):
256+
with warnings.catch_warnings(record=True):
257+
self.s.str.extract("(\\w*)A", expand=expand)
258+
259+
260+
class Dummies(Dtypes):
261+
def setup(self, dtype):
262+
super().setup(dtype)
263+
self.s = self.s.str.join("|")
262264

263265
def time_get_dummies(self, dtype):
264266
self.s.str.get_dummies("|")
@@ -279,3 +281,9 @@ def setup(self):
279281
def time_vector_slice(self):
280282
# GH 2602
281283
self.s.str[:5]
284+
285+
286+
class Iter(Dtypes):
    """Benchmark plain Python iteration over the Series built in ``setup``."""

    def time_iter(self, dtype):
        # The loop body is intentionally empty: only the iteration is timed.
        for _ in self.s:
            pass

ci/deps/actions-37-db-min.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ dependencies:
3131
- openpyxl
3232
- pandas-gbq
3333
- google-cloud-bigquery>=1.27.2 # GH 36436
34-
- pyarrow=0.17 # GH 38803
34+
- protobuf>=3.12.4
35+
- pyarrow=0.17.1 # GH 38803
3536
- pytables>=3.5.1
3637
- scipy
3738
- xarray=0.12.3

ci/deps/actions-37-db.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ dependencies:
3131
- pandas-gbq
3232
- google-cloud-bigquery>=1.27.2 # GH 36436
3333
- psycopg2
34-
- pyarrow>=0.15.0
34+
- pyarrow>=0.17.0
3535
- pymysql
3636
- pytables
3737
- python-snappy

ci/deps/actions-37-minimum_versions.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies:
2323
- pytables=3.5.1
2424
- python-dateutil=2.7.3
2525
- pytz=2017.3
26-
- pyarrow=0.15
26+
- pyarrow=0.17.0
2727
- scipy=1.2
2828
- xlrd=1.2.0
2929
- xlsxwriter=1.0.2

ci/deps/actions-37.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dependencies:
1818
- numpy=1.19
1919
- python-dateutil
2020
- nomkl
21-
- pyarrow=0.15.1
21+
- pyarrow
2222
- pytz
2323
- s3fs>=0.4.0
2424
- moto>=1.3.14

ci/deps/azure-macos-37.yaml

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: pandas-dev
22
channels:
33
- defaults
4+
- conda-forge
45
dependencies:
56
- python=3.7.*
67

@@ -21,7 +22,7 @@ dependencies:
2122
- numexpr
2223
- numpy=1.17.3
2324
- openpyxl
24-
- pyarrow=0.15.1
25+
- pyarrow=0.17.0
2526
- pytables
2627
- python-dateutil==2.7.3
2728
- pytz

ci/deps/azure-windows-37.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ dependencies:
2626
- numexpr
2727
- numpy=1.17.*
2828
- openpyxl
29-
- pyarrow=0.15
29+
- pyarrow=0.17.0
3030
- pytables
3131
- python-dateutil
3232
- pytz

ci/deps/azure-windows-38.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ dependencies:
2525
- numpy=1.18.*
2626
- openpyxl
2727
- jinja2
28-
- pyarrow>=0.15.0
28+
- pyarrow>=0.17.0
2929
- pytables
3030
- python-dateutil
3131
- pytz

ci/run_tests.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile $TE
2424
if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then
2525
# GH#37455 windows py38 build appears to be running out of memory
2626
# skip collection of window moments and plotting tests
27-
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/ --ignore=pandas/tests/plotting/"
27+
PYTEST_CMD="$PYTEST_CMD --ignore=pandas/tests/window/moments --ignore=pandas/tests/plotting/"
2828
fi
2929

3030
echo $PYTEST_CMD

doc/redirects.csv

+1
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,7 @@ generated/pandas.Series.str.extractall,../reference/api/pandas.Series.str.extrac
11971197
generated/pandas.Series.str.extract,../reference/api/pandas.Series.str.extract
11981198
generated/pandas.Series.str.findall,../reference/api/pandas.Series.str.findall
11991199
generated/pandas.Series.str.find,../reference/api/pandas.Series.str.find
1200+
generated/pandas.Series.str.fullmatch,../reference/api/pandas.Series.str.fullmatch
12001201
generated/pandas.Series.str.get_dummies,../reference/api/pandas.Series.str.get_dummies
12011202
generated/pandas.Series.str.get,../reference/api/pandas.Series.str.get
12021203
generated/pandas.Series.str,../reference/api/pandas.Series.str

doc/source/getting_started/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ PyTables 3.5.1 HDF5-based reading / writing
358358
blosc 1.17.0 Compression for HDF5
359359
zlib Compression for HDF5
360360
fastparquet 0.4.0 Parquet reading / writing
361-
pyarrow 0.15.0 Parquet, ORC, and feather reading / writing
361+
pyarrow 0.17.0 Parquet, ORC, and feather reading / writing
362362
pyreadstat SPSS files (.sav) reading
363363
========================= ================== =============================================================
364364

doc/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,7 @@ strings and apply several methods to it. These can be accessed like
415415
Series.str.extractall
416416
Series.str.find
417417
Series.str.findall
418+
Series.str.fullmatch
418419
Series.str.get
419420
Series.str.index
420421
Series.str.join

doc/source/whatsnew/v1.3.0.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Other enhancements
224224
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
225225
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
226226
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
227+
- :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`)
227228
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision equal to the maximum precision of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
228229
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
229230
- Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`)
@@ -578,7 +579,7 @@ Optional libraries below the lowest tested version may still work, but are not c
578579
+-----------------+-----------------+---------+
579580
| openpyxl | 3.0.0 | X |
580581
+-----------------+-----------------+---------+
581-
| pyarrow | 0.15.0 | |
582+
| pyarrow | 0.17.0 | X |
582583
+-----------------+-----------------+---------+
583584
| pymysql | 0.8.1 | X |
584585
+-----------------+-----------------+---------+
@@ -672,6 +673,7 @@ Performance improvements
672673
- Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`)
673674
- Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`)
674675
- Performance improvement in :meth:`Series.nunique` with nan values (:issue:`40865`)
676+
- Performance improvement in :meth:`DataFrame.transpose`, :meth:`Series.unstack` with ``DatetimeTZDtype`` (:issue:`40149`)
675677

676678
.. ---------------------------------------------------------------------------
677679
@@ -971,6 +973,7 @@ Other
971973
- Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`)
972974
- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`)
973975
- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`)
976+
- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`)
974977

975978
.. ---------------------------------------------------------------------------
976979

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ dependencies:
100100
- odfpy
101101

102102
- fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
103-
- pyarrow>=0.15.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
103+
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
104104
- python-snappy # required by pyarrow
105105

106106
- pyqt>=5.9.2 # pandas.read_clipboard

pandas/compat/_optional.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"odfpy": "1.3.0",
2222
"openpyxl": "3.0.0",
2323
"pandas_gbq": "0.12.0",
24-
"pyarrow": "0.15.0",
24+
"pyarrow": "0.17.0",
2525
"pytest": "5.0.1",
2626
"pyxlsb": "1.0.6",
2727
"s3fs": "0.4.0",

pandas/core/array_algos/take.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
from pandas._typing import ArrayLike
1717

1818
from pandas.core.dtypes.cast import maybe_promote
19-
from pandas.core.dtypes.common import ensure_platform_int
19+
from pandas.core.dtypes.common import (
20+
ensure_platform_int,
21+
is_1d_only_ea_obj,
22+
)
2023
from pandas.core.dtypes.missing import na_value_for_dtype
2124

2225
from pandas.core.construction import ensure_wrapped_if_datetimelike
@@ -91,12 +94,14 @@ def take_nd(
9194

9295
if not isinstance(arr, np.ndarray):
9396
# i.e. ExtensionArray,
94-
if arr.ndim == 2:
95-
# e.g. DatetimeArray, TimedeltArray
97+
# EAs that are not 1D-only (DatetimeArray, TimedeltaArray) take along `axis`
98+
if not is_1d_only_ea_obj(arr):
99+
# i.e. DatetimeArray, TimedeltaArray
96100
arr = cast("NDArrayBackedExtensionArray", arr)
97101
return arr.take(
98102
indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis
99103
)
104+
100105
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
101106

102107
arr = np.asarray(arr)

pandas/core/arrays/string_arrow.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -820,33 +820,38 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
820820
result[isna(result)] = bool(na)
821821
return result
822822

823-
def _str_startswith(self, pat, na=None):
823+
def _str_startswith(self, pat: str, na=None):
824824
if pa_version_under4p0:
825825
return super()._str_startswith(pat, na)
826826

827-
result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
828-
result = BooleanDtype().__from_arrow__(result)
829-
if not isna(na):
830-
result[isna(result)] = bool(na)
831-
return result
827+
pat = "^" + re.escape(pat)
828+
return self._str_contains(pat, na=na, regex=True)
832829

833-
def _str_endswith(self, pat, na=None):
830+
def _str_endswith(self, pat: str, na=None):
834831
if pa_version_under4p0:
835832
return super()._str_endswith(pat, na)
836833

837-
result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
838-
result = BooleanDtype().__from_arrow__(result)
839-
if not isna(na):
840-
result[isna(result)] = bool(na)
841-
return result
834+
pat = re.escape(pat) + "$"
835+
return self._str_contains(pat, na=na, regex=True)
842836

843837
def _str_match(
    self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
):
    """
    Match ``pat`` at the start of each string.

    When ``pa_version_under4p0`` is true the parent implementation is used.
    Otherwise the pattern is given a leading ``^`` (unless it already starts
    with one) and evaluated through ``_str_contains`` in regex mode.
    """
    if pa_version_under4p0:
        return super()._str_match(pat, case, flags, na)

    anchored = pat if pat.startswith("^") else "^" + pat
    return self._str_contains(anchored, case, flags, na, regex=True)
849846

847+
def _str_fullmatch(self, pat, case: bool = True, flags: int = 0, na: Scalar = None):
    """
    Test whether each string matches ``pat`` in its entirety.

    When ``pa_version_under4p0`` is true the parent implementation is used.
    Otherwise the pattern is end-anchored with ``$`` (if it is not already)
    and handed to ``_str_match``, which takes care of the start anchor.

    Parameters
    ----------
    pat : str
        Regular expression.
    case, flags, na
        Forwarded unchanged to ``_str_match`` / the parent implementation.
    """
    if pa_version_under4p0:
        return super()._str_fullmatch(pat, case, flags, na)

    # A trailing "$" is only an anchor when it is not escaped, i.e. when it is
    # preceded by an even number of backslashes ("a$", "a\\\\$"). With an odd
    # number ("a\\$") it is a literal dollar and the anchor must still be
    # appended. The previous check compared against "//$" (forward slashes),
    # so escaped dollars were never detected.
    if pat.endswith("$"):
        stem = pat[:-1]
        n_backslashes = len(stem) - len(stem.rstrip("\\"))
        end_anchored = n_backslashes % 2 == 0
    else:
        end_anchored = False

    if not end_anchored:
        pat = pat + "$"

    # NOTE(review): for a top-level alternation such as "a|b" the appended
    # "$" binds only to the last alternative; anchoring all alternatives
    # would require grouping the pattern -- confirm and handle separately.
    return self._str_match(pat, case, flags, na)
854+
850855
def _str_isalnum(self):
    """Apply ``pc.utf8_is_alnum`` to the backing data and wrap it as a boolean array."""
    return BooleanDtype().__from_arrow__(pc.utf8_is_alnum(self._data))

0 commit comments

Comments
 (0)