Skip to content

Commit 7b8f437

Browse files
committed
Merge branch 'master' into reduction_dtypes_II
2 parents aa1ccd3 + fa29f09 commit 7b8f437

File tree

190 files changed

+1588
-1007
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

190 files changed

+1588
-1007
lines changed

.pre-commit-config.yaml

+6-4
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ repos:
2626
language: python
2727
require_serial: true
2828
types_or: [python, pyi]
29-
additional_dependencies: [black==23.1.0]
29+
additional_dependencies: [black==23.3.0]
3030
- repo: https://github.com/charliermarsh/ruff-pre-commit
3131
rev: v0.0.264
3232
hooks:
@@ -83,9 +83,6 @@ repos:
8383
hooks:
8484
- id: pylint
8585
stages: [manual]
86-
- repo: https://github.com/pycqa/pylint
87-
rev: v2.16.2
88-
hooks:
8986
- id: pylint
9087
alias: redefined-outer-name
9188
name: Redefining name from outer scope
@@ -99,6 +96,11 @@ repos:
9996
|^pandas/conftest\.py # keep excluded
10097
args: [--disable=all, --enable=redefined-outer-name]
10198
stages: [manual]
99+
- id: pylint
100+
alias: unspecified-encoding
101+
name: Using open without explicitly specifying an encoding
102+
args: [--disable=all, --enable=unspecified-encoding]
103+
stages: [manual]
102104
- repo: https://github.com/PyCQA/isort
103105
rev: 5.12.0
104106
hooks:

asv_bench/benchmarks/dtypes.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525

2626
class Dtypes:
27-
params = _dtypes + list(map(lambda dt: dt.name, _dtypes))
27+
params = _dtypes + [dt.name for dt in _dtypes]
2828
param_names = ["dtype"]
2929

3030
def time_pandas_dtype(self, dtype):

asv_bench/benchmarks/io/csv.py

+60-7
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
DataFrame,
1313
concat,
1414
date_range,
15+
period_range,
1516
read_csv,
1617
to_datetime,
1718
)
@@ -98,24 +99,76 @@ def time_frame_date_no_format_index(self):
9899
self.data.to_csv(self.fname)
99100

100101

102+
class ToCSVPeriod(BaseIO):
103+
fname = "__test__.csv"
104+
105+
params = ([1000, 10000], ["D", "H"])
106+
param_names = ["nobs", "freq"]
107+
108+
def setup(self, nobs, freq):
109+
rng = period_range(start="2000-01-01", periods=nobs, freq=freq)
110+
self.data = DataFrame(rng)
111+
if freq == "D":
112+
self.default_fmt = "%Y-%m-%d"
113+
elif freq == "H":
114+
self.default_fmt = "%Y-%m-%d %H:00"
115+
116+
def time_frame_period_formatting_default(self, nobs, freq):
117+
self.data.to_csv(self.fname)
118+
119+
def time_frame_period_formatting_default_explicit(self, nobs, freq):
120+
self.data.to_csv(self.fname, date_format=self.default_fmt)
121+
122+
def time_frame_period_formatting(self, nobs, freq):
123+
# Nb: `date_format` is not actually taken into account here today, so the
124+
# performance is currently identical to `time_frame_period_formatting_default`
125+
# above. This timer is therefore expected to degrade when GH#51621 is fixed.
126+
# (Remove this comment when GH#51621 is fixed.)
127+
self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S")
128+
129+
130+
class ToCSVPeriodIndex(BaseIO):
131+
fname = "__test__.csv"
132+
133+
params = ([1000, 10000], ["D", "H"])
134+
param_names = ["nobs", "freq"]
135+
136+
def setup(self, nobs, freq):
137+
rng = period_range(start="2000-01-01", periods=nobs, freq=freq)
138+
self.data = DataFrame({"a": 1}, index=rng)
139+
if freq == "D":
140+
self.default_fmt = "%Y-%m-%d"
141+
elif freq == "H":
142+
self.default_fmt = "%Y-%m-%d %H:00"
143+
144+
def time_frame_period_formatting_index(self, nobs, freq):
145+
self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S")
146+
147+
def time_frame_period_formatting_index_default(self, nobs, freq):
148+
self.data.to_csv(self.fname)
149+
150+
def time_frame_period_formatting_index_default_explicit(self, nobs, freq):
151+
self.data.to_csv(self.fname, date_format=self.default_fmt)
152+
153+
101154
class ToCSVDatetimeBig(BaseIO):
102155
fname = "__test__.csv"
103156
timeout = 1500
104157
params = [1000, 10000, 100000]
105-
param_names = ["obs"]
158+
param_names = ["nobs"]
106159

107-
def setup(self, obs):
160+
def setup(self, nobs):
108161
d = "2018-11-29"
109162
dt = "2018-11-26 11:18:27.0"
110163
self.data = DataFrame(
111164
{
112-
"dt": [np.datetime64(dt)] * obs,
113-
"d": [np.datetime64(d)] * obs,
114-
"r": [np.random.uniform()] * obs,
165+
"dt": [np.datetime64(dt)] * nobs,
166+
"d": [np.datetime64(d)] * nobs,
167+
"r": [np.random.uniform()] * nobs,
115168
}
116169
)
117170

118-
def time_frame(self, obs):
171+
def time_frame(self, nobs):
119172
self.data.to_csv(self.fname)
120173

121174

@@ -444,7 +497,7 @@ class ReadCSVMemoryGrowth(BaseIO):
444497
param_names = ["engine"]
445498

446499
def setup(self, engine):
447-
with open(self.fname, "w") as f:
500+
with open(self.fname, "w", encoding="utf-8") as f:
448501
for i in range(self.num_rows):
449502
f.write(f"{i}\n")
450503

asv_bench/benchmarks/pandas_vb_common.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
try:
1818
import pandas._testing as tm
1919
except ImportError:
20-
import pandas.util.testing as tm # noqa:F401
20+
import pandas.util.testing as tm # noqa: F401
2121

2222

2323
numeric_dtypes = [

asv_bench/benchmarks/strftime.py

+69-18
Original file line numberDiff line numberDiff line change
@@ -7,58 +7,109 @@
77
class DatetimeStrftime:
88
timeout = 1500
99
params = [1000, 10000]
10-
param_names = ["obs"]
10+
param_names = ["nobs"]
1111

12-
def setup(self, obs):
12+
def setup(self, nobs):
1313
d = "2018-11-29"
1414
dt = "2018-11-26 11:18:27.0"
1515
self.data = pd.DataFrame(
1616
{
17-
"dt": [np.datetime64(dt)] * obs,
18-
"d": [np.datetime64(d)] * obs,
19-
"r": [np.random.uniform()] * obs,
17+
"dt": [np.datetime64(dt)] * nobs,
18+
"d": [np.datetime64(d)] * nobs,
19+
"r": [np.random.uniform()] * nobs,
2020
}
2121
)
2222

23-
def time_frame_date_to_str(self, obs):
23+
def time_frame_date_to_str(self, nobs):
2424
self.data["d"].astype(str)
2525

26-
def time_frame_date_formatting_default(self, obs):
26+
def time_frame_date_formatting_default(self, nobs):
27+
self.data["d"].dt.strftime(date_format=None)
28+
29+
def time_frame_date_formatting_default_explicit(self, nobs):
2730
self.data["d"].dt.strftime(date_format="%Y-%m-%d")
2831

29-
def time_frame_date_formatting_custom(self, obs):
32+
def time_frame_date_formatting_custom(self, nobs):
3033
self.data["d"].dt.strftime(date_format="%Y---%m---%d")
3134

32-
def time_frame_datetime_to_str(self, obs):
35+
def time_frame_datetime_to_str(self, nobs):
3336
self.data["dt"].astype(str)
3437

35-
def time_frame_datetime_formatting_default_date_only(self, obs):
38+
def time_frame_datetime_formatting_default(self, nobs):
39+
self.data["dt"].dt.strftime(date_format=None)
40+
41+
def time_frame_datetime_formatting_default_explicit_date_only(self, nobs):
3642
self.data["dt"].dt.strftime(date_format="%Y-%m-%d")
3743

38-
def time_frame_datetime_formatting_default(self, obs):
44+
def time_frame_datetime_formatting_default_explicit(self, nobs):
3945
self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S")
4046

41-
def time_frame_datetime_formatting_default_with_float(self, obs):
47+
def time_frame_datetime_formatting_default_with_float(self, nobs):
4248
self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f")
4349

44-
def time_frame_datetime_formatting_custom(self, obs):
50+
def time_frame_datetime_formatting_custom(self, nobs):
4551
self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S")
4652

4753

54+
class PeriodStrftime:
55+
timeout = 1500
56+
params = ([1000, 10000], ["D", "H"])
57+
param_names = ["nobs", "freq"]
58+
59+
def setup(self, nobs, freq):
60+
self.data = pd.DataFrame(
61+
{
62+
"p": pd.period_range(start="2000-01-01", periods=nobs, freq=freq),
63+
"r": [np.random.uniform()] * nobs,
64+
}
65+
)
66+
self.data["i"] = self.data["p"]
67+
self.data.set_index("i", inplace=True)
68+
if freq == "D":
69+
self.default_fmt = "%Y-%m-%d"
70+
elif freq == "H":
71+
self.default_fmt = "%Y-%m-%d %H:00"
72+
73+
def time_frame_period_to_str(self, nobs, freq):
74+
self.data["p"].astype(str)
75+
76+
def time_frame_period_formatting_default(self, nobs, freq):
77+
self.data["p"].dt.strftime(date_format=None)
78+
79+
def time_frame_period_formatting_default_explicit(self, nobs, freq):
80+
self.data["p"].dt.strftime(date_format=self.default_fmt)
81+
82+
def time_frame_period_formatting_index_default(self, nobs, freq):
83+
self.data.index.format()
84+
85+
def time_frame_period_formatting_index_default_explicit(self, nobs, freq):
86+
self.data.index.format(self.default_fmt)
87+
88+
def time_frame_period_formatting_custom(self, nobs, freq):
89+
self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S")
90+
91+
def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq):
92+
self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ")
93+
94+
def time_frame_period_formatting_iso8601_strftime_offset(self, nobs, freq):
95+
"""Not optimized yet as %z is not supported by `convert_strftime_format`"""
96+
self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z")
97+
98+
4899
class BusinessHourStrftime:
49100
timeout = 1500
50101
params = [1000, 10000]
51-
param_names = ["obs"]
102+
param_names = ["nobs"]
52103

53-
def setup(self, obs):
104+
def setup(self, nobs):
54105
self.data = pd.DataFrame(
55106
{
56-
"off": [offsets.BusinessHour()] * obs,
107+
"off": [offsets.BusinessHour()] * nobs,
57108
}
58109
)
59110

60-
def time_frame_offset_str(self, obs):
111+
def time_frame_offset_str(self, nobs):
61112
self.data["off"].apply(str)
62113

63-
def time_frame_offset_repr(self, obs):
114+
def time_frame_offset_repr(self, nobs):
64115
self.data["off"].apply(repr)

asv_bench/benchmarks/tslibs/period.py

+19
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ class PeriodUnaryMethods:
6060

6161
def setup(self, freq):
6262
self.per = Period("2012-06-01", freq=freq)
63+
if freq == "M":
64+
self.default_fmt = "%Y-%m"
65+
elif freq == "min":
66+
self.default_fmt = "%Y-%m-%d %H:%M"
6367

6468
def time_to_timestamp(self, freq):
6569
self.per.to_timestamp()
@@ -70,6 +74,21 @@ def time_now(self, freq):
7074
def time_asfreq(self, freq):
7175
self.per.asfreq("A")
7276

77+
def time_str(self, freq):
78+
str(self.per)
79+
80+
def time_repr(self, freq):
81+
repr(self.per)
82+
83+
def time_strftime_default(self, freq):
84+
self.per.strftime(None)
85+
86+
def time_strftime_default_explicit(self, freq):
87+
self.per.strftime(self.default_fmt)
88+
89+
def time_strftime_custom(self, freq):
90+
self.per.strftime("%b. %d, %Y was a %A")
91+
7392

7493
class PeriodConstructor:
7594
params = [["D"], [True, False]]

doc/make.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,12 @@ def _get_page_title(self, page):
163163
components=(docutils.parsers.rst.Parser,)
164164
)
165165
doc = docutils.utils.new_document("<doc>", option_parser.get_default_values())
166-
with open(fname) as f:
166+
with open(fname, encoding="utf-8") as f:
167167
data = f.read()
168168

169169
parser = docutils.parsers.rst.Parser()
170170
# do not generate any warning when parsing the rst
171-
with open(os.devnull, "a") as f:
171+
with open(os.devnull, "a", encoding="utf-8") as f:
172172
doc.reporter.stream = f
173173
parser.parse(data, doc)
174174

@@ -186,7 +186,7 @@ def _add_redirects(self):
186186
Create in the build directory an html file with a redirect,
187187
for every row in REDIRECTS_FILE.
188188
"""
189-
with open(REDIRECTS_FILE) as mapping_fd:
189+
with open(REDIRECTS_FILE, encoding="utf-8") as mapping_fd:
190190
reader = csv.reader(mapping_fd)
191191
for row in reader:
192192
if not row or row[0].strip().startswith("#"):
@@ -209,7 +209,7 @@ def _add_redirects(self):
209209
# sphinx specific stuff
210210
title = "this page"
211211

212-
with open(path, "w") as moved_page_fd:
212+
with open(path, "w", encoding="utf-8") as moved_page_fd:
213213
html = f"""\
214214
<html>
215215
<head>

doc/source/conf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -117,9 +117,9 @@
117117
elif single_doc and rel_fname != pattern:
118118
exclude_patterns.append(rel_fname)
119119

120-
with open(os.path.join(source_path, "index.rst.template")) as f:
120+
with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f:
121121
t = jinja2.Template(f.read())
122-
with open(os.path.join(source_path, "index.rst"), "w") as f:
122+
with open(os.path.join(source_path, "index.rst"), "w", encoding="utf-8") as f:
123123
f.write(
124124
t.render(
125125
include_api=include_api,

doc/source/user_guide/window.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ be calculated with :meth:`~Rolling.apply` by specifying a separate column of wei
9696
return arr
9797
9898
df = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])
99-
df.rolling(2, method="table", min_periods=0).apply(weighted_mean, raw=True, engine="numba") # noqa:E501
99+
df.rolling(2, method="table", min_periods=0).apply(weighted_mean, raw=True, engine="numba") # noqa: E501
100100
101101
.. versionadded:: 1.3
102102

doc/source/whatsnew/v2.0.2.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,24 @@ including other versions of pandas.
1313

1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
16+
- Fixed regression in :meth:`DataFrame.loc` losing :class:`MultiIndex` name when enlarging object (:issue:`53053`)
17+
- Fixed regression in :meth:`DataFrame.to_string` printing a backslash at the end of the first row of data, instead of headers, when the DataFrame doesn't fit the line width (:issue:`53054`)
18+
- Fixed regression in :meth:`MultiIndex.join` returning levels in wrong order (:issue:`53093`)
1619
-
1720

1821
.. ---------------------------------------------------------------------------
1922
.. _whatsnew_202.bug_fixes:
2023

2124
Bug fixes
2225
~~~~~~~~~
26+
- Bug in :func:`api.interchange.from_dataframe` was raising ``IndexError`` on empty categorical data (:issue:`53077`)
2327
- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame` objects of incorrect sizes when called on slices (:issue:`52824`)
2428
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
29+
- Bug in :func:`to_timedelta` was raising ``ValueError`` with ``pandas.NA`` (:issue:`52909`)
2530
- Bug in :meth:`DataFrame.convert_dtypes` ignoring ``convert_*`` keywords set to ``False`` when ``dtype_backend="pyarrow"`` (:issue:`52872`)
2631
- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
2732
- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
28-
-
33+
2934

3035
.. ---------------------------------------------------------------------------
3136
.. _whatsnew_202.other:

0 commit comments

Comments
 (0)