Skip to content

Commit e172d74

Browse files
smarie (Sylvain MARIE) and MarcoGorelli
authored and
Yi Wei
committed
[READY] Improved performance of Period's default formatter (period_format) (pandas-dev#51459)
* Improved performance of default period formatting (`period_format`). Added corresponding ASVs
* Improved ASV for period frames and datetimes
* What's new
* Update asv_bench/benchmarks/strftime.py
* Fixed whats new backticks
* Completed whatsnew
* Added ASVs for to_csv for period
* Aligned the namings
* Completed Whats new
* Added a docstring explaining why the ASV bench with custom date format was improved: the date_format parameter is not taken into account today.
* Moved whatsnew to 2.0.0
* Moved whatsnew to 2.1
* Improved docstring as per code review
* Renamed asv params as per code review
* Fixed ASV comment as per code review
* ASV: renamed parameters as per code review
* Improved `period_format`: now the performance is the same when no format is provided and if an explicit `fmt` is provided matching the actual default format for this `freq`
* Code review: Improved strftime ASV: set_index is now in the setup
* Removed useless main
* Removed wrong code
* Improved ASVs for period formatting: now there is a "default explicit" everywhere
* Update pandas/_libs/tslibs/period.pyx
* Update pandas/_libs/tslibs/period.pyx
* Update pandas/_libs/tslibs/period.pyx
* Minor refactoring to avoid retesting for none several time
* Fixed issue: bool does not exist, using bint
* Added missing quarter variable as cdef
* Fixed asv bug
* Code review: fixed docstring
* Update doc/source/whatsnew/v2.1.0.rst
* fixup
---------
Co-authored-by: Sylvain MARIE <[email protected]>
Co-authored-by: Marco Edward Gorelli <[email protected]>
Co-authored-by: MarcoGorelli <>
1 parent a249540 commit e172d74

File tree

5 files changed

+218
-63
lines changed

5 files changed

+218
-63
lines changed

asv_bench/benchmarks/io/csv.py

+59-6
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
DataFrame,
1313
concat,
1414
date_range,
15+
period_range,
1516
read_csv,
1617
to_datetime,
1718
)
@@ -98,24 +99,76 @@ def time_frame_date_no_format_index(self):
9899
self.data.to_csv(self.fname)
99100

100101

102+
class ToCSVPeriod(BaseIO):
103+
fname = "__test__.csv"
104+
105+
params = ([1000, 10000], ["D", "H"])
106+
param_names = ["nobs", "freq"]
107+
108+
def setup(self, nobs, freq):
109+
rng = period_range(start="2000-01-01", periods=nobs, freq=freq)
110+
self.data = DataFrame(rng)
111+
if freq == "D":
112+
self.default_fmt = "%Y-%m-%d"
113+
elif freq == "H":
114+
self.default_fmt = "%Y-%m-%d %H:00"
115+
116+
def time_frame_period_formatting_default(self, nobs, freq):
117+
self.data.to_csv(self.fname)
118+
119+
def time_frame_period_formatting_default_explicit(self, nobs, freq):
120+
self.data.to_csv(self.fname, date_format=self.default_fmt)
121+
122+
def time_frame_period_formatting(self, nobs, freq):
123+
# Nb: `date_format` is not actually taken into account here today, so the
124+
# performance is currently identical to `time_frame_period_formatting_default`
125+
# above. This timer is therefore expected to degrade when GH#51621 is fixed.
126+
# (Remove this comment when GH#51621 is fixed.)
127+
self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S")
128+
129+
130+
class ToCSVPeriodIndex(BaseIO):
131+
fname = "__test__.csv"
132+
133+
params = ([1000, 10000], ["D", "H"])
134+
param_names = ["nobs", "freq"]
135+
136+
def setup(self, nobs, freq):
137+
rng = period_range(start="2000-01-01", periods=nobs, freq=freq)
138+
self.data = DataFrame({"a": 1}, index=rng)
139+
if freq == "D":
140+
self.default_fmt = "%Y-%m-%d"
141+
elif freq == "H":
142+
self.default_fmt = "%Y-%m-%d %H:00"
143+
144+
def time_frame_period_formatting_index(self, nobs, freq):
145+
self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S")
146+
147+
def time_frame_period_formatting_index_default(self, nobs, freq):
148+
self.data.to_csv(self.fname)
149+
150+
def time_frame_period_formatting_index_default_explicit(self, nobs, freq):
151+
self.data.to_csv(self.fname, date_format=self.default_fmt)
152+
153+
101154
class ToCSVDatetimeBig(BaseIO):
102155
fname = "__test__.csv"
103156
timeout = 1500
104157
params = [1000, 10000, 100000]
105-
param_names = ["obs"]
158+
param_names = ["nobs"]
106159

107-
def setup(self, obs):
160+
def setup(self, nobs):
108161
d = "2018-11-29"
109162
dt = "2018-11-26 11:18:27.0"
110163
self.data = DataFrame(
111164
{
112-
"dt": [np.datetime64(dt)] * obs,
113-
"d": [np.datetime64(d)] * obs,
114-
"r": [np.random.uniform()] * obs,
165+
"dt": [np.datetime64(dt)] * nobs,
166+
"d": [np.datetime64(d)] * nobs,
167+
"r": [np.random.uniform()] * nobs,
115168
}
116169
)
117170

118-
def time_frame(self, obs):
171+
def time_frame(self, nobs):
119172
self.data.to_csv(self.fname)
120173

121174

asv_bench/benchmarks/strftime.py

+69-18
Original file line numberDiff line numberDiff line change
@@ -7,58 +7,109 @@
77
class DatetimeStrftime:
88
timeout = 1500
99
params = [1000, 10000]
10-
param_names = ["obs"]
10+
param_names = ["nobs"]
1111

12-
def setup(self, obs):
12+
def setup(self, nobs):
1313
d = "2018-11-29"
1414
dt = "2018-11-26 11:18:27.0"
1515
self.data = pd.DataFrame(
1616
{
17-
"dt": [np.datetime64(dt)] * obs,
18-
"d": [np.datetime64(d)] * obs,
19-
"r": [np.random.uniform()] * obs,
17+
"dt": [np.datetime64(dt)] * nobs,
18+
"d": [np.datetime64(d)] * nobs,
19+
"r": [np.random.uniform()] * nobs,
2020
}
2121
)
2222

23-
def time_frame_date_to_str(self, obs):
23+
def time_frame_date_to_str(self, nobs):
2424
self.data["d"].astype(str)
2525

26-
def time_frame_date_formatting_default(self, obs):
26+
def time_frame_date_formatting_default(self, nobs):
27+
self.data["d"].dt.strftime(date_format=None)
28+
29+
def time_frame_date_formatting_default_explicit(self, nobs):
2730
self.data["d"].dt.strftime(date_format="%Y-%m-%d")
2831

29-
def time_frame_date_formatting_custom(self, obs):
32+
def time_frame_date_formatting_custom(self, nobs):
3033
self.data["d"].dt.strftime(date_format="%Y---%m---%d")
3134

32-
def time_frame_datetime_to_str(self, obs):
35+
def time_frame_datetime_to_str(self, nobs):
3336
self.data["dt"].astype(str)
3437

35-
def time_frame_datetime_formatting_default_date_only(self, obs):
38+
def time_frame_datetime_formatting_default(self, nobs):
39+
self.data["dt"].dt.strftime(date_format=None)
40+
41+
def time_frame_datetime_formatting_default_explicit_date_only(self, nobs):
3642
self.data["dt"].dt.strftime(date_format="%Y-%m-%d")
3743

38-
def time_frame_datetime_formatting_default(self, obs):
44+
def time_frame_datetime_formatting_default_explicit(self, nobs):
3945
self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S")
4046

41-
def time_frame_datetime_formatting_default_with_float(self, obs):
47+
def time_frame_datetime_formatting_default_with_float(self, nobs):
4248
self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f")
4349

44-
def time_frame_datetime_formatting_custom(self, obs):
50+
def time_frame_datetime_formatting_custom(self, nobs):
4551
self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S")
4652

4753

54+
class PeriodStrftime:
55+
timeout = 1500
56+
params = ([1000, 10000], ["D", "H"])
57+
param_names = ["nobs", "freq"]
58+
59+
def setup(self, nobs, freq):
60+
self.data = pd.DataFrame(
61+
{
62+
"p": pd.period_range(start="2000-01-01", periods=nobs, freq=freq),
63+
"r": [np.random.uniform()] * nobs,
64+
}
65+
)
66+
self.data["i"] = self.data["p"]
67+
self.data.set_index("i", inplace=True)
68+
if freq == "D":
69+
self.default_fmt = "%Y-%m-%d"
70+
elif freq == "H":
71+
self.default_fmt = "%Y-%m-%d %H:00"
72+
73+
def time_frame_period_to_str(self, nobs, freq):
74+
self.data["p"].astype(str)
75+
76+
def time_frame_period_formatting_default(self, nobs, freq):
77+
self.data["p"].dt.strftime(date_format=None)
78+
79+
def time_frame_period_formatting_default_explicit(self, nobs, freq):
80+
self.data["p"].dt.strftime(date_format=self.default_fmt)
81+
82+
def time_frame_period_formatting_index_default(self, nobs, freq):
83+
self.data.index.format()
84+
85+
def time_frame_period_formatting_index_default_explicit(self, nobs, freq):
86+
self.data.index.format(self.default_fmt)
87+
88+
def time_frame_period_formatting_custom(self, nobs, freq):
89+
self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S")
90+
91+
def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq):
92+
self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ")
93+
94+
def time_frame_period_formatting_iso8601_strftime_offset(self, nobs, freq):
95+
"""Not optimized yet as %z is not supported by `convert_strftime_format`"""
96+
self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z")
97+
98+
4899
class BusinessHourStrftime:
49100
timeout = 1500
50101
params = [1000, 10000]
51-
param_names = ["obs"]
102+
param_names = ["nobs"]
52103

53-
def setup(self, obs):
104+
def setup(self, nobs):
54105
self.data = pd.DataFrame(
55106
{
56-
"off": [offsets.BusinessHour()] * obs,
107+
"off": [offsets.BusinessHour()] * nobs,
57108
}
58109
)
59110

60-
def time_frame_offset_str(self, obs):
111+
def time_frame_offset_str(self, nobs):
61112
self.data["off"].apply(str)
62113

63-
def time_frame_offset_repr(self, obs):
114+
def time_frame_offset_repr(self, nobs):
64115
self.data["off"].apply(repr)

asv_bench/benchmarks/tslibs/period.py

+19
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ class PeriodUnaryMethods:
6060

6161
def setup(self, freq):
6262
self.per = Period("2012-06-01", freq=freq)
63+
if freq == "M":
64+
self.default_fmt = "%Y-%m"
65+
elif freq == "min":
66+
self.default_fmt = "%Y-%m-%d %H:%M"
6367

6468
def time_to_timestamp(self, freq):
6569
self.per.to_timestamp()
@@ -70,6 +74,21 @@ def time_now(self, freq):
7074
def time_asfreq(self, freq):
7175
self.per.asfreq("A")
7276

77+
def time_str(self, freq):
78+
str(self.per)
79+
80+
def time_repr(self, freq):
81+
repr(self.per)
82+
83+
def time_strftime_default(self, freq):
84+
self.per.strftime(None)
85+
86+
def time_strftime_default_explicit(self, freq):
87+
self.per.strftime(self.default_fmt)
88+
89+
def time_strftime_custom(self, freq):
90+
self.per.strftime("%b. %d, %Y was a %A")
91+
7392

7493
class PeriodConstructor:
7594
params = [["D"], [True, False]]

doc/source/whatsnew/v2.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ Performance improvements
283283
- Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
284284
- Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
285285
- Performance improvement in :func:`concat` (:issue:`52291`, :issue:`52290`)
286+
- :class:`Period`'s default formatter (``period_format``) is now significantly (~twice) faster. This improves performance of ``str(Period)``, ``repr(Period)``, and :meth:`Period.strftime` with ``fmt=None``, as well as ``PeriodArray.strftime(fmt=None)``, ``PeriodIndex.strftime(fmt=None)`` and ``PeriodIndex.format(fmt=None)``. Finally, ``to_csv`` operations involving :class:`PeriodArray` or :class:`PeriodIndex` with default ``date_format`` are also significantly accelerated. (:issue:`51459`)
286287
- Performance improvement accessing :attr:`arrays.IntegerArrays.dtype` & :attr:`arrays.FloatingArray.dtype` (:issue:`52998`)
287288
- Performance improvement in :class:`Series` reductions (:issue:`52341`)
288289
- Performance improvement in :func:`concat` when ``axis=1`` and objects have different indexes (:issue:`52541`)
@@ -291,7 +292,6 @@ Performance improvements
291292
- Performance improvement in :meth:`Series.corr` and :meth:`Series.cov` for extension dtypes (:issue:`52502`)
292293
- Performance improvement in :meth:`Series.to_numpy` when dtype is a numpy float dtype and ``na_value`` is ``np.nan`` (:issue:`52430`)
293294
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`)
294-
-
295295

296296
.. ---------------------------------------------------------------------------
297297
.. _whatsnew_210.bug_fixes:

0 commit comments

Comments
 (0)