@@ -83,98 +83,15 @@ def describe_ndframe(
83
83
raise ValueError ("percentiles cannot contain duplicates" )
84
84
percentiles = unique_pcts
85
85
86
- formatted_percentiles = format_percentiles (percentiles )
87
-
88
- def describe_numeric_1d (series ) -> "Series" :
89
- from pandas import Series
90
-
91
- stat_index = ["count" , "mean" , "std" , "min" ] + formatted_percentiles + ["max" ]
92
- d = (
93
- [series .count (), series .mean (), series .std (), series .min ()]
94
- + series .quantile (percentiles ).tolist ()
95
- + [series .max ()]
96
- )
97
- return Series (d , index = stat_index , name = series .name )
98
-
99
- def describe_categorical_1d (data ) -> "Series" :
100
- names = ["count" , "unique" ]
101
- objcounts = data .value_counts ()
102
- count_unique = len (objcounts [objcounts != 0 ])
103
- result = [data .count (), count_unique ]
104
- dtype = None
105
- if result [1 ] > 0 :
106
- top , freq = objcounts .index [0 ], objcounts .iloc [0 ]
107
- if is_datetime64_any_dtype (data .dtype ):
108
- if obj .ndim == 1 :
109
- stacklevel = 5
110
- else :
111
- stacklevel = 6
112
- warnings .warn (
113
- "Treating datetime data as categorical rather than numeric in "
114
- "`.describe` is deprecated and will be removed in a future "
115
- "version of pandas. Specify `datetime_is_numeric=True` to "
116
- "silence this warning and adopt the future behavior now." ,
117
- FutureWarning ,
118
- stacklevel = stacklevel ,
119
- )
120
- tz = data .dt .tz
121
- asint = data .dropna ().values .view ("i8" )
122
- top = Timestamp (top )
123
- if top .tzinfo is not None and tz is not None :
124
- # Don't tz_localize(None) if key is already tz-aware
125
- top = top .tz_convert (tz )
126
- else :
127
- top = top .tz_localize (tz )
128
- names += ["top" , "freq" , "first" , "last" ]
129
- result += [
130
- top ,
131
- freq ,
132
- Timestamp (asint .min (), tz = tz ),
133
- Timestamp (asint .max (), tz = tz ),
134
- ]
135
- else :
136
- names += ["top" , "freq" ]
137
- result += [top , freq ]
138
-
139
- # If the DataFrame is empty, set 'top' and 'freq' to None
140
- # to maintain output shape consistency
141
- else :
142
- names += ["top" , "freq" ]
143
- result += [np .nan , np .nan ]
144
- dtype = "object"
145
-
146
- from pandas import Series
147
-
148
- return Series (result , index = names , name = data .name , dtype = dtype )
149
-
150
- def describe_timestamp_1d (data ) -> "Series" :
151
- # GH-30164
152
- from pandas import Series
153
-
154
- stat_index = ["count" , "mean" , "min" ] + formatted_percentiles + ["max" ]
155
- d = (
156
- [data .count (), data .mean (), data .min ()]
157
- + data .quantile (percentiles ).tolist ()
158
- + [data .max ()]
159
- )
160
- return Series (d , index = stat_index , name = data .name )
161
-
162
- def describe_1d (data ) -> "Series" :
163
- if is_bool_dtype (data .dtype ):
164
- return describe_categorical_1d (data )
165
- elif is_numeric_dtype (data ):
166
- return describe_numeric_1d (data )
167
- elif is_datetime64_any_dtype (data .dtype ) and datetime_is_numeric :
168
- return describe_timestamp_1d (data )
169
- elif is_timedelta64_dtype (data .dtype ):
170
- return describe_numeric_1d (data )
171
- else :
172
- return describe_categorical_1d (data )
173
-
174
86
if obj .ndim == 1 :
175
87
# Incompatible return value type
176
88
# (got "Series", expected "FrameOrSeries") [return-value]
177
- return describe_1d (obj ) # type:ignore[return-value]
89
+ return describe_1d (
90
+ obj ,
91
+ percentiles ,
92
+ datetime_is_numeric ,
93
+ is_series = True ,
94
+ ) # type:ignore[return-value]
178
95
elif (include is None ) and (exclude is None ):
179
96
# when some numerics are found, keep only numerics
180
97
default_include = [np .number ]
@@ -191,7 +108,10 @@ def describe_1d(data) -> "Series":
191
108
else :
192
109
data = obj .select_dtypes (include = include , exclude = exclude )
193
110
194
- ldesc = [describe_1d (s ) for _ , s in data .items ()]
111
+ ldesc = [
112
+ describe_1d (s , percentiles , datetime_is_numeric , is_series = False )
113
+ for _ , s in data .items ()
114
+ ]
195
115
# set a convenient order for rows
196
116
names : List [Hashable ] = []
197
117
ldesc_indexes = sorted ((x .index for x in ldesc ), key = len )
@@ -203,3 +123,143 @@ def describe_1d(data) -> "Series":
203
123
d = concat ([x .reindex (names , copy = False ) for x in ldesc ], axis = 1 , sort = False )
204
124
d .columns = data .columns .copy ()
205
125
return d
126
+
127
+
128
def describe_numeric_1d(series, percentiles) -> "Series":
    """Summarise a numeric series with count, mean, std, min, quantiles and max.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        Quantiles to report; passed both to ``format_percentiles`` (for the
        row labels) and to ``series.quantile`` (for the values).

    Returns
    -------
    Series
        Statistics indexed by ``count``, ``mean``, ``std``, ``min``, the
        formatted percentile labels and ``max``; it carries the name of
        ``series``.
    """
    from pandas import Series

    # Row labels: fixed statistics surrounding the formatted percentile labels.
    labels = ["count", "mean", "std", "min"]
    labels.extend(format_percentiles(percentiles))
    labels.append("max")

    # Values are collected in the same order as the labels above.
    values = [series.count(), series.mean(), series.std(), series.min()]
    values.extend(series.quantile(percentiles).tolist())
    values.append(series.max())

    return Series(values, index=labels, name=series.name)
149
+
150
+
151
def describe_categorical_1d(data, is_series) -> "Series":
    """Summarise a series as categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    is_series : bool
        True if the original object is a Series.
        False if one column of a DataFrame is being described.
        Only used to pick the ``stacklevel`` of the deprecation warning.

    Returns
    -------
    Series
        ``count``, ``unique``, ``top`` and ``freq``; for datetime64 data the
        rows ``first`` and ``last`` are appended as well.

    Warns
    -----
    FutureWarning
        When ``data`` has a datetime64 dtype and at least one value with a
        non-zero count.
    """
    from pandas import Series

    value_freqs = data.value_counts()
    n_unique = len(value_freqs[value_freqs != 0])
    labels = ["count", "unique", "top", "freq"]
    values = [data.count(), n_unique]

    if n_unique <= 0:
        # Nothing to report: fill 'top' and 'freq' with NaN so the output
        # keeps the same shape, and force object dtype.
        values.extend([np.nan, np.nan])
        return Series(values, index=labels, name=data.name, dtype="object")

    top = value_freqs.index[0]
    freq = value_freqs.iloc[0]

    if not is_datetime64_any_dtype(data.dtype):
        values.extend([top, freq])
        return Series(values, index=labels, name=data.name, dtype=None)

    # Datetime data handled as categorical is deprecated. The stack level
    # differs by one between the Series and the DataFrame entry points.
    warnings.warn(
        "Treating datetime data as categorical rather than numeric in "
        "`.describe` is deprecated and will be removed in a future "
        "version of pandas. Specify `datetime_is_numeric=True` to "
        "silence this warning and adopt the future behavior now.",
        FutureWarning,
        stacklevel=5 if is_series else 6,
    )
    tz = data.dt.tz
    as_i8 = data.dropna().values.view("i8")
    top = Timestamp(top)
    already_aware = top.tzinfo is not None and tz is not None
    # Don't tz_localize(None) if key is already tz-aware
    top = top.tz_convert(tz) if already_aware else top.tz_localize(tz)

    labels.extend(["first", "last"])
    values.extend(
        [
            top,
            freq,
            Timestamp(as_i8.min(), tz=tz),
            Timestamp(as_i8.max(), tz=tz),
        ]
    )
    return Series(values, index=labels, name=data.name, dtype=None)
211
+
212
+
213
def describe_timestamp_1d(data, percentiles) -> "Series":
    """Summarise a datetime64 series with count, mean, min, quantiles and max.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        Quantiles to report; passed both to ``format_percentiles`` (for the
        row labels) and to ``data.quantile`` (for the values).

    Returns
    -------
    Series
        Statistics indexed by ``count``, ``mean``, ``min``, the formatted
        percentile labels and ``max``; it carries the name of ``data``.
    """
    # GH-30164
    from pandas import Series

    # Unlike the numeric summary, no "std" row is produced here.
    labels = ["count", "mean", "min"]
    labels.extend(format_percentiles(percentiles))
    labels.append("max")

    values = [data.count(), data.mean(), data.min()]
    values.extend(data.quantile(percentiles).tolist())
    values.append(data.max())

    return Series(values, index=labels, name=data.name)
235
+
236
+
237
def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series":
    """Dispatch a series to the describer matching its dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output; forwarded to the numeric
        and timestamp describers.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.
    is_series : bool
        True if the original object is a Series.
        False if one column of a DataFrame is being described.
        Forwarded to the categorical describer.

    Returns
    -------
    Series
    """
    dtype = data.dtype

    # Booleans are tested before the numeric check and are summarised as
    # categorical data.
    if is_bool_dtype(dtype):
        return describe_categorical_1d(data, is_series)

    if is_numeric_dtype(data):
        return describe_numeric_1d(data, percentiles)

    # Datetimes get the timestamp summary only when explicitly requested;
    # otherwise they fall through to the categorical summary below.
    if is_datetime64_any_dtype(dtype) and datetime_is_numeric:
        return describe_timestamp_1d(data, percentiles)

    # Timedeltas share the numeric summary.
    if is_timedelta64_dtype(dtype):
        return describe_numeric_1d(data, percentiles)

    return describe_categorical_1d(data, is_series)
0 commit comments