@@ -5201,60 +5201,222 @@ def abs(self):
5201
5201
"""
5202
5202
return np .abs (self )
5203
5203
5204
- _shared_docs ['describe' ] = """
5205
- Generate various summary statistics, excluding NaN values.
5204
+ def describe (self , percentiles = None , include = None , exclude = None ):
5205
+ """
5206
+ Generates descriptive statistics that summarize the central tendency,
5207
+ dispersion and shape of a dataset's distribution, excluding
5208
+ ``NaN`` values.
5209
+
5210
+ Analyzes both numeric and object series, as well
5211
+ as ``DataFrame`` column sets of mixed data types. The output
5212
+ will vary depending on what is provided. Refer to the notes
5213
+ below for more detail.
5206
5214
5207
5215
Parameters
5208
5216
----------
5209
- percentiles : array-like, optional
5210
- The percentiles to include in the output. Should all
5211
- be in the interval [0, 1]. By default `percentiles` is
5212
- [.25, .5, .75], returning the 25th, 50th, and 75th percentiles.
5213
- include, exclude : list-like, 'all', or None (default)
5214
- Specify the form of the returned result. Either:
5215
-
5216
- - None to both (default). The result will include only
5217
- numeric-typed columns or, if none are, only categorical columns.
5218
- - A list of dtypes or strings to be included/excluded.
5219
- To select all numeric types use numpy numpy.number. To select
5220
- categorical objects use type object. See also the select_dtypes
5221
- documentation. eg. df.describe(include=['O'])
5222
- - If include is the string 'all', the output column-set will
5223
- match the input one.
5217
+ percentiles : list-like of numbers, optional
5218
+ The percentiles to include in the output. All should
5219
+ fall between 0 and 1. The default is
5220
+ ``[.25, .5, .75]``, which returns the 25th, 50th, and
5221
+ 75th percentiles.
5222
+ include : 'all', list-like of dtypes or None (default), optional
5223
+ A white list of data types to include in the result. Ignored
5224
+ for ``Series``. Here are the options:
5225
+
5226
+ - 'all' : All columns of the input will be included in the output.
5227
+ - A list-like of dtypes : Limits the results to the
5228
+ provided data types.
5229
+ To limit the result to numeric types submit
5230
+ ``numpy.number``. To limit it instead to categorical
5231
+ objects submit the ``numpy.object`` data type. Strings
5232
+ can also be used in the style of
5233
+ ``select_dtypes`` (e.g. ``df.describe(include=['O'])``)
5234
+ - None (default) : The result will include all numeric columns.
5235
+ exclude : list-like of dtypes or None (default), optional
5236
+ A black list of data types to omit from the result. Ignored
5237
+ for ``Series``. Here are the options:
5238
+
5239
+ - A list-like of dtypes : Excludes the provided data types
5240
+ from the result. To exclude numeric types submit
5240
+ ``numpy.number``. To exclude categorical objects submit the data
5241
+ type ``numpy.object``. Strings can also be used in the style of
5242
+ ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``)
5244
+ - None (default) : The result will exclude nothing.
5224
5245
5225
5246
Returns
5226
5247
-------
5227
- summary: %(klass)s of summary statistics
5248
+ summary : Series/DataFrame of summary statistics
5228
5249
5229
5250
Notes
5230
5251
-----
5231
- The output DataFrame index depends on the requested dtypes:
5232
-
5233
- For numeric dtypes, it will include: count, mean, std, min,
5234
- max, and lower, 50, and upper percentiles.
5252
+ For numeric data, the result's index will include ``count``,
5253
+ ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
5254
+ upper percentiles. By default the lower percentile is ``25`` and the
5255
+ upper percentile is ``75``. The ``50`` percentile is the
5256
+ same as the median.
5257
+
5258
+ For object data (e.g. strings or timestamps), the result's index
5259
+ will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
5260
+ is the most common value. The ``freq`` is the most common value's
5261
+ frequency. Timestamps also include the ``first`` and ``last`` items.
5262
+
5263
+ If multiple object values have the highest count, then the
5263
+ ``top`` result will be arbitrarily chosen from among those
5264
+ values; ``freq`` is the same for all of them.
5235
5266
5236
- For object dtypes (e.g. timestamps or strings), the index
5237
- will include the count, unique, most common, and frequency of the
5238
- most common. Timestamps also include the first and last items.
5267
+ For mixed data types provided via a ``DataFrame``, the default is to
5268
+ return only an analysis of numeric columns. If ``include='all'``
5269
+ is provided as an option, the result will include a union of
5270
+ attributes of each type.
5239
5271
5240
- For mixed dtypes, the index will be the union of the corresponding
5241
- output types. Non-applicable entries will be filled with NaN.
5242
- Note that mixed-dtype outputs can only be returned from mixed-dtype
5243
- inputs and appropriate use of the include/exclude arguments.
5272
+ The `include` and `exclude` parameters can be used to limit
5273
+ which columns in a ``DataFrame`` are analyzed for the output.
5274
+ The parameters are ignored when analyzing a ``Series``.
5244
5275
5245
- If multiple values have the highest count, then the
5246
- `count` and `most common` pair will be arbitrarily chosen from
5247
- among those with the highest count .
5276
+ Examples
5277
+ --------
5278
+ Describing a numeric ``Series``.
5248
5279
5249
- The include, exclude arguments are ignored for Series.
5280
+ >>> import pandas as pd
5281
+ >>> s = pd.Series([1, 2, 3])
5282
+ >>> s.describe()
5283
+ count 3.0
5284
+ mean 2.0
5285
+ std 1.0
5286
+ min 1.0
5287
+ 25% 1.5
5288
+ 50% 2.0
5289
+ 75% 2.5
5290
+ max 3.0
5291
+
5292
+ Describing a categorical ``Series``.
5293
+
5294
+ >>> s = pd.Series(['a', 'a', 'b', 'c'])
5295
+ >>> s.describe()
5296
+ count 4
5297
+ unique 3
5298
+ top a
5299
+ freq 2
5300
+ dtype: object
5301
+
5302
+ Describing a timestamp ``Series``.
5303
+
5304
+ >>> import numpy as np
5305
+ >>> s = pd.Series([
5306
+ ... np.datetime64("2000-01-01"),
5307
+ ... np.datetime64("2010-01-01"),
5308
+ ... np.datetime64("2010-01-01")
5309
+ ... ])
5310
+ >>> s.describe()
5311
+ count 3
5312
+ unique 2
5313
+ top 2010-01-01 00:00:00
5314
+ freq 2
5315
+ first 2000-01-01 00:00:00
5316
+ last 2010-01-01 00:00:00
5317
+ dtype: object
5318
+
5319
+ Describing a ``DataFrame``. By default only numeric fields
5320
+ are returned.
5321
+
5322
+ >>> df = pd.DataFrame(
5323
+ ... [[1, 'a'], [2, 'b'], [3, 'c']],
5324
+ ... columns=['numeric', 'object']
5325
+ ... )
5326
+ >>> df.describe()
5327
+ numeric
5328
+ count 3.0
5329
+ mean 2.0
5330
+ std 1.0
5331
+ min 1.0
5332
+ 25% 1.5
5333
+ 50% 2.0
5334
+ 75% 2.5
5335
+ max 3.0
5336
+
5337
+ Describing all columns of a ``DataFrame`` regardless of data type.
5338
+
5339
+ >>> df.describe(include='all')
5340
+ numeric object
5341
+ count 3.0 3
5342
+ unique NaN 3
5343
+ top NaN b
5344
+ freq NaN 1
5345
+ mean 2.0 NaN
5346
+ std 1.0 NaN
5347
+ min 1.0 NaN
5348
+ 25% 1.5 NaN
5349
+ 50% 2.0 NaN
5350
+ 75% 2.5 NaN
5351
+ max 3.0 NaN
5352
+
5353
+ Describing a column from a ``DataFrame`` by accessing it as
5354
+ an attribute.
5355
+
5356
+ >>> df.numeric.describe()
5357
+ count 3.0
5358
+ mean 2.0
5359
+ std 1.0
5360
+ min 1.0
5361
+ 25% 1.5
5362
+ 50% 2.0
5363
+ 75% 2.5
5364
+ max 3.0
5365
+ Name: numeric, dtype: float64
5366
+
5367
+ Including only numeric columns in a ``DataFrame`` description.
5368
+
5369
+ >>> df.describe(include=[np.number])
5370
+ numeric
5371
+ count 3.0
5372
+ mean 2.0
5373
+ std 1.0
5374
+ min 1.0
5375
+ 25% 1.5
5376
+ 50% 2.0
5377
+ 75% 2.5
5378
+ max 3.0
5379
+
5380
+ Including only string columns in a ``DataFrame`` description.
5381
+
5382
+ >>> df.describe(include=[np.object])
5383
+ object
5384
+ count 3
5385
+ unique 3
5386
+ top b
5387
+ freq 1
5388
+
5389
+ Excluding numeric columns from a ``DataFrame`` description.
5390
+
5391
+ >>> df.describe(exclude=[np.number])
5392
+ object
5393
+ count 3
5394
+ unique 3
5395
+ top b
5396
+ freq 1
5397
+
5398
+ Excluding object columns from a ``DataFrame`` description.
5399
+
5400
+ >>> df.describe(exclude=[np.object])
5401
+ numeric
5402
+ count 3.0
5403
+ mean 2.0
5404
+ std 1.0
5405
+ min 1.0
5406
+ 25% 1.5
5407
+ 50% 2.0
5408
+ 75% 2.5
5409
+ max 3.0
5250
5410
5251
5411
See Also
5252
5412
--------
5413
+ DataFrame.count
5414
+ DataFrame.max
5415
+ DataFrame.min
5416
+ DataFrame.mean
5417
+ DataFrame.std
5253
5418
DataFrame.select_dtypes
5254
5419
"""
5255
-
5256
- @Appender (_shared_docs ['describe' ] % _shared_doc_kwargs )
5257
- def describe (self , percentiles = None , include = None , exclude = None ):
5258
5420
if self .ndim >= 3 :
5259
5421
msg = "describe is not implemented on Panel or PanelND objects."
5260
5422
raise NotImplementedError (msg )
0 commit comments