From 4d7afe5b527acc66261497b0af5fcbfacbf00230 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Tue, 3 Oct 2017 23:41:48 +0100 Subject: [PATCH 1/2] DOC: Clarifying use of categorical data in describe docstring (#16722) --- pandas/core/generic.py | 98 +++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5dd770b2600a0..ae01f7a3e6acb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6352,20 +6352,22 @@ def describe(self, percentiles=None, include=None, exclude=None): - A list-like of dtypes : Limits the results to the provided data types. To limit the result to numeric types submit - ``numpy.number``. To limit it instead to categorical - objects submit the ``numpy.object`` data type. Strings + ``numpy.number``. To limit it instead to object columns submit + the ``numpy.object`` data type. Strings can also be used in the style of - ``select_dtypes`` (e.g. ``df.describe(include=['O'])``) + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + select pandas categorical columns, use ``'category'`` - None (default) : The result will include all numeric columns. exclude : list-like of dtypes or None (default), optional, A black list of data types to omit from the result. Ignored for ``Series``. Here are the options: - A list-like of dtypes : Excludes the provided data types - from the result. To select numeric types submit - ``numpy.number``. To select categorical objects submit the data + from the result. To exclude numeric types submit + ``numpy.number``. To exclude object columns submit the data type ``numpy.object``. Strings can also be used in the style of - ``select_dtypes`` (e.g. ``df.describe(include=['O'])``) + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. 
Returns @@ -6390,9 +6392,11 @@ def describe(self, percentiles=None, include=None, exclude=None): among those with the highest count. For mixed data types provided via a ``DataFrame``, the default is to - return only an analysis of numeric columns. If ``include='all'`` - is provided as an option, the result will include a union of - attributes of each type. + return only an analysis of numeric columns. If the dataframe consists + only of object and categorical data without any numeric columns, the + default is to return an analysis of both the object and categorical + columns. If ``include='all'`` is provided as an option, the result + will include a union of attributes of each type. The `include` and `exclude` parameters can be used to limit which columns in a ``DataFrame`` are analyzed for the output. @@ -6442,8 +6446,10 @@ def describe(self, percentiles=None, include=None, exclude=None): Describing a ``DataFrame``. By default only numeric fields are returned. - >>> df = pd.DataFrame([[1, 'a'], [2, 'b'], [3, 'c']], - ... columns=['numeric', 'object']) + >>> df = pd.DataFrame({ 'object': ['a', 'b', 'c'], + ... 'numeric': [1, 2, 3], + ... 'categorical': pd.Categorical(['d','e','f']) + ... }) >>> df.describe() numeric count 3.0 @@ -6457,19 +6463,19 @@ def describe(self, percentiles=None, include=None, exclude=None): Describing all columns of a ``DataFrame`` regardless of data type. - >>> df.describe(include='all') - numeric object - count 3.0 3 - unique NaN 3 - top NaN b - freq NaN 1 - mean 2.0 NaN - std 1.0 NaN - min 1.0 NaN - 25% 1.5 NaN - 50% 2.0 NaN - 75% 2.5 NaN - max 3.0 NaN + >>> df.describe(include='all') + categorical numeric object + count 3 3.0 3 + unique 3 NaN 3 + top f NaN c + freq 1 NaN 1 + mean NaN 2.0 NaN + std NaN 1.0 NaN + min NaN 1.0 NaN + 25% NaN 1.5 NaN + 50% NaN 2.0 NaN + 75% NaN 2.5 NaN + max NaN 3.0 NaN Describing a column from a ``DataFrame`` by accessing it as an attribute. 
@@ -6504,31 +6510,43 @@ def describe(self, percentiles=None, include=None, exclude=None): object count 3 unique 3 - top b + top c freq 1 + Including only categorical columns from a ``DataFrame`` description. + + >>> df.describe(include=['category']) + categorical + count 3 + unique 3 + top f + freq 1 + Excluding numeric columns from a ``DataFrame`` description. >>> df.describe(exclude=[np.number]) - object - count 3 - unique 3 - top b - freq 1 + categorical object + count 3 3 + unique 3 3 + top f c + freq 1 1 Excluding object columns from a ``DataFrame`` description. >>> df.describe(exclude=[np.object]) - numeric - count 3.0 - mean 2.0 - std 1.0 - min 1.0 - 25% 1.5 - 50% 2.0 - 75% 2.5 - max 3.0 - + categorical numeric + count 3 3.0 + unique 3 NaN + top f NaN + freq 1 NaN + mean NaN 2.0 + std NaN 1.0 + min NaN 1.0 + 25% NaN 1.5 + 50% NaN 2.0 + 75% NaN 2.5 + max NaN 3.0 + See Also -------- DataFrame.count From b300fb57bd4bec794d7a5086de14c01e90b16105 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 4 Oct 2017 21:50:22 +0100 Subject: [PATCH 2/2] Fix pep8 issues --- pandas/core/generic.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ae01f7a3e6acb..ed2a592a64efe 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6352,10 +6352,10 @@ def describe(self, percentiles=None, include=None, exclude=None): - A list-like of dtypes : Limits the results to the provided data types. To limit the result to numeric types submit - ``numpy.number``. To limit it instead to object columns submit + ``numpy.number``. To limit it instead to object columns submit the ``numpy.object`` data type. Strings can also be used in the style of - ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To select pandas categorical columns, use ``'category'`` - None (default) : The result will include all numeric columns. 
exclude : list-like of dtypes or None (default), optional, @@ -6366,7 +6366,7 @@ def describe(self, percentiles=None, include=None, exclude=None): from the result. To exclude numeric types submit ``numpy.number``. To exclude object columns submit the data type ``numpy.object``. Strings can also be used in the style of - ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To + ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. @@ -6393,9 +6393,9 @@ def describe(self, percentiles=None, include=None, exclude=None): For mixed data types provided via a ``DataFrame``, the default is to return only an analysis of numeric columns. If the dataframe consists - only of object and categorical data without any numeric columns, the - default is to return an analysis of both the object and categorical - columns. If ``include='all'`` is provided as an option, the result + only of object and categorical data without any numeric columns, the + default is to return an analysis of both the object and categorical + columns. If ``include='all'`` is provided as an option, the result will include a union of attributes of each type. The `include` and `exclude` parameters can be used to limit @@ -6448,7 +6448,7 @@ def describe(self, percentiles=None, include=None, exclude=None): >>> df = pd.DataFrame({ 'object': ['a', 'b', 'c'], ... 'numeric': [1, 2, 3], - ... 'categorical': pd.Categorical(['d','e','f']) + ... 'categorical': pd.Categorical(['d','e','f']) ... }) >>> df.describe() numeric @@ -6463,7 +6463,7 @@ def describe(self, percentiles=None, include=None, exclude=None): Describing all columns of a ``DataFrame`` regardless of data type. 
- >>> df.describe(include='all') + >>> df.describe(include='all') categorical numeric object count 3 3.0 3 unique 3 NaN 3 @@ -6514,7 +6514,7 @@ def describe(self, percentiles=None, include=None, exclude=None): freq 1 Including only categorical columns from a ``DataFrame`` description. - + >>> df.describe(include=['category']) categorical count 3 @@ -6546,7 +6546,7 @@ def describe(self, percentiles=None, include=None, exclude=None): 50% NaN 2.0 75% NaN 2.5 max NaN 3.0 - + See Also -------- DataFrame.count