From cb0cd651e63956132a4dd89bbc56fbb0beae3609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Adri=C3=A1n=20Ca=C3=B1ones=20Castellano?= Date: Sat, 10 Mar 2018 17:35:34 +0100 Subject: [PATCH 1/9] DOC: Improved the docstring of pandas.DataFrame.info --- pandas/core/frame.py | 54 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a66d00fff9714..d7c29898ccfed 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1815,6 +1815,9 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, """ Concise summary of a DataFrame. + This method shows information about DataFrame type of index, columns + dtypes and non-null values and memory usage. + Parameters ---------- verbose : {None, True, False}, optional @@ -1822,6 +1825,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, None follows the `display.max_info_columns` setting. True or False overrides the `display.max_info_columns` setting. buf : writable buffer, defaults to sys.stdout + Whether to pipe the output. max_cols : int, default None Determines whether full summary or short summary is printed. None follows the `display.max_info_columns` setting. @@ -1840,6 +1844,56 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, - If True, always show counts. - If False, never show counts. + Returns + ------- + None: NoneType + This method outputs a summary of a DataFrame and returns None. 
+ + Examples + -------- + >>> df = pd.DataFrame({"int_col": [1, 2, 3, 4, 5], "text_col": ['alpha', 'beta', 'gamma', 'delta', 'epsilon'], "float_col": [0.0, 0.25, 0.5, 0.75, 1.0]}) + >>> df + int_col text_col float_col + 0 1 alpha 0.00 + 1 2 beta 0.25 + 2 3 gamma 0.50 + 3 4 delta 0.75 + 4 5 epsilon 1.00 + + >>> df.info(verbose=True) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + int_col 5 non-null int64 + text_col 5 non-null object + float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 200.0+ bytes + + >>> df.info(verbose=False) + + RangeIndex: 5 entries, 0 to 4 + Columns: 3 entries, int_col to float_col + dtypes: float64(1), int64(1), object(1) + memory usage: 200.0+ bytes + + >>> file = open("df_info.txt", "w", encoding="utf-8") + >>> df.info(buf=file) + + >>> df.drop('text_col', axis=1, inplace=True) + >>> df.info(memory_usage='Deep') + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 2 columns): + int_col 5 non-null int64 + float_col 5 non-null float64 + dtypes: float64(1), int64(1) + memory usage: 160.0 bytes + + See Also + -------- + + describe: Generate descriptive statistics of DataFrame columns. 
""" from pandas.io.formats.format import _put_lines From 06fff9414349673f47bd8332d54fbe712663d30b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Adri=C3=A1n=20Ca=C3=B1ones=20Castellano?= Date: Sat, 10 Mar 2018 17:42:51 +0100 Subject: [PATCH 2/9] DOC: update the docstring of pandas.DataFrame.info PEP-8 fixing --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d7c29898ccfed..f5639a36b5359 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1851,7 +1851,10 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, Examples -------- - >>> df = pd.DataFrame({"int_col": [1, 2, 3, 4, 5], "text_col": ['alpha', 'beta', 'gamma', 'delta', 'epsilon'], "float_col": [0.0, 0.25, 0.5, 0.75, 1.0]}) + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, "float_col": float_values}) >>> df int_col text_col float_col 0 1 alpha 0.00 From c666c7a6b4a36cb1261000d02342a1887035f724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Adri=C3=A1n=20Ca=C3=B1ones=20Castellano?= Date: Sat, 10 Mar 2018 18:26:20 +0100 Subject: [PATCH 3/9] DOC: Improved the docstring of pandas.DataFrame.info Small improvements --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f5639a36b5359..42d160c6d5951 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1816,7 +1816,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, Concise summary of a DataFrame. This method shows information about DataFrame type of index, columns - dtypes and non-null values and memory usage. + dtypes, non-null values and memory usage. 
Parameters ---------- @@ -1882,6 +1882,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, >>> file = open("df_info.txt", "w", encoding="utf-8") >>> df.info(buf=file) + >>> file.close() >>> df.drop('text_col', axis=1, inplace=True) >>> df.info(memory_usage='Deep') From 39bf30a2879675172456f415be859fd7e391c37a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Adri=C3=A1n=20Ca=C3=B1ones=20Castellano?= Date: Sat, 10 Mar 2018 21:07:03 +0100 Subject: [PATCH 4/9] DOC: update the docstring of pandas.DataFrame.info Reviewers feedback --- pandas/core/frame.py | 61 +++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 42d160c6d5951..eb3a6ea6a3e4d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1815,7 +1815,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, """ Concise summary of a DataFrame. - This method shows information about DataFrame type of index, columns + This method prints information about DataFrame: index dtype, columns dtypes, non-null values and memory usage. Parameters @@ -1835,7 +1835,11 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, the `display.memory_usage` setting. True or False overrides the `display.memory_usage` setting. A value of 'deep' is equivalent of True, with deep introspection. Memory usage is shown in - human-readable units (base-2 representation). + human-readable units (base-2 representation). Without deep instrospection + a memory estimation is made based in column dtype and number of rows + assuming values consume the same memory amount for corresponding dtypes. + With deep memory introspection, a real memory usage calculation is performed + at the cost of computational resources. 
null_counts : boolean, default None Whether to show the non-null counts @@ -1847,7 +1851,12 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, Returns ------- None: NoneType - This method outputs a summary of a DataFrame and returns None. + This method prints a summary of a DataFrame and returns None. + + See Also + -------- + + describe: Generate descriptive statistics of DataFrame columns. Examples -------- @@ -1880,24 +1889,36 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, dtypes: float64(1), int64(1), object(1) memory usage: 200.0+ bytes - >>> file = open("df_info.txt", "w", encoding="utf-8") - >>> df.info(buf=file) - >>> file.close() - - >>> df.drop('text_col', axis=1, inplace=True) - >>> df.info(memory_usage='Deep') + >>> import io + >>> buffer = io.StringIO() + >>> df.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", encoding="utf-8") as f: + ... f.write(s) + 260 + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> df = pd.DataFrame({'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)}) + >>> df.info() - RangeIndex: 5 entries, 0 to 4 - Data columns (total 2 columns): - int_col 5 non-null int64 - float_col 5 non-null float64 - dtypes: float64(1), int64(1) - memory usage: 160.0 bytes - - See Also - -------- - - describe: Generate descriptive statistics of DataFrame columns. 
+ RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + column_1 1000000 non-null object + column_2 1000000 non-null object + column_3 1000000 non-null object + dtypes: object(3) + memory usage: 22.9+ MB + >>> df.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + column_1 1000000 non-null object + column_2 1000000 non-null object + column_3 1000000 non-null object + dtypes: object(3) + memory usage: 188.8 MB """ from pandas.io.formats.format import _put_lines From 3da09a91effe4ef378d2031569e6e7cd2dc43866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Adri=C3=A1n=20Ca=C3=B1ones=20Castellano?= Date: Sun, 11 Mar 2018 13:12:19 +0100 Subject: [PATCH 5/9] DOC: update the docstring of pandas.DataFrame.info Improvements and extended examples introductions --- pandas/core/frame.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eb3a6ea6a3e4d..9d4bc6136096a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1835,7 +1835,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, the `display.memory_usage` setting. True or False overrides the `display.memory_usage` setting. A value of 'deep' is equivalent of True, with deep introspection. Memory usage is shown in - human-readable units (base-2 representation). Without deep instrospection + human-readable units (base-2 representation). Without deep introspection a memory estimation is made based in column dtype and number of rows assuming values consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed @@ -1856,7 +1856,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, See Also -------- - describe: Generate descriptive statistics of DataFrame columns. + DataFrame.describe: Generate descriptive statistics of DataFrame columns. 
+ DataFrame.memory_usage: Memory usage of DataFrame columns. Examples -------- @@ -1872,6 +1873,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, 3 4 delta 0.75 4 5 epsilon 1.00 + Prints information of all columns overriding default `display.max_info_columns` setting: + >>> df.info(verbose=True) RangeIndex: 5 entries, 0 to 4 @@ -1882,6 +1885,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, dtypes: float64(1), int64(1), object(1) memory usage: 200.0+ bytes + Prints a summary of columns count and its dtypes but not per column information: + >>> df.info(verbose=False) RangeIndex: 5 entries, 0 to 4 @@ -1889,6 +1894,9 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, dtypes: float64(1), int64(1), object(1) memory usage: 200.0+ bytes + Pipe output of DataFrame.info to buffer instead of sys.stdout, get buffer content + and writes to a text file: + >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) @@ -1897,6 +1905,9 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, ... f.write(s) 260 + The `memory_usage` parameter allows deep introspection mode, specially useful for + big DataFrames and fine-tune memory optimization: + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) >>> df = pd.DataFrame({'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), From 7bf0f11c5ecba7d03d07d65832983a80519a7e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Adri=C3=A1n=20Ca=C3=B1ones=20Castellano?= Date: Fri, 16 Mar 2018 20:35:29 +0100 Subject: [PATCH 6/9] DOC: update the docstring of pandas.DataFrame.info Added reviewer feedback. 
--- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d4bc6136096a..6db1e137502e9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1813,7 +1813,7 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ - Concise summary of a DataFrame. + Prints a concise summary of a DataFrame. This method prints information about DataFrame: index dtype, columns dtypes, non-null values and memory usage. From 0c2919d44707a02be4f9c68dc8d51a77e3138c8c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 15:09:14 -0500 Subject: [PATCH 7/9] Updates --- pandas/core/frame.py | 84 ++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6db1e137502e9..0f8d0c28adac6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1813,50 +1813,53 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ - Prints a concise summary of a DataFrame. + Print a concise summary of a DataFrame. This method prints information about DataFrame: index dtype, columns dtypes, non-null values and memory usage. Parameters ---------- - verbose : {None, True, False}, optional - Whether to print the full summary. - None follows the `display.max_info_columns` setting. - True or False overrides the `display.max_info_columns` setting. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. This can + be overridden by passing `verbose`. buf : writable buffer, defaults to sys.stdout - Whether to pipe the output. 
- max_cols : int, default None - Determines whether full summary or short summary is printed. - None follows the `display.max_info_columns` setting. - memory_usage : boolean/string, default None + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + max_cols : int, optional + When to switch from the verbose to the truncated output. By + default, the setting in ``pandas.options.display.max_info_columns`` + is used. This can be overridden by passing `max_cols`. + memory_usage : bool, str, optional Specifies whether total memory usage of the DataFrame - elements (including index) should be displayed. None follows - the `display.memory_usage` setting. True or False overrides - the `display.memory_usage` setting. A value of 'deep' is equivalent - of True, with deep introspection. Memory usage is shown in - human-readable units (base-2 representation). Without deep introspection - a memory estimation is made based in column dtype and number of rows - assuming values consume the same memory amount for corresponding dtypes. - With deep memory introspection, a real memory usage calculation is performed + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + This can be overridden by passing `memory_usage`. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based on column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed at the cost of computational resources. - null_counts : boolean, default None - Whether to show the non-null counts - - - If None, then only show if the frame is smaller than - max_info_rows and max_info_columns.
- - If True, always show counts. - - If False, never show counts. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the frame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. Returns ------- - None: NoneType + None This method prints a summary of a DataFrame and returns None. See Also -------- - - DataFrame.describe: Generate descriptive statistics of DataFrame columns. + DataFrame.describe: Generate descriptive statistics of DataFrame + columns. DataFrame.memory_usage: Memory usage of DataFrame columns. Examples @@ -1864,7 +1867,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, "float_col": float_values}) + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, + ... 
"float_col": float_values}) >>> df int_col text_col float_col 0 1 alpha 0.00 @@ -1873,7 +1877,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, 3 4 delta 0.75 4 5 epsilon 1.00 - Prints information of all columns overriding default `display.max_info_columns` setting: + Prints information of all columns: >>> df.info(verbose=True) @@ -1885,7 +1889,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, dtypes: float64(1), int64(1), object(1) memory usage: 200.0+ bytes - Prints a summary of columns count and its dtypes but not per column information: + Prints a summary of columns count and its dtypes but not per column + information: >>> df.info(verbose=False) @@ -1894,8 +1899,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, dtypes: float64(1), int64(1), object(1) memory usage: 200.0+ bytes - Pipe output of DataFrame.info to buffer instead of sys.stdout, get buffer content - and writes to a text file: + Pipe output of DataFrame.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: >>> import io >>> buffer = io.StringIO() @@ -1905,13 +1910,15 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, ... f.write(s) 260 - The `memory_usage` parameter allows deep introspection mode, specially useful for - big DataFrames and fine-tune memory optimization: + The `memory_usage` parameter allows deep introspection mode, specially + useful for big DataFrames and fine-tune memory optimization: >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) - >>> df = pd.DataFrame({'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)}) + >>> df = pd.DataFrame({ + ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 
'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... }) >>> df.info() RangeIndex: 1000000 entries, 0 to 999999 @@ -1921,6 +1928,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB + >>> df.info(memory_usage='deep') RangeIndex: 1000000 entries, 0 to 999999 From 65439a19c39efe1687aee24bb348b61a9827431a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 15:10:53 -0500 Subject: [PATCH 8/9] Wording --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f8d0c28adac6..2a0169e5c1710 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1815,8 +1815,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, """ Print a concise summary of a DataFrame. - This method prints information about DataFrame: index dtype, columns - dtypes, non-null values and memory usage. + This method prints information about a DataFrame including + the index dtype and column dtypes, non-null values and memory usage. Parameters ---------- From 3e785f8e595287efeef4ae877dc4596f24713362 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 16 Mar 2018 15:12:47 -0500 Subject: [PATCH 9/9] More wording --- pandas/core/frame.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2a0169e5c1710..22e972850bbbf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1822,21 +1822,22 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, ---------- verbose : bool, optional Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. This can - be overridden by passing `verbose`. + ``pandas.options.display.max_info_columns`` is followed. 
buf : writable buffer, defaults to sys.stdout Where to send the output. By default, the output is printed to sys.stdout. Pass a writable buffer if you need to further process the output. max_cols : int, optional - When to switch from the verbose to the truncated output. By - default, the setting in ``pandas.options.display.max_info_columns`` - is used. This can be overridden by passing `max_cols`. + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. memory_usage : bool, str, optional Specifies whether total memory usage of the DataFrame elements (including the index) should be displayed. By default, this follows the ``pandas.options.display.memory_usage`` setting. - This can be overridden by passing `memory_usage`. + + True always shows memory usage. False never shows memory usage. A value of 'deep' is equivalent to "True with deep introspection". Memory usage is shown in human-readable units (base-2 representation). Without deep introspection a memory estimation is