
CLN: Move info #31876

Merged: 5 commits, Feb 19, 2020
282 changes: 3 additions & 279 deletions pandas/core/frame.py
@@ -14,7 +14,6 @@
import datetime
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
IO,
@@ -131,7 +130,7 @@

from pandas.io.common import get_filepath_or_buffer
from pandas.io.formats import console, format as fmt
from pandas.io.formats.printing import pprint_thing
from pandas.io.formats.info import info
import pandas.plotting

if TYPE_CHECKING:
@@ -2225,282 +2224,11 @@ def to_html(
)

# ----------------------------------------------------------------------

@Appender(info.__doc__)
def info(
self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
) -> None:
"""
Print a concise summary of a DataFrame.

This method prints information about a DataFrame including
the index dtype and column dtypes, non-null values and memory usage.

Parameters
----------
verbose : bool, optional
Whether to print the full summary. By default, the setting in
``pandas.options.display.max_info_columns`` is followed.
buf : writable buffer, defaults to sys.stdout
Where to send the output. By default, the output is printed to
sys.stdout. Pass a writable buffer if you need to further process
the output.
max_cols : int, optional
When to switch from the verbose to the truncated output. If the
DataFrame has more than `max_cols` columns, the truncated output
is used. By default, the setting in
``pandas.options.display.max_info_columns`` is used.
memory_usage : bool, str, optional
Specifies whether total memory usage of the DataFrame
elements (including the index) should be displayed. By default,
this follows the ``pandas.options.display.memory_usage`` setting.

True always shows memory usage. False never shows memory usage.
A value of 'deep' is equivalent to "True with deep introspection".
Memory usage is shown in human-readable units (base-2
representation). Without deep introspection a memory estimation is
made based on column dtype and the number of rows, assuming values
consume the same memory amount for corresponding dtypes. With deep
memory introspection, a real memory usage calculation is performed
at the cost of computational resources.
null_counts : bool, optional
Whether to show the non-null counts. By default, this is shown
only if the frame is smaller than
``pandas.options.display.max_info_rows`` and
``pandas.options.display.max_info_columns``. A value of True always
shows the counts, and False never shows the counts.

Returns
-------
None
This method prints a summary of a DataFrame and returns None.

See Also
--------
DataFrame.describe: Generate descriptive statistics of DataFrame
columns.
DataFrame.memory_usage: Memory usage of DataFrame columns.

Examples
--------
>>> int_values = [1, 2, 3, 4, 5]
>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
... "float_col": float_values})
>>> df
int_col text_col float_col
0 1 alpha 0.00
1 2 beta 0.25
2 3 gamma 0.50
3 4 delta 0.75
4 5 epsilon 1.00

Prints information about all columns:

>>> df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 int_col 5 non-null int64
1 text_col 5 non-null object
2 float_col 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Prints a summary of the column count and dtypes but not per-column
information:

>>> df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 3 entries, int_col to float_col
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Pipe the output of DataFrame.info to a buffer instead of sys.stdout, get
the buffer content and write it to a text file:

>>> import io
>>> buffer = io.StringIO()
>>> df.info(buf=buffer)
>>> s = buffer.getvalue()
>>> with open("df_info.txt", "w",
... encoding="utf-8") as f: # doctest: +SKIP
... f.write(s)
260

The `memory_usage` parameter allows deep introspection mode, especially
useful for big DataFrames and for fine-tuning memory optimization:

>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
>>> df = pd.DataFrame({
... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
... })
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 22.9+ MB

>>> df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 188.8 MB
"""
if buf is None: # pragma: no cover
buf = sys.stdout

lines = []

lines.append(str(type(self)))
lines.append(self.index._summary())

if len(self.columns) == 0:
lines.append(f"Empty {type(self).__name__}")
fmt.buffer_put_lines(buf, lines)
return

cols = self.columns
col_count = len(self.columns)

# hack
if max_cols is None:
max_cols = get_option("display.max_info_columns", len(self.columns) + 1)

max_rows = get_option("display.max_info_rows", len(self) + 1)

if null_counts is None:
show_counts = (col_count <= max_cols) and (len(self) < max_rows)
else:
show_counts = null_counts
exceeds_info_cols = col_count > max_cols
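# Illustrative note (not part of the original source): with null_counts=None,
# counts are shown only when the frame fits within both display options, e.g. a
# 5-row x 3-column frame shows counts iff 3 <= display.max_info_columns and
# 5 < display.max_info_rows.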

def _verbose_repr():
lines.append(f"Data columns (total {len(self.columns)} columns):")

id_head = " # "
column_head = "Column"
col_space = 2

max_col = max(len(pprint_thing(k)) for k in cols)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space

max_id = len(pprint_thing(col_count))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None

header = _put_str(id_head, space_num) + _put_str(column_head, space)
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
f"Columns must equal counts ({len(cols)} != {len(counts)})"
)
count_header = "Non-Null Count"
len_count = len(count_header)
non_null = " non-null"
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
space_count = max(len_count, max_count) + col_space
count_temp = "{count}" + non_null
else:
count_header = ""
space_count = len(count_header)
len_count = space_count
count_temp = "{count}"

dtype_header = "Dtype"
len_dtype = len(dtype_header)
max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
space_dtype = max(len_dtype, max_dtypes)
header += _put_str(count_header, space_count) + _put_str(
dtype_header, space_dtype
)

lines.append(header)
lines.append(
_put_str("-" * len_id, space_num)
+ _put_str("-" * len_column, space)
+ _put_str("-" * len_count, space_count)
+ _put_str("-" * len_dtype, space_dtype)
)

for i, col in enumerate(self.columns):
dtype = self.dtypes.iloc[i]
col = pprint_thing(col)

line_no = _put_str(f" {i}", space_num)
count = ""
if show_counts:
count = counts.iloc[i]

lines.append(
line_no
+ _put_str(col, space)
+ _put_str(count_temp.format(count=count), space_count)
+ _put_str(dtype, space_dtype)
)
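# Illustrative note (not part of the original source): for the 3-column example
# frame in the docstring above, space = max(len("float_col"), len("Column")) + 2 = 11
# and space_num = max(len("3"), len(" # ")) + 2 = 5, so row 0 renders as
# " 0   int_col    5 non-null      int64  ".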

def _non_verbose_repr():
lines.append(self.columns._summary(name="Columns"))

def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ["bytes", "KB", "MB", "GB", "TB"]:
if num < 1024.0:
return f"{num:3.1f}{size_qualifier} {x}"
num /= 1024.0
return f"{num:3.1f}{size_qualifier} PB"

if verbose:
_verbose_repr()
elif verbose is False:  # explicitly set to False, as opposed to None
_non_verbose_repr()
else:
if exceeds_info_cols:
_non_verbose_repr()
else:
_verbose_repr()
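# Illustrative summary (not part of the original source): verbose=True always
# emits the per-column table, verbose=False always emits the one-line column
# summary, and verbose=None falls back to the per-column table unless the
# column count exceeds max_cols (display.max_info_columns by default).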

counts = self._data.get_dtype_counts()
dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
lines.append(f"dtypes: {', '.join(dtypes)}")

if memory_usage is None:
memory_usage = get_option("display.memory_usage")
if memory_usage:
# append memory usage of df to display
size_qualifier = ""
if memory_usage == "deep":
deep = True
else:
# size_qualifier is just a best effort; not guaranteed to catch
# all cases (e.g., it misses categorical data even with object
# categories)
deep = False
if "object" in counts or self.index._is_memory_usage_qualified():
size_qualifier = "+"
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
fmt.buffer_put_lines(buf, lines)
return info(self, verbose, buf, max_cols, memory_usage, null_counts)
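
A standalone sketch (not part of this diff; `Frame` and the return strings are hypothetical) of why the one-line wrapper above does not recurse: inside a method body a bare name is looked up in the module globals, not in the class namespace, so `info(self, ...)` resolves to the function imported from `pandas.io.formats.info`, while `@Appender(info.__doc__)` copies that function's docstring onto the method.

def info(obj, verbose=None) -> str:
    # Stands in for the module-level function imported from pandas.io.formats.info.
    return f"module-level info({type(obj).__name__}, verbose={verbose})"


class Frame:
    # The method shadows the name `info` only as a class attribute; inside the
    # body, the bare name still resolves to the module-level function above,
    # so the call below does not recurse into the method.
    def info(self, verbose=None) -> str:
        return info(self, verbose)


print(Frame().info(verbose=True))  # -> module-level info(Frame, verbose=True)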

def memory_usage(self, index=True, deep=False) -> Series:
"""
@@ -8623,7 +8351,3 @@ def _from_nested_dict(data):
new_data[col] = new_data.get(col, {})
new_data[col][index] = v
return new_data


def _put_str(s, space):
return str(s)[:space].ljust(space)
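# Illustrative examples of the removed helper (not part of the original source):
#   _put_str("float_col", 11)               -> "float_col  "  (right-padded to 11 chars)
#   _put_str("a_very_long_column_name", 11) -> "a_very_long"  (truncated to 11 chars)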