Skip to content

ENH: show numbers on .info() with verbose flag #28876

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jan 3, 2020
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ including other versions of pandas.
Enhancements
~~~~~~~~~~~~

- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make a small sub-section showing these changes? Move it to the API-breaking section, as this is a change in the output repr.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added the difference to the "Backwards incompatible API changes" section. I hope that is what you meant.


.. _whatsnew_100.string:

Dedicated string data type
Expand Down
81 changes: 64 additions & 17 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2343,9 +2343,11 @@ def info(
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
int_col 5 non-null int64
text_col 5 non-null object
float_col 5 non-null float64
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 int_col 5 non-null int64
1 text_col 5 non-null object
2 float_col 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes

Expand Down Expand Up @@ -2384,19 +2386,23 @@ def info(
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
column_1 1000000 non-null object
column_2 1000000 non-null object
column_3 1000000 non-null object
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 22.9+ MB

>>> df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
column_1 1000000 non-null object
column_2 1000000 non-null object
column_3 1000000 non-null object
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 column_1 1000000 non-null object
1 column_2 1000000 non-null object
2 column_3 1000000 non-null object
dtypes: object(3)
memory usage: 188.8 MB
"""
Expand All @@ -2415,6 +2421,7 @@ def info(
return

cols = self.columns
col_count = len(self.columns)

# hack
if max_cols is None:
Expand All @@ -2423,17 +2430,28 @@ def info(
max_rows = get_option("display.max_info_rows", len(self) + 1)

if null_counts is None:
show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows)
show_counts = (col_count <= max_cols) and (len(self) < max_rows)
else:
show_counts = null_counts
exceeds_info_cols = len(self.columns) > max_cols
exceeds_info_cols = col_count > max_cols

def _verbose_repr():
lines.append("Data columns (total %d columns):" % len(self.columns))
space = max(len(pprint_thing(k)) for k in self.columns) + 4

id_head = " # "
column_head = "Column"
col_space = 2

max_col = max(len(pprint_thing(k)) for k in cols)
len_column = len(pprint_thing(column_head))
space = max(max_col, len_column) + col_space

max_id = len(pprint_thing(col_count))
len_id = len(pprint_thing(id_head))
space_num = max(max_id, len_id) + col_space
counts = None

tmpl = "{count}{dtype}"
header = _put_str(id_head, space_num) + _put_str(column_head, space)
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
Expand All @@ -2443,19 +2461,48 @@ def _verbose_repr():
cols=len(cols), counts=len(counts)
)
)
tmpl = "{count} non-null {dtype}"
count_header = "Non-Null Count"
len_count = len(count_header)
non_null = " non-null"
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
space_count = max(len_count, max_count) + col_space
count_temp = "{count}" + non_null
else:
count_header = ""
space_count = len(count_header)
len_count = space_count
count_temp = "{count}"

dtype_header = "Dtype"
len_dtype = len(dtype_header)
max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
space_dtype = max(len_dtype, max_dtypes)
header += _put_str(count_header, space_count) + _put_str(
dtype_header, space_dtype
)

lines.append(header)
lines.append(
_put_str("-" * len_id, space_num)
+ _put_str("-" * len_column, space)
+ _put_str("-" * len_count, space_count)
+ _put_str("-" * len_dtype, space_dtype)
)

dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i]
dtype = self.dtypes.iloc[i]
col = pprint_thing(col)

line_no = _put_str(" {num}".format(num=i), space_num)
count = ""
if show_counts:
count = counts.iloc[i]

lines.append(
_put_str(col, space) + tmpl.format(count=count, dtype=dtype)
line_no
+ _put_str(col, space)
+ _put_str(count_temp.format(count=count), space_count)
+ _put_str(dtype, space_dtype)
)

def _non_verbose_repr():
Expand Down
47 changes: 39 additions & 8 deletions pandas/tests/frame/test_repr_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,28 @@ def test_info(self):
frame.info()
frame.info(verbose=False)

def test_info_verbose(self):
    # Verbose ``info()`` on a wide frame: the output must contain the
    # "#/Column/Dtype" header with its dashed rule, and every per-column row
    # must start with that column's positional number.
    buf = StringIO()
    size = 1001  # number of columns in the frame
    start = 5  # line index at which the per-column rows are expected to begin
    frame = DataFrame(np.random.randn(3, size))
    frame.info(verbose=True, buf=buf)

    res = buf.getvalue()
    # NOTE(review): the spacing inside this literal looks collapsed in this
    # view of the diff; confirm the exact column padding against the real file.
    header = " # Column Dtype \n" "--- ------ ----- "
    assert header in res

    # NOTE(review): this writes a second report into the same buffer without
    # clearing it; the range checked below (start .. start + size) appears to
    # fall entirely within the first report, so the call looks redundant.
    frame.info(verbose=True, buf=buf)
    buf.seek(0)
    lines = buf.readlines()
    assert len(lines) > 0

    for i, line in enumerate(lines):
        # Only lines in [start, start + size) are treated as column rows.
        if i >= start and i < start + size:
            index = i - start  # zero-based column number expected on this row
            line_nr = " {} ".format(index)
            assert line.startswith(line_nr)

def test_info_memory(self):
# https://github.com/pandas-dev/pandas/issues/21056
df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")})
Expand All @@ -219,7 +241,9 @@ def test_info_memory(self):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
a 2 non-null int64
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 2 non-null int64
dtypes: int64(1)
memory usage: {} bytes
""".format(
Expand Down Expand Up @@ -263,8 +287,8 @@ def test_info_duplicate_columns_shows_correct_dtypes(self):
frame.info(buf=io)
io.seek(0)
lines = io.readlines()
assert "a 1 non-null int64\n" == lines[3]
assert "a 1 non-null float64\n" == lines[4]
assert " 0 a 1 non-null int64 \n" == lines[5]
assert " 1 a 1 non-null float64\n" == lines[6]

def test_info_shows_column_dtypes(self):
dtypes = [
Expand All @@ -284,30 +308,37 @@ def test_info_shows_column_dtypes(self):
buf = StringIO()
df.info(buf=buf)
res = buf.getvalue()
header = (
" # Column Non-Null Count Dtype \n"
"--- ------ -------------- ----- "
)
assert header in res
for i, dtype in enumerate(dtypes):
name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype)
name = " {i:d} {i:d} {n:d} non-null {dtype}".format(
i=i, n=n, dtype=dtype
)
assert name in res

def test_info_max_cols(self):
df = DataFrame(np.random.randn(10, 5))
for len_, verbose in [(5, None), (5, False), (10, True)]:
for len_, verbose in [(5, None), (5, False), (12, True)]:
# For verbose always ^ setting ^ summarize ^ full output
with option_context("max_info_columns", 4):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_

for len_, verbose in [(10, None), (5, False), (10, True)]:
for len_, verbose in [(12, None), (5, False), (12, True)]:

# max_cols no exceeded
# max_cols not exceeded
with option_context("max_info_columns", 5):
buf = StringIO()
df.info(buf=buf, verbose=verbose)
res = buf.getvalue()
assert len(res.strip().split("\n")) == len_

for len_, max_cols in [(10, 5), (5, 4)]:
for len_, max_cols in [(12, 5), (5, 4)]:
# setting truncates
with option_context("max_info_columns", 4):
buf = StringIO()
Expand Down