Skip to content

DOC: read_excel doc - fixed formatting and added examples #18753

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Dec 30, 2017
Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ Other API Changes
- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`)
- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`)
- :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`)
- In :func:`read_excel`, the ``comment`` argument is now exposed as a named parameter (:issue:`18735`)

.. _whatsnew_0230.deprecations:

Expand Down
83 changes: 82 additions & 1 deletion pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
na_values : scalar, str, list-like, or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values. By default the following values are interpreted
as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'.
as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70, subsequent_indent=" ") + """'.
keep_default_na : bool, default True
If na_values are specified and keep_default_na is False the default NaN
values are overridden, otherwise they're appended to.
Expand All @@ -148,6 +148,10 @@
this parameter is only necessary for columns stored as TEXT in Excel,
any numeric columns will automatically be parsed, regardless of display
format.
comment : str, default None
Comments out remainder of line. Pass a character or characters to this
argument to indicate comments in the input file. Any data between the
comment string and the end of the current line is ignored.
skip_footer : int, default 0

.. deprecated:: 0.23.0
Expand All @@ -164,6 +168,77 @@
parsed : DataFrame or Dict of DataFrames
DataFrame from the passed in Excel file. See notes in sheet_name
argument for more information on when a Dict of DataFrames is returned.

Examples
--------

An example DataFrame written to a local file

>>> df_out = pd.DataFrame([('string1', 1),
... ('string2', 2),
... ('string3', 3)],
... columns=['Name', 'Value'])
>>> df_out
Name Value
0 string1 1
1 string2 2
2 string3 3
>>> df_out.to_excel('tmp.xlsx')

The file can be read using the file name as string or an open file object:

>>> pd.read_excel('tmp.xlsx')
Name Value
0 string1 1
1 string2 2
2 string3 3

>>> pd.read_excel(open('tmp.xlsx','rb'))
Name Value
0 string1 1
1 string2 2
2 string3 3

Index and header can be specified via the `index_col` and `header` arguments:

>>> pd.read_excel('tmp.xlsx', index_col=None, header=None)
0 1 2
0 NaN Name Value
1 0.0 string1 1
2 1.0 string2 2
3 2.0 string3 3

Column types are inferred but can be explicitly specified

>>> pd.read_excel('tmp.xlsx', dtype={'Name':str, 'Value':float})
Name Value
0 string1 1.0
1 string2 2.0
2 string3 3.0

True, False, and NA values, and thousands separators have defaults,
but can also be specified explicitly. Supply the desired values as
strings or lists of strings:

>>> pd.read_excel('tmp.xlsx',
... na_values=['string1', 'string2'])
Name Value
0 NaN 1
1 NaN 2
2 string3 3

Comment lines in the Excel input file can be skipped using the `comment` kwarg:

>>> df = pd.DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
>>> df.to_excel('tmp.xlsx', index=False)
>>> pd.read_excel('tmp.xlsx')
a b
0 1 2
1 #2 3

>>> pd.read_excel('tmp.xlsx', comment='#')
a b
0 1 2
"""


Expand Down Expand Up @@ -223,6 +298,7 @@ def read_excel(io,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
**kwds):
Expand Down Expand Up @@ -256,6 +332,7 @@ def read_excel(io,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
**kwds)
Expand Down Expand Up @@ -338,6 +415,7 @@ def parse(self,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
**kwds):
Expand All @@ -363,6 +441,7 @@ def parse(self,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
**kwds)
Expand Down Expand Up @@ -417,6 +496,7 @@ def _parse_excel(self,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
**kwds):
Expand Down Expand Up @@ -591,6 +671,7 @@ def _parse_cell(cell_contents, cell_typ):
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
**kwds)

Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/io/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1858,6 +1858,68 @@ def test_invalid_columns(self):
with pytest.raises(KeyError):
write_frame.to_excel(path, 'test1', columns=['C', 'D'])

def test_comment_arg(self):
    # GH 18735
    # Reading with comment='#' must match an uncommented read in which the
    # cells from each '#' marker to the end of its row have been blanked.
    with ensure_clean(self.ext) as path:

        # Write a sheet in which some cells start with the comment marker
        frame = DataFrame({'A': ['one', '#one', 'one'],
                           'B': ['two', 'two', '#two']})
        frame.to_excel(path, 'test_c')

        # Build the expectation from a plain read by blanking the cells
        # that the comment handling is expected to drop
        plain = read_excel(path, 'test_c')
        for row, col in [(1, 0), (1, 1), (2, 1)]:
            plain.iloc[row, col] = None

        commented = read_excel(path, 'test_c', comment='#')
        tm.assert_frame_equal(plain, commented)

def test_comment_default(self):
    # GH 18735
    # Omitting ``comment`` must give the same frame as ``comment=None``
    with ensure_clean(self.ext) as path:

        # Write a sheet in which some cells start with '#'
        frame = DataFrame({'A': ['one', '#one', 'one'],
                           'B': ['two', 'two', '#two']})
        frame.to_excel(path, 'test_c')

        # Compare the implicit default with the explicit None
        implicit = read_excel(path, 'test_c')
        explicit = read_excel(path, 'test_c', comment=None)
        tm.assert_frame_equal(implicit, explicit)

def test_comment_used(self):
    # GH 18735
    # Reading with comment='#' must equal the hand-written frame below
    with ensure_clean(self.ext) as path:

        # Write a sheet in which some cells start with the comment marker
        frame = DataFrame({'A': ['one', '#one', 'one'],
                           'B': ['two', 'two', '#two']})
        frame.to_excel(path, 'test_c')

        # Cells from each marker to the end of its row are expected empty
        wanted = DataFrame({'A': ['one', None, 'one'],
                            'B': ['two', None, None]})
        observed = read_excel(path, 'test_c', comment='#')
        tm.assert_frame_equal(observed, wanted)

def test_comment_emptyline(self):
    # GH 18735
    # A final row that starts with the comment marker must not show up
    # in the result
    with ensure_clean(self.ext) as path:

        frame = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
        frame.to_excel(path, index=False)

        # Only the first data row is expected to survive
        wanted = DataFrame({'a': [1], 'b': [2]})
        observed = read_excel(path, comment='#')
        tm.assert_frame_equal(observed, wanted)

def test_datetimes(self):

# Test writing and reading datetimes. For issue #9139. (xref #9185)
Expand Down