Skip to content

Commit ff1964b

Browse files
committed
GH13967: move around _NA_VALUES and add doc for read_excel().na_values
1 parent a01e58f commit ff1964b

File tree

7 files changed

+121
-97
lines changed

7 files changed

+121
-97
lines changed

pandas/io/common.py

+8
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@
1414
from pandas.core.common import AbstractMethodError
1515
from pandas.types.common import is_number
1616

17+
# common NA values
18+
# no longer excluding inf representations
19+
# '1.#INF','-1.#INF', '1.#INF000000',
20+
_NA_VALUES = set([
21+
'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
22+
'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
23+
])
24+
1725
try:
1826
import pathlib
1927
_PATHLIB_INSTALLED = True

pandas/io/excel.py

+96-89
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616
from pandas.core.frame import DataFrame
1717
from pandas.io.parsers import TextParser
1818
from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
19-
EmptyDataError, get_filepath_or_buffer)
19+
EmptyDataError, get_filepath_or_buffer,
20+
_NA_VALUES)
2021
from pandas.tseries.period import Period
2122
from pandas import json
2223
from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -27,12 +28,105 @@
2728
import pandas.compat.openpyxl_compat as openpyxl_compat
2829
from warnings import warn
2930
from distutils.version import LooseVersion
31+
from pandas.util.decorators import Appender
3032

3133
__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
3234

3335
_writer_extensions = ["xlsx", "xls", "xlsm"]
3436
_writers = {}
3537

38+
_read_excel_doc = """
39+
Read an Excel table into a pandas DataFrame
40+
41+
Parameters
42+
----------
43+
io : string, path object (pathlib.Path or py._path.local.LocalPath),
44+
file-like object, pandas ExcelFile, or xlrd workbook.
45+
The string could be a URL. Valid URL schemes include http, ftp, s3,
46+
and file. For file URLs, a host is expected. For instance, a local
47+
file could be file://localhost/path/to/workbook.xlsx
48+
sheetname : string, int, mixed list of strings/ints, or None, default 0
49+
50+
Strings are used for sheet names, Integers are used in zero-indexed
51+
sheet positions.
52+
53+
Lists of strings/integers are used to request multiple sheets.
54+
55+
Specify None to get all sheets.
56+
57+
str|int -> DataFrame is returned.
58+
list|None -> Dict of DataFrames is returned, with keys representing
59+
sheets.
60+
61+
Available Cases
62+
63+
* Defaults to 0 -> 1st sheet as a DataFrame
64+
* 1 -> 2nd sheet as a DataFrame
65+
* "Sheet1" -> 1st sheet as a DataFrame
66+
* [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames
67+
* None -> All sheets as a dictionary of DataFrames
68+
69+
header : int, list of ints, default 0
70+
Row (0-indexed) to use for the column labels of the parsed
71+
DataFrame. If a list of integers is passed those row positions will
72+
be combined into a ``MultiIndex``
73+
skiprows : list-like
74+
Rows to skip at the beginning (0-indexed)
75+
skip_footer : int, default 0
76+
Rows at the end to skip (0-indexed)
77+
index_col : int, list of ints, default None
78+
Column (0-indexed) to use as the row labels of the DataFrame.
79+
Pass None if there is no such column. If a list is passed,
80+
those columns will be combined into a ``MultiIndex``
81+
names : array-like, default None
82+
List of column names to use. If file contains no header row,
83+
then you should explicitly pass header=None
84+
converters : dict, default None
85+
Dict of functions for converting values in certain columns. Keys can
86+
either be integers or column labels, values are functions that take one
87+
input argument, the Excel cell content, and return the transformed
88+
content.
89+
parse_cols : int, list of ints, or string, default None
90+
* If None then parse all columns,
91+
* If int then indicates last column to be parsed
92+
* If list of ints then indicates list of column numbers to be parsed
93+
* If string then indicates comma separated list of column names and
94+
column ranges (e.g. "A:E" or "A,C,E:F")
95+
squeeze : boolean, default False
96+
If the parsed data only contains one column then return a Series
97+
na_values : str or list-like or dict, default None
98+
Additional strings to recognize as NA/NaN. If dict passed, specific
99+
per-column NA values. By default the following values are interpreted
100+
as NaN: '""" + "', '".join(sorted(_NA_VALUES)) + """'.
101+
thousands : str, default None
102+
Thousands separator for parsing string columns to numeric. Note that
103+
this parameter is only necessary for columns stored as TEXT in Excel,
104+
any numeric columns will automatically be parsed, regardless of display
105+
format.
106+
keep_default_na : bool, default True
107+
If na_values are specified and keep_default_na is False the default NaN
108+
values are overridden, otherwise they're appended to.
109+
verbose : boolean, default False
110+
Indicate number of NA values placed in non-numeric columns
111+
engine : string, default None
112+
If io is not a buffer or path, this must be set to identify io.
113+
Acceptable values are None or xlrd
114+
convert_float : boolean, default True
115+
Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
116+
data will be read in as floats: Excel stores all numbers as floats
117+
internally
118+
has_index_names : boolean, default None
119+
DEPRECATED: for version 0.17+ index names will be automatically
120+
inferred based on index_col. To read Excel output from 0.16.2 and
121+
prior that had saved index names, use True.
122+
123+
Returns
124+
-------
125+
parsed : DataFrame or Dict of DataFrames
126+
DataFrame from the passed in Excel file. See notes in sheetname
127+
argument for more information on when a Dict of Dataframes is returned.
128+
"""
129+
36130

37131
def register_writer(klass):
38132
"""Adds engine to the excel writer registry. You must use this method to
@@ -74,100 +168,13 @@ def get_writer(engine_name):
74168
raise ValueError("No Excel writer '%s'" % engine_name)
75169

76170

171+
@Appender(_read_excel_doc)
77172
def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0,
78173
index_col=None, names=None, parse_cols=None, parse_dates=False,
79174
date_parser=None, na_values=None, thousands=None,
80175
convert_float=True, has_index_names=None, converters=None,
81176
engine=None, squeeze=False, **kwds):
82-
"""
83-
Read an Excel table into a pandas DataFrame
84-
85-
Parameters
86-
----------
87-
io : string, path object (pathlib.Path or py._path.local.LocalPath),
88-
file-like object, pandas ExcelFile, or xlrd workbook.
89-
The string could be a URL. Valid URL schemes include http, ftp, s3,
90-
and file. For file URLs, a host is expected. For instance, a local
91-
file could be file://localhost/path/to/workbook.xlsx
92-
sheetname : string, int, mixed list of strings/ints, or None, default 0
93-
94-
Strings are used for sheet names, Integers are used in zero-indexed
95-
sheet positions.
96-
97-
Lists of strings/integers are used to request multiple sheets.
98-
99-
Specify None to get all sheets.
100-
101-
str|int -> DataFrame is returned.
102-
list|None -> Dict of DataFrames is returned, with keys representing
103-
sheets.
104-
105-
Available Cases
106-
107-
* Defaults to 0 -> 1st sheet as a DataFrame
108-
* 1 -> 2nd sheet as a DataFrame
109-
* "Sheet1" -> 1st sheet as a DataFrame
110-
* [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames
111-
* None -> All sheets as a dictionary of DataFrames
112-
113-
header : int, list of ints, default 0
114-
Row (0-indexed) to use for the column labels of the parsed
115-
DataFrame. If a list of integers is passed those row positions will
116-
be combined into a ``MultiIndex``
117-
skiprows : list-like
118-
Rows to skip at the beginning (0-indexed)
119-
skip_footer : int, default 0
120-
Rows at the end to skip (0-indexed)
121-
index_col : int, list of ints, default None
122-
Column (0-indexed) to use as the row labels of the DataFrame.
123-
Pass None if there is no such column. If a list is passed,
124-
those columns will be combined into a ``MultiIndex``
125-
names : array-like, default None
126-
List of column names to use. If file contains no header row,
127-
then you should explicitly pass header=None
128-
converters : dict, default None
129-
Dict of functions for converting values in certain columns. Keys can
130-
either be integers or column labels, values are functions that take one
131-
input argument, the Excel cell content, and return the transformed
132-
content.
133-
parse_cols : int or list, default None
134-
* If None then parse all columns,
135-
* If int then indicates last column to be parsed
136-
* If list of ints then indicates list of column numbers to be parsed
137-
* If string then indicates comma separated list of column names and
138-
column ranges (e.g. "A:E" or "A,C,E:F")
139-
squeeze : boolean, default False
140-
If the parsed data only contains one column then return a Series
141-
na_values : list-like, default None
142-
List of additional strings to recognize as NA/NaN
143-
thousands : str, default None
144-
Thousands separator for parsing string columns to numeric. Note that
145-
this parameter is only necessary for columns stored as TEXT in Excel,
146-
any numeric columns will automatically be parsed, regardless of display
147-
format.
148-
keep_default_na : bool, default True
149-
If na_values are specified and keep_default_na is False the default NaN
150-
values are overridden, otherwise they're appended to
151-
verbose : boolean, default False
152-
Indicate number of NA values placed in non-numeric columns
153-
engine: string, default None
154-
If io is not a buffer or path, this must be set to identify io.
155-
Acceptable values are None or xlrd
156-
convert_float : boolean, default True
157-
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
158-
data will be read in as floats: Excel stores all numbers as floats
159-
internally
160-
has_index_names : boolean, default None
161-
DEPRECATED: for version 0.17+ index names will be automatically
162-
inferred based on index_col. To read Excel output from 0.16.2 and
163-
prior that had saved index names, use True.
164177

165-
Returns
166-
-------
167-
parsed : DataFrame or Dict of DataFrames
168-
DataFrame from the passed in Excel file. See notes in sheetname
169-
argument for more information on when a Dict of Dataframes is returned.
170-
"""
171178
if not isinstance(io, ExcelFile):
172179
io = ExcelFile(io, engine=engine)
173180

pandas/io/parsers.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,14 @@
2525
from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
2626
_get_handle, UnicodeReader, UTF8Recoder,
2727
BaseIterator, CParserError, EmptyDataError,
28-
ParserWarning)
28+
ParserWarning, _NA_VALUES)
2929
from pandas.tseries import tools
3030

3131
from pandas.util.decorators import Appender
3232

3333
import pandas.lib as lib
3434
import pandas.parser as _parser
3535

36-
# common NA values
37-
# no longer excluding inf representations
38-
# '1.#INF','-1.#INF', '1.#INF000000',
39-
_NA_VALUES = set([
40-
'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
41-
'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
42-
])
4336

4437
# BOM character (byte order mark)
4538
# This exists at the beginning of a file to indicate endianness

pandas/io/tests/data/test5.xls

20 KB
Binary file not shown.

pandas/io/tests/data/test5.xlsm

7.83 KB
Binary file not shown.

pandas/io/tests/data/test5.xlsx

7.81 KB
Binary file not shown.

pandas/io/tests/test_excel.py

+16
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,22 @@ def test_excel_passes_na(self):
244244
columns=['Test'])
245245
tm.assert_frame_equal(parsed, expected)
246246

247+
def test_excel_passes_additional_na(self):
248+
249+
excel = self.get_excelfile('test5')
250+
251+
parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
252+
na_values=['apple'])
253+
expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
254+
columns=['Test'])
255+
tm.assert_frame_equal(parsed, expected)
256+
257+
parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
258+
na_values=['apple'])
259+
expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
260+
columns=['Test'])
261+
tm.assert_frame_equal(parsed, expected)
262+
247263
def test_excel_table_sheet_by_index(self):
248264

249265
excel = self.get_excelfile('test1')

0 commit comments

Comments
 (0)