From 5697abb1e72d4c126b5d5111566b61d014aa2833 Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 23 Sep 2015 19:06:23 -0500 Subject: [PATCH] API: raise on header=bool in parsers --- doc/source/whatsnew/v0.17.0.txt | 17 +++++++++++++++++ pandas/io/common.py | 6 ++++++ pandas/io/excel.py | 3 ++- pandas/io/html.py | 3 ++- pandas/io/parsers.py | 4 +++- pandas/io/tests/test_excel.py | 9 ++++++++- pandas/io/tests/test_html.py | 5 +++++ pandas/io/tests/test_parsers.py | 16 ++++++++++++++++ 8 files changed, 59 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 6be913e1e5f51..3d4d113940dec 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -907,6 +907,23 @@ Changes to ``Categorical.unique`` cat cat.unique() +Changes to ``bool`` passed as ``header`` in Parsers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In earlier versions of pandas, if a bool was passed to the ``header`` argument of +``read_csv``, ``read_excel``, or ``read_html`` it was implicitly converted to +an integer, resulting in ``header=0`` for ``False`` and ``header=1`` for ``True`` +(:issue:`6113`). + +A ``bool`` input to ``header`` will now raise a ``TypeError``. + +.. code-block :: python + + In [29]: df = pd.read_csv('data.csv', header=False) + TypeError: Passing a bool to header is invalid. Use header=None for no header or + header=int or list-like of ints to specify the row(s) making up the column names + + .. _whatsnew_0170.api_breaking.other: Other API Changes diff --git a/pandas/io/common.py b/pandas/io/common.py index e13c402b454d1..b9cdd44e52555 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -194,6 +194,12 @@ def _expand_user(filepath_or_buffer): return os.path.expanduser(filepath_or_buffer) return filepath_or_buffer +def _validate_header_arg(header): + if isinstance(header, bool): + raise TypeError("Passing a bool to header is invalid. 
" + "Use header=None for no header or " + "header=int or list-like of ints to specify " + "the row(s) making up the column names") def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 5767af1ad3862..d90a8b8e90a93 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -11,7 +11,7 @@ from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser -from pandas.io.common import _is_url, _urlopen +from pandas.io.common import _is_url, _urlopen, _validate_header_arg from pandas.tseries.period import Period from pandas import json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, @@ -217,6 +217,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, if skipfooter is not None: skip_footer = skipfooter + _validate_header_arg(header) if has_index_names is not None: warn("\nThe has_index_names argument is deprecated; index names " "will be automatically inferred based on index_col.\n" diff --git a/pandas/io/html.py b/pandas/io/html.py index cb2ee7b1c1e3f..f175702dedabc 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -13,7 +13,7 @@ import numpy as np -from pandas.io.common import _is_url, urlopen, parse_url +from pandas.io.common import _is_url, urlopen, parse_url, _validate_header_arg from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) @@ -861,5 +861,6 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, if isinstance(skiprows, numbers.Integral) and skiprows < 0: raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') + _validate_header_arg(header) return _parse(flavor, io, match, header, index_col, skiprows, parse_dates, tupleize_cols, thousands, attrs, encoding) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 15e11193fd1b7..8ac1aed9d9af7 
100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -17,7 +17,7 @@ from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, _validate_header_arg from pandas.tseries import tools from pandas.util.decorators import Appender @@ -673,6 +673,8 @@ def _clean_options(self, options, engine): # really delete this one keep_default_na = result.pop('keep_default_na') + _validate_header_arg(options['header']) + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 657789fe8ce9b..e20703398b5f6 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -384,6 +384,8 @@ def test_read_excel_blank_with_header(self): tm.assert_frame_equal(actual, expected) + + class XlrdTests(ReadingTestsBase): """ This is the base class for the xlrd tests, and 3 different file formats @@ -641,7 +643,12 @@ def test_excel_oldindex_format(self): has_index_names=False) tm.assert_frame_equal(actual, expected, check_names=False) - + def test_read_excel_bool_header_arg(self): + #GH 6114 + for arg in [True, False]: + with tm.assertRaises(TypeError): + pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), + header=arg) class XlsReaderTests(XlrdTests, tm.TestCase): ext = '.xls' diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index c00517ab92f96..5c8c15c7c2ae0 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -637,6 +637,11 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64')) + def test_bool_header_arg(self): + #GH 6114 + for arg in [True, False]: + with 
tm.assertRaises(TypeError): + read_html(self.spam_data, header=arg) def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index c9ae2f6029530..799c573b13c8b 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -4117,6 +4117,22 @@ def test_single_char_leading_whitespace(self): skipinitialspace=True) tm.assert_frame_equal(result, expected) + def test_bool_header_arg(self): + # GH 6114 + data = """\ +MyColumn + a + b + a + b""" + for arg in [True, False]: + with tm.assertRaises(TypeError): + pd.read_csv(StringIO(data), header=arg) + with tm.assertRaises(TypeError): + pd.read_table(StringIO(data), header=arg) + with tm.assertRaises(TypeError): + pd.read_fwf(StringIO(data), header=arg) + class TestMiscellaneous(tm.TestCase): # for tests that don't fit into any of the other classes, e.g. those that