diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py deleted file mode 100644 index 950795b33d460..0000000000000 --- a/pandas/tests/io/parser/dtypes.py +++ /dev/null @@ -1,400 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests dtype specification during parsing -for all of the parsers defined in parsers.py -""" - -import numpy as np -import pytest - -from pandas.compat import StringIO -from pandas.errors import ParserWarning - -from pandas.core.dtypes.dtypes import CategoricalDtype - -import pandas as pd -from pandas import Categorical, DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm - - -class DtypeTests(object): - - def test_passing_dtype(self): - # see gh-6607 - df = DataFrame(np.random.rand(5, 2).round(4), columns=list( - 'AB'), index=['1A', '1B', '1C', '1D', '1E']) - - with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: - df.to_csv(path) - - # see gh-3795: passing 'str' as the dtype - result = self.read_csv(path, dtype=str, index_col=0) - expected = df.astype(str) - tm.assert_frame_equal(result, expected) - - # for parsing, interpret object as str - result = self.read_csv(path, dtype=object, index_col=0) - tm.assert_frame_equal(result, expected) - - # we expect all object columns, so need to - # convert to test for equivalence - result = result.astype(float) - tm.assert_frame_equal(result, df) - - # invalid dtype - pytest.raises(TypeError, self.read_csv, path, - dtype={'A': 'foo', 'B': 'float64'}, - index_col=0) - - # see gh-12048: empty frame - actual = self.read_csv(StringIO('A,B'), dtype=str) - expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) - tm.assert_frame_equal(actual, expected) - - def test_pass_dtype(self): - data = """\ -one,two -1,2.5 -2,3.5 -3,4.5 -4,5.5""" - - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}) - assert result['one'].dtype == 'u1' - assert result['two'].dtype == 'object' - - def test_categorical_dtype(self): - # GH 10153 - data = """a,b,c -1,a,3.4 -1,a,3.4 -2,b,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['a', 'a', 'b']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'a': 'category', - 'b': 'category', - 'c': CategoricalDtype()}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) - expected = pd.DataFrame({'a': [1, 1, 2], - 'b': Categorical(['a', 'a', 'b']), - 'c': [3.4, 3.4, 4.5]}) - tm.assert_frame_equal(actual, expected) - - actual = self.read_csv(StringIO(data), dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - # unsorted - data = """a,b,c -1,b,3.4 -1,b,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', 'b', 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - # missing - data = """a,b,c -1,b,3.4 -1,nan,3.4 -2,a,4.5""" - expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), - 'b': Categorical(['b', np.nan, 'a']), - 'c': Categorical(['3.4', '3.4', '4.5'])}) - actual = self.read_csv(StringIO(data), dtype='category') - tm.assert_frame_equal(actual, expected) - - @pytest.mark.slow - def 
test_categorical_dtype_high_cardinality_numeric(self): - # GH 18186 - data = np.sort([str(i) for i in range(524289)]) - expected = DataFrame({'a': Categorical(data, ordered=True)}) - actual = self.read_csv(StringIO('a\n' + '\n'.join(data)), - dtype='category') - actual["a"] = actual["a"].cat.reorder_categories( - np.sort(actual.a.cat.categories), ordered=True) - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_encoding(self, datapath): - # GH 10153 - pth = datapath('io', 'parser', 'data', 'unicode_series.csv') - encoding = 'latin-1' - expected = self.read_csv(pth, header=None, encoding=encoding) - expected[1] = Categorical(expected[1]) - actual = self.read_csv(pth, header=None, encoding=encoding, - dtype={1: 'category'}) - tm.assert_frame_equal(actual, expected) - - pth = datapath('io', 'parser', 'data', 'utf16_ex.txt') - encoding = 'utf-16' - expected = self.read_table(pth, encoding=encoding) - expected = expected.apply(Categorical) - actual = self.read_table(pth, encoding=encoding, dtype='category') - tm.assert_frame_equal(actual, expected) - - def test_categorical_dtype_chunksize(self): - # GH 10153 - data = """a,b -1,a -1,b -1,b -2,c""" - expecteds = [pd.DataFrame({'a': [1, 1], - 'b': Categorical(['a', 'b'])}), - pd.DataFrame({'a': [1, 2], - 'b': Categorical(['b', 'c'])}, - index=[2, 3])] - actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, - chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - @pytest.mark.parametrize('ordered', [False, True]) - @pytest.mark.parametrize('categories', [ - ['a', 'b', 'c'], - ['a', 'c', 'b'], - ['a', 'b', 'c', 'd'], - ['c', 'b', 'a'], - ]) - def test_categorical_categoricaldtype(self, categories, ordered): - data = """a,b -1,a -1,b -1,b -2,c""" - expected = pd.DataFrame({ - "a": [1, 1, 1, 2], - "b": Categorical(['a', 'b', 'b', 'c'], - categories=categories, - ordered=ordered) - }) - dtype = {"b": CategoricalDtype(categories=categories, - ordered=ordered)} - result = self.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_categoricaldtype_unsorted(self): - data = """a,b -1,a -1,b -1,b -2,c""" - dtype = CategoricalDtype(['c', 'b', 'a']) - expected = pd.DataFrame({ - 'a': [1, 1, 1, 2], - 'b': Categorical(['a', 'b', 'b', 'c'], categories=['c', 'b', 'a']) - }) - result = self.read_csv(StringIO(data), dtype={'b': dtype}) - tm.assert_frame_equal(result, expected) - - def test_categoricaldtype_coerces_numeric(self): - dtype = {'b': CategoricalDtype([1, 2, 3])} - data = "b\n1\n1\n2\n3" - expected = pd.DataFrame({'b': Categorical([1, 1, 2, 3])}) - result = self.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categoricaldtype_coerces_datetime(self): - dtype = { - 'b': CategoricalDtype(pd.date_range('2017', '2019', freq='AS')) - } - data = "b\n2017-01-01\n2018-01-01\n2019-01-01" - expected = pd.DataFrame({'b': Categorical(dtype['b'].categories)}) - result = self.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - dtype = { - 'b': CategoricalDtype([pd.Timestamp("2014")]) - } - data = "b\n2014-01-01\n2014-01-01T00:00:00" - expected = pd.DataFrame({'b': Categorical([pd.Timestamp('2014')] * 2)}) - result = self.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categoricaldtype_coerces_timedelta(self): - dtype = {'b': CategoricalDtype(pd.to_timedelta(['1H', '2H', '3H']))} - data = "b\n1H\n2H\n3H" - expected 
= pd.DataFrame({'b': Categorical(dtype['b'].categories)}) - result = self.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categoricaldtype_unexpected_categories(self): - dtype = {'b': CategoricalDtype(['a', 'b', 'd', 'e'])} - data = "b\nd\na\nc\nd" # Unexpected c - expected = pd.DataFrame({"b": Categorical(list('dacd'), - dtype=dtype['b'])}) - result = self.read_csv(StringIO(data), dtype=dtype) - tm.assert_frame_equal(result, expected) - - def test_categorical_categoricaldtype_chunksize(self): - # GH 10153 - data = """a,b -1,a -1,b -1,b -2,c""" - cats = ['a', 'b', 'c'] - expecteds = [pd.DataFrame({'a': [1, 1], - 'b': Categorical(['a', 'b'], - categories=cats)}), - pd.DataFrame({'a': [1, 2], - 'b': Categorical(['b', 'c'], - categories=cats)}, - index=[2, 3])] - dtype = CategoricalDtype(cats) - actuals = self.read_csv(StringIO(data), dtype={'b': dtype}, - chunksize=2) - - for actual, expected in zip(actuals, expecteds): - tm.assert_frame_equal(actual, expected) - - def test_empty_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), dtype={'one': 'u1'}) - - expected = DataFrame({'one': np.empty(0, dtype='u1'), - 'two': np.empty(0, dtype=np.object)}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_index_pass_dtype(self): - data = 'one,two' - result = self.read_csv(StringIO(data), index_col=['one'], - dtype={'one': 'u1', 1: 'f'}) - - expected = DataFrame({'two': np.empty(0, dtype='f')}, - index=Index([], dtype='u1', name='one')) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_multiindex_pass_dtype(self): - data = 'one,two,three' - result = self.read_csv(StringIO(data), index_col=['one', 'two'], - dtype={'one': 'u1', 1: 'f8'}) - - exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), - np.empty(0, dtype='O')], - names=['one', 'two']) - expected = DataFrame( - {'three': np.empty(0, dtype=np.object)}, index=exp_idx) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={ - 'one': 'u1', 'one.1': 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_mangled_column_pass_dtype_by_indexes(self): - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - - expected = DataFrame( - {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # see gh-9424 - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one.1', dtype='f')], axis=1) - - data = 'one,one' - result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - data = '' - result = self.read_csv(StringIO(data), names=['one', 'one'], - dtype={0: 'u1', 1: 'f'}) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_raise_on_passed_int_dtype_with_nas(self): - # see gh-2631 - data = """YEAR, DOY, a -2001,106380451,10 -2001,,11 -2001,106380451,67""" - pytest.raises(ValueError, self.read_csv, StringIO(data), - sep=",", skipinitialspace=True, - dtype={'DOY': np.int64}) 
- - def test_dtype_with_converter(self): - data = """a,b -1.1,2.2 -1.2,2.3""" - # dtype spec ignored if converted specified - with tm.assert_produces_warning(ParserWarning): - result = self.read_csv(StringIO(data), dtype={'a': 'i8'}, - converters={'a': lambda x: str(x)}) - expected = DataFrame({'a': ['1.1', '1.2'], 'b': [2.2, 2.3]}) - tm.assert_frame_equal(result, expected) - - def test_empty_dtype(self): - # see gh-14712 - data = 'a,b' - - expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) - result = self.read_csv(StringIO(data), header=0, dtype=np.float64) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Categorical([]), - 'b': pd.Categorical([])}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='category') - tm.assert_frame_equal(result, expected) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': 'category', 'b': 'category'}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') - result = self.read_csv(StringIO(data), header=0, - dtype='datetime64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), - 'b': pd.Series([], dtype='timedelta64[ns]')}, - index=[]) - result = self.read_csv(StringIO(data), header=0, - dtype='timedelta64[ns]') - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.float64}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={0: np.float64}) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame(columns=['a', 'b']) - expected['a'] = expected['a'].astype(np.int32) - expected['b'] = expected['b'].astype(np.float64) - result = self.read_csv(StringIO(data), header=0, - dtype={'a': np.int32, 1: np.float64}) - tm.assert_frame_equal(result, expected) - - def test_numeric_dtype(self): - data = '0\n1' - - for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: - expected = pd.DataFrame([0, 1], dtype=dt) - result = self.read_csv(StringIO(data), header=None, dtype=dt) - tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py deleted file mode 100644 index 2191fdceb6928..0000000000000 --- a/pandas/tests/io/parser/header.py +++ /dev/null @@ -1,407 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that the file header is properly handled or inferred -during parsing for all of the parsers defined in parsers.py -""" - -from collections import namedtuple - -import numpy as np -import pytest - -from pandas.compat import StringIO, lrange, u -from pandas.errors import ParserError - -from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm - - -class HeaderTests(object): - - def test_read_with_bad_header(self): - errmsg = r"but only \d+ lines in file" - - with pytest.raises(ValueError, match=errmsg): - s = StringIO(',,') - self.read_csv(s, header=[10]) - - def test_bool_header_arg(self): - # see gh-6114 - data = """\ -MyColumn - a - b - a - b""" - for arg in [True, False]: - with pytest.raises(TypeError): - self.read_csv(StringIO(data), header=arg) - with pytest.raises(TypeError): - self.read_table(StringIO(data), header=arg) - - def test_no_header_prefix(self): - data = 
"""1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', - header=None) - - expected = np.array([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], dtype=np.int64) - tm.assert_almost_equal(df_pref.values, expected) - - tm.assert_index_equal(df_pref.columns, - Index(['Field0', 'Field1', 'Field2', - 'Field3', 'Field4'])) - - def test_header_with_index_col(self): - data = """foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - names = ['A', 'B', 'C'] - df = self.read_csv(StringIO(data), names=names) - - assert list(df.columns) == ['A', 'B', 'C'] - - values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] - expected = DataFrame(values, index=['foo', 'bar', 'baz'], - columns=['A', 'B', 'C']) - tm.assert_frame_equal(df, expected) - - def test_header_not_first_line(self): - data = """got,to,ignore,this,line -got,to,ignore,this,line -index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - data2 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -""" - - df = self.read_csv(StringIO(data), header=2, index_col=0) - expected = self.read_csv(StringIO(data2), header=0, index_col=0) - tm.assert_frame_equal(df, expected) - - def test_header_multi_index(self): - expected = tm.makeCustomDataframe( - 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - - data = """\ -C0,,C_l0_g0,C_l0_g1,C_l0_g2 - -C1,,C_l1_g0,C_l1_g1,C_l1_g2 -C2,,C_l2_g0,C_l2_g1,C_l2_g2 -C3,,C_l3_g0,C_l3_g1,C_l3_g2 -R0,R1,,, -R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 -R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 -R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 -R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 -R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 -""" - - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1]) - tm.assert_frame_equal(df, expected) - - # skipping lines in the header - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1]) - tm.assert_frame_equal(df, expected) - - # INVALID OPTIONS - - # names - pytest.raises(ValueError, self.read_csv, - StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], names=['foo', 'bar']) - - # usecols - pytest.raises(ValueError, self.read_csv, - StringIO(data), header=[0, 1, 2, 3], - index_col=[0, 1], usecols=['foo', 'bar']) - - # non-numeric index_col - pytest.raises(ValueError, self.read_csv, - StringIO(data), header=[0, 1, 2, 3], - index_col=['foo', 'bar']) - - def test_header_multiindex_common_format(self): - - df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - index=['one', 'two'], - columns=MultiIndex.from_tuples( - [('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')])) - - # to_csv - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -,,,,,, -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(df, result) - - # to_csv, tuples - result = self.read_csv(StringIO(data), skiprows=3, - names=[('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')], - index_col=0) - tm.assert_frame_equal(df, result) - - # to_csv, namedtuples - TestTuple = namedtuple('names', ['first', 'second']) - result = self.read_csv( - StringIO(data), skiprows=3, index_col=0, - names=[TestTuple('a', 'q'), TestTuple('a', 'r'), - TestTuple('a', 's'), TestTuple('b', 't'), - TestTuple('c', 'u'), TestTuple('c', 'v')]) - tm.assert_frame_equal(df, result) - - # common - data = """,a,a,a,b,c,c -,q,r,s,t,u,v -one,1,2,3,4,5,6 -two,7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(df, result) - - # common, tuples - result = 
self.read_csv(StringIO(data), skiprows=2, - names=[('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')], - index_col=0) - tm.assert_frame_equal(df, result) - - # common, namedtuples - TestTuple = namedtuple('names', ['first', 'second']) - result = self.read_csv( - StringIO(data), skiprows=2, index_col=0, - names=[TestTuple('a', 'q'), TestTuple('a', 'r'), - TestTuple('a', 's'), TestTuple('b', 't'), - TestTuple('c', 'u'), TestTuple('c', 'v')]) - tm.assert_frame_equal(df, result) - - # common, no index_col - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=None) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # common, no index_col, tuples - result = self.read_csv(StringIO(data), skiprows=2, - names=[('a', 'q'), ('a', 'r'), ('a', 's'), - ('b', 't'), ('c', 'u'), ('c', 'v')], - index_col=None) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # common, no index_col, namedtuples - TestTuple = namedtuple('names', ['first', 'second']) - result = self.read_csv( - StringIO(data), skiprows=2, index_col=None, - names=[TestTuple('a', 'q'), TestTuple('a', 'r'), - TestTuple('a', 's'), TestTuple('b', 't'), - TestTuple('c', 'u'), TestTuple('c', 'v')]) - tm.assert_frame_equal(df.reset_index(drop=True), result) - - # malformed case 1 - expected = DataFrame(np.array( - [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), - index=Index([1, 7]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], - [u('r'), u('s'), u('t'), - u('u'), u('v')]], - labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[u('a'), u('q')])) - - data = """a,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - # malformed case 2 - expected = DataFrame(np.array( - [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), - index=Index([1, 7]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], - [u('r'), u('s'), u('t'), - u('u'), u('v')]], - labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], - names=[None, u('q')])) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) - tm.assert_frame_equal(expected, result) - - # mi on columns and index (malformed) - expected = DataFrame(np.array( - [[3, 4, 5, 6], [9, 10, 11, 12]], dtype='int64'), - index=MultiIndex(levels=[[1, 7], [2, 8]], - labels=[[0, 1], [0, 1]]), - columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], - [u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 1, 2, 2], [0, 1, 2, 3]], - names=[None, u('q')])) - - data = """,a,a,b,c,c -q,r,s,t,u,v -1,2,3,4,5,6 -7,8,9,10,11,12""" - - result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) - tm.assert_frame_equal(expected, result) - - def test_header_names_backward_compat(self): - # #2539 - data = '1,2,3\n4,5,6' - - result = self.read_csv(StringIO(data), names=['a', 'b', 'c']) - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - tm.assert_frame_equal(result, expected) - - data2 = 'foo,bar,baz\n' + data - result = self.read_csv(StringIO(data2), names=['a', 'b', 'c'], - header=0) - tm.assert_frame_equal(result, expected) - - def test_read_only_header_no_rows(self): - # See gh-7773 - expected = DataFrame(columns=['a', 'b', 'c']) - - df = self.read_csv(StringIO('a,b,c')) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO('a,b,c'), index_col=False) - 
tm.assert_frame_equal(df, expected) - - def test_no_header(self): - data = """1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - df = self.read_table(StringIO(data), sep=',', header=None) - df_pref = self.read_table(StringIO(data), sep=',', prefix='X', - header=None) - - names = ['foo', 'bar', 'baz', 'quux', 'panda'] - df2 = self.read_table(StringIO(data), sep=',', names=names) - expected = np.array([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], dtype=np.int64) - tm.assert_almost_equal(df.values, expected) - tm.assert_almost_equal(df.values, df2.values) - - tm.assert_index_equal(df_pref.columns, - Index(['X0', 'X1', 'X2', 'X3', 'X4'])) - tm.assert_index_equal(df.columns, Index(lrange(5))) - - tm.assert_index_equal(df2.columns, Index(names)) - - def test_non_int_header(self): - # GH 16338 - msg = 'header must be integer or list of integers' - data = """1,2\n3,4""" - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), sep=',', header=['a', 'b']) - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), sep=',', header='string_header') - - def test_singleton_header(self): - # See GH #7757 - data = """a,b,c\n0,1,2\n1,2,3""" - df = self.read_csv(StringIO(data), header=[0]) - expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) - tm.assert_frame_equal(df, expected) - - def test_mangles_multi_index(self): - # See GH 18062 - data = """A,A,A,B\none,one,one,two\n0,40,34,0.1""" - df = self.read_csv(StringIO(data), header=[0, 1]) - expected = DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [('A', 'one'), ('A', 'one.1'), - ('A', 'one.2'), ('B', 'two')])) - tm.assert_frame_equal(df, expected) - - data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1""" - df = self.read_csv(StringIO(data), header=[0, 1]) - expected = DataFrame([[0, 40, 34, 0.1]], - columns=MultiIndex.from_tuples( - [('A', 'one'), ('A', 'one.1'), - ('A', 'one.1.1'), ('B', 'two')])) - tm.assert_frame_equal(df, expected) - - data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1""" - df = self.read_csv(StringIO(data), header=[0, 1]) - expected = DataFrame([[0, 40, 34, 0.1, 0.1]], - columns=MultiIndex.from_tuples( - [('A', 'one'), ('A', 'one.1'), - ('A', 'one.1.1'), ('B', 'two'), - ('B', 'two.1')])) - tm.assert_frame_equal(df, expected) - - @pytest.mark.parametrize("index_col", [None, [0]]) - @pytest.mark.parametrize("columns", [None, - (["", "Unnamed"]), - (["Unnamed", ""]), - (["Unnamed", "NotUnnamed"])]) - def test_multi_index_unnamed(self, index_col, columns): - # see gh-23687 - # - # When specifying a multi-index header, make sure that - # we don't error just because one of the rows in our header - # has ALL column names containing the string "Unnamed". The - # correct condition to check is whether the row contains - # ALL columns that did not have names (and instead were given - # placeholder ones). - header = [0, 1] - - if index_col is None: - data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" - else: - data = (",".join([""] + (columns or ["", ""])) + - "\n,0,1\n0,2,3\n1,4,5\n") - - if columns is None: - msg = (r"Passed header=\[0,1\] are too " - r"many rows for this multi_index of columns") - with pytest.raises(ParserError, match=msg): - self.read_csv(StringIO(data), header=header, - index_col=index_col) - else: - result = self.read_csv(StringIO(data), header=header, - index_col=index_col) - template = "Unnamed: {i}_level_0" - exp_columns = [] - - for i, col in enumerate(columns): - if not col: # Unnamed. 
- col = template.format(i=i if index_col is None else i + 1) - - exp_columns.append(col) - - columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) - expected = DataFrame([[2, 3], [4, 5]], columns=columns) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/index_col.py b/pandas/tests/io/parser/index_col.py deleted file mode 100644 index ba54ed4620199..0000000000000 --- a/pandas/tests/io/parser/index_col.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that the specified index column (a.k.a 'index_col') -is properly handled or inferred during parsing for all of -the parsers defined in parsers.py -""" - -import pytest - -from pandas.compat import StringIO - -from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm - - -class IndexColTests(object): - - def test_index_col_named(self): - no_header = """\ -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa - - h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa - data = h + no_header - rs = self.read_csv(StringIO(data), index_col='ID') - xp = self.read_csv(StringIO(data), header=0).set_index('ID') - tm.assert_frame_equal(rs, xp) - - pytest.raises(ValueError, self.read_csv, StringIO(no_header), - index_col='ID') - - data = """\ -1,2,3,4,hello -5,6,7,8,world -9,10,11,12,foo -""" - names = ['a', 'b', 'c', 'd', 'message'] - xp = DataFrame({'a': [1, 5, 9], 'b': [2, 6, 10], 'c': [3, 7, 11], - 'd': [4, 8, 12]}, - index=Index(['hello', 'world', 'foo'], name='message')) - rs = self.read_csv(StringIO(data), names=names, index_col=['message']) - tm.assert_frame_equal(xp, rs) - assert xp.index.name == rs.index.name - - rs = self.read_csv(StringIO(data), names=names, index_col='message') - tm.assert_frame_equal(xp, rs) - assert xp.index.name == rs.index.name - - def test_index_col_is_true(self): - # see gh-9798 - pytest.raises(ValueError, self.read_csv, - StringIO(self.ts_data), index_col=True) - - def test_infer_index_col(self): - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - data = self.read_csv(StringIO(data)) - assert data.index.equals(Index(['foo', 'bar', 'baz'])) - - def test_empty_index_col_scenarios(self): - data = 'x,y,z' - - # None, no index - index_col, expected = None, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # False, no index - index_col, expected = False, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # int, first column - index_col, expected = 0, DataFrame( - [], columns=['y', 'z'], index=Index([], name='x')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # int, not first column - index_col, expected = 1, DataFrame( - [], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # str, first column - index_col, expected = 'x', DataFrame( - [], columns=['y', 'z'], index=Index([], name='x')) - 
tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # str, not the first column - index_col, expected = 'y', DataFrame( - [], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), expected) - - # list of int - index_col, expected = [0, 1], DataFrame( - [], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), - expected, check_index_type=False) - - # list of str - index_col = ['x', 'y'] - expected = DataFrame([], columns=['z'], - index=MultiIndex.from_arrays( - [[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(self.read_csv(StringIO( - data), index_col=index_col), - expected, check_index_type=False) - - # list of int, reversed sequence - index_col = [1, 0] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(self.read_csv( - StringIO(data), index_col=index_col), - expected, check_index_type=False) - - # list of str, reversed sequence - index_col = ['y', 'x'] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(self.read_csv(StringIO( - data), index_col=index_col), - expected, check_index_type=False) - - def test_empty_with_index_col_false(self): - # see gh-10413 - data = 'x,y' - result = self.read_csv(StringIO(data), index_col=False) - expected = DataFrame([], columns=['x', 'y']) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("index_names", [ - ["", ""], - ["foo", ""], - ["", "bar"], - ["foo", "bar"], - ["NotReallyUnnamed", "Unnamed: 0"], - ]) - def test_multi_index_naming(self, index_names): - # We don't want empty index names being replaced with "Unnamed: 0" - data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) - result = self.read_csv(StringIO(data), index_col=[0, 1]) - - expected = DataFrame({"col": [1, 2, 3, 4]}, - index=MultiIndex.from_product([["a", "b"], - ["c", "d"]])) - expected.index.names = [name if name else None for name in index_names] - tm.assert_frame_equal(result, expected) - - def test_multi_index_naming_not_all_at_beginning(self): - data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" - result = self.read_csv(StringIO(data), index_col=[0, 2]) - - expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, - index=MultiIndex( - levels=[['a', 'b'], [1, 2, 3, 4]], - labels=[[0, 0, 1, 1], [0, 1, 2, 3]])) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/mangle_dupes.py b/pandas/tests/io/parser/mangle_dupes.py deleted file mode 100644 index 56d59060cc17d..0000000000000 --- a/pandas/tests/io/parser/mangle_dupes.py +++ /dev/null @@ -1,107 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that duplicate columns are handled appropriately when parsed by the -CSV engine. In general, the expected result is that they are either thoroughly -de-duplicated (if mangling requested) or ignored otherwise. -""" - -from pandas.compat import StringIO - -from pandas import DataFrame -import pandas.util.testing as tm - - -class DupeColumnTests(object): - def test_basic(self): - # TODO: add test for condition "mangle_dupe_cols=False" - # once it is actually supported (gh-12935) - data = "a,a,b,b,b\n1,2,3,4,5" - - for method in ("read_csv", "read_table"): - # Check default behavior. 
- expected = ["a", "a.1", "b", "b.1", "b.2"] - df = getattr(self, method)(StringIO(data), sep=",") - assert list(df.columns) == expected - - df = getattr(self, method)(StringIO(data), sep=",", - mangle_dupe_cols=True) - assert list(df.columns) == expected - - def test_basic_names(self): - # See gh-7160 - data = "a,b,a\n0,1,2\n3,4,5" - expected = DataFrame([[0, 1, 2], [3, 4, 5]], - columns=["a", "b", "a.1"]) - - df = self.read_csv(StringIO(data)) - tm.assert_frame_equal(df, expected) - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - data = "0,1,2\n3,4,5" - df = self.read_csv(StringIO(data), - names=["a", "b", "a"]) - tm.assert_frame_equal(df, expected) - - def test_thorough_mangle_columns(self): - # see gh-17060 - data = "a,a,a.1\n1,2,3" - df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.1.1"] - - data = "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6" - df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"] - - data = "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7" - df = self.read_csv(StringIO(data), sep=",", mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"] - - def test_thorough_mangle_names(self): - # see gh-17095 - data = "a,b,b\n1,2,3" - names = ["a.1", "a.1", "a.1.1"] - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a.1", "a.1.1", "a.1.1.1"] - - data = "a,b,c,d,e,f\n1,2,3,4,5,6" - names = ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"] - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.1.1", "a.1.1.1", - "a.1.1.1.1", "a.1.1.1.1.1"] - - data = "a,b,c,d,e,f,g\n1,2,3,4,5,6,7" - names = ["a", "a", "a.3", "a.1", "a.2", "a", "a"] - - with tm.assert_produces_warning(UserWarning, check_stacklevel=False): - df = self.read_csv(StringIO(data), sep=",", names=names, - mangle_dupe_cols=True) - assert list(df.columns) == ["a", "a.1", "a.3", "a.1.1", - "a.2", "a.2.1", "a.3.1"] - - def test_mangled_unnamed_placeholders(self): - # xref gh-13017 - orig_key = "0" - orig_value = [1, 2, 3] - - df = DataFrame({orig_key: orig_value}) - - # This test recursively updates `df`. 
- for i in range(3): - expected = DataFrame() - - for j in range(i + 1): - expected["Unnamed: 0" + ".1" * j] = [0, 1, 2] - - expected[orig_key] = orig_value - df = self.read_csv(StringIO(df.to_csv())) - - tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py new file mode 100644 index 0000000000000..17cd0ab16ea61 --- /dev/null +++ b/pandas/tests/io/parser/test_dtypes.py @@ -0,0 +1,498 @@ +# -*- coding: utf-8 -*- + +""" +Tests dtype specification during parsing +for all of the parsers defined in parsers.py +""" + +import os + +import numpy as np +import pytest + +from pandas.compat import StringIO +from pandas.errors import ParserWarning + +from pandas.core.dtypes.dtypes import CategoricalDtype + +import pandas as pd +from pandas import ( + Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat) +import pandas.util.testing as tm + + +@pytest.mark.parametrize("dtype", [str, object]) +@pytest.mark.parametrize("check_orig", [True, False]) +def test_dtype_all_columns(all_parsers, dtype, check_orig): + # see gh-3795, gh-6607 + parser = all_parsers + + df = DataFrame(np.random.rand(5, 2).round(4), columns=list("AB"), + index=["1A", "1B", "1C", "1D", "1E"]) + + with tm.ensure_clean("__passing_str_as_dtype__.csv") as path: + df.to_csv(path) + + result = parser.read_csv(path, dtype=dtype, index_col=0) + + if check_orig: + expected = df.copy() + result = result.astype(float) + else: + expected = df.astype(str) + + tm.assert_frame_equal(result, expected) + + +def test_dtype_all_columns_empty(all_parsers): + # see gh-12048 + parser = all_parsers + result = parser.read_csv(StringIO("A,B"), dtype=str) + + expected = DataFrame({"A": [], "B": []}, index=[], dtype=str) + tm.assert_frame_equal(result, expected) + + +def test_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + expected = DataFrame([[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], + columns=["one", "two"]) + expected["one"] = expected["one"].astype(np.float64) + expected["two"] = expected["two"].astype(object) + + result = parser.read_csv(StringIO(data), dtype={"one": np.float64, + 1: str}) + tm.assert_frame_equal(result, expected) + + +def test_invalid_dtype_per_column(all_parsers): + parser = all_parsers + data = """\ +one,two +1,2.5 +2,3.5 +3,4.5 +4,5.5""" + + with pytest.raises(TypeError, match="data type 'foo' not understood"): + parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) + + +@pytest.mark.parametrize("dtype", [ + "category", + CategoricalDtype(), + {"a": "category", + "b": "category", + "c": CategoricalDtype()} +]) +def test_categorical_dtype(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame({"a": Categorical(["1", "1", "2"]), + "b": Categorical(["a", "a", "b"]), + "c": Categorical(["3.4", "3.4", "4.5"])}) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("dtype", [ + {"b": "category"}, + {1: "category"} +]) +def test_categorical_dtype_single(all_parsers, dtype): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = DataFrame({"a": [1, 1, 2], + "b": Categorical(["a", "a", "b"]), + "c": [3.4, 3.4, 4.5]}) + actual = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_unsorted(all_parsers): + # see gh-10153 + 
parser = all_parsers + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = DataFrame({"a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", "b", "a"]), + "c": Categorical(["3.4", "3.4", "4.5"])}) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_missing(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = DataFrame({"a": Categorical(["1", "1", "2"]), + "b": Categorical(["b", np.nan, "a"]), + "c": Categorical(["3.4", "3.4", "4.5"])}) + actual = parser.read_csv(StringIO(data), dtype="category") + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.slow +def test_categorical_dtype_high_cardinality_numeric(all_parsers): + # see gh-18186 + parser = all_parsers + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({"a": Categorical(data, ordered=True)}) + + actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), + dtype="category") + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_latin1(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + encoding = "latin-1" + + expected = parser.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + + actual = parser.read_csv(pth, header=None, encoding=encoding, + dtype={1: "category"}) + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_utf16(all_parsers, csv_dir_path): + # see gh-10153 + pth = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + encoding = "utf-16" + sep = "," + + expected = parser.read_csv(pth, sep=sep, encoding=encoding) + expected = expected.apply(Categorical) + + actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category") + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_infer_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [DataFrame({"a": [1, 1], + "b": Categorical(["a", "b"])}), + DataFrame({"a": [1, 2], + "b": Categorical(["b", "c"])}, + index=[2, 3])] + actuals = parser.read_csv(StringIO(data), dtype={"b": "category"}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +def test_categorical_dtype_chunksize_explicit_categories(all_parsers): + # see gh-10153 + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + cats = ["a", "b", "c"] + expecteds = [DataFrame({"a": [1, 1], + "b": Categorical(["a", "b"], + categories=cats)}), + DataFrame({"a": [1, 2], + "b": Categorical(["b", "c"], + categories=cats)}, + index=[2, 3])] + dtype = CategoricalDtype(cats) + actuals = parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + + +@pytest.mark.parametrize("ordered", [False, True]) +@pytest.mark.parametrize("categories", [ + ["a", "b", "c"], + ["a", "c", "b"], + ["a", "b", "c", "d"], + ["c", "b", "a"], +]) +def test_categorical_category_dtype(all_parsers, categories, ordered): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + expected = DataFrame({ + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], + categories=categories, + ordered=ordered) + }) + + dtype = {"b": 
CategoricalDtype(categories=categories, + ordered=ordered)} + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_category_dtype_unsorted(all_parsers): + parser = all_parsers + data = """a,b +1,a +1,b +1,b +2,c""" + dtype = CategoricalDtype(["c", "b", "a"]) + expected = DataFrame({ + "a": [1, 1, 1, 2], + "b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]) + }) + + result = parser.read_csv(StringIO(data), dtype={"b": dtype}) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_numeric(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([1, 2, 3])} + + data = "b\n1\n1\n2\n3" + expected = DataFrame({"b": Categorical([1, 1, 2, 3])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_datetime(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.date_range("2017", "2019", freq="AS"))} + + data = "b\n2017-01-01\n2018-01-01\n2019-01-01" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timestamp(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype([Timestamp("2014")])} + + data = "b\n2014-01-01\n2014-01-01T00:00:00" + expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_coerces_timedelta(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))} + + data = "b\n1H\n2H\n3H" + expected = DataFrame({"b": Categorical(dtype["b"].categories)}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_categorical_unexpected_categories(all_parsers): + parser = all_parsers + dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])} + + data = "b\nd\na\nc\nd" # Unexpected c + expected = DataFrame({"b": Categorical(list("dacd"), + dtype=dtype["b"])}) + + result = parser.read_csv(StringIO(data), dtype=dtype) + tm.assert_frame_equal(result, expected) + + +def test_empty_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv(StringIO(data), dtype={"one": "u1"}) + + expected = DataFrame({"one": np.empty(0, dtype="u1"), + "two": np.empty(0, dtype=np.object)}, + index=Index([], dtype=object)) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two" + result = parser.read_csv(StringIO(data), index_col=["one"], + dtype={"one": "u1", 1: "f"}) + + expected = DataFrame({"two": np.empty(0, dtype="f")}, + index=Index([], dtype="u1", name="one")) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_multi_index_pass_dtype(all_parsers): + parser = all_parsers + + data = "one,two,three" + result = parser.read_csv(StringIO(data), index_col=["one", "two"], + dtype={"one": "u1", 1: "f8"}) + + exp_idx = MultiIndex.from_arrays([np.empty(0, dtype="u1"), + np.empty(0, dtype=np.float64)], + names=["one", "two"]) + expected = DataFrame({"three": np.empty(0, dtype=np.object)}, + index=exp_idx) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_names(all_parsers): + parser = all_parsers + + data = "one,one" + result = 
parser.read_csv(StringIO(data), dtype={"one": "u1", "one.1": "f"}) + + expected = DataFrame({"one": np.empty(0, dtype="u1"), + "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object)) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_mangled_column_pass_dtype_by_indexes(all_parsers): + parser = all_parsers + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + + expected = DataFrame({"one": np.empty(0, dtype="u1"), + "one.1": np.empty(0, dtype="f")}, + index=Index([], dtype=object)) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat([Series([], name="one", dtype="u1"), + Series([], name="one.1", dtype="f")], axis=1) + expected.index = expected.index.astype(object) + + data = "one,one" + result = parser.read_csv(StringIO(data), dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_dup_column_pass_dtype_by_indexes_warn(all_parsers): + # see gh-9424 + parser = all_parsers + expected = concat([Series([], name="one", dtype="u1"), + Series([], name="one.1", dtype="f")], axis=1) + expected.index = expected.index.astype(object) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + data = "" + result = parser.read_csv(StringIO(data), names=["one", "one"], + dtype={0: "u1", 1: "f"}) + tm.assert_frame_equal(result, expected) + + +def test_raise_on_passed_int_dtype_with_nas(all_parsers): + # see gh-2631 + parser = all_parsers + data = """YEAR, DOY, a +2001,106380451,10 +2001,,11 +2001,106380451,67""" + + msg = ("Integer column has NA values" if parser.engine == "c" else + "Unable to convert column DOY") + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dtype={"DOY": np.int64}, + skipinitialspace=True) + + +def test_dtype_with_converters(all_parsers): + parser = all_parsers + data = """a,b +1.1,2.2 +1.2,2.3""" + + # Dtype spec ignored if converted specified. 
+ with tm.assert_produces_warning(ParserWarning): + result = parser.read_csv(StringIO(data), dtype={"a": "i8"}, + converters={"a": lambda x: str(x)}) + expected = DataFrame({"a": ["1.1", "1.2"], "b": [2.2, 2.3]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype,expected", [ + (np.float64, DataFrame(columns=["a", "b"], dtype=np.float64)), + ("category", DataFrame({"a": Categorical([]), + "b": Categorical([])}, + index=[])), + (dict(a="category", b="category"), + DataFrame({"a": Categorical([]), + "b": Categorical([])}, + index=[])), + ("datetime64[ns]", DataFrame(columns=["a", "b"], dtype="datetime64[ns]")), + ("timedelta64[ns]", DataFrame({"a": Series([], dtype="timedelta64[ns]"), + "b": Series([], dtype="timedelta64[ns]")}, + index=[])), + (dict(a=np.int64, + b=np.int32), DataFrame({"a": Series([], dtype=np.int64), + "b": Series([], dtype=np.int32)}, + index=[])), + ({0: np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64), + "b": Series([], dtype=np.int32)}, + index=[])), + ({"a": np.int64, 1: np.int32}, DataFrame({"a": Series([], dtype=np.int64), + "b": Series([], dtype=np.int32)}, + index=[])), +]) +def test_empty_dtype(all_parsers, dtype, expected): + # see gh-14712 + parser = all_parsers + data = "a,b" + + result = parser.read_csv(StringIO(data), header=0, dtype=dtype) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", list(np.typecodes["AllInteger"] + + np.typecodes["Float"])) +def test_numeric_dtype(all_parsers, dtype): + data = "0\n1" + parser = all_parsers + expected = DataFrame([0, 1], dtype=dtype) + + result = parser.read_csv(StringIO(data), header=None, dtype=dtype) + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py new file mode 100644 index 0000000000000..47b13ae6c50b1 --- /dev/null +++ b/pandas/tests/io/parser/test_header.py @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- + +""" +Tests that the file header is properly handled or inferred +during parsing for all of the parsers defined in parsers.py +""" + +from collections import namedtuple + +import numpy as np +import pytest + +from pandas.compat import StringIO, u +from pandas.errors import ParserError + +from pandas import DataFrame, Index, MultiIndex +import pandas.util.testing as tm + + +def test_read_with_bad_header(all_parsers): + parser = all_parsers + msg = r"but only \d+ lines in file" + + with pytest.raises(ValueError, match=msg): + s = StringIO(",,") + parser.read_csv(s, header=[10]) + + +@pytest.mark.parametrize("header", [True, False]) +def test_bool_header_arg(all_parsers, header): + # see gh-6114 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b""" + msg = "Passing a bool to header is invalid" + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), header=header) + + +def test_no_header_prefix(all_parsers): + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + result = parser.read_csv(StringIO(data), prefix="Field", header=None) + expected = DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], + columns=["Field0", "Field1", "Field2", + "Field3", "Field4"]) + tm.assert_frame_equal(result, expected) + + +def test_header_with_index_col(all_parsers): + parser = all_parsers + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ["A", "B", "C"] + result = parser.read_csv(StringIO(data), names=names) + + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], 
+ columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_header_not_first_line(all_parsers): + parser = all_parsers + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data), header=2, index_col=0) + expected = parser.read_csv(StringIO(data2), header=0, index_col=0) + tm.assert_frame_equal(result, expected) + + +def test_header_multi_index(all_parsers): + parser = all_parsers + expected = tm.makeCustomDataframe( + 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) + + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs,msg", [ + (dict(index_col=["foo", "bar"]), ("index_col must only contain " + "row numbers when specifying " + "a multi-index header")), + (dict(index_col=[0, 1], names=["foo", "bar"]), ("cannot specify names " + "when specifying a " + "multi-index header")), + (dict(index_col=[0, 1], usecols=["foo", "bar"]), ("cannot specify " + "usecols when " + "specifying a " + "multi-index header")), +]) +def test_header_multi_index_invalid(all_parsers, kwargs, msg): + data = """\ +C0,,C_l0_g0,C_l0_g1,C_l0_g2 + +C1,,C_l1_g0,C_l1_g1,C_l1_g2 +C2,,C_l2_g0,C_l2_g1,C_l2_g2 +C3,,C_l3_g0,C_l3_g1,C_l3_g2 +R0,R1,,, +R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2 +R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2 +R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2 +R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2 +R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 +""" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs) + + +_TestTuple = namedtuple("names", ["first", "second"]) + + +@pytest.mark.parametrize("kwargs", [ + dict(header=[0, 1]), + dict(skiprows=3, + names=[("a", "q"), ("a", "r"), ("a", "s"), + ("b", "t"), ("c", "u"), ("c", "v")]), + dict(skiprows=3, + names=[_TestTuple("a", "q"), _TestTuple("a", "r"), + _TestTuple("a", "s"), _TestTuple("b", "t"), + _TestTuple("c", "u"), _TestTuple("c", "v")]) +]) +def test_header_multi_index_common_format1(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), + ("b", "t"), ("c", "u"), ("c", "v")])) + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +,,,,,, +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [ + dict(header=[0, 1]), + dict(skiprows=2, + names=[("a", "q"), ("a", "r"), ("a", "s"), + ("b", "t"), ("c", "u"), ("c", "v")]), + dict(skiprows=2, + names=[_TestTuple("a", "q"), _TestTuple("a", "r"), + _TestTuple("a", "s"), _TestTuple("b", "t"), + _TestTuple("c", "u"), _TestTuple("c", "v")]) +]) +def test_header_multi_index_common_format2(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), + ("b", "t"), ("c", "u"), 
("c", "v")])) + data = """,a,a,a,b,c,c +,q,r,s,t,u,v +one,1,2,3,4,5,6 +two,7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [ + dict(header=[0, 1]), + dict(skiprows=2, + names=[("a", "q"), ("a", "r"), ("a", "s"), + ("b", "t"), ("c", "u"), ("c", "v")]), + dict(skiprows=2, + names=[_TestTuple("a", "q"), _TestTuple("a", "r"), + _TestTuple("a", "s"), _TestTuple("b", "t"), + _TestTuple("c", "u"), _TestTuple("c", "v")]) +]) +def test_header_multi_index_common_format3(all_parsers, kwargs): + parser = all_parsers + expected = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=["one", "two"], + columns=MultiIndex.from_tuples( + [("a", "q"), ("a", "r"), ("a", "s"), + ("b", "t"), ("c", "u"), ("c", "v")])) + expected = expected.reset_index(drop=True) + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), index_col=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_header_multi_index_common_format_malformed1(all_parsers): + parser = all_parsers + expected = DataFrame(np.array( + [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u("a"), u("b"), u("c")], + [u("r"), u("s"), u("t"), + u("u"), u("v")]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[u("a"), u("q")])) + data = """a,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + +def test_header_multi_index_common_format_malformed2(all_parsers): + parser = all_parsers + expected = DataFrame(np.array( + [[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"), + index=Index([1, 7]), + columns=MultiIndex(levels=[[u("a"), u("b"), u("c")], + [u("r"), u("s"), u("t"), + u("u"), u("v")]], + labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + names=[None, u("q")])) + + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) + + +def test_header_multi_index_common_format_malformed3(all_parsers): + parser = all_parsers + expected = DataFrame(np.array( + [[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"), + index=MultiIndex(levels=[[1, 7], [2, 8]], + labels=[[0, 1], [0, 1]]), + columns=MultiIndex(levels=[[u("a"), u("b"), u("c")], + [u("s"), u("t"), u("u"), u("v")]], + labels=[[0, 1, 2, 2], [0, 1, 2, 3]], + names=[None, u("q")])) + data = """,a,a,b,c,c +q,r,s,t,u,v +1,2,3,4,5,6 +7,8,9,10,11,12""" + + result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("data,header", [ + ("1,2,3\n4,5,6", None), + ("foo,bar,baz\n1,2,3\n4,5,6", 0), +]) +def test_header_names_backward_compat(all_parsers, data, header): + # see gh-2539 + parser = all_parsers + expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), + names=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), names=["a", "b", "c"], + header=header) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [ + dict(), dict(index_col=False) +]) +def test_read_only_header_no_rows(all_parsers, kwargs): + # See gh-7773 + parser = all_parsers + expected = DataFrame(columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO("a,b,c"), **kwargs) + tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize("kwargs,names", [ + (dict(), [0, 1, 2, 3, 4]), + (dict(prefix="X"), ["X0", "X1", "X2", "X3", "X4"]), + (dict(names=["foo", "bar", "baz", "quux", "panda"]), + ["foo", "bar", "baz", "quux", "panda"]) +]) +def test_no_header(all_parsers, kwargs, names): + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + expected = DataFrame([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], columns=names) + result = parser.read_csv(StringIO(data), header=None, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("header", [ + ["a", "b"], + "string_header" +]) +def test_non_int_header(all_parsers, header): + # see gh-16338 + msg = "header must be integer or list of integers" + data = """1,2\n3,4""" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=header) + + +def test_singleton_header(all_parsers): + # see gh-7757 + data = """a,b,c\n0,1,2\n1,2,3""" + parser = all_parsers + + expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]}) + result = parser.read_csv(StringIO(data), header=[0]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,expected", [ + ("A,A,A,B\none,one,one,two\n0,40,34,0.1", + DataFrame([[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), + ("A", "one.2"), ("B", "two")]))), + ("A,A,A,B\none,one,one.1,two\n0,40,34,0.1", + DataFrame([[0, 40, 34, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), + ("A", "one.1.1"), ("B", "two")]))), + ("A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1", + DataFrame([[0, 40, 34, 0.1, 0.1]], + columns=MultiIndex.from_tuples( + [("A", "one"), ("A", "one.1"), + ("A", "one.1.1"), ("B", "two"), + ("B", "two.1")]))) +]) +def test_mangles_multi_index(all_parsers, data, expected): + # see gh-18062 + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=[0, 1]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [None, [0]]) +@pytest.mark.parametrize("columns", [None, + (["", "Unnamed"]), + (["Unnamed", ""]), + (["Unnamed", "NotUnnamed"])]) +def test_multi_index_unnamed(all_parsers, index_col, columns): + # see gh-23687 + # + # When specifying a multi-index header, make sure that + # we don't error just because one of the rows in our header + # has ALL column names containing the string "Unnamed". The + # correct condition to check is whether the row contains + # ALL columns that did not have names (and instead were given + # placeholder ones). + parser = all_parsers + header = [0, 1] + + if index_col is None: + data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n" + else: + data = (",".join([""] + (columns or ["", ""])) + + "\n,0,1\n0,2,3\n1,4,5\n") + + if columns is None: + msg = (r"Passed header=\[0,1\] are too " + r"many rows for this multi_index of columns") + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=header, + index_col=index_col) + else: + result = parser.read_csv(StringIO(data), header=header, + index_col=index_col) + template = "Unnamed: {i}_level_0" + exp_columns = [] + + for i, col in enumerate(columns): + if not col: # Unnamed. 
+ col = template.format(i=i if index_col is None else i + 1) + + exp_columns.append(col) + + columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"])) + expected = DataFrame([[2, 3], [4, 5]], columns=columns) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py new file mode 100644 index 0000000000000..8c2de40b46114 --- /dev/null +++ b/pandas/tests/io/parser/test_index_col.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +""" +Tests that the specified index column (a.k.a "index_col") +is properly handled or inferred during parsing for all of +the parsers defined in parsers.py +""" + +import pytest + +from pandas.compat import StringIO + +from pandas import DataFrame, Index, MultiIndex +import pandas.util.testing as tm + + +@pytest.mark.parametrize("with_header", [True, False]) +def test_index_col_named(all_parsers, with_header): + parser = all_parsers + no_header = """\ +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa + header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" # noqa + + if with_header: + data = header + no_header + + result = parser.read_csv(StringIO(data), index_col="ID") + expected = parser.read_csv(StringIO(data), header=0).set_index("ID") + tm.assert_frame_equal(result, expected) + else: + data = no_header + msg = "Index ID invalid" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), index_col="ID") + + +def test_index_col_named2(all_parsers): + parser = all_parsers + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + + expected = DataFrame({"a": [1, 5, 9], "b": [2, 6, 10], + "c": [3, 7, 11], "d": [4, 8, 12]}, + index=Index(["hello", "world", "foo"], + name="message")) + names = ["a", "b", "c", "d", "message"] + + result = parser.read_csv(StringIO(data), names=names, + index_col=["message"]) + tm.assert_frame_equal(result, expected) + + +def test_index_col_is_true(all_parsers): + # see gh-9798 + data = "a,b\n1,2" + parser = all_parsers + + with pytest.raises(ValueError, match="The value of index_col " + "couldn't be 'True'"): + parser.read_csv(StringIO(data), index_col=True) + + +def test_infer_index_col(all_parsers): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col,kwargs", [ + (None, dict(columns=["x", "y", "z"])), + (False, dict(columns=["x", "y", "z"])), + (0, dict(columns=["y", "z"], index=Index([], name="x"))), + (1, dict(columns=["x", "z"], index=Index([], name="y"))), + ("x", dict(columns=["y", "z"], index=Index([], name="x"))), + ("y", dict(columns=["x", "z"], index=Index([], name="y"))), + ([0, 1], dict(columns=["z"], index=MultiIndex.from_arrays( + [[]] * 2, names=["x", "y"]))), + (["x", "y"], dict(columns=["z"], index=MultiIndex.from_arrays( + [[]] * 2, names=["x", "y"]))), + ([1, 0], dict(columns=["z"], 
index=MultiIndex.from_arrays( + [[]] * 2, names=["y", "x"]))), + (["y", "x"], dict(columns=["z"], index=MultiIndex.from_arrays( + [[]] * 2, names=["y", "x"]))), +]) +def test_index_col_empty_data(all_parsers, index_col, kwargs): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=index_col) + + expected = DataFrame([], **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index_col_false(all_parsers): + # see gh-10413 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame([], columns=["x", "y"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_names", [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], +]) +def test_multi_index_naming(all_parsers, index_names): + parser = all_parsers + + # We don't want empty index names being replaced with "Unnamed: 0" + data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) + result = parser.read_csv(StringIO(data), index_col=[0, 1]) + + expected = DataFrame({"col": [1, 2, 3, 4]}, + index=MultiIndex.from_product([["a", "b"], + ["c", "d"]])) + expected.index.names = [name if name else None for name in index_names] + tm.assert_frame_equal(result, expected) + + +def test_multi_index_naming_not_all_at_beginning(all_parsers): + parser = all_parsers + data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" + result = parser.read_csv(StringIO(data), index_col=[0, 2]) + + expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, + index=MultiIndex( + levels=[['a', 'b'], [1, 2, 3, 4]], + labels=[[0, 0, 1, 1], [0, 1, 2, 3]])) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py new file mode 100644 index 0000000000000..0efc0c2c13557 --- /dev/null +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +""" +Tests that duplicate columns are handled appropriately when parsed by the +CSV engine. In general, the expected result is that they are either thoroughly +de-duplicated (if mangling requested) or ignored otherwise. 
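+For example, with the default mangle_dupe_cols=True, a repeated header like
+"a,a,b" is read back as the columns "a", "a.1" and "b".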
+""" + +import pytest + +from pandas.compat import StringIO + +from pandas import DataFrame +import pandas.util.testing as tm + + +@pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) +def test_basic(all_parsers, kwargs): + # TODO: add test for condition "mangle_dupe_cols=False" + # once it is actually supported (gh-12935) + parser = all_parsers + + data = "a,a,b,b,b\n1,2,3,4,5" + result = parser.read_csv(StringIO(data), sep=",", **kwargs) + + expected = DataFrame([[1, 2, 3, 4, 5]], + columns=["a", "a.1", "b", "b.1", "b.2"]) + tm.assert_frame_equal(result, expected) + + +def test_basic_names(all_parsers): + # See gh-7160 + parser = all_parsers + + data = "a,b,a\n0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=["a", "b", "a.1"]) + + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_basic_names_warn(all_parsers): + # See gh-7160 + parser = all_parsers + + data = "0,1,2\n3,4,5" + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=["a", "b", "a.1"]) + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = parser.read_csv(StringIO(data), names=["a", "b", "a"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,expected", [ + ("a,a,a.1\n1,2,3", + DataFrame([[1, 2, 3]], columns=["a", "a.1", "a.1.1"])), + ("a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6", + DataFrame([[1, 2, 3, 4, 5, 6]], columns=["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"])), + ("a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7", + DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"])) +]) +def test_thorough_mangle_columns(all_parsers, data, expected): + # see gh-17060 + parser = all_parsers + + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,names,expected", [ + ("a,b,b\n1,2,3", + ["a.1", "a.1", "a.1.1"], + DataFrame([["a", "b", "b"], ["1", "2", "3"]], + columns=["a.1", "a.1.1", "a.1.1.1"])), + ("a,b,c,d,e,f\n1,2,3,4,5,6", + ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"], + DataFrame([["a", "b", "c", "d", "e", "f"], + ["1", "2", "3", "4", "5", "6"]], + columns=["a", "a.1", "a.1.1", "a.1.1.1", + "a.1.1.1.1", "a.1.1.1.1.1"])), + ("a,b,c,d,e,f,g\n1,2,3,4,5,6,7", + ["a", "a", "a.3", "a.1", "a.2", "a", "a"], + DataFrame([["a", "b", "c", "d", "e", "f", "g"], + ["1", "2", "3", "4", "5", "6", "7"]], + columns=["a", "a.1", "a.3", "a.1.1", + "a.2", "a.2.1", "a.3.1"])), +]) +def test_thorough_mangle_names(all_parsers, data, names, expected): + # see gh-17095 + parser = all_parsers + + with tm.assert_produces_warning(UserWarning, check_stacklevel=False): + result = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(result, expected) + + +def test_mangled_unnamed_placeholders(all_parsers): + # xref gh-13017 + orig_key = "0" + parser = all_parsers + + orig_value = [1, 2, 3] + df = DataFrame({orig_key: orig_value}) + + # This test recursively updates `df`. 
+    for i in range(3):
+        expected = DataFrame()
+
+        for j in range(i + 1):
+            expected["Unnamed: 0" + ".1" * j] = [0, 1, 2]
+
+        expected[orig_key] = orig_value
+        df = parser.read_csv(StringIO(df.to_csv()))
+
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
index 2dfcec161342c..8c1641b1d2abe 100644
--- a/pandas/tests/io/parser/test_parsers.py
+++ b/pandas/tests/io/parser/test_parsers.py
@@ -12,10 +12,6 @@
 import pandas.util.testing as tm
 
 from .common import ParserTests
-from .dtypes import DtypeTests
-from .header import HeaderTests
-from .index_col import IndexColTests
-from .mangle_dupes import DupeColumnTests
 from .multithread import MultithreadTests
 from .na_values import NAvaluesTests
 from .parse_dates import ParseDatesTests
@@ -25,9 +21,7 @@
 from .usecols import UsecolsTests
 
 
-class BaseParser(DtypeTests, DupeColumnTests,
-                 HeaderTests, IndexColTests,
-                 MultithreadTests, NAvaluesTests,
+class BaseParser(MultithreadTests, NAvaluesTests,
                  ParseDatesTests, ParserTests,
                  SkipRowsTests, UsecolsTests,
                  QuotingTests):
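Note: each of the new pytest-style modules above requests an `all_parsers` fixture instead of inheriting from the mixin classes removed here. That fixture lives elsewhere in this refactor (its conftest is not part of this hunk); purely as a hypothetical sketch of the pattern, assuming thin wrappers around the C and Python engines:

# Hypothetical sketch only; the real fixture is defined in the parser
# conftest and will differ in detail.
import pytest

import pandas as pd


class _ParserWrapper(object):
    engine = None

    def read_csv(self, *args, **kwargs):
        # Force every call through the engine under test.
        kwargs["engine"] = self.engine
        return pd.read_csv(*args, **kwargs)


class _CParser(_ParserWrapper):
    engine = "c"


class _PythonParser(_ParserWrapper):
    engine = "python"


@pytest.fixture(params=[_CParser(), _PythonParser()], ids=["c", "python"])
def all_parsers(request):
    # Tests that request ``all_parsers`` run once per parser engine.
    return request.param

Parametrizing the fixture is what replaces the old per-engine test classes: pytest runs each test function once for every parser the fixture yields, so the shared mixins are no longer needed.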