diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 77adbfc41b97a..9672066197969 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -357,7 +357,8 @@ Bug Fixes
 - Bug in ``DatetimeIndex`` and ``PeriodIndex.value_counts`` resets name from its result, but retains in result's ``Index``. (:issue:`10150`)
 - Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
 - Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
-- Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
+- Bug in `pandas.read_csv` with kwargs ``index_col=False``, ``index_col=['a', 'b']`` or ``dtype``
+  (:issue:`10413`, :issue:`10467`, :issue:`10577`)
 - Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
 - Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`)
 - Bug in ``Series.plot(kind='hist')`` Y Label not informative (:issue:`10485`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 1ebe1ad137698..62d51fc510f97 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1170,7 +1170,8 @@ def read(self, nrows=None):
 
             if nrows is None:
                 return _get_empty_meta(self.orig_names, self.index_col,
-                                       self.index_names)
+                                       self.index_names,
+                                       dtype=self.kwds.get('dtype'))
             else:
                 raise
 
@@ -2219,19 +2220,30 @@ def _clean_index_names(columns, index_col):
     return index_names, columns, index_col
 
 
-def _get_empty_meta(columns, index_col, index_names):
+def _get_empty_meta(columns, index_col, index_names, dtype=None):
     columns = list(columns)
 
+    if dtype is None:
+        dtype = {}
+    else:
+        # Convert column indexes to column names.
+        dtype = dict((columns[k] if com.is_integer(k) else k, v)
+                     for k, v in compat.iteritems(dtype))
+
     if index_col is None or index_col is False:
         index = Index([])
     else:
-        index_col = list(index_col)
-        index = MultiIndex.from_arrays([[]] * len(index_col), names=index_names)
+        index = [np.empty(0, dtype=dtype.get(index_name, np.object))
+                 for index_name in index_names]
+        index = MultiIndex.from_arrays(index, names=index_names)
         index_col.sort()
         for i, n in enumerate(index_col):
             columns.pop(n-i)
 
-    return index, columns, {}
+    col_dict = dict((col_name, np.empty(0, dtype=dtype.get(col_name, np.object)))
+                    for col_name in columns)
+
+    return index, columns, col_dict
 
 
 def _floatify_na_values(na_values):
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index e6aee76df4e74..0f0486e8ea596 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3540,6 +3540,64 @@ def test_pass_dtype(self):
         self.assertEqual(result['one'].dtype, 'u1')
         self.assertEqual(result['two'].dtype, 'S1')
 
+    def test_empty_pass_dtype(self):
+        data = 'one,two'
+        result = self.read_csv(StringIO(data), dtype={'one': 'u1'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'),
+                              'two': np.empty(0, dtype=np.object)})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_index_pass_dtype(self):
+        data = 'one,two'
+        result = self.read_csv(StringIO(data), index_col=['one'],
+                               dtype={'one': 'u1', 1: 'f'})
+
+        expected = DataFrame({'two': np.empty(0, dtype='f')},
+                             index=Index([], dtype='u1', name='one'))
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_multiindex_pass_dtype(self):
+        data = 'one,two,three'
+        result = self.read_csv(StringIO(data), index_col=['one', 'two'],
+                               dtype={'one': 'u1', 1: 'f8'})
+
+        expected = DataFrame({'three': np.empty(0, dtype=np.object)},
+                             index=MultiIndex.from_arrays(
+                                 [np.empty(0, dtype='u1'), np.empty(0, dtype='O')],
+                                 names=['one', 'two']))
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_mangled_column_pass_dtype_by_names(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={'one': 'u1', 'one.1': 'f'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'})
+
+        expected = DataFrame({'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')})
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_dup_column_pass_dtype_by_names(self):
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'})
+        expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1)
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_with_dup_column_pass_dtype_by_indexes(self):
+        ### FIXME in GH9424
+        raise nose.SkipTest("GH 9424; known failure read_csv with duplicate columns")
+
+        data = 'one,one'
+        result = self.read_csv(StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'})
+        expected = pd.concat([Series([], name='one', dtype='u1'),
+                              Series([], name='one', dtype='f')], axis=1)
+        tm.assert_frame_equal(result, expected)
+
     def test_usecols_dtypes(self):
         data = """\
 1,2,3
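For context, a minimal usage sketch of the behaviour this patch enables; it is not part of the diff above. It assumes pandas 0.17.0 or later (where GH 10577 is fixed) and Python 3's io.StringIO: reading a header-only CSV with a dtype mapping now yields an empty frame whose columns carry the requested dtypes instead of all defaulting to object.

    # Illustrative sketch only; not part of the patch above.
    # Assumes pandas >= 0.17.0 and Python 3.
    from io import StringIO

    import numpy as np
    import pandas as pd

    data = 'one,two'   # header-only CSV: column names, no data rows

    # The dtype mapping is applied even though no rows are parsed.
    result = pd.read_csv(StringIO(data), dtype={'one': 'u1'})

    assert result.empty
    assert result['one'].dtype == np.uint8   # requested dtype is honoured
    assert result['two'].dtype == object     # unspecified columns stay object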