diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 5a255d1e62043..49c8330490ed1 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -61,6 +61,7 @@ Bug Fixes - Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`) +- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 581106924c77e..ff2874041f6f9 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -81,3 +81,4 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3fe5e5e826ebd..929b360854d5b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,6 +20,7 @@ is_float, is_scalar) from pandas.core.index import Index, MultiIndex, RangeIndex +from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.common import AbstractMethodError from pandas.core.config import get_option @@ -2791,19 +2792,27 @@ def _clean_index_names(columns, index_col): def _get_empty_meta(columns, index_col, index_names, dtype=None): columns = list(columns) - if dtype is None: - dtype = {} + # Convert `dtype` to a defaultdict of some kind. + # This will enable us to write `dtype[col_name]` + # without worrying about KeyError issues later on. + if not isinstance(dtype, dict): + # if dtype == None, default will be np.object. + default_dtype = dtype or np.object + dtype = defaultdict(lambda: default_dtype) else: - if not isinstance(dtype, dict): - dtype = defaultdict(lambda: dtype) + # Save a copy of the dictionary. + _dtype = dtype.copy() + dtype = defaultdict(lambda: np.object) + # Convert column indexes to column names. - dtype = dict((columns[k] if is_integer(k) else k, v) - for k, v in compat.iteritems(dtype)) + for k, v in compat.iteritems(_dtype): + col = columns[k] if is_integer(k) else k + dtype[col] = v if index_col is None or index_col is False: index = Index([]) else: - index = [np.empty(0, dtype=dtype.get(index_name, np.object)) + index = [Series([], dtype=dtype[index_name]) for index_name in index_names] index = MultiIndex.from_arrays(index, names=index_names) index_col.sort() @@ -2811,7 +2820,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): columns.pop(n - i) col_dict = dict((col_name, - np.empty(0, dtype=dtype.get(col_name, np.object))) + Series([], dtype=dtype[col_name])) for col_name in columns) return index, columns, col_dict diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 75b99654dbf89..9cbe88d4032a3 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -561,3 +561,49 @@ def test_internal_null_byte(self): result = self.read_csv(StringIO(data), names=names) tm.assert_frame_equal(result, expected) + + def test_empty_dtype(self): + # see gh-14712 + data = 'a,b' + + expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64) + result = self.read_csv(StringIO(data), header=0, dtype=np.float64) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Categorical([]), + 'b': pd.Categorical([])}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='category') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]') + result = self.read_csv(StringIO(data), header=0, + dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'), + 'b': pd.Series([], dtype='timedelta64[ns]')}, + index=[]) + result = self.read_csv(StringIO(data), header=0, + dtype='timedelta64[ns]') + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={0: np.float64}) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame(columns=['a', 'b']) + expected['a'] = expected['a'].astype(np.int32) + expected['b'] = expected['b'].astype(np.float64) + result = self.read_csv(StringIO(data), header=0, + dtype={'a': np.int32, 1: np.float64}) + tm.assert_frame_equal(result, expected)