Skip to content

Commit 75b606a

Browse files
gfyoung authored and jorisvandenbossche committed
BUG: Respect the dtype parameter for empty CSV (#14717)
1 parent 22d982a commit 75b606a

File tree

4 files changed

+65
-8
lines changed

4 files changed

+65
-8
lines changed

doc/source/whatsnew/v0.19.2.txt

+1
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@ Bug Fixes
6161
- Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`)
6262

6363

64+
- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
6465

6566

6667

doc/source/whatsnew/v0.20.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -83,3 +83,4 @@ Performance Improvements
8383

8484
Bug Fixes
8585
~~~~~~~~~
86+

pandas/io/parsers.py

+17 -8
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
is_float,
2121
is_scalar)
2222
from pandas.core.index import Index, MultiIndex, RangeIndex
23+
from pandas.core.series import Series
2324
from pandas.core.frame import DataFrame
2425
from pandas.core.common import AbstractMethodError
2526
from pandas.core.config import get_option
@@ -2791,27 +2792,35 @@ def _clean_index_names(columns, index_col):
27912792
def _get_empty_meta(columns, index_col, index_names, dtype=None):
27922793
columns = list(columns)
27932794

2794-
if dtype is None:
2795-
dtype = {}
2795+
# Convert `dtype` to a defaultdict of some kind.
2796+
# This will enable us to write `dtype[col_name]`
2797+
# without worrying about KeyError issues later on.
2798+
if not isinstance(dtype, dict):
2799+
# if dtype == None, default will be np.object.
2800+
default_dtype = dtype or np.object
2801+
dtype = defaultdict(lambda: default_dtype)
27962802
else:
2797-
if not isinstance(dtype, dict):
2798-
dtype = defaultdict(lambda: dtype)
2803+
# Save a copy of the dictionary.
2804+
_dtype = dtype.copy()
2805+
dtype = defaultdict(lambda: np.object)
2806+
27992807
# Convert column indexes to column names.
2800-
dtype = dict((columns[k] if is_integer(k) else k, v)
2801-
for k, v in compat.iteritems(dtype))
2808+
for k, v in compat.iteritems(_dtype):
2809+
col = columns[k] if is_integer(k) else k
2810+
dtype[col] = v
28022811

28032812
if index_col is None or index_col is False:
28042813
index = Index([])
28052814
else:
2806-
index = [np.empty(0, dtype=dtype.get(index_name, np.object))
2815+
index = [Series([], dtype=dtype[index_name])
28072816
for index_name in index_names]
28082817
index = MultiIndex.from_arrays(index, names=index_names)
28092818
index_col.sort()
28102819
for i, n in enumerate(index_col):
28112820
columns.pop(n - i)
28122821

28132822
col_dict = dict((col_name,
2814-
np.empty(0, dtype=dtype.get(col_name, np.object)))
2823+
Series([], dtype=dtype[col_name]))
28152824
for col_name in columns)
28162825

28172826
return index, columns, col_dict

pandas/io/tests/parser/c_parser_only.py

+46
Original file line number | Diff line number | Diff line change
@@ -561,3 +561,49 @@ def test_internal_null_byte(self):
561561

562562
result = self.read_csv(StringIO(data), names=names)
563563
tm.assert_frame_equal(result, expected)
564+
565+
def test_empty_dtype(self):
566+
# see gh-14712
567+
data = 'a,b'
568+
569+
expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
570+
result = self.read_csv(StringIO(data), header=0, dtype=np.float64)
571+
tm.assert_frame_equal(result, expected)
572+
573+
expected = pd.DataFrame({'a': pd.Categorical([]),
574+
'b': pd.Categorical([])},
575+
index=[])
576+
result = self.read_csv(StringIO(data), header=0,
577+
dtype='category')
578+
tm.assert_frame_equal(result, expected)
579+
580+
expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
581+
result = self.read_csv(StringIO(data), header=0,
582+
dtype='datetime64[ns]')
583+
tm.assert_frame_equal(result, expected)
584+
585+
expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'),
586+
'b': pd.Series([], dtype='timedelta64[ns]')},
587+
index=[])
588+
result = self.read_csv(StringIO(data), header=0,
589+
dtype='timedelta64[ns]')
590+
tm.assert_frame_equal(result, expected)
591+
592+
expected = pd.DataFrame(columns=['a', 'b'])
593+
expected['a'] = expected['a'].astype(np.float64)
594+
result = self.read_csv(StringIO(data), header=0,
595+
dtype={'a': np.float64})
596+
tm.assert_frame_equal(result, expected)
597+
598+
expected = pd.DataFrame(columns=['a', 'b'])
599+
expected['a'] = expected['a'].astype(np.float64)
600+
result = self.read_csv(StringIO(data), header=0,
601+
dtype={0: np.float64})
602+
tm.assert_frame_equal(result, expected)
603+
604+
expected = pd.DataFrame(columns=['a', 'b'])
605+
expected['a'] = expected['a'].astype(np.int32)
606+
expected['b'] = expected['b'].astype(np.float64)
607+
result = self.read_csv(StringIO(data), header=0,
608+
dtype={'a': np.int32, 1: np.float64})
609+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)