BUG: Respect the dtype parameter for empty CSV (#14717)

gfyoung · jorisvandenbossche · commit 75b606abad51 · 2016-11-24T22:18:20.000+01:00
diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
@@ -61,6 +61,7 @@ Bug Fixes
 - Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`)
 
 
+- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
 
 
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -83,3 +83,4 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -20,6 +20,7 @@
                                  is_float,
                                  is_scalar)
 from pandas.core.index import Index, MultiIndex, RangeIndex
+from pandas.core.series import Series
 from pandas.core.frame import DataFrame
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
@@ -2791,27 +2792,35 @@ def _clean_index_names(columns, index_col):
 def _get_empty_meta(columns, index_col, index_names, dtype=None):
     columns = list(columns)
 
-    if dtype is None:
-        dtype = {}
+    # Convert `dtype` to a defaultdict so that `dtype[col_name]`
+    # falls back to a default dtype for unlisted columns instead
+    # of raising a KeyError later on.
+    if not isinstance(dtype, dict):
+        # If dtype is None (or otherwise falsy), default to np.object.
+        default_dtype = dtype or np.object
+        dtype = defaultdict(lambda: default_dtype)
     else:
-        if not isinstance(dtype, dict):
-            dtype = defaultdict(lambda: dtype)
+        # Save a copy of the dictionary.
+        _dtype = dtype.copy()
+        dtype = defaultdict(lambda: np.object)
+
         # Convert column indexes to column names.
-        dtype = dict((columns[k] if is_integer(k) else k, v)
-                     for k, v in compat.iteritems(dtype))
+        for k, v in compat.iteritems(_dtype):
+            col = columns[k] if is_integer(k) else k
+            dtype[col] = v
 
     if index_col is None or index_col is False:
         index = Index([])
     else:
-        index = [np.empty(0, dtype=dtype.get(index_name, np.object))
+        index = [Series([], dtype=dtype[index_name])
                  for index_name in index_names]
         index = MultiIndex.from_arrays(index, names=index_names)
         index_col.sort()
         for i, n in enumerate(index_col):
             columns.pop(n - i)
 
     col_dict = dict((col_name,
-                     np.empty(0, dtype=dtype.get(col_name, np.object)))
+                     Series([], dtype=dtype[col_name]))
                     for col_name in columns)
 
     return index, columns, col_dict
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -561,3 +561,49 @@ def test_internal_null_byte(self):
 
         result = self.read_csv(StringIO(data), names=names)
         tm.assert_frame_equal(result, expected)
+
+    def test_empty_dtype(self):
+        # see gh-14712
+        data = 'a,b'
+
+        expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
+        result = self.read_csv(StringIO(data), header=0, dtype=np.float64)
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame({'a': pd.Categorical([]),
+                                 'b': pd.Categorical([])},
+                                index=[])
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype='category')
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype='datetime64[ns]')
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'),
+                                 'b': pd.Series([], dtype='timedelta64[ns]')},
+                                index=[])
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype='timedelta64[ns]')
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': np.float64})
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={0: np.float64})
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.int32)
+        expected['b'] = expected['b'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': np.int32, 1: np.float64})
+        tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,7 @@ Bug Fixes`
`61`	`61`	- Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`)
`62`	`62`
`63`	`63`
	`64`	+- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
`64`	`65`
`65`	`66`
`66`	`67`
Original file line number	Diff line number	Diff line change
`@@ -83,3 +83,4 @@ Performance Improvements`
`83`	`83`
`84`	`84`	`Bug Fixes`
`85`	`85`	`~~~~~~~~~`
	`86`	`+`