BUG: Respect the dtype parameter for empty CSV

gfyoung · gfyoung · commit 30ac6107b5ae · 2016-11-22T16:00:03.000-05:00
Closes gh-14712.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -81,3 +81,5 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
+
+- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2791,27 +2791,35 @@ def _clean_index_names(columns, index_col):
 def _get_empty_meta(columns, index_col, index_names, dtype=None):
     columns = list(columns)
 
-    if dtype is None:
-        dtype = {}
+    # Convert `dtype` to a defaultdict of some kind.
+    # This will enable us to write `dtype[col_name]`
+    # without worrying about KeyError issues later on.
+    if not isinstance(dtype, dict):
+        # if dtype == None, default will be np.object.
+        default_dtype = dtype or np.object
+        dtype = defaultdict(lambda: default_dtype)
     else:
-        if not isinstance(dtype, dict):
-            dtype = defaultdict(lambda: dtype)
+        # Save a copy of the dictionary.
+        _dtype = dtype.copy()
+        dtype = defaultdict(lambda: np.object)
+
         # Convert column indexes to column names.
-        dtype = dict((columns[k] if is_integer(k) else k, v)
-                     for k, v in compat.iteritems(dtype))
+        for k, v in compat.iteritems(_dtype):
+            col = columns[k] if is_integer(k) else k
+            dtype[col] = v
 
     if index_col is None or index_col is False:
         index = Index([])
     else:
-        index = [np.empty(0, dtype=dtype.get(index_name, np.object))
+        index = [np.empty(0, dtype=dtype[index_name])
                  for index_name in index_names]
         index = MultiIndex.from_arrays(index, names=index_names)
         index_col.sort()
         for i, n in enumerate(index_col):
             columns.pop(n - i)
 
     col_dict = dict((col_name,
-                     np.empty(0, dtype=dtype.get(col_name, np.object)))
+                     np.empty(0, dtype=dtype[col_name]))
                     for col_name in columns)
 
     return index, columns, col_dict
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -561,3 +561,30 @@ def test_internal_null_byte(self):
 
         result = self.read_csv(StringIO(data), names=names)
         tm.assert_frame_equal(result, expected)
+
+    def test_empty_dtype(self):
+        # see gh-14712
+        data = 'a,b'
+
+        expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
+        result = self.read_csv(StringIO(data), header=0, dtype=np.float64)
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': np.float64})
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={0: np.float64})
+        tm.assert_frame_equal(result, expected)
+
+        expected = pd.DataFrame(columns=['a', 'b'])
+        expected['a'] = expected['a'].astype(np.int32)
+        expected['b'] = expected['b'].astype(np.float64)
+        result = self.read_csv(StringIO(data), header=0,
+                               dtype={'a': np.int32, 1: np.float64})
+        tm.assert_frame_equal(result, expected)