Skip to content

Fixed bug where read_csv ignores dtype arg if input is empty. #10577

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,8 @@ Bug Fixes
- Bug in ``DatetimeIndex.value_counts`` and ``PeriodIndex.value_counts`` where the name was reset on the result but retained in the result's ``Index`` (:issue:`10150`)
- Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
- Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
- Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
- Bug in `pandas.read_csv` with kwargs ``index_col=False``, ``index_col=['a', 'b']`` or ``dtype``
(:issue:`10413`, :issue:`10467`, :issue:`10577`)
- Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
- Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`)
- Bug in ``Series.plot(kind='hist')`` where the Y label was not informative (:issue:`10485`)
Expand Down
22 changes: 17 additions & 5 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1170,7 +1170,8 @@ def read(self, nrows=None):
if nrows is None:
return _get_empty_meta(self.orig_names,
self.index_col,
self.index_names)
self.index_names,
dtype=self.kwds.get('dtype'))
else:
raise

Expand Down Expand Up @@ -2219,19 +2220,30 @@ def _clean_index_names(columns, index_col):
return index_names, columns, index_col


def _get_empty_meta(columns, index_col, index_names, dtype=None):
    """
    Build the empty index, column list and column data for a header-only input.

    Parameters
    ----------
    columns : sequence
        Column names in file order. A copy is made, so the caller's object
        is not modified.
    index_col : None, False or sequence of int
        Positions within ``columns`` of the columns used as the index.
        ``None`` or ``False`` yields a plain empty ``Index``.
    index_names : sequence
        Names of the index levels; only read when ``index_col`` is a
        sequence.
    dtype : dict or single dtype, optional
        Either a mapping of column name (or integer column position) to
        dtype, or one dtype applied to every column. Columns without an
        entry default to ``object``.

    Returns
    -------
    index : Index or MultiIndex
        Empty index whose levels carry the requested dtypes.
    columns : list
        Column names with the index columns removed.
    col_dict : dict
        Maps each remaining column name to an empty ndarray of its dtype.
    """
    columns = list(columns)

    if dtype is None:
        default_dtype = object
        dtype = {}
    elif isinstance(dtype, dict):
        default_dtype = object
        # Convert column indexes to column names.
        dtype = dict((columns[k] if com.is_integer(k) else k, v)
                     for k, v in compat.iteritems(dtype))
    else:
        # A single dtype was given: it applies to every column, so there
        # are no per-column overrides to look up.
        default_dtype = dtype
        dtype = {}

    if index_col is None or index_col is False:
        index = Index([])
    else:
        # sorted() copies, so the caller's index_col is left untouched.
        index_col = sorted(index_col)
        index_arrays = [np.empty(0, dtype=dtype.get(index_name, default_dtype))
                        for index_name in index_names]
        index = MultiIndex.from_arrays(index_arrays, names=index_names)
        # Positions are ascending; each earlier pop shifts the later ones
        # left by one, hence ``n - i``.
        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = dict(
        (col_name, np.empty(0, dtype=dtype.get(col_name, default_dtype)))
        for col_name in columns)

    return index, columns, col_dict


def _floatify_na_values(na_values):
Expand Down
58 changes: 58 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3540,6 +3540,64 @@ def test_pass_dtype(self):
self.assertEqual(result['one'].dtype, 'u1')
self.assertEqual(result['two'].dtype, 'S1')

def test_empty_pass_dtype(self):
    """A header-only CSV keeps the dtype requested for a named column."""
    header_only = StringIO('one,two')
    result = self.read_csv(header_only, dtype={'one': 'u1'})

    # 'two' has no dtype entry, so the expected frame uses object for it.
    empty_columns = {
        'one': np.empty(0, dtype='u1'),
        'two': np.empty(0, dtype=np.object),
    }
    expected = DataFrame(empty_columns)
    tm.assert_frame_equal(result, expected)

def test_empty_with_index_pass_dtype(self):
    """A header-only CSV with one index column keeps index and column dtypes."""
    header_only = StringIO('one,two')
    # dtype keys mix a column name ('one') with a column position (1).
    requested = {'one': 'u1', 1: 'f'}
    result = self.read_csv(header_only, index_col=['one'], dtype=requested)

    expected_index = Index([], dtype='u1', name='one')
    expected = DataFrame({'two': np.empty(0, dtype='f')},
                         index=expected_index)
    tm.assert_frame_equal(result, expected)

def test_empty_with_multiindex_pass_dtype(self):
    """A header-only CSV with two index columns yields an empty MultiIndex."""
    header_only = StringIO('one,two,three')
    requested = {'one': 'u1', 1: 'f8'}
    result = self.read_csv(header_only, index_col=['one', 'two'],
                           dtype=requested)

    level_arrays = [np.empty(0, dtype='u1'), np.empty(0, dtype='O')]
    expected_index = MultiIndex.from_arrays(level_arrays,
                                            names=['one', 'two'])
    expected = DataFrame({'three': np.empty(0, dtype=np.object)},
                         index=expected_index)
    tm.assert_frame_equal(result, expected)

def test_empty_with_mangled_column_pass_dtype_by_names(self):
    """Dtypes keyed by mangled names ('one', 'one.1') apply to duplicate headers."""
    header_only = StringIO('one,one')
    requested = {'one': 'u1', 'one.1': 'f'}
    result = self.read_csv(header_only, dtype=requested)

    empty_columns = {
        'one': np.empty(0, dtype='u1'),
        'one.1': np.empty(0, dtype='f'),
    }
    expected = DataFrame(empty_columns)
    tm.assert_frame_equal(result, expected)

def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
    """Dtypes keyed by column position apply to mangled duplicate headers."""
    header_only = StringIO('one,one')
    requested = {0: 'u1', 1: 'f'}
    result = self.read_csv(header_only, dtype=requested)

    empty_columns = {
        'one': np.empty(0, dtype='u1'),
        'one.1': np.empty(0, dtype='f'),
    }
    expected = DataFrame(empty_columns)
    tm.assert_frame_equal(result, expected)

def test_empty_with_dup_column_pass_dtype_by_names(self):
    """With mangling disabled, one name-keyed dtype covers both duplicate columns."""
    header_only = StringIO('one,one')
    result = self.read_csv(header_only, mangle_dupe_cols=False,
                           dtype={'one': 'u1'})

    # The same empty Series object is used for both expected columns.
    empty_one = Series([], name='one', dtype='u1')
    expected = pd.concat([empty_one, empty_one], axis=1)
    tm.assert_frame_equal(result, expected)

def test_empty_with_dup_column_pass_dtype_by_indexes(self):
    """Position-keyed dtypes on unmangled duplicate columns (currently skipped)."""
    # FIXME in GH9424: everything after the skip is intentionally
    # unreachable and kept for when the known failure is fixed.
    raise nose.SkipTest("GH 9424; known failure read_csv with duplicate columns")

    header_only = StringIO('one,one')
    requested = {0: 'u1', 1: 'f'}
    result = self.read_csv(header_only, mangle_dupe_cols=False,
                           dtype=requested)

    first = Series([], name='one', dtype='u1')
    second = Series([], name='one', dtype='f')
    expected = pd.concat([first, second], axis=1)
    tm.assert_frame_equal(result, expected)

def test_usecols_dtypes(self):
data = """\
1,2,3
Expand Down