Skip to content

Commit 061c506

Browse files
committed
Merge pull request #10577 from santegoeds/bugfix/csv_reader-empty-data-with-dtype-args
Fixed bug where read_csv ignores dtype arg if input is empty.
2 parents 3089006 + 904aaea commit 061c506

File tree

3 files changed

+77
-6
lines changed

3 files changed

+77
-6
lines changed

doc/source/whatsnew/v0.17.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ Bug Fixes
368368
- Bug in ``DatetimeIndex.value_counts`` and ``PeriodIndex.value_counts`` where the name is reset on the result but retained in the result's ``Index``. (:issue:`10150`)
369369
- Bug in `pandas.concat` with ``axis=0`` when column is of dtype ``category`` (:issue:`10177`)
370370
- Bug in ``read_msgpack`` where input type is not always checked (:issue:`10369`)
371-
- Bug in `pandas.read_csv` with ``index_col=False`` or with ``index_col=['a', 'b']`` (:issue:`10413`, :issue:`10467`)
371+
- Bug in `pandas.read_csv` with kwargs ``index_col=False``, ``index_col=['a', 'b']`` or ``dtype``
372+
(:issue:`10413`, :issue:`10467`, :issue:`10577`)
372373
- Bug in `Series.from_csv` with ``header`` kwarg not setting the ``Series.name`` or the ``Series.index.name`` (:issue:`10483`)
373374
- Bug in `groupby.var` which caused variance to be inaccurate for small float values (:issue:`10448`)
374375
- Bug in ``Series.plot(kind='hist')`` where the Y label is not informative (:issue:`10485`)

pandas/io/parsers.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,8 @@ def read(self, nrows=None):
11701170
if nrows is None:
11711171
return _get_empty_meta(self.orig_names,
11721172
self.index_col,
1173-
self.index_names)
1173+
self.index_names,
1174+
dtype=self.kwds.get('dtype'))
11741175
else:
11751176
raise
11761177

@@ -2219,19 +2220,30 @@ def _clean_index_names(columns, index_col):
22192220
return index_names, columns, index_col
22202221

22212222

2222-
def _get_empty_meta(columns, index_col, index_names, dtype=None):
    """Build the index, column labels and empty column data for an empty parse.

    Parameters
    ----------
    columns : iterable
        All column labels, including the ones that become index levels.
    index_col : None, False, or sequence of int
        Positions in ``columns`` that are used for the index.  ``None`` or
        ``False`` means that no column is used as the index.
    index_names : sequence
        Names of the index levels (presumably one per entry of
        ``index_col`` -- confirm against the caller).
    dtype : mapping or dtype-like, optional
        Either a mapping from column name or integer column position to a
        dtype, or a single dtype that is applied to every column.  Columns
        without an entry default to ``object``.

    Returns
    -------
    tuple
        ``(index, columns, col_dict)``: the empty index, the list of
        remaining (non-index) column labels, and a dict mapping each
        remaining label to an empty array of the requested dtype.
    """
    import numbers

    columns = list(columns)

    # Builtin ``object`` replaces the ``np.object`` alias, which newer NumPy
    # releases no longer provide.
    default_dtype = object
    dtype_by_name = {}
    if dtype is None:
        pass
    elif hasattr(dtype, 'items'):
        # Convert column indexes to column names.  This has to happen before
        # index columns are popped so positions still refer to ``columns``.
        dtype_by_name = dict(
            (columns[key] if isinstance(key, numbers.Integral) else key,
             value)
            for key, value in dtype.items())
    else:
        # A single dtype applies to every column.
        default_dtype = dtype

    def _empty_values(name):
        return np.empty(0, dtype=dtype_by_name.get(name, default_dtype))

    if index_col is None or index_col is False:
        index = Index([])
    else:
        index = MultiIndex.from_arrays(
            [_empty_values(name) for name in index_names],
            names=index_names)
        # Iterate over a sorted copy so the caller's ``index_col`` is not
        # reordered in place (and so tuples are accepted as well).
        for offset, position in enumerate(sorted(index_col)):
            # Every earlier pop shifts the remaining positions left by one.
            columns.pop(position - offset)

    col_dict = dict((name, _empty_values(name)) for name in columns)

    return index, columns, col_dict
22352247

22362248

22372249
def _floatify_na_values(na_values):

pandas/io/tests/test_parsers.py

+58
Original file line numberDiff line numberDiff line change
@@ -3540,6 +3540,64 @@ def test_pass_dtype(self):
35403540
self.assertEqual(result['one'].dtype, 'u1')
35413541
self.assertEqual(result['two'].dtype, 'S1')
35423542

3543+
def test_empty_pass_dtype(self):
    # Header-only input: 'one' gets the requested dtype, while the
    # unspecified column 'two' is expected to come back as object.
    data = 'one,two'
    result = self.read_csv(StringIO(data), dtype={'one': 'u1'})

    # Builtin ``object`` is used instead of the ``np.object`` alias, which
    # newer NumPy releases no longer provide.
    expected = DataFrame({'one': np.empty(0, dtype='u1'),
                          'two': np.empty(0, dtype=object)})
    tm.assert_frame_equal(result, expected)
3550+
3551+
def test_empty_with_index_pass_dtype(self):
    # Header-only input with one index column; dtypes are given both by
    # column name ('one') and by column position (1).
    header_only = StringIO('one,two')
    requested = {'one': 'u1', 1: 'f'}
    actual = self.read_csv(header_only, index_col=['one'], dtype=requested)

    wanted_index = Index([], dtype='u1', name='one')
    wanted = DataFrame({'two': np.empty(0, dtype='f')}, index=wanted_index)
    tm.assert_frame_equal(actual, wanted)
3559+
3560+
def test_empty_with_multiindex_pass_dtype(self):
    # Header-only input with two index columns; dtypes are given both by
    # column name ('one') and by column position (1).
    data = 'one,two,three'
    result = self.read_csv(StringIO(data), index_col=['one', 'two'],
                           dtype={'one': 'u1', 1: 'f8'})

    expected_index = MultiIndex.from_arrays(
        [np.empty(0, dtype='u1'), np.empty(0, dtype='O')],
        names=['one', 'two'])
    # Builtin ``object`` is used instead of the ``np.object`` alias, which
    # newer NumPy releases no longer provide.
    expected = DataFrame({'three': np.empty(0, dtype=object)},
                         index=expected_index)
    tm.assert_frame_equal(result, expected)
3570+
3571+
def test_empty_with_mangled_column_pass_dtype_by_names(self):
    # Duplicate header names; dtypes are requested under the names
    # 'one' and 'one.1'.
    header_only = StringIO('one,one')
    requested = {'one': 'u1', 'one.1': 'f'}
    actual = self.read_csv(header_only, dtype=requested)

    wanted = DataFrame({
        'one': np.empty(0, dtype='u1'),
        'one.1': np.empty(0, dtype='f'),
    })
    tm.assert_frame_equal(actual, wanted)
3577+
3578+
def test_empty_with_mangled_column_pass_dtype_by_indexes(self):
    # Duplicate header names; dtypes are requested by column position.
    header_only = StringIO('one,one')
    requested = {0: 'u1', 1: 'f'}
    actual = self.read_csv(header_only, dtype=requested)

    wanted = DataFrame({
        'one': np.empty(0, dtype='u1'),
        'one.1': np.empty(0, dtype='f'),
    })
    tm.assert_frame_equal(actual, wanted)
3584+
3585+
def test_empty_with_dup_column_pass_dtype_by_names(self):
    # Duplicate header names with mangling disabled; a single dtype is
    # requested under the shared name 'one'.
    header_only = StringIO('one,one')
    actual = self.read_csv(header_only, mangle_dupe_cols=False,
                           dtype={'one': 'u1'})

    empty_column = Series([], name='one', dtype='u1')
    wanted = pd.concat([empty_column] * 2, axis=1)
    tm.assert_frame_equal(actual, wanted)
3590+
3591+
def test_empty_with_dup_column_pass_dtype_by_indexes(self):
    ### FIXME in GH9424
    raise nose.SkipTest("GH 9424; known failure read_csv with duplicate columns")

    # Everything below is unreachable until the skip above is removed.
    header_only = StringIO('one,one')
    requested = {0: 'u1', 1: 'f'}
    actual = self.read_csv(header_only, mangle_dupe_cols=False,
                           dtype=requested)

    first_column = Series([], name='one', dtype='u1')
    second_column = Series([], name='one', dtype='f')
    wanted = pd.concat([first_column, second_column], axis=1)
    tm.assert_frame_equal(actual, wanted)
3600+
35433601
def test_usecols_dtypes(self):
35443602
data = """\
35453603
1,2,3

0 commit comments

Comments
 (0)