Skip to content

Commit 75bb530

Browse files
chris-b1 authored and jorisvandenbossche committed
API: add dtype= option to python parser (#14295)
1 parent 58731c4 commit 75bb530

File tree

8 files changed

+435
-305
lines changed

8 files changed

+435
-305
lines changed

doc/source/io.rst

+6-4
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ dtype : Type name or dict of column -> type, default ``None``
157157
Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}``
158158
(unsupported with ``engine='python'``). Use `str` or `object` to preserve and
159159
not interpret dtype.
160+
161+
.. versionadded:: 0.20.0 support for the Python parser.
162+
160163
engine : {``'c'``, ``'python'``}
161164
Parser engine to use. The C engine is faster while the python engine is
162165
currently more feature-complete.
@@ -473,10 +476,9 @@ However, if you wanted for all the data to be coerced, no matter the type, then
473476
using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be
474477
worth trying.
475478

476-
.. note::
477-
The ``dtype`` option is currently only supported by the C engine.
478-
Specifying ``dtype`` with ``engine`` other than 'c' raises a
479-
``ValueError``.
479+
.. versionadded:: 0.20.0 support for the Python parser.
480+
481+
The ``dtype`` option is supported by the 'python' engine.
480482

481483
.. note::
482484
In some cases, reading in abnormal data with columns containing mixed dtypes

doc/source/whatsnew/v0.20.0.txt

+9
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,17 @@ New features
2222
~~~~~~~~~~~~
2323

2424

25+
``read_csv`` supports ``dtype`` keyword for python engine
26+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2527

28+
The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
29+
is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information.
2630

31+
.. ipython:: python
32+
33+
data = "a,b\n1,2\n3,4"
34+
pd.read_csv(StringIO(data), engine='python').dtypes
35+
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
2736

2837
.. _whatsnew_0200.enhancements.other:
2938

pandas/io/parsers.py

+108-24
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,15 @@
1717
zip, string_types, map, u)
1818
from pandas.types.common import (is_integer, _ensure_object,
1919
is_list_like, is_integer_dtype,
20-
is_float,
21-
is_scalar)
20+
is_float, is_dtype_equal,
21+
is_object_dtype,
22+
is_scalar, is_categorical_dtype)
23+
from pandas.types.missing import isnull
24+
from pandas.types.cast import _astype_nansafe
2225
from pandas.core.index import Index, MultiIndex, RangeIndex
2326
from pandas.core.series import Series
2427
from pandas.core.frame import DataFrame
28+
from pandas.core.categorical import Categorical
2529
from pandas.core.common import AbstractMethodError
2630
from pandas.core.config import get_option
2731
from pandas.io.date_converters import generic_parser
@@ -111,8 +115,9 @@
111115
are duplicate names in the columns.
112116
dtype : Type name or dict of column -> type, default None
113117
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
114-
(Unsupported with engine='python'). Use `str` or `object` to preserve and
115-
not interpret dtype.
118+
Use `str` or `object` to preserve and not interpret dtype.
119+
If converters are specified, they will be applied INSTEAD
120+
of dtype conversion.
116121
%s
117122
converters : dict, default None
118123
Dict of functions for converting values in certain columns. Keys can either
@@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds):
421426
'true_values': None,
422427
'false_values': None,
423428
'converters': None,
429+
'dtype': None,
424430
'skipfooter': 0,
425431

426432
'keep_default_na': True,
@@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds):
461467
'buffer_lines': None,
462468
'error_bad_lines': True,
463469
'warn_bad_lines': True,
464-
'dtype': None,
465470
'float_precision': None
466471
}
467472

@@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds):
476481
'buffer_lines',
477482
'error_bad_lines',
478483
'warn_bad_lines',
479-
'dtype',
480484
'float_precision',
481485
])
482486
_deprecated_args = set([
@@ -834,9 +838,6 @@ def _clean_options(self, options, engine):
834838
" ignored as it is not supported by the 'python'"
835839
" engine.").format(reason=fallback_reason,
836840
option=arg)
837-
if arg == 'dtype':
838-
msg += " (Note the 'converters' option provides"\
839-
" similar functionality.)"
840841
raise ValueError(msg)
841842
del result[arg]
842843

@@ -1285,36 +1286,59 @@ def _agg_index(self, index, try_parse_dates=True):
12851286
col_na_values, col_na_fvalues = _get_na_values(
12861287
col_name, self.na_values, self.na_fvalues)
12871288

1288-
arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
1289+
arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
12891290
arrays.append(arr)
12901291

12911292
index = MultiIndex.from_arrays(arrays, names=self.index_names)
12921293

12931294
return index
12941295

12951296
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
1296-
converters=None):
1297+
converters=None, dtypes=None):
12971298
result = {}
12981299
for c, values in compat.iteritems(dct):
12991300
conv_f = None if converters is None else converters.get(c, None)
1301+
if isinstance(dtypes, dict):
1302+
cast_type = dtypes.get(c, None)
1303+
else:
1304+
# single dtype or None
1305+
cast_type = dtypes
13001306

13011307
if self.na_filter:
13021308
col_na_values, col_na_fvalues = _get_na_values(
13031309
c, na_values, na_fvalues)
13041310
else:
13051311
col_na_values, col_na_fvalues = set(), set()
13061312

1307-
coerce_type = True
13081313
if conv_f is not None:
1314+
# conv_f applied to data before inference
1315+
if cast_type is not None:
1316+
warnings.warn(("Both a converter and dtype were specified "
1317+
"for column {0} - only the converter will "
1318+
"be used").format(c), ParserWarning,
1319+
stacklevel=7)
1320+
13091321
try:
13101322
values = lib.map_infer(values, conv_f)
13111323
except ValueError:
13121324
mask = lib.ismember(values, na_values).view(np.uint8)
13131325
values = lib.map_infer_mask(values, conv_f, mask)
1314-
coerce_type = False
13151326

1316-
cvals, na_count = self._convert_types(
1317-
values, set(col_na_values) | col_na_fvalues, coerce_type)
1327+
cvals, na_count = self._infer_types(
1328+
values, set(col_na_values) | col_na_fvalues,
1329+
try_num_bool=False)
1330+
else:
1331+
# skip inference if specified dtype is object
1332+
try_num_bool = not (cast_type and is_object_dtype(cast_type))
1333+
1334+
# general type inference and conversion
1335+
cvals, na_count = self._infer_types(
1336+
values, set(col_na_values) | col_na_fvalues,
1337+
try_num_bool)
1338+
1339+
# type specified in dtype param
1340+
if cast_type and not is_dtype_equal(cvals, cast_type):
1341+
cvals = self._cast_types(cvals, cast_type, c)
13181342

13191343
if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
13201344
cvals = lib.downcast_int64(
@@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
13261350
print('Filled %d NA values in column %s' % (na_count, str(c)))
13271351
return result
13281352

1329-
def _convert_types(self, values, na_values, try_num_bool=True):
1353+
def _infer_types(self, values, na_values, try_num_bool=True):
1354+
"""
1355+
Infer types of values, possibly casting
1356+
1357+
Parameters
1358+
----------
1359+
values : ndarray
1360+
na_values : set
1361+
try_num_bool : bool, default True
1362+
try to cast values to numeric (first preference) or boolean
1363+
1364+
Returns
1365+
--------
1366+
converted : ndarray
1367+
na_count : int
1368+
"""
1369+
13301370
na_count = 0
13311371
if issubclass(values.dtype.type, (np.number, np.bool_)):
13321372
mask = lib.ismember(values, na_values)
@@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True):
13401380
if try_num_bool:
13411381
try:
13421382
result = lib.maybe_convert_numeric(values, na_values, False)
1383+
na_count = isnull(result).sum()
13431384
except Exception:
13441385
result = values
13451386
if values.dtype == np.object_:
@@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True):
13561397

13571398
return result, na_count
13581399

1400+
def _cast_types(self, values, cast_type, column):
1401+
"""
1402+
Cast values to specified type
1403+
1404+
Parameters
1405+
----------
1406+
values : ndarray
1407+
cast_type : string or np.dtype
1408+
dtype to cast values to
1409+
column : string
1410+
column name - used only for error reporting
1411+
1412+
Returns
1413+
-------
1414+
converted : ndarray
1415+
"""
1416+
1417+
if is_categorical_dtype(cast_type):
1418+
# XXX this is for consistency with
1419+
# c-parser which parses all categories
1420+
# as strings
1421+
if not is_object_dtype(values):
1422+
values = _astype_nansafe(values, str)
1423+
values = Categorical(values)
1424+
else:
1425+
try:
1426+
values = _astype_nansafe(values, cast_type, copy=True)
1427+
except ValueError:
1428+
raise ValueError("Unable to convert column %s to "
1429+
"type %s" % (column, cast_type))
1430+
return values
1431+
13591432
def _do_date_conversions(self, names, data):
13601433
# returns data, columns
13611434
if self.parse_dates is not None:
@@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds):
17841857

17851858
self.verbose = kwds['verbose']
17861859
self.converters = kwds['converters']
1860+
self.dtype = kwds['dtype']
17871861

17881862
self.compact_ints = kwds['compact_ints']
17891863
self.use_unsigned = kwds['use_unsigned']
@@ -1982,7 +2056,7 @@ def read(self, rows=None):
19822056
# DataFrame with the right metadata, even though it's length 0
19832057
names = self._maybe_dedup_names(self.orig_names)
19842058
index, columns, col_dict = _get_empty_meta(
1985-
names, self.index_col, self.index_names)
2059+
names, self.index_col, self.index_names, self.dtype)
19862060
columns = self._maybe_make_multi_index_columns(
19872061
columns, self.col_names)
19882062
return index, columns, col_dict
@@ -2033,15 +2107,25 @@ def get_chunk(self, size=None):
20332107

20342108
def _convert_data(self, data):
20352109
# apply converters
2036-
clean_conv = {}
2037-
2038-
for col, f in compat.iteritems(self.converters):
2039-
if isinstance(col, int) and col not in self.orig_names:
2040-
col = self.orig_names[col]
2041-
clean_conv[col] = f
2110+
def _clean_mapping(mapping):
2111+
"converts col numbers to names"
2112+
clean = {}
2113+
for col, v in compat.iteritems(mapping):
2114+
if isinstance(col, int) and col not in self.orig_names:
2115+
col = self.orig_names[col]
2116+
clean[col] = v
2117+
return clean
2118+
2119+
clean_conv = _clean_mapping(self.converters)
2120+
if not isinstance(self.dtype, dict):
2121+
# handles single dtype applied to all columns
2122+
clean_dtypes = self.dtype
2123+
else:
2124+
clean_dtypes = _clean_mapping(self.dtype)
20422125

20432126
return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
2044-
self.verbose, clean_conv)
2127+
self.verbose, clean_conv,
2128+
clean_dtypes)
20452129

20462130
def _to_recarray(self, data, columns):
20472131
dtypes = []

0 commit comments

Comments
 (0)