@@ -17,11 +17,15 @@
                            zip, string_types, map, u)
 from pandas.types.common import (is_integer, _ensure_object,
                                  is_list_like, is_integer_dtype,
-                                 is_float,
-                                 is_scalar)
+                                 is_float, is_dtype_equal,
+                                 is_object_dtype,
+                                 is_scalar, is_categorical_dtype)
+from pandas.types.missing import isnull
+from pandas.types.cast import _astype_nansafe
 from pandas.core.index import Index, MultiIndex, RangeIndex
 from pandas.core.series import Series
 from pandas.core.frame import DataFrame
+from pandas.core.categorical import Categorical
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
@@ -111,8 +115,9 @@
     are duplicate names in the columns.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
-    (Unsupported with engine='python'). Use `str` or `object` to preserve and
-    not interpret dtype.
+    Use `str` or `object` to preserve and not interpret dtype.
+    If converters are specified, they will be applied INSTEAD
+    of dtype conversion.
 %s
 converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can either
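A quick illustration of the documented behaviour (a sketch only, assuming a pandas build that includes this change; the column names and values are made up):

```python
from io import StringIO

import numpy as np
import pandas as pd

data = StringIO("a,b\n1,x\n2,y\n")

# dtype now also works with engine='python'; `object`/`str` keeps
# column 'b' as uninterpreted strings.
df = pd.read_csv(data, engine='python',
                 dtype={'a': np.float64, 'b': object})
print(df.dtypes)  # a -> float64, b -> object
```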
@@ -421,6 +426,7 @@ def _read(filepath_or_buffer, kwds):
     'true_values': None,
     'false_values': None,
     'converters': None,
+    'dtype': None,
     'skipfooter': 0,

     'keep_default_na': True,
@@ -461,7 +467,6 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines': None,
     'error_bad_lines': True,
     'warn_bad_lines': True,
-    'dtype': None,
     'float_precision': None
 }

@@ -476,7 +481,6 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines',
     'error_bad_lines',
     'warn_bad_lines',
-    'dtype',
     'float_precision',
 ])
 _deprecated_args = set([
@@ -834,9 +838,6 @@ def _clean_options(self, options, engine):
                        " ignored as it is not supported by the 'python'"
                        " engine.").format(reason=fallback_reason,
                                           option=arg)
-                if arg == 'dtype':
-                    msg += " (Note the 'converters' option provides"\
-                           " similar functionality.)"
                 raise ValueError(msg)
             del result[arg]

@@ -1285,36 +1286,59 @@ def _agg_index(self, index, try_parse_dates=True):
             col_na_values, col_na_fvalues = _get_na_values(
                 col_name, self.na_values, self.na_fvalues)

-            arr, _ = self._convert_types(arr, col_na_values | col_na_fvalues)
+            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
             arrays.append(arr)

         index = MultiIndex.from_arrays(arrays, names=self.index_names)

         return index

     def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
-                              converters=None):
+                              converters=None, dtypes=None):
         result = {}
         for c, values in compat.iteritems(dct):
             conv_f = None if converters is None else converters.get(c, None)
+            if isinstance(dtypes, dict):
+                cast_type = dtypes.get(c, None)
+            else:
+                # single dtype or None
+                cast_type = dtypes

             if self.na_filter:
                 col_na_values, col_na_fvalues = _get_na_values(
                     c, na_values, na_fvalues)
             else:
                 col_na_values, col_na_fvalues = set(), set()

-            coerce_type = True
             if conv_f is not None:
+                # conv_f applied to data before inference
+                if cast_type is not None:
+                    warnings.warn(("Both a converter and dtype were specified "
+                                   "for column {0} - only the converter will "
+                                   "be used").format(c), ParserWarning,
+                                  stacklevel=7)
+
                 try:
                     values = lib.map_infer(values, conv_f)
                 except ValueError:
                     mask = lib.ismember(values, na_values).view(np.uint8)
                     values = lib.map_infer_mask(values, conv_f, mask)
-                coerce_type = False

-            cvals, na_count = self._convert_types(
-                values, set(col_na_values) | col_na_fvalues, coerce_type)
+                cvals, na_count = self._infer_types(
+                    values, set(col_na_values) | col_na_fvalues,
+                    try_num_bool=False)
+            else:
+                # skip inference if specified dtype is object
+                try_num_bool = not (cast_type and is_object_dtype(cast_type))
+
+                # general type inference and conversion
+                cvals, na_count = self._infer_types(
+                    values, set(col_na_values) | col_na_fvalues,
+                    try_num_bool)
+
+                # type specified in dtype param
+                if cast_type and not is_dtype_equal(cvals, cast_type):
+                    cvals = self._cast_types(cvals, cast_type, c)

             if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
                 cvals = lib.downcast_int64(
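The warning branch above can be exercised from user code; a minimal sketch (hypothetical data, assuming a build that includes this change and that the warning is not filtered elsewhere):

```python
import warnings
from io import StringIO

import numpy as np
import pandas as pd

data = StringIO("a\n1\n2\n")

# A converter and a dtype for the same column: the converter is applied
# instead of the dtype and a ParserWarning is emitted.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df = pd.read_csv(data, engine='python',
                     dtype={'a': np.float64},
                     converters={'a': lambda v: int(v) * 10})

print(df['a'].tolist())              # [10, 20]
print(caught[-1].category.__name__)  # ParserWarning
```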
@@ -1326,7 +1350,23 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                 print('Filled %d NA values in column %s' % (na_count, str(c)))
         return result

-    def _convert_types(self, values, na_values, try_num_bool=True):
+    def _infer_types(self, values, na_values, try_num_bool=True):
+        """
+        Infer types of values, possibly casting
+
+        Parameters
+        ----------
+        values : ndarray
+        na_values : set
+        try_num_bool : bool, default True
+            try to cast values to numeric (first preference) or boolean
+
+        Returns
+        -------
+        converted : ndarray
+        na_count : int
+        """
+
         na_count = 0
         if issubclass(values.dtype.type, (np.number, np.bool_)):
             mask = lib.ismember(values, na_values)
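The try_num_bool flag is what lets an explicit object dtype suppress numeric/boolean inference; a small sketch of the user-visible effect (hypothetical data, assuming a build with this change):

```python
from io import StringIO

import pandas as pd

csv_text = "flag,code\nTrue,001\nFalse,002\n"

# Default inference parses 'flag' as bool and 'code' as int (leading zeros
# are lost); requesting object skips inference and keeps the raw strings.
inferred = pd.read_csv(StringIO(csv_text), engine='python')
preserved = pd.read_csv(StringIO(csv_text), engine='python',
                        dtype={'flag': object, 'code': object})

print(inferred.dtypes.tolist())    # [dtype('bool'), dtype('int64')]
print(preserved['code'].tolist())  # ['001', '002']
```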
@@ -1340,6 +1380,7 @@ def _convert_types(self, values, na_values, try_num_bool=True):
         if try_num_bool:
             try:
                 result = lib.maybe_convert_numeric(values, na_values, False)
+                na_count = isnull(result).sum()
             except Exception:
                 result = values
                 if values.dtype == np.object_:
@@ -1356,6 +1397,38 @@ def _convert_types(self, values, na_values, try_num_bool=True):

         return result, na_count

+    def _cast_types(self, values, cast_type, column):
+        """
+        Cast values to specified type
+
+        Parameters
+        ----------
+        values : ndarray
+        cast_type : string or np.dtype
+            dtype to cast values to
+        column : string
+            column name - used only for error reporting
+
+        Returns
+        -------
+        converted : ndarray
+        """
+
+        if is_categorical_dtype(cast_type):
+            # XXX this is for consistency with
+            # c-parser which parses all categories
+            # as strings
+            if not is_object_dtype(values):
+                values = _astype_nansafe(values, str)
+            values = Categorical(values)
+        else:
+            try:
+                values = _astype_nansafe(values, cast_type, copy=True)
+            except ValueError:
+                raise ValueError("Unable to convert column %s to "
+                                 "type %s" % (column, cast_type))
+        return values
+
     def _do_date_conversions(self, names, data):
         # returns data, columns
         if self.parse_dates is not None:
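The categorical branch of _cast_types is why dtype='category' yields string categories even for numeric-looking input, matching the C parser; a sketch (assuming a build with this change):

```python
from io import StringIO

import pandas as pd

data = StringIO("col\n1\n2\n1\n")

# Values are cast to str before building the Categorical, so the
# categories are the strings '1' and '2', not the integers 1 and 2.
ser = pd.read_csv(data, engine='python', dtype={'col': 'category'})['col']
print(ser.dtype)                 # category
print(list(ser.cat.categories))  # ['1', '2']
```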
@@ -1784,6 +1857,7 @@ def __init__(self, f, **kwds):

         self.verbose = kwds['verbose']
         self.converters = kwds['converters']
+        self.dtype = kwds['dtype']

         self.compact_ints = kwds['compact_ints']
         self.use_unsigned = kwds['use_unsigned']
@@ -1982,7 +2056,7 @@ def read(self, rows=None):
             # DataFrame with the right metadata, even though it's length 0
             names = self._maybe_dedup_names(self.orig_names)
             index, columns, col_dict = _get_empty_meta(
                 names, self.index_col, self.index_names)
+                names, self.index_col, self.index_names, self.dtype)
             columns = self._maybe_make_multi_index_columns(
                 columns, self.col_names)
             return index, columns, col_dict
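Passing self.dtype through to _get_empty_meta means a header-only file still honours the requested dtypes; a sketch (assuming a build with this change):

```python
from io import StringIO

import numpy as np
import pandas as pd

# No data rows: the empty-frame path above still applies the dtypes
# instead of defaulting every column to object.
data = StringIO("a,b\n")
df = pd.read_csv(data, engine='python',
                 dtype={'a': np.int64, 'b': np.float64})

print(len(df))    # 0
print(df.dtypes)  # a -> int64, b -> float64
```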
@@ -2033,15 +2107,25 @@ def get_chunk(self, size=None):

     def _convert_data(self, data):
         # apply converters
-        clean_conv = {}
-
-        for col, f in compat.iteritems(self.converters):
-            if isinstance(col, int) and col not in self.orig_names:
-                col = self.orig_names[col]
-            clean_conv[col] = f
+        def _clean_mapping(mapping):
+            "converts col numbers to names"
+            clean = {}
+            for col, v in compat.iteritems(mapping):
+                if isinstance(col, int) and col not in self.orig_names:
+                    col = self.orig_names[col]
+                clean[col] = v
+            return clean
+
+        clean_conv = _clean_mapping(self.converters)
+        if not isinstance(self.dtype, dict):
+            # handles single dtype applied to all columns
+            clean_dtypes = self.dtype
+        else:
+            clean_dtypes = _clean_mapping(self.dtype)

         return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues,
-                                         self.verbose, clean_conv)
+                                         self.verbose, clean_conv,
+                                         clean_dtypes)

     def _to_recarray(self, data, columns):
         dtypes = []
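_clean_mapping normalises positional keys for both converters and dtype, so a dtype keyed by column index behaves like one keyed by name; a sketch (assuming a build with this change):

```python
from io import StringIO

import numpy as np
import pandas as pd

data = StringIO("a,b\n1,2\n3,4\n")

# Positional keys 0 and 1 are mapped to the column names 'a' and 'b'
# before the per-column conversion runs.
df = pd.read_csv(data, engine='python', dtype={0: np.float64, 1: str})
print(df.dtypes)  # a -> float64, b -> object
```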