-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add option to use nullable dtypes in read_csv #48776
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
d1c0b51
d7a7eca
4f05540
afe9ca5
af6056b
8b0dc2f
291e761
8a4d206
30d68a8
6139d87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
Hashable, | ||
Iterable, | ||
List, | ||
Literal, | ||
Mapping, | ||
Sequence, | ||
Tuple, | ||
|
@@ -50,6 +51,7 @@ | |
is_dict_like, | ||
is_dtype_equal, | ||
is_extension_array_dtype, | ||
is_float_dtype, | ||
is_integer, | ||
is_integer_dtype, | ||
is_list_like, | ||
|
@@ -61,8 +63,14 @@ | |
from pandas.core.dtypes.dtypes import CategoricalDtype | ||
from pandas.core.dtypes.missing import isna | ||
|
||
from pandas import StringDtype | ||
from pandas.core import algorithms | ||
from pandas.core.arrays import Categorical | ||
from pandas.core.arrays import ( | ||
BooleanArray, | ||
Categorical, | ||
FloatingArray, | ||
IntegerArray, | ||
) | ||
from pandas.core.indexes.api import ( | ||
Index, | ||
MultiIndex, | ||
|
@@ -110,6 +118,7 @@ def __init__(self, kwds) -> None: | |
|
||
self.dtype = copy(kwds.get("dtype", None)) | ||
self.converters = kwds.get("converters") | ||
self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False) | ||
|
||
self.true_values = kwds.get("true_values") | ||
self.false_values = kwds.get("false_values") | ||
|
@@ -508,7 +517,7 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: | |
) | ||
|
||
arr, _ = self._infer_types( | ||
arr, col_na_values | col_na_fvalues, try_num_bool | ||
arr, col_na_values | col_na_fvalues, cast_type, try_num_bool | ||
) | ||
arrays.append(arr) | ||
|
||
|
@@ -574,7 +583,10 @@ def _convert_to_ndarrays( | |
values = lib.map_infer_mask(values, conv_f, mask) | ||
|
||
cvals, na_count = self._infer_types( | ||
values, set(col_na_values) | col_na_fvalues, try_num_bool=False | ||
values, | ||
set(col_na_values) | col_na_fvalues, | ||
cast_type, | ||
try_num_bool=False, | ||
) | ||
else: | ||
is_ea = is_extension_array_dtype(cast_type) | ||
|
@@ -585,14 +597,11 @@ def _convert_to_ndarrays( | |
|
||
# general type inference and conversion | ||
cvals, na_count = self._infer_types( | ||
values, set(col_na_values) | col_na_fvalues, try_num_bool | ||
values, set(col_na_values) | col_na_fvalues, cast_type, try_num_bool | ||
) | ||
|
||
# type specified in dtype param or cast_type is an EA | ||
if cast_type and ( | ||
not is_dtype_equal(cvals, cast_type) | ||
or is_extension_array_dtype(cast_type) | ||
): | ||
if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea): | ||
if not is_ea and na_count > 0: | ||
try: | ||
if is_bool_dtype(cast_type): | ||
|
@@ -679,14 +688,15 @@ def _set(x) -> int: | |
|
||
return noconvert_columns | ||
|
||
def _infer_types(self, values, na_values, try_num_bool: bool = True): | ||
def _infer_types(self, values, na_values, cast_type, try_num_bool: bool = True): | ||
""" | ||
Infer types of values, possibly casting | ||
|
||
Parameters | ||
---------- | ||
values : ndarray | ||
na_values : set | ||
cast_type: Specifies if we want to cast explicitly | ||
try_num_bool : bool, default True | ||
try to cast values to numeric (first preference) or boolean | ||
|
||
|
@@ -707,28 +717,58 @@ def _infer_types(self, values, na_values, try_num_bool: bool = True): | |
np.putmask(values, mask, np.nan) | ||
return values, na_count | ||
|
||
use_nullable_dtypes: Literal[True] | Literal[False] = ( | ||
self.use_nullable_dtypes and cast_type is None | ||
) | ||
result: ArrayLike | ||
|
||
if try_num_bool and is_object_dtype(values.dtype): | ||
# exclude e.g DatetimeIndex here | ||
try: | ||
result, _ = lib.maybe_convert_numeric(values, na_values, False) | ||
result, result_mask = lib.maybe_convert_numeric( | ||
values, | ||
na_values, | ||
False, | ||
convert_to_masked_nullable=use_nullable_dtypes, | ||
) | ||
except (ValueError, TypeError): | ||
# e.g. encountering datetime string gets ValueError | ||
# TypeError can be raised in floatify | ||
na_count = parsers.sanitize_objects(values, na_values) | ||
result = values | ||
na_count = parsers.sanitize_objects(result, na_values) | ||
else: | ||
na_count = isna(result).sum() | ||
if use_nullable_dtypes: | ||
if result_mask is None: | ||
result_mask = np.zeros(result.shape, dtype=np.bool_) | ||
|
||
if is_integer_dtype(result): | ||
result = IntegerArray(result, result_mask) | ||
elif is_bool_dtype(result): | ||
result = BooleanArray(result, result_mask) | ||
elif is_float_dtype(result): | ||
result = FloatingArray(result, result_mask) | ||
|
||
na_count = result_mask.sum() | ||
else: | ||
na_count = isna(result).sum() | ||
else: | ||
result = values | ||
if values.dtype == np.object_: | ||
na_count = parsers.sanitize_objects(values, na_values) | ||
|
||
if result.dtype == np.object_ and try_num_bool: | ||
result, _ = libops.maybe_convert_bool( | ||
result, bool_mask = libops.maybe_convert_bool( | ||
np.asarray(values), | ||
true_values=self.true_values, | ||
false_values=self.false_values, | ||
convert_to_masked_nullable=use_nullable_dtypes, | ||
) | ||
if result.dtype == np.bool_ and use_nullable_dtypes: | ||
if bool_mask is None: | ||
bool_mask = np.zeros(result.shape, dtype=np.bool_) | ||
result = BooleanArray(result, bool_mask) | ||
elif result.dtype == np.object_ and use_nullable_dtypes: | ||
result = StringDtype().construct_array_type()._from_sequence(values) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you test what happens when the string pyarrow global config is true? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
return result, na_count | ||
|
||
|
@@ -1146,6 +1186,7 @@ def converter(*date_cols): | |
"on_bad_lines": ParserBase.BadLineHandleMethod.ERROR, | ||
"error_bad_lines": None, | ||
"warn_bad_lines": None, | ||
"use_nullable_dtypes": False, | ||
} | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -385,3 +385,45 @@ def test_dtypes_defaultdict_invalid(all_parsers): | |
parser = all_parsers | ||
with pytest.raises(TypeError, match="not understood"): | ||
parser.read_csv(StringIO(data), dtype=dtype) | ||
|
||
|
||
def test_use_nullabla_dtypes(all_parsers): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: typo here and below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thx, fixed |
||
# GH#36712 | ||
|
||
parser = all_parsers | ||
|
||
data = """a,b,c,d,e,f,g,h,i | ||
1,2.5,True,a,,,,,12-31-2019 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add a column here where both rows have an empty value? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added, casts to Int64 now in both cases. The better question is what we actually want here, because this could be anything. |
||
3,4.5,False,b,6,7.5,True,a,12-31-2019 | ||
""" | ||
result = parser.read_csv( | ||
StringIO(data), use_nullable_dtypes=True, parse_dates=["i"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you parametrize for use_nullable_dtypes = True/False here and for the other tests? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, this is impossible to understand if parameterized. Expected looks completely different. I could add a new test in theory, but it would not bring much value; we are testing all possible cases already with numpy dtypes. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, thanks for checking. |
||
) | ||
expected = DataFrame( | ||
{ | ||
"a": pd.Series([1, 3], dtype="Int64"), | ||
"b": pd.Series([2.5, 4.5], dtype="Float64"), | ||
"c": pd.Series([True, False], dtype="boolean"), | ||
"d": pd.Series(["a", "b"], dtype="string"), | ||
"e": pd.Series([pd.NA, 6], dtype="Int64"), | ||
"f": pd.Series([pd.NA, 7.5], dtype="Float64"), | ||
"g": pd.Series([pd.NA, True], dtype="boolean"), | ||
"h": pd.Series([pd.NA, "a"], dtype="string"), | ||
"i": pd.Series([Timestamp("2019-12-31")] * 2), | ||
} | ||
) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
def test_use_nullabla_dtypes_and_dtype(all_parsers): | ||
# GH#36712 | ||
|
||
parser = all_parsers | ||
|
||
data = """a,b | ||
1,2.5 | ||
, | ||
""" | ||
result = parser.read_csv(StringIO(data), use_nullable_dtypes=True, dtype="float64") | ||
expected = DataFrame({"a": [1.0, np.nan], "b": [2.5, np.nan]}) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we make this bool? Looks like we only need to check that it's not `None`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed