diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 3f0a3590e24a3..ba01d7338a8d0 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -1,11 +1,12 @@
 import copy
 import sys
-from typing import Type
+from typing import Sequence, Type
 import warnings
 
 import numpy as np
 
 from pandas._libs import lib
+from pandas._typing import Dtype
 from pandas.compat import set_function_name
 from pandas.util._decorators import cache_readonly
 
@@ -304,9 +305,39 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         return integer_array(scalars, dtype=dtype, copy=copy)
 
     @classmethod
-    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
-        scalars = to_numeric(strings, errors="raise")
-        return cls._from_sequence(scalars, dtype, copy)
+    def _from_sequence_of_strings(cls,
+                                  strings: Sequence[str],
+                                  dtype: Dtype = None,
+                                  copy: bool = False) -> 'IntegerArray':
+        """
+        Construct a new IntegerArray from a sequence of strings.
+
+        Parameters
+        ----------
+        strings : Sequence of str
+            Text of integers; missing entries are those flagged by
+            ``isna``.
+        dtype : Dtype, optional
+            Integer dtype of the result, forwarded to ``coerce_to_array``.
+        copy : bool, default False
+            Forwarded to ``coerce_to_array``.
+
+        Returns
+        -------
+        IntegerArray
+        """
+        # Mask the NA locations before sending to to_numeric to prevent an
+        # undesirable cast to float, which may lose precision.
+        mask = isna(strings)
+        masked_strings = np.where(mask, 0, strings)
+
+        scalars = to_numeric(masked_strings, errors="raise")
+
+        # Route through coerce_to_array so that the requested dtype and the
+        # copy flag are honoured rather than silently ignored.
+        values, mask = coerce_to_array(scalars, dtype=dtype, mask=mask,
+                                       copy=copy)
+        return IntegerArray(values, mask)
 
     @classmethod
     def _from_factorized(cls, values, original):
diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
index 1d3c935e9101b..92574c81b4fa2 100644
--- a/pandas/tests/io/parser/test_dtypes.py
+++ b/pandas/tests/io/parser/test_dtypes.py
@@ -509,3 +509,29 @@ def test_numeric_dtype(all_parsers, dtype):
     result = parser.read_csv(StringIO(data), header=None, dtype=dtype)
 
     tm.assert_frame_equal(expected, result)
+
+
+def test_intna_precision(all_parsers):
+    # Large integers next to a missing value must be read as Int64 without
+    # a lossy round-trip through float64.
+    parser = all_parsers
+    data = "1556559573141592653\n1556559573141592654\n\n1556559573141592655"
+    dtype = 'Int64'
+
+    expected = DataFrame([
+        [1556559573141592653],
+        [1556559573141592654],
+        [0],
+        [1556559573141592655],
+    ], dtype=dtype)
+    expected.iloc[2] = np.nan  # TODO: fix general bug on df construction
+
+    result = parser.read_csv(StringIO(data), header=None, dtype=dtype,
+                             skip_blank_lines=False)
+
+    tm.assert_frame_equal(result, expected)
+
+    # Also compare the scalars exactly. ``result.iloc[i]`` alone is a Series,
+    # whose comparison result cannot be used in a bare ``assert``.
+    for row in (0, 1, 3):
+        assert result.iloc[row, 0] == expected.iloc[row, 0]