diff --git a/doc/source/io.rst b/doc/source/io.rst index 7230ff55f9a6c..3bbd4e8410fa5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -362,16 +362,17 @@ columns: .. ipython:: python - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = ('a,b,c,d\n' + '1,2,3,4\n' + '5,6,7,8\n' + '9,10,11') print(data) df = pd.read_csv(StringIO(data), dtype=object) df df['a'][0] - df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) + df = pd.read_csv(StringIO(data), + dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a84fd118061bc..028ed99b23a6c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -31,6 +31,7 @@ New features - :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`) +- :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`) - :meth:`DataFrame.shift` :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) .. 
_whatsnew_0240.values_api: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 169aa4ffe72da..0b7edaf04a1ed 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,7 +50,7 @@ from pandas.core.dtypes.common import ( is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype) + pandas_dtype, is_extension_array_dtype) from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom @@ -983,7 +983,6 @@ cdef class TextReader: footer=footer, upcast_na=True) self._end_clock('Type conversion') - self._start_clock() if len(columns) > 0: rows_read = len(list(columns.values())[0]) @@ -1123,7 +1122,9 @@ cdef class TextReader: if na_filter: self._free_na_set(na_hashset) - if upcast_na and na_count > 0: + # don't try to upcast EAs + try_upcast = upcast_na and na_count > 0 + if try_upcast and not is_extension_array_dtype(col_dtype): col_res = _maybe_upcast(col_res) if col_res is None: @@ -1215,6 +1216,22 @@ cdef class TextReader: cats, codes, dtype, true_values=true_values) return cat, na_count + elif is_extension_array_dtype(dtype): + result, na_count = self._string_convert(i, start, end, na_filter, + na_hashset) + array_type = dtype.construct_array_type() + try: + # use _from_sequence_of_strings if the class defines it + result = array_type._from_sequence_of_strings(result, + dtype=dtype) + except NotImplementedError: + raise NotImplementedError( + "Extension Array: {ea} must implement " + "_from_sequence_of_strings in order " + "to be used in parser methods".format(ea=array_type)) + + return result, na_count + elif is_integer_dtype(dtype): try: result, na_count = _try_int64(self.parser, i, start, @@ -1240,7 +1257,6 @@ cdef class TextReader: if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count - elif is_bool_dtype(dtype): result, na_count = _try_bool_flex(self.parser, i, start, end, 
na_filter, na_hashset, @@ -2173,7 +2189,11 @@ def _concatenate_chunks(list chunks): result[name] = union_categoricals(arrs, sort_categories=sort_categories) else: - result[name] = np.concatenate(arrs) + if is_extension_array_dtype(dtype): + array_type = dtype.construct_array_type() + result[name] = array_type._concat_same_type(arrs) + else: + result[name] = np.concatenate(arrs) if warning_columns: warning_names = ','.join(warning_columns) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2d4f8ca9c2cee..7aaefef3d03e5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -78,6 +78,11 @@ class ExtensionArray(object): * _reduce + One can implement methods to handle parsing from strings that will be used + in methods such as ``pandas.io.parsers.read_csv``. + + * _from_sequence_of_strings + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -128,6 +133,30 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): """ raise AbstractMethodError(cls) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + """Construct a new ExtensionArray from a sequence of strings. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + strings : Sequence + Each element is a string to be parsed into the scalar type for + this array, ``cls.dtype.type``. + dtype : dtype, optional + Construct for this particular dtype. This should be a Dtype + compatible with the ExtensionArray. + copy : boolean, default False + If True, copy the underlying data. 
+ + Returns + ------- + ExtensionArray + + """ + raise AbstractMethodError(cls) + @classmethod def _from_factorized(cls, values, original): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 38dc68e8f77a3..eaec76b96a24d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -19,6 +19,7 @@ from pandas.core import nanops from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.tools.numeric import to_numeric class _IntegerDtype(ExtensionDtype): @@ -261,6 +262,11 @@ def __init__(self, values, mask, copy=False): def _from_sequence(cls, scalars, dtype=None, copy=False): return integer_array(scalars, dtype=dtype, copy=copy) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype, copy) + @classmethod def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index de0ed9407e161..0d10e8b8985cb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -28,8 +28,8 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, - is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype) + is_extension_array_dtype, is_float, is_integer, is_integer_dtype, + is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -134,7 +134,8 @@ 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. dtype : Type name or dict of column -> type, optional - Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} + Data type for data or columns. E.g. 
{{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} Use `str` or `object` together with suitable `na_values` settings to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD @@ -1659,16 +1660,20 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: + is_str_or_ea_dtype = (is_string_dtype(cast_type) + or is_extension_array_dtype(cast_type)) # skip inference if specified dtype is object - try_num_bool = not (cast_type and is_string_dtype(cast_type)) + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) - # type specified in dtype param - if cast_type and not is_dtype_equal(cvals, cast_type): + # type specified in dtype param or cast_type is an EA + if cast_type and (not is_dtype_equal(cvals, cast_type) + or is_extension_array_dtype(cast_type)): try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) @@ -1765,6 +1770,20 @@ def _cast_types(self, values, cast_type, column): cats, cats.get_indexer(values), cast_type, true_values=self.true_values) + # use the EA's implementation of casting + elif is_extension_array_dtype(cast_type): + # ensure cast_type is an actual dtype and not a string + cast_type = pandas_dtype(cast_type) + array_type = cast_type.construct_array_type() + try: + return array_type._from_sequence_of_strings(values, + dtype=cast_type) + except NotImplementedError: + raise NotImplementedError( + "Extension Array: {ea} must implement " + "_from_sequence_of_strings in order " + "to be used in parser methods".format(ea=array_type)) + else: try: values = astype_nansafe(values, cast_type, @@ -2174,8 +2193,8 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] - self.dtype = kwds['dtype'] + self.dtype = 
kwds['dtype'] self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 57704b77bb233..1f7ee2ae17e4a 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -53,3 +53,4 @@ class TestMyDtype(BaseDtypeTests): from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa +from .io import BaseParsingTests # noqa diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py new file mode 100644 index 0000000000000..7ea62e4e9d678 --- /dev/null +++ b/pandas/tests/extension/base/io.py @@ -0,0 +1,23 @@ +import numpy as np +import pytest + +from pandas.compat import StringIO + +import pandas as pd + +from .base import BaseExtensionTests + + +class BaseParsingTests(BaseExtensionTests): + + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + df = pd.DataFrame({ + 'with_dtype': pd.Series(data, dtype=str(data.dtype)) + }) + csv_output = df.to_csv(index=False, na_rep=np.nan) + result = pd.read_csv(StringIO(csv_output), dtype={ + 'with_dtype': str(data.dtype) + }, engine=engine) + expected = df + self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 7e618dfd2b92e..1823eeb4d7fc0 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -75,6 +75,11 @@ def dtype(self): def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence([decimal.Decimal(x) for x in strings], + dtype, copy) + @classmethod def _from_factorized(cls, values, original): return cls(values) diff --git a/pandas/tests/extension/test_categorical.py 
b/pandas/tests/extension/test_categorical.py index c876db416470c..ac52d8f15b8ce 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -237,3 +237,7 @@ def _compare_other(self, s, data, op_name, other): else: with pytest.raises(TypeError): op(data, other) + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index e21ca81bcf5c3..aadf9f2f12b68 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -218,3 +218,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestPrinting(base.BasePrintingTests): pass + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 644f3ef94f40b..6eedbfb4aba39 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -152,3 +152,11 @@ class TestPrinting(BaseInterval, base.BasePrintingTests): @pytest.mark.skip(reason="custom repr") def test_array_repr(self, data, size): pass + + +class TestParsing(BaseInterval, base.BaseParsingTests): + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + expected_msg = r'.*must implement _from_sequence_of_strings.*' + with pytest.raises(NotImplementedError, match=expected_msg): + super(TestParsing, self).test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 70a3a8ab58aac..7ca6882c7441b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -210,3 +210,7 @@ def test_concat_mixed_dtypes(self, data): class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): pass + + +class TestParsing(BaseNumPyTests, base.BaseParsingTests): + pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py 
index 08e21fc30ad10..813efcb5678d3 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -156,3 +156,11 @@ class TestGroupby(BasePeriodTests, base.BaseGroupbyTests): class TestPrinting(BasePeriodTests, base.BasePrintingTests): pass + + +class TestParsing(BasePeriodTests, base.BaseParsingTests): + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + expected_msg = r'.*must implement _from_sequence_of_strings.*' + with pytest.raises(NotImplementedError, match=expected_msg): + super(TestParsing, self).test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 257eb44cd94fe..39a138ed534bd 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -359,3 +359,11 @@ class TestPrinting(BaseSparseTests, base.BasePrintingTests): @pytest.mark.xfail(reason='Different repr', strict=True) def test_array_repr(self, data, size): super(TestPrinting, self).test_array_repr(data, size) + + +class TestParsing(BaseSparseTests, base.BaseParsingTests): + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + expected_msg = r'.*must implement _from_sequence_of_strings.*' + with pytest.raises(NotImplementedError, match=expected_msg): + super(TestParsing, self).test_EA_types(engine, data) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py new file mode 100644 index 0000000000000..e69de29bb2d1d