ENH:Add EA types to read CSV (#23255)

kprestel · jreback · commit f67aa133d033 · 2019-01-01T21:57:19.000-05:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -362,16 +362,17 @@ columns:
 
 .. ipython:: python
 
-    data = ('a,b,c\n'
-            '1,2,3\n'
-            '4,5,6\n'
-            '7,8,9')
+    data = ('a,b,c,d\n'
+            '1,2,3,4\n'
+            '5,6,7,8\n'
+            '9,10,11')
     print(data)
 
     df = pd.read_csv(StringIO(data), dtype=object)
     df
     df['a'][0]
-    df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64})
+    df = pd.read_csv(StringIO(data),
+                     dtype={'b': object, 'c': np.float64, 'd': 'Int64'})
     df.dtypes
 
 Fortunately, pandas offers more than one way to ensure that your column(s)
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -31,6 +31,7 @@ New features
 - :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`)
 - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame.
   See the :ref:`section on writing HTML <io.html>` in the IO docs for example usage. (:issue:`2679`)
+- :func:`pandas.read_csv` now supports pandas extension types (e.g. ``'Int64'``) as an argument to ``dtype``, allowing the user to parse columns directly into extension arrays when reading CSVs. (:issue:`23228`)
 - :meth:`DataFrame.shift` :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`)
 
 .. _whatsnew_0240.values_api:
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -50,7 +50,7 @@ from pandas.core.dtypes.common import (
     is_integer_dtype, is_float_dtype,
     is_bool_dtype, is_object_dtype,
     is_datetime64_dtype,
-    pandas_dtype)
+    pandas_dtype, is_extension_array_dtype)
 from pandas.core.arrays import Categorical
 from pandas.core.dtypes.concat import union_categoricals
 import pandas.io.common as icom
@@ -983,7 +983,6 @@ cdef class TextReader:
                                             footer=footer,
                                             upcast_na=True)
         self._end_clock('Type conversion')
-
         self._start_clock()
         if len(columns) > 0:
             rows_read = len(list(columns.values())[0])
@@ -1123,7 +1122,9 @@ cdef class TextReader:
                 if na_filter:
                     self._free_na_set(na_hashset)
 
-            if upcast_na and na_count > 0:
+            # don't try to upcast EAs
+            try_upcast = upcast_na and na_count > 0
+            if try_upcast and not is_extension_array_dtype(col_dtype):
                 col_res = _maybe_upcast(col_res)
 
             if col_res is None:
@@ -1215,6 +1216,22 @@ cdef class TextReader:
                 cats, codes, dtype, true_values=true_values)
             return cat, na_count
 
+        elif is_extension_array_dtype(dtype):
+            result, na_count = self._string_convert(i, start, end, na_filter,
+                                                    na_hashset)
+            array_type = dtype.construct_array_type()
+            try:
+                # use _from_sequence_of_strings if the class defines it
+                result = array_type._from_sequence_of_strings(result,
+                                                              dtype=dtype)
+            except NotImplementedError:
+                raise NotImplementedError(
+                    "Extension Array: {ea} must implement "
+                    "_from_sequence_of_strings in order "
+                    "to be used in parser methods".format(ea=array_type))
+
+            return result, na_count
+
         elif is_integer_dtype(dtype):
             try:
                 result, na_count = _try_int64(self.parser, i, start,
@@ -1240,7 +1257,6 @@ cdef class TextReader:
             if result is not None and dtype != 'float64':
                 result = result.astype(dtype)
             return result, na_count
-
         elif is_bool_dtype(dtype):
             result, na_count = _try_bool_flex(self.parser, i, start, end,
                                               na_filter, na_hashset,
@@ -2173,7 +2189,11 @@ def _concatenate_chunks(list chunks):
             result[name] = union_categoricals(arrs,
                                               sort_categories=sort_categories)
         else:
-            result[name] = np.concatenate(arrs)
+            if is_extension_array_dtype(dtype):
+                array_type = dtype.construct_array_type()
+                result[name] = array_type._concat_same_type(arrs)
+            else:
+                result[name] = np.concatenate(arrs)
 
     if warning_columns:
         warning_names = ','.join(warning_columns)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -78,6 +78,11 @@ class ExtensionArray(object):
 
     * _reduce
 
+    One can implement methods to handle parsing from strings, which are used
+    by functions such as ``pandas.io.parsers.read_csv``.
+
+    * _from_sequence_of_strings
+
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
     ``pandas.errors.AbstractMethodError`` and no ``register`` method is
@@ -128,6 +133,30 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         """
         raise AbstractMethodError(cls)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        """Construct a new ExtensionArray from a sequence of strings.
+
+        .. versionadded:: 0.24.0
+
+        Parameters
+        ----------
+        strings : Sequence
+            Each element will be a string to be parsed into the scalar
+            type for this array, ``cls.dtype.type``.
+        dtype : dtype, optional
+            Construct for this particular dtype. This should be a Dtype
+            compatible with the ExtensionArray.
+        copy : boolean, default False
+            If True, copy the underlying data.
+
+        Returns
+        -------
+        ExtensionArray
+
+        """
+        raise AbstractMethodError(cls)
+
     @classmethod
     def _from_factorized(cls, values, original):
         """
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -19,6 +19,7 @@
 
 from pandas.core import nanops
 from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
+from pandas.core.tools.numeric import to_numeric
 
 
 class _IntegerDtype(ExtensionDtype):
@@ -261,6 +262,11 @@ def __init__(self, values, mask, copy=False):
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return integer_array(scalars, dtype=dtype, copy=copy)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        scalars = to_numeric(strings, errors="raise")
+        return cls._from_sequence(scalars, dtype, copy)
+
     @classmethod
     def _from_factorized(cls, values, original):
         return integer_array(values, dtype=original.dtype)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -28,8 +28,8 @@
 from pandas.core.dtypes.cast import astype_nansafe
 from pandas.core.dtypes.common import (
     ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal,
-    is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype,
-    is_scalar, is_string_dtype)
+    is_extension_array_dtype, is_float, is_integer, is_integer_dtype,
+    is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 
@@ -134,7 +134,8 @@
     'X'...'X'. Passing in False will cause data to be overwritten if there
     are duplicate names in the columns.
 dtype : Type name or dict of column -> type, optional
-    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}}
+    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
+    'c': 'Int64'}}
     Use `str` or `object` together with suitable `na_values` settings
     to preserve and not interpret dtype.
     If converters are specified, they will be applied INSTEAD
@@ -1659,16 +1660,20 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                     values, set(col_na_values) | col_na_fvalues,
                     try_num_bool=False)
             else:
+                is_str_or_ea_dtype = (is_string_dtype(cast_type)
+                                      or is_extension_array_dtype(cast_type))
                 # skip inference if specified dtype is object
-                try_num_bool = not (cast_type and is_string_dtype(cast_type))
+                # or casting to an EA
+                try_num_bool = not (cast_type and is_str_or_ea_dtype)
 
                 # general type inference and conversion
                 cvals, na_count = self._infer_types(
                     values, set(col_na_values) | col_na_fvalues,
                     try_num_bool)
 
-                # type specified in dtype param
-                if cast_type and not is_dtype_equal(cvals, cast_type):
+                # type specified in dtype param or cast_type is an EA
+                if cast_type and (not is_dtype_equal(cvals, cast_type)
+                                  or is_extension_array_dtype(cast_type)):
                     try:
                         if (is_bool_dtype(cast_type) and
                                 not is_categorical_dtype(cast_type)
@@ -1765,6 +1770,20 @@ def _cast_types(self, values, cast_type, column):
                 cats, cats.get_indexer(values), cast_type,
                 true_values=self.true_values)
 
+        # use the EA's implementation of casting
+        elif is_extension_array_dtype(cast_type):
+            # ensure cast_type is an actual dtype and not a string
+            cast_type = pandas_dtype(cast_type)
+            array_type = cast_type.construct_array_type()
+            try:
+                return array_type._from_sequence_of_strings(values,
+                                                            dtype=cast_type)
+            except NotImplementedError:
+                raise NotImplementedError(
+                    "Extension Array: {ea} must implement "
+                    "_from_sequence_of_strings in order "
+                    "to be used in parser methods".format(ea=array_type))
+
         else:
             try:
                 values = astype_nansafe(values, cast_type,
@@ -2174,8 +2193,8 @@ def __init__(self, f, **kwds):
 
         self.verbose = kwds['verbose']
         self.converters = kwds['converters']
-        self.dtype = kwds['dtype']
 
+        self.dtype = kwds['dtype']
         self.thousands = kwds['thousands']
         self.decimal = kwds['decimal']
 
diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
@@ -53,3 +53,4 @@ class TestMyDtype(BaseDtypeTests):
 from .missing import BaseMissingTests  # noqa
 from .reshaping import BaseReshapingTests  # noqa
 from .setitem import BaseSetitemTests  # noqa
+from .io import BaseParsingTests  # noqa
diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py
@@ -0,0 +1,23 @@
+import numpy as np
+import pytest
+
+from pandas.compat import StringIO
+
+import pandas as pd
+
+from .base import BaseExtensionTests
+
+
+class BaseParsingTests(BaseExtensionTests):
+
+    @pytest.mark.parametrize('engine', ['c', 'python'])
+    def test_EA_types(self, engine, data):
+        df = pd.DataFrame({
+            'with_dtype': pd.Series(data, dtype=str(data.dtype))
+        })
+        csv_output = df.to_csv(index=False, na_rep=np.nan)
+        result = pd.read_csv(StringIO(csv_output), dtype={
+            'with_dtype': str(data.dtype)
+        }, engine=engine)
+        expected = df
+        self.assert_frame_equal(result, expected)
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -75,6 +75,11 @@ def dtype(self):
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return cls(scalars)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        return cls._from_sequence([decimal.Decimal(x) for x in strings],
+                                  dtype, copy)
+
     @classmethod
     def _from_factorized(cls, values, original):
         return cls(values)
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -237,3 +237,7 @@ def _compare_other(self, s, data, op_name, other):
         else:
             with pytest.raises(TypeError):
                 op(data, other)
+
+
+class TestParsing(base.BaseParsingTests):
+    pass
diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py
@@ -218,3 +218,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests):
 
 class TestPrinting(base.BasePrintingTests):
     pass
+
+
+class TestParsing(base.BaseParsingTests):
+    pass
diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py
@@ -152,3 +152,11 @@ class TestPrinting(BaseInterval, base.BasePrintingTests):
     @pytest.mark.skip(reason="custom repr")
     def test_array_repr(self, data, size):
         pass
+
+
+class TestParsing(BaseInterval, base.BaseParsingTests):
+    @pytest.mark.parametrize('engine', ['c', 'python'])
+    def test_EA_types(self, engine, data):
+        expected_msg = r'.*must implement _from_sequence_of_strings.*'
+        with pytest.raises(NotImplementedError, match=expected_msg):
+            super(TestParsing, self).test_EA_types(engine, data)
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -210,3 +210,7 @@ def test_concat_mixed_dtypes(self, data):
 
 class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
     pass
+
+
+class TestParsing(BaseNumPyTests, base.BaseParsingTests):
+    pass
diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py
@@ -156,3 +156,11 @@ class TestGroupby(BasePeriodTests, base.BaseGroupbyTests):
 
 class TestPrinting(BasePeriodTests, base.BasePrintingTests):
     pass
+
+
+class TestParsing(BasePeriodTests, base.BaseParsingTests):
+    @pytest.mark.parametrize('engine', ['c', 'python'])
+    def test_EA_types(self, engine, data):
+        expected_msg = r'.*must implement _from_sequence_of_strings.*'
+        with pytest.raises(NotImplementedError, match=expected_msg):
+            super(TestParsing, self).test_EA_types(engine, data)
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -359,3 +359,11 @@ class TestPrinting(BaseSparseTests, base.BasePrintingTests):
     @pytest.mark.xfail(reason='Different repr', strict=True)
     def test_array_repr(self, data, size):
         super(TestPrinting, self).test_array_repr(data, size)
+
+
+class TestParsing(BaseSparseTests, base.BaseParsingTests):
+    @pytest.mark.parametrize('engine', ['c', 'python'])
+    def test_EA_types(self, engine, data):
+        expected_msg = r'.*must implement _from_sequence_of_strings.*'
+        with pytest.raises(NotImplementedError, match=expected_msg):
+            super(TestParsing, self).test_EA_types(engine, data)
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py