WIP: Make Python engine support ExtensionArray (EA) types when reading CSVs

kprestel · kprestel · commit 22f2fa147411 · 2018-11-23T12:57:18.000-05:00
The C engine is the real WIP.
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -1209,7 +1209,11 @@ cdef class TextReader:
 
             if result is not None and dtype != 'int64':
                 if is_extension_array_dtype(dtype):
-                    result = result.astype(dtype.numpy_dtype)
+                    try:
+                        result = dtype.construct_array_type()._from_sequence(
+                                result, dtype=dtype)
+                    except Exception as e:
+                        raise
                 else:
                     result = result.astype(dtype)
 
@@ -1221,7 +1225,11 @@ cdef class TextReader:
 
             if result is not None and dtype != 'float64':
                 if is_extension_array_dtype(dtype):
-                    result = result.astype(dtype.numpy_dtype)
+                    try:
+                        result = dtype.construct_array_type()._from_sequence(
+                                result)
+                    except Exception as e:
+                        raise
                 else:
                     result = result.astype(dtype)
             return result, na_count
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -117,6 +117,27 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         """
         raise AbstractMethodError(cls)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        """Construct a new ExtensionArray from a sequence of strings.
+
+        Parameters
+        ----------
+        strings : Sequence of str
+            Each element will be a string that is parsed into the scalar
+            type for this array, ``cls.dtype.type``.
+        dtype : dtype, optional
+            Construct for this particular dtype. This should be a Dtype
+            compatible with the ExtensionArray.
+        copy : boolean, default False
+            If True, copy the underlying data.
+
+        Returns
+        -------
+        ExtensionArray
+        """
+        raise AbstractMethodError(cls)
+
     @classmethod
     def _from_factorized(cls, values, original):
         """Reconstruct an ExtensionArray after factorization.
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -156,7 +156,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
             dtype = dtype.lower()
         if not issubclass(type(dtype), _IntegerDtype):
             try:
-                dtype = _dtypes[str(np.dtype(dtype))]
+                dtype = _dtypes[str(np.dtype(dtype.name.lower()))]
             except KeyError:
                 raise ValueError("invalid dtype specified {}".format(dtype))
 
@@ -263,6 +263,10 @@ def __init__(self, values, mask, copy=False):
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return integer_array(scalars, dtype=dtype, copy=copy)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        return cls._from_sequence([int(x) for x in strings], dtype, copy)
+
     @classmethod
     def _from_factorized(cls, values, original):
         return integer_array(values, dtype=original.dtype)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -661,8 +661,22 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False):
 
     # dispatch on extension dtype if needed
     if is_extension_array_dtype(dtype):
-        return dtype.construct_array_type()._from_sequence(
-            arr, dtype=dtype, copy=copy)
+        if is_object_dtype(arr):
+            try:
+                return dtype.construct_array_type()._from_sequence_of_strings(
+                    arr, dtype=dtype, copy=copy)
+            except AttributeError:
+                dtype = pandas_dtype(dtype)
+                return dtype.construct_array_type()._from_sequence_of_strings(
+                    arr, dtype=dtype, copy=copy)
+        else:
+            try:
+                return dtype.construct_array_type()._from_sequence(
+                    arr, dtype=dtype, copy=copy)
+            except AttributeError:
+                dtype = pandas_dtype(dtype)
+                return dtype.construct_array_type()._from_sequence(
+                    arr, dtype=dtype, copy=copy)
 
     if not isinstance(dtype, np.dtype):
         dtype = pandas_dtype(dtype)
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1886,7 +1886,10 @@ def _get_dtype(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype
     elif isinstance(arr_or_dtype, type):
-        return np.dtype(arr_or_dtype)
+        try:
+            return pandas_dtype(arr_or_dtype)
+        except TypeError:
+            return np.dtype(arr_or_dtype)
     elif isinstance(arr_or_dtype, ExtensionDtype):
         return arr_or_dtype
     elif isinstance(arr_or_dtype, DatetimeTZDtype):
@@ -1904,6 +1907,11 @@ def _get_dtype(arr_or_dtype):
             return PeriodDtype.construct_from_string(arr_or_dtype)
         elif is_interval_dtype(arr_or_dtype):
             return IntervalDtype.construct_from_string(arr_or_dtype)
+        else:
+            try:
+                return pandas_dtype(arr_or_dtype)
+            except TypeError:
+                pass
     elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex,
                                    ABCSparseArray, ABCSparseSeries)):
         return arr_or_dtype.dtype
@@ -1934,7 +1942,15 @@ def _get_dtype_type(arr_or_dtype):
     if isinstance(arr_or_dtype, np.dtype):
         return arr_or_dtype.type
     elif isinstance(arr_or_dtype, type):
-        return np.dtype(arr_or_dtype).type
+        try:
+            dtype = pandas_dtype(arr_or_dtype)
+            try:
+                return dtype.type
+            except AttributeError:
+                raise TypeError
+        except TypeError:
+            return np.dtype(arr_or_dtype).type
+
     elif isinstance(arr_or_dtype, CategoricalDtype):
         return CategoricalDtypeType
     elif isinstance(arr_or_dtype, DatetimeTZDtype):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4211,7 +4211,10 @@ def _try_cast(arr, take_fast_path):
             # that we can convert the data to the requested dtype.
             if is_integer_dtype(dtype):
                 subarr = maybe_cast_to_integer_array(arr, dtype)
-
+            if is_extension_array_dtype(dtype):
+                # create an extension array from its dtype
+                array_type = dtype.construct_array_type()._from_sequence
+                return array_type(arr, dtype=dtype, copy=copy)
             subarr = maybe_cast_to_datetime(arr, dtype)
             # Take care in creating object arrays (but iterators are not
             # supported):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -28,7 +28,8 @@
 from pandas.core.dtypes.common import (
     ensure_object, is_categorical_dtype, is_dtype_equal, is_float, is_integer,
     is_integer_dtype, is_list_like, is_object_dtype, is_scalar,
-    is_string_dtype)
+    is_string_dtype, is_extension_array_dtype,
+)
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import isna
 
@@ -1590,15 +1591,17 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,
                     try_num_bool=False)
             else:
                 # skip inference if specified dtype is object
-                try_num_bool = not (cast_type and is_string_dtype(cast_type))
+                try_num_bool = not (cast_type and (is_string_dtype(cast_type)
+                                                   or is_extension_array_dtype(cast_type)))
 
                 # general type inference and conversion
                 cvals, na_count = self._infer_types(
                     values, set(col_na_values) | col_na_fvalues,
                     try_num_bool)
 
                 # type specified in dtype param
-                if cast_type and not is_dtype_equal(cvals, cast_type):
+                if cast_type and (not is_dtype_equal(cvals, cast_type)
+                        or is_extension_array_dtype(cast_type)):
                     cvals = self._cast_types(cvals, cast_type, c)
 
             result[c] = cvals
diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py
@@ -0,0 +1,19 @@
+import pandas as pd
+from pandas.compat import StringIO
+from pandas.core.arrays.integer import Int64Dtype
+from .base import BaseExtensionTests
+
+
+class ExtensionParsingTests(BaseExtensionTests):
+    def test_EA_types(self):
+        df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'),
+                           'A': [1, 2, 1]})
+        data = df.to_csv(index=False)
+        result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype})
+        assert result is not None
+
+        df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'),
+                           'A': [1, 2, 1]})
+        data = df.to_csv(index=False)
+        result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'})
+        assert result is not None
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -73,6 +73,11 @@ def dtype(self):
     def _from_sequence(cls, scalars, dtype=None, copy=False):
         return cls(scalars)
 
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):
+        return cls._from_sequence([decimal.Decimal(x) for x in strings],
+                                  dtype, copy)
+
     @classmethod
     def _from_factorized(cls, values, original):
         return cls(values)
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -8,6 +8,8 @@
 import platform
 import re
 import sys
+import decimal
+from io import TextIOWrapper
 
 import numpy as np
 import pytest
@@ -23,6 +25,7 @@
 from pandas.io.common import URLError
 from pandas.io.parsers import TextFileReader, TextParser
 from pandas.core.arrays.integer import Int64Dtype
+from pandas.tests.extension.decimal import DecimalDtype
 
 
 class ParserTests(object):
@@ -1630,13 +1633,23 @@ def test_buffer_rd_bytes_bad_unicode(self):
 
     def test_EA_types(self):
         df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'),
-                           'A': [1, 2, 1]})
+                           'A': pd.Series([1, 2, 1], dtype=Int64Dtype)})
         data = df.to_csv(index=False)
-        result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype})
-        assert result is not None
+        result = pd.read_csv(StringIO(data), dtype={'Int': 'Int64',
+        'A': Int64Dtype})
+        tm.assert_frame_equal(df, result)
 
         df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'),
                            'A': [1, 2, 1]})
         data = df.to_csv(index=False)
         result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'})
-        assert result is not None
+        tm.assert_frame_equal(df, result)
+
+        df = pd.DataFrame({'Dec': pd.Series([decimal.Decimal('1.234'),
+                                             decimal.Decimal('2.123'),
+                                             decimal.Decimal('4.521')],
+                                            dtype=DecimalDtype),
+                           'A': [1, 2, 1]})
+        data = df.to_csv(index=False)
+        result = pd.read_csv(StringIO(data), dtype={'Dec': DecimalDtype})
+        tm.assert_frame_equal(df, result)