From c4e0058a15610e4233692c8d8d409bb4beb6cdc4 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 20 Oct 2018 14:47:55 -0400 Subject: [PATCH 01/22] ENH:Add EA types to read CSV Closes GH23228 --- pandas/_libs/parsers.pyx | 10 ++++++++-- pandas/core/dtypes/common.py | 5 ++++- pandas/tests/io/parser/common.py | 0 3 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/io/parser/common.py diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 169aa4ffe72da..a32b31c89bc4b 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1229,7 +1229,10 @@ cdef class TextReader: na_count = 0 if result is not None and dtype != 'int64': - result = result.astype(dtype) + try: + result = result.astype(dtype) + except TypeError: + result = result.astype(dtype.numpy_dtype) return result, na_count @@ -1238,7 +1241,10 @@ cdef class TextReader: na_filter, na_hashset, na_flist) if result is not None and dtype != 'float64': - result = result.astype(dtype) + try: + result = result.astype(dtype) + except TypeError: + result = result.astype(dtype.numpy_dtype) return result, na_count elif is_bool_dtype(dtype): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e1141c6b6b3a8..1f09ed0624f68 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1872,7 +1872,10 @@ def _get_dtype_type(arr_or_dtype): try: return arr_or_dtype.dtype.type except AttributeError: - return type(None) + try: + return arr_or_dtype.numpy_dtype.type + except AttributeError: + return type(None) def _get_dtype_from_object(dtype): diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 744d45070dce2e6cc71df32eb8e7b1576ef943cd Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 3 Nov 2018 14:19:03 -0400 Subject: [PATCH 02/22] Address merge comments --- pandas/_libs/parsers.pyx | 15 ++++++++------- 1 file changed, 8 
insertions(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a32b31c89bc4b..559c1a56a45a2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,7 +50,8 @@ from pandas.core.dtypes.common import ( is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype) + pandas_dtype, is_extension_array_dtype, +) from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom @@ -1229,10 +1230,10 @@ cdef class TextReader: na_count = 0 if result is not None and dtype != 'int64': - try: - result = result.astype(dtype) - except TypeError: + if is_extension_array_dtype(dtype): result = result.astype(dtype.numpy_dtype) + else: + result = result.astype(dtype) return result, na_count @@ -1241,10 +1242,10 @@ cdef class TextReader: na_filter, na_hashset, na_flist) if result is not None and dtype != 'float64': - try: - result = result.astype(dtype) - except TypeError: + if is_extension_array_dtype(dtype): result = result.astype(dtype.numpy_dtype) + else: + result = result.astype(dtype) return result, na_count elif is_bool_dtype(dtype): From 8368d19cb53b818188c778c8290791266e2fc721 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Fri, 23 Nov 2018 12:57:18 -0500 Subject: [PATCH 03/22] WIP:Make python engine support EA types when reading CSVs The C engine is the real WIP. 
--- pandas/_libs/parsers.pyx | 12 ++++++++++-- pandas/core/arrays/base.py | 21 +++++++++++++++++++++ pandas/core/arrays/integer.py | 6 +++++- pandas/core/dtypes/cast.py | 18 ++++++++++++++++-- pandas/core/dtypes/common.py | 20 ++++++++++++++++++-- pandas/io/parsers.py | 8 +++++--- pandas/tests/extension/base/io.py | 19 +++++++++++++++++++ pandas/tests/extension/decimal/array.py | 5 +++++ 8 files changed, 99 insertions(+), 10 deletions(-) create mode 100644 pandas/tests/extension/base/io.py diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 559c1a56a45a2..cac2ceb9c0fdf 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1231,7 +1231,11 @@ cdef class TextReader: if result is not None and dtype != 'int64': if is_extension_array_dtype(dtype): - result = result.astype(dtype.numpy_dtype) + try: + result = dtype.construct_array_type()._from_sequence( + result, dtype=dtype) + except Exception as e: + raise else: result = result.astype(dtype) @@ -1243,7 +1247,11 @@ cdef class TextReader: if result is not None and dtype != 'float64': if is_extension_array_dtype(dtype): - result = result.astype(dtype.numpy_dtype) + try: + result = dtype.construct_array_type()._from_sequence( + result) + except Exception as e: + raise else: result = result.astype(dtype) return result, na_count diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2d4f8ca9c2cee..e1137b070cc58 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -128,6 +128,27 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): """ raise AbstractMethodError(cls) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + """Construct a new ExtensionArray from a sequence of scalars. + + Parameters + ---------- + strings : Sequence + Each element will be an instance of the scalar type for this + array, ``cls.dtype.type``. + dtype : dtype, optional + Construct for this particular dtype. 
This should be a Dtype + compatible with the ExtensionArray. + copy : boolean, default False + If True, copy the underlying data. + + Returns + ------- + ExtensionArray + """ + raise AbstractMethodError(cls) + @classmethod def _from_factorized(cls, values, original): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 38dc68e8f77a3..2c4d35e633e16 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -154,7 +154,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): dtype = dtype.lower() if not issubclass(type(dtype), _IntegerDtype): try: - dtype = _dtypes[str(np.dtype(dtype))] + dtype = _dtypes[str(np.dtype(dtype.name.lower()))] except KeyError: raise ValueError("invalid dtype specified {}".format(dtype)) @@ -261,6 +261,10 @@ def __init__(self, values, mask, copy=False): def _from_sequence(cls, scalars, dtype=None, copy=False): return integer_array(scalars, dtype=dtype, copy=copy) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence([int(x) for x in strings], dtype, copy) + @classmethod def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index eae9eb97f35fe..ac370491db9a7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -615,8 +615,22 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): # dispatch on extension dtype if needed if is_extension_array_dtype(dtype): - return dtype.construct_array_type()._from_sequence( - arr, dtype=dtype, copy=copy) + if is_object_dtype(arr): + try: + return dtype.construct_array_type()._from_sequence_of_strings( + arr, dtype=dtype, copy=copy) + except AttributeError: + dtype = pandas_dtype(dtype) + return dtype.construct_array_type()._from_sequence_of_strings( + arr, dtype=dtype, copy=copy) + else: + try: + return dtype.construct_array_type()._from_sequence( + 
arr, dtype=dtype, copy=copy) + except AttributeError: + dtype = pandas_dtype(dtype) + return dtype.construct_array_type()._from_sequence( + arr, dtype=dtype, copy=copy) if not isinstance(dtype, np.dtype): dtype = pandas_dtype(dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1f09ed0624f68..338c30ceeca6e 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1795,7 +1795,10 @@ def _get_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype) + try: + return pandas_dtype(arr_or_dtype) + except TypeError: + return np.dtype(arr_or_dtype) elif isinstance(arr_or_dtype, ExtensionDtype): return arr_or_dtype elif isinstance(arr_or_dtype, DatetimeTZDtype): @@ -1813,6 +1816,11 @@ def _get_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) elif is_interval_dtype(arr_or_dtype): return IntervalDtype.construct_from_string(arr_or_dtype) + else: + try: + return pandas_dtype(arr_or_dtype) + except TypeError: + pass elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex, ABCSparseArray, ABCSparseSeries)): return arr_or_dtype.dtype @@ -1843,7 +1851,15 @@ def _get_dtype_type(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype).type + try: + dtype = pandas_dtype(arr_or_dtype) + try: + return dtype.type + except AttributeError: + raise TypeError + except TypeError: + return np.dtype(arr_or_dtype).type + elif isinstance(arr_or_dtype, CategoricalDtype): return CategoricalDtypeType elif isinstance(arr_or_dtype, DatetimeTZDtype): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index de0ed9407e161..b8ade3250d34c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -29,7 +29,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, 
is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype) + is_scalar, is_string_dtype, is_extension_array_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -1660,7 +1660,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try_num_bool=False) else: # skip inference if specified dtype is object - try_num_bool = not (cast_type and is_string_dtype(cast_type)) + try_num_bool = not (cast_type and (is_string_dtype(cast_type) + or is_extension_array_dtype(cast_type))) # general type inference and conversion cvals, na_count = self._infer_types( @@ -1668,7 +1669,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try_num_bool) # type specified in dtype param - if cast_type and not is_dtype_equal(cvals, cast_type): + if cast_type and not (is_dtype_equal(cvals, cast_type) + or is_extension_array_dtype(cast_type)): try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py new file mode 100644 index 0000000000000..d96590b4699a0 --- /dev/null +++ b/pandas/tests/extension/base/io.py @@ -0,0 +1,19 @@ +import pandas as pd +from pandas.compat import StringIO +from pandas.core.arrays.integer import Int64Dtype +from .base import BaseExtensionTests + + +class ExtensionParsingTests(BaseExtensionTests): + def test_EA_types(self): + df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), + 'A': [1, 2, 1]}) + data = df.to_csv(index=False) + result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}) + assert result is not None + + df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'), + 'A': [1, 2, 1]}) + data = df.to_csv(index=False) + result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'}) + assert result is not None diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 
7e618dfd2b92e..1823eeb4d7fc0 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -75,6 +75,11 @@ def dtype(self): def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars) + @classmethod + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): + return cls._from_sequence([decimal.Decimal(x) for x in strings], + dtype, copy) + @classmethod def _from_factorized(cls, values, original): return cls(values) From fc9ba8ec5620df3e9889fa781ae0d85204ccd3a2 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 1 Dec 2018 12:37:14 -0500 Subject: [PATCH 04/22] Fix CI --- pandas/core/arrays/integer.py | 5 ++++- pandas/core/dtypes/cast.py | 14 ++++++++++---- pandas/core/indexes/numeric.py | 6 ++++-- pandas/tests/extension/base/io.py | 18 +++++++++++++++++- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2c4d35e633e16..aeb510e002241 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -154,7 +154,10 @@ def coerce_to_array(values, dtype, mask=None, copy=False): dtype = dtype.lower() if not issubclass(type(dtype), _IntegerDtype): try: - dtype = _dtypes[str(np.dtype(dtype.name.lower()))] + try: + dtype = _dtypes[str(np.dtype(dtype.name.lower()))] + except AttributeError: + dtype = _dtypes[str(np.dtype(dtype.lower()))] except KeyError: raise ValueError("invalid dtype specified {}".format(dtype)) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ac370491db9a7..b925974d060c6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.errors import AbstractMethodError from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import OutOfBoundsDatetime, Period, iNaT from pandas.compat import PY3, string_types, text_type, to_str @@ -617,12 +618,17 @@ def astype_nansafe(arr, dtype, copy=True, 
skipna=False): if is_extension_array_dtype(dtype): if is_object_dtype(arr): try: - return dtype.construct_array_type()._from_sequence_of_strings( - arr, dtype=dtype, copy=copy) + array_type = dtype.construct_array_type() except AttributeError: dtype = pandas_dtype(dtype) - return dtype.construct_array_type()._from_sequence_of_strings( - arr, dtype=dtype, copy=copy) + array_type = dtype.construct_array_type() + try: + # use _from_sequence_of_strings if the class defines it + return array_type._from_sequence_of_strings(arr, + dtype=dtype, + copy=copy) + except AbstractMethodError: + return array_type._from_sequence(arr, dtype=dtype, copy=copy) else: try: return dtype.construct_array_type()._from_sequence( diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 9d6a56200df6e..c214f567e435a 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -8,7 +8,8 @@ from pandas.core.dtypes.common import ( is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype, - is_scalar, needs_i8_conversion, pandas_dtype) + is_scalar, needs_i8_conversion, pandas_dtype, is_extension_array_dtype, +) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -328,7 +329,8 @@ def astype(self, dtype, copy=True): msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' 'values are required for conversion').format(dtype=dtype) raise TypeError(msg) - elif is_integer_dtype(dtype) and self.hasnans: + elif is_integer_dtype(dtype) and self.hasnans and \ + not is_extension_array_dtype(dtype): # GH 13149 raise ValueError('Cannot convert NA to integer') return super(Float64Index, self).astype(dtype, copy=copy) diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index d96590b4699a0..fd429f70565d5 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -1,8 +1,24 @@ import pandas as pd from pandas.compat import StringIO -from 
pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.integer import ( + Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, + UInt32Dtype, UInt64Dtype) from .base import BaseExtensionTests +def make_data(): + return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) + + [np.nan] + [99, 100]) + + +@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return integer_array(make_data(), dtype=dtype) class ExtensionParsingTests(BaseExtensionTests): def test_EA_types(self): From 1d9ee6548df3542130bdd8de1b7e956ade0979db Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 1 Dec 2018 12:44:54 -0500 Subject: [PATCH 05/22] Formatting --- pandas/io/parsers.py | 2 +- pandas/tests/extension/base/io.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b8ade3250d34c..1493203d08b3c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1661,7 +1661,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: # skip inference if specified dtype is object try_num_bool = not (cast_type and (is_string_dtype(cast_type) - or is_extension_array_dtype(cast_type))) + or is_extension_array_dtype(cast_type))) # noqa # general type inference and conversion cvals, na_count = self._infer_types( diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index fd429f70565d5..db8241bfdc869 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -1,10 +1,14 @@ +import pytest import pandas as pd +import numpy as np from pandas.compat import StringIO from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype) + UInt32Dtype, UInt64Dtype, integer_array, +) from 
.base import BaseExtensionTests + def make_data(): return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100]) @@ -20,6 +24,7 @@ def dtype(request): def data(dtype): return integer_array(make_data(), dtype=dtype) + class ExtensionParsingTests(BaseExtensionTests): def test_EA_types(self): df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), From d98ce71a86a3c04c29cdf949a2235fcd12fa5988 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 1 Dec 2018 13:26:19 -0500 Subject: [PATCH 06/22] Make C engine the same as python parser engine --- pandas/_libs/parsers.pyx | 35 +++++++++++++++++-------- pandas/tests/extension/base/__init__.py | 1 + 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cac2ceb9c0fdf..767ef9d2f43c9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -56,8 +56,10 @@ from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom -from pandas.errors import (ParserError, DtypeWarning, - EmptyDataError, ParserWarning) +from pandas.errors import ( + ParserError, DtypeWarning, + EmptyDataError, ParserWarning, AbstractMethodError, +) # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. 
@@ -1232,10 +1234,16 @@ cdef class TextReader: if result is not None and dtype != 'int64': if is_extension_array_dtype(dtype): try: - result = dtype.construct_array_type()._from_sequence( - result, dtype=dtype) - except Exception as e: - raise + array_type = dtype.construct_array_type() + except AttributeError: + dtype = pandas_dtype(dtype) + array_type = dtype.construct_array_type() + try: + # use _from_sequence_of_strings if the class defines it + return array_type._from_sequence_of_strings(result, + dtype=dtype) # noqa + except AbstractMethodError: + return array_type._from_sequence(result, dtype=dtype) else: result = result.astype(dtype) @@ -1248,14 +1256,19 @@ cdef class TextReader: if result is not None and dtype != 'float64': if is_extension_array_dtype(dtype): try: - result = dtype.construct_array_type()._from_sequence( - result) - except Exception as e: - raise + array_type = dtype.construct_array_type() + except AttributeError: + dtype = pandas_dtype(dtype) + array_type = dtype.construct_array_type() + try: + # use _from_sequence_of_strings if the class defines it + return array_type._from_sequence_of_strings(result, + dtype=dtype) # noqa + except AbstractMethodError: + return array_type._from_sequence(result, dtype=dtype) else: result = result.astype(dtype) return result, na_count - elif is_bool_dtype(dtype): result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 57704b77bb233..370f48e8e114e 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -53,3 +53,4 @@ class TestMyDtype(BaseDtypeTests): from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa +from .io import ExtensionParsingTests # noqa From 7b50b2c2d6b3f0e1fb0db41ee6aa36153d2091f9 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 8 
Dec 2018 11:21:32 -0500 Subject: [PATCH 07/22] Merging master --- pandas/_libs/parsers.pyx | 8 ++++---- pandas/core/internals/construction.py | 3 ++- pandas/io/parsers.py | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 767ef9d2f43c9..f9141d2d339b8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1240,10 +1240,10 @@ cdef class TextReader: array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - return array_type._from_sequence_of_strings(result, + result = array_type._from_sequence_of_strings(result, dtype=dtype) # noqa except AbstractMethodError: - return array_type._from_sequence(result, dtype=dtype) + result = array_type._from_sequence(result, dtype=dtype) else: result = result.astype(dtype) @@ -1262,10 +1262,10 @@ cdef class TextReader: array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - return array_type._from_sequence_of_strings(result, + result = array_type._from_sequence_of_strings(result, dtype=dtype) # noqa except AbstractMethodError: - return array_type._from_sequence(result, dtype=dtype) + result = array_type._from_sequence(result, dtype=dtype) else: result = result.astype(dtype) return result, na_count diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b18b966406bbb..86cc535c74614 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -558,7 +558,8 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = np.array(data, copy=False) # possibility of nan -> garbage - if is_float_dtype(data.dtype) and is_integer_dtype(dtype): + if is_float_dtype(data.dtype) and is_integer_dtype(dtype) \ + and not is_extension_array_dtype(dtype): if not isna(data).any(): subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) diff --git a/pandas/io/parsers.py 
b/pandas/io/parsers.py index 1493203d08b3c..c703480e9b0f8 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1669,8 +1669,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try_num_bool) # type specified in dtype param - if cast_type and not (is_dtype_equal(cvals, cast_type) - or is_extension_array_dtype(cast_type)): + if cast_type and not is_dtype_equal(cvals, cast_type): + # or is_extension_array_dtype(cast_type)): try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) From 662e03738f4da4dbc607acbbffd95ed2411946c1 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 8 Dec 2018 11:36:08 -0500 Subject: [PATCH 08/22] Update whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a84fd118061bc..c007c483141e7 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -32,6 +32,8 @@ New features - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. See the :ref:`section on writing HTML ` in the IO docs for example usage. (:issue:`2679`) - :meth:`DataFrame.shift` :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) +- :func:`pandas.read_csv` now supports ``EA`` types as an argument to ``dtype``, + allowing the user to use ``EA`` types when reading CSVs. (:issue:`23228`) .. 
_whatsnew_0240.values_api: From 41f2b4ef9f1268cc3873a4d64708f74cc8d9cff7 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 9 Dec 2018 13:10:26 -0500 Subject: [PATCH 09/22] Fix low_memory C engine parser --- pandas/_libs/parsers.pyx | 9 ++++++--- pandas/core/internals/construction.py | 2 +- pandas/io/parsers.py | 4 ++-- pandas/tests/extension/base/io.py | 10 +++++++--- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index f9141d2d339b8..934b62de63da2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -986,7 +986,6 @@ cdef class TextReader: footer=footer, upcast_na=True) self._end_clock('Type conversion') - self._start_clock() if len(columns) > 0: rows_read = len(list(columns.values())[0]) @@ -1241,7 +1240,7 @@ cdef class TextReader: try: # use _from_sequence_of_strings if the class defines it result = array_type._from_sequence_of_strings(result, - dtype=dtype) # noqa + dtype=dtype) # noqa except AbstractMethodError: result = array_type._from_sequence(result, dtype=dtype) else: @@ -2201,7 +2200,11 @@ def _concatenate_chunks(list chunks): result[name] = union_categoricals(arrs, sort_categories=sort_categories) else: - result[name] = np.concatenate(arrs) + if is_extension_array_dtype(dtype): + result[name] = dtype \ + .construct_array_type()._concat_same_type(arrs) + else: + result[name] = np.concatenate(arrs) if warning_columns: warning_names = ','.join(warning_columns) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 86cc535c74614..855439509c7a6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -559,7 +559,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # possibility of nan -> garbage if is_float_dtype(data.dtype) and is_integer_dtype(dtype) \ - and not is_extension_array_dtype(dtype): + and not is_extension_array_dtype(dtype): if not isna(data).any(): subarr = 
_try_cast(data, True, dtype, copy, raise_cast_failure) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c703480e9b0f8..690440933df41 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1669,8 +1669,8 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try_num_bool) # type specified in dtype param - if cast_type and not is_dtype_equal(cvals, cast_type): - # or is_extension_array_dtype(cast_type)): + if cast_type and (not is_dtype_equal(cvals, cast_type) + or is_extension_array_dtype(cast_type)): try: if (is_bool_dtype(cast_type) and not is_categorical_dtype(cast_type) diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index db8241bfdc869..a2e2accea1bf1 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -26,15 +26,19 @@ def data(dtype): class ExtensionParsingTests(BaseExtensionTests): - def test_EA_types(self): + + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine): df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), 'A': [1, 2, 1]}) data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}) + result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}, + engine=engine) assert result is not None df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'), 'A': [1, 2, 1]}) data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'}) + result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'}, + engine=engine) assert result is not None From d118921e7c6010bbfdd23c9d8af291028c539a77 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 9 Dec 2018 15:47:28 -0500 Subject: [PATCH 10/22] Address merge comments --- pandas/_libs/parsers.pyx | 48 +++++++++++-------------------- pandas/core/arrays/base.py | 3 ++ pandas/core/arrays/integer.py | 6 ++-- pandas/core/dtypes/cast.py | 33 ++++++++------------- pandas/core/dtypes/common.py | 25 
++-------------- pandas/tests/extension/base/io.py | 14 +++------ 6 files changed, 42 insertions(+), 87 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 934b62de63da2..cc46b5a8fc86c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -56,10 +56,8 @@ from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom -from pandas.errors import ( - ParserError, DtypeWarning, - EmptyDataError, ParserWarning, AbstractMethodError, -) +from pandas.errors import ( ParserError, DtypeWarning, + EmptyDataError, ParserWarning ) # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. @@ -1217,6 +1215,18 @@ cdef class TextReader: cats, codes, dtype, true_values=true_values) return cat, na_count + elif is_extension_array_dtype(dtype): + result, na_count = self._string_convert(i, start, end, na_filter, + na_hashset) + try: + # use _from_sequence_of_strings if the class defines it + result = dtype.construct_array_type() \ + ._from_sequence_of_strings(result, dtype=dtype) + except NotImplementedError: + result = dtype.construct_array_type() \ + ._from_sequence(result, dtype=dtype) + return result, na_count + elif is_integer_dtype(dtype): try: result, na_count = _try_int64(self.parser, i, start, @@ -1231,20 +1241,7 @@ cdef class TextReader: na_count = 0 if result is not None and dtype != 'int64': - if is_extension_array_dtype(dtype): - try: - array_type = dtype.construct_array_type() - except AttributeError: - dtype = pandas_dtype(dtype) - array_type = dtype.construct_array_type() - try: - # use _from_sequence_of_strings if the class defines it - result = array_type._from_sequence_of_strings(result, - dtype=dtype) # noqa - except AbstractMethodError: - result = array_type._from_sequence(result, dtype=dtype) - else: - result = result.astype(dtype) + result = 
result.astype(dtype) return result, na_count @@ -1253,20 +1250,7 @@ cdef class TextReader: na_filter, na_hashset, na_flist) if result is not None and dtype != 'float64': - if is_extension_array_dtype(dtype): - try: - array_type = dtype.construct_array_type() - except AttributeError: - dtype = pandas_dtype(dtype) - array_type = dtype.construct_array_type() - try: - # use _from_sequence_of_strings if the class defines it - result = array_type._from_sequence_of_strings(result, - dtype=dtype) # noqa - except AbstractMethodError: - result = array_type._from_sequence(result, dtype=dtype) - else: - result = result.astype(dtype) + result = result.astype(dtype) return result, na_count elif is_bool_dtype(dtype): result, na_count = _try_bool_flex(self.parser, i, start, end, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e1137b070cc58..b4f5f7e8ce27f 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -132,6 +132,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): """Construct a new ExtensionArray from a sequence of scalars. + .. 
versionadded:: 0.24.0 + Parameters ---------- strings : Sequence @@ -146,6 +148,7 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): Returns ------- ExtensionArray + """ raise AbstractMethodError(cls) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index aeb510e002241..691cab2a1ed93 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -19,6 +19,7 @@ from pandas.core import nanops from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.tools.numeric import to_numeric class _IntegerDtype(ExtensionDtype): @@ -157,7 +158,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): try: dtype = _dtypes[str(np.dtype(dtype.name.lower()))] except AttributeError: - dtype = _dtypes[str(np.dtype(dtype.lower()))] + dtype = _dtypes[str(np.dtype(dtype))] except KeyError: raise ValueError("invalid dtype specified {}".format(dtype)) @@ -266,7 +267,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence([int(x) for x in strings], dtype, copy) + scalars = to_numeric(strings, errors='raise') + return cls._from_sequence(scalars, dtype, copy) @classmethod def _from_factorized(cls, values, original): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b925974d060c6..b6b6a4fa1d8e9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -616,27 +616,18 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): # dispatch on extension dtype if needed if is_extension_array_dtype(dtype): - if is_object_dtype(arr): - try: - array_type = dtype.construct_array_type() - except AttributeError: - dtype = pandas_dtype(dtype) - array_type = dtype.construct_array_type() - try: - # use _from_sequence_of_strings if the class defines it - return array_type._from_sequence_of_strings(arr, - dtype=dtype, - copy=copy) - except 
AbstractMethodError: - return array_type._from_sequence(arr, dtype=dtype, copy=copy) - else: - try: - return dtype.construct_array_type()._from_sequence( - arr, dtype=dtype, copy=copy) - except AttributeError: - dtype = pandas_dtype(dtype) - return dtype.construct_array_type()._from_sequence( - arr, dtype=dtype, copy=copy) + try: + array_type = dtype.construct_array_type() + except AttributeError: + dtype = pandas_dtype(dtype) + array_type = dtype.construct_array_type() + try: + # use _from_sequence_of_strings if the class defines it + return array_type._from_sequence_of_strings(arr, + dtype=dtype, + copy=copy) + except NotImplementedError: + return array_type._from_sequence(arr, dtype=dtype, copy=copy) if not isinstance(dtype, np.dtype): dtype = pandas_dtype(dtype) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 338c30ceeca6e..e1141c6b6b3a8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1795,10 +1795,7 @@ def _get_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype elif isinstance(arr_or_dtype, type): - try: - return pandas_dtype(arr_or_dtype) - except TypeError: - return np.dtype(arr_or_dtype) + return np.dtype(arr_or_dtype) elif isinstance(arr_or_dtype, ExtensionDtype): return arr_or_dtype elif isinstance(arr_or_dtype, DatetimeTZDtype): @@ -1816,11 +1813,6 @@ def _get_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) elif is_interval_dtype(arr_or_dtype): return IntervalDtype.construct_from_string(arr_or_dtype) - else: - try: - return pandas_dtype(arr_or_dtype) - except TypeError: - pass elif isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex, ABCSparseArray, ABCSparseSeries)): return arr_or_dtype.dtype @@ -1851,15 +1843,7 @@ def _get_dtype_type(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type elif isinstance(arr_or_dtype, type): - try: - dtype = pandas_dtype(arr_or_dtype) - try: - return dtype.type - 
except AttributeError: - raise TypeError - except TypeError: - return np.dtype(arr_or_dtype).type - + return np.dtype(arr_or_dtype).type elif isinstance(arr_or_dtype, CategoricalDtype): return CategoricalDtypeType elif isinstance(arr_or_dtype, DatetimeTZDtype): @@ -1888,10 +1872,7 @@ def _get_dtype_type(arr_or_dtype): try: return arr_or_dtype.dtype.type except AttributeError: - try: - return arr_or_dtype.numpy_dtype.type - except AttributeError: - return type(None) + return type(None) def _get_dtype_from_object(dtype): diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index a2e2accea1bf1..55b235ce75521 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -28,17 +28,11 @@ def data(dtype): class ExtensionParsingTests(BaseExtensionTests): @pytest.mark.parametrize('engine', ['c', 'python']) - def test_EA_types(self, engine): - df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), - 'A': [1, 2, 1]}) + def test_EA_types(self, engine, data): + df = pd.DataFrame({'Int': pd.Series(data, dtype=str(data.dtype)), + 'A': data}) data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}, + result = pd.read_csv(StringIO(data), dtype={'Int': str(data.dtype)}, engine=engine) assert result is not None - df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'), - 'A': [1, 2, 1]}) - data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'}, - engine=engine) - assert result is not None From b2dd5e49678458211a6dd7a81fa87630f9a3c264 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Fri, 21 Dec 2018 16:48:06 -0500 Subject: [PATCH 11/22] Require EAs to implement _from_sequence_of_strings to be used in parsers --- pandas/_libs/parsers.pyx | 8 ++++-- pandas/core/arrays/base.py | 4 +-- pandas/core/arrays/integer.py | 2 +- pandas/core/dtypes/cast.py | 1 - pandas/tests/extension/base/io.py | 1 - pandas/tests/extension/test_common.py | 40 
+++++++++++++++++++++++++++ 6 files changed, 49 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cc46b5a8fc86c..0dbbaab02353a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1223,8 +1223,12 @@ cdef class TextReader: result = dtype.construct_array_type() \ ._from_sequence_of_strings(result, dtype=dtype) except NotImplementedError: - result = dtype.construct_array_type() \ - ._from_sequence(result, dtype=dtype) + raise NotImplementedError( + "Extension Array: {ea} must implement " + "_from_sequence_of_strings in order " + "to be used in parser methods".format( + ea=dtype.construct_array_type())) + return result, na_count elif is_integer_dtype(dtype): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b4f5f7e8ce27f..bb8364c96a5dc 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -130,7 +130,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - """Construct a new ExtensionArray from a sequence of scalars. + """Construct a new ExtensionArray from a sequence of strings. .. 
versionadded:: 0.24.0 @@ -150,7 +150,7 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): ExtensionArray """ - raise AbstractMethodError(cls) + raise NotImplementedError(cls) @classmethod def _from_factorized(cls, values, original): diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 691cab2a1ed93..68b823f7c5a11 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -267,7 +267,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - scalars = to_numeric(strings, errors='raise') + scalars = to_numeric(strings, errors="raise") return cls._from_sequence(scalars, dtype, copy) @classmethod diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b6b6a4fa1d8e9..5fa0bcc535e6b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -4,7 +4,6 @@ import numpy as np -from pandas.errors import AbstractMethodError from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import OutOfBoundsDatetime, Period, iNaT from pandas.compat import PY3, string_types, text_type, to_str diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 55b235ce75521..6c59816540a0a 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -35,4 +35,3 @@ def test_EA_types(self, engine, data): result = pd.read_csv(StringIO(data), dtype={'Int': str(data.dtype)}, engine=engine) assert result is not None - diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 2bc4bf5df2298..0551ad469869f 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -7,6 +7,27 @@ import pandas as pd from pandas.core.arrays import ExtensionArray import pandas.util.testing as tm +from pandas.compat import StringIO +from pandas.core.arrays.integer import ( + Int8Dtype, 
Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, + UInt32Dtype, UInt64Dtype, integer_array, +) + + +def make_data(): + return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) + + [np.nan] + [99, 100]) + + +@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, + UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return integer_array(make_data(), dtype=dtype) class DummyDtype(dtypes.ExtensionDtype): @@ -92,3 +113,22 @@ def test_is_not_extension_array_dtype(dtype): def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) assert is_extension_array_dtype(dtype) + + +@pytest.mark.parametrize('engine', ['c', 'python']) +def test_EA_types(engine): + df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), + 'A': [1, 2, 1]}) + data = df.to_csv(index=False) + result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}, + engine=engine) + assert result is not None + tm.assert_frame_equal(result, df) + + df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'), + 'A': [1, 2, 1]}) + data = df.to_csv(index=False) + result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'}, + engine=engine) + assert result is not None + tm.assert_frame_equal(result, df) From f1f5aaa03ea0fa2887d1eed6d0bc749b0c87d8a6 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 22 Dec 2018 11:03:41 -0500 Subject: [PATCH 12/22] pep8 --- pandas/_libs/parsers.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 0dbbaab02353a..22d559f3991ee 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1224,10 +1224,10 @@ cdef class TextReader: ._from_sequence_of_strings(result, dtype=dtype) except NotImplementedError: raise NotImplementedError( - "Extension Array: {ea} must implement " - "_from_sequence_of_strings in order " - "to be used in parser 
methods".format( - ea=dtype.construct_array_type())) + "Extension Array: {ea} must implement " + "_from_sequence_of_strings in order " + "to be used in parser methods".format( + ea=dtype.construct_array_type())) return result, na_count From b96da9dc3c49e281009422d873c24070836f9307 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 23 Dec 2018 12:38:37 -0500 Subject: [PATCH 13/22] Address merge comments Reverted unintended changes to whatsnew Raise AbstractMethodError instead of NotImplemented Clean up changes in parsers.pyx --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/_libs/parsers.pyx | 12 ++++++------ pandas/core/arrays/base.py | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index c007c483141e7..8f8f6cac8bad9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -31,6 +31,7 @@ New features - :func:`read_feather` now accepts ``columns`` as an argument, allowing the user to specify which columns should be read. (:issue:`24025`) - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. See the :ref:`section on writing HTML ` in the IO docs for example usage. (:issue:`2679`) +- :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`) - :meth:`DataFrame.shift` :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. 
(:issue:`15486`) - :func:`pandas.read_csv` now supports ``EA`` types as an argument to ``dtype``, allowing the user to use ``EA`` types when reading CSVs. (:issue:`23228`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 22d559f3991ee..4372dbde37ba6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1218,16 +1218,16 @@ cdef class TextReader: elif is_extension_array_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) + array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it - result = dtype.construct_array_type() \ - ._from_sequence_of_strings(result, dtype=dtype) + result = array_type._from_sequence_of_strings(result, + dtype=dtype) except NotImplementedError: raise NotImplementedError( "Extension Array: {ea} must implement " "_from_sequence_of_strings in order " - "to be used in parser methods".format( - ea=dtype.construct_array_type())) + "to be used in parser methods".format(ea=array_type)) return result, na_count @@ -2189,8 +2189,8 @@ def _concatenate_chunks(list chunks): sort_categories=sort_categories) else: if is_extension_array_dtype(dtype): - result[name] = dtype \ - .construct_array_type()._concat_same_type(arrs) + array_type = dtype.construct_array_type() + result[name] = array_type._concat_same_type(arrs) else: result[name] = np.concatenate(arrs) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bb8364c96a5dc..1b46d9e3c2c48 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -150,7 +150,7 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): ExtensionArray """ - raise NotImplementedError(cls) + raise AbstractMethodError(cls) @classmethod def _from_factorized(cls, values, original): From 50ca0c3b194973a5fb92011625266ea0b2010f05 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 23 Dec 2018 13:10:33 -0500 Subject: [PATCH 14/22] Do EA type inference in the 
parser method rather than the cast method --- pandas/core/dtypes/cast.py | 14 ++------------ pandas/io/parsers.py | 22 ++++++++++++++++++++-- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5fa0bcc535e6b..eae9eb97f35fe 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -615,18 +615,8 @@ def astype_nansafe(arr, dtype, copy=True, skipna=False): # dispatch on extension dtype if needed if is_extension_array_dtype(dtype): - try: - array_type = dtype.construct_array_type() - except AttributeError: - dtype = pandas_dtype(dtype) - array_type = dtype.construct_array_type() - try: - # use _from_sequence_of_strings if the class defines it - return array_type._from_sequence_of_strings(arr, - dtype=dtype, - copy=copy) - except NotImplementedError: - return array_type._from_sequence(arr, dtype=dtype, copy=copy) + return dtype.construct_array_type()._from_sequence( + arr, dtype=dtype, copy=copy) if not isinstance(dtype, np.dtype): dtype = pandas_dtype(dtype) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 690440933df41..638d014d7f9a5 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -29,7 +29,8 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype, is_extension_array_dtype) + is_scalar, is_string_dtype, is_extension_array_dtype, pandas_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna @@ -1661,7 +1662,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, else: # skip inference if specified dtype is object try_num_bool = not (cast_type and (is_string_dtype(cast_type) - or is_extension_array_dtype(cast_type))) # noqa + or is_extension_array_dtype(cast_type))) # noqa # general type inference and conversion cvals, 
na_count = self._infer_types( @@ -1767,6 +1768,23 @@ def _cast_types(self, values, cast_type, column): cats, cats.get_indexer(values), cast_type, true_values=self.true_values) + elif is_extension_array_dtype(cast_type): + try: + array_type = cast_type.construct_array_type() + except AttributeError: + cast_type = pandas_dtype(cast_type) + array_type = cast_type.construct_array_type() + + try: + # use _from_sequence_of_strings if the class defines it + return array_type._from_sequence_of_strings(values, + dtype=cast_type) + except NotImplementedError: + raise NotImplementedError( + "Extension Array: {ea} must implement " + "_from_sequence_of_strings in order " + "to be used in parser methods".format(ea=array_type)) + else: try: values = astype_nansafe(values, cast_type, From ecf26532a5064a070af5cac858d397545b323332 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sat, 29 Dec 2018 10:54:42 -0500 Subject: [PATCH 15/22] Fix imports using isort --- pandas/core/indexes/numeric.py | 5 ++--- pandas/io/parsers.py | 5 ++--- pandas/tests/extension/base/io.py | 10 ++++++---- pandas/tests/extension/test_common.py | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index c214f567e435a..74ff84106240b 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -7,9 +7,8 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype, - is_scalar, needs_i8_conversion, pandas_dtype, is_extension_array_dtype, -) + is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, + is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 638d014d7f9a5..d6bf12a43ab66 100755 --- a/pandas/io/parsers.py 
+++ b/pandas/io/parsers.py @@ -28,9 +28,8 @@ from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_dtype_equal, - is_float, is_integer, is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype, is_extension_array_dtype, pandas_dtype, -) + is_extension_array_dtype, is_float, is_integer, is_integer_dtype, + is_list_like, is_object_dtype, is_scalar, is_string_dtype, pandas_dtype) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import isna diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 6c59816540a0a..c2d94624ec8e9 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -1,11 +1,13 @@ -import pytest -import pandas as pd import numpy as np +import pytest + from pandas.compat import StringIO + +import pandas as pd from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype, integer_array, -) + UInt32Dtype, UInt64Dtype, integer_array) + from .base import BaseExtensionTests diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 0551ad469869f..4356ea54e62b8 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -1,17 +1,17 @@ import numpy as np import pytest +from pandas.compat import StringIO + from pandas.core.dtypes import dtypes from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd from pandas.core.arrays import ExtensionArray -import pandas.util.testing as tm -from pandas.compat import StringIO from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype, integer_array, -) + UInt32Dtype, UInt64Dtype, integer_array) +import pandas.util.testing as tm def make_data(): From 
5cce856c7301d8d30f50e864d8cb79a58c85fee9 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 30 Dec 2018 13:17:59 -0500 Subject: [PATCH 16/22] Correct tests to match existing patterns, address merge comments Ensure that dtype is always an instantiated object --- pandas/_libs/parsers.pyx | 11 +++--- pandas/core/arrays/integer.py | 5 +-- pandas/core/dtypes/common.py | 3 ++ pandas/core/indexes/numeric.py | 5 ++- pandas/core/internals/construction.py | 3 +- pandas/io/parsers.py | 19 +++++++--- pandas/tests/extension/base/__init__.py | 2 +- pandas/tests/extension/base/io.py | 35 ++++++------------- pandas/tests/extension/test_categorical.py | 4 +++ pandas/tests/extension/test_common.py | 40 ---------------------- pandas/tests/extension/test_integer.py | 4 +++ pandas/tests/extension/test_interval.py | 8 +++++ pandas/tests/extension/test_numpy.py | 4 +++ pandas/tests/extension/test_period.py | 8 +++++ pandas/tests/extension/test_sparse.py | 9 +++++ 15 files changed, 75 insertions(+), 85 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 4372dbde37ba6..e7e6b36b7e5ef 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -50,14 +50,13 @@ from pandas.core.dtypes.common import ( is_integer_dtype, is_float_dtype, is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype, is_extension_array_dtype, -) + pandas_dtype, is_extension_array_dtype) from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom -from pandas.errors import ( ParserError, DtypeWarning, - EmptyDataError, ParserWarning ) +from pandas.errors import (ParserError, DtypeWarning, + EmptyDataError, ParserWarning) # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. 
@@ -1123,7 +1122,9 @@ cdef class TextReader: if na_filter: self._free_na_set(na_hashset) - if upcast_na and na_count > 0: + try_upcast = upcast_na and na_count > 0 + # don't try to upcast EAs + if try_upcast and not is_extension_array_dtype(col_dtype): col_res = _maybe_upcast(col_res) if col_res is None: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 68b823f7c5a11..eaec76b96a24d 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -155,10 +155,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False): dtype = dtype.lower() if not issubclass(type(dtype), _IntegerDtype): try: - try: - dtype = _dtypes[str(np.dtype(dtype.name.lower()))] - except AttributeError: - dtype = _dtypes[str(np.dtype(dtype))] + dtype = _dtypes[str(np.dtype(dtype))] except KeyError: raise ValueError("invalid dtype specified {}".format(dtype)) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e1141c6b6b3a8..b67f57e4aeee3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1986,6 +1986,9 @@ def pandas_dtype(dtype): # registered extension types result = _pandas_registry.find(dtype) or registry.find(dtype) if result is not None: + # ensure result is an instantiated type + if isinstance(result, type): + return result() return result # un-registered extension types diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 74ff84106240b..445cadfa18dfc 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -7,7 +7,7 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - is_bool, is_bool_dtype, is_dtype_equal, is_extension_array_dtype, is_float, + is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -328,8 +328,7 @@ def 
astype(self, dtype, copy=True): msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' 'values are required for conversion').format(dtype=dtype) raise TypeError(msg) - elif is_integer_dtype(dtype) and self.hasnans and \ - not is_extension_array_dtype(dtype): + elif is_integer_dtype(dtype) and self.hasnans: # GH 13149 raise ValueError('Cannot convert NA to integer') return super(Float64Index, self).astype(dtype, copy=copy) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 855439509c7a6..b18b966406bbb 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -558,8 +558,7 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = np.array(data, copy=False) # possibility of nan -> garbage - if is_float_dtype(data.dtype) and is_integer_dtype(dtype) \ - and not is_extension_array_dtype(dtype): + if is_float_dtype(data.dtype) and is_integer_dtype(dtype): if not isna(data).any(): subarr = _try_cast(data, True, dtype, copy, raise_cast_failure) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d6bf12a43ab66..74bdd25f7090b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1659,16 +1659,18 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: + is_str_or_ea_dtype = (is_string_dtype(cast_type) + or is_extension_array_dtype(cast_type)) # skip inference if specified dtype is object - try_num_bool = not (cast_type and (is_string_dtype(cast_type) - or is_extension_array_dtype(cast_type))) # noqa + # or casting to an EA + try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) - # type specified in dtype param + # type specified in dtype param or cast_type is an EA if cast_type and (not is_dtype_equal(cvals, cast_type) or 
is_extension_array_dtype(cast_type)): try: @@ -1767,6 +1769,7 @@ def _cast_types(self, values, cast_type, column): cats, cats.get_indexer(values), cast_type, true_values=self.true_values) + # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): try: array_type = cast_type.construct_array_type() @@ -1775,7 +1778,6 @@ def _cast_types(self, values, cast_type, column): array_type = cast_type.construct_array_type() try: - # use _from_sequence_of_strings if the class defines it return array_type._from_sequence_of_strings(values, dtype=cast_type) except NotImplementedError: @@ -2193,7 +2195,14 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] - self.dtype = kwds['dtype'] + + # convert dtype to a pandas_dtype + dtype = kwds['dtype'] + if isinstance(dtype, dict): + self.dtype = {k: pandas_dtype(dtype[k]) + for k in dtype} + else: + self.dtype = dtype self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 370f48e8e114e..1f7ee2ae17e4a 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -53,4 +53,4 @@ class TestMyDtype(BaseDtypeTests): from .missing import BaseMissingTests # noqa from .reshaping import BaseReshapingTests # noqa from .setitem import BaseSetitemTests # noqa -from .io import ExtensionParsingTests # noqa +from .io import BaseParsingTests # noqa diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index c2d94624ec8e9..41417b086fc9a 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -4,36 +4,21 @@ from pandas.compat import StringIO import pandas as pd -from pandas.core.arrays.integer import ( - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype, integer_array) +import pandas.testing as tm from .base import 
BaseExtensionTests -def make_data(): - return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) - + [np.nan] + [99, 100]) - - -@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) -def dtype(request): - return request.param() - - -@pytest.fixture -def data(dtype): - return integer_array(make_data(), dtype=dtype) - - -class ExtensionParsingTests(BaseExtensionTests): +class BaseParsingTests(BaseExtensionTests): @pytest.mark.parametrize('engine', ['c', 'python']) def test_EA_types(self, engine, data): - df = pd.DataFrame({'Int': pd.Series(data, dtype=str(data.dtype)), - 'A': data}) - data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': str(data.dtype)}, - engine=engine) + df = pd.DataFrame({ + 'with_dtype': pd.Series(data, dtype=str(data.dtype)) + }) + csv_output = df.to_csv(index=False, na_rep=np.nan) + result = pd.read_csv(StringIO(csv_output), dtype={ + 'with_dtype': str(data.dtype) + }, engine=engine) assert result is not None + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index c876db416470c..ac52d8f15b8ce 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -237,3 +237,7 @@ def _compare_other(self, s, data, op_name, other): else: with pytest.raises(TypeError): op(data, other) + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 4356ea54e62b8..2bc4bf5df2298 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -1,35 +1,14 @@ import numpy as np import pytest -from pandas.compat import StringIO - from pandas.core.dtypes import dtypes from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd from pandas.core.arrays import ExtensionArray -from 
pandas.core.arrays.integer import ( - Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, UInt8Dtype, UInt16Dtype, - UInt32Dtype, UInt64Dtype, integer_array) import pandas.util.testing as tm -def make_data(): - return (list(range(1, 9)) + [np.nan] + list(range(10, 98)) - + [np.nan] + [99, 100]) - - -@pytest.fixture(params=[Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, - UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype]) -def dtype(request): - return request.param() - - -@pytest.fixture -def data(dtype): - return integer_array(make_data(), dtype=dtype) - - class DummyDtype(dtypes.ExtensionDtype): pass @@ -113,22 +92,3 @@ def test_is_not_extension_array_dtype(dtype): def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) assert is_extension_array_dtype(dtype) - - -@pytest.mark.parametrize('engine', ['c', 'python']) -def test_EA_types(engine): - df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int64'), - 'A': [1, 2, 1]}) - data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': Int64Dtype}, - engine=engine) - assert result is not None - tm.assert_frame_equal(result, df) - - df = pd.DataFrame({'Int': pd.Series([1, 2, 3], dtype='Int8'), - 'A': [1, 2, 1]}) - data = df.to_csv(index=False) - result = pd.read_csv(StringIO(data), dtype={'Int': 'Int8'}, - engine=engine) - assert result is not None - tm.assert_frame_equal(result, df) diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index e21ca81bcf5c3..aadf9f2f12b68 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -218,3 +218,7 @@ class TestBooleanReduce(base.BaseBooleanReduceTests): class TestPrinting(base.BasePrintingTests): pass + + +class TestParsing(base.BaseParsingTests): + pass diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 644f3ef94f40b..6eedbfb4aba39 100644 --- a/pandas/tests/extension/test_interval.py +++ 
b/pandas/tests/extension/test_interval.py @@ -152,3 +152,11 @@ class TestPrinting(BaseInterval, base.BasePrintingTests): @pytest.mark.skip(reason="custom repr") def test_array_repr(self, data, size): pass + + +class TestParsing(BaseInterval, base.BaseParsingTests): + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + expected_msg = r'.*must implement _from_sequence_of_strings.*' + with pytest.raises(NotImplementedError, match=expected_msg): + super(TestParsing, self).test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 70a3a8ab58aac..7ca6882c7441b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -210,3 +210,7 @@ def test_concat_mixed_dtypes(self, data): class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): pass + + +class TestParsing(BaseNumPyTests, base.BaseParsingTests): + pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index 08e21fc30ad10..813efcb5678d3 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -156,3 +156,11 @@ class TestGroupby(BasePeriodTests, base.BaseGroupbyTests): class TestPrinting(BasePeriodTests, base.BasePrintingTests): pass + + +class TestParsing(BasePeriodTests, base.BaseParsingTests): + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + expected_msg = r'.*must implement _from_sequence_of_strings.*' + with pytest.raises(NotImplementedError, match=expected_msg): + super(TestParsing, self).test_EA_types(engine, data) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 257eb44cd94fe..678b7cd1b6cd2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -359,3 +359,12 @@ class TestPrinting(BaseSparseTests, base.BasePrintingTests): 
@pytest.mark.xfail(reason='Different repr', strict=True) def test_array_repr(self, data, size): super(TestPrinting, self).test_array_repr(data, size) + + +class TestParsing(BaseSparseTests, base.BaseParsingTests): + @pytest.mark.parametrize('engine', ['c', 'python']) + def test_EA_types(self, engine, data): + expected_msg = r'.*must implement _from_sequence_of_strings.*' + with pytest.raises(NotImplementedError, match=expected_msg): + super(TestParsing, self).test_EA_types(engine, data) + From 4de830c37a9d0676a537aabb5b324d5290fe1ebd Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 30 Dec 2018 14:33:40 -0500 Subject: [PATCH 17/22] Addressing more merge comments Reverted change where pandas_dtype returns instantiated object --- doc/source/whatsnew/v0.24.0.rst | 2 -- pandas/_libs/parsers.pyx | 2 +- pandas/core/dtypes/common.py | 3 --- pandas/io/parsers.py | 18 ++++-------------- pandas/tests/extension/base/io.py | 4 ++-- 5 files changed, 7 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 8f8f6cac8bad9..028ed99b23a6c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -33,8 +33,6 @@ New features See the :ref:`section on writing HTML ` in the IO docs for example usage. (:issue:`2679`) - :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`) - :meth:`DataFrame.shift` :meth:`Series.shift`, :meth:`ExtensionArray.shift`, :meth:`SparseArray.shift`, :meth:`Period.shift`, :meth:`GroupBy.shift`, :meth:`Categorical.shift`, :meth:`NDFrame.shift` and :meth:`Block.shift` now accept `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) -- :func:`pandas.read_csv` now supports ``EA`` types as an argument to ``dtype``, - allowing the user to use ``EA`` types when reading CSVs. 
(:issue:`23228`) .. _whatsnew_0240.values_api: diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e7e6b36b7e5ef..0b7edaf04a1ed 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1122,8 +1122,8 @@ cdef class TextReader: if na_filter: self._free_na_set(na_hashset) - try_upcast = upcast_na and na_count > 0 # don't try to upcast EAs + try_upcast = upcast_na and na_count > 0 if try_upcast and not is_extension_array_dtype(col_dtype): col_res = _maybe_upcast(col_res) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index b67f57e4aeee3..e1141c6b6b3a8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1986,9 +1986,6 @@ def pandas_dtype(dtype): # registered extension types result = _pandas_registry.find(dtype) or registry.find(dtype) if result is not None: - # ensure result is an instantiated type - if isinstance(result, type): - return result() return result # un-registered extension types diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 74bdd25f7090b..eac3018b4a6fe 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1771,12 +1771,9 @@ def _cast_types(self, values, cast_type, column): # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): - try: - array_type = cast_type.construct_array_type() - except AttributeError: - cast_type = pandas_dtype(cast_type) - array_type = cast_type.construct_array_type() - + # ensure cast_type is an actual dtype and not a string + cast_type = pandas_dtype(cast_type) + array_type = cast_type.construct_array_type() try: return array_type._from_sequence_of_strings(values, dtype=cast_type) @@ -2196,14 +2193,7 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] - # convert dtype to a pandas_dtype - dtype = kwds['dtype'] - if isinstance(dtype, dict): - self.dtype = {k: pandas_dtype(dtype[k]) - for k in dtype} - else: - self.dtype = dtype - + self.dtype 
= kwds['dtype'] self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 41417b086fc9a..75e443e1d0195 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -20,5 +20,5 @@ def test_EA_types(self, engine, data): result = pd.read_csv(StringIO(csv_output), dtype={ 'with_dtype': str(data.dtype) }, engine=engine) - assert result is not None - tm.assert_frame_equal(df, result) + expected = df + tm.assert_frame_equal(result, expected) From f908e2e3f00bfd0206bbb205372bc4e5dcfeb976 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Sun, 30 Dec 2018 15:06:12 -0500 Subject: [PATCH 18/22] Update docs to show using Int64 dtype --- doc/source/io.rst | 10 +++++----- pandas/io/parsers.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 7230ff55f9a6c..ab3750e6e544d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -362,16 +362,16 @@ columns: .. ipython:: python - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = ('a,b,c,d\n' + '1,2,3,4\n' + '5,6,7,8\n' + '9,10,11') print(data) df = pd.read_csv(StringIO(data), dtype=object) df df['a'][0] - df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) + df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index eac3018b4a6fe..e8c676ab0b9c7 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -134,7 +134,8 @@ 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. dtype : Type name or dict of column -> type, optional - Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} + Data type for data or columns. E.g. 
{{'a': np.float64, 'b': np.int32, + 'c': 'Int64'}} Use `str` or `object` together with suitable `na_values` settings to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD From e60549c136ed6323ce856e7394682e6cb5972fa5 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Mon, 31 Dec 2018 14:31:01 -0500 Subject: [PATCH 19/22] Update docs per comments in PR --- doc/source/io.rst | 4 +++- pandas/core/arrays/base.py | 5 +++++ pandas/io/parsers.py | 4 ++-- pandas/tests/extension/test_sparse.py | 1 - 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ab3750e6e544d..ed413273a5c7f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -371,7 +371,9 @@ columns: df = pd.read_csv(StringIO(data), dtype=object) df df['a'][0] - df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) + df = pd.read_csv(StringIO(data), + dtype={'b': object, 'c': np.float64, + 'd': 'Int64'}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1b46d9e3c2c48..7aaefef3d03e5 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -78,6 +78,11 @@ class ExtensionArray(object): * _reduce + One can implement methods to handle parsing from strings that will be used + in methods such as ``pandas.io.parsers.read_csv``. + + * _from_sequence_of_strings + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e8c676ab0b9c7..20d12225f5485 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -134,8 +134,8 @@ 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. 
dtype : Type name or dict of column -> type, optional - Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, - 'c': 'Int64'}} + Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32, +'c': 'Int64'}} Use `str` or `object` together with suitable `na_values` settings to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 678b7cd1b6cd2..39a138ed534bd 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -367,4 +367,3 @@ def test_EA_types(self, engine, data): expected_msg = r'.*must implement _from_sequence_of_strings.*' with pytest.raises(NotImplementedError, match=expected_msg): super(TestParsing, self).test_EA_types(engine, data) - From a6a2d99af981fc41eca1cd2b9e81bfd5b4b3c5c5 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Mon, 31 Dec 2018 19:10:04 -0500 Subject: [PATCH 20/22] Fix linters --- doc/source/io.rst | 5 ++--- pandas/io/parsers.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index ed413273a5c7f..3bbd4e8410fa5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -371,9 +371,8 @@ columns: df = pd.read_csv(StringIO(data), dtype=object) df df['a'][0] - df = pd.read_csv(StringIO(data), - dtype={'b': object, 'c': np.float64, - 'd': 'Int64'}) + df = pd.read_csv(StringIO(data), + dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 20d12225f5485..0d10e8b8985cb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -135,7 +135,7 @@ are duplicate names in the columns. dtype : Type name or dict of column -> type, optional Data type for data or columns. E.g. 
{{'a': np.float64, 'b': np.int32, -'c': 'Int64'}} + 'c': 'Int64'}} Use `str` or `object` together with suitable `na_values` settings to preserve and not interpret dtype. If converters are specified, they will be applied INSTEAD From 053b442a47fecb01b62844b61835bed528e5876b Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Tue, 1 Jan 2019 15:47:54 -0500 Subject: [PATCH 21/22] Fix linters again --- pandas/tests/extension/base/io.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/extension/base/io.py b/pandas/tests/extension/base/io.py index 75e443e1d0195..7ea62e4e9d678 100644 --- a/pandas/tests/extension/base/io.py +++ b/pandas/tests/extension/base/io.py @@ -4,7 +4,6 @@ from pandas.compat import StringIO import pandas as pd -import pandas.testing as tm from .base import BaseExtensionTests @@ -21,4 +20,4 @@ def test_EA_types(self, engine, data): 'with_dtype': str(data.dtype) }, engine=engine) expected = df - tm.assert_frame_equal(result, expected) + self.assert_frame_equal(result, expected) From f42235aad6aa0dcdb4d3324792001062a7242c43 Mon Sep 17 00:00:00 2001 From: Kyle Prestel Date: Tue, 1 Jan 2019 21:00:14 -0500 Subject: [PATCH 22/22] isort --- pandas/core/indexes/numeric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 445cadfa18dfc..9d6a56200df6e 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -7,8 +7,8 @@ from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - is_bool, is_bool_dtype, is_dtype_equal, is_float, - is_integer_dtype, is_scalar, needs_i8_conversion, pandas_dtype) + is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype, + is_scalar, needs_i8_conversion, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna