From 3313f237c14224660d4b8fe2384b9b77eec91bf7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 22 Nov 2019 09:56:22 -0600 Subject: [PATCH 01/12] API: Infer extension types in array * string * integer --- doc/source/user_guide/integer_na.rst | 34 ++++++++++++++++++++------- doc/source/whatsnew/v1.0.0.rst | 31 ++++++++++++++++++++++++ pandas/_libs/lib.pyx | 8 +++++-- pandas/core/construction.py | 25 ++++++++++++++++---- pandas/tests/arrays/test_array.py | 25 +++++++++++++++----- pandas/tests/dtypes/test_inference.py | 9 ++++--- 6 files changed, 108 insertions(+), 24 deletions(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index f1f3d79eed61e..63cd5dbe03239 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -25,8 +25,7 @@ numbers. Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` -implemented within pandas. It is not the default dtype for integers, and will not be inferred; -you must explicitly pass the dtype into :meth:`array` or :class:`Series`: +implemented within pandas. .. ipython:: python @@ -50,17 +49,34 @@ NumPy array. You can also pass the list-like object to the :class:`Series` constructor with the dtype. -.. ipython:: python +.. warning:: - s = pd.Series([1, 2, np.nan], dtype="Int64") - s + Currently :meth:`pandas.array` and :meth:`pandas.Series` use different + rules for dtype inference. :meth:`pandas.array` will infer a nullable- + integer dtype -By default (if you don't specify ``dtype``), NumPy is used, and you'll end -up with a ``float64`` dtype Series: + .. ipython:: python -.. ipython:: python + pd.array([1, None]) + pd.array([1, 2]) + + For backwards-compatibility, :class:`Series` infers these as either + integer or float dtype + + .. ipython:: python + + pd.Series([1, None]) + pd.Series([1, 2]) + + We recommend explicitly providing the dtype to avoid confusion. + + .. ipython:: python + + pd.array([1, None], dtype="Int64") + pd.Series([1, None], dtype="Int64") - pd.Series([1, 2, np.nan]) + In the future, we may provide an option for :class:`Series` to infer a + nullable-integer dtype. Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ac440c263088b..0a510dae0454c 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -234,6 +234,37 @@ The following methods now also correctly output values for unobserved categories df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() +:meth:`pandas.array` inference changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`pandas.array` now infers pandas' new extension types in several cases: + +1. Sting data (including missing values) now returns a :class:`arrays.StringArray`. +2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.array(["a", None]) + + ['a', None] + Length: 2, dtype: object + + >>> pd.array([1, None]) + + [1, None] + Length: 2, dtype: object + + +*pandas 1.0.0* + +.. ipython:: python + + pd.array(["a", None]) + pd.array([1, None]) + +As a reminder, you can specify the ``dtype`` to disable all inference. .. _whatsnew_1000.api_breaking.deps: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index aaf6456df8f8e..7b31ebaf9ba9b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1113,6 +1113,7 @@ def infer_dtype(value: object, skipna: object=None) -> str: Results can include: - string + - mixed-string - unicode - bytes - floating @@ -1319,8 +1320,11 @@ def infer_dtype(value: object, skipna: object=None) -> str: return 'boolean' elif isinstance(val, str): - if is_string_array(values, skipna=skipna): - return 'string' + if is_string_array(values, skipna=True): + if isnaobj(values).any(): + return "mixed-string" + else: + return "string" elif isinstance(val, bytes): if is_bytes_array(values, skipna=skipna): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index c0b08beead0ca..edb99776bafc0 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -94,11 +94,18 @@ def array( :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` + :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`str` :class:`pandas.arrays.StringArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. - copy : bool, default True + .. versionchanged:: 1.0.0 + + Pandas infers nullable-integer dtype for integer data and + string dtype for string data. + + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require copying data, even if ``copy=False``. @@ -246,21 +253,25 @@ def array( """ from pandas.core.arrays import ( period_array, + IntegerArray, IntervalArray, PandasArray, DatetimeArray, TimedeltaArray, + StringArray, ) if lib.is_scalar(data): msg = "Cannot pass scalar '{}' to 'pandas.array'." raise ValueError(msg.format(data)) - data = extract_array(data, extract_numpy=True) - - if dtype is None and isinstance(data, ABCExtensionArray): + if dtype is None and isinstance( + data, (ABCSeries, ABCIndexClass, ABCExtensionArray) + ): dtype = data.dtype + data = extract_array(data, extract_numpy=True) + # this returns None for not-found dtypes. if isinstance(dtype, str): dtype = registry.find(dtype) or dtype @@ -298,6 +309,12 @@ def array( # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) + elif inferred_dtype in {"string", "mixed-string"}: + return StringArray._from_sequence(data, copy=copy) + + elif inferred_dtype in {"integer", "mixed-integer"}: + return IntegerArray._from_sequence(data, copy=copy) + # TODO(BooleanArray): handle this type # Pandas overrides NumPy for diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index e8d9ecfac61e4..ba2d5f8ee9a03 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -19,14 +19,14 @@ "data, dtype, expected", [ # Basic NumPy defaults. - ([1, 2], None, PandasArray(np.array([1, 2]))), + ([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ( [1, 2], np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2]), None, PandasArray(np.array([1, 2]))), + (np.array([1, 2]), None, pd.arrays.IntegerArray._from_sequence([1, 2])), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -113,6 +113,13 @@ # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), + # String + (["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])), + ( + ["a", None], + pd.StringDtype(), + pd.arrays.StringArray._from_sequence(["a", None]), + ), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -139,15 +146,15 @@ def test_array(data, dtype, expected): def test_array_copy(): a = np.array([1, 2]) # default is to copy - b = pd.array(a) + b = pd.array(a, dtype=a.dtype) assert np.shares_memory(a, b._ndarray) is False # copy=True - b = pd.array(a, copy=True) + b = pd.array(a, dtype=a.dtype, copy=True) assert np.shares_memory(a, b._ndarray) is False # copy=False - b = pd.array(a, copy=False) + b = pd.array(a, dtype=a.dtype, copy=False) assert np.shares_memory(a, b._ndarray) is True @@ -211,6 +218,12 @@ def test_array_copy(): np.array([1, 2], dtype="m8[us]"), pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), ), + # integer + ([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])), + ([1, None], pd.arrays.IntegerArray._from_sequence([1, None])), + # string + (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])), + (["a", None], pd.arrays.StringArray._from_sequence(["a", None])), ], ) def test_array_inference(data, expected): @@ -241,7 +254,7 @@ def test_array_inference_fails(data): @pytest.mark.parametrize("data", [np.array([[1, 2], [3, 4]]), [[1, 2], [3, 4]]]) def test_nd_raises(data): with pytest.raises(ValueError, match="PandasArray must be 1-dimensional"): - pd.array(data) + pd.array(data, dtype="int64") def test_scalar_raises(): diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 743b844917edf..282e8fdf45aef 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -732,12 +732,15 @@ def test_string(self): def test_unicode(self): arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) - assert result == "mixed" + assert result == "mixed-string" arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) - expected = "string" - assert result == expected + assert result == "string" + + arr = ["a", "c"] + result = lib.infer_dtype(arr, skipna=False) + assert result == "string" @pytest.mark.parametrize( "dtype, missing, skipna, expected", From dd02d69f3761e36b8fb9ecc20fd1a6f437bf2a6b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 22 Nov 2019 13:51:30 -0600 Subject: [PATCH 02/12] update docstring --- pandas/core/construction.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index edb99776bafc0..de7e4b932e1a0 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -219,24 +219,17 @@ def array( [a, b, a] Categories (3, object): [a < b < c] - Because omitting the `dtype` passes the data through to NumPy, - a mixture of valid integers and NA will return a floating-point - NumPy array. + Pandas will infer an ExtensionArray for some types of data: >>> pd.array([1, 2, np.nan]) - - [1.0, 2.0, nan] - Length: 3, dtype: float64 - - To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify - the dtype: - - >>> pd.array([1, 2, np.nan], dtype='Int64') [1, 2, NaN] Length: 3, dtype: Int64 - Pandas will infer an ExtensionArray for some types of data: + >>> pd.array(["a", None, "c"]) + + ['a', nan, 'c'] + Length: 3, dtype: string >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) From 5a9c306a0ee8e339a8ea0f51e00fb3275598127c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 22 Nov 2019 14:50:00 -0600 Subject: [PATCH 03/12] remove mixed-string --- doc/source/whatsnew/v1.0.0.rst | 4 ++-- pandas/_libs/lib.pyx | 7 ++----- pandas/core/construction.py | 2 +- pandas/tests/dtypes/test_inference.py | 2 +- pandas/tests/frame/test_block_internals.py | 4 ++-- pandas/tests/internals/test_internals.py | 2 +- 6 files changed, 9 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0a510dae0454c..803d1d359cf10 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -237,7 +237,7 @@ The following methods now also correctly output values for unobserved categories :meth:`pandas.array` inference changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`pandas.array` now infers pandas' new extension types in several cases: +:meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`): 1. Sting data (including missing values) now returns a :class:`arrays.StringArray`. 2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`. @@ -350,7 +350,7 @@ Other API changes - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) -- +- :meth:`pandas.api.types.infer_dtype` returns ``"string"`` rather than ``"mixed"`` for a mixture of strings and NA values (:issue:`29799`) .. _whatsnew_1000.api.documentation: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7b31ebaf9ba9b..0e5241d8bef34 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1113,7 +1113,6 @@ def infer_dtype(value: object, skipna: object=None) -> str: Results can include: - string - - mixed-string - unicode - bytes - floating @@ -1320,11 +1319,9 @@ def infer_dtype(value: object, skipna: object=None) -> str: return 'boolean' elif isinstance(val, str): + # we deliberately ignore skipna if is_string_array(values, skipna=True): - if isnaobj(values).any(): - return "mixed-string" - else: - return "string" + return "string" elif isinstance(val, bytes): if is_bytes_array(values, skipna=skipna): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index de7e4b932e1a0..506b2c82134ee 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -302,7 +302,7 @@ def array( # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) - elif inferred_dtype in {"string", "mixed-string"}: + elif inferred_dtype == "string": return StringArray._from_sequence(data, copy=copy) elif inferred_dtype in {"integer", "mixed-integer"}: diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 282e8fdf45aef..3c1705ddb785e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -732,7 +732,7 @@ def test_string(self): def test_unicode(self): arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) - assert result == "mixed-string" + assert result == "string" arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index d491e9f25c897..b27e7c217c4c2 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -615,12 +615,12 @@ def test_constructor_no_pandas_array(self): def test_add_column_with_pandas_array(self): # GH 26390 df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - df["c"] = pd.array([1, 2, None, 3]) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) df2 = pd.DataFrame( { "a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], - "c": pd.array([1, 2, None, 3]), + "c": pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)), } ) assert type(df["c"]._data.blocks[0]) == ObjectBlock diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index abe2ddf955ad8..551782d0b363a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1268,7 +1268,7 @@ def test_block_shape(): def test_make_block_no_pandas_array(): # https://github.com/pandas-dev/pandas/pull/24866 - arr = pd.array([1, 2]) + arr = pd.arrays.PandasArray(np.array([1, 2])) # PandasArray, no dtype result = make_block(arr, slice(len(arr))) From e3ba8464e2e9f90ffde417c20db9170132457cca Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 06:09:47 -0600 Subject: [PATCH 04/12] skipna=True --- pandas/_libs/lib.pyx | 3 +-- pandas/core/construction.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0e5241d8bef34..48cbfa61e0863 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1319,8 +1319,7 @@ def infer_dtype(value: object, skipna: object=None) -> str: return 'boolean' elif isinstance(val, str): - # we deliberately ignore skipna - if is_string_array(values, skipna=True): + if is_string_array(values, skipna=skipna): return "string" elif isinstance(val, bytes): diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 506b2c82134ee..bfb41c7a011d3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -274,7 +274,7 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data, skipna=False) + inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": try: return period_array(data, copy=copy) @@ -305,7 +305,7 @@ def array( elif inferred_dtype == "string": return StringArray._from_sequence(data, copy=copy) - elif inferred_dtype in {"integer", "mixed-integer"}: + elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) # TODO(BooleanArray): handle this type From e055ada39319133a609bb4f834a553e8bc87e537 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 07:51:19 -0600 Subject: [PATCH 05/12] update new test --- pandas/tests/dtypes/test_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 3c1705ddb785e..d7693d29c08b9 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -732,7 +732,9 @@ def test_string(self): def test_unicode(self): arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=False) - assert result == "string" + # This currently returns "mixed", but it's not clear that's optimal. + # This could also return "string" or "mixed-string" + assert result == "mixed" arr = ["a", np.nan, "c"] result = lib.infer_dtype(arr, skipna=True) From ad43c3a966f59d2e49df435565a103488706991a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 08:26:45 -0600 Subject: [PATCH 06/12] reduce --- pandas/tests/series/test_ufunc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index c8a127f89bf91..6c665917210a4 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -251,7 +251,7 @@ def __add__(self, other): @pytest.mark.parametrize( "values", [ - pd.array([1, 3, 2]), + pd.array([1, 3, 2], dtype="int64"), pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), From 77c5d3f414108f5105866392f738bd516b9294ac Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 08:31:53 -0600 Subject: [PATCH 07/12] 32 bit, doc --- doc/source/user_guide/integer_na.rst | 2 ++ pandas/tests/arrays/test_array.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 63cd5dbe03239..77568f3bcb244 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -84,6 +84,8 @@ dtype if needed. .. ipython:: python + s = pd.Series([1, 2, None], dtype="Int64") + # arithmetic s + 1 diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index ba2d5f8ee9a03..dc61bc3fc37d5 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -26,7 +26,11 @@ np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2]), None, pd.arrays.IntegerArray._from_sequence([1, 2])), + ( + np.array([1, 2], dtype="int64"), + None, + pd.arrays.IntegerArray._from_sequence([1, 2]), + ), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias From 0f89f47653d28a677f150eaffdf6488d77922776 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 08:32:38 -0600 Subject: [PATCH 08/12] update --- doc/source/whatsnew/v1.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6174976e74b92..92e87cacf607b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -350,7 +350,6 @@ Other API changes - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) -- :meth:`pandas.api.types.infer_dtype` returns ``"string"`` rather than ``"mixed"`` for a mixture of strings and NA values (:issue:`29799`) .. _whatsnew_1000.api.documentation: From 4e08fd2eed92c3d5bc8a9b65d03bb343df9889cf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 10:48:12 -0600 Subject: [PATCH 09/12] fix docstring --- pandas/core/construction.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index bfb41c7a011d3..a5a88a7b2d0b5 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -105,7 +105,7 @@ def array( Pandas infers nullable-integer dtype for integer data and string dtype for string data. - copy : bool, default True + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require copying data, even if ``copy=False``. @@ -161,14 +161,6 @@ def array( ['a', 'b'] Length: 2, dtype: str32 - Or use the dedicated constructor for the array you're expecting, and - wrap that in a PandasArray - - >>> pd.array(np.array(['a', 'b'], dtype=' - ['a', 'b'] - Length: 2, dtype: str32 - Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` @@ -191,13 +183,21 @@ def array( Examples -------- - If a dtype is not specified, `data` is passed through to - :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. + If a dtype is not specified, pandas will infer the best dtype from the values. + See the description of `dtype` for the types pandas infers for. >>> pd.array([1, 2]) - + [1, 2] - Length: 2, dtype: int64 + Length: 2, dtype: Int64 + + If pandas does not infer a dedicated extension type for some values, we + fall back to returning a :class:`arrays.PandasArray`. + + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: float64 Or the NumPy dtype can be specified From bddce9b8b3eeacd53d8be7688593c4034222ea1f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Nov 2019 10:49:57 -0600 Subject: [PATCH 10/12] reorganize --- pandas/core/construction.py | 48 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index a5a88a7b2d0b5..0e23b18505efa 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -191,20 +191,20 @@ def array( [1, 2] Length: 2, dtype: Int64 - If pandas does not infer a dedicated extension type for some values, we - fall back to returning a :class:`arrays.PandasArray`. - - >>> pd.array([1.1, 2.2]) - - [1.1, 2.2] - Length: 2, dtype: float64 + >>> pd.array([1, 2, np.nan]) + + [1, 2, NaN] + Length: 3, dtype: Int64 - Or the NumPy dtype can be specified + >>> pd.array(["a", None, "c"]) + + ['a', nan, 'c'] + Length: 3, dtype: string - >>> pd.array([1, 2], dtype=np.dtype("int32")) - - [1, 2] - Length: 2, dtype: int32 + >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) + + ['2000-01-01', '2000-01-01'] + Length: 2, dtype: period[D] You can use the string alias for `dtype` @@ -219,22 +219,20 @@ def array( [a, b, a] Categories (3, object): [a < b < c] - Pandas will infer an ExtensionArray for some types of data: + If pandas does not infer a dedicated extension type a + :class:`arrays.PandasArray` is returned. - >>> pd.array([1, 2, np.nan]) - - [1, 2, NaN] - Length: 3, dtype: Int64 + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: float64 - >>> pd.array(["a", None, "c"]) - - ['a', nan, 'c'] - Length: 3, dtype: string + Or the NumPy dtype can be specified - >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) - - ['2000-01-01', '2000-01-01'] - Length: 2, dtype: period[D] + >>> pd.array([1, 2], dtype=np.dtype("int32")) + + [1, 2] + Length: 2, dtype: int32 `data` must be 1-dimensional. A ValueError is raised when the input has the wrong dimensionality. From 372ac06420c56418280162bb44d479f4f367bd50 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 27 Nov 2019 12:00:45 -0600 Subject: [PATCH 11/12] Handle BooleanArray --- doc/source/whatsnew/v1.0.0.rst | 3 ++- pandas/core/construction.py | 10 +++++++--- pandas/tests/arrays/test_array.py | 10 ++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 81965d64d325b..0b314a8c705a1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -264,8 +264,9 @@ The following methods now also correctly output values for unobserved categories :meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`): -1. Sting data (including missing values) now returns a :class:`arrays.StringArray`. +1. String data (including missing values) now returns a :class:`arrays.StringArray`. 2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`. +3. Boolean data (including missing values) now returns the new :class:`arrays.BooleanArray` *pandas 0.25.x* diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 0e23b18505efa..ce906678d990c 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -96,14 +96,16 @@ def array( :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`str` :class:`pandas.arrays.StringArray` + :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. .. versionchanged:: 1.0.0 - Pandas infers nullable-integer dtype for integer data and - string dtype for string data. + Pandas infers nullable-integer dtype for integer data, + string dtype for string data, and nullable-boolean dtype + for boolean data. copy : bool, default True Whether to copy the data, even if not necessary. Depending @@ -244,6 +246,7 @@ def array( """ from pandas.core.arrays import ( period_array, + BooleanArray, IntegerArray, IntervalArray, PandasArray, @@ -306,7 +309,8 @@ def array( elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) - # TODO(BooleanArray): handle this type + elif inferred_dtype == "boolean": + return BooleanArray._from_sequence(data, copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns] diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 7b44a362a5c30..479f8dbad0418 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -124,6 +124,13 @@ pd.StringDtype(), pd.arrays.StringArray._from_sequence(["a", None]), ), + # Boolean + ([True, None], "boolean", pd.arrays.BooleanArray._from_sequence([True, None])), + ( + [True, None], + pd.BooleanDtype(), + pd.arrays.BooleanArray._from_sequence([True, None]), + ), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -228,6 +235,9 @@ def test_array_copy(): # string (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])), (["a", None], pd.arrays.StringArray._from_sequence(["a", None])), + # Boolean + ([True, False], pd.arrays.BooleanArray._from_sequence([True, False])), + ([True, None], pd.arrays.BooleanArray._from_sequence([True, None])), ], ) def test_array_inference(data, expected): From d0f3082ce4d2ca66533cdc0b4dabc2612b8e6e04 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 2 Dec 2019 06:15:14 -0600 Subject: [PATCH 12/12] update docstring --- pandas/core/construction.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index ce906678d990c..dc537d50b3419 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -229,7 +229,11 @@ def array( [1.1, 2.2] Length: 2, dtype: float64 - Or the NumPy dtype can be specified + As mentioned in the "Notes" section, new extension types may be added + in the future (by pandas or 3rd party libraries), causing the return + value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype` + as a NumPy dtype if you need to ensure there's no future change in + behavior. >>> pd.array([1, 2], dtype=np.dtype("int32"))