From 33386195a574f6c3610628d414b65d049b257d05 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 24 Dec 2021 12:38:19 -0800
Subject: [PATCH 1/4] PERF/WIP: avoid copies in lib.infer_dtype

---
 pandas/_libs/lib.pyx                          | 85 +++++++++++--------
 pandas/conftest.py                            |  1 +
 pandas/core/arrays/floating.py                |  8 +-
 pandas/core/arrays/integer.py                 | 12 +--
 .../arrays/floating/test_construction.py      |  1 +
 .../tests/arrays/integer/test_construction.py |  1 +
 pandas/tests/dtypes/test_inference.py         | 28 +++++-
 7 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 8f9016e726f1e..d13aa6e2b60fc 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -97,7 +97,6 @@ from pandas._libs.missing cimport (
     is_matching_na,
     is_null_datetime64,
     is_null_timedelta64,
-    isnaobj,
 )
 from pandas._libs.tslibs.conversion cimport convert_to_tsobject
 from pandas._libs.tslibs.nattype cimport (
@@ -1420,6 +1419,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
         ndarray values
         bint seen_pdnat = False
         bint seen_val = False
+        flatiter it
 
     if util.is_array(value):
         values = value
@@ -1457,24 +1457,22 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
         # This should not be reached
         values = values.astype(object)
 
-    # for f-contiguous array 1000 x 1000, passing order="K" gives 5000x speedup
-    values = values.ravel(order="K")
-
-    if skipna:
-        values = values[~isnaobj(values)]
-
     n = cnp.PyArray_SIZE(values)
     if n == 0:
         return "empty"
 
     # Iterate until we find our first valid value. We will use this
     #  value to decide which of the is_foo_array functions to call.
+    it = PyArray_IterNew(values)
     for i in range(n):
-        val = values[i]
+        # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
+        #  equivalents to `val = values[i]`
+        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
+        PyArray_ITER_NEXT(it)
 
         # do not use checknull to keep
         # np.datetime64('nat') and np.timedelta64('nat')
-        if val is None or util.is_nan(val):
+        if val is None or util.is_nan(val) or val is C_NA:
             pass
         elif val is NaT:
             seen_pdnat = True
@@ -1486,23 +1484,25 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
     if seen_val is False and seen_pdnat is True:
         return "datetime"
         # float/object nan is handled in latter logic
+    if seen_val is False and skipna:
+        return "empty"
 
     if util.is_datetime64_object(val):
-        if is_datetime64_array(values):
+        if is_datetime64_array(values, skipna=skipna):
             return "datetime64"
 
     elif is_timedelta(val):
-        if is_timedelta_or_timedelta64_array(values):
+        if is_timedelta_or_timedelta64_array(values, skipna=skipna):
             return "timedelta"
 
     elif util.is_integer_object(val):
         # ordering matters here; this check must come after the is_timedelta
         #  check otherwise numpy timedelta64 objects would come through here
 
-        if is_integer_array(values):
+        if is_integer_array(values, skipna=skipna):
             return "integer"
-        elif is_integer_float_array(values):
-            if is_integer_na_array(values):
+        elif is_integer_float_array(values, skipna=skipna):
+            if is_integer_na_array(values, skipna=skipna):
                 return "integer-na"
             else:
                 return "mixed-integer-float"
@@ -1523,7 +1523,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
             return "time"
 
     elif is_decimal(val):
-        if is_decimal_array(values):
+        if is_decimal_array(values, skipna=skipna):
             return "decimal"
 
     elif util.is_complex_object(val):
@@ -1533,8 +1533,8 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
     elif util.is_float_object(val):
         if is_float_array(values):
             return "floating"
-        elif is_integer_float_array(values):
-            if is_integer_na_array(values):
+        elif is_integer_float_array(values, skipna=skipna):
+            if is_integer_na_array(values, skipna=skipna):
                 return "integer-na"
             else:
                 return "mixed-integer-float"
@@ -1552,15 +1552,18 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
             return "bytes"
 
     elif is_period_object(val):
-        if is_period_array(values):
+        if is_period_array(values, skipna=skipna):
             return "period"
 
     elif is_interval(val):
         if is_interval_array(values):
             return "interval"
 
+    cnp.PyArray_ITER_RESET(it)
     for i in range(n):
-        val = values[i]
+        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
+        PyArray_ITER_NEXT(it)
+
         if util.is_integer_object(val):
             return "mixed-integer"
 
@@ -1789,10 +1792,11 @@ cdef class IntegerValidator(Validator):
 
 
 # Note: only python-exposed for tests
-cpdef bint is_integer_array(ndarray values):
+cpdef bint is_integer_array(ndarray values, bint skipna=True):
     cdef:
         IntegerValidator validator = IntegerValidator(len(values),
-                                                      values.dtype)
+                                                      values.dtype,
+                                                      skipna=skipna)
     return validator.validate(values)
 
 
@@ -1803,10 +1807,10 @@ cdef class IntegerNaValidator(Validator):
                 or (util.is_nan(value) and util.is_float_object(value)))
 
 
-cdef bint is_integer_na_array(ndarray values):
+cdef bint is_integer_na_array(ndarray values, bint skipna=True):
     cdef:
         IntegerNaValidator validator = IntegerNaValidator(len(values),
-                                                          values.dtype)
+                                                          values.dtype, skipna=skipna)
     return validator.validate(values)
 
 
@@ -1819,10 +1823,11 @@ cdef class IntegerFloatValidator(Validator):
         return issubclass(self.dtype.type, np.integer)
 
 
-cdef bint is_integer_float_array(ndarray values):
+cdef bint is_integer_float_array(ndarray values, bint skipna=True):
     cdef:
         IntegerFloatValidator validator = IntegerFloatValidator(len(values),
-                                                                values.dtype)
+                                                                values.dtype,
+                                                                skipna=skipna)
     return validator.validate(values)
 
 
@@ -1866,9 +1871,11 @@ cdef class DecimalValidator(Validator):
         return is_decimal(value)
 
 
-cdef bint is_decimal_array(ndarray values):
+cdef bint is_decimal_array(ndarray values, bint skipna=False):
     cdef:
-        DecimalValidator validator = DecimalValidator(len(values), values.dtype)
+        DecimalValidator validator = DecimalValidator(
+            len(values), values.dtype, skipna=skipna
+        )
     return validator.validate(values)
 
 
@@ -1882,7 +1889,7 @@ cdef class StringValidator(Validator):
 
     cdef bint is_valid_null(self, object value) except -1:
         # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
+        return value is C_NA or value is None or util.is_nan(value)
 
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
@@ -1967,10 +1974,10 @@ cdef class Datetime64Validator(DatetimeValidator):
 
 
 # Note: only python-exposed for tests
-cpdef bint is_datetime64_array(ndarray values):
+cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
     cdef:
         Datetime64Validator validator = Datetime64Validator(len(values),
-                                                            skipna=True)
+                                                            skipna=skipna)
     return validator.validate(values)
 
 
@@ -1982,10 +1989,10 @@ cdef class AnyDatetimeValidator(DatetimeValidator):
         )
 
 
-cdef bint is_datetime_or_datetime64_array(ndarray values):
+cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True):
     cdef:
         AnyDatetimeValidator validator = AnyDatetimeValidator(len(values),
-                                                              skipna=True)
+                                                              skipna=skipna)
     return validator.validate(values)
 
 
@@ -2039,13 +2046,13 @@ cdef class AnyTimedeltaValidator(TimedeltaValidator):
 
 
 # Note: only python-exposed for tests
-cpdef bint is_timedelta_or_timedelta64_array(ndarray values):
+cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True):
     """
     Infer with timedeltas and/or nat/none.
     """
     cdef:
         AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
-                                                                skipna=True)
+                                                                skipna=skipna)
     return validator.validate(values)
 
 
@@ -2075,7 +2082,8 @@ cpdef bint is_time_array(ndarray values, bint skipna=False):
     return validator.validate(values)
 
 
-cdef bint is_period_array(ndarray[object] values):
+# FIXME: actually use skipna
+cdef bint is_period_array(ndarray[object] values, bint skipna=True):
     """
     Is this an ndarray of Period objects (or NaT) with a single `freq`?
     """
@@ -2083,12 +2091,17 @@ cdef bint is_period_array(ndarray[object] values):
         Py_ssize_t i, n = len(values)
         int dtype_code = -10000  # i.e. c_FreqGroup.FR_UND
         object val
+        flatiter it
 
     if len(values) == 0:
         return False
 
+    it = PyArray_IterNew(values)
     for i in range(n):
-        val = values[i]
+        # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
+        #  equivalents to `val = values[i]`
+        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
+        PyArray_ITER_NEXT(it)
 
         if is_period_object(val):
             if dtype_code == -10000:
diff --git a/pandas/conftest.py b/pandas/conftest.py
index be28dbe35fcb2..07b828824e26c 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1521,6 +1521,7 @@ def any_numpy_dtype(request):
 _any_skipna_inferred_dtype = [
     ("string", ["a", np.nan, "c"]),
     ("string", ["a", pd.NA, "c"]),
+    ("mixed", ["a", pd.NaT, "c"]),  # pd.NaT not considered valid by is_string_array
     ("bytes", [b"a", np.nan, b"c"]),
     ("empty", [np.nan, np.nan, np.nan]),
     ("empty", []),
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
index 4c868747fa930..319db7ccbd80e 100644
--- a/pandas/core/arrays/floating.py
+++ b/pandas/core/arrays/floating.py
@@ -131,13 +131,7 @@ def coerce_to_array(
         inferred_type = lib.infer_dtype(values, skipna=True)
         if inferred_type == "empty":
             pass
-        elif inferred_type not in [
-            "floating",
-            "integer",
-            "mixed-integer",
-            "integer-na",
-            "mixed-integer-float",
-        ]:
+        elif inferred_type == "boolean":
             raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
 
     elif is_bool_dtype(values) and is_float_dtype(dtype):
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
index 3587575503d33..69f241cfa55e3 100644
--- a/pandas/core/arrays/integer.py
+++ b/pandas/core/arrays/integer.py
@@ -181,16 +181,8 @@ def coerce_to_array(
         inferred_type = lib.infer_dtype(values, skipna=True)
         if inferred_type == "empty":
             pass
-        elif inferred_type not in [
-            "floating",
-            "integer",
-            "mixed-integer",
-            "integer-na",
-            "mixed-integer-float",
-            "string",
-            "unicode",
-        ]:
-            raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
+        elif inferred_type == "boolean":
+            raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
 
     elif is_bool_dtype(values) and is_integer_dtype(dtype):
         values = np.array(values, dtype=int, copy=copy)
diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py
index 169b23c31f863..a4f36c0168fda 100644
--- a/pandas/tests/arrays/floating/test_construction.py
+++ b/pandas/tests/arrays/floating/test_construction.py
@@ -137,6 +137,7 @@ def test_to_array_error(values):
             "cannot be converted to a FloatingDtype",
             "values must be a 1D list-like",
             "Cannot pass scalar",
+            r"float\(\) argument must be a string or a number, not 'dict'",
         ]
     )
     with pytest.raises((TypeError, ValueError), match=msg):
diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py
index e5fd4977ec2b8..fa61f23fc5d64 100644
--- a/pandas/tests/arrays/integer/test_construction.py
+++ b/pandas/tests/arrays/integer/test_construction.py
@@ -139,6 +139,7 @@ def test_to_integer_array_error(values):
             r"invalid literal for int\(\) with base 10:",
             r"values must be a 1D list-like",
             r"Cannot pass scalar",
+            r"int\(\) argument must be a string",
         ]
     )
     with pytest.raises((ValueError, TypeError), match=msg):
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 7953d650636be..1696d5b835247 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1134,10 +1134,20 @@ def test_unicode(self):
         # This could also return "string" or "mixed-string"
         assert result == "mixed"
 
+        # even though we use skipna, we are only skipping those NAs that are
+        #  considered matching by is_string_array
         arr = ["a", np.nan, "c"]
         result = lib.infer_dtype(arr, skipna=True)
         assert result == "string"
 
+        arr = ["a", pd.NA, "c"]
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "string"
+
+        arr = ["a", pd.NaT, "c"]
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "mixed"
+
         arr = ["a", "c"]
         result = lib.infer_dtype(arr, skipna=False)
         assert result == "string"
@@ -1542,10 +1552,24 @@ def test_is_string_array(self):
         assert lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
         )
-        # NaN is not valid for string array, just NA
-        assert not lib.is_string_array(
+        # we allow NaN/None in the StringArray constructor, so its allowed here
+        assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
+        assert lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=True
+        )
+
+        # But not e.g. datetimelike or Decimal NAs
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.datetime64("NaT")], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", Decimal("NaN")], dtype=object), skipna=True
+        )
 
         assert not lib.is_string_array(np.array([1, 2]))
 

From 3939517f97f654d4b995c84b1c38517168a35ab7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 27 Dec 2021 12:16:36 -0800
Subject: [PATCH 2/4] update test

---
 pandas/tests/arrays/string_/test_string.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 7c3a8c691b786..5bc1416900099 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -266,15 +266,26 @@ def test_constructor_raises(cls):
     with pytest.raises(ValueError, match=msg):
         cls(np.array([]))
 
-    with pytest.raises(ValueError, match=msg):
+    if cls is pd.arrays.StringArray:
+        # GH#45057 np.nan and None do NOT raise, as they are considered valid NAs
+        #  for string dtype
         cls(np.array(["a", np.nan], dtype=object))
-
-    with pytest.raises(ValueError, match=msg):
         cls(np.array(["a", None], dtype=object))
+    else:
+        with pytest.raises(ValueError, match=msg):
+            cls(np.array(["a", np.nan], dtype=object))
+        with pytest.raises(ValueError, match=msg):
+            cls(np.array(["a", None], dtype=object))
 
     with pytest.raises(ValueError, match=msg):
         cls(np.array(["a", pd.NaT], dtype=object))
 
+    with pytest.raises(ValueError, match=msg):
+        cls(np.array(["a", np.datetime64("NaT", "ns")], dtype=object))
+
+    with pytest.raises(ValueError, match=msg):
+        cls(np.array(["a", np.timedelta64("NaT", "ns")], dtype=object))
+
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy, cls, request):

From 132b6c4c3df2108068c5be8c85377af4ac66328b Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Mon, 27 Dec 2021 21:27:01 -0800
Subject: [PATCH 3/4] update exception message

---
 pandas/tests/arrays/floating/test_construction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py
index 0503d61c101c4..7debf9dd257b1 100644
--- a/pandas/tests/arrays/floating/test_construction.py
+++ b/pandas/tests/arrays/floating/test_construction.py
@@ -137,7 +137,7 @@ def test_to_array_error(values):
             "cannot be converted to a FloatingDtype",
             "values must be a 1D list-like",
             "Cannot pass scalar",
-            r"float\(\) argument must be a string or a number, not 'dict'",
+            r"float\(\) argument must be a string or a (real )?number, not 'dict'",
         ]
     )
     with pytest.raises((TypeError, ValueError), match=msg):

From 994b5cd4fa1cf05623ce55194aec9698ad5a4d08 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 28 Dec 2021 10:21:08 -0800
Subject: [PATCH 4/4] is_period_array compat

---
 pandas/_libs/lib.pyx | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d13aa6e2b60fc..7bc1f5d559441 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2083,21 +2083,23 @@ cpdef bint is_time_array(ndarray values, bint skipna=False):
 
 
 # FIXME: actually use skipna
-cdef bint is_period_array(ndarray[object] values, bint skipna=True):
+cdef bint is_period_array(ndarray values, bint skipna=True):
     """
     Is this an ndarray of Period objects (or NaT) with a single `freq`?
     """
+    # values should be object-dtype, but ndarray[object] assumes 1D, while
+    #  this _may_ be 2D.
     cdef:
-        Py_ssize_t i, n = len(values)
+        Py_ssize_t i, N = values.size
         int dtype_code = -10000  # i.e. c_FreqGroup.FR_UND
         object val
         flatiter it
 
-    if len(values) == 0:
+    if N == 0:
         return False
 
     it = PyArray_IterNew(values)
-    for i in range(n):
+    for i in range(N):
         # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
         #  equivalents to `val = values[i]`
         val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))