From 148fc9d33a5fd937cbda81bdd257085bd9c6d6b2 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Mon, 29 Mar 2021 16:51:12 -0700
Subject: [PATCH 01/26] Add nullable dtypes to read_csv

---
 doc/source/whatsnew/v1.3.0.rst           |  1 +
 pandas/_libs/lib.pyx                     | 50 +++++++++++++---
 pandas/_libs/ops.pyx                     | 32 ++++++-----
 pandas/_libs/parsers.pyx                 | 41 ++++++++++---
 pandas/io/parsers/base_parser.py         | 30 +++++++---
 pandas/io/parsers/readers.py             | 10 +++-
 pandas/tests/dtypes/test_inference.py    | 12 ++++
 pandas/tests/io/parser/test_na_values.py | 73 +++++++++++++++++++++---
 8 files changed, 203 insertions(+), 46 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index e245bf797d932..0c3d87d225819 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -142,6 +142,7 @@ Other enhancements
 - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)
 - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
 - :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
+- :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 1ff481553e413..53980548ff6cf 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1984,7 +1984,8 @@ def maybe_convert_numeric(
     set na_values,
     bint convert_empty=True,
     bint coerce_numeric=False,
-) -> ndarray:
+    bint convert_to_nullable_integer=False,
+) -> "ArrayLike":
     """
     Convert object array to a numeric array if possible.
 
@@ -2008,6 +2009,9 @@ def maybe_convert_numeric(
         numeric array has no suitable numerical dtype to return (i.e. uint64,
         int32, uint8). If set to False, the original object array will be
         returned. Otherwise, a ValueError will be raised.
+    convert_to_nullable_integer : bool, default False
+        If an array-like object contains only integer values (and NaN) is
+        encountered, whether to convert and return an IntegerArray.
 
     Returns
     -------
@@ -2039,21 +2043,34 @@ def maybe_convert_numeric(
         ndarray[int64_t] ints = np.empty(n, dtype='i8')
         ndarray[uint64_t] uints = np.empty(n, dtype='u8')
         ndarray[uint8_t] bools = np.empty(n, dtype='u1')
+        ndarray[uint8_t] mask = np.zeros(n, dtype="u1")
         float64_t fval
 
     for i in range(n):
         val = values[i]
+        # We only want to disable NaNs showing as float if
+        # a) convert_to_nullable_integer = True
+        # b) no floats have been seen ( assuming an int shows up later )
+        # However, if no ints present (all null array), we need to return floats
+        allow_nullable_dtypes = convert_to_nullable_integer and not seen.float_
 
         if val.__hash__ is not None and val in na_values:
-            seen.saw_null()
-            floats[i] = complexes[i] = NaN
+            if allow_nullable_dtypes:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                floats[i] = complexes[i] = NaN
+                seen.saw_null()
         elif util.is_float_object(val):
             fval = val
             if fval != fval:
+                mask[i] = 1
                 seen.null_ = True
-
+                if not allow_nullable_dtypes:
+                    seen.float_ = True
+            else:
+                seen.float_ = True
             floats[i] = complexes[i] = fval
-            seen.float_ = True
         elif util.is_integer_object(val):
             floats[i] = complexes[i] = val
 
@@ -2076,7 +2093,11 @@ def maybe_convert_numeric(
             floats[i] = uints[i] = ints[i] = bools[i] = val
             seen.bool_ = True
         elif val is None or val is C_NA:
-            seen.saw_null()
+            if allow_nullable_dtypes:
+                seen.null_ = True
+                mask[i] = 1
+            else:
+                seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif hasattr(val, '__len__') and len(val) == 0:
             if convert_empty or seen.coerce_numeric:
@@ -2100,6 +2121,7 @@ def maybe_convert_numeric(
                 else:
                     if fval != fval:
                         seen.null_ = True
+                        mask[i] = 1
 
                     floats[i] = fval
 
@@ -2107,7 +2129,10 @@ def maybe_convert_numeric(
                     as_int = int(val)
 
                     if as_int in na_values:
-                        seen.saw_null()
+                        mask[i] = 1
+                        seen.null_ = True
+                        if not convert_to_nullable_integer:
+                            seen.float_ = True
                     else:
                         seen.saw_int(as_int)
 
@@ -2137,11 +2162,22 @@ def maybe_convert_numeric(
     if seen.check_uint64_conflict():
         return values
 
+    # Nulls were not flagged as floats above in anticipation of integers
+    # appearing later; if no integer was ever seen, fall back to float.
+    if convert_to_nullable_integer and seen.null_ and not seen.int_:
+        seen.float_ = True
+
     if seen.complex_:
         return complexes
     elif seen.float_:
         return floats
     elif seen.int_:
+        if seen.null_ and convert_to_nullable_integer:
+            from pandas.core.arrays import IntegerArray
+            if seen.uint_:
+                return IntegerArray(uints, mask.view(np.bool_))
+            else:
+                return IntegerArray(ints, mask.view(np.bool_))
         if seen.uint_:
             return uints
         else:
diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
index 1e51a578c44ea..8b17d311747ac 100644
--- a/pandas/_libs/ops.pyx
+++ b/pandas/_libs/ops.pyx
@@ -24,10 +24,7 @@ import_array()
 
 
 from pandas._libs.missing cimport checknull
-from pandas._libs.util cimport (
-    UINT8_MAX,
-    is_nan,
-)
+from pandas._libs.util cimport is_nan
 
 
 @cython.wraparound(False)
@@ -258,17 +255,20 @@ def vec_binop(object[:] left, object[:] right, object op):
 
 
 def maybe_convert_bool(ndarray[object] arr,
-                       true_values=None, false_values=None):
+                       true_values=None,
+                       false_values=None,
+                       convert_to_nullable_boolean=False):
     cdef:
         Py_ssize_t i, n
         ndarray[uint8_t] result
+        ndarray[uint8_t] mask
         object val
         set true_vals, false_vals
-        int na_count = 0
+        bint has_na = False
 
     n = len(arr)
     result = np.empty(n, dtype=np.uint8)
-
+    mask = np.zeros(n, dtype=np.uint8)
     # the defaults
     true_vals = {'True', 'TRUE', 'true'}
     false_vals = {'False', 'FALSE', 'false'}
@@ -292,15 +292,19 @@ def maybe_convert_bool(ndarray[object] arr,
         elif val in false_vals:
             result[i] = 0
         elif isinstance(val, float):
-            result[i] = UINT8_MAX
-            na_count += 1
+            mask[i] = 1
+            result[i] = 0  # Placeholder; masked out or overwritten with NaN below
+            has_na = True
         else:
             return arr
 
-    if na_count > 0:
-        mask = result == UINT8_MAX
-        arr = result.view(np.bool_).astype(object)
-        np.putmask(arr, mask, np.nan)
-        return arr
+    if has_na:
+        if convert_to_nullable_boolean:
+            from pandas.core.arrays import BooleanArray
+            return BooleanArray(result.view(np.bool_), mask.view(np.bool_))
+        else:
+            arr = result.view(np.bool_).astype(object)
+            np.putmask(arr, mask, np.nan)
+            return arr
     else:
         return result.view(np.bool_)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index c4d98ccb88ba5..b50ab5f71cbe2 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -105,6 +105,11 @@ from pandas.core.dtypes.common import (
 )
 from pandas.core.dtypes.concat import union_categoricals
 
+from pandas.core.arrays import (
+    BooleanArray,
+    IntegerArray,
+)
+
 cdef:
     float64_t INF = <float64_t>np.inf
     float64_t NEGINF = -INF
@@ -323,6 +328,7 @@ cdef class TextReader:
         int64_t leading_cols, table_width, skipfooter, buffer_lines
         bint allow_leading_cols, mangle_dupe_cols, low_memory
         bint delim_whitespace
+        bint use_nullable_dtypes
         object delimiter, converters
         object na_values
         object header, orig_header, names, header_start, header_end
@@ -370,7 +376,8 @@ cdef class TextReader:
                   bint verbose=False,
                   bint mangle_dupe_cols=True,
                   float_precision=None,
-                  bint skip_blank_lines=True):
+                  bint skip_blank_lines=True,
+                  bint use_nullable_dtypes=False):
 
         # set encoding for native Python and C library
         self.c_encoding = NULL
@@ -430,6 +437,7 @@ cdef class TextReader:
             # consistent with csv module semantics, cast all to float
             dtype_order = dtype_order[1:]
         self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
+        self.use_nullable_dtypes = use_nullable_dtypes
 
         if comment is not None:
             if len(comment) > 1:
@@ -1021,7 +1029,8 @@ cdef class TextReader:
             # don't try to upcast EAs
             try_upcast = upcast_na and na_count > 0
             if try_upcast and not is_extension_array_dtype(col_dtype):
-                col_res = _maybe_upcast(col_res)
+                col_res = _maybe_upcast(col_res,
+                                        use_nullable_dtypes=self.use_nullable_dtypes)
 
             if col_res is None:
                 raise ParserError(f'Unable to parse column {i}')
@@ -1318,18 +1327,36 @@ STR_NA_VALUES = {
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 
-def _maybe_upcast(arr):
+def _maybe_upcast(arr, use_nullable_dtypes=False):
     """
+    Tries to upcast null values for integer and boolean data types.
+    If arr of boolean dtype, arr is upcast to object dtype or BooleanArray
+    and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray.
+
+    Parameters
+    ----------
+    arr : ndarray
+        Array to upcast.
+    use_nullable_dtypes : bool, default False
+        Whether to use nullable integer/boolean (IntegerArray/BooleanArray)
+        datatypes instead of upcasting.
 
     """
     if issubclass(arr.dtype.type, np.integer):
         na_value = na_values[arr.dtype]
-        arr = arr.astype(float)
-        np.putmask(arr, arr == na_value, np.nan)
+        mask = arr == na_value
+        if use_nullable_dtypes:
+            arr = IntegerArray(arr, mask)
+        else:
+            arr = arr.astype(float)
+            np.putmask(arr, mask, np.nan)
     elif arr.dtype == np.bool_:
         mask = arr.view(np.uint8) == na_values[np.uint8]
-        arr = arr.astype(object)
-        np.putmask(arr, mask, np.nan)
+        if use_nullable_dtypes:
+            arr = BooleanArray(arr, mask)
+        else:
+            arr = arr.astype(object)
+            np.putmask(arr, mask, np.nan)
 
     return arr
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 2d17978b60327..404c2e2df6288 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -54,6 +54,10 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core import algorithms
+from pandas.core.api import (
+    NA,
+    array as pd_array,
+)
 from pandas.core.arrays import Categorical
 from pandas.core.indexes.api import (
     Index,
@@ -109,6 +113,7 @@
     "mangle_dupe_cols": True,
     "infer_datetime_format": False,
     "skip_blank_lines": True,
+    "use_nullable_dtypes": False,
 }
 
 
@@ -199,6 +204,7 @@ def __init__(self, kwds):
 
         self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])
 
+        self.use_nullable_dtypes = kwds.get("use_nullable_dtypes", False)
         self.handles: Optional[IOHandles] = None
 
     def _open_handles(self, src: FilePathOrBuffer, kwds: Dict[str, Any]) -> None:
@@ -548,10 +554,7 @@ def _convert_to_ndarrays(
                 )
 
                 # type specified in dtype param or cast_type is an EA
-                if cast_type and (
-                    not is_dtype_equal(cvals, cast_type)
-                    or is_extension_array_dtype(cast_type)
-                ):
+                if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
                     if not is_ea and na_count > 0:
                         try:
                             if is_bool_dtype(cast_type):
@@ -645,12 +648,12 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         ----------
         values : ndarray
         na_values : set
-        try_num_bool : bool, default try
+        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean
 
         Returns
         -------
-        converted : ndarray
+        converted : ndarray or ExtensionArray
         na_count : int
         """
         na_count = 0
@@ -659,14 +662,24 @@ def _infer_types(self, values, na_values, try_num_bool=True):
             na_count = mask.sum()
             if na_count > 0:
                 if is_integer_dtype(values):
-                    values = values.astype(np.float64)
+                    if self.use_nullable_dtypes:
+                        values = pd_array(values, dtype="Int64")
+                        values[mask] = NA  # <- This is pd.NA
+                        return values, na_count
+                    else:
+                        values = values.astype(np.float64)
                 np.putmask(values, mask, np.nan)
             return values, na_count
 
         if try_num_bool and is_object_dtype(values.dtype):
             # exclude e.g DatetimeIndex here
             try:
-                result = lib.maybe_convert_numeric(values, na_values, False)
+                result = lib.maybe_convert_numeric(
+                    values,
+                    na_values,
+                    convert_empty=False,
+                    convert_to_nullable_integer=self.use_nullable_dtypes,
+                )
             except (ValueError, TypeError):
                 # e.g. encountering datetime string gets ValueError
                 #  TypeError can be raised in floatify
@@ -684,6 +697,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
                 np.asarray(values),
                 true_values=self.true_values,
                 false_values=self.false_values,
+                convert_to_nullable_boolean=self.use_nullable_dtypes,
             )
 
         return result, na_count
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
index edfc7ee0b6258..a02a9d1fdf90d 100644
--- a/pandas/io/parsers/readers.py
+++ b/pandas/io/parsers/readers.py
@@ -339,6 +339,13 @@
 
     .. versionchanged:: 1.2
 
+use_nullable_dtypes : bool, default False
+    If True, use dtypes that use pd.NA as missing value indicator for
+    the resulting DataFrame. Currently supports reading data into the nullable boolean
+    and integer array types, but not string arrays.
+
+    .. versionadded:: 1.3
+
 {storage_options}
 
     .. versionadded:: 1.2
@@ -524,6 +531,7 @@ def read_csv(
     low_memory=_c_parser_defaults["low_memory"],
     memory_map=False,
     float_precision=None,
+    use_nullable_dtypes=False,
     storage_options: StorageOptions = None,
 ):
     kwds = locals()
@@ -604,6 +612,7 @@ def read_table(
     low_memory=_c_parser_defaults["low_memory"],
     memory_map=False,
     float_precision=None,
+    use_nullable_dtypes=False,
 ):
     kwds = locals()
     del kwds["filepath_or_buffer"]
@@ -812,7 +821,6 @@ def _clean_options(self, options, engine):
 
         sep = options["delimiter"]
         delim_whitespace = options["delim_whitespace"]
-
         if sep is None and not delim_whitespace:
             if engine == "c":
                 fallback_reason = (
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 78a62c832833f..f46bbeedf7ff4 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -607,6 +607,18 @@ def test_maybe_convert_objects_nullable_integer(self, exp):
 
         tm.assert_extension_array_equal(result, exp)
 
+    @pytest.mark.parametrize(
+        "exp",
+        [
+            IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])),
+            IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])),
+        ],
+    )
+    def test_maybe_convert_numeric_nullable_integer(self, exp):
+        arr = np.array([2, np.NaN], dtype=object)
+        result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True)
+        tm.assert_extension_array_equal(result, exp)
+
     def test_maybe_convert_objects_bool_nan(self):
         # GH32146
         ind = Index([True, False, np.nan], dtype=object)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index fecba8bd81404..96b43286cc854 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -10,9 +10,11 @@
 from pandas._libs.parsers import STR_NA_VALUES
 
 from pandas import (
+    NA,
     DataFrame,
     Index,
     MultiIndex,
+    array as pd_array,
 )
 import pandas._testing as tm
 
@@ -146,20 +148,73 @@ def test_custom_na_values(all_parsers, na_values):
     tm.assert_frame_equal(result, expected)
 
 
-def test_bool_na_values(all_parsers):
+@pytest.mark.parametrize(
+    "use_nullable_dtypes, expected",
+    [
+        (
+            True,
+            DataFrame(
+                {
+                    "A": pd_array([True, NA, False], dtype="boolean"),
+                    "B": pd_array([False, True, NA], dtype="boolean"),
+                    "C": [True, False, True],
+                }
+            ),
+        ),
+        (
+            False,
+            DataFrame(
+                {
+                    "A": np.array([True, np.nan, False], dtype=object),
+                    "B": np.array([False, True, np.nan], dtype=object),
+                    "C": [True, False, True],
+                }
+            ),
+        ),
+    ],
+)
+def test_bool_na_values(all_parsers, use_nullable_dtypes, expected):
     data = """A,B,C
 True,False,True
 NA,True,False
 False,NA,True"""
     parser = all_parsers
-    result = parser.read_csv(StringIO(data))
-    expected = DataFrame(
-        {
-            "A": np.array([True, np.nan, False], dtype=object),
-            "B": np.array([False, True, np.nan], dtype=object),
-            "C": [True, False, True],
-        }
-    )
+    result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "use_nullable_dtypes, expected",
+    [
+        (
+            True,
+            DataFrame(
+                {
+                    "A": pd_array([1, NA, 2], dtype="Int64"),
+                    "B": pd_array([3, 2, NA], dtype="Int64"),
+                    "C": [1, 2, 3],
+                }
+            ),
+        ),
+        (
+            False,
+            DataFrame(
+                {
+                    "A": np.array([1.0, np.nan, 2.0], dtype="float64"),
+                    "B": np.array([3.0, 2.0, np.nan], dtype="float64"),
+                    "C": [1, 2, 3],
+                }
+            ),
+        ),
+    ],
+)
+def test_int_na_values(all_parsers, use_nullable_dtypes, expected):
+    data = """A,B,C
+1,3,1
+NA,2,2
+2,NA,3"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
     tm.assert_frame_equal(result, expected)
 
 

From a70f3a4c3ce634e8982af59c91682783d56079e7 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Mon, 29 Mar 2021 18:41:42 -0700
Subject: [PATCH 02/26] Updates

---
 pandas/_libs/lib.pyx                  | 2 +-
 pandas/tests/dtypes/test_inference.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 53980548ff6cf..7d7ca4e238dd0 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2015,7 +2015,7 @@ def maybe_convert_numeric(
 
     Returns
     -------
-    np.ndarray
+    np.ndarray or ExtensionArray
         Array of converted object values to numerical ones.
     """
     if len(values) == 0:
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index f46bbeedf7ff4..b8102cd613078 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -615,6 +615,7 @@ def test_maybe_convert_objects_nullable_integer(self, exp):
         ],
     )
     def test_maybe_convert_numeric_nullable_integer(self, exp):
+        # GH 40687
         arr = np.array([2, np.NaN], dtype=object)
         result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True)
         tm.assert_extension_array_equal(result, exp)

From 2504be6ef7545983ce24e6a0de59f1fa429adbb3 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Tue, 30 Mar 2021 08:50:30 -0700
Subject: [PATCH 03/26] More thorough testing

---
 pandas/_libs/parsers.pyx                 |  9 ++++++++
 pandas/tests/dtypes/test_inference.py    | 29 +++++++++++++++++++++++-
 pandas/tests/io/parser/test_na_values.py | 28 +++++++++++++----------
 3 files changed, 53 insertions(+), 13 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 7cea17327bdca..9256bbb7d018c 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1314,6 +1314,9 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
     If arr of boolean dtype, arr is upcast to object dtype or BooleanArray
     and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray.
 
+    Note: If all values are null, integer arrays are upcast to float64 and
+    boolean arrays to object dtype, even if use_nullable_dtypes is True.
+
     Parameters
     ----------
     arr : ndarray
@@ -1326,6 +1329,9 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
     if issubclass(arr.dtype.type, np.integer):
         na_value = na_values[arr.dtype]
         mask = arr == na_value
+        if np.count_nonzero(mask) == len(arr):
+            # Array of all NaN, dtype -> float64
+            use_nullable_dtypes = False
         if use_nullable_dtypes:
             arr = IntegerArray(arr, mask)
         else:
@@ -1333,6 +1339,9 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
             np.putmask(arr, mask, np.nan)
     elif arr.dtype == np.bool_:
         mask = arr.view(np.uint8) == na_values[np.uint8]
+        if np.count_nonzero(mask) == len(arr):
+            # Array of all NaN, dtype -> float64
+            use_nullable_dtypes = False
         if use_nullable_dtypes:
             arr = BooleanArray(arr, mask)
         else:
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index f057415070fe1..ac5e29de41768 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -24,6 +24,7 @@
 from pandas._libs import (
     lib,
     missing as libmissing,
+    ops as libops,
 )
 import pandas.util._test_decorators as td
 
@@ -60,7 +61,10 @@
     Timestamp,
 )
 import pandas._testing as tm
-from pandas.core.arrays import IntegerArray
+from pandas.core.arrays import (
+    BooleanArray,
+    IntegerArray,
+)
 
 
 @pytest.fixture(params=[True, False], ids=str)
@@ -415,6 +419,29 @@ def test_isneginf_scalar(self, value, expected):
         result = libmissing.isneginf_scalar(value)
         assert result is expected
 
+    @pytest.mark.parametrize(
+        "convert_to_nullable_boolean, exp",
+        [
+            (
+                True,
+                BooleanArray(
+                    np.array([True, False], dtype="bool"), np.array([False, True])
+                ),
+            ),
+            (False, np.array([True, np.nan], dtype="object")),
+        ],
+    )
+    def test_maybe_convert_nullable_boolean(self, convert_to_nullable_boolean, exp):
+        # GH 40687
+        arr = np.array([True, np.NaN], dtype=object)
+        result = libops.maybe_convert_bool(
+            arr, set(), convert_to_nullable_boolean=convert_to_nullable_boolean
+        )
+        if convert_to_nullable_boolean:
+            tm.assert_extension_array_equal(result, exp)
+        else:
+            tm.assert_numpy_array_equal(result, exp)
+
     @pytest.mark.parametrize("coerce_numeric", [True, False])
     @pytest.mark.parametrize(
         "infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"]
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 96b43286cc854..e38257a680e6d 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -157,7 +157,8 @@ def test_custom_na_values(all_parsers, na_values):
                 {
                     "A": pd_array([True, NA, False], dtype="boolean"),
                     "B": pd_array([False, True, NA], dtype="boolean"),
-                    "C": [True, False, True],
+                    "C": np.array([np.nan, np.nan, np.nan], dtype="float64"),
+                    "D": np.array([True, False, True], dtype="bool"),
                 }
             ),
         ),
@@ -167,17 +168,18 @@ def test_custom_na_values(all_parsers, na_values):
                 {
                     "A": np.array([True, np.nan, False], dtype=object),
                     "B": np.array([False, True, np.nan], dtype=object),
-                    "C": [True, False, True],
+                    "C": np.array([np.nan, np.nan, np.nan], dtype="float64"),
+                    "D": np.array([True, False, True], dtype="bool"),
                 }
             ),
         ),
     ],
 )
 def test_bool_na_values(all_parsers, use_nullable_dtypes, expected):
-    data = """A,B,C
-True,False,True
-NA,True,False
-False,NA,True"""
+    data = """A,B,C,D
+True,False,NA,True
+NA,True,NA,False
+False,NA,NA,True"""
     parser = all_parsers
     result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
     tm.assert_frame_equal(result, expected)
@@ -192,7 +194,8 @@ def test_bool_na_values(all_parsers, use_nullable_dtypes, expected):
                 {
                     "A": pd_array([1, NA, 2], dtype="Int64"),
                     "B": pd_array([3, 2, NA], dtype="Int64"),
-                    "C": [1, 2, 3],
+                    "C": pd_array([NA, 1, 2], dtype="Int64"),
+                    "D": np.array([1, 2, 3], dtype="int64"),
                 }
             ),
         ),
@@ -202,17 +205,18 @@ def test_bool_na_values(all_parsers, use_nullable_dtypes, expected):
                 {
                     "A": np.array([1.0, np.nan, 2.0], dtype="float64"),
                     "B": np.array([3.0, 2.0, np.nan], dtype="float64"),
-                    "C": [1, 2, 3],
+                    "C": np.array([np.nan, 1.0, 2.0], dtype="float64"),
+                    "D": np.array([1, 2, 3], dtype="int64"),
                 }
             ),
         ),
     ],
 )
 def test_int_na_values(all_parsers, use_nullable_dtypes, expected):
-    data = """A,B,C
-1,3,1
-NA,2,2
-2,NA,3"""
+    data = """A,B,C,D
+1,3,NA,1
+NA,2,1,2
+2,NA,2,3"""
     parser = all_parsers
     result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
     tm.assert_frame_equal(result, expected)

From 63733dcbc346d9ec2b0c9497d82c564493f9e870 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 31 Mar 2021 12:06:54 -0700
Subject: [PATCH 04/26] Optimizations & Found a bug!

---
 pandas/_libs/lib.pyx     | 2 +-
 pandas/_libs/ops.pyi     | 3 ++-
 pandas/_libs/parsers.pyx | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 3d4df9e53e603..46bfcfd6b49e6 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2081,8 +2081,8 @@ def maybe_convert_numeric(
                 seen.null_ = True
                 mask[i] = 1
             else:
-                floats[i] = complexes[i] = NaN
                 seen.saw_null()
+            floats[i] = complexes[i] = NaN
         elif util.is_float_object(val):
             fval = val
             if fval != fval:
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
index b4f42f217a5db..aace09c50bc28 100644
--- a/pandas/_libs/ops.pyi
+++ b/pandas/_libs/ops.pyi
@@ -39,5 +39,6 @@ def vec_binop(
 def maybe_convert_bool(
     arr: np.ndarray,  # np.ndarray[object]
     true_values=...,
-    false_values=...
+    false_values=...,
+    convert_to_nullable_boolean: bool = True,
 ) -> np.ndarray: ...
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 9256bbb7d018c..47d0826d8ea13 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1329,7 +1329,7 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
     if issubclass(arr.dtype.type, np.integer):
         na_value = na_values[arr.dtype]
         mask = arr == na_value
-        if np.count_nonzero(mask) == len(arr):
+        if mask.all():
             # Array of all NaN, dtype -> float64
             use_nullable_dtypes = False
         if use_nullable_dtypes:
@@ -1339,7 +1339,7 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
             np.putmask(arr, mask, np.nan)
     elif arr.dtype == np.bool_:
         mask = arr.view(np.uint8) == na_values[np.uint8]
-        if np.count_nonzero(mask) == len(arr):
+        if mask.all():
             # Array of all NaN, dtype -> float64
             use_nullable_dtypes = False
         if use_nullable_dtypes:

From 8baf12030e3be4872fb0c5c970f568eac80c639e Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Sun, 4 Apr 2021 12:19:12 -0700
Subject: [PATCH 05/26] WIP

---
 pandas/_libs/lib.pyx                     | 25 +++++++++++++-----
 pandas/_libs/parsers.pyx                 | 33 ++++++++++++++----------
 pandas/io/parsers/base_parser.py         | 30 +++++++++++++++------
 pandas/tests/dtypes/test_inference.py    | 25 ++++++++++++++++++
 pandas/tests/io/parser/test_na_values.py | 27 +++++++++++++++++++
 5 files changed, 113 insertions(+), 27 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 46bfcfd6b49e6..f1d42272ba8b1 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2007,6 +2007,7 @@ def maybe_convert_numeric(
     bint convert_empty=True,
     bint coerce_numeric=False,
     bint convert_to_nullable_integer=False,
+    bint convert_to_floating_array=False,
 ) -> "ArrayLike":
     """
     Convert object array to a numeric array if possible.
@@ -2034,7 +2035,9 @@ def maybe_convert_numeric(
     convert_to_nullable_integer : bool, default False
         If an array-like object contains only integer values (and NaN) is
         encountered, whether to convert and return an IntegerArray.
-
+    convert_to_floating_array : bool, default False
+        If an array-like object containing only float values (and NaN) is
+        encountered, whether to convert and return a FloatingArray.
     Returns
     -------
     np.ndarray or ExtensionArray
@@ -2074,21 +2077,26 @@ def maybe_convert_numeric(
         # a) convert_to_nullable_integer = True
         # b) no floats have been seen ( assuming an int shows up later )
         # However, if no ints present (all null array), we need to return floats
-        allow_nullable_dtypes = convert_to_nullable_integer and not seen.float_
+        allow_null_in_int = convert_to_nullable_integer and not seen.float_
 
         if val.__hash__ is not None and val in na_values:
-            if allow_nullable_dtypes:
+            if allow_null_in_int:
                 seen.null_ = True
                 mask[i] = 1
             else:
+                if convert_to_floating_array:
+                    mask[i] = 1
                 seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif util.is_float_object(val):
             fval = val
             if fval != fval:
-                mask[i] = 1
                 seen.null_ = True
-                if not allow_nullable_dtypes:
+                if allow_null_in_int:
+                    mask[i] = 1
+                else:
+                    if convert_to_floating_array:
+                        mask[i] = 1
                     seen.float_ = True
             else:
                 seen.float_ = True
@@ -2115,10 +2123,12 @@ def maybe_convert_numeric(
             floats[i] = uints[i] = ints[i] = bools[i] = val
             seen.bool_ = True
         elif val is None or val is C_NA:
-            if allow_nullable_dtypes:
+            if allow_null_in_int:
                 seen.null_ = True
                 mask[i] = 1
             else:
+                if convert_to_floating_array:
+                    mask[i] = 1
                 seen.saw_null()
             floats[i] = complexes[i] = NaN
         elif hasattr(val, '__len__') and len(val) == 0:
@@ -2192,6 +2202,9 @@ def maybe_convert_numeric(
     if seen.complex_:
         return complexes
     elif seen.float_:
+        if seen.null_ and convert_to_floating_array:
+            from pandas.core.arrays import FloatingArray
+            return FloatingArray(floats, mask.view(np.bool_))
         return floats
     elif seen.int_:
         if seen.null_ and convert_to_nullable_integer:
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 47d0826d8ea13..76fa6efd4f88d 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -92,6 +92,7 @@ from pandas._libs.khash cimport (
     kh_strbox_t,
     khiter_t,
 )
+from pandas._libs.missing cimport C_NA
 
 from pandas.errors import (
     DtypeWarning,
@@ -114,7 +115,9 @@ from pandas.core.dtypes.concat import union_categoricals
 
 from pandas.core.arrays import (
     BooleanArray,
+    FloatingArray,
     IntegerArray,
+    StringArray,
 )
 
 cdef:
@@ -1017,8 +1020,11 @@ cdef class TextReader:
             # don't try to upcast EAs
             try_upcast = upcast_na and na_count > 0
             if try_upcast and not is_extension_array_dtype(col_dtype):
-                col_res = _maybe_upcast(col_res,
-                                        use_nullable_dtypes=self.use_nullable_dtypes)
+                if na_count < len(col_res):
+                    col_res = _maybe_upcast(col_res, self.use_nullable_dtypes)
+                else:
+                    # All NaN -> float64
+                    col_res = col_res.astype("float64")
 
             if col_res is None:
                 raise ParserError(f'Unable to parse column {i}')
@@ -1314,9 +1320,6 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
     If arr of boolean dtype, arr is upcast to object dtype or BooleanArray
     and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray.
 
-    Note: If all values are null, array will be upcast to float64 even if
-    use_nullable_dtypes is True
-
     Parameters
     ----------
     arr : ndarray
@@ -1326,27 +1329,31 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
         datatypes instead of upcasting.
 
     """
+    na_value = na_values[arr.dtype]
     if issubclass(arr.dtype.type, np.integer):
-        na_value = na_values[arr.dtype]
         mask = arr == na_value
-        if mask.all():
-            # Array of all NaN, dtype -> float64
-            use_nullable_dtypes = False
         if use_nullable_dtypes:
             arr = IntegerArray(arr, mask)
         else:
             arr = arr.astype(float)
             np.putmask(arr, mask, np.nan)
     elif arr.dtype == np.bool_:
-        mask = arr.view(np.uint8) == na_values[np.uint8]
-        if mask.all():
-            # Array of all NaN, dtype -> float64
-            use_nullable_dtypes = False
+        mask = arr.view(np.uint8) == na_value
         if use_nullable_dtypes:
             arr = BooleanArray(arr, mask)
         else:
             arr = arr.astype(object)
             np.putmask(arr, mask, np.nan)
+    elif use_nullable_dtypes and arr.dtype == np.floating:
+        mask = arr == na_value
+        if mask.any():
+            arr = FloatingArray(arr, mask)
+    elif use_nullable_dtypes and arr.dtype == np.object_:
+        # Maybe convert StringArray & catch error for non-strings
+        try:
+            arr = StringArray(arr)
+        except ValueError:
+            pass
 
     return arr
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 4667b28fce3e2..9fa1dc0f9ec8a 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -58,7 +58,10 @@
     NA,
     array as pd_array,
 )
-from pandas.core.arrays import Categorical
+from pandas.core.arrays import (
+    Categorical,
+    StringArray,
+)
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
@@ -689,6 +692,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
                     na_values,
                     convert_empty=False,
                     convert_to_nullable_integer=self.use_nullable_dtypes,
+                    convert_to_floating_array=self.use_nullable_dtypes,
                 )
             except (ValueError, TypeError):
                 # e.g. encountering datetime string gets ValueError
@@ -702,13 +706,23 @@ def _infer_types(self, values, na_values, try_num_bool=True):
             if values.dtype == np.object_:
                 na_count = parsers.sanitize_objects(values, na_values, False)
 
-        if result.dtype == np.object_ and try_num_bool:
-            result = libops.maybe_convert_bool(
-                np.asarray(values),
-                true_values=self.true_values,
-                false_values=self.false_values,
-                convert_to_nullable_boolean=self.use_nullable_dtypes,
-            )
+        if result.dtype == np.object_:
+            if try_num_bool:
+                result = libops.maybe_convert_bool(
+                    np.asarray(values),
+                    true_values=self.true_values,
+                    false_values=self.false_values,
+                    convert_to_nullable_boolean=self.use_nullable_dtypes,
+                )
+            # Maybe StringArray? Must have NA value to trigger
+            # Since it is called use_nullable_dtypes after all
+            # However, all NA -> float64 not StringArray
+            if self.use_nullable_dtypes and na_count > 0 and na_count < len(result):
+                try:
+                    result = StringArray(result)
+                except ValueError as e:
+                    print(e)
+                    pass
 
         return result, na_count
 
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index ac5e29de41768..858da789f998a 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -63,7 +63,9 @@
 import pandas._testing as tm
 from pandas.core.arrays import (
     BooleanArray,
+    FloatingArray,
     IntegerArray,
+    StringArray,
 )
 
 
@@ -647,6 +649,29 @@ def test_maybe_convert_numeric_nullable_integer(self, exp):
         result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True)
         tm.assert_extension_array_equal(result, exp)
 
+    @pytest.mark.parametrize(
+        "convert_to_floating_array, exp",
+        [
+            (
+                True,
+                FloatingArray(
+                    np.array([2.0, 0.0], dtype="float64"), np.array([False, True])
+                ),
+            ),
+            (False, np.array([2.0, np.nan])),
+        ],
+    )
+    def test_maybe_convert_numeric_floating_array(self, convert_to_floating_array, exp):
+        # GH 40687
+        arr = np.array([2, np.nan], dtype=object)
+        result = lib.maybe_convert_numeric(
+            arr, set(), convert_to_floating_array=convert_to_floating_array
+        )
+        if convert_to_floating_array:
+            tm.assert_extension_array_equal(result, exp)
+        else:
+            tm.assert_numpy_array_equal(result, exp)
+
     def test_maybe_convert_objects_bool_nan(self):
         # GH32146
         ind = Index([True, False, np.nan], dtype=object)
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index e38257a680e6d..248a9472bb61a 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -222,6 +222,33 @@ def test_int_na_values(all_parsers, use_nullable_dtypes, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "use_nullable_dtypes, expected",
+    [
+        (
+            True,
+            DataFrame(
+                {
+                    "A": np.array(["hi", "hello", "hey"], dtype=object),
+                    "B": pd_array(["hi", NA, "hello"], dtype="string"),
+                    "C": pd_array([NA, "hi", "hey"], dtype="string"),
+                    "D": np.array([np.nan, np.nan, np.nan], dtype="float64"),
+                }
+            ),
+        )
+    ],
+)
+def test_string_na_values(all_parsers, use_nullable_dtypes, expected):
+    data = """A,B,C,D
+hi,hi,NA,NA
+hello,NA,hi,NA
+hey,hello,hey,NA"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
+    print(result)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_na_value_dict(all_parsers):
     data = """A,B,C
 foo,bar,NA

From c3ee88320a5660a973fefff878442d59758a4ca8 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Mon, 12 Apr 2021 19:41:15 -0700
Subject: [PATCH 06/26] float support working

---
 pandas/_libs/parsers.pyx                 | 35 +++++++++++---------
 pandas/io/parsers/base_parser.py         |  7 ++--
 pandas/tests/io/parser/test_na_values.py | 42 ++++++++++++++++++++++--
 3 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 76fa6efd4f88d..db2c11f8d2de3 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1020,11 +1020,7 @@ cdef class TextReader:
             # don't try to upcast EAs
             try_upcast = upcast_na and na_count > 0
             if try_upcast and not is_extension_array_dtype(col_dtype):
-                if na_count < len(col_res):
-                    col_res = _maybe_upcast(col_res, self.use_nullable_dtypes)
-                else:
-                    # All NaN -> float64
-                    col_res = col_res.astype("float64")
+                col_res = _maybe_upcast(col_res, self.use_nullable_dtypes)
 
             if col_res is None:
                 raise ParserError(f'Unable to parse column {i}')
@@ -1316,24 +1312,32 @@ _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 def _maybe_upcast(arr, use_nullable_dtypes=False):
     """
-    Tries to upcast null values for integer and boolean data types.
-    If arr of boolean dtype, arr is upcast to object dtype or BooleanArray
-    and if arr is of integer dtype, arr is upcast to float dtype, or IntegerArray.
+    Tries to upcast null values or use nullable dtypes if set to True.
+
 
     Parameters
     ----------
     arr : ndarray
         Array to upcast.
     use_nullable_dtypes: bool, default False
-        Whether to use nullable integer/boolean(IntegerArray/BooleanArray)
-        datatypes instead of upcasting.
+        Whether to use nullable datatypes instead of upcasting.
+        If true, then:
+            - int w/ NaN -> IntegerArray
+            - bool w/ NaN -> BooleanArray
+            - float w/NaN -> FloatingArray
+            - object(strings) w/NaN -> StringArray
 
     """
     na_value = na_values[arr.dtype]
     if issubclass(arr.dtype.type, np.integer):
         mask = arr == na_value
         if use_nullable_dtypes:
-            arr = IntegerArray(arr, mask)
+            # only convert to integer array if not all NAN
+            if not mask.all():
+                arr = IntegerArray(arr, mask)
+            else:
+                arr = arr.astype(float)
+                arr = FloatingArray(arr, mask)
         else:
             arr = arr.astype(float)
             np.putmask(arr, mask, np.nan)
@@ -1344,15 +1348,14 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
         else:
             arr = arr.astype(object)
             np.putmask(arr, mask, np.nan)
-    elif use_nullable_dtypes and arr.dtype == np.floating:
-        mask = arr == na_value
-        if mask.any():
-            arr = FloatingArray(arr, mask)
+    elif use_nullable_dtypes and arr.dtype == np.float64:
+        mask = np.isnan(arr)
+        arr = FloatingArray(arr, mask)
     elif use_nullable_dtypes and arr.dtype == np.object_:
         # Maybe convert StringArray & catch error for non-strings
         try:
             arr = StringArray(arr)
-        except ValueError:
+        except ValueError as e:
             pass
 
     return arr
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 9fa1dc0f9ec8a..60075b9ee61f1 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -716,12 +716,11 @@ def _infer_types(self, values, na_values, try_num_bool=True):
                 )
             # Maybe StringArray? Must have NA value to trigger
             # Since it is called use_nullable_dtypes after all
-            # However, all NA -> float64 not StringArray
-            if self.use_nullable_dtypes and na_count > 0 and na_count < len(result):
+            # However, all NA -> Float64 not StringArray
+            if self.use_nullable_dtypes and 0 < na_count < len(result):
                 try:
                     result = StringArray(result)
-                except ValueError as e:
-                    print(e)
+                except ValueError:
                     pass
 
         return result, na_count
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index 248a9472bb61a..a22e380076bf3 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -157,7 +157,7 @@ def test_custom_na_values(all_parsers, na_values):
                 {
                     "A": pd_array([True, NA, False], dtype="boolean"),
                     "B": pd_array([False, True, NA], dtype="boolean"),
-                    "C": np.array([np.nan, np.nan, np.nan], dtype="float64"),
+                    "C": pd_array([np.nan, np.nan, np.nan], dtype="Float64"),
                     "D": np.array([True, False, True], dtype="bool"),
                 }
             ),
@@ -222,6 +222,44 @@ def test_int_na_values(all_parsers, use_nullable_dtypes, expected):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "use_nullable_dtypes, expected",
+    [
+        (
+            True,
+            DataFrame(
+                {
+                    "A": pd_array([1.0, NA, 2.0], dtype="Float64"),
+                    "B": np.array([3.0, 2.0, 1.0], dtype="float64"),
+                    "C": pd_array([NA, 1.0, 2.0], dtype="Float64"),
+                    "D": pd_array([NA, NA, NA], dtype="Float64"),
+                }
+            ),
+        ),
+        (
+            False,
+            DataFrame(
+                {
+                    "A": np.array([1.0, np.nan, 2.0], dtype="float64"),
+                    "B": np.array([3.0, 2.0, 1.0], dtype="float64"),
+                    "C": np.array([np.nan, 1.0, 2.0], dtype="float64"),
+                    "D": np.array([np.nan, np.nan, np.nan], dtype="float64"),
+                }
+            ),
+        ),
+    ],
+)
+def test_float_na_values(all_parsers, use_nullable_dtypes, expected):
+    data = """A,B,C,D
+1.0,3,NA,NA
+NA,2,1.0,NA
+2,1.0,2.0,NA"""
+    parser = all_parsers
+    result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
+    print(result)
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "use_nullable_dtypes, expected",
     [
@@ -232,7 +270,7 @@ def test_int_na_values(all_parsers, use_nullable_dtypes, expected):
                     "A": np.array(["hi", "hello", "hey"], dtype=object),
                     "B": pd_array(["hi", NA, "hello"], dtype="string"),
                     "C": pd_array([NA, "hi", "hey"], dtype="string"),
-                    "D": np.array([np.nan, np.nan, np.nan], dtype="float64"),
+                    "D": pd_array([np.nan, np.nan, np.nan], dtype="Float64"),
                 }
             ),
         )

From 6d49eaf56ead0650778c6dfa87df4ea92adcb29c Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Mon, 12 Apr 2021 19:48:55 -0700
Subject: [PATCH 07/26] Update lib.pyi stub for maybe_convert_numeric; drop StringArray test import

---
 pandas/_libs/lib.pyi                  | 2 ++
 pandas/tests/dtypes/test_inference.py | 1 -
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 477c9fd655a4a..0bbe375d235d7 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -75,6 +75,8 @@ def maybe_convert_numeric(
     na_values: set,
     convert_empty: bool = True,
     coerce_numeric: bool = False,
+    convert_to_nullable_integer: bool = False,
+    convert_to_floating_array: bool = False
 ) -> np.ndarray: ...
 
 # TODO: restrict `arr`?
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 457f8f6c54667..497fb94b957c8 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -65,7 +65,6 @@
     BooleanArray,
     FloatingArray,
     IntegerArray,
-    StringArray,
 )
 
 

From 680ffb1426264dc7cf500984dee856a8d4b9d449 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Sun, 18 Apr 2021 14:12:36 -0700
Subject: [PATCH 08/26] Address code review

---
 pandas/_libs/lib.pyi                  |  5 +-
 pandas/_libs/lib.pyx                  | 41 +++++++---------
 pandas/_libs/ops.pyx                  | 10 ++--
 pandas/tests/dtypes/test_inference.py | 68 ++++++++++++++++++---------
 4 files changed, 71 insertions(+), 53 deletions(-)

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 0bbe375d235d7..bc1e4a3f28ef2 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -75,9 +75,8 @@ def maybe_convert_numeric(
     na_values: set,
     convert_empty: bool = True,
     coerce_numeric: bool = False,
-    convert_to_nullable_integer: bool = False,
-    convert_to_floating_array: bool = False
-) -> np.ndarray: ...
+    convert_to_masked_nullable: bool = False,
+) -> np.ndarray | (np.ndarray,np.ndarray): ...
 
 # TODO: restrict `arr`?
 def ensure_string_array(
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e19c4ff17470d..7ed0329751863 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2007,8 +2007,7 @@ def maybe_convert_numeric(
     set na_values,
     bint convert_empty=True,
     bint coerce_numeric=False,
-    bint convert_to_nullable_integer=False,
-    bint convert_to_floating_array=False,
+    bint convert_to_masked_nullable=False,
 ) -> "ArrayLike":
     """
     Convert object array to a numeric array if possible.
@@ -2033,16 +2032,14 @@ def maybe_convert_numeric(
         numeric array has no suitable numerical dtype to return (i.e. uint64,
         int32, uint8). If set to False, the original object array will be
         returned. Otherwise, a ValueError will be raised.
-    convert_to_nullable_integer : bool, default False
-        If an array-like object contains only integer values (and NaN) is
-        encountered, whether to convert and return an IntegerArray.
-    convert_to_floating_array : bool, default False
-        If an array-like object contains only float values (and NaN) is
-        encountered, whether to convert and return an FloatingArray.
+    convert_to_masked_nullable : bool, default False
+        Whether to return a mask for the converted values. This also disables
+        upcasting for ints with nulls to float64.
     Returns
     -------
-    np.ndarray or ExtensionArray
+    np.ndarray or tuple of converted values and its mask
         Array of converted object values to numerical ones.
+        Also returns mask if convert_to_masked_nullable is True.
     """
     if len(values) == 0:
         return np.array([], dtype='i8')
@@ -2075,17 +2072,17 @@ def maybe_convert_numeric(
     for i in range(n):
         val = values[i]
         # We only want to disable NaNs showing as float if
-        # a) convert_to_nullable_integer = True
+        # a) convert_to_masked_nullable = True
         # b) no floats have been seen ( assuming an int shows up later )
         # However, if no ints present (all null array), we need to return floats
-        allow_null_in_int = convert_to_nullable_integer and not seen.float_
+        allow_null_in_int = convert_to_masked_nullable and not seen.float_
 
         if val.__hash__ is not None and val in na_values:
             if allow_null_in_int:
                 seen.null_ = True
                 mask[i] = 1
             else:
-                if convert_to_floating_array:
+                if convert_to_masked_nullable:
                     mask[i] = 1
                 seen.saw_null()
             floats[i] = complexes[i] = NaN
@@ -2096,7 +2093,7 @@ def maybe_convert_numeric(
                 if allow_null_in_int:
                     mask[i] = 1
                 else:
-                    if convert_to_floating_array:
+                    if convert_to_masked_nullable:
                         mask[i] = 1
                     seen.float_ = True
             else:
@@ -2128,7 +2125,7 @@ def maybe_convert_numeric(
                 seen.null_ = True
                 mask[i] = 1
             else:
-                if convert_to_floating_array:
+                if convert_to_masked_nullable:
                     mask[i] = 1
                 seen.saw_null()
             floats[i] = complexes[i] = NaN
@@ -2164,7 +2161,7 @@ def maybe_convert_numeric(
                     if as_int in na_values:
                         mask[i] = 1
                         seen.null_ = True
-                        if not convert_to_nullable_integer:
+                        if not allow_null_in_int:
                             seen.float_ = True
                     else:
                         seen.saw_int(as_int)
@@ -2197,23 +2194,21 @@ def maybe_convert_numeric(
 
     # This occurs since we disabled float nulls showing as null in anticipation
     # of seeing ints that were never seen. So then, we return float
-    if convert_to_nullable_integer and seen.null_ and not seen.int_:
+    if allow_null_in_int and seen.null_ and not seen.int_:
         seen.float_ = True
 
     if seen.complex_:
         return complexes
     elif seen.float_:
-        if seen.null_ and convert_to_floating_array:
-            from pandas.core.arrays import FloatingArray
-            return FloatingArray(floats, mask.view(np.bool_))
+        if seen.null_ and convert_to_masked_nullable:
+            return (floats, mask.view(np.bool_))
         return floats
     elif seen.int_:
-        if seen.null_ and convert_to_nullable_integer:
-            from pandas.core.arrays import IntegerArray
+        if seen.null_ and convert_to_masked_nullable:
             if seen.uint_:
-                return IntegerArray(uints, mask.view(np.bool_))
+                return (uints, mask.view(np.bool_))
             else:
-                return IntegerArray(ints, mask.view(np.bool_))
+                return (ints, mask.view(np.bool_))
         if seen.uint_:
             return uints
         else:
diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx
index 96d7d6d9870e6..a2b7354e61e53 100644
--- a/pandas/_libs/ops.pyx
+++ b/pandas/_libs/ops.pyx
@@ -257,7 +257,8 @@ def vec_binop(object[:] left, object[:] right, object op) -> ndarray:
 def maybe_convert_bool(ndarray[object] arr,
                        true_values=None,
                        false_values=None,
-                       convert_to_nullable_boolean=False) -> "ArrayLike":
+                       convert_to_masked_nullable=False
+                       ) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
     cdef:
         Py_ssize_t i, n
         ndarray[uint8_t] result
@@ -291,7 +292,7 @@ def maybe_convert_bool(ndarray[object] arr,
             result[i] = 1
         elif val in false_vals:
             result[i] = 0
-        elif isinstance(val, float):
+        elif is_nan(val):
             mask[i] = 1
             result[i] = 0  # Value here doesn't matter, will be replaced w/ nan
             has_na = True
@@ -299,9 +300,8 @@ def maybe_convert_bool(ndarray[object] arr,
             return arr
 
     if has_na:
-        if convert_to_nullable_boolean:
-            from pandas.core.arrays import BooleanArray
-            return BooleanArray(result.view(np.bool_), mask.view(np.bool_))
+        if convert_to_masked_nullable:
+            return (result.view(np.bool_), mask.view(np.bool_))
         else:
             arr = result.view(np.bool_).astype(object)
             np.putmask(arr, mask, np.nan)
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 497fb94b957c8..5e8be9b8b987b 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -421,7 +421,7 @@ def test_isneginf_scalar(self, value, expected):
         assert result is expected
 
     @pytest.mark.parametrize(
-        "convert_to_nullable_boolean, exp",
+        "convert_to_masked_nullable, exp",
         [
             (
                 True,
@@ -432,14 +432,14 @@ def test_isneginf_scalar(self, value, expected):
             (False, np.array([True, np.nan], dtype="object")),
         ],
     )
-    def test_maybe_convert_nullable_boolean(self, convert_to_nullable_boolean, exp):
+    def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp):
         # GH 40687
         arr = np.array([True, np.NaN], dtype=object)
         result = libops.maybe_convert_bool(
-            arr, set(), convert_to_nullable_boolean=convert_to_nullable_boolean
+            arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
         )
-        if convert_to_nullable_boolean:
-            tm.assert_extension_array_equal(result, exp)
+        if convert_to_masked_nullable:
+            tm.assert_extension_array_equal(BooleanArray(*result), exp)
         else:
             tm.assert_numpy_array_equal(result, exp)
 
@@ -546,16 +546,30 @@ def test_convert_numeric_uint64_nan_values(self, coerce):
             np.array([str(-1), str(2 ** 63)], dtype=object),
         ],
     )
-    def test_convert_numeric_int64_uint64(self, case, coerce):
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_convert_numeric_int64_uint64(
+        self, case, coerce, convert_to_masked_nullable
+    ):
         expected = case.astype(float) if coerce else case.copy()
-        result = lib.maybe_convert_numeric(case, set(), coerce_numeric=coerce)
+        result = lib.maybe_convert_numeric(
+            case,
+            set(),
+            coerce_numeric=coerce,
+            convert_to_masked_nullable=convert_to_masked_nullable,
+        )
         tm.assert_almost_equal(result, expected)
 
-    def test_convert_numeric_string_uint64(self):
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_convert_numeric_string_uint64(self, convert_to_masked_nullable):
         # GH32394
         result = lib.maybe_convert_numeric(
-            np.array(["uint64"], dtype=object), set(), coerce_numeric=True
+            np.array(["uint64"], dtype=object),
+            set(),
+            coerce_numeric=True,
+            convert_to_masked_nullable=convert_to_masked_nullable,
         )
+        if convert_to_masked_nullable:
+            result = FloatingArray(*result)
         assert np.isnan(result)
 
     @pytest.mark.parametrize("value", [-(2 ** 63) - 1, 2 ** 64])
@@ -636,20 +650,28 @@ def test_maybe_convert_objects_nullable_integer(self, exp):
         tm.assert_extension_array_equal(result, exp)
 
     @pytest.mark.parametrize(
-        "exp",
+        "convert_to_masked_nullable, exp",
         [
-            IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])),
-            IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])),
+            (True, IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True]))),
+            (False, np.array([2, np.nan], dtype="float64")),
         ],
     )
-    def test_maybe_convert_numeric_nullable_integer(self, exp):
+    def test_maybe_convert_numeric_nullable_integer(
+        self, convert_to_masked_nullable, exp
+    ):
         # GH 40687
         arr = np.array([2, np.NaN], dtype=object)
-        result = lib.maybe_convert_numeric(arr, set(), convert_to_nullable_integer=True)
-        tm.assert_extension_array_equal(result, exp)
+        result = lib.maybe_convert_numeric(
+            arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
+        )
+        if convert_to_masked_nullable:
+            result = IntegerArray(*result)
+            tm.assert_extension_array_equal(result, exp)
+        else:
+            tm.assert_numpy_array_equal(result, exp)
 
     @pytest.mark.parametrize(
-        "convert_to_floating_array, exp",
+        "convert_to_masked_nullable, exp",
         [
             (
                 True,
@@ -657,17 +679,19 @@ def test_maybe_convert_numeric_nullable_integer(self, exp):
                     np.array([2.0, 0.0], dtype="float64"), np.array([False, True])
                 ),
             ),
-            (False, np.array([2.0, np.nan])),
+            (False, np.array([2.0, np.nan], dtype="float64")),
         ],
     )
-    def test_maybe_convert_numeric_floating_array(self, convert_to_floating_array, exp):
+    def test_maybe_convert_numeric_floating_array(
+        self, convert_to_masked_nullable, exp
+    ):
         # GH 40687
-        arr = np.array([2, np.nan], dtype=object)
+        arr = np.array([2.0, np.nan], dtype=object)
         result = lib.maybe_convert_numeric(
-            arr, set(), convert_to_floating_array=convert_to_floating_array
+            arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
         )
-        if convert_to_floating_array:
-            tm.assert_extension_array_equal(result, exp)
+        if convert_to_masked_nullable:
+            tm.assert_extension_array_equal(FloatingArray(*result), exp)
         else:
             tm.assert_numpy_array_equal(result, exp)
 

From d075bed2028d1df2278cefa729a8c9736388ac58 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 5 May 2021 16:41:05 -0700
Subject: [PATCH 09/26] Finish updating code to master

---
 pandas/io/parsers/base_parser.py | 51 +++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 795076efadb12..8b41363d57d57 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -44,6 +44,7 @@
     is_dict_like,
     is_dtype_equal,
     is_extension_array_dtype,
+    is_float_dtype,
     is_integer,
     is_integer_dtype,
     is_list_like,
@@ -61,7 +62,10 @@
     array as pd_array,
 )
 from pandas.core.arrays import (
+    BooleanArray,
     Categorical,
+    FloatingArray,
+    IntegerArray,
     StringArray,
 )
 from pandas.core.indexes.api import (
@@ -707,10 +711,12 @@ def _infer_types(self, values, na_values, try_num_bool=True):
         if try_num_bool and is_object_dtype(values.dtype):
             # exclude e.g DatetimeIndex here
             try:
-                result, mask = lib.maybe_convert_numeric(values, 
-                                   na_values, 
-                                   False, 
-                                   convert_to_masked_nullable=self.use_nullable_dtypes)
+                result, mask = lib.maybe_convert_numeric(
+                    values,
+                    na_values,
+                    False,
+                    convert_to_masked_nullable=self.use_nullable_dtypes,
+                )
                 if mask is not None:
                     if is_integer_dtype(result):
                         result = IntegerArray(result, mask)
@@ -728,22 +734,27 @@ def _infer_types(self, values, na_values, try_num_bool=True):
             if values.dtype == np.object_:
                 na_count = parsers.sanitize_objects(values, na_values, False)
 
-        if result.dtype == np.object_:
-            if try_num_bool:
-                result = libops.maybe_convert_bool(
-                    np.asarray(values),
-                    true_values=self.true_values,
-                    false_values=self.false_values,
-                    convert_to_masked_nullable=self.use_nullable_dtypes,
-                )
-            # Maybe StringArray? Must have NA value to trigger
-            # Since it is called use_nullable_dtypes after all
-            # However, all NA -> Float64 not StringArray
-            if self.use_nullable_dtypes and 0 < na_count < len(result):
-                try:
-                    result = StringArray(result)
-                except ValueError:
-                    pass
+        if result.dtype == np.object_ and try_num_bool:
+            result, mask = libops.maybe_convert_bool(
+                np.asarray(values),
+                true_values=self.true_values,
+                false_values=self.false_values,
+                convert_to_masked_nullable=self.use_nullable_dtypes,
+            )
+            if mask is not None:
+                result = BooleanArray(result, mask)
+        # Maybe StringArray? Must have NA value to trigger
+        # Since it is called use_nullable_dtypes after all
+        # However, all NA -> Float64 not StringArray
+        if (
+            result.dtype == np.object_
+            and self.use_nullable_dtypes
+            and 0 < na_count < len(result)
+        ):
+            try:
+                result = StringArray(result)
+            except ValueError:
+                pass
 
         return result, na_count
 

From dc03d0ffbdf419c7240a24490511f6f7795b0486 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 5 May 2021 18:04:34 -0700
Subject: [PATCH 10/26] Preliminary StringArray support

---
 pandas/_libs/lib.pyx                     | 13 +++++++++++--
 pandas/core/arrays/string_.py            | 16 ++++++++++++----
 pandas/tests/io/parser/test_na_values.py |  1 -
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 8d6b5cac5847c..a61fa102cfd90 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
         bint convert_na_value=True,
+        bint coerce=True,
         bint copy=True,
         bint skipna=True,
 ):
     """
-    Returns a new numpy array with object dtype and only strings and na values.
+    Checks that all elements in numpy are string or null and returns a new numpy array
+    with object dtype and only strings and na values if so. Otherwise,
+    raise a ValueError.
 
     Parameters
     ----------
@@ -693,6 +696,9 @@ cpdef ndarray[object] ensure_string_array(
         The value to use for na. For example, np.nan or pd.NA.
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
+    coerce : bool, default True
+        Whether to coerce non-null non-string elements to strings.
+        Will raise ValueError otherwise.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -724,7 +730,10 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            result[i] = str(val)
+            if coerce:
+                result[i] = str(val)
+            else:
+                raise ValueError("Non-string element encountered in array.")
         else:
             if convert_na_value:
                 val = na_value
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 74ca5130ca322..55ef2dde65674 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -208,21 +208,29 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype())
-        if not isinstance(values, type(self)):
-            self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            NDArrayBacked.__init__(
+                self,
+                lib.ensure_string_array(
+                    self._ndarray, na_value=StringDtype.na_value, coerce=False
+                ),
+                StringDtype(),
+            )
+        except ValueError:
+            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py
index a22e380076bf3..f63476b5a4dd6 100644
--- a/pandas/tests/io/parser/test_na_values.py
+++ b/pandas/tests/io/parser/test_na_values.py
@@ -283,7 +283,6 @@ def test_string_na_values(all_parsers, use_nullable_dtypes, expected):
 hey,hello,hey,NA"""
     parser = all_parsers
     result = parser.read_csv(StringIO(data), use_nullable_dtypes=use_nullable_dtypes)
-    print(result)
     tm.assert_frame_equal(result, expected)
 
 

From 99afeb4afd361ad369c0dc42a90b3068e86a8d17 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 5 May 2021 21:20:25 -0700
Subject: [PATCH 11/26] Fix tests: validate StringArray in place, allow nan-likes in constructor tests

---
 pandas/core/arrays/string_.py              |  7 ++++---
 pandas/tests/arrays/string_/test_string.py | 13 ++++++-------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 55ef2dde65674..875c64fa76273 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -222,11 +222,12 @@ def _validate(self):
                 f"'{self._ndarray.dtype}' dtype instead."
             )
         try:
+            lib.ensure_string_array(
+                self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False
+            ),
             NDArrayBacked.__init__(
                 self,
-                lib.ensure_string_array(
-                    self._ndarray, na_value=StringDtype.na_value, coerce=False
-                ),
+                self._ndarray,
                 StringDtype(),
             )
         except ValueError:
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 17d05ebeb0fc5..7feb22f69632a 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -296,14 +296,13 @@ def test_constructor_raises(cls):
     with pytest.raises(ValueError, match=msg):
         cls(np.array([]))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
-
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
+def test_constructor_nan_like(na):
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+    tm.assert_extension_array_equal(
+        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    )
 
 
 @pytest.mark.parametrize("copy", [True, False])

From 3e1784de50a9305dbccb185d4b2830a4c4addbed Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Mon, 10 May 2021 16:19:47 -0700
Subject: [PATCH 12/26] API: allow nan-likes in StringArray constructor

---
 doc/source/whatsnew/v1.3.0.rst             |  1 +
 pandas/_libs/lib.pyx                       | 24 ++++++++++++++------
 pandas/core/arrays/string_.py              | 26 +++++++++++++++++-----
 pandas/tests/arrays/string_/test_string.py | 12 +++++-----
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 5adc8540e6864..fd246cb554d7f 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -612,6 +612,7 @@ Other API changes
 - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects.
 - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
 - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`)
+- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings.
 
 Build
 =====
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index e1cb744c7033c..fcb6d39bfc91f 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -679,11 +679,14 @@ cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
         bint convert_na_value=True,
+        bint coerce=True,
         bint copy=True,
         bint skipna=True,
 ):
     """
-    Returns a new numpy array with object dtype and only strings and na values.
+    Checks that all elements in numpy are string or null and returns a new numpy array
+    with object dtype and only strings and na values if so. Otherwise,
+    raise a ValueError.
 
     Parameters
     ----------
@@ -693,6 +696,9 @@ cpdef ndarray[object] ensure_string_array(
         The value to use for na. For example, np.nan or pd.NA.
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
+    coerce : bool, default True
+        Whether to coerce non-null non-string elements to strings.
+        Will raise ValueError otherwise.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -724,7 +730,10 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            result[i] = str(val)
+            if coerce:
+                result[i] = str(val)
+            else:
+                raise ValueError("Non-string element encountered in array.")
         else:
             if convert_na_value:
                 val = na_value
@@ -1835,10 +1844,6 @@ cdef class StringValidator(Validator):
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.str_)
 
-    cdef bint is_valid_null(self, object value) except -1:
-        # We deliberately exclude None / NaN here since StringArray uses NA
-        return value is C_NA
-
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
     cdef:
@@ -2059,7 +2064,7 @@ def maybe_convert_numeric(
         upcasting for ints with nulls to float64.
     Returns
     -------
-    np.ndarray
+    np.ndarray or tuple of converted values and its mask
         Array of converted object values to numerical ones.
 
     Optional[np.ndarray]
@@ -2224,6 +2229,11 @@ def maybe_convert_numeric(
     if allow_null_in_int and seen.null_ and not seen.int_:
         seen.float_ = True
 
+    # This occurs since we disabled float nulls showing as null in anticipation
+    # of seeing ints that were never seen. So then, we return float
+    if allow_null_in_int and seen.null_ and not seen.int_:
+        seen.float_ = True
+
     if seen.complex_:
         return (complexes, None)
     elif seen.float_:
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 74ca5130ca322..c30d4b8ba7b41 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -144,11 +144,18 @@ class StringArray(PandasArray):
         .. warning::
 
            Currently, this expects an object-dtype ndarray
-           where the elements are Python strings or :attr:`pandas.NA`.
+           where the elements are Python strings
+           or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
 
+        .. versionchanged:: 1.3
+
+           StringArray now accepts nan-likes in the constructor in addition
+           to strings, whereas it only accepted strings and :attr:`pandas.NA`
+           before.
+
     copy : bool, default False
         Whether to copy the array of data.
 
@@ -208,21 +215,30 @@ def __init__(self, values, copy=False):
         values = extract_array(values)
 
         super().__init__(values, copy=copy)
+        if not isinstance(values, type(self)):
+            self._validate()
         # error: Incompatible types in assignment (expression has type "StringDtype",
         # variable has type "PandasDtype")
         NDArrayBacked.__init__(self, self._ndarray, StringDtype())
-        if not isinstance(values, type(self)):
-            self._validate()
 
     def _validate(self):
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
-            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
         if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
                 f"'{self._ndarray.dtype}' dtype instead."
             )
+        try:
+            lib.ensure_string_array(
+                self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False
+            ),
+            NDArrayBacked.__init__(
+                self,
+                self._ndarray,
+                StringDtype(),
+            )
+        except ValueError:
+            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 17d05ebeb0fc5..722aada176c44 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -297,13 +297,15 @@ def test_constructor_raises(cls):
         cls(np.array([]))
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan], dtype=object))
+        cls(np.array(["a", None]))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None], dtype=object))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", pd.NaT], dtype=object))
+@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
+def test_constructor_nan_like(na):
+    expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
+    tm.assert_extension_array_equal(
+        pd.arrays.StringArray(np.array(["a", na], dtype="object")), expected
+    )
 
 
 @pytest.mark.parametrize("copy", [True, False])

From 96ff1da535cd571cd45cb60d4cd1fdb47744f79e Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Mon, 10 May 2021 19:31:47 -0700
Subject: [PATCH 13/26] Revert duplicated maybe_convert_numeric changes; add coerce to lib.pyi stub; update tests

---
 pandas/_libs/lib.pyi                       | 1 +
 pandas/_libs/lib.pyx                       | 7 +------
 pandas/tests/arrays/string_/test_string.py | 2 +-
 pandas/tests/dtypes/test_inference.py      | 7 ++++---
 4 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 9dbc47f1d40f7..22990361bc52e 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -138,6 +138,7 @@ def ensure_string_array(
     arr,
     na_value: object = np.nan,
     convert_na_value: bool = True,
+    coerce: bool = True,
     copy: bool = True,
     skipna: bool = True,
 ) -> np.ndarray: ...  # np.ndarray[object]
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index fcb6d39bfc91f..b1523421e59fd 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2064,7 +2064,7 @@ def maybe_convert_numeric(
         upcasting for ints with nulls to float64.
     Returns
     -------
-    np.ndarray or tuple of converted values and its mask
+    np.ndarray
         Array of converted object values to numerical ones.
 
     Optional[np.ndarray]
@@ -2229,11 +2229,6 @@ def maybe_convert_numeric(
     if allow_null_in_int and seen.null_ and not seen.int_:
         seen.float_ = True
 
-    # This occurs since we disabled float nulls showing as null in anticipation
-    # of seeing ints that were never seen. So then, we return float
-    if allow_null_in_int and seen.null_ and not seen.int_:
-        seen.float_ = True
-
     if seen.complex_:
         return (complexes, None)
     elif seen.float_:
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 722aada176c44..b3bc3b09e047a 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -297,7 +297,7 @@ def test_constructor_raises(cls):
         cls(np.array([]))
 
     with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", None]))
+        cls(np.array(["a", np.nan]))
 
 
 @pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 076cc155f3626..73e87c75ee621 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1376,11 +1376,12 @@ def test_is_string_array(self):
         assert lib.is_string_array(
             np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
         )
-        # NaN is not valid for string array, just NA
-        assert not lib.is_string_array(
+        assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
-
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=False
+        )
         assert not lib.is_string_array(np.array([1, 2]))
 
     def test_to_object_array_tuples(self):

From 418e1d201ad0c20b9c5119fff34567fe72158ec2 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Tue, 11 May 2021 07:01:06 -0700
Subject: [PATCH 14/26] Remove failing test

---
 pandas/tests/arrays/string_/test_string.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index b3bc3b09e047a..7feb22f69632a 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -296,9 +296,6 @@ def test_constructor_raises(cls):
     with pytest.raises(ValueError, match=msg):
         cls(np.array([]))
 
-    with pytest.raises(ValueError, match=msg):
-        cls(np.array(["a", np.nan]))
-
 
 @pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
 def test_constructor_nan_like(na):

From 25a6c4d2ec9287b5b0a341c3cdd583cc3659a276 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Wed, 19 May 2021 16:23:41 -0700
Subject: [PATCH 15/26] Changes from code review

---
 pandas/_libs/lib.pyi               |  3 +--
 pandas/_libs/lib.pyx               | 24 ++++++++++++++----------
 pandas/core/arrays/string_.py      |  9 ++-------
 pandas/core/arrays/string_arrow.py |  2 +-
 pandas/core/dtypes/cast.py         |  4 ++--
 5 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 22990361bc52e..966fd0cd4c008 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -137,8 +137,7 @@ def maybe_convert_numeric(
 def ensure_string_array(
     arr,
     na_value: object = np.nan,
-    convert_na_value: bool = True,
-    coerce: bool = True,
+    coerce: str = "all,
     copy: bool = True,
     skipna: bool = True,
 ) -> np.ndarray: ...  # np.ndarray[object]
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index b1523421e59fd..fc3d73f332646 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -678,15 +678,14 @@ def astype_intsafe(ndarray[object] arr, cnp.dtype new_dtype) -> ndarray:
 cpdef ndarray[object] ensure_string_array(
         arr,
         object na_value=np.nan,
-        bint convert_na_value=True,
-        bint coerce=True,
+        coerce="all",
         bint copy=True,
         bint skipna=True,
 ):
     """
-    Checks that all elements in numpy are string or null and returns a new numpy array
-    with object dtype and only strings and na values if so. Otherwise,
-    raise a ValueError.
+    Checks that all elements in numpy array are string or null
+    and returns a new numpy array with object dtype
+    and only strings and na values if so. Otherwise, raise a ValueError.
 
     Parameters
     ----------
@@ -696,9 +695,14 @@ cpdef ndarray[object] ensure_string_array(
         The value to use for na. For example, np.nan or pd.NA.
     convert_na_value : bool, default True
         If False, existing na values will be used unchanged in the new array.
-    coerce : bool, default True
-        Whether to coerce non-null non-string elements to strings.
-        Will raise ValueError otherwise.
+    coerce : {{'all', 'null', 'non-null', None}}, default 'all'
+        Whether to coerce non-string elements to strings.
+            - 'all' will convert null values and non-null non-string values.
+            - 'null' will only convert nulls without converting other non-strings.
+            - 'non-null' will only convert non-null non-string elements to string.
+            - None will not convert anything.
+        If coerce is not all, a ValueError will be raised for values
+        that are not strings or na_value.
     copy : bool, default True
         Whether to ensure that a new array is returned.
     skipna : bool, default True
@@ -730,12 +734,12 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            if coerce:
+            if coerce =="all" or coerce == "non-null":
                 result[i] = str(val)
             else:
                 raise ValueError("Non-string element encountered in array.")
         else:
-            if convert_na_value:
+            if coerce=="all" or coerce == "null":
                 val = na_value
             if skipna:
                 result[i] = val
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c30d4b8ba7b41..289204c9aa4e5 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -230,13 +230,8 @@ def _validate(self):
             )
         try:
             lib.ensure_string_array(
-                self._ndarray, na_value=StringDtype.na_value, coerce=False, copy=False
+                self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False
             ),
-            NDArrayBacked.__init__(
-                self,
-                self._ndarray,
-                StringDtype(),
-            )
         except ValueError:
             raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
@@ -251,7 +246,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
             # avoid costly conversion to object dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce="non-null")
             result[na_values] = StringDtype.na_value
 
         else:
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 219a8c7ec0b82..42b7bf1a52513 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -247,7 +247,7 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             # numerical issues with Float32Dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
+            result = lib.ensure_string_array(result, copy=copy, coerce="non-null")
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
         # convert non-na-likes to str
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 46dc97214e2f6..1e8c09136e223 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1125,7 +1125,7 @@ def astype_nansafe(
         return arr.astype(dtype, copy=copy)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)
+        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value="non-null")
 
     elif is_datetime64_dtype(arr):
         if dtype == np.int64:
@@ -1925,7 +1925,7 @@ def construct_1d_ndarray_preserving_na(
     """
 
     if dtype is not None and dtype.kind == "U":
-        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
+        subarr = lib.ensure_string_array(values, coerce="non-null", copy=copy)
     else:
         if dtype is not None:
             _disallow_mismatched_datetimelike(values, dtype)

From 8257dbd739a4b6f12b737f89da317a24d3f8b07f Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Thu, 20 May 2021 14:32:58 -0700
Subject: [PATCH 16/26] Fix typo: pass coerce= instead of convert_na_value= in astype_nansafe

---
 pandas/core/dtypes/cast.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index c256f20527ad6..46af33b724d2a 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1125,7 +1125,7 @@ def astype_nansafe(
         return arr.astype(dtype, copy=copy)
 
     if issubclass(dtype.type, str):
-        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value="non-null")
+        return lib.ensure_string_array(arr, skipna=skipna, coerce="non-null")
 
     elif is_datetime64_dtype(arr):
         if dtype == np.int64:

From 922436a78903dfa55cd1d54d4381477cad934af5 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 21 May 2021 13:30:08 -0700
Subject: [PATCH 17/26] Fix unterminated string default for coerce in ensure_string_array stub (lib.pyi)

---
 pandas/_libs/lib.pyi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index 1e49ce67f7cec..726b306e71fd5 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -146,7 +146,7 @@ def maybe_convert_numeric(
 def ensure_string_array(
     arr,
     na_value: object = np.nan,
-    coerce: str = "all,
+    coerce: str = "all",
     copy: bool = True,
     skipna: bool = True,
 ) -> np.ndarray: ...  # np.ndarray[object]

From 2f28086a0f23bf2b30d79ca41aaab0abb3ca370b Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sat, 29 May 2021 11:03:33 -0700
Subject: [PATCH 18/26] Add StringValidator.is_valid_null override in lib.pyx

---
 pandas/_libs/lib.pyx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 8df50a32ae482..99872d2f9e91f 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1849,7 +1849,11 @@ cdef class StringValidator(Validator):
 
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.str_)
-
+    
+    cdef bint is_valid_null(self, object value) except -1:
+        # Override to exclude float('Nan') and complex NaN
+        return value is None or value is C_NA or np.isnan(value)
+        
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
     cdef:

From 3ee219815e619fb57edeee0c295ba36e84232e0a Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sat, 29 May 2021 11:05:19 -0700
Subject: [PATCH 19/26] Remove stale convert_na_value docs and trailing whitespace in lib.pyx

---
 pandas/_libs/lib.pyx | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 99872d2f9e91f..ce70d15c202f5 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -693,8 +693,6 @@ cpdef ndarray[object] ensure_string_array(
         The values to be converted to str, if needed.
     na_value : Any, default np.nan
         The value to use for na. For example, np.nan or pd.NA.
-    convert_na_value : bool, default True
-        If False, existing na values will be used unchanged in the new array.
     coerce : {{'all', 'null', 'non-null', None}}, default 'all'
         Whether to coerce non-string elements to strings.
             - 'all' will convert null values and non-null non-string values.
@@ -1849,11 +1847,11 @@ cdef class StringValidator(Validator):
 
     cdef inline bint is_array_typed(self) except -1:
         return issubclass(self.dtype.type, np.str_)
-    
+
     cdef bint is_valid_null(self, object value) except -1:
         # Override to exclude float('Nan') and complex NaN
         return value is None or value is C_NA or np.isnan(value)
-        
+
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):
     cdef:

From 3ee55f25a94a12da069a387a150164538394d460 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Sat, 29 May 2021 21:21:57 -0700
Subject: [PATCH 20/26] Test that StringArray rejects NaT and is_string_array handles None

---
 pandas/tests/arrays/string_/test_string.py | 5 ++++-
 pandas/tests/dtypes/test_inference.py      | 6 ++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index a246be938aef0..af57aff03b073 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -289,8 +289,11 @@ def test_constructor_raises(cls):
     with pytest.raises(ValueError, match=msg):
         cls(np.array([]))
 
+    with pytest.raises(ValueError, match=msg):
+        cls(np.array(["a", pd.NaT], dtype=object))
+
 
-@pytest.mark.parametrize("na", [np.nan, pd.NaT, None, pd.NA])
+@pytest.mark.parametrize("na", [np.nan, None, pd.NA])
 def test_constructor_nan_like(na):
     expected = pd.arrays.StringArray(np.array(["a", pd.NA]))
     tm.assert_extension_array_equal(
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index 987b3accbca2e..87a1be80e3639 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -1391,6 +1391,12 @@ def test_is_string_array(self):
         assert lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=True
         )
+        assert lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=False
+        )
         assert not lib.is_string_array(
             np.array(["foo", "bar", np.nan], dtype=object), skipna=False
         )

From fe4981a6337cd59ae68b1ff44ca0f9b600d2ee49 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 30 May 2021 06:18:55 -0700
Subject: [PATCH 21/26] Restrict ensure_string_array nulls to None, NA and NaN

---
 pandas/_libs/lib.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index a1e66a575097e..08d7a68cd0dc0 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -731,7 +731,10 @@ cpdef ndarray[object] ensure_string_array(
         if isinstance(val, str):
             continue
 
-        if not checknull(val):
+        if not (val is None or val is C_NA or np.isnan(val)):
+            # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
+            # If they are present, they are treated like a regular Python object
+            # and will either cause an exception to be raised or be coerced.
             if coerce =="all" or coerce == "non-null":
                 result[i] = str(val)
             else:

From a66948aa7aa21d057c322895b59ea9f8c79480cd Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 30 May 2021 09:26:52 -0700
Subject: [PATCH 22/26] Replace np.isnan with val != val in ensure_string_array null check

---
 pandas/_libs/lib.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 08d7a68cd0dc0..a987f47533259 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -731,7 +731,7 @@ cpdef ndarray[object] ensure_string_array(
         if isinstance(val, str):
             continue
 
-        if not (val is None or val is C_NA or np.isnan(val)):
+        if not (val is None or val is C_NA or val != val):
             # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
             # If they are present, they are treated like a regular Python object
             # and will either cause an exception to be raised or be coerced.

From e8527191d33ed9c4416d265b175822c19bd5b4ae Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon, 31 May 2021 09:29:58 -0700
Subject: [PATCH 23/26] Use identity comparison with np.nan in string null checks

---
 pandas/_libs/lib.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index a987f47533259..f39b1fbc49cdb 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -731,7 +731,7 @@ cpdef ndarray[object] ensure_string_array(
         if isinstance(val, str):
             continue
 
-        if not (val is None or val is C_NA or val != val):
+        if not (val is None or val is C_NA or val is np.nan):
             # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
             # If they are present, they are treated like a regular Python object
             # and will either cause an exception to be raised or be coerced.
@@ -1853,7 +1853,7 @@ cdef class StringValidator(Validator):
 
     cdef bint is_valid_null(self, object value) except -1:
         # Override to exclude float('Nan') and complex NaN
-        return value is None or value is C_NA or np.isnan(value)
+        return value is None or value is C_NA or value is np.nan
 
 
 cpdef bint is_string_array(ndarray values, bint skipna=False):

From 91b73bb93aad90f26040c729b57d99ec26eb3941 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Fri, 4 Jun 2021 08:28:52 -0700
Subject: [PATCH 24/26] disallow invalid nans in stringarray constructor

---
 pandas/_libs/lib.pyx          | 23 ++++++++++++++++-------
 pandas/core/arrays/string_.py |  7 +++++--
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index f39b1fbc49cdb..e3fa8eeaa9b53 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -94,6 +94,7 @@ from pandas._libs.missing cimport (
     is_null_timedelta64,
     isnaobj,
 )
+from pandas._libs.missing import checknull
 from pandas._libs.tslibs.conversion cimport convert_to_tsobject
 from pandas._libs.tslibs.nattype cimport (
     NPY_NAT,
@@ -696,10 +697,12 @@ cpdef ndarray[object] ensure_string_array(
     coerce : {{'all', 'null', 'non-null', None}}, default 'all'
         Whether to coerce non-string elements to strings.
             - 'all' will convert null values and non-null non-string values.
-            - 'null' will only convert nulls without converting other non-strings.
+            - 'strict-null' will only convert pd.NA, np.nan, or None to na_value
+              without converting other non-strings.
+            - 'null' will convert nulls to na_value w/out converting other non-strings.
             - 'non-null' will only convert non-null non-string elements to string.
             - None will not convert anything.
-        If coerce is not all, a ValueError will be raised for values
+        If coerce is not 'all', a ValueError will be raised for values
         that are not strings or na_value.
     copy : bool, default True
         Whether to ensure that a new array is returned.
@@ -714,6 +717,7 @@ cpdef ndarray[object] ensure_string_array(
     """
     cdef:
         Py_ssize_t i = 0, n = len(arr)
+        set strict_na_values = {C_NA, np.nan, None}
 
     if hasattr(arr, "to_numpy"):
         arr = arr.to_numpy()
@@ -725,22 +729,27 @@ cpdef ndarray[object] ensure_string_array(
     if copy and result is arr:
         result = result.copy()
 
+    if coerce == 'strict-null':
+        # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
+        # string nulls. In this mode they are treated like any other non-string
+        # object, so the loop below raises ValueError for them.
+        check_null = strict_na_values.__contains__
+    else:
+        check_null = checknull
+
     for i in range(n):
         val = arr[i]
 
         if isinstance(val, str):
             continue
 
-        if not (val is None or val is C_NA or val is np.nan):
-            # We don't use checknull, since NaT, Decimal("NaN"), etc. aren't valid
-            # If they are present, they are treated like a regular Python object
-            # and will either cause an exception to be raised or be coerced.
+        if not check_null(val):
             if coerce =="all" or coerce == "non-null":
                 result[i] = str(val)
             else:
                 raise ValueError("Non-string element encountered in array.")
         else:
-            if coerce=="all" or coerce == "null":
+            if coerce=="all" or coerce == "null" or coerce == 'strict-null':
                 val = na_value
             if skipna:
                 result[i] = val
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 79ddd12476323..d0ea1aa5c5293 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -230,8 +230,11 @@ def _validate(self):
             )
         try:
             lib.ensure_string_array(
-                self._ndarray, na_value=StringDtype.na_value, coerce="null", copy=False
-            ),
+                self._ndarray,
+                na_value=StringDtype.na_value,
+                coerce="strict-null",
+                copy=False,
+            )
         except ValueError:
             raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 

From 41f49d21d8da2bbdcc37d33714d009ea2b862049 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Fri, 4 Jun 2021 12:40:56 -0700
Subject: [PATCH 25/26] Add coerce parameter to string array _from_sequence and fix docs

---
 doc/source/whatsnew/v1.3.0.rst     |  2 +-
 pandas/core/arrays/string_.py      | 18 ++++++++++++++----
 pandas/core/arrays/string_arrow.py | 16 +++++++++++++---
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 93e27a7318f2d..4c5175b8e1bcc 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -644,7 +644,7 @@ Other API changes
 - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects.
 - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`)
 - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`)
-- :class:`StringArray` now accepts nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")) in its constructor in addition to strings.
+- :class:`StringArray` now accepts nan-likes (``None``, ``np.nan``, ``pd.NA``) in its constructor in addition to strings.
 - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`)
 
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index d0ea1aa5c5293..4d97035714ba3 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -145,7 +145,7 @@ class StringArray(PandasArray):
 
            Currently, this expects an object-dtype ndarray
            where the elements are Python strings
-           or nan-likes(``None``, ``nan``, ``NaT``, ``NA``, Decimal("NaN")).
+           or nan-likes (``None``, ``np.nan``, ``pd.NA``).
            This may change without warning in the future. Use
            :meth:`pandas.array` with ``dtype="string"`` for a stable way of
            creating a `StringArray` from any sequence.
@@ -239,7 +239,9 @@ def _validate(self):
             raise ValueError("StringArray requires a sequence of strings or pandas.NA")
 
     @classmethod
-    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
+    def _from_sequence(
+        cls, scalars, *, dtype: Dtype | None = None, copy=False, coerce=True
+    ):
         if dtype:
             assert dtype == "string"
 
@@ -247,15 +249,23 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False):
 
         if isinstance(scalars, BaseMaskedArray):
             # avoid costly conversion to object dtype
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, coerce="non-null")
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             result[na_values] = StringDtype.na_value
 
         else:
             # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+            if coerce:
+                coerce = "all"
+            else:
+                coerce = "strict-null"
             result = lib.ensure_string_array(
-                scalars, na_value=StringDtype.na_value, copy=copy
+                scalars, na_value=StringDtype.na_value, copy=copy, coerce=coerce
             )
 
         # Manually creating new array avoids the validation step in the __init__, so is
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 7aeadbb4c4616..f0af7a8a43594 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -237,7 +237,9 @@ def __init__(self, values):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
+    def _from_sequence(
+        cls, scalars, dtype: Dtype | None = None, copy: bool = False, coerce=True
+    ):
         from pandas.core.arrays.masked import BaseMaskedArray
 
         _chk_pyarrow_available()
@@ -247,11 +249,19 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             # numerical issues with Float32Dtype
             na_values = scalars._mask
             result = scalars._data
-            result = lib.ensure_string_array(result, copy=copy, coerce="non-null")
+            if coerce:
+                coerce = "non-null"
+            else:
+                coerce = None
+            result = lib.ensure_string_array(result, copy=copy, coerce=coerce)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
 
         # convert non-na-likes to str
-        result = lib.ensure_string_array(scalars, copy=copy)
+        if coerce:
+            coerce = "all"
+        else:
+            coerce = "strict-null"
+        result = lib.ensure_string_array(scalars, copy=copy, coerce=coerce)
         return cls(pa.array(result, type=pa.string(), from_pandas=True))
 
     @classmethod

From 033580ff373e626885295528e3b3db32d39d6016 Mon Sep 17 00:00:00 2001
From: Thomas Li <thomasli1234567890@gmail.com>
Date: Fri, 4 Jun 2021 15:39:34 -0700
Subject: [PATCH 26/26] Use StringDtype _from_sequence in parsers; remove duplicate whatsnew entries

---
 doc/source/whatsnew/v1.3.0.rst   | 2 --
 pandas/_libs/parsers.pyx         | 4 ++--
 pandas/io/parsers/base_parser.py | 4 ++--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 44db989b33903..e690d5fc85785 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -225,8 +225,6 @@ Other enhancements
 - Add support for parsing ``ISO 8601``-like timestamps with negative signs to :class:`Timedelta` (:issue:`37172`)
 - Add support for unary operators in :class:`FloatingArray` (:issue:`38749`)
 - :class:`RangeIndex` can now be constructed by passing a ``range`` object directly e.g. ``pd.RangeIndex(range(3))`` (:issue:`12067`)
-- :meth:`round` being enabled for the nullable integer and floating dtypes (:issue:`38844`)
-- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
 - :meth:`pandas.read_csv` now accepts an argument ``use_nullable_dtypes`` that allows reading data directly into the nullable integer and boolean data types (:issue:`36712`)
 - :meth:`Series.round` and :meth:`DataFrame.round` now work with nullable integer and floating dtypes (:issue:`38844`)
 - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index bde6674dbda67..4c64a0e3d8479 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -114,8 +114,8 @@ from pandas.core.arrays import (
     BooleanArray,
     FloatingArray,
     IntegerArray,
-    StringArray,
 )
+from pandas.core.arrays.string_ import StringDtype
 
 cdef:
     float64_t INF = <float64_t>np.inf
@@ -1382,7 +1382,7 @@ def _maybe_upcast(arr, use_nullable_dtypes=False):
     elif use_nullable_dtypes and arr.dtype == np.object_:
         # Maybe convert StringArray & catch error for non-strings
         try:
-            arr = StringArray(arr)
+            arr = StringDtype.construct_array_type()._from_sequence(arr)
         except ValueError as e:
             pass
 
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index fc8f83bd29fe3..46e0875ab61ec 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -63,8 +63,8 @@
     Categorical,
     FloatingArray,
     IntegerArray,
-    StringArray,
 )
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.indexes.api import (
     Index,
     MultiIndex,
@@ -757,7 +757,7 @@ def _infer_types(self, values, na_values, try_num_bool=True):
             and 0 < na_count < len(result)
         ):
             try:
-                result = StringArray(result)
+                result = StringDtype.construct_array_type()._from_sequence(result)
             except ValueError:
                 pass