Merge branch 'main' into main

will-larkin · web-flow · commit 2828ccbdf5af · 2025-02-26T08:00:41.000Z
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
@@ -198,7 +198,7 @@ In some cases you may be tempted to use ``cast`` from the typing module when you
            obj = cast(str, obj)  # Mypy complains without this!
            return obj.upper()
 
-The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 <https://github.com/python/mypy/issues/5206>`_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable
+The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types, mypy cannot make that same inference just yet (see `mypy #5206 <https://github.com/python/mypy/issues/5206>`_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable, a refactor of the code to appease static analysis is preferable:
 
 .. code-block:: python
 
diff --git a/doc/source/getting_started/intro_tutorials/03_subset_data.rst b/doc/source/getting_started/intro_tutorials/03_subset_data.rst
@@ -335,7 +335,7 @@ the name ``anonymous`` to the first 3 elements of the fourth column:
 .. ipython:: python
 
     titanic.iloc[0:3, 3] = "anonymous"
-    titanic.head()
+    titanic.iloc[:5, 3]
 
 .. raw:: html
 
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
@@ -61,7 +61,7 @@ is an :class:`ArrowDtype`.
 support as NumPy including first-class nullability support for all data types, immutability and more.
 
 The table below shows the equivalent pyarrow-backed (``pa``), pandas extension, and numpy (``np``) types that are recognized by pandas.
-Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. ``pd.ArrowDtype(pa.bool_())``
+Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. ``pd.ArrowDtype(pa.bool_())``.
 
 =============================================== ========================== ===================
 PyArrow type                                    pandas extension type      NumPy type
@@ -114,7 +114,7 @@ values.
 
    ArrowDtype
 
-For more information, please see the :ref:`PyArrow user guide <pyarrow>`
+For more information, please see the :ref:`PyArrow user guide <pyarrow>`.
 
 .. _api.arrays.datetime:
 
@@ -495,7 +495,7 @@ a :class:`CategoricalDtype`.
    CategoricalDtype.categories
    CategoricalDtype.ordered
 
-Categorical data can be stored in a :class:`pandas.Categorical`
+Categorical data can be stored in a :class:`pandas.Categorical`:
 
 .. autosummary::
    :toctree: api/
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -13,7 +13,7 @@ Text data types
 
 There are two ways to store text data in pandas:
 
-1. ``object`` -dtype NumPy array.
+1. ``object`` dtype NumPy array.
 2. :class:`StringDtype` extension type.
 
 We recommend using :class:`StringDtype` to store text data.
@@ -40,20 +40,20 @@ to significantly increase the performance and lower the memory overhead of
    and parts of the API may change without warning.
 
 For backwards-compatibility, ``object`` dtype remains the default type we
-infer a list of strings to
+infer a list of strings to:
 
 .. ipython:: python
 
    pd.Series(["a", "b", "c"])
 
-To explicitly request ``string`` dtype, specify the ``dtype``
+To explicitly request ``string`` dtype, specify the ``dtype``:
 
 .. ipython:: python
 
    pd.Series(["a", "b", "c"], dtype="string")
    pd.Series(["a", "b", "c"], dtype=pd.StringDtype())
 
-Or ``astype`` after the ``Series`` or ``DataFrame`` is created
+Or ``astype`` after the ``Series`` or ``DataFrame`` is created:
 
 .. ipython:: python
 
@@ -88,7 +88,7 @@ Behavior differences
 ^^^^^^^^^^^^^^^^^^^^
 
 These are places where the behavior of ``StringDtype`` objects differ from
-``object`` dtype
+``object`` dtype:
 
 l. For ``StringDtype``, :ref:`string accessor methods<api.series.str>`
    that return **numeric** output will always return a nullable integer dtype,
@@ -102,7 +102,7 @@ l. For ``StringDtype``, :ref:`string accessor methods<api.series.str>`
       s.str.count("a")
       s.dropna().str.count("a")
 
-   Both outputs are ``Int64`` dtype. Compare that with object-dtype
+   Both outputs are ``Int64`` dtype. Compare that with object-dtype:
 
    .. ipython:: python
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -791,6 +791,7 @@ ExtensionArray
 ^^^^^^^^^^^^^^
 - Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`)
 - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
+- Bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when input was dictionary-encoded even when ``dropna`` was set to ``False`` (:issue:`60567`)
 - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
 - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
 - Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1208,7 +1208,12 @@ def factorize(
             data = data.cast(pa.int64())
 
         if pa.types.is_dictionary(data.type):
-            encoded = data
+            if null_encoding == "encode":
+                # dictionary encode does nothing if an already encoded array is given
+                data = data.cast(data.type.value_type)
+                encoded = data.dictionary_encode(null_encoding=null_encoding)
+            else:
+                encoded = data
         else:
             encoded = data.dictionary_encode(null_encoding=null_encoding)
         if encoded.length() == 0:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2788,7 +2788,7 @@ def to_sql(
         con,
         *,
         schema: str | None = None,
-        if_exists: Literal["fail", "replace", "append"] = "fail",
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
         index: bool = True,
         index_label: IndexLabel | None = None,
         chunksize: int | None = None,
@@ -2825,12 +2825,13 @@ def to_sql(
         schema : str, optional
             Specify the schema (if database flavor supports this). If None, use
             default schema.
-        if_exists : {'fail', 'replace', 'append'}, default 'fail'
+        if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
             How to behave if the table already exists.
 
             * fail: Raise a ValueError.
             * replace: Drop the table before inserting new values.
             * append: Insert new values to the existing table.
+            * delete_rows: If a table exists, delete all records and insert data.
 
         index : bool, default True
             Write DataFrame index as a column. Uses `index_label` as the column
@@ -2947,6 +2948,16 @@ def to_sql(
         ...     conn.execute(text("SELECT * FROM users")).fetchall()
         [(0, 'User 6'), (1, 'User 7')]
 
+        Delete all rows before inserting new records with ``df3``.
+
+        >>> df3 = pd.DataFrame({"name": ['User 8', 'User 9']})
+        >>> df3.to_sql(name='users', con=engine, if_exists='delete_rows',
+        ...            index_label='id')
+        2
+        >>> with engine.connect() as conn:
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
+        [(0, 'User 8'), (1, 'User 9')]
+
         Use ``method`` to define a callable insertion method to do nothing
         if there's a primary key conflict on a table in a PostgreSQL database.
 
@@ -6267,6 +6278,11 @@ def astype(
         """
         Cast a pandas object to a specified dtype ``dtype``.
 
+        This method allows the conversion of the data types of pandas objects,
+        including DataFrames and Series, to the specified dtype. It supports casting
+        entire objects to a single data type or applying different data types to
+        individual columns using a mapping.
+
         Parameters
         ----------
         dtype : str, data type, Series or Mapping of column name -> data type
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -3329,6 +3329,18 @@ def test_factorize_chunked_dictionary():
     tm.assert_index_equal(res_uniques, exp_uniques)
 
 
+def test_factorize_dictionary_with_na():
+    # GH#60567
+    arr = pd.array(
+        ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
+    )
+    indices, uniques = arr.factorize(use_na_sentinel=False)
+    expected_indices = np.array([0, 1], dtype=np.intp)
+    expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_numpy_array_equal(indices, expected_indices)
+    tm.assert_extension_array_equal(uniques, expected_uniques)
+
+
 def test_dictionary_astype_categorical():
     # GH#56672
     arrs = [
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -4282,11 +4282,11 @@ def test_xsqlite_execute_fail(sqlite_buildin):
     cur.execute(create_sql)
 
     with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql:
-        pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)')
-        pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)')
+        pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)")
+        pandas_sql.execute("INSERT INTO test VALUES('foo', 'baz', 2.567)")
 
         with pytest.raises(sql.DatabaseError, match="Execution failed on sql"):
-            pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)')
+            pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 7)")
 
 
 def test_xsqlite_execute_closed_connection():
@@ -4304,7 +4304,7 @@ def test_xsqlite_execute_closed_connection():
         cur.execute(create_sql)
 
         with sql.pandasSQL_builder(conn) as pandas_sql:
-            pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)')
+            pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)")
 
     msg = "Cannot operate on a closed database."
     with pytest.raises(sqlite3.ProgrammingError, match=msg):
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py