From 4f2b7609e722729e4f782a5a270886f6027bbc0d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 12:07:22 -0700 Subject: [PATCH 01/61] close to complete implementation --- pandas/io/sql.py | 272 ++++++++++++++++++++++++++++++++++++ pandas/tests/io/test_sql.py | 27 +++- 2 files changed, 298 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 51cc3eacae284..bf7c4e8b08c45 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -837,6 +837,10 @@ def pandasSQL_builder( if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)): return SQLDatabase(con, schema, need_transaction) + adbc = import_optional_dependency("adbc_driver_manager") + if adbc and isinstance(con, adbc.dbapi.Connection): + return ADBCDatabase(con) + warnings.warn( "pandas only supports SQLAlchemy connectable (engine/connection) or " "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 " @@ -2002,6 +2006,274 @@ def _create_sql_schema( # ---- SQL without SQLAlchemy --- + + +class ADBCDatabase(PandasSQL): + """ + This class enables conversion between DataFrame and SQL databases + using ADBC to handle DataBase abstraction. + + Parameters + ---------- + con : adbc_driver_manager.dbapi.Connection + """ + + def __init__(self, con) -> None: + self.con = con + + def execute(self, sql: str | Select | TextClause, params=None): + with self.con.cursor() as cur: + return cur(sql) + + def read_table( + self, + table_name: str, + index_col: str | list[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + columns=None, + schema: str | None = None, + chunksize: int | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", + ) -> DataFrame | Iterator[DataFrame]: + """ + Read SQL database table into a DataFrame. + + Parameters + ---------- + table_name : str + Name of SQL table in database. + index_col : string, optional, default: None + Column to set as index. + coerce_float : bool, default True + Attempts to convert values of non-string, non-numeric objects + (like decimal.Decimal) to floating point. This can result in + loss of precision. + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg}``, where the arg corresponds + to the keyword arguments of :func:`pandas.to_datetime`. + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table. + schema : string, default None + Name of SQL schema in database to query (if database flavor + supports this). If specified, this overwrites the default + schema of the SQL database object. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number + of rows to include in each chunk. + dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy dtypes + Which dtype_backend to use, e.g. whether a DataFrame should have NumPy + arrays, nullable dtypes are used for all dtypes that have a nullable + implementation when "numpy_nullable" is set, pyarrow is used for all + dtypes if "pyarrow" is set. + + The dtype_backends are still experimential. + + .. 
versionadded:: 2.0 + + Returns + ------- + DataFrame + + See Also + -------- + pandas.read_sql_table + SQLDatabase.read_query + + """ + if schema: + stmt = f"SELECT * FROM {schema}.{table_name}" + else: + stmt = f"SELECT * FROM {table_name}" + + with self.con.cursor() as cur: + return cur(stmt).fetch_arrow_table().to_pandas() + + def read_query( + self, + sql: str, + index_col: str | list[str] | None = None, + coerce_float: bool = True, + parse_dates=None, + params=None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", + ) -> DataFrame | Iterator[DataFrame]: + """ + Read SQL query into a DataFrame. + + Parameters + ---------- + sql : str + SQL query to be executed. + index_col : string, optional, default: None + Column name to use as index for the returned DataFrame object. + coerce_float : bool, default True + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + params : list, tuple or dict, optional, default: None + List of parameters to pass to execute method. The syntax used + to pass parameters is database driver dependent. Check your + database driver documentation for which of the five syntax styles, + described in PEP 249's paramstyle, is supported. + Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'} + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict + corresponds to the keyword arguments of + :func:`pandas.to_datetime` Especially useful with databases + without native Datetime support, such as SQLite. + chunksize : int, default None + If specified, return an iterator where `chunksize` is the number + of rows to include in each chunk. + dtype : Type name or dict of columns + Data type for data or columns. E.g. np.float64 or + {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’} + + .. versionadded:: 1.3.0 + + Returns + ------- + DataFrame + + See Also + -------- + read_sql_table : Read SQL database table into a DataFrame. + read_sql + + """ + with self.con.cursor() as cur: + return cur(sql).fetch_arrow_table().to_pandas() + + read_sql = read_query + + def to_sql( + self, + frame, + name: str, + if_exists: Literal["fail", "replace", "append"] = "fail", + index: bool = True, + index_label=None, + schema: str | None = None, + chunksize: int | None = None, + dtype: DtypeArg | None = None, + method: Literal["multi"] | Callable | None = None, + engine: str = "auto", + **engine_kwargs, + ) -> int | None: + """ + Write records stored in a DataFrame to a SQL database. + + Parameters + ---------- + frame : DataFrame + name : string + Name of SQL table. + if_exists : {'fail', 'replace', 'append'}, default 'fail' + - fail: If table exists, do nothing. + - replace: If table exists, drop it, recreate it, and insert data. + - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Column label for index column(s). If None is given (default) and + `index` is True, then the index names are used. + A sequence should be given if the DataFrame uses MultiIndex. 
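
Taken together, read_table, read_query, and to_sql let a DataFrame round-trip through any ADBC DBAPI connection. A minimal sketch of the end state this series builds toward, assuming adbc-driver-sqlite >= 0.6.0 (the floor the later patches pin); the file and table names here are illustrative, not part of the patch:

    import pandas as pd
    from adbc_driver_sqlite import dbapi

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    with dbapi.connect("file:demo.db") as conn:
        # pandasSQL_builder recognizes the ADBC connection and routes
        # both calls through ADBCDatabase
        df.to_sql(name="demo", con=conn, if_exists="replace", index=False)
        result = pd.read_sql("SELECT * FROM demo", conn)
    print(result)
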
+ schema : string, default None + Name of SQL schema in database to write to (if database flavor + supports this). If specified, this overwrites the default + schema of the SQLDatabase object. + chunksize : int, default None + If not None, then rows will be written in batches of this size at a + time. If None, all rows will be written at once. + dtype : single type or dict of column name to SQL type, default None + Optional specifying the datatype for columns. The SQL type should + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. + method : {None', 'multi', callable}, default None + Controls the SQL insertion clause used: + + * None : Uses standard SQL ``INSERT`` clause (one per row). + * 'multi': Pass multiple values in a single ``INSERT`` clause. + * callable with signature ``(pd_table, conn, keys, data_iter)``. + + Details and a sample callable implementation can be found in the + section :ref:`insert method `. + engine : {'auto', 'sqlalchemy'}, default 'auto' + SQL engine library to use. If 'auto', then the option + ``io.sql.engine`` is used. The default ``io.sql.engine`` + behavior is 'sqlalchemy' + + .. versionadded:: 1.3.0 + + **engine_kwargs + Any additional kwargs are passed to the engine. + """ + if schema: + table_name = f"{schema}.{name}" + else: + table_name = name + + # TODO: pandas if_exists="append" will still create the + # table if it does not exist; ADBC has append/create + # as applicable modes, so the semantics get blurred across + # the libraries + mode = "create" + if self.has_table(name, schema): + if if_exists == "fail": + raise ValueError(f"Table '{table_name}' already exists.") + elif if_exists == "replace": + with self.con.cursor() as cur: + cur.execute(f"DROP TABLE {table_name}") + + import pyarrow as pa + + tbl = pa.Table.from_pandas(frame) + with self.con.cursor() as cur: + total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) + + self.con.commit() + return total_inserted + + def has_table(self, name: str, schema: str | None = None) -> bool: + meta = self.con.adbc_get_objects( + db_schema_filter=schema, table_name_filter=name + ).read_all() + + for catalog_schema in meta["catalog_db_schemas"].to_pylist(): + if not catalog_schema: + continue + for schema_record in catalog_schema: + if not schema_record: + continue + + for table_record in schema_record["db_schema_tables"]: + if table_record["table_name"] == name: + return True + + return False + + def _create_sql_schema( + self, + frame: DataFrame, + table_name: str, + keys: list[str] | None = None, + dtype: DtypeArg | None = None, + schema: str | None = None, + ): + raise NotImplementedError("not implemented for adbc") + + # sqlite-specific sql strings and handler class # dictionary used for readability purposes _SQL_TYPES = { diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 75fcef09535d4..67d063e9084b3 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -34,6 +34,7 @@ import pytest from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td import pandas as pd @@ -274,9 +275,14 @@ def check_iris_frame(frame: DataFrame): def count_rows(conn, table_name: str): stmt = f"SELECT count(*) AS count_1 FROM {table_name}" + adbc = import_optional_dependency("adbc_driver_manager") if isinstance(conn, sqlite3.Connection): cur = conn.cursor() return cur.execute(stmt).fetchone()[0] + elif adbc and isinstance(conn, adbc.dbapi.Connection): + with 
conn.cursor() as cur: + cur.execute(stmt) + return cur.fetchone()[0] else: from sqlalchemy import create_engine from sqlalchemy.engine import Engine @@ -453,6 +459,16 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): yield conn +@pytest.fixture +def postgresql_adbc_conn(): + pytest.importorskip("adbc_driver_postgresql") + from adbc_driver_postgresql import dbapi + + uri = "postgresql://postgres:postgres@localhost:5432/pandas" + with dbapi.connect(uri) as conn: + yield conn + + @pytest.fixture def sqlite_str(): pytest.importorskip("sqlalchemy") @@ -533,11 +549,13 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable +adbc_connectable = ["postgresql_adbc_conn"] + sqlalchemy_connectable_iris = ( mysql_connectable + postgresql_connectable + sqlite_iris_connectable ) -all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] +all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] + adbc_connectable all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] @@ -553,6 +571,9 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes(conn, request): + if conn == "postgresql_adbc_conn": + pytest.skip("int8/datetime not implemented yet in adbc driver") + # GH 52046 pytest.importorskip("pyarrow") df = DataFrame( @@ -566,6 +587,7 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): } ) conn = request.getfixturevalue(conn) + with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"): df.to_sql("test_arrow", conn, if_exists="replace", index=False) @@ -573,6 +595,9 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): + if conn == "postgresql_adbc_conn": + pytest.skip("int8/datetime not implemented yet in adbc driver") + # GH 52046 pytest.importorskip("pyarrow") df = DataFrame( From a4ebbb5dce801bba6f44d0754c469695ffd18486 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 12:31:54 -0700 Subject: [PATCH 02/61] working implementation for postgres --- environment.yml | 1 + pandas/io/sql.py | 2 ++ pandas/tests/io/test_sql.py | 2 ++ requirements-dev.txt | 1 + 4 files changed, 6 insertions(+) diff --git a/environment.yml b/environment.yml index 6178fe896760f..c89aaa63f4fc6 100644 --- a/environment.yml +++ b/environment.yml @@ -118,6 +118,7 @@ dependencies: - pygments # Code highlighting - pip: + - adbc_driver_postgresql - sphinx-toggleprompt - typing_extensions; python_version<"3.11" - tzdata>=2022.1 diff --git a/pandas/io/sql.py b/pandas/io/sql.py index bf7c4e8b08c45..24748dc427e90 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2235,6 +2235,8 @@ def to_sql( elif if_exists == "replace": with self.con.cursor() as cur: cur.execute(f"DROP TABLE {table_name}") + elif if_exists == "append": + mode = "append" import pyarrow as pa diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 67d063e9084b3..2d7537bfd6947 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -467,6 +467,8 @@ def postgresql_adbc_conn(): uri = "postgresql://postgres:postgres@localhost:5432/pandas" with dbapi.connect(uri) as conn: yield conn + with conn.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS test_frame") @pytest.fixture diff --git a/requirements-dev.txt 
b/requirements-dev.txt index 38a2ce7f66aa3..3cb73a2dc8096 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -87,6 +87,7 @@ feedparser pyyaml requests pygments +adbc_driver_postgresql sphinx-toggleprompt typing_extensions; python_version<"3.11" tzdata>=2022.1 From b2cd14985e68f0ed2704d99d88d2da2841b30a61 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 13:02:28 -0700 Subject: [PATCH 03/61] sqlite implementation --- pandas/tests/io/test_sql.py | 60 +++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 2d7537bfd6947..af3d25e8984f6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -492,6 +492,19 @@ def sqlite_conn(sqlite_engine): yield conn +@pytest.fixture +def sqlite_adbc_conn(): + pytest.importorskip("adbc_driver_sqlite") + from adbc_driver_sqlite import dbapi + + with tm.ensure_clean() as name: + uri = f"file:{name}" + with dbapi.connect(uri) as conn: + yield conn + with conn.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS test_frame") + + @pytest.fixture def sqlite_iris_str(sqlite_str, iris_path): sqlalchemy = pytest.importorskip("sqlalchemy") @@ -551,7 +564,7 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable -adbc_connectable = ["postgresql_adbc_conn"] +adbc_connectable = ["postgresql_adbc_conn", "sqlite_adbc_conn"] sqlalchemy_connectable_iris = ( mysql_connectable + postgresql_connectable + sqlite_iris_connectable @@ -566,6 +579,13 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql(conn, test_frame1, request): # GH 51086 if conn is sqlite_engine + if conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="syntax error with CREATE TABLE", + strict=True, + ) + ) conn = request.getfixturevalue(conn) test_frame1.to_sql("test", conn, if_exists="append", index=False) @@ -574,7 +594,19 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes(conn, request): if conn == "postgresql_adbc_conn": - pytest.skip("int8/datetime not implemented yet in adbc driver") + request.node.add_marker( + pytest.mark.xfail( + reason="int8/datetime not implemented yet in pg adbc driver", + strict=True, + ) + ) + elif conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="timestamp not implemented yet in sqlite adbc driver", + strict=True, + ) + ) # GH 52046 pytest.importorskip("pyarrow") @@ -588,6 +620,7 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): "string": pd.array(["a"], dtype="string[pyarrow]"), } ) + conn = request.getfixturevalue(conn) with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"): @@ -598,7 +631,16 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): if conn == "postgresql_adbc_conn": - pytest.skip("int8/datetime not implemented yet in adbc driver") + request.node.add_marker( + pytest.skip("int8/datetime not implemented yet in adbc driver") + ) + elif conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="timestamp not implemented yet in sqlite adbc driver", + strict=True, + ) + ) # GH 52046 pytest.importorskip("pyarrow") @@ 
-617,6 +659,10 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): + if conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) + ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", method=method) @@ -628,6 +674,10 @@ def test_to_sql(conn, method, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): + if conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) + ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") @@ -639,6 +689,10 @@ def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_exist_fail(conn, test_frame1, request): + if conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) + ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") From 512bd00f7ded66bf878f948ed505fe9f2d6c37f1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 13:16:57 -0700 Subject: [PATCH 04/61] Added ADBC to CI --- ci/deps/actions-310.yaml | 2 ++ ci/deps/actions-311-downstream_compat.yaml | 2 ++ ci/deps/actions-311.yaml | 2 ++ ci/deps/actions-39-minimum_versions.yaml | 2 ++ ci/deps/actions-39.yaml | 2 ++ ci/deps/circle-310-arm64.yaml | 4 ++++ environment.yml | 1 + pandas/compat/_optional.py | 2 ++ pyproject.toml | 4 +++- requirements-dev.txt | 1 + 10 files changed, 21 insertions(+), 1 deletion(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 0923594f2c840..dddf34fe851d5 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -58,5 +58,7 @@ dependencies: - zstandard>=0.17.0 - pip: + - adbc_driver_postgresql>=0.5.1 + - adbc_driver_sqlite>=0.5.1 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 51c7a97ad6500..04d2b995a4137 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -72,5 +72,7 @@ dependencies: - pyyaml - py - pip: + - adbc_driver_postgresql>=0.5.1 + - adbc_driver_sqlite>=0.5.1 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 66b8650116854..bde23ea70ffb0 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -58,5 +58,7 @@ dependencies: - zstandard>=0.17.0 - pip: + - adbc_driver_postgresql>=0.5.1 + - adbc_driver_sqlite>=0.5.1 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index e1b4fdfb1d897..13b76a99f97dc 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -60,5 +60,7 @@ dependencies: - zstandard=0.17.0 - pip: + - 
adbc_driver_postgresql=0.5.1 + - adbc_driver_sqlite=0.5.1 - pyqt5==5.15.6 - tzdata==2022.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 8ff47dbb9cc95..7ae156111038b 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -58,5 +58,7 @@ dependencies: - zstandard>=0.17.0 - pip: + - adbc_driver_postgresql>=0.5.1 + - adbc_driver_sqlite>=0.5.1 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index ca9860fc20742..8bfdf510d5ef9 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -57,3 +57,7 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=3.0.3 - zstandard>=0.17.0 + + - pip: + - adbc_driver_postgresql>=0.5.1 + - adbc_driver_sqlite>=0.5.1 diff --git a/environment.yml b/environment.yml index c89aaa63f4fc6..dcaa14cf9a189 100644 --- a/environment.yml +++ b/environment.yml @@ -119,6 +119,7 @@ dependencies: - pip: - adbc_driver_postgresql + - adbc_driver_sqlite - sphinx-toggleprompt - typing_extensions; python_version<"3.11" - tzdata>=2022.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index dcd49f65fc4cd..91a29ce645340 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,6 +15,8 @@ # Update install.rst & setup.cfg when updating versions! VERSIONS = { + "adbc_driver_postgresql": "0.5.1", + "adbc_driver_sqlite": "0.5.1", "bs4": "4.11.1", "blosc": "1.21.0", "bottleneck": "1.3.4", diff --git a/pyproject.toml b/pyproject.toml index 0d1bca886a638..dc3bd01f5f74f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,9 @@ plot = ['matplotlib>=3.6.1'] output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] compression = ['brotlipy>=0.7.0', 'python-snappy>=0.6.1', 'zstandard>=0.17.0'] -all = ['beautifulsoup4>=4.11.1', +all = ['adbc_driver_postgresql>=0.5.1', + 'adbc_driver_sqlite>=0.5.1', + 'beautifulsoup4>=4.11.1', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.0', 'bottleneck>=1.3.4', diff --git a/requirements-dev.txt b/requirements-dev.txt index 3cb73a2dc8096..7fd54bc3c7cd4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -88,6 +88,7 @@ pyyaml requests pygments adbc_driver_postgresql +adbc_driver_sqlite sphinx-toggleprompt typing_extensions; python_version<"3.11" tzdata>=2022.1 From f49115cb18dcbee5b95eea9438daf8c27fa4b720 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 13:27:07 -0700 Subject: [PATCH 05/61] Doc updates --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/io/sql.py | 111 ++++++--------------------------- 2 files changed, 19 insertions(+), 93 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 7b9efd7f593dd..ee0fa8b481d58 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -118,6 +118,7 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) +- :meth:`DataFrame.to_sql` and :func:`read_sql` now support ADBC drivers .. --------------------------------------------------------------------------- .. 
_whatsnew_210.notable_bug_fixes: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 24748dc427e90..48a1ace144122 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -629,6 +629,19 @@ def read_sql( int_column date_column 0 0 2012-11-10 1 1 2010-11-12 + + .. versionadded:: 2.1.0 + + pandas now supports reading via ADBC drivers + + >>> from adbc_driver_postgresql import dbapi + >>> with dbapi.connect('postgres:///db_name') as conn: + ... pd.read_sql('SELECT int_column, + ... conn, + ... parse_dates={"date_column": {"format": "%d/%m/%y"}}) + int_column + 0 0 + 1 1 """ check_dtype_backend(dtype_backend) @@ -2037,45 +2050,15 @@ def read_table( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: """ - Read SQL database table into a DataFrame. + Read SQL database table into a DataFrame. Only keyword arguments used + are table_name and schema. The rest are silently discarded. Parameters ---------- table_name : str Name of SQL table in database. - index_col : string, optional, default: None - Column to set as index. - coerce_float : bool, default True - Attempts to convert values of non-string, non-numeric objects - (like decimal.Decimal) to floating point. This can result in - loss of precision. - parse_dates : list or dict, default: None - - List of column names to parse as dates. - - Dict of ``{column_name: format string}`` where format string is - strftime compatible in case of parsing string times, or is one of - (D, s, ns, ms, us) in case of parsing integer timestamps. - - Dict of ``{column_name: arg}``, where the arg corresponds - to the keyword arguments of :func:`pandas.to_datetime`. - Especially useful with databases without native Datetime support, - such as SQLite. - columns : list, default: None - List of column names to select from SQL table. schema : string, default None - Name of SQL schema in database to query (if database flavor - supports this). If specified, this overwrites the default - schema of the SQL database object. - chunksize : int, default None - If specified, return an iterator where `chunksize` is the number - of rows to include in each chunk. - dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy dtypes - Which dtype_backend to use, e.g. whether a DataFrame should have NumPy - arrays, nullable dtypes are used for all dtypes that have a nullable - implementation when "numpy_nullable" is set, pyarrow is used for all - dtypes if "pyarrow" is set. - - The dtype_backends are still experimential. - - .. versionadded:: 2.0 + Name of SQL schema in database to read from Returns ------- @@ -2107,40 +2090,12 @@ def read_query( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: """ - Read SQL query into a DataFrame. + Read SQL query into a DataFrame. Keyword arguments are discarded. Parameters ---------- sql : str SQL query to be executed. - index_col : string, optional, default: None - Column name to use as index for the returned DataFrame object. - coerce_float : bool, default True - Attempt to convert values of non-string, non-numeric objects (like - decimal.Decimal) to floating point, useful for SQL result sets. - params : list, tuple or dict, optional, default: None - List of parameters to pass to execute method. The syntax used - to pass parameters is database driver dependent. Check your - database driver documentation for which of the five syntax styles, - described in PEP 249's paramstyle, is supported. - Eg. 
for psycopg2, uses %(name)s so use params={'name' : 'value'} - parse_dates : list or dict, default: None - - List of column names to parse as dates. - - Dict of ``{column_name: format string}`` where format string is - strftime compatible in case of parsing string times, or is one of - (D, s, ns, ms, us) in case of parsing integer timestamps. - - Dict of ``{column_name: arg dict}``, where the arg dict - corresponds to the keyword arguments of - :func:`pandas.to_datetime` Especially useful with databases - without native Datetime support, such as SQLite. - chunksize : int, default None - If specified, return an iterator where `chunksize` is the number - of rows to include in each chunk. - dtype : Type name or dict of columns - Data type for data or columns. E.g. np.float64 or - {‘a’: np.float64, ‘b’: np.int32, ‘c’: ‘Int64’} - - .. versionadded:: 1.3.0 Returns ------- @@ -2173,6 +2128,7 @@ def to_sql( ) -> int | None: """ Write records stored in a DataFrame to a SQL database. + Only frame, name, if_exists and schema are valid arguments. Parameters ---------- @@ -2183,41 +2139,10 @@ def to_sql( - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. - index : boolean, default True - Write DataFrame index as a column. - index_label : string or sequence, default None - Column label for index column(s). If None is given (default) and - `index` is True, then the index names are used. - A sequence should be given if the DataFrame uses MultiIndex. schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If specified, this overwrites the default schema of the SQLDatabase object. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. - dtype : single type or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type. If all columns are of the same type, one - single value can be used. - method : {None', 'multi', callable}, default None - Controls the SQL insertion clause used: - - * None : Uses standard SQL ``INSERT`` clause (one per row). - * 'multi': Pass multiple values in a single ``INSERT`` clause. - * callable with signature ``(pd_table, conn, keys, data_iter)``. - - Details and a sample callable implementation can be found in the - section :ref:`insert method `. - engine : {'auto', 'sqlalchemy'}, default 'auto' - SQL engine library to use. If 'auto', then the option - ``io.sql.engine`` is used. The default ``io.sql.engine`` - behavior is 'sqlalchemy' - - .. versionadded:: 1.3.0 - - **engine_kwargs - Any additional kwargs are passed to the engine. 
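
The keywords that survive this trimming map directly onto the driver's bulk-ingest API rather than onto generated INSERT statements. A hand-rolled sketch of what the ADBC write path reduces to, assuming adbc-driver-sqlite; the table name is illustrative:

    import pandas as pd
    import pyarrow as pa
    from adbc_driver_sqlite import dbapi

    frame = pd.DataFrame({"a": [1, 2]})
    tbl = pa.Table.from_pandas(frame, preserve_index=False)
    with dbapi.connect("file:demo.db") as conn:
        with conn.cursor() as cur:
            # mode="create" errors if the table already exists;
            # mode="append" requires that it does
            rows = cur.adbc_ingest("test_frame", tbl, mode="create")
        conn.commit()  # the ingest participates in the open transaction
    print(rows)  # rows written (a driver may report -1 if unknown)
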
""" if schema: table_name = f"{schema}.{name}" From a8512b588b05a3c36321067a1fd9d1b19c813137 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 13:28:05 -0700 Subject: [PATCH 06/61] Whatsnew update --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ee0fa8b481d58..77256c2b73e07 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -105,6 +105,7 @@ Other enhancements - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) +- :meth:`DataFrame.to_sql` and :func:`read_sql` now support ADBC drivers (:issue:`53869`) - :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) - :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) - :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) @@ -118,7 +119,6 @@ Other enhancements - Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) -- :meth:`DataFrame.to_sql` and :func:`read_sql` now support ADBC drivers .. --------------------------------------------------------------------------- .. 
_whatsnew_210.notable_bug_fixes: From c1c68ef10b70513cbf02a97ac50e8d8dc487e2d8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 15:23:15 -0700 Subject: [PATCH 07/61] Better optional dependency import --- pandas/io/sql.py | 2 +- pandas/tests/io/test_sql.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 48a1ace144122..9d854955ee78f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -850,7 +850,7 @@ def pandasSQL_builder( if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)): return SQLDatabase(con, schema, need_transaction) - adbc = import_optional_dependency("adbc_driver_manager") + adbc = import_optional_dependency("adbc_driver_manager", errors="ignore") if adbc and isinstance(con, adbc.dbapi.Connection): return ADBCDatabase(con) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index af3d25e8984f6..2382f334bb401 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -275,7 +275,7 @@ def check_iris_frame(frame: DataFrame): def count_rows(conn, table_name: str): stmt = f"SELECT count(*) AS count_1 FROM {table_name}" - adbc = import_optional_dependency("adbc_driver_manager") + adbc = import_optional_dependency("adbc_driver_manager", errors="ignore") if isinstance(conn, sqlite3.Connection): cur = conn.cursor() return cur.execute(stmt).fetchone()[0] From 3d7fb15d8e30d7baa9f5f171985e096b7b578744 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 15:30:08 -0700 Subject: [PATCH 08/61] min versions fix --- ci/deps/actions-39-minimum_versions.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 13b76a99f97dc..43d8bc528fb27 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -60,7 +60,7 @@ dependencies: - zstandard=0.17.0 - pip: - - adbc_driver_postgresql=0.5.1 - - adbc_driver_sqlite=0.5.1 + - adbc_driver_postgresql==0.5.1 + - adbc_driver_sqlite==0.5.1 - pyqt5==5.15.6 - tzdata==2022.1 From 1093bc871e46f6b36cd7cfc6a7dd8ddf3d164071 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 22:10:57 -0700 Subject: [PATCH 09/61] import updates --- pandas/io/sql.py | 4 ++-- pandas/tests/io/test_sql.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9d854955ee78f..85412f4039437 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -850,8 +850,8 @@ def pandasSQL_builder( if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)): return SQLDatabase(con, schema, need_transaction) - adbc = import_optional_dependency("adbc_driver_manager", errors="ignore") - if adbc and isinstance(con, adbc.dbapi.Connection): + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(con, adbc.Connection): return ADBCDatabase(con) warnings.warn( diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 2382f334bb401..9d6dd7087c7af 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -275,11 +275,11 @@ def check_iris_frame(frame: DataFrame): def count_rows(conn, table_name: str): stmt = f"SELECT count(*) AS count_1 FROM {table_name}" - adbc = import_optional_dependency("adbc_driver_manager", errors="ignore") + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if isinstance(conn, 
sqlite3.Connection): cur = conn.cursor() return cur.execute(stmt).fetchone()[0] - elif adbc and isinstance(conn, adbc.dbapi.Connection): + elif adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: cur.execute(stmt) return cur.fetchone()[0] From 926e567878923b551f43245578e4bdb0432ef5c6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 26 Jun 2023 23:18:14 -0700 Subject: [PATCH 10/61] docstring fix --- pandas/io/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 85412f4039437..982792e12dea1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -636,7 +636,7 @@ def read_sql( >>> from adbc_driver_postgresql import dbapi >>> with dbapi.connect('postgres:///db_name') as conn: - ... pd.read_sql('SELECT int_column, + ... pd.read_sql('SELECT int_column FROM test_data', ... conn, ... parse_dates={"date_column": {"format": "%d/%m/%y"}}) int_column From fcc21a864c0441b9f5995e0ff3c8028d0fbe7a64 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Jun 2023 11:13:02 -0700 Subject: [PATCH 11/61] doc fixup --- pandas/io/sql.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 964c10d484ef8..a42c6a8b0204a 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -635,10 +635,8 @@ def read_sql( pandas now supports reading via ADBC drivers >>> from adbc_driver_postgresql import dbapi - >>> with dbapi.connect('postgres:///db_name') as conn: - ... pd.read_sql('SELECT int_column FROM test_data', - ... conn, - ... parse_dates={"date_column": {"format": "%d/%m/%y"}}) + >>> with dbapi.connect('postgres:///db_name') as conn: # doctest:+SKIP + ... pd.read_sql('SELECT int_column FROM test_data', conn) int_column 0 0 1 1 From 156096d7c841f08ea4f14a7aa5c61065c08f10b3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 14 Jul 2023 13:34:41 -0700 Subject: [PATCH 12/61] Updates for 0.6.0 --- ci/deps/actions-310.yaml | 4 +- ci/deps/actions-311-downstream_compat.yaml | 4 +- ci/deps/actions-311.yaml | 4 +- ci/deps/actions-39-minimum_versions.yaml | 4 +- ci/deps/actions-39.yaml | 4 +- ci/deps/circle-310-arm64.yaml | 4 +- pandas/compat/_optional.py | 4 +- pandas/io/sql.py | 78 ++++++++++++++++++++-- pandas/tests/io/test_sql.py | 49 ++++++-------- pyproject.toml | 4 +- 10 files changed, 111 insertions(+), 48 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 295be6f54d345..01ed87b16af1c 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -59,7 +59,7 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.5.1 - - adbc_driver_sqlite>=0.5.1 + - adbc_driver_postgresql>=0.6.0 + - adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index a25974db429bb..73e248f95924a 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -73,7 +73,7 @@ dependencies: - pyyaml - py - pip: - - adbc_driver_postgresql>=0.5.1 - - adbc_driver_sqlite>=0.5.1 + - adbc_driver_postgresql>=0.6.0 + - adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index ac46969db008a..de917b84c1ee1 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -59,7 +59,7 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.5.1 - - adbc_driver_sqlite>=0.5.1 + - adbc_driver_postgresql>=0.6.0 + - 
adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 128cd90abeec5..579c1b9eb55f4 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -61,7 +61,7 @@ dependencies: - zstandard=0.17.0 - pip: - - adbc_driver_postgresql==0.5.1 - - adbc_driver_sqlite==0.5.1 + - adbc_driver_postgresql==0.6.0 + - adbc_driver_sqlite==0.6.0 - pyqt5==5.15.6 - tzdata==2022.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 73060f5447722..25a89d9cf689c 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -59,7 +59,7 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.5.1 - - adbc_driver_sqlite>=0.5.1 + - adbc_driver_postgresql>=0.6.0 + - adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 9fa8638be2024..511eacee9ea69 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -60,5 +60,5 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.5.1 - - adbc_driver_sqlite>=0.5.1 + - adbc_driver_postgresql>=0.6.0 + - adbc_driver_sqlite>=0.6.0 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 91a29ce645340..01820d2141a4c 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,8 +15,8 @@ # Update install.rst & setup.cfg when updating versions! VERSIONS = { - "adbc_driver_postgresql": "0.5.1", - "adbc_driver_sqlite": "0.5.1", + "adbc_driver_postgresql": "0.6.0", + "adbc_driver_sqlite": "0.6.0", "bs4": "4.11.1", "blosc": "1.21.0", "bottleneck": "1.3.4", diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 99f34af15fde4..4f2906adabe4d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -45,7 +45,10 @@ is_dict_like, is_list_like, ) -from pandas.core.dtypes.dtypes import DatetimeTZDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + DatetimeTZDtype, +) from pandas.core.dtypes.missing import isna from pandas import get_option @@ -2047,7 +2050,7 @@ def __init__(self, con) -> None: def execute(self, sql: str | Select | TextClause, params=None): with self.con.cursor() as cur: - return cur(sql) + return cur.execute(sql) def read_table( self, @@ -2081,13 +2084,37 @@ def read_table( SQLDatabase.read_query """ + if index_col: + raise NotImplementedError("'index_col' is not implemented for ADBC drivers") + if coerce_float is not True: + raise NotImplementedError( + "'coerce_float' is not implemented for ADBC drivers" + ) + if parse_dates: + raise NotImplementedError( + "'parse_dates' is not implemented for ADBC drivers" + ) + if columns: + raise NotImplementedError("'columns' is not implemented for ADBC drivers") + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + if schema: stmt = f"SELECT * FROM {schema}.{table_name}" else: stmt = f"SELECT * FROM {table_name}" + if dtype_backend == "pyarrow": + mapping = ArrowDtype + elif dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + else: + mapping = None + with self.con.cursor() as cur: - return cur(stmt).fetch_arrow_table().to_pandas() + return cur(stmt).fetch_arrow_table().to_pandas(types_mapper=mapping) def read_query( self, @@ -2118,8 +2145,34 @@ def read_query( read_sql """ + if index_col: + raise NotImplementedError("'index_col' is not implemented for ADBC 
drivers") + if coerce_float is not True: + raise NotImplementedError( + "'coerce_float' is not implemented for ADBC drivers" + ) + if parse_dates: + raise NotImplementedError( + "'parse_dates' is not implemented for ADBC drivers" + ) + if params: + raise NotImplementedError("'params' is not implemented for ADBC drivers") + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + if dtype: + raise NotImplementedError("'dtype' is not implemented for ADBC drivers") + + if dtype_backend == "pyarrow": + mapping = ArrowDtype + elif dtype_backend == "numpy_nullable": + from pandas.io._util import _arrow_dtype_mapping + + mapping = _arrow_dtype_mapping().get + else: + mapping = None + with self.con.cursor() as cur: - return cur(sql).fetch_arrow_table().to_pandas() + return cur(sql).fetch_arrow_table().to_pandas(types_mapper=mapping) read_sql = read_query @@ -2155,6 +2208,23 @@ def to_sql( supports this). If specified, this overwrites the default schema of the SQLDatabase object. """ + if index: + raise NotImplementedError("'index' is not implemented for ADBC drivers") + if index_label: + raise NotImplementedError( + "'index_label' is not implemented for ADBC drivers" + ) + if schema: + raise NotImplementedError("'shcema' is not implemented for ADBC drivers") + if chunksize: + raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + if dtype: + raise NotImplementedError("'dtype' is not implemented for ADBC drivers") + if method: + raise NotImplementedError("'method' is not implemented for ADBC drivers") + if engine != "auto": + raise NotImplementedError("'auto' is not implemented for ADBC drivers") + if schema: table_name = f"{schema}.{name}" else: diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 8950114d37db7..7c96897644c8d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -593,21 +593,6 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes(conn, request): - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="int8/datetime not implemented yet in pg adbc driver", - strict=True, - ) - ) - elif conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="timestamp not implemented yet in sqlite adbc driver", - strict=True, - ) - ) - # GH 52046 pytest.importorskip("pyarrow") df = DataFrame( @@ -622,27 +607,23 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): } ) + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="date/timedelta not implemented in ADBC", + strict=True, + ) + ) + conn = request.getfixturevalue(conn) - with tm.assert_produces_warning(UserWarning, match="the 'timedelta'"): + with tm.assert_produces_warning(UserWarning, match="time 'timedelta'"): df.to_sql("test_arrow", conn, if_exists="replace", index=False) @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.skip("int8/datetime not implemented yet in adbc driver") - ) - elif conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="timestamp not implemented yet in sqlite adbc driver", - strict=True, - ) - ) - # GH 52046 pytest.importorskip("pyarrow") df = DataFrame( @@ -664,6 +645,10 @@ def test_to_sql(conn, method, test_frame1, 
request): request.node.add_marker( pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) ) + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) + ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", method=method) @@ -679,6 +664,10 @@ def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): request.node.add_marker( pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) ) + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) + ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") @@ -694,6 +683,10 @@ def test_to_sql_exist_fail(conn, test_frame1, request): request.node.add_marker( pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) ) + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) + ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") diff --git a/pyproject.toml b/pyproject.toml index 8611a48ccf4f0..e11363fcba762 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,8 +78,8 @@ plot = ['matplotlib>=3.6.1'] output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] compression = ['brotlipy>=0.7.0', 'python-snappy>=0.6.1', 'zstandard>=0.17.0'] -all = ['adbc_driver_postgresql>=0.5.1', - 'adbc_driver_sqlite>=0.5.1', +all = ['adbc_driver_postgresql>=0.6.0', + 'adbc_driver_sqlite>=0.6.0', 'beautifulsoup4>=4.11.1', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.0', From dd26edb65a786ee268da638dc8b1852510083d9d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 20 Jul 2023 00:03:29 -0700 Subject: [PATCH 13/61] fix sqlite name escaping --- pandas/io/sql.py | 4 +--- pandas/tests/io/test_sql.py | 21 +-------------------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 4f2906adabe4d..616cbff0bd91a 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2208,8 +2208,6 @@ def to_sql( supports this). If specified, this overwrites the default schema of the SQLDatabase object. 
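
The hunk just below swaps the hard error on ``index`` for pyarrow's ``preserve_index`` flag, so the index simply rides along as an ordinary column when requested. A quick illustration of that flag (the column and index names are arbitrary):

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"a": [1, 2]}, index=pd.Index([10, 20], name="id"))
    # a named index materializes as a regular column when preserved
    print(pa.Table.from_pandas(df, preserve_index=True).column_names)   # ['a', 'id']
    print(pa.Table.from_pandas(df, preserve_index=False).column_names)  # ['a']
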
""" - if index: - raise NotImplementedError("'index' is not implemented for ADBC drivers") if index_label: raise NotImplementedError( "'index_label' is not implemented for ADBC drivers" @@ -2246,7 +2244,7 @@ def to_sql( import pyarrow as pa - tbl = pa.Table.from_pandas(frame) + tbl = pa.Table.from_pandas(frame, preserve_index=index) with self.con.cursor() as cur: total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7c96897644c8d..ce66a88f93fc6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -579,13 +579,6 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql(conn, test_frame1, request): # GH 51086 if conn is sqlite_engine - if conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="syntax error with CREATE TABLE", - strict=True, - ) - ) conn = request.getfixturevalue(conn) test_frame1.to_sql("test", conn, if_exists="append", index=False) @@ -641,13 +634,9 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): - if conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) - ) if conn == "postgresql_adbc_conn": request.node.add_marker( - pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) + pytest.mark.skip(reason="segfault when not 'index=False'", strict=True) ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: @@ -660,10 +649,6 @@ def test_to_sql(conn, method, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): - if conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) - ) if conn == "postgresql_adbc_conn": request.node.add_marker( pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) @@ -679,10 +664,6 @@ def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_exist_fail(conn, test_frame1, request): - if conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="syntax error with CREATE TABLE", strict=True) - ) if conn == "postgresql_adbc_conn": request.node.add_marker( pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) From 4d8a233259eb12ff7920b333dc1d6545877b6d78 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 20 Jul 2023 00:21:22 -0700 Subject: [PATCH 14/61] more cleanups --- pandas/tests/io/test_sql.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index ce66a88f93fc6..f3f29c95d8ce0 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -634,10 +634,13 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("method", [None, "multi"]) def test_to_sql(conn, method, test_frame1, request): - if conn == "postgresql_adbc_conn": + if 
method == "multi" and "adbc" in conn: request.node.add_marker( - pytest.mark.skip(reason="segfault when not 'index=False'", strict=True) + pytest.mark.xfail( + reason="'method' not implemented for ADBC drivers", strict=True + ) ) + conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", method=method) @@ -649,10 +652,6 @@ def test_to_sql(conn, method, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("mode, num_row_coef", [("replace", 1), ("append", 2)]) def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) - ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") @@ -664,10 +663,6 @@ def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_exist_fail(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="segfault when not 'index=False'", strict=True) - ) conn = request.getfixturevalue(conn) with pandasSQL_builder(conn, need_transaction=True) as pandasSQL: pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail") From 5238e69d68039156f7b6c9c2af5febe1b5459156 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 2 Aug 2023 13:15:50 -0400 Subject: [PATCH 15/61] more 0.6.0 updates --- pandas/tests/io/test_sql.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f3f29c95d8ce0..b49ef66f0aead 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -595,22 +595,33 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): [datetime(2023, 1, 1)], dtype="timestamp[ns][pyarrow]" ), "date": pd.array([date(2023, 1, 1)], dtype="date32[day][pyarrow]"), - "timedelta": pd.array([timedelta(1)], dtype="duration[ns][pyarrow]"), + "timedelta": pd.array( + [timedelta(1)], dtype="month_day_nano_interval[pyarrow]" + ), "string": pd.array(["a"], dtype="string[pyarrow]"), } ) - if "adbc" in conn: + if conn == "sqlite_adbc_conn": request.node.add_marker( pytest.mark.xfail( - reason="date/timedelta not implemented in ADBC", + reason="timedelta not implemented in ADBC sqlite driver", strict=True, ) ) + if "adbc" in conn: + exp_warning = FutureWarning + match_str = "is_sparse is deprecated" + else: + exp_warning = UserWarning + match_str = "time 'timedelta'" + conn = request.getfixturevalue(conn) - with tm.assert_produces_warning(UserWarning, match="time 'timedelta'"): + with tm.assert_produces_warning( + exp_warning, match=match_str, check_stacklevel=False + ): df.to_sql("test_arrow", conn, if_exists="replace", index=False) From 51c6c98c267a82ec2906ccde6ff2dd7601336d50 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 2 Aug 2023 13:40:54 -0400 Subject: [PATCH 16/61] typo --- pandas/io/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 616cbff0bd91a..18873b198fbc5 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2213,7 +2213,7 @@ def to_sql( "'index_label' is not implemented for ADBC drivers" ) if schema: - raise NotImplementedError("'shcema' is not implemented for 
ADBC drivers") + raise NotImplementedError("'schema' is not implemented for ADBC drivers") if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") if dtype: From 428c4f74fb136cb27ac392039585ffaeda23afd0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 11:27:53 -0400 Subject: [PATCH 17/61] remove warning --- pandas/tests/io/test_sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e2a2c587cabb3..6df366385bdba 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -612,8 +612,8 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): ) if "adbc" in conn: - exp_warning = FutureWarning - match_str = "is_sparse is deprecated" + exp_warning = None + match_str = None else: exp_warning = UserWarning match_str = "time 'timedelta'" From 84d95bb6dcfcb5e9fb2e0d76349e850127056590 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 11:35:13 -0400 Subject: [PATCH 18/61] test_sql expectations --- pandas/tests/io/test_sql.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6df366385bdba..a0ad0eed03506 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -596,13 +596,19 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): [datetime(2023, 1, 1)], dtype="timestamp[ns][pyarrow]" ), "date": pd.array([date(2023, 1, 1)], dtype="date32[day][pyarrow]"), - "timedelta": pd.array( - [timedelta(1)], dtype="month_day_nano_interval[pyarrow]" - ), "string": pd.array(["a"], dtype="string[pyarrow]"), } ) + if "adbc" in conn: + df["timedelta"] = pd.array( + [timedelta(1)], dtype="month_day_nano_interval[pyarrow]" + ) + exp_warning = None + else: + df["timedelta"] = pd.array([timedelta(1)], dtype="duration[ns][pyarrow]") + exp_warning = UserWarning + if conn == "sqlite_adbc_conn": request.node.add_marker( pytest.mark.xfail( @@ -611,18 +617,8 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): ) ) - if "adbc" in conn: - exp_warning = None - match_str = None - else: - exp_warning = UserWarning - match_str = "time 'timedelta'" - conn = request.getfixturevalue(conn) - - with tm.assert_produces_warning( - exp_warning, match=match_str, check_stacklevel=False - ): + with tm.assert_produces_warning(exp_warning, match="the 'timedelta'"): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) From a4d5b31f450336c80658ea988b63d7e2d607a1ef Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 11:37:06 -0400 Subject: [PATCH 19/61] revert whatsnew issues --- doc/source/whatsnew/v2.1.0.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 944b2c75d4e60..19a8500928ab7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -260,18 +260,6 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns (:issue:`52084`) - Improved error message when providing an invalid ``index`` or ``offset`` argument to :class:`.VariableOffsetWindowIndexer` (:issue:`54379`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) -- Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` -- :meth:`Categorical.from_codes` has gotten a ``validate`` 
parameter (:issue:`50975`) -- :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) -- :meth:`DataFrame.to_sql` and :func:`read_sql` now support ADBC drivers (:issue:`53869`) -- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`) -- :meth:`DataFrameGroupby.agg` and :meth:`DataFrameGroupby.transform` now support grouping by multiple keys when the index is not a :class:`MultiIndex` for ``engine="numba"`` (:issue:`53486`) -- :meth:`Series.explode` now supports pyarrow-backed list types (:issue:`53602`) -- :meth:`Series.str.join` now supports ``ArrowDtype(pa.string())`` (:issue:`53646`) -- :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`) -- :meth:`SeriesGroupby.transform` and :meth:`DataFrameGroupby.transform` now support passing in a string as the function for ``engine="numba"`` (:issue:`53579`) -- Added :meth:`ExtensionArray.interpolate` used by :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` (:issue:`53659`) -- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Added a new parameter ``by_row`` to :meth:`Series.apply` and :meth:`DataFrame.apply`. When set to ``False`` the supplied callables will always operate on the whole Series or DataFrame (:issue:`53400`, :issue:`53601`). - :meth:`DataFrame.shift` and :meth:`Series.shift` now allow shifting by multiple periods by supplying a list of periods (:issue:`44424`) - Groupby aggregations with ``numba`` (such as :meth:`.DataFrameGroupBy.sum`) now can preserve the dtype of the input instead of casting to ``float64`` (:issue:`44952`) From 21b35f6c964c4c02611203fc7d4e32637d94d4b6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 11:39:15 -0400 Subject: [PATCH 20/61] pip deps --- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index fcd678db1b379..bd8ad1cc0ea62 100644 --- a/environment.yml +++ b/environment.yml @@ -113,8 +113,8 @@ dependencies: - pygments # Code highlighting - pip: - - adbc_driver_postgresql - - adbc_driver_sqlite + - adbc_driver_postgresql>=0.6.0 + - adbc_driver_sqlite>=0.6.0 - dataframe-api-compat>=0.1.7 - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" diff --git a/requirements-dev.txt b/requirements-dev.txt index 10663605cf649..5b71525a1d6eb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -82,8 +82,8 @@ feedparser pyyaml requests pygments -adbc_driver_postgresql -adbc_driver_sqlite +adbc_driver_postgresql>=0.6.0 +adbc_driver_sqlite>=0.6.0 dataframe-api-compat>=0.1.7 sphinx-toggleprompt typing_extensions; python_version<"3.11" From e709d523c6484cc80d0de72285b6dfb9613c8635 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 18:22:23 -0400 Subject: [PATCH 21/61] Suppress pyarrow warning --- pandas/tests/io/test_sql.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a0ad0eed03506..6206c2f06ecbf 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -604,10 +604,12 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): df["timedelta"] = pd.array( [timedelta(1)], dtype="month_day_nano_interval[pyarrow]" ) - 
exp_warning = None + exp_warning = FutureWarning # warning thrown from pyarrow + msg = "is_sparce is deprecated" else: df["timedelta"] = pd.array([timedelta(1)], dtype="duration[ns][pyarrow]") exp_warning = UserWarning + msg = "the 'timedelta'" if conn == "sqlite_adbc_conn": request.node.add_marker( @@ -618,7 +620,7 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): ) conn = request.getfixturevalue(conn) - with tm.assert_produces_warning(exp_warning, match="the 'timedelta'"): + with tm.assert_produces_warning(exp_warning, match=msg): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) From 6077fa96d69daf11ebd3343f9de9d6cfe992119d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 18:26:21 -0400 Subject: [PATCH 22/61] Updated docs --- doc/source/getting_started/install.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ae7c9d4ea9c62..6c1d6d9461707 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -344,6 +344,8 @@ SQLAlchemy 1.4.36 postgresql, SQL support for dat sql-other psycopg2 2.9.3 postgresql PostgreSQL engine for sqlalchemy pymysql 1.0.2 mysql MySQL engine for sqlalchemy +adbc-driver-postgresql 0.6.0 ADBC Driver for PostgreSQL +adbc-driver-sqlite 0.6.0 ADBC Driver for SQLite ========================= ================== =============== ============================================================= Other data sources From 5bba566e887b2ea10b94b28b3c17bad86c2eb219 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 28 Aug 2023 18:32:05 -0400 Subject: [PATCH 23/61] mypy fixes --- pandas/io/sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3a33ce45b5c84..71d20503612d4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2113,6 +2113,7 @@ def read_table( else: stmt = f"SELECT * FROM {table_name}" + mapping: type[ArrowDtype] | None | Callable if dtype_backend == "pyarrow": mapping = ArrowDtype elif dtype_backend == "numpy_nullable": @@ -2171,6 +2172,7 @@ def read_query( if dtype: raise NotImplementedError("'dtype' is not implemented for ADBC drivers") + mapping: type[ArrowDtype] | None | Callable if dtype_backend == "pyarrow": mapping = ArrowDtype elif dtype_backend == "numpy_nullable": From 236e12b8661a3a52263624d014b0c31bd01bb081 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Aug 2023 14:34:24 -0400 Subject: [PATCH 24/61] Remove stacklevel check from test --- pandas/tests/io/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6206c2f06ecbf..448a05946a086 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -620,7 +620,7 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): ) conn = request.getfixturevalue(conn) - with tm.assert_produces_warning(exp_warning, match=msg): + with tm.assert_produces_warning(exp_warning, match=msg, check_stacklevel=False): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) From b35374c1815940e8ca8b70090a34f4fbb6e8e4fc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 29 Aug 2023 17:11:03 -0400 Subject: [PATCH 25/61] typo fix --- pandas/tests/io/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 448a05946a086..6c31f11132ec4 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py 
@@ -605,7 +605,7 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): [timedelta(1)], dtype="month_day_nano_interval[pyarrow]" ) exp_warning = FutureWarning # warning thrown from pyarrow - msg = "is_sparce is deprecated" + msg = "is_sparse is deprecated" else: df["timedelta"] = pd.array([timedelta(1)], dtype="duration[ns][pyarrow]") exp_warning = UserWarning From 8d814e15b48d7ae43fe04488a0b4304df7f03b50 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 30 Aug 2023 09:32:50 -0400 Subject: [PATCH 26/61] compat --- pandas/tests/io/test_sql.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6c31f11132ec4..d724b435d5c1d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -35,6 +35,7 @@ import pytest from pandas._libs import lib +from pandas.compat import pa_version_under8p0 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -462,6 +463,8 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + if pa_version_under8p0: + pytest.skip("ADBC requires pyarrow >= 8.0.0") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql import dbapi @@ -495,6 +498,8 @@ def sqlite_conn(sqlite_engine): @pytest.fixture def sqlite_adbc_conn(): + if pa_version_under8p0: + pytest.skip("ADBC requires pyarrow >= 8.0.0") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi From cfac2c712179dc72e57e62a8d4624d277861eeb9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 31 Aug 2023 14:31:18 -0400 Subject: [PATCH 27/61] Joris feedback --- doc/source/getting_started/install.rst | 3 ++- environment.yml | 4 ++-- pandas/io/sql.py | 12 +++++++++--- pandas/tests/io/test_sql.py | 23 ++++++++++++++++++++++- requirements-dev.txt | 4 ++-- 5 files changed, 37 insertions(+), 9 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 6c1d6d9461707..67a0ecc2007ff 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -334,7 +334,8 @@ lxml 4.8.0 xml XML parser for read SQL databases ^^^^^^^^^^^^^ -Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. +Traditional ODBC drivers are Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. ADBC drivers +must be installed separately. 
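For example, once a driver package is present, the connection object goes straight into pandas; a rough sketch (the URI matches the postgresql_adbc_conn test fixture used later in this series and assumes a local PostgreSQL server with those credentials):

    import pandas as pd
    from adbc_driver_postgresql import dbapi

    # URI reused from the test fixtures in this series; adjust for your setup.
    uri = "postgresql://postgres:postgres@localhost:5432/pandas"
    with dbapi.connect(uri) as conn:
        df = pd.read_sql("SELECT 1 AS x", conn)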
 ========================= ================== =============== =============================================================
 Dependency                Minimum Version    pip extra       Notes
diff --git a/environment.yml b/environment.yml
index bd8ad1cc0ea62..dffb8438d95c3 100644
--- a/environment.yml
+++ b/environment.yml
@@ -25,6 +25,8 @@ dependencies:
   - pytz

   # optional dependencies
+  - adbc_driver_postgresql>=0.6.0
+  - adbc_driver_sqlite>=0.6.0
   - beautifulsoup4>=4.11.1
   - blosc
   - bottleneck>=1.3.4
@@ -113,8 +115,6 @@ dependencies:
   - pygments  # Code highlighting

   - pip:
-    - adbc_driver_postgresql>=0.6.0
-    - adbc_driver_sqlite>=0.6.0
     - dataframe-api-compat>=0.1.7
     - sphinx-toggleprompt  # conda-forge version has stricter pins on jinja2
     - typing_extensions; python_version<"3.11"
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 71d20503612d4..67b637cdbff49 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -32,6 +32,8 @@

 import numpy as np

+from pandas._config import using_pyarrow_string_dtype
+
 from pandas._libs import lib
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import (
@@ -2074,7 +2076,7 @@ def read_table(
     ) -> DataFrame | Iterator[DataFrame]:
         """
         Read SQL database table into a DataFrame. Only keyword arguments used
-        are table_name and schema. The rest are silently discarded.
+        are table_name and schema. Other keywords will raise NotImplementedError.

         Parameters
         ----------
@@ -2120,11 +2122,15 @@ def read_table(
             from pandas.io._util import _arrow_dtype_mapping

             mapping = _arrow_dtype_mapping().get
+        elif using_pyarrow_string_dtype():
+            from pandas.io._util import arrow_string_types_mapper
+
+            mapping = arrow_string_types_mapper()
         else:
             mapping = None

         with self.con.cursor() as cur:
-            return cur(stmt).fetch_arrow_table().to_pandas(types_mapper=mapping)
+            return cur.execute(stmt).fetch_arrow_table().to_pandas(types_mapper=mapping)

     def read_query(
         self,
@@ -2203,7 +2209,7 @@ def to_sql(
     ) -> int | None:
         """
         Write records stored in a DataFrame to a SQL database.
-        Only frame, name, if_exists and schema are valid arguments.
+        Only frame, name, if_exists, index and schema are valid arguments.
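In practice the restriction reads roughly as in the sketch below; the SQLite driver and in-memory URI are illustrative choices, and the error text mirrors the NotImplementedError checks added earlier in this class:

    import pandas as pd
    from adbc_driver_sqlite import dbapi  # assumes the driver is installed

    df = pd.DataFrame({"a": [1, 2, 3]})

    with dbapi.connect(":memory:") as conn:
        # The supported subset: frame, name, if_exists, index (and schema)
        df.to_sql(name="demo", con=conn, if_exists="replace", index=False)

        # Any other keyword surfaces the NotImplementedError raised above
        try:
            df.to_sql(name="demo", con=conn, if_exists="replace", chunksize=100)
        except NotImplementedError as exc:
            print(exc)  # 'chunksize' is not implemented for ADBC drivers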
Parameters ---------- diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d724b435d5c1d..e448dfbc69ed0 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -146,7 +146,10 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): reader = csv.reader(csvfile) next(reader) stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" - cur.executemany(stmt, reader) + cur.executemany(stmt, list(reader)) + + conn.commit() + cur.close() def create_and_load_iris(conn, iris_file: Path, dialect: str): @@ -532,6 +535,23 @@ def sqlite_iris_conn(sqlite_iris_engine): yield conn +@pytest.fixture +def sqlite_iris_adbc_conn(iris_path): + if pa_version_under8p0: + pytest.skip("ADBC requires pyarrow >= 8.0.0") + pytest.importorskip("adbc_driver_sqlite") + from adbc_driver_sqlite import dbapi + + with tm.ensure_clean() as name: + uri = f"file:{name}" + with dbapi.connect(uri) as conn: + create_and_load_iris_sqlite3(conn, iris_path) + + yield conn + with conn.cursor() as cur: + cur.execute("DROP TABLE IF EXISTS test_frame") + + @pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: @@ -566,6 +586,7 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): "sqlite_iris_engine", "sqlite_iris_conn", "sqlite_iris_str", + "sqlite_iris_adbc_conn", ] sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable diff --git a/requirements-dev.txt b/requirements-dev.txt index 5b71525a1d6eb..e338f9c05c7bc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,6 +14,8 @@ coverage python-dateutil numpy pytz +adbc_driver_postgresql>=0.6.0 +adbc_driver_sqlite>=0.6.0 beautifulsoup4>=4.11.1 blosc bottleneck>=1.3.4 @@ -82,8 +84,6 @@ feedparser pyyaml requests pygments -adbc_driver_postgresql>=0.6.0 -adbc_driver_sqlite>=0.6.0 dataframe-api-compat>=0.1.7 sphinx-toggleprompt typing_extensions; python_version<"3.11" From a22e5d1fd59d9563ccc4d6e38bf52426ce84a12b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 31 Aug 2023 16:37:17 -0400 Subject: [PATCH 28/61] Better test coverage with ADBC --- pandas/io/sql.py | 3 ++- pandas/tests/io/test_sql.py | 53 ++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 67b637cdbff49..f9fd6138f799f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2189,7 +2189,8 @@ def read_query( mapping = None with self.con.cursor() as cur: - return cur(sql).fetch_arrow_table().to_pandas(types_mapper=mapping) + cur.execute(sql) + return cur.fetch_arrow_table().to_pandas(types_mapper=mapping) read_sql = read_query diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index e448dfbc69ed0..bc9b5271eaf08 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -156,6 +156,7 @@ def create_and_load_iris(conn, iris_file: Path, dialect: str): from sqlalchemy import insert from sqlalchemy.engine import Engine + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") iris = iris_table_metadata(dialect) with iris_file.open(newline=None, encoding="utf-8") as csvfile: @@ -169,6 +170,13 @@ def create_and_load_iris(conn, iris_file: Path, dialect: str): iris.drop(conn, checkfirst=True) iris.create(bind=conn) conn.execute(stmt) + elif adbc and isinstance(conn, adbc.Connection): + from sqlalchemy.schema import CreateTable + + create_stmt = CreateTable(iris, if_not_exists=True) + with conn.cursor() as cur: + 
cur.execute(str(create_stmt)) + cur.execute(str(stmt.compile(compile_kwargs={"literal_binds": True}))) else: with conn.begin(): iris.drop(conn, checkfirst=True) @@ -465,17 +473,24 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture -def postgresql_adbc_conn(): +def postgresql_adbc_conn(iris_path): if pa_version_under8p0: pytest.skip("ADBC requires pyarrow >= 8.0.0") pytest.importorskip("adbc_driver_postgresql") + import adbc_driver_manager as mgr from adbc_driver_postgresql import dbapi uri = "postgresql://postgres:postgres@localhost:5432/pandas" with dbapi.connect(uri) as conn: + try: + conn.adbc_get_table_schema("iris") + except mgr.OperationalError: + conn.rollback() + create_and_load_iris(conn, iris_path, "postgresql") yield conn with conn.cursor() as cur: cur.execute("DROP TABLE IF EXISTS test_frame") + cur.execute("DROP TABLE IF EXISTS iris") @pytest.fixture @@ -500,18 +515,25 @@ def sqlite_conn(sqlite_engine): @pytest.fixture -def sqlite_adbc_conn(): +def sqlite_adbc_conn(iris_path): if pa_version_under8p0: pytest.skip("ADBC requires pyarrow >= 8.0.0") pytest.importorskip("adbc_driver_sqlite") + import adbc_driver_manager as mgr from adbc_driver_sqlite import dbapi with tm.ensure_clean() as name: uri = f"file:{name}" with dbapi.connect(uri) as conn: + try: + conn.adbc_get_table_schema("iris") + except mgr.InternalError: # note arrow-adbc issue 1022 + conn.rollback() + create_and_load_iris(conn, iris_path, "sqlite") yield conn with conn.cursor() as cur: cur.execute("DROP TABLE IF EXISTS test_frame") + cur.execute("DROP TABLE IF EXISTS iris") @pytest.fixture @@ -586,7 +608,6 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): "sqlite_iris_engine", "sqlite_iris_conn", "sqlite_iris_str", - "sqlite_iris_adbc_conn", ] sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable @@ -599,7 +620,9 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path): all_connectable = sqlalchemy_connectable + ["sqlite_buildin"] + adbc_connectable -all_connectable_iris = sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] +all_connectable_iris = ( + sqlalchemy_connectable_iris + ["sqlite_buildin_iris"] + adbc_connectable +) @pytest.mark.db @@ -725,6 +748,13 @@ def test_read_iris_query(conn, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_query("SELECT * FROM iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -738,6 +768,13 @@ def test_read_iris_query_chunksize(conn, request): @pytest.mark.db @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) def test_read_iris_query_expression_with_parameter(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) from sqlalchemy import ( MetaData, @@ -760,6 +797,14 @@ def test_read_iris_query_expression_with_parameter(conn, request): @pytest.mark.db @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_iris_query_string_with_parameter(conn, request, sql_strings): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'chunksize' not implemented for ADBC drivers", + strict=True, + ) + ) + for db, query in 
sql_strings["read_parameters"].items(): if db in conn: break From c51b7f466eec75ee90ea659f0105dea0411e0bfc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 31 Aug 2023 16:41:09 -0400 Subject: [PATCH 29/61] cleanups --- pandas/io/sql.py | 3 ++- pandas/tests/io/test_sql.py | 17 ----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f9fd6138f799f..5439d7afafa28 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2130,7 +2130,8 @@ def read_table( mapping = None with self.con.cursor() as cur: - return cur.execute(stmt).fetch_arrow_table().to_pandas(types_mapper=mapping) + cur.execute(stmt) + return cur.fetch_arrow_table().to_pandas(types_mapper=mapping) def read_query( self, diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bc9b5271eaf08..8d5cf4d5bbfad 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -557,23 +557,6 @@ def sqlite_iris_conn(sqlite_iris_engine): yield conn -@pytest.fixture -def sqlite_iris_adbc_conn(iris_path): - if pa_version_under8p0: - pytest.skip("ADBC requires pyarrow >= 8.0.0") - pytest.importorskip("adbc_driver_sqlite") - from adbc_driver_sqlite import dbapi - - with tm.ensure_clean() as name: - uri = f"file:{name}" - with dbapi.connect(uri) as conn: - create_and_load_iris_sqlite3(conn, iris_path) - - yield conn - with conn.cursor() as cur: - cur.execute("DROP TABLE IF EXISTS test_frame") - - @pytest.fixture def sqlite_buildin(): with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn: From 7f5e6ac149154af47086c1d27388fb6a599e74a5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 1 Sep 2023 12:02:38 -0400 Subject: [PATCH 30/61] feedback --- doc/source/getting_started/install.rst | 2 +- pandas/tests/io/test_sql.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 67a0ecc2007ff..8256c1ad2bb50 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -334,7 +334,7 @@ lxml 4.8.0 xml XML parser for read SQL databases ^^^^^^^^^^^^^ -Traditional ODBC drivers are Installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. ADBC drivers +Traditional ODBC drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. ADBC drivers must be installed separately. 
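Runtime detection of an installed driver follows the optional-import pattern used throughout this series; is_adbc_connection below is a hypothetical helper written for illustration, not part of the patch:

    from pandas.compat._optional import import_optional_dependency

    def is_adbc_connection(con) -> bool:
        # Returns False when adbc_driver_manager is absent instead of
        # raising, matching the errors="ignore" pattern in this series.
        adbc = import_optional_dependency(
            "adbc_driver_manager.dbapi", errors="ignore"
        )
        return adbc is not None and isinstance(con, adbc.Connection)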
========================= ================== =============== ============================================================= diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 8d5cf4d5bbfad..c9f5a25865f89 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -146,10 +146,7 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): reader = csv.reader(csvfile) next(reader) stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" - cur.executemany(stmt, list(reader)) - - conn.commit() - cur.close() + cur.executemany(stmt, reader) def create_and_load_iris(conn, iris_file: Path, dialect: str): From a8b645f8ae03fca04d61303d684ed795c89bf26e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 18 Sep 2023 22:57:45 -0400 Subject: [PATCH 31/61] checkpoint --- pandas/tests/io/test_sql.py | 280 +++++++++++++++++++++++++++++------- 1 file changed, 231 insertions(+), 49 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1e5ad4341afcc..a07d4d39093fb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -133,7 +133,6 @@ def iris_table_metadata(dialect: str): def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): - cur = conn.cursor() stmt = """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, @@ -141,12 +140,55 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): "PetalWidth" REAL, "Name" TEXT )""" - cur.execute(stmt) - with iris_file.open(newline=None, encoding="utf-8") as csvfile: - reader = csv.reader(csvfile) - next(reader) - stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" - cur.executemany(stmt, reader) + with conn.cursor() as cur: + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [] + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) + + +def create_and_load_iris_postgresql(conn, iris_file: Path): + stmt = """CREATE TABLE iris ( + "SepalLength" DOUBLE PRECISION, + "SepalWidth" DOUBLE PRECISION, + "PetalLength" DOUBLE PRECISION, + "PetalWidth" DOUBLE PRECISION, + "Name" TEXT + )""" + with conn.cursor() as cur: + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) def create_and_load_iris(conn, iris_file: Path, dialect: str): @@ -233,26 +275,48 @@ def types_table_metadata(dialect: str): return types -def create_and_load_types_sqlite3(conn: sqlite3.Connection, types_data: list[dict]): - cur = conn.cursor() - stmt = """CREATE TABLE types ( - "TextCol" TEXT, - "DateCol" TEXT, - "IntDateCol" INTEGER, - "IntDateOnlyCol" INTEGER, - "FloatCol" REAL, - "IntCol" INTEGER, - "BoolCol" INTEGER, - "IntColWithNull" INTEGER, - "BoolColWithNull" INTEGER - )""" - cur.execute(stmt) - - stmt = """ - INSERT INTO types - VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """ - cur.executemany(stmt, types_data) +def create_and_load_types_sqlite3(conn, types_data: list[dict]): + with conn.cursor() as cur: + stmt = """CREATE TABLE types ( + "TextCol" TEXT, + "DateCol" TEXT, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" REAL, + "IntCol" INTEGER, + "BoolCol" INTEGER, + "IntColWithNull" INTEGER, + "BoolColWithNull" INTEGER + )""" + cur.execute(stmt) + + stmt = """ + INSERT INTO types + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) + """ + cur.executemany(stmt, types_data) + + +def create_and_load_types_postgresql(conn, types_data: list[dict]): + with conn.cursor() as cur: + stmt = """CREATE TABLE types ( + "TextCol" TEXT, + "DateCol" TEXT, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" DOUBLE PRECISION, + "IntCol" INTEGER, + "BoolCol" INTEGER, + "IntColWithNull" INTEGER, + "BoolColWithNull" INTEGER + )""" + cur.execute(stmt) + + stmt = """ + INSERT INTO types + VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9) + """ + cur.executemany(stmt, types_data) def create_and_load_types(conn, types_data: list[dict], dialect: str): @@ -417,9 +481,24 @@ def get_all_views(conn): c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") return [view[0] for view in c.fetchall()] else: - from sqlalchemy import inspect + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + results = [] + info = conn.adbc_get_objects().read_all().to_pylist() + for catalog in info: + catalog["catalog_name"] + for schema in catalog["catalog_db_schemas"]: + schema_name = schema["db_schema_name"] + for table in schema["db_schema_tables"]: + if table["table_type"] == "view": + table_name = table["table_name"] + results.append((schema_name, table_name)) + + return results + else: + from sqlalchemy import inspect - return inspect(conn).get_view_names() + return inspect(conn).get_view_names() def get_all_tables(conn): @@ -427,9 +506,22 @@ def get_all_tables(conn): c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") return [table[0] for table in c.fetchall()] else: - from sqlalchemy import inspect + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + results = [] + info = conn.adbc_get_objects().read_all().to_pylist() + for catalog in info: + for schema in catalog["catalog_db_schemas"]: + for table in schema["db_schema_tables"]: + if table["table_type"] == "table": + table_name = table["table_name"] + results.append(table_name) + + return results + else: + from sqlalchemy import inspect - return inspect(conn).get_table_names() + return inspect(conn).get_table_names() def drop_table( @@ -439,9 +531,15 @@ def drop_table( if isinstance(conn, sqlite3.Connection): conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}") conn.commit() + else: - with conn.begin() as con: - sql.SQLDatabase(con).drop_table(table_name) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f"DROP TABLE IF EXISTS {table_name}") + else: + with conn.begin() as con: + sql.SQLDatabase(con).drop_table(table_name) def drop_view( @@ -452,12 +550,17 @@ def drop_view( conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") conn.commit() else: - quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( - view_name - ) - stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS 
{quoted_view}") - with conn.begin() as con: - con.execute(stmt) # type: ignore[union-attr] + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(f"DROP TABLE IF EXISTS {view_name}") + else: + quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( + view_name + ) + stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}") + with conn.begin() as con: + con.execute(stmt) # type: ignore[union-attr] @pytest.fixture @@ -522,7 +625,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture -def postgresql_adbc_conn(iris_path): +def postgresql_adbc_conn(iris_path, types_data): if pa_version_under8p0: pytest.skip("ADBC requires pyarrow >= 8.0.0") pytest.importorskip("adbc_driver_postgresql") @@ -535,11 +638,25 @@ def postgresql_adbc_conn(iris_path): conn.adbc_get_table_schema("iris") except mgr.OperationalError: conn.rollback() - create_and_load_iris(conn, iris_path, "postgresql") + create_and_load_iris_postgresql(conn, iris_path) + try: + conn.adbc_get_table_schema("types") + except mgr.OperationalError: + conn.rollback() + # ADBC cannot cast boolean data + new_data = [] + for entry in types_data: + entry["BoolCol"] = int(entry["BoolCol"]) + if entry["BoolColWithNull"] is not None: + entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) + new_data.append(tuple(entry.values())) + + create_and_load_types_postgresql(conn, new_data) yield conn - with conn.cursor() as cur: - cur.execute("DROP TABLE IF EXISTS test_frame") - cur.execute("DROP TABLE IF EXISTS iris") + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) @pytest.fixture @@ -579,7 +696,7 @@ def sqlite_conn(sqlite_engine): @pytest.fixture -def sqlite_adbc_conn(iris_path): +def sqlite_adbc_conn(iris_path, types_data): if pa_version_under8p0: pytest.skip("ADBC requires pyarrow >= 8.0.0") pytest.importorskip("adbc_driver_sqlite") @@ -593,11 +710,25 @@ def sqlite_adbc_conn(iris_path): conn.adbc_get_table_schema("iris") except mgr.InternalError: # note arrow-adbc issue 1022 conn.rollback() - create_and_load_iris(conn, iris_path, "sqlite") + create_and_load_iris_sqlite3(conn, iris_path) + try: + conn.adbc_get_table_schema("types") + except mgr.InternalError: # note arrow-adbc issue 1022 + conn.rollback() + new_data = [] + for entry in types_data: + entry["BoolCol"] = int(entry["BoolCol"]) + if entry["BoolColWithNull"] is not None: + entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) + entry.pop("DateColWithTz") + new_data.append(tuple(entry.values())) + + create_and_load_types_sqlite3(conn, new_data) yield conn - with conn.cursor() as cur: - cur.execute("DROP TABLE IF EXISTS test_frame") - cur.execute("DROP TABLE IF EXISTS iris") + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) @pytest.fixture @@ -730,6 +861,12 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="postgres ADBC driver doesn't like empty dataset", + ) + ) # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -921,6 +1058,10 @@ def test_read_iris_table(conn, request): @pytest.mark.parametrize("conn", sqlalchemy_connectable_iris) 
def test_read_iris_table_chunksize(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) iris_frame = concat(read_sql_table("iris", conn, chunksize=7)) check_iris_frame(iris_frame) @@ -1314,6 +1455,10 @@ def test_api_read_sql_view(conn, request): @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_read_sql_with_chunksize_no_result(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0' with_batch = sql.read_sql_query(query, conn, chunksize=5) @@ -1430,6 +1575,10 @@ def test_api_roundtrip(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_roundtrip_chunksize(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_frame_roundtrip", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1462,6 +1611,10 @@ def test_api_date_parsing(conn, request): conn = request.getfixturevalue(conn) # Test date parsing in read_sql # No Parsing + if "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="parse_dates argument NotImplemented with ADBC") + ) df = sql.read_sql_query("SELECT * FROM types", conn) if not ("mysql" in conn_name or "postgres" in conn_name): assert not issubclass(df.DateCol.dtype.type, np.datetime64) @@ -1536,6 +1689,10 @@ def test_api_custom_dateparsing_error( request.node.add_marker( pytest.mark.xfail(reason="failing combination of arguments") ) + elif "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="parse_dates argument NotImplemented with ADBC") + ) expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) @@ -1558,6 +1715,10 @@ def test_api_custom_dateparsing_error( @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_date_and_index(conn, request): # Test case where same column appears in parse_date and index_col + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="index_col argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) df = sql.read_sql_query( "SELECT * FROM types", @@ -1614,6 +1775,10 @@ def test_api_complex_raises(conn, request): ], ) def test_api_to_sql_index_label(conn, request, index_name, index_label, expected): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_index_label", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1636,6 +1801,10 @@ def test_api_to_sql_index_label_multiindex(conn, request): reason="MySQL can fail using TEXT without length as key", strict=False ) ) + elif "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) if sql.has_table("test_index_label", conn): @@ -1729,6 +1898,11 @@ def test_api_multiindex_roundtrip(conn, request): ) def test_api_dtype_argument(conn, request, dtype): # GH10285 Add dtype argument to read_sql_query + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="dtype argument NotImplemented 
with ADBC") + ) + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_dtype_argument", conn): @@ -1812,6 +1986,10 @@ def test_api_get_schema_keys(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_chunksize_read(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_chunksize", conn): @@ -3326,6 +3504,10 @@ def func(storage, dtype_backend, conn_name): @pytest.mark.parametrize("conn", all_connectable) def test_chunksize_empty_dtypes(conn, request): # GH#50245 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) dtypes = {"a": "int64", "b": "object"} df = DataFrame(columns=["a", "b"]).astype(dtypes) From 902df4f91afea4dc0f4edef8aad13a2b5e29e0ed Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 19 Sep 2023 16:49:29 -0400 Subject: [PATCH 32/61] more checkpoint --- pandas/io/sql.py | 36 +++++- pandas/tests/io/test_sql.py | 211 ++++++++++++++++++++++++++++-------- 2 files changed, 200 insertions(+), 47 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index de669f8bf3950..e0034c4cf8496 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2059,9 +2059,35 @@ class ADBCDatabase(PandasSQL): def __init__(self, con) -> None: self.con = con - def execute(self, sql: str | Select | TextClause, params=None): + @contextmanager + def run_transaction(self): with self.con.cursor() as cur: - return cur.execute(sql) + try: + yield cur + except Exception: + self.con.rollback() + raise + self.con.commit() + + def execute(self, sql: str | Select | TextClause, params=None): + if not isinstance(sql, str): + raise TypeError("Query must be a string unless using sqlalchemy.") + args = [] if params is None else [params] + cur = self.con.cursor() + try: + cur.execute(sql, *args) + return cur + except Exception as exc: + try: + self.con.rollback() + except Exception as inner_exc: # pragma: no cover + ex = DatabaseError( + f"Execution failed on sql: {sql}\n{exc}\nunable to rollback" + ) + raise ex from inner_exc + + ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}") + raise ex from exc def read_table( self, @@ -2263,7 +2289,11 @@ def to_sql( import pyarrow as pa - tbl = pa.Table.from_pandas(frame, preserve_index=index) + try: + tbl = pa.Table.from_pandas(frame, preserve_index=index) + except pa.ArrowNotImplementedError as exc: + raise ValueError("datatypes not supported") from exc + with self.con.cursor() as cur: total_inserted = cur.adbc_ingest(table_name, tbl, mode=mode) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a07d4d39093fb..cd964497ff221 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -161,6 +161,8 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): cur.executemany(stmt, records) + conn.commit() + def create_and_load_iris_postgresql(conn, iris_file: Path): stmt = """CREATE TABLE iris ( @@ -190,12 +192,13 @@ def create_and_load_iris_postgresql(conn, iris_file: Path): cur.executemany(stmt, records) + conn.commit() + def create_and_load_iris(conn, iris_file: Path, dialect: str): from sqlalchemy import insert - from sqlalchemy.engine import Engine - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + 
import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") iris = iris_table_metadata(dialect) with iris_file.open(newline=None, encoding="utf-8") as csvfile: @@ -203,24 +206,10 @@ def create_and_load_iris(conn, iris_file: Path, dialect: str): header = next(reader) params = [dict(zip(header, row)) for row in reader] stmt = insert(iris).values(params) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) - elif adbc and isinstance(conn, adbc.Connection): - from sqlalchemy.schema import CreateTable - - create_stmt = CreateTable(iris, if_not_exists=True) - with conn.cursor() as cur: - cur.execute(str(create_stmt)) - cur.execute(str(stmt.compile(compile_kwargs={"literal_binds": True}))) - else: - with conn.begin(): - iris.drop(conn, checkfirst=True) - iris.create(bind=conn) - conn.execute(stmt) + with conn.begin() as con: + iris.drop(con, checkfirst=True) + iris.create(bind=con) + con.execute(stmt) def create_and_load_iris_view(conn): @@ -229,17 +218,17 @@ def create_and_load_iris_view(conn): cur = conn.cursor() cur.execute(stmt) else: - from sqlalchemy import text - from sqlalchemy.engine import Engine - - stmt = text(stmt) - if isinstance(conn, Engine): - with conn.connect() as conn: - with conn.begin(): - conn.execute(stmt) + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if adbc and isinstance(conn, adbc.Connection): + with conn.cursor() as cur: + cur.execute(stmt) + conn.commit() else: - with conn.begin(): - conn.execute(stmt) + from sqlalchemy import text + + stmt = text(stmt) + with conn.begin() as con: + con.execute(stmt) def types_table_metadata(dialect: str): @@ -296,6 +285,8 @@ def create_and_load_types_sqlite3(conn, types_data: list[dict]): """ cur.executemany(stmt, types_data) + conn.commit() + def create_and_load_types_postgresql(conn, types_data: list[dict]): with conn.cursor() as cur: @@ -318,6 +309,8 @@ def create_and_load_types_postgresql(conn, types_data: list[dict]): """ cur.executemany(stmt, types_data) + conn.commit() + def create_and_load_types(conn, types_data: list[dict], dialect: str): from sqlalchemy import insert @@ -488,11 +481,11 @@ def get_all_views(conn): for catalog in info: catalog["catalog_name"] for schema in catalog["catalog_db_schemas"]: - schema_name = schema["db_schema_name"] + schema["db_schema_name"] for table in schema["db_schema_tables"]: if table["table_type"] == "view": - table_name = table["table_name"] - results.append((schema_name, table_name)) + view_name = table["table_name"] + results.append(view_name) return results else: @@ -553,7 +546,7 @@ def drop_view( adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: - cur.execute(f"DROP TABLE IF EXISTS {view_name}") + cur.execute(f"DROP VIEW IF EXISTS {view_name}") else: quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( view_name @@ -639,6 +632,11 @@ def postgresql_adbc_conn(iris_path, types_data): except mgr.OperationalError: conn.rollback() create_and_load_iris_postgresql(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.OperationalError: # note arrow-adbc issue 1022 + conn.rollback() + create_and_load_iris_view(conn) try: conn.adbc_get_table_schema("types") except mgr.OperationalError: @@ -711,6 +709,11 @@ def sqlite_adbc_conn(iris_path, types_data): except mgr.InternalError: # note 
arrow-adbc issue 1022 conn.rollback() create_and_load_iris_sqlite3(conn, iris_path) + try: + conn.adbc_get_table_schema("iris_view") + except mgr.InternalError: # note arrow-adbc issue 1022 + conn.rollback() + create_and_load_iris_view(conn) try: conn.adbc_get_table_schema("types") except mgr.InternalError: # note arrow-adbc issue 1022 @@ -724,6 +727,7 @@ def sqlite_adbc_conn(iris_path, types_data): new_data.append(tuple(entry.values())) create_and_load_types_sqlite3(conn, new_data) + conn.commit() yield conn for view in get_all_views(conn): drop_view(view, conn) @@ -1103,8 +1107,8 @@ def test_default_type_conversion(conn, request): assert issubclass(df.FloatCol.dtype.type, np.floating) assert issubclass(df.IntCol.dtype.type, np.integer) - # MySQL/sqlite has no real BOOL type - if "postgresql" in conn_name: + # MySQL/sqlite has no real BOOL type, but ADBC loses this + if "postgresql" in conn_name and "adbc" not in conn_name: assert issubclass(df.BoolCol.dtype.type, np.bool_) else: assert issubclass(df.BoolCol.dtype.type, np.integer) @@ -1404,6 +1408,13 @@ def func(conn_name): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_parameter(conn, request, sql_strings, flavor): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) query = sql_strings["read_parameters"][flavor(conn_name)] @@ -1417,6 +1428,13 @@ def test_read_sql_iris_parameter(conn, request, sql_strings, flavor): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_named_parameter(conn, request, sql_strings, flavor): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'params' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) query = sql_strings["read_named_parameters"][flavor(conn_name)] @@ -1602,6 +1620,7 @@ def test_api_execute_sql(conn, request): with sql.pandasSQL_builder(conn) as pandas_sql: iris_results = pandas_sql.execute("SELECT * FROM iris") row = iris_results.fetchone() + iris_results.close() tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) @@ -1749,9 +1768,14 @@ def test_api_timedelta(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_complex_raises(conn, request): + conn_name = conn conn = request.getfixturevalue(conn) df = DataFrame({"a": [1 + 1j, 2j]}) - msg = "Complex datatypes not supported" + + if "adbc" in conn_name: + msg = "datatypes not supported" + else: + msg = "Complex datatypes not supported" with pytest.raises(ValueError, match=msg): assert df.to_sql("test_complex", con=conn) is None @@ -1868,6 +1892,13 @@ def test_api_to_sql_index_label_multiindex(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_multiindex_roundtrip(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'index_col' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_multiindex_roundtrip", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1932,6 +1963,13 @@ def test_api_integer_col_names(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn = 
request.getfixturevalue(conn) create_sql = sql.get_schema(test_frame1, "test", con=conn) assert "CREATE" in create_sql @@ -1940,6 +1978,13 @@ def test_api_get_schema(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_with_schema(conn, request, test_frame1): # GH28486 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi") assert "CREATE TABLE pypi." in create_sql @@ -1947,6 +1992,13 @@ def test_api_get_schema_with_schema(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_dtypes(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]}) @@ -1964,6 +2016,13 @@ def test_api_get_schema_dtypes(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_get_schema_keys(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'get_schema' not implemented for ADBC drivers", + strict=True, + ) + ) conn_name = conn conn = request.getfixturevalue(conn) frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]}) @@ -2035,6 +2094,13 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="categorical dtype not implemented for ADBC drivers", + strict=True, + ) + ) # GH8624 # test that categorical gets written correctly as dense column conn = request.getfixturevalue(conn) @@ -2071,6 +2137,10 @@ def test_api_unicode_column_name(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_escaped_table_name(conn, request): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="see arrow-adbc gh issue 1080") + ) # GH 13206 conn_name = conn conn = request.getfixturevalue(conn) @@ -2115,6 +2185,13 @@ def test_read_table_columns(conn, request, test_frame1): conn_name = conn if conn_name == "sqlite_buildin": request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) + elif "adbc" in conn_name: + request.node.add_marker( + pytest.mark.xfail( + reason="'columns' not implemented for ADBC drivers", + strict=True, + ) + ) conn = request.getfixturevalue(conn) sql.to_sql(test_frame1, "test_frame", conn) @@ -2127,6 +2204,13 @@ def test_read_table_columns(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_read_table_index_col(conn, request, test_frame1): + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail( + reason="'index_col' not implemented for ADBC drivers", + strict=True, + ) + ) # test columns argument in read_table conn_name = conn if conn_name == "sqlite_buildin": @@ -2502,7 +2586,8 @@ def test_execute_sql(conn, request): pandasSQL = pandasSQL_builder(conn) with pandasSQL.run_transaction(): iris_results = pandasSQL.execute("SELECT * FROM iris") - row = iris_results.fetchone() + row = iris_results.fetchone() + iris_results.close() tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) @@ -2915,7 +3000,7 @@ def test_to_sql_save_index(conn, request): with 
pandasSQL.run_transaction(): assert pandasSQL.to_sql(df, tbl_name) == 2 - if conn_name in {"sqlite_buildin", "sqlite_str"}: + if conn_name in {"sqlite_buildin", "sqlite_str"} or "adbc" in conn_name: ixs = sql.read_sql_query( "SELECT * FROM sqlite_master WHERE type = 'index' " f"AND tbl_name = '{tbl_name}'", @@ -2943,7 +3028,7 @@ def test_transactions(conn, request): stmt = "CREATE TABLE test_trans (A INT, B TEXT)" pandasSQL = pandasSQL_builder(conn) - if conn_name != "sqlite_buildin": + if conn_name != "sqlite_buildin" and "adbc" not in conn_name: from sqlalchemy import text stmt = text(stmt) @@ -2954,11 +3039,19 @@ def test_transactions(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_transaction_rollback(conn, request): + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="broken for postgres ADBC driver - needs investigation", + strict=True, + ) + ) + conn_name = conn conn = request.getfixturevalue(conn) pandasSQL = pandasSQL_builder(conn) with pandasSQL.run_transaction() as trans: stmt = "CREATE TABLE test_trans (A INT, B TEXT)" - if isinstance(pandasSQL, SQLiteDatabase): + if "adbc" in conn_name or isinstance(pandasSQL, SQLiteDatabase): trans.execute(stmt) else: from sqlalchemy import text @@ -3262,9 +3355,11 @@ class Temporary(Base): @pytest.mark.parametrize("conn", all_connectable) def test_invalid_engine(conn, request, test_frame1): - if conn == "sqlite_buildin": + if conn == "sqlite_buildin" or "adbc" in conn: request.node.add_marker( - pytest.mark.xfail(reason="SQLiteDatabase does not raise for bad engine") + pytest.mark.xfail( + reason="SQLiteDatabase/ADBCDatabase does not raise for bad engine" + ) ) conn = request.getfixturevalue(conn) @@ -3355,6 +3450,9 @@ def test_read_sql_dtype_backend( conn = request.getfixturevalue(conn) table = "test" df = dtype_backend_data + if "adbc" in conn_name: + # adbc cannot write / roundtrip booleans + df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): @@ -3362,8 +3460,15 @@ def test_read_sql_dtype_backend( f"Select * from {table}", conn, dtype_backend=dtype_backend ) expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + if "adbc" in conn_name: + # adbc cannot write / roundtrip booleans + expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) + if "adbc" in conn_name: + # adbc does not support chunksize argument + return + with pd.option_context("mode.string_storage", string_storage): iterator = getattr(pd, func)( f"Select * from {table}", @@ -3387,7 +3492,7 @@ def test_read_sql_dtype_backend_table( dtype_backend_data, dtype_backend_expected, ): - if "sqlite" in conn: + if "sqlite" in conn and "adbc" not in conn: request.node.add_marker( pytest.mark.xfail( reason=( @@ -3401,13 +3506,23 @@ def test_read_sql_dtype_backend_table( conn = request.getfixturevalue(conn) table = "test" df = dtype_backend_data + if "adbc" in conn_name: + # adbc cannot write / roundtrip booleans + df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + if "adbc" in conn_name: + # adbc cannot write / roundtrip booleans + expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, 
expected) + if "adbc" in conn_name: + # adbc does not support chunksize argument + return + with pd.option_context("mode.string_storage", string_storage): iterator = getattr(pd, func)( table, @@ -3423,9 +3538,13 @@ def test_read_sql_dtype_backend_table( @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend_data): + conn_name = conn conn = request.getfixturevalue(conn) table = "test" df = dtype_backend_data + if "adbc" in conn_name: + # adbc cannot write / roundtrip booleans + df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") msg = ( @@ -3528,6 +3647,10 @@ def test_chunksize_empty_dtypes(conn, request): @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) def test_read_sql_dtype(conn, request, func, dtype_backend): # GH#50797 + if "adbc" in conn: + request.node.add_marker( + pytest.mark.xfail(reason="dtype argument NotImplemented with ADBC") + ) conn = request.getfixturevalue(conn) table = "test" df = DataFrame({"a": [1, 2, 3], "b": 5}) From 90ca2cb065756accf271bb0db6c3a41a9a37c23c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 19 Sep 2023 22:35:14 -0400 Subject: [PATCH 33/61] more skips --- pandas/tests/io/test_sql.py | 113 ++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 25 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index cd964497ff221..3ced13f0b3f74 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -132,7 +132,7 @@ def iris_table_metadata(dialect: str): return iris -def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): +def create_and_load_iris_sqlite3(conn, iris_file: Path): stmt = """CREATE TABLE iris ( "SepalLength" REAL, "SepalWidth" REAL, @@ -140,7 +140,9 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): "PetalWidth" REAL, "Name" TEXT )""" - with conn.cursor() as cur: + + if isinstance(conn, sqlite3.Connection): + cur = conn.cursor() cur.execute(stmt) with iris_file.open(newline=None, encoding="utf-8") as csvfile: reader = csv.reader(csvfile) @@ -160,8 +162,29 @@ def create_and_load_iris_sqlite3(conn: sqlite3.Connection, iris_file: Path): ] cur.executemany(stmt, records) + else: + with conn.cursor() as cur: + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [] + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] + + cur.executemany(stmt, records) - conn.commit() + conn.commit() def create_and_load_iris_postgresql(conn, iris_file: Path): @@ -265,27 +288,33 @@ def types_table_metadata(dialect: str): def create_and_load_types_sqlite3(conn, types_data: list[dict]): - with conn.cursor() as cur: - stmt = """CREATE TABLE types ( - "TextCol" TEXT, - "DateCol" TEXT, - "IntDateCol" INTEGER, - "IntDateOnlyCol" INTEGER, - "FloatCol" REAL, - "IntCol" INTEGER, - "BoolCol" INTEGER, - "IntColWithNull" INTEGER, - "BoolColWithNull" INTEGER - )""" - cur.execute(stmt) - - stmt = """ + stmt = """CREATE TABLE types ( + "TextCol" TEXT, + "DateCol" TEXT, + "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, + "FloatCol" REAL, + "IntCol" INTEGER, + "BoolCol" 
INTEGER, + "IntColWithNull" INTEGER, + "BoolColWithNull" INTEGER + )""" + + ins_stmt = """ INSERT INTO types VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) """ - cur.executemany(stmt, types_data) - conn.commit() + if isinstance(conn, sqlite3.Connection): + cur = conn.cursor() + cur.execute(stmt) + cur.executemany(ins_stmt, types_data) + else: + with conn.cursor() as cur: + cur.execute(stmt) + cur.executemany(ins_stmt, types_data) + + conn.commit() def create_and_load_types_postgresql(conn, types_data: list[dict]): @@ -1447,8 +1476,8 @@ def test_read_sql_iris_named_parameter(conn, request, sql_strings, flavor): @pytest.mark.parametrize("conn", all_connectable_iris) def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings, flavor): - if "mysql" in conn or "postgresql" in conn: - request.node.add_marker(pytest.mark.xfail(reason="broken test")) + if "mysql" in conn or ("postgresql" in conn and "adbc" not in conn): + request.node.add_marker(pytest.mark.xfail(reason="broken test", strict=True)) conn_name = conn conn = request.getfixturevalue(conn) @@ -1549,6 +1578,7 @@ def test_api_to_sql_append(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_type_mapping(conn, request, test_frame3): + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_frame5", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1557,6 +1587,9 @@ def test_api_to_sql_type_mapping(conn, request, test_frame3): sql.to_sql(test_frame3, "test_frame5", conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", conn) + if conn_name == "postgresql_adbc_conn": + # postgresql driver does not maintain capitalization + result.columns = ["index", "A", "B"] tm.assert_frame_equal(test_frame3, result) @@ -1575,6 +1608,7 @@ def test_api_to_sql_series(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_roundtrip(conn, request, test_frame1): + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_frame_roundtrip", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1584,6 +1618,10 @@ def test_api_roundtrip(conn, request, test_frame1): result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn) # HACK! 
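The "# HACK!" rename on the following lines compensates for how pyarrow names a preserved, unnamed index when a frame is converted for ADBC ingestion; a minimal standalone demonstration, assuming only pyarrow:

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"A": [1.0], "B": [2.0]})
    tbl = pa.Table.from_pandas(df, preserve_index=True)
    # The unnamed RangeIndex is serialized as a real column, which is the
    # name read_sql sees again on the ADBC round trip.
    print(tbl.column_names)  # ['A', 'B', '__index_level_0__']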
+ if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": "level_0"}) + if conn_name == "postgresql_adbc_conn": + result = result.rename(columns={"a": "A", "b": "B", "c": "C", "d": "D"}) result.index = test_frame1.index result.set_index("level_0", inplace=True) result.index.astype(int) @@ -1929,10 +1967,14 @@ def test_api_multiindex_roundtrip(conn, request): ) def test_api_dtype_argument(conn, request, dtype): # GH10285 Add dtype argument to read_sql_query - if "adbc" in conn: + if "adbc" in conn and dtype: request.node.add_marker( pytest.mark.xfail(reason="dtype argument NotImplemented with ADBC") ) + elif conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="does not properly handle capitalized cols") + ) conn_name = conn conn = request.getfixturevalue(conn) @@ -1956,6 +1998,10 @@ def test_api_dtype_argument(conn, request, dtype): @pytest.mark.parametrize("conn", all_connectable) def test_api_integer_col_names(conn, request): + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="fails with syntax error", strict=True) + ) conn = request.getfixturevalue(conn) df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") @@ -2163,6 +2209,14 @@ def test_api_escaped_table_name(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 + if conn == "postgresql_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="fails with syntax error", strict=True) + ) + elif conn == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail(reason="fails with ValueError", strict=True) + ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -2279,7 +2333,7 @@ def test_not_reflect_all_tables(sqlite_conn): @pytest.mark.parametrize("conn", all_connectable) def test_warning_case_insensitive_table_name(conn, request, test_frame1): conn_name = conn - if conn_name == "sqlite_buildin": + if conn_name == "sqlite_buildin" or "adbc" in conn_name: request.node.add_marker(pytest.mark.xfail(reason="Does not raise warning")) conn = request.getfixturevalue(conn) @@ -2428,7 +2482,7 @@ def test_query_by_select_obj(conn, request): def test_column_with_percentage(conn, request): # GH 37157 conn_name = conn - if conn_name == "sqlite_buildin": + if conn_name in {"sqlite_buildin", "postgresql_adbc_conn"}: request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) conn = request.getfixturevalue(conn) @@ -2566,12 +2620,17 @@ def test_roundtrip(conn, request, test_frame1): if conn == "sqlite_str": pytest.skip("sqlite_str has no inspection system") + conn_name = conn conn = request.getfixturevalue(conn) pandasSQL = pandasSQL_builder(conn) with pandasSQL.run_transaction(): assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + if "adbc" in conn_name: + result = result.rename(columns={"__index_level_0__": "level_0"}) + if conn_name == "postgresql_adbc_conn": + result = result.rename(columns={"a": "A", "b": "B", "c": "C", "d": "D"}) result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -2989,6 +3048,10 @@ def test_nan_string(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_to_sql_save_index(conn, request): + if "adbc" in conn: + request.node.add_marker( + 
pytest.mark.xfail(reason="not working with ADBC drivers", strict=True) + ) conn_name = conn conn = request.getfixturevalue(conn) df = DataFrame.from_records( From d753c3c09c3e6f4b6f2f42e1e29ea4536cb0d581 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 20 Sep 2023 16:08:49 -0400 Subject: [PATCH 34/61] updates --- pandas/tests/io/test_sql.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 3ced13f0b3f74..1bdbf36018082 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -860,7 +860,10 @@ def sqlite_buildin_iris(sqlite_buildin, iris_path, types_data): sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable -adbc_connectable = ["postgresql_adbc_conn", "sqlite_adbc_conn"] +adbc_connectable = [ + pytest.param("postgresql_adbc_conn", marks=pytest.mark.db), + pytest.param("sqlite_adbc_conn", marks=pytest.mark.db), +] sqlalchemy_connectable_iris = ( mysql_connectable + postgresql_connectable + sqlite_iris_connectable @@ -1791,13 +1794,20 @@ def test_api_date_and_index(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_timedelta(conn, request): # see #6921 + conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_timedelta", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: pandasSQL.drop_table("test_timedelta") df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() - with tm.assert_produces_warning(UserWarning): + + if "adbc" in conn_name: + exp_warning = None + else: + exp_warning = UserWarning + + with tm.assert_produces_warning(exp_warning): result_count = df.to_sql(name="test_timedelta", con=conn) assert result_count == 2 result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) From d469e24ed445f147dbdbc8c102fd732372b70ff5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 20 Sep 2023 20:03:05 -0400 Subject: [PATCH 35/61] implement more --- pandas/io/sql.py | 68 +++++++++++++++++++++++++------------ pandas/tests/io/test_sql.py | 23 +------------ 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index e0034c4cf8496..f1b6c8cbc1a98 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -61,6 +61,7 @@ from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import PandasObject import pandas.core.common as com +from pandas.core.common import maybe_make_list from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime @@ -185,7 +186,7 @@ def _wrap_result( dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): - """Wrap result set of query in a DataFrame.""" + """Wrap result set of a SQLAlchemy query in a DataFrame.""" frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend) if dtype: @@ -199,6 +200,25 @@ def _wrap_result( return frame +def _wrap_result_adbc( + df: DataFrame, + *, + index_col=None, + parse_dates=None, + dtype: DtypeArg | None = None, +): + """Wrap result set of a SQLAlchemy query in a DataFrame.""" + if dtype: + df = df.astype(dtype) + + df = _parse_date_columns(df, parse_dates) + + if index_col is not None: + df = df.set_index(index_col) + + return df + + def execute(sql, con, params=None): """ Execute the given SQL query using the provided connection object. 
@@ -2121,25 +2141,26 @@ def read_table( SQLDatabase.read_query """ - if index_col: - raise NotImplementedError("'index_col' is not implemented for ADBC drivers") if coerce_float is not True: raise NotImplementedError( "'coerce_float' is not implemented for ADBC drivers" ) - if parse_dates: - raise NotImplementedError( - "'parse_dates' is not implemented for ADBC drivers" - ) - if columns: - raise NotImplementedError("'columns' is not implemented for ADBC drivers") if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") + if columns: + if index_col: + index_select = maybe_make_list(index_col) + else: + index_select = [] + to_select = index_select + columns + select_list = ", ".join(f'"{x}"' for x in to_select) + else: + select_list = "*" if schema: - stmt = f"SELECT * FROM {schema}.{table_name}" + stmt = f"SELECT {select_list} FROM {schema}.{table_name}" else: - stmt = f"SELECT * FROM {table_name}" + stmt = f"SELECT {select_list} FROM {table_name}" mapping: type[ArrowDtype] | None | Callable if dtype_backend == "pyarrow": @@ -2157,7 +2178,13 @@ def read_table( with self.con.cursor() as cur: cur.execute(stmt) - return cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + + return _wrap_result_adbc( + df, + index_col=index_col, + parse_dates=parse_dates, + ) def read_query( self, @@ -2188,22 +2215,14 @@ def read_query( read_sql """ - if index_col: - raise NotImplementedError("'index_col' is not implemented for ADBC drivers") if coerce_float is not True: raise NotImplementedError( "'coerce_float' is not implemented for ADBC drivers" ) - if parse_dates: - raise NotImplementedError( - "'parse_dates' is not implemented for ADBC drivers" - ) if params: raise NotImplementedError("'params' is not implemented for ADBC drivers") if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - if dtype: - raise NotImplementedError("'dtype' is not implemented for ADBC drivers") mapping: type[ArrowDtype] | None | Callable if dtype_backend == "pyarrow": @@ -2217,7 +2236,14 @@ def read_query( with self.con.cursor() as cur: cur.execute(sql) - return cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + + return _wrap_result_adbc( + df, + index_col=index_col, + parse_dates=parse_dates, + dtype=dtype, + ) read_sql = read_query diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1bdbf36018082..732edaddf5318 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1940,13 +1940,6 @@ def test_api_to_sql_index_label_multiindex(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_multiindex_roundtrip(conn, request): - if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail( - reason="'index_col' not implemented for ADBC drivers", - strict=True, - ) - ) conn = request.getfixturevalue(conn) if sql.has_table("test_multiindex_roundtrip", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -2249,13 +2242,6 @@ def test_read_table_columns(conn, request, test_frame1): conn_name = conn if conn_name == "sqlite_buildin": request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) - elif "adbc" in conn_name: - request.node.add_marker( - pytest.mark.xfail( - reason="'columns' not implemented for ADBC drivers", - strict=True, - ) - ) conn = request.getfixturevalue(conn) sql.to_sql(test_frame1, "test_frame", conn) @@ 
-2268,13 +2254,6 @@ def test_read_table_columns(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_read_table_index_col(conn, request, test_frame1): - if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail( - reason="'index_col' not implemented for ADBC drivers", - strict=True, - ) - ) # test columns argument in read_table conn_name = conn if conn_name == "sqlite_buildin": @@ -2492,7 +2471,7 @@ def test_query_by_select_obj(conn, request): def test_column_with_percentage(conn, request): # GH 37157 conn_name = conn - if conn_name in {"sqlite_buildin", "postgresql_adbc_conn"}: + if conn_name == "sqlite_buildin": request.node.add_marker(pytest.mark.xfail(reason="Not Implemented")) conn = request.getfixturevalue(conn) From 2bc11a1e69e900490655d32e115c3bc5a1bdb6dc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sun, 24 Sep 2023 10:29:10 -0400 Subject: [PATCH 36/61] bump to 0.7.0 --- ci/deps/actions-310.yaml | 4 +- ci/deps/actions-311-downstream_compat.yaml | 4 +- ci/deps/actions-311.yaml | 4 +- ci/deps/actions-39-minimum_versions.yaml | 4 +- ci/deps/actions-39.yaml | 4 +- ci/deps/circle-310-arm64.yaml | 6 +- doc/source/getting_started/install.rst | 6 +- environment.yml | 4 +- pandas/compat/_optional.py | 4 +- pandas/io/sql.py | 4 +- pandas/tests/io/test_sql.py | 98 ++++++++++++++-------- pyproject.toml | 4 +- requirements-dev.txt | 4 +- 13 files changed, 87 insertions(+), 63 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 7f2543b7ef047..db6b2cab54980 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -24,6 +24,8 @@ dependencies: - pytz # optional dependencies + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -58,7 +60,5 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.6.0 - - adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index f54cedb510842..88d721336c0f0 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -25,6 +25,8 @@ dependencies: - pytz # optional dependencies + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -72,8 +74,6 @@ dependencies: - pyyaml - py - pip: - - adbc_driver_postgresql>=0.6.0 - - adbc_driver_sqlite>=0.6.0 - dataframe-api-compat>=0.1.7 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 222595ad65341..6e2e729dfc650 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -24,6 +24,8 @@ dependencies: - pytz # optional dependencies + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -58,7 +60,5 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.6.0 - - adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index b325e1c09270f..dc9787b6fd336 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -26,6 +26,8 @@ dependencies: - pytz=2020.1 # optional dependencies + - adbc-driver-postgresql=0.7.0 + - adbc-driver-sqlite=0.7.0 - beautifulsoup4=4.11.1 - blosc=1.21.0 - bottleneck=1.3.4 @@ -60,8 +62,6 @@ dependencies: - 
zstandard=0.17.0 - pip: - - adbc_driver_postgresql==0.6.0 - - adbc_driver_sqlite==0.6.0 - dataframe-api-compat==0.1.7 - pyqt5==5.15.6 - tzdata==2022.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index ccf748282ac2d..e1199039572f6 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -24,6 +24,8 @@ dependencies: - pytz # optional dependencies + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -58,7 +60,5 @@ dependencies: - zstandard>=0.17.0 - pip: - - adbc_driver_postgresql>=0.6.0 - - adbc_driver_sqlite>=0.6.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 7248d6e317f55..34e311fd220e7 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -24,6 +24,8 @@ dependencies: - pytz # optional dependencies + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -57,7 +59,3 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=3.0.3 - zstandard>=0.17.0 - - - pip: - - adbc_driver_postgresql>=0.6.0 - - adbc_driver_sqlite>=0.6.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ea425e3551df6..86ab4c8e4dfc0 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -335,7 +335,7 @@ lxml 4.8.0 xml XML parser for read SQL databases ^^^^^^^^^^^^^ -Traditional ODBC drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. ADBC drivers +Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. ADBC drivers must be installed separately. ========================= ================== =============== ============================================================= @@ -346,8 +346,8 @@ SQLAlchemy 1.4.36 postgresql, SQL support for dat sql-other psycopg2 2.9.3 postgresql PostgreSQL engine for sqlalchemy pymysql 1.0.2 mysql MySQL engine for sqlalchemy -adbc-driver-postgresql 0.6.0 ADBC Driver for PostgreSQL -adbc-driver-sqlite 0.6.0 ADBC Driver for SQLite +adbc-driver-postgresql 0.7.0 ADBC Driver for PostgreSQL +adbc-driver-sqlite 0.7.0 ADBC Driver for SQLite ========================= ================== =============== ============================================================= Other data sources diff --git a/environment.yml b/environment.yml index 29e2387122cfc..e2c606f1a124b 100644 --- a/environment.yml +++ b/environment.yml @@ -25,8 +25,8 @@ dependencies: - pytz # optional dependencies - - adbc_driver_postgresql>=0.6.0 - - adbc_driver_sqlite>=0.6.0 + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc - bottleneck>=1.3.4 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 083004494b953..4c80bb7b08bc6 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,8 +15,8 @@ # Update install.rst & setup.cfg when updating versions! 
VERSIONS = { - "adbc_driver_postgresql": "0.6.0", - "adbc_driver_sqlite": "0.6.0", + "adbc-driver-postgresql": "0.7.0", + "adbc-driver-sqlite": "0.7.0", "bs4": "4.11.1", "blosc": "1.21.0", "bottleneck": "1.3.4", diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f1b6c8cbc1a98..064673ef4dce9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -206,6 +206,7 @@ def _wrap_result_adbc( index_col=None, parse_dates=None, dtype: DtypeArg | None = None, + dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ): """Wrap result set of a SQLAlchemy query in a DataFrame.""" if dtype: @@ -2121,8 +2122,7 @@ def read_table( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: """ - Read SQL database table into a DataFrame. Only keyword arguments used - are table_name and schema. Other keywords will raise NotImplementedError. + Read SQL database table into a DataFrame. Parameters ---------- diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 732edaddf5318..677411b6afc57 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -318,24 +318,36 @@ def create_and_load_types_sqlite3(conn, types_data: list[dict]): def create_and_load_types_postgresql(conn, types_data: list[dict]): + # Boolean support not added until 0.8.0 + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + bool_type = "INTEGER" + else: + bool_type = "BOOLEAN" + with conn.cursor() as cur: - stmt = """CREATE TABLE types ( + stmt = f"""CREATE TABLE types ( "TextCol" TEXT, - "DateCol" TEXT, + "DateCol" TIMESTAMP, "IntDateCol" INTEGER, "IntDateOnlyCol" INTEGER, "FloatCol" DOUBLE PRECISION, "IntCol" INTEGER, - "BoolCol" INTEGER, + "BoolCol" {bool_type}, "IntColWithNull" INTEGER, - "BoolColWithNull" INTEGER + "BoolColWithNull" {bool_type}, + "DateColWithTz" TIMESTAMP WITH TIME ZONE )""" cur.execute(stmt) stmt = """ INSERT INTO types - VALUES($1, $2, $3, $4, $5, $6, $7, $8, $9) + VALUES($1, $2::timestamp, $3, $4, $5, $6, $7, $8, $9, + $10::timestamptz) """ + cur.executemany(stmt, types_data) conn.commit() @@ -658,25 +670,31 @@ def postgresql_adbc_conn(iris_path, types_data): with dbapi.connect(uri) as conn: try: conn.adbc_get_table_schema("iris") - except mgr.OperationalError: + except mgr.ProgrammingError: conn.rollback() create_and_load_iris_postgresql(conn, iris_path) try: conn.adbc_get_table_schema("iris_view") - except mgr.OperationalError: # note arrow-adbc issue 1022 + except mgr.ProgrammingError: # note arrow-adbc issue 1022 conn.rollback() create_and_load_iris_view(conn) try: conn.adbc_get_table_schema("types") - except mgr.OperationalError: + except mgr.ProgrammingError: conn.rollback() - # ADBC cannot cast boolean data - new_data = [] - for entry in types_data: - entry["BoolCol"] = int(entry["BoolCol"]) - if entry["BoolColWithNull"] is not None: - entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) - new_data.append(tuple(entry.values())) + # Boolean support not added until 0.8.0 + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + new_data = [] + for entry in types_data: + entry["BoolCol"] = int(entry["BoolCol"]) + if entry["BoolColWithNull"] is not None: + entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) + new_data.append(tuple(entry.values())) + else: + new_data = [tuple(entry.values()) for entry in types_data] 
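+            # the INSERT statement in create_and_load_types_postgresql uses
+            # positional $1..$10 placeholders, so rows are bound as plain
+            # tuples rather than dicts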
create_and_load_types_postgresql(conn, new_data) yield conn @@ -735,17 +753,17 @@ def sqlite_adbc_conn(iris_path, types_data): with dbapi.connect(uri) as conn: try: conn.adbc_get_table_schema("iris") - except mgr.InternalError: # note arrow-adbc issue 1022 + except mgr.ProgrammingError: conn.rollback() create_and_load_iris_sqlite3(conn, iris_path) try: conn.adbc_get_table_schema("iris_view") - except mgr.InternalError: # note arrow-adbc issue 1022 + except mgr.ProgrammingError: conn.rollback() create_and_load_iris_view(conn) try: conn.adbc_get_table_schema("types") - except mgr.InternalError: # note arrow-adbc issue 1022 + except mgr.ProgrammingError: conn.rollback() new_data = [] for entry in types_data: @@ -1671,10 +1689,6 @@ def test_api_date_parsing(conn, request): conn = request.getfixturevalue(conn) # Test date parsing in read_sql # No Parsing - if "adbc" in conn_name: - request.node.add_marker( - pytest.mark.xfail(reason="parse_dates argument NotImplemented with ADBC") - ) df = sql.read_sql_query("SELECT * FROM types", conn) if not ("mysql" in conn_name or "postgres" in conn_name): assert not issubclass(df.DateCol.dtype.type, np.datetime64) @@ -1749,10 +1763,6 @@ def test_api_custom_dateparsing_error( request.node.add_marker( pytest.mark.xfail(reason="failing combination of arguments") ) - elif "adbc" in conn_name: - request.node.add_marker( - pytest.mark.xfail(reason="parse_dates argument NotImplemented with ADBC") - ) expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) @@ -3503,8 +3513,12 @@ def test_read_sql_dtype_backend( table = "test" df = dtype_backend_data if "adbc" in conn_name: - # adbc cannot write / roundtrip booleans - df = df.drop(columns=["e", "f"]) + # Boolean support not added until 0.8.0 + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): @@ -3513,8 +3527,11 @@ def test_read_sql_dtype_backend( ) expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) if "adbc" in conn_name: - # adbc cannot write / roundtrip booleans - expected = expected.drop(columns=["e", "f"]) + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3559,16 +3576,22 @@ def test_read_sql_dtype_backend_table( table = "test" df = dtype_backend_data if "adbc" in conn_name: - # adbc cannot write / roundtrip booleans - df = df.drop(columns=["e", "f"]) + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) if "adbc" in conn_name: - # adbc cannot write / roundtrip booleans - expected = expected.drop(columns=["e", "f"]) + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + expected = 
expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3595,8 +3618,11 @@ def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend table = "test" df = dtype_backend_data if "adbc" in conn_name: - # adbc cannot write / roundtrip booleans - df = df.drop(columns=["e", "f"]) + adbc = import_optional_dependency("adbc_driver_manager") + from pandas.util.version import Version + + if Version(adbc.__version__) < Version("0.8.0"): + df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") msg = ( diff --git a/pyproject.toml b/pyproject.toml index 94d7198a3e2d2..958f2a4c1ab03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,8 +86,8 @@ output_formatting = ['jinja2>=3.1.2', 'tabulate>=0.8.10'] clipboard = ['PyQt5>=5.15.6', 'qtpy>=2.2.0'] compression = ['zstandard>=0.17.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] -all = ['adbc_driver_postgresql>=0.6.0', - 'adbc_driver_sqlite>=0.6.0', +all = ['adbc-driver-postgresql>=0.7.0', + 'adbc-driver-sqlite>=0.7.0', 'beautifulsoup4>=4.11.1', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.0', diff --git a/requirements-dev.txt b/requirements-dev.txt index ce7375fbcab08..8f05e4d526b82 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,8 +14,8 @@ coverage python-dateutil numpy pytz -adbc_driver_postgresql>=0.6.0 -adbc_driver_sqlite>=0.6.0 +adbc-driver-postgresql>=0.7.0 +adbc-driver-sqlite>=0.7.0 beautifulsoup4>=4.11.1 blosc bottleneck>=1.3.4 From f205f90e1f69ccdd747f35873c153419a1283867 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 10:59:51 -0400 Subject: [PATCH 37/61] fixups --- pandas/tests/io/test_sql.py | 79 ++++++++++++++++++------------------- 1 file changed, 38 insertions(+), 41 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 677411b6afc57..daf94588c70ea 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -570,7 +570,7 @@ def drop_table( adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: - cur.execute(f"DROP TABLE IF EXISTS {table_name}") + cur.execute(f'DROP TABLE IF EXISTS "{table_name}"') else: with conn.begin() as con: sql.SQLDatabase(con).drop_table(table_name) @@ -587,7 +587,7 @@ def drop_view( adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: - cur.execute(f"DROP VIEW IF EXISTS {view_name}") + cur.execute(f'DROP VIEW IF EXISTS "{view_name}"') else: quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier( view_name @@ -918,7 +918,8 @@ def test_dataframe_to_sql_empty(conn, test_frame1, request): if conn == "postgresql_adbc_conn": request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver doesn't like empty dataset", + reason="postgres ADBC driver cannot insert index with null type", + strict=True, ) ) # GH 51086 if conn is sqlite_engine @@ -938,29 +939,18 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): [datetime(2023, 1, 1)], dtype="timestamp[ns][pyarrow]" ), "date": pd.array([date(2023, 1, 1)], dtype="date32[day][pyarrow]"), + "timedelta": pd.array([timedelta(1)], dtype="duration[ns][pyarrow]"), "string": pd.array(["a"], dtype="string[pyarrow]"), } ) if "adbc" in conn: - df["timedelta"] = pd.array( - [timedelta(1)], 
dtype="month_day_nano_interval[pyarrow]" - ) exp_warning = FutureWarning # warning thrown from pyarrow msg = "is_sparse is deprecated" else: - df["timedelta"] = pd.array([timedelta(1)], dtype="duration[ns][pyarrow]") exp_warning = UserWarning msg = "the 'timedelta'" - if conn == "sqlite_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="timedelta not implemented in ADBC sqlite driver", - strict=True, - ) - ) - conn = request.getfixturevalue(conn) with tm.assert_produces_warning(exp_warning, match=msg, check_stacklevel=False): df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False) @@ -1773,22 +1763,29 @@ def test_api_custom_dateparsing_error( "DateCol": {"errors": error}, }, ) + if "postgres" in conn_name: # TODO: clean up types_data_frame fixture result = result.drop(columns=["DateColWithTz"]) result["BoolCol"] = result["BoolCol"].astype(int) result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) + if conn_name == "postgresql_adbc_conn": + expected = expected.astype( + { + "DateCol": "datetime64[us]", # TODO: is this astype allowed? + "IntDateCol": "int32", + "IntDateOnlyCol": "int32", + "IntCol": "int32", + } + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", all_connectable_iris) def test_api_date_and_index(conn, request): # Test case where same column appears in parse_date and index_col - if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="index_col argument NotImplemented with ADBC") - ) conn = request.getfixturevalue(conn) df = sql.read_sql_query( "SELECT * FROM types", @@ -1812,6 +1809,13 @@ def test_api_timedelta(conn, request): df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame() + if conn_name == "sqlite_adbc_conn": + request.node.add_marker( + pytest.mark.xfail( + reason="sqlite ADBC driver doesn't implement timedelta", + ) + ) + if "adbc" in conn_name: exp_warning = None else: @@ -1821,7 +1825,21 @@ def test_api_timedelta(conn, request): result_count = df.to_sql(name="test_timedelta", con=conn) assert result_count == 2 result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) - tm.assert_series_equal(result["foo"], df["foo"].view("int64")) + + if conn_name == "postgresql_adbc_conn": + # TODO: Postgres stores an INTERVAL, which ADBC reads as a Month-Day-Nano + # Interval; the default pandas type mapper maps this to a DateOffset + # but maybe we should try and restore the timedelta here? 
+ expected = Series( + [ + pd.DateOffset(months=0, days=0, microseconds=1000000, nanoseconds=0), + pd.DateOffset(months=0, days=0, microseconds=3000000, nanoseconds=0), + ], + name="foo", + ) + else: + expected = df["foo"].view("int64") + tm.assert_series_equal(result["foo"], expected) @pytest.mark.parametrize("conn", all_connectable) @@ -1980,15 +1998,6 @@ def test_api_multiindex_roundtrip(conn, request): ) def test_api_dtype_argument(conn, request, dtype): # GH10285 Add dtype argument to read_sql_query - if "adbc" in conn and dtype: - request.node.add_marker( - pytest.mark.xfail(reason="dtype argument NotImplemented with ADBC") - ) - elif conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="does not properly handle capitalized cols") - ) - conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_dtype_argument", conn): @@ -2011,10 +2020,6 @@ def test_api_dtype_argument(conn, request, dtype): @pytest.mark.parametrize("conn", all_connectable) def test_api_integer_col_names(conn, request): - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="fails with syntax error", strict=True) - ) conn = request.getfixturevalue(conn) df = DataFrame([[1, 2], [3, 4]], columns=[0, 1]) sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace") @@ -2196,10 +2201,6 @@ def test_api_unicode_column_name(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_escaped_table_name(conn, request): - if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="see arrow-adbc gh issue 1080") - ) # GH 13206 conn_name = conn conn = request.getfixturevalue(conn) @@ -3725,10 +3726,6 @@ def test_chunksize_empty_dtypes(conn, request): @pytest.mark.parametrize("func", ["read_sql", "read_sql_query"]) def test_read_sql_dtype(conn, request, func, dtype_backend): # GH#50797 - if "adbc" in conn: - request.node.add_marker( - pytest.mark.xfail(reason="dtype argument NotImplemented with ADBC") - ) conn = request.getfixturevalue(conn) table = "test" df = DataFrame({"a": [1, 2, 3], "b": 5}) From 3577a59bdc23f56581070840b28769f9058a4b63 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 11:46:45 -0400 Subject: [PATCH 38/61] cleanups --- pandas/io/sql.py | 4 +- pandas/tests/io/test_sql.py | 99 +++++++++++-------------------------- 2 files changed, 30 insertions(+), 73 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 064673ef4dce9..dbbfa1ddff092 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2299,8 +2299,8 @@ def to_sql( else: table_name = name - # TODO: pandas if_exists="append" will still create the - # table if it does not exist; ADBC has append/create + # pandas if_exists="append" will still create the + # table if it does not exist; ADBC is more explicit with append/create # as applicable modes, so the semantics get blurred across # the libraries mode = "create" diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 9553e0fd0977d..32cb06d17d79d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -141,50 +141,29 @@ def create_and_load_iris_sqlite3(conn, iris_file: Path): "Name" TEXT )""" - if isinstance(conn, sqlite3.Connection): - cur = conn.cursor() - cur.execute(stmt) - with iris_file.open(newline=None, encoding="utf-8") as csvfile: - reader = csv.reader(csvfile) - next(reader) - stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" - # ADBC requires explicit types - no implicit str -> float conversion - 
records = [] - records = [ - ( - float(row[0]), - float(row[1]), - float(row[2]), - float(row[3]), - row[4], - ) - for row in reader - ] + cur = conn.cursor() + cur.execute(stmt) + with iris_file.open(newline=None, encoding="utf-8") as csvfile: + reader = csv.reader(csvfile) + next(reader) + stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + # ADBC requires explicit types - no implicit str -> float conversion + records = [] + records = [ + ( + float(row[0]), + float(row[1]), + float(row[2]), + float(row[3]), + row[4], + ) + for row in reader + ] - cur.executemany(stmt, records) - else: - with conn.cursor() as cur: - cur.execute(stmt) - with iris_file.open(newline=None, encoding="utf-8") as csvfile: - reader = csv.reader(csvfile) - next(reader) - stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" - # ADBC requires explicit types - no implicit str -> float conversion - records = [] - records = [ - ( - float(row[0]), - float(row[1]), - float(row[2]), - float(row[3]), - row[4], - ) - for row in reader - ] - - cur.executemany(stmt, records) + cur.executemany(stmt, records) + cur.close() - conn.commit() + conn.commit() def create_and_load_iris_postgresql(conn, iris_file: Path): @@ -320,8 +299,6 @@ def create_and_load_types_sqlite3(conn, types_data: list[dict]): def create_and_load_types_postgresql(conn, types_data: list[dict]): # Boolean support not added until 0.8.0 adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): bool_type = "INTEGER" else: @@ -684,8 +661,6 @@ def postgresql_adbc_conn(iris_path, types_data): conn.rollback() # Boolean support not added until 0.8.0 adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): new_data = [] for entry in types_data: @@ -1589,7 +1564,6 @@ def test_api_to_sql_append(conn, request, test_frame1): @pytest.mark.parametrize("conn", all_connectable) def test_api_to_sql_type_mapping(conn, request, test_frame3): - conn_name = conn conn = request.getfixturevalue(conn) if sql.has_table("test_frame5", conn): with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL: @@ -1598,9 +1572,6 @@ def test_api_to_sql_type_mapping(conn, request, test_frame3): sql.to_sql(test_frame3, "test_frame5", conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", conn) - if conn_name == "postgresql_adbc_conn": - # postgresql driver does not maintain capitalization - result.columns = ["index", "A", "B"] tm.assert_frame_equal(test_frame3, result) @@ -2223,13 +2194,9 @@ def test_api_escaped_table_name(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_read_sql_duplicate_columns(conn, request): # GH#53117 - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail(reason="fails with syntax error", strict=True) - ) - elif conn == "sqlite_adbc_conn": + if "adbc" in conn: request.node.add_marker( - pytest.mark.xfail(reason="fails with ValueError", strict=True) + pytest.mark.xfail(reason="pyarrow->pandas throws ValueError", strict=True) ) conn = request.getfixturevalue(conn) if sql.has_table("test_table", conn): @@ -2239,7 +2206,7 @@ def test_api_read_sql_duplicate_columns(conn, request): df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1}) df.to_sql(name="test_table", con=conn, index=False) - result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table;", conn) + result = pd.read_sql("SELECT a, b, a +1 as a, c FROM 
test_table", conn) expected = DataFrame( [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]], columns=["a", "b", "a", "c"], @@ -2629,8 +2596,6 @@ def test_roundtrip(conn, request, test_frame1): if "adbc" in conn_name: result = result.rename(columns={"__index_level_0__": "level_0"}) - if conn_name == "postgresql_adbc_conn": - result = result.rename(columns={"a": "A", "b": "B", "c": "C", "d": "D"}) result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -3050,7 +3015,9 @@ def test_nan_string(conn, request): def test_to_sql_save_index(conn, request): if "adbc" in conn: request.node.add_marker( - pytest.mark.xfail(reason="not working with ADBC drivers", strict=True) + pytest.mark.xfail( + reason="ADBC implementation does not create index", strict=True + ) ) conn_name = conn conn = request.getfixturevalue(conn) @@ -3063,7 +3030,7 @@ def test_to_sql_save_index(conn, request): with pandasSQL.run_transaction(): assert pandasSQL.to_sql(df, tbl_name) == 2 - if conn_name in {"sqlite_buildin", "sqlite_str"} or "adbc" in conn_name: + if conn_name in {"sqlite_buildin", "sqlite_str"}: ixs = sql.read_sql_query( "SELECT * FROM sqlite_master WHERE type = 'index' " f"AND tbl_name = '{tbl_name}'", @@ -3516,8 +3483,6 @@ def test_read_sql_dtype_backend( if "adbc" in conn_name: # Boolean support not added until 0.8.0 adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") @@ -3529,8 +3494,6 @@ def test_read_sql_dtype_backend( expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) if "adbc" in conn_name: adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) @@ -3578,8 +3541,6 @@ def test_read_sql_dtype_backend_table( df = dtype_backend_data if "adbc" in conn_name: adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") @@ -3589,8 +3550,6 @@ def test_read_sql_dtype_backend_table( expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) if "adbc" in conn_name: adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) @@ -3620,8 +3579,6 @@ def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend df = dtype_backend_data if "adbc" in conn_name: adbc = import_optional_dependency("adbc_driver_manager") - from pandas.util.version import Version - if Version(adbc.__version__) < Version("0.8.0"): df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") From c5bf7f800a279b8aa391f622be703ae0a1c2a8db Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 13:00:36 -0400 Subject: [PATCH 39/61] sqlite fixups --- pandas/tests/io/test_sql.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 32cb06d17d79d..63dd8e54b4196 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ 
-146,7 +146,7 @@ def create_and_load_iris_sqlite3(conn, iris_file: Path): with iris_file.open(newline=None, encoding="utf-8") as csvfile: reader = csv.reader(csvfile) next(reader) - stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)" + stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)" # ADBC requires explicit types - no implicit str -> float conversion records = [] records = [ @@ -920,6 +920,8 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): ) if "adbc" in conn: + if conn == "sqlite_adbc_conn": + df = df.drop(columns=["timedelta"]) exp_warning = FutureWarning # warning thrown from pyarrow msg = "is_sparse is deprecated" else: From 98d22ce2b09b9471766f7f2b30499a3ea29c7879 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 13:09:49 -0400 Subject: [PATCH 40/61] pyarrow compat --- pandas/tests/io/test_sql.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 63dd8e54b4196..158ff8d9ea4bb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -35,7 +35,10 @@ import pytest from pandas._libs import lib -from pandas.compat import pa_version_under8p0 +from pandas.compat import ( + pa_version_under8p0, + pa_version_under13p0, +) from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -1746,13 +1749,16 @@ def test_api_custom_dateparsing_error( if conn_name == "postgresql_adbc_conn": expected = expected.astype( { - "DateCol": "datetime64[us]", # TODO: is this astype allowed? "IntDateCol": "int32", "IntDateOnlyCol": "int32", "IntCol": "int32", } ) + if not pa_version_under13p0: + # TODO: is this astype safe? + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + tm.assert_frame_equal(result, expected) From 4f72010a9723d65e9d81b53c5f379cc00e09cd57 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 13:14:07 -0400 Subject: [PATCH 41/61] revert to using pip instead of conda --- ci/deps/actions-310.yaml | 4 ++-- ci/deps/actions-311-downstream_compat.yaml | 4 ++-- ci/deps/actions-311.yaml | 4 ++-- ci/deps/actions-39-minimum_versions.yaml | 4 ++-- ci/deps/actions-39.yaml | 4 ++-- ci/deps/circle-310-arm64.yaml | 6 ++++-- environment.yml | 4 ++-- requirements-dev.txt | 4 ++-- 8 files changed, 18 insertions(+), 16 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 5a1c49b0be7bd..dbff477d04181 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -24,8 +24,6 @@ dependencies: - pytz # optional dependencies - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -60,5 +58,7 @@ dependencies: - zstandard>=0.17.0 - pip: + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 1ab34bdf6a9cc..98a8dd72eb7c8 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -25,8 +25,6 @@ dependencies: - pytz # optional dependencies - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -74,6 +72,8 @@ dependencies: - pyyaml - py - pip: + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - dataframe-api-compat>=0.1.7 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 
6b711a624492d..0c734cb1d43ea 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -24,8 +24,6 @@ dependencies: - pytz # optional dependencies - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -60,5 +58,7 @@ dependencies: - zstandard>=0.17.0 - pip: + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index dc9787b6fd336..555ed766ccdd1 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -26,8 +26,6 @@ dependencies: - pytz=2020.1 # optional dependencies - - adbc-driver-postgresql=0.7.0 - - adbc-driver-sqlite=0.7.0 - beautifulsoup4=4.11.1 - blosc=1.21.0 - bottleneck=1.3.4 @@ -62,6 +60,8 @@ dependencies: - zstandard=0.17.0 - pip: + - adbc-driver-postgresql==0.7.0 + - adbc-driver-sqlite==0.7.0 - dataframe-api-compat==0.1.7 - pyqt5==5.15.6 - tzdata==2022.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index fce29bd19091b..2fa4f525c595c 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -24,8 +24,6 @@ dependencies: - pytz # optional dependencies - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -60,5 +58,7 @@ dependencies: - zstandard>=0.17.0 - pip: + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index ef26955402eba..30a49d873dd03 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -24,8 +24,6 @@ dependencies: - pytz # optional dependencies - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc>=1.21.0 - bottleneck>=1.3.4 @@ -59,3 +57,7 @@ dependencies: - xlrd>=2.0.1 - xlsxwriter>=3.0.3 - zstandard>=0.17.0 + + - pip: + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 diff --git a/environment.yml b/environment.yml index 4138bd1ed9343..1b442e3cbc80b 100644 --- a/environment.yml +++ b/environment.yml @@ -25,8 +25,6 @@ dependencies: - pytz # optional dependencies - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 - beautifulsoup4>=4.11.1 - blosc - bottleneck>=1.3.4 @@ -116,6 +114,8 @@ dependencies: - pygments # Code highlighting - pip: + - adbc-driver-postgresql>=0.7.0 + - adbc-driver-sqlite>=0.7.0 - dataframe-api-compat>=0.1.7 - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" diff --git a/requirements-dev.txt b/requirements-dev.txt index 651c67f2772fc..deaec1f9ff0c9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,8 +14,6 @@ coverage python-dateutil numpy pytz -adbc-driver-postgresql>=0.7.0 -adbc-driver-sqlite>=0.7.0 beautifulsoup4>=4.11.1 blosc bottleneck>=1.3.4 @@ -84,6 +82,8 @@ feedparser pyyaml requests pygments +adbc-driver-postgresql>=0.7.0 +adbc-driver-sqlite>=0.7.0 dataframe-api-compat>=0.1.7 sphinx-toggleprompt typing_extensions; python_version<"3.11" From 7223e635048f79d8dcbd0b2b79d9c77381111f72 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 14:13:41 -0400 Subject: [PATCH 42/61] documentation cleanups --- pandas/io/sql.py | 72 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 
dbbfa1ddff092..bdd80eda6c4c1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2128,8 +2128,35 @@ def read_table( ---------- table_name : str Name of SQL table in database. + coerce_float : bool, default True + Raises NotImplementedError + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg}``, where the arg corresponds + to the keyword arguments of :func:`pandas.to_datetime`. + Especially useful with databases without native Datetime support, + such as SQLite. + columns : list, default: None + List of column names to select from SQL table. schema : string, default None - Name of SQL schema in database to read from + Name of SQL schema in database to query (if database flavor + supports this). If specified, this overwrites the default + schema of the SQL database object. + chunksize : int, default None + Raises NotImplementedError + dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 Returns ------- @@ -2198,12 +2225,34 @@ def read_query( dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", ) -> DataFrame | Iterator[DataFrame]: """ - Read SQL query into a DataFrame. Keyword arguments are discarded. + Read SQL query into a DataFrame. Parameters ---------- sql : str SQL query to be executed. + index_col : string, optional, default: None + Column name to use as index for the returned DataFrame object. + coerce_float : bool, default True + Raises NotImplementedError + params : list, tuple or dict, optional, default: None + Raises NotImplementedError + parse_dates : list or dict, default: None + - List of column names to parse as dates. + - Dict of ``{column_name: format string}`` where format string is + strftime compatible in case of parsing string times, or is one of + (D, s, ns, ms, us) in case of parsing integer timestamps. + - Dict of ``{column_name: arg dict}``, where the arg dict + corresponds to the keyword arguments of + :func:`pandas.to_datetime` Especially useful with databases + without native Datetime support, such as SQLite. + chunksize : int, default None + Raises NotImplementedError + dtype : Type name or dict of columns + Data type for data or columns. E.g. np.float64 or + {'a': np.float64, 'b': np.int32, 'c': 'Int64'} + + .. versionadded:: 1.3.0 Returns ------- @@ -2263,7 +2312,6 @@ def to_sql( ) -> int | None: """ Write records stored in a DataFrame to a SQL database. - Only frame, name, if_exists, index and schema are valid arguments. Parameters ---------- @@ -2274,17 +2322,27 @@ def to_sql( - fail: If table exists, do nothing. - replace: If table exists, drop it, recreate it, and insert data. - append: If table exists, insert data. Create if does not exist. + index : boolean, default True + Write DataFrame index as a column. + index_label : string or sequence, default None + Raises NotImplementedError schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). 
If specified, this overwrites the default schema of the SQLDatabase object. + chunksize : int, default None + Raises NotImplementedError + dtype : single type or dict of column name to SQL type, default None + Raises NotImplementedError + method : {None', 'multi', callable}, default None + Raises NotImplementedError + engine : {'auto', 'sqlalchemy'}, default 'auto' + Raises NotImplementedError if not set to 'auto' """ if index_label: raise NotImplementedError( "'index_label' is not implemented for ADBC drivers" ) - if schema: - raise NotImplementedError("'schema' is not implemented for ADBC drivers") if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") if dtype: @@ -2292,7 +2350,9 @@ def to_sql( if method: raise NotImplementedError("'method' is not implemented for ADBC drivers") if engine != "auto": - raise NotImplementedError("'auto' is not implemented for ADBC drivers") + raise NotImplementedError( + "engine != 'auto' not implemented for ADBC drivers" + ) if schema: table_name = f"{schema}.{name}" From c2cd90ac54478c8738f7d01bda8b83aaaa5add47 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 2 Oct 2023 22:26:50 -0400 Subject: [PATCH 43/61] compat fixups --- pandas/tests/io/test_sql.py | 52 ++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 158ff8d9ea4bb..a66b769db642a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -203,7 +203,6 @@ def create_and_load_iris_postgresql(conn, iris_file: Path): def create_and_load_iris(conn, iris_file: Path, dialect: str): from sqlalchemy import insert - import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") iris = iris_table_metadata(dialect) with iris_file.open(newline=None, encoding="utf-8") as csvfile: @@ -223,7 +222,12 @@ def create_and_load_iris_view(conn): cur = conn.cursor() cur.execute(stmt) else: - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency( + "adbc_driver_manager.dbapi", errors="ignore" + ) if adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: cur.execute(stmt) @@ -301,8 +305,12 @@ def create_and_load_types_sqlite3(conn, types_data: list[dict]): def create_and_load_types_postgresql(conn, types_data: list[dict]): # Boolean support not added until 0.8.0 - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): + if pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency("adbc_driver_manager", errors="ignore") + + if adbc and Version(adbc.__version__) < Version("0.8.0"): bool_type = "INTEGER" else: bool_type = "BOOLEAN" @@ -363,7 +371,10 @@ def check_iris_frame(frame: DataFrame): def count_rows(conn, table_name: str): stmt = f"SELECT count(*) AS count_1 FROM {table_name}" - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") if isinstance(conn, sqlite3.Connection): cur = conn.cursor() return cur.execute(stmt).fetchone()[0] @@ -495,7 +506,12 @@ def get_all_views(conn): c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'") return [view[0] for view in c.fetchall()] else: - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if 
pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency( + "adbc_driver_manager.dbapi", errors="ignore" + ) if adbc and isinstance(conn, adbc.Connection): results = [] info = conn.adbc_get_objects().read_all().to_pylist() @@ -520,7 +536,13 @@ def get_all_tables(conn): c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'") return [table[0] for table in c.fetchall()] else: - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency( + "adbc_driver_manager.dbapi", errors="ignore" + ) + if adbc and isinstance(conn, adbc.Connection): results = [] info = conn.adbc_get_objects().read_all().to_pylist() @@ -547,7 +569,12 @@ def drop_table( conn.commit() else: - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency( + "adbc_driver_manager.dbapi", errors="ignore" + ) if adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: cur.execute(f'DROP TABLE IF EXISTS "{table_name}"') @@ -564,7 +591,12 @@ def drop_view( conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}") conn.commit() else: - adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore") + if pa_version_under8p0: + adbc = None + else: + adbc = import_optional_dependency( + "adbc_driver_manager.dbapi", errors="ignore" + ) if adbc and isinstance(conn, adbc.Connection): with conn.cursor() as cur: cur.execute(f'DROP VIEW IF EXISTS "{view_name}"') @@ -1796,7 +1828,7 @@ def test_api_timedelta(conn, request): ) if "adbc" in conn_name: - exp_warning = None + exp_warning = FutureWarning # pyarrow warns is_sparse is deprecated else: exp_warning = UserWarning From de65ec0310c1ef797d6c74ed04bd64854a2e55bd Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 3 Oct 2023 14:12:18 -0400 Subject: [PATCH 44/61] Fix stacklevel --- pandas/tests/io/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a66b769db642a..1d09e9e9f255c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1832,7 +1832,7 @@ def test_api_timedelta(conn, request): else: exp_warning = UserWarning - with tm.assert_produces_warning(exp_warning): + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): result_count = df.to_sql(name="test_timedelta", con=conn) assert result_count == 2 result = sql.read_sql_query("SELECT * FROM test_timedelta", conn) From 76457275a89df6d7609dac99c258b3db1d982437 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 3 Oct 2023 17:04:25 -0400 Subject: [PATCH 45/61] remove unneeded code --- pandas/tests/io/test_sql.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1d09e9e9f255c..ef820efaca9f2 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1639,8 +1639,6 @@ def test_api_roundtrip(conn, request, test_frame1): # HACK! 
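     # pyarrow stores an unnamed DataFrame index under the reserved
     # "__index_level_0__" column name, so the ADBC round trip renames it
     # to match the "level_0" label the other drivers produce here.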
if "adbc" in conn_name: result = result.rename(columns={"__index_level_0__": "level_0"}) - if conn_name == "postgresql_adbc_conn": - result = result.rename(columns={"a": "A", "b": "B", "c": "C", "d": "D"}) result.index = test_frame1.index result.set_index("level_0", inplace=True) result.index.astype(int) @@ -1771,7 +1769,6 @@ def test_api_custom_dateparsing_error( "DateCol": {"errors": error}, }, ) - if "postgres" in conn_name: # TODO: clean up types_data_frame fixture result = result.drop(columns=["DateColWithTz"]) From 6dbaae5b62a0819ac3e724d0702eaa549454e855 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 16 Oct 2023 18:58:30 -0400 Subject: [PATCH 46/61] commit after drop in fixtures --- pandas/tests/io/test_sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 8aed56dbae795..1fbaf477b8f7d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -695,6 +695,7 @@ def postgresql_adbc_conn(iris_path, types_data): drop_view(view, conn) for tbl in get_all_tables(conn): drop_table(tbl, conn) + conn.commit() @pytest.fixture @@ -773,6 +774,7 @@ def sqlite_adbc_conn(iris_path, types_data): drop_view(view, conn) for tbl in get_all_tables(conn): drop_table(tbl, conn) + conn.commit() @pytest.fixture From 3bf550c7d5a93338b44babe2c0958bddcd29cde1 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 17 Oct 2023 14:03:52 -0400 Subject: [PATCH 47/61] close cursor --- pandas/tests/io/test_sql.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1fbaf477b8f7d..36132547194e9 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2638,6 +2638,7 @@ def test_execute_sql(conn, request): with pandasSQL.run_transaction(): iris_results = pandasSQL.execute("SELECT * FROM iris") row = iris_results.fetchone() + iris_results.close() tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]) From 1207bc4fc9527e86fcdb557c0bfc55f2bdab8ae5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 30 Oct 2023 19:28:26 -0400 Subject: [PATCH 48/61] fix table dropping --- pandas/tests/io/test_sql.py | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 378d98074092d..d168c40b5db85 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -688,6 +688,11 @@ def postgresql_adbc_conn(): uri = "postgresql://postgres:postgres@localhost:5432/pandas" with dbapi.connect(uri) as conn: yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + conn.commit() @pytest.fixture @@ -707,11 +712,6 @@ def postgresql_adbc_iris(postgresql_adbc_conn, iris_path): conn.rollback() create_and_load_iris_view(conn) yield conn - for view in get_all_views(conn): - drop_view(view, conn) - for tbl in get_all_tables(conn): - drop_table(tbl, conn) - conn.commit() @pytest.fixture @@ -739,11 +739,6 @@ def postgresql_adbc_types(postgresql_adbc_conn, types_data): create_and_load_types_postgresql(conn, new_data) yield conn - for view in get_all_views(conn): - drop_view(view, conn) - for tbl in get_all_tables(conn): - drop_table(tbl, conn) - conn.commit() @pytest.fixture @@ -836,6 +831,11 @@ def sqlite_adbc_conn(): uri = f"file:{name}" with dbapi.connect(uri) as conn: yield conn + for view in get_all_views(conn): + drop_view(view, conn) + for tbl in get_all_tables(conn): + drop_table(tbl, conn) + 
conn.commit() @pytest.fixture @@ -854,11 +854,6 @@ def sqlite_adbc_iris(sqlite_adbc_conn, iris_path): conn.rollback() create_and_load_iris_view(conn) yield conn - for view in get_all_views(conn): - drop_view(view, conn) - for tbl in get_all_tables(conn): - drop_table(tbl, conn) - conn.commit() @pytest.fixture @@ -881,11 +876,6 @@ def sqlite_adbc_types(sqlite_adbc_conn, types_data): conn.commit() yield conn - for view in get_all_views(conn): - drop_view(view, conn) - for tbl in get_all_tables(conn): - drop_table(tbl, conn) - conn.commit() @pytest.fixture From 3eed8976c8bbae2950153900724b6a8c085c95c4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Nov 2023 08:02:12 -0500 Subject: [PATCH 49/61] Bumped ADBC min to 0.8.0 --- ci/deps/actions-310.yaml | 4 +- ci/deps/actions-311-downstream_compat.yaml | 4 +- ci/deps/actions-311.yaml | 4 +- ci/deps/actions-39-minimum_versions.yaml | 4 +- ci/deps/actions-39.yaml | 4 +- ci/deps/circle-310-arm64.yaml | 4 +- doc/source/getting_started/install.rst | 4 +- environment.yml | 4 +- pandas/compat/_optional.py | 4 +- pandas/tests/io/test_sql.py | 48 ++-------------------- pyproject.toml | 4 +- requirements-dev.txt | 4 +- 12 files changed, 26 insertions(+), 66 deletions(-) diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index e769c973b147f..f73e05b13cdcd 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -59,7 +59,7 @@ dependencies: - zstandard>=0.19.0 - pip: - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - pyqt5>=5.15.8 - tzdata>=2022.7 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 35ebf73a372d7..59df5d72bd078 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -73,8 +73,8 @@ dependencies: - pyyaml - py - pip: - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - pyqt5>=5.15.8 - tzdata>=2022.7 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index eb9894d5a5279..365f27ae27f17 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -59,7 +59,7 @@ dependencies: - zstandard>=0.19.0 - pip: - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - pyqt5>=5.15.8 - tzdata>=2022.7 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 43bd7072fd938..3f20f8a03a5c3 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -59,8 +59,8 @@ dependencies: - zstandard=0.19.0 - pip: - - adbc-driver-postgresql==0.7.0 - - adbc-driver-sqlite==0.7.0 + - adbc-driver-postgresql==0.8.0 + - adbc-driver-sqlite==0.8.0 - dataframe-api-compat==0.1.7 - pyqt5==5.15.8 - tzdata==2022.7 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 0e0bb892caccf..a65c4cf294cc7 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -59,7 +59,7 @@ dependencies: - zstandard>=0.19.0 - pip: - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - pyqt5>=5.15.8 - tzdata>=2022.7 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index c8a1c11288594..da918f6f46862 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -58,5 
+58,5 @@ dependencies: - xlsxwriter>=3.0.5 - zstandard>=0.19.0 - pip: - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 31f8222ae6c32..5f8ccabea530a 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -346,8 +346,8 @@ SQLAlchemy 2.0.0 postgresql, SQL support for dat sql-other psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy pymysql 1.0.2 mysql MySQL engine for sqlalchemy -adbc-driver-postgresql 0.7.0 ADBC Driver for PostgreSQL -adbc-driver-sqlite 0.7.0 ADBC Driver for SQLite +adbc-driver-postgresql 0.8.0 ADBC Driver for PostgreSQL +adbc-driver-sqlite 0.8.0 ADBC Driver for SQLite ========================= ================== =============== ============================================================= Other data sources diff --git a/environment.yml b/environment.yml index e2a03317148a6..5fa65532bdf00 100644 --- a/environment.yml +++ b/environment.yml @@ -113,8 +113,8 @@ dependencies: - pygments # Code highlighting - pip: - - adbc-driver-postgresql>=0.7.0 - - adbc-driver-sqlite>=0.7.0 + - adbc-driver-postgresql>=0.8.0 + - adbc-driver-sqlite>=0.8.0 - dataframe-api-compat>=0.1.7 - sphinx-toggleprompt # conda-forge version has stricter pins on jinja2 - typing_extensions; python_version<"3.11" diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 817979f0cba01..84cf7af0fe7a6 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,8 +15,8 @@ # Update install.rst & setup.cfg when updating versions! VERSIONS = { - "adbc-driver-postgresql": "0.7.0", - "adbc-driver-sqlite": "0.7.0", + "adbc-driver-postgresql": "0.8.0", + "adbc-driver-sqlite": "0.8.0", "bs4": "4.11.2", "blosc": "1.21.3", "bottleneck": "1.3.6", diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d168c40b5db85..b758d37dad7b8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -274,25 +274,17 @@ def create_and_load_types_sqlite3(conn, types_data: list[dict]): def create_and_load_types_postgresql(conn, types_data: list[dict]): - # Boolean support not added until 0.8.0 - adbc = import_optional_dependency("adbc_driver_manager", errors="ignore") - - if adbc and Version(adbc.__version__) < Version("0.8.0"): - bool_type = "INTEGER" - else: - bool_type = "BOOLEAN" - with conn.cursor() as cur: - stmt = f"""CREATE TABLE types ( + stmt = """CREATE TABLE types ( "TextCol" TEXT, "DateCol" TIMESTAMP, "IntDateCol" INTEGER, "IntDateOnlyCol" INTEGER, "FloatCol" DOUBLE PRECISION, "IntCol" INTEGER, - "BoolCol" {bool_type}, + "BoolCol" BOOLEAN, "IntColWithNull" INTEGER, - "BoolColWithNull" {bool_type} + "BoolColWithNull" BOOLEAN )""" cur.execute(stmt) @@ -724,17 +716,7 @@ def postgresql_adbc_types(postgresql_adbc_conn, types_data): conn.adbc_get_table_schema("types") except mgr.ProgrammingError: conn.rollback() - # Boolean support not added until 0.8.0 - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): - new_data = [] - for entry in types_data: - entry["BoolCol"] = int(entry["BoolCol"]) - if entry["BoolColWithNull"] is not None: - entry["BoolColWithNull"] = int(entry["BoolColWithNull"]) - new_data.append(tuple(entry.values())) - else: - new_data = [tuple(entry.values()) for entry in types_data] + new_data = [tuple(entry.values()) for entry in types_data] 
create_and_load_types_postgresql(conn, new_data) @@ -3539,11 +3521,6 @@ def test_read_sql_dtype_backend( conn = request.getfixturevalue(conn) table = "test" df = dtype_backend_data - if "adbc" in conn_name: - # Boolean support not added until 0.8.0 - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): - df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): @@ -3551,10 +3528,6 @@ def test_read_sql_dtype_backend( f"Select * from {table}", conn, dtype_backend=dtype_backend ) expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) - if "adbc" in conn_name: - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): - expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3598,19 +3571,11 @@ def test_read_sql_dtype_backend_table( conn = request.getfixturevalue(conn) table = "test" df = dtype_backend_data - if "adbc" in conn_name: - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): - df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) - if "adbc" in conn_name: - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): - expected = expected.drop(columns=["e", "f"]) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3632,14 +3597,9 @@ def test_read_sql_dtype_backend_table( @pytest.mark.parametrize("conn", all_connectable) @pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"]) def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend_data): - conn_name = conn conn = request.getfixturevalue(conn) table = "test" df = dtype_backend_data - if "adbc" in conn_name: - adbc = import_optional_dependency("adbc_driver_manager") - if Version(adbc.__version__) < Version("0.8.0"): - df = df.drop(columns=["e", "f"]) df.to_sql(name=table, con=conn, index=False, if_exists="replace") msg = ( diff --git a/pyproject.toml b/pyproject.toml index a0058bf26ea22..fa746a471a5cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,8 +86,8 @@ output-formatting = ['jinja2>=3.1.2', 'tabulate>=0.9.0'] clipboard = ['PyQt5>=5.15.8', 'qtpy>=2.3.0'] compression = ['zstandard>=0.19.0'] consortium-standard = ['dataframe-api-compat>=0.1.7'] -all = ['adbc-driver-postgresql>=0.7.0', - 'adbc-driver-sqlite>=0.7.0', +all = ['adbc-driver-postgresql>=0.8.0', + 'adbc-driver-sqlite>=0.8.0', 'beautifulsoup4>=4.11.2', # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297) #'blosc>=1.21.3', diff --git a/requirements-dev.txt b/requirements-dev.txt index a799e8b00ae64..3a68da45bc4dc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -81,8 +81,8 @@ feedparser pyyaml requests pygments -adbc-driver-postgresql>=0.7.0 -adbc-driver-sqlite>=0.7.0 +adbc-driver-postgresql>=0.8.0 +adbc-driver-sqlite>=0.8.0 dataframe-api-compat>=0.1.7 sphinx-toggleprompt typing_extensions; python_version<"3.11" From 67101fd08cf41adb335b8550269ff6cfb5539b31 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Nov 
2023 13:45:06 -0500 Subject: [PATCH 50/61] documentation --- doc/source/user_guide/io.rst | 36 +++++++++++++++++++++++++++------- doc/source/whatsnew/v2.2.0.rst | 34 ++++++++++++++++++++++++++++++++ pandas/io/sql.py | 17 ++++++++++------ pandas/tests/io/test_sql.py | 5 +++++ 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 73ec09cdd12f1..f80374ceb250a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5565,9 +5565,16 @@ SQL queries ----------- The :mod:`pandas.io.sql` module provides a collection of query wrappers to both -facilitate data retrieval and to reduce dependency on DB-specific API. Database abstraction -is provided by SQLAlchemy if installed. In addition you will need a driver library for -your database. Examples of such drivers are `psycopg2 `__ +facilitate data retrieval and to reduce dependency on DB-specific API. + +Starting in pandas 2.2 users have the option to use `Apache Arrow ADBC +`_ drivers. Where available, +these drivers should provide the best performance, null handling, and type +detection. + +In earlier versions of pandas and in instances where an ADBC driver is not available, +users should opt for installing SQLAlchemy alongside their database driver library. +Examples of such drivers are `psycopg2 `__ for PostgreSQL or `pymysql `__ for MySQL. For `SQLite `__ this is included in Python's standard library by default. @@ -5600,6 +5607,18 @@ In the following example, we use the `SQlite engine. You can use a temporary SQLite database where data are stored in "memory". +To connect using an ADBC driver you will want to install the ``adbc_driver_sqlite`` using your +package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver +to connect to your database. + +.. ipython:: python + + import adbc_driver_sqlite.dbapi as sqlite_dbapi + + # Create the connection + with sqlite_dbapi.connect("sqlite:///:memory:") as conn: + ... + To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine object from database URI. You only need to create the engine once per database you are connecting to. @@ -5696,7 +5715,9 @@ default ``Text`` type for string columns: Due to the limited support for timedelta's in the different database flavors, columns with type ``timedelta64`` will be written as integer - values as nanoseconds to the database and a warning will be raised. + values as nanoseconds to the database and a warning will be raised. The only + exception to this is when using the ADBC PostgreSQL driver in which case a + timedelta will be written to the database as an ``INTERVAL`` .. note:: @@ -5711,7 +5732,7 @@ default ``Text`` type for string columns: Datetime data types ''''''''''''''''''' -Using SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing +Using ADBC or SQLAlchemy, :func:`~pandas.DataFrame.to_sql` is capable of writing datetime data that is timezone naive or timezone aware. However, the resulting data stored in the database ultimately depends on the supported data type for datetime data of the database system being used. @@ -5802,7 +5823,7 @@ table name and optionally a subset of columns to read. .. note:: In order to use :func:`~pandas.read_sql_table`, you **must** have the - SQLAlchemy optional dependency installed. + ADBC driver or SQLAlchemy optional dependency installed. .. ipython:: python @@ -5810,7 +5831,8 @@ table name and optionally a subset of columns to read. .. 
note:: - Note that pandas infers column dtypes from query outputs, and not by looking + ADBC drivers will map database types directly back to pandas types. For other drivers + note that pandas infers column dtypes from query outputs, and not by looking up data types in the physical database schema. For example, assume ``userid`` is an integer column in a table. Then, intuitively, ``select userid ...`` will return integer-valued series, while ``select cast(userid as text) ...`` will diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index efa4a52993a90..a4162938a5b09 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -89,6 +89,40 @@ a Series. (:issue:`55323`) ) series.list[0] +.. _whatsnew_220.enhancements.adbc_support: + +ADBC Driver support in to_sql and read_sql +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_sql` and :meth:`to_sql` now work with `Apache Arrow ADBC +`_ drivers. Compared to +traditional drivers used via SQLAlchemy, ADBC drivers should provide +significant performance improvements, better type support and cleaner +nullability handling. + +.. ipython:: python + + import adbc_driver_postgresql.dbapi as pg_dbapi + + df = pd.DataFrame( + [ + [1, 2, 3], + [4, 5, 6], + ], + columns=['a', 'b', 'c'] + ) + uri = "postgresql://postgres:postgres@localhost/postgres" + with pg_dbapi.connect(uri) as conn: + df.to_sql("pandas_table", conn, index=False) + + # for roundtripping + with pg_dbapi.connect(uri) as conn: + df2 = pd.read_sql("pandas_table", conn) + +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + .. _whatsnew_220.enhancements.other: Other enhancements diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 2b94f5558f955..b19f47192af5f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -585,7 +585,8 @@ def read_sql( ---------- sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. - con : SQLAlchemy connectable, str, or sqlite3 connection + con : AdbcConnection, SQLAlchemy connectable, str, or sqlite3 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible for engine disposal and connection closure for the SQLAlchemy connectable; str @@ -756,8 +757,9 @@ def to_sql( frame : DataFrame, Series name : str Name of SQL table. - con : SQLAlchemy connectable(engine/connection) or database string URI + con : AdbcConnection, SQLAlchemy connectable(engine/connection), database string URI or sqlite3 DBAPI2 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -812,7 +814,8 @@ def to_sql( Notes ----- The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor`` - or SQLAlchemy connectable. The returned value may not reflect the exact number of written + or SQLAlchemy connectable. If using ADBC the returned rows are the result + of ``Cursor.adbc_ingest``. The returned value may not reflect the exact number of written rows as stipulated in the `sqlite3 `__ or `SQLAlchemy `__ @@ -851,7 +854,8 @@ def has_table(table_name: str, con, schema: str | None = None) -> bool: ---------- table_name: string Name of SQL table. 
- con: SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection + con: AdbcConnection, SQLAlchemy connectable, str, or sqlite3 connection + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -2898,9 +2902,10 @@ def get_schema( name of SQL table keys : string or sequence, default: None columns to use a primary key - con: an open SQL database connection object or a SQLAlchemy connectable + con: AdbcConnection, SQLAlchemy connectable, sqlite3 connection, default: None + ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that - library, default: None + library If a DBAPI2 object, only sqlite3 is supported. dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index b758d37dad7b8..306ede759c3b3 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -58,6 +58,11 @@ import sqlalchemy +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" +) + + @pytest.fixture def sql_strings(): return { From ea5dcb9df0ab7a359b1febf81ff388366089a90f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Nov 2023 15:05:28 -0500 Subject: [PATCH 51/61] doc updates --- doc/source/whatsnew/v2.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index a4162938a5b09..214e16a452ebd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -94,13 +94,13 @@ a Series. (:issue:`55323`) ADBC Driver support in to_sql and read_sql ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:func:`read_sql` and :meth:`to_sql` now work with `Apache Arrow ADBC +:func:`read_sql` and :meth:`~DataFrame.to_sql` now work with `Apache Arrow ADBC `_ drivers. Compared to traditional drivers used via SQLAlchemy, ADBC drivers should provide significant performance improvements, better type support and cleaner nullability handling. -.. ipython:: python +.. code-block:: ipython import adbc_driver_postgresql.dbapi as pg_dbapi From fb38411c7547e7b53c51b12790c4750daa04808c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 10 Nov 2023 15:06:26 -0500 Subject: [PATCH 52/61] more fixups --- pandas/io/sql.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b19f47192af5f..db263f68720a7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -589,8 +589,8 @@ def read_sql( ADBC provides high performance I/O with native type support, where available. Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible - for engine disposal and connection closure for the SQLAlchemy connectable; str - connections are closed automatically. See + for engine disposal and connection closure for the ADBC connection and + SQLAlchemy connectable; str connections are closed automatically. See `here `_. index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). 
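The connection-lifetime contract spelled out in the hunk above is easiest to see end to end. A minimal sketch, assuming the SQLite ADBC driver and an in-memory database; the ``demo`` table name is illustrative and not part of this series:

.. code-block:: python

    import adbc_driver_sqlite.dbapi as sqlite_dbapi

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # pandas closes str/URI connections on your behalf, but an ADBC
    # connection is owned by the caller, so the context manager here is
    # what actually disposes of it.
    with sqlite_dbapi.connect(":memory:") as conn:
        df.to_sql("demo", conn, index=False)
        result = pd.read_sql("SELECT * FROM demo", conn)
        conn.commit()

Note the explicit ``conn.commit()``: the fixtures earlier in this series follow the same pattern, since the ADBC DBAPI layer does not autocommit.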
From a0bed67ade0f7ec1900fec6d38d8596b8abaf56e Mon Sep 17 00:00:00 2001
From: Will Ayd
Date: Fri, 10 Nov 2023 19:13:51 -0500
Subject: [PATCH 53/61] documentation fixups

---
 doc/source/user_guide/io.rst | 2 +-
 pandas/io/sql.py             | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index f80374ceb250a..5a22524188cf5 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -5611,7 +5611,7 @@ To connect using an ADBC driver you will want to install the ``adbc_driver_sqlit
 package manager. Once installed, you can use the DBAPI interface provided by the ADBC driver
 to connect to your database.
 
-.. ipython:: python
+.. code-block:: python
 
     import adbc_driver_sqlite.dbapi as sqlite_dbapi
 
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index db263f68720a7..3a542b97d6d5c 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -585,7 +585,7 @@ def read_sql(
     ----------
     sql : str or SQLAlchemy Selectable (select or text object)
         SQL query to be executed or a table name.
-    con : AdbcConnection, SQLAlchemy connectable, str, or sqlite3 connection
+    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
         ADBC provides high performance I/O with native type support, where available.
         Using SQLAlchemy makes it possible to use any DB supported by that
         library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
@@ -757,7 +757,6 @@ def to_sql(
     frame : DataFrame, Series
     name : str
        Name of SQL table.
-    con : AdbcConnection, SQLAlchemy connectable(engine/connection), database string URI
-        or sqlite3 DBAPI2 connection
+    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
         ADBC provides high performance I/O with native type support, where available.
         Using SQLAlchemy makes it possible to use any DB supported by that
@@ -854,7 +854,7 @@ def has_table(table_name: str, con, schema: str | None = None) -> bool:
     ----------
     table_name: string
        Name of SQL table.
-    con: AdbcConnection, SQLAlchemy connectable, str, or sqlite3 connection
+    con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
        ADBC provides high performance I/O with native type support, where available.
        Using SQLAlchemy makes it possible to use any DB supported by that
        library.
@@ -2902,7 +2902,7 @@ def get_schema(
         name of SQL table
     keys : string or sequence, default: None
         columns to use a primary key
-    con: AdbcConnection, SQLAlchemy connectable, sqlite3 connection, default: None
+    con: ADBC Connection, SQLAlchemy connectable, sqlite3 connection, default: None
         ADBC provides high performance I/O with native type support, where available.
Using SQLAlchemy makes it possible to use any DB supported by that library From 1e77f2b806e7bf194543549d427b4889d0c6de9c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 13 Nov 2023 12:05:10 -0500 Subject: [PATCH 54/61] fixes --- pandas/tests/io/test_sql.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 306ede759c3b3..129a27c74ed0a 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1012,8 +1012,8 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): if "adbc" in conn: if conn == "sqlite_adbc_conn": df = df.drop(columns=["timedelta"]) - exp_warning = FutureWarning # warning thrown from pyarrow - msg = "is_sparse is deprecated" + exp_warning = None # warning thrown from pyarrow + msg = "" else: exp_warning = UserWarning msg = "the 'timedelta'" @@ -1214,8 +1214,8 @@ def test_default_type_conversion(conn, request): assert issubclass(df.FloatCol.dtype.type, np.floating) assert issubclass(df.IntCol.dtype.type, np.integer) - # MySQL/sqlite has no real BOOL type, but ADBC loses this - if "postgresql" in conn_name and "adbc" not in conn_name: + # MySQL/sqlite has no real BOOL type + if "postgresql" in conn_name: assert issubclass(df.BoolCol.dtype.type, np.bool_) else: assert issubclass(df.BoolCol.dtype.type, np.integer) @@ -1825,7 +1825,7 @@ def test_api_custom_dateparsing_error( result["BoolCol"] = result["BoolCol"].astype(int) result["BoolColWithNull"] = result["BoolColWithNull"].astype(float) - if conn_name == "postgresql_adbc_conn": + if conn_name == "postgresql_adbc_types": expected = expected.astype( { "IntDateCol": "int32", @@ -1875,7 +1875,7 @@ def test_api_timedelta(conn, request): ) if "adbc" in conn_name: - exp_warning = FutureWarning # pyarrow warns is_sparse is deprecated + exp_warning = None else: exp_warning = UserWarning @@ -2216,10 +2216,10 @@ def test_api_chunksize_read(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_api_categorical(conn, request): - if "adbc" in conn: + if conn == "postgresql_adbc_conn": request.node.add_marker( pytest.mark.xfail( - reason="categorical dtype not implemented for ADBC drivers", + reason="categorical dtype not implemented for ADBC postgres driver", strict=True, ) ) @@ -3115,13 +3115,6 @@ def test_transactions(conn, request): @pytest.mark.parametrize("conn", all_connectable) def test_transaction_rollback(conn, request): - if conn == "postgresql_adbc_conn": - request.node.add_marker( - pytest.mark.xfail( - reason="broken for postgres ADBC driver - needs investigation", - strict=True, - ) - ) conn_name = conn conn = request.getfixturevalue(conn) with pandasSQL_builder(conn) as pandasSQL: From 97ed24fb11a55d52767a759630b0d5feb325b529 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 13 Nov 2023 12:20:53 -0500 Subject: [PATCH 55/61] more documentation --- doc/source/user_guide/io.rst | 2 +- doc/source/whatsnew/v2.2.0.rst | 57 ++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 5a22524188cf5..d23a04c52d48a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5831,7 +5831,7 @@ table name and optionally a subset of columns to read. .. note:: - ADBC drivers will map database types directly back to pandas types. For other drivers + ADBC drivers will map database types directly back to arrow types. 
For other drivers,
+note that pandas infers column dtypes from query outputs, and not by looking
 up data types in the physical database schema. For example, assume ``userid``
 is an integer column in a table. Then, intuitively, ``select userid ...`` will
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 214e16a452ebd..5e3ec634a686f 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -119,6 +119,63 @@ nullability handling.
     with pg_dbapi.connect(uri) as conn:
         df2 = pd.read_sql("pandas_table", conn)
 
+The Arrow type system offers a wider array of types that can more closely match
+what databases like PostgreSQL can offer. To illustrate, note this (non-exhaustive)
+listing of types available in different databases and pandas backends:
+
++-----------------+-----------------------+----------------+---------+
+|numpy/pandas     |arrow                  |postgres        |sqlite   |
++=================+=======================+================+=========+
+|int16/Int16      |int16                  |SMALLINT        |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int32/Int32      |int32                  |INTEGER         |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int64/Int64      |int64                  |BIGINT          |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|float32          |float32                |REAL            |REAL     |
++-----------------+-----------------------+----------------+---------+
+|float64          |float64                |DOUBLE PRECISION|REAL     |
++-----------------+-----------------------+----------------+---------+
+|object           |string                 |TEXT            |TEXT     |
++-----------------+-----------------------+----------------+---------+
+|bool             |bool_                  |BOOLEAN         |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns]   |timestamp(us)          |TIMESTAMP       |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns,tz]|timestamp(us,tz)       |TIMESTAMPTZ     |         |
++-----------------+-----------------------+----------------+---------+
+|                 |date32                 |DATE            |         |
++-----------------+-----------------------+----------------+---------+
+|                 |month_day_nano_interval|INTERVAL        |         |
++-----------------+-----------------------+----------------+---------+
+|                 |binary                 |BINARY          |BLOB     |
++-----------------+-----------------------+----------------+---------+
+|                 |decimal128             |DECIMAL [#f1]_  |         |
++-----------------+-----------------------+----------------+---------+
+|                 |list                   |ARRAY [#f1]_    |         |
++-----------------+-----------------------+----------------+---------+
+|                 |struct*                |COMPOSITE TYPE  |         |
+|                 |                       |[#f1]_          |         |
++-----------------+-----------------------+----------------+---------+
+
+.. rubric:: Footnotes
+
+.. [#f1] Not implemented as of writing, but theoretically possible
+
+If you are interested in preserving database types as best as possible
+throughout the lifecycle of your DataFrame, you are encouraged to
+leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`
+
+.. code-block:: ipython
+
+    # for roundtripping
+    with pg_dbapi.connect(uri) as conn:
+        df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+This will prevent your data from being converted to the traditional pandas/NumPy
+type system, which often converts SQL types in ways that make them impossible to
+round-trip.
+
 For a full list of ADBC drivers and their development status, see the `ADBC Driver
 Implementation Status <https://arrow.apache.org/adbc/current/driver/status.html>`_
 documentation.
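One concrete consequence of the mapping table above is null handling: a nullable integer survives the round trip only if the Arrow type system stays in play. A short sketch of the same ``dtype_backend`` point, using the SQLite ADBC driver so it runs without a server; the table name is hypothetical:

.. code-block:: python

    import adbc_driver_sqlite.dbapi as sqlite_dbapi

    import pandas as pd

    df = pd.DataFrame({"x": pd.array([1, None, 3], dtype="Int64")})

    with sqlite_dbapi.connect(":memory:") as conn:
        df.to_sql("nulls_demo", conn, index=False)

        # default NumPy backend: the NULL forces a lossy cast to float64
        lossy = pd.read_sql("SELECT * FROM nulls_demo", conn)

        # pyarrow backend: the nullable integer column comes back intact
        faithful = pd.read_sql(
            "SELECT * FROM nulls_demo", conn, dtype_backend="pyarrow"
        )

    lossy.dtypes     # x    float64
    faithful.dtypes  # x    int64[pyarrow]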
From 7dc07daa3476be1840a9eae13e8a1992384196c8 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 13 Nov 2023 13:20:17 -0500 Subject: [PATCH 56/61] doc spacing --- doc/source/whatsnew/v2.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 5e3ec634a686f..e8750d1c9b883 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -155,7 +155,7 @@ listing of types available in different databases and pandas backends: | |list |ARRAY [#f1]_ | | +-----------------+-----------------------+----------------+---------+ | |struct* |COMPOSITE TYPE | | -| | |[#f1]_ | | +| | | [#f1]_ | | +-----------------+-----------------------+----------------+---------+ .. rubric:: Footnotes From 52ee8d35aa7e50ccd71f4cec471ac57a6c593d7a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 13 Nov 2023 21:11:37 -0500 Subject: [PATCH 57/61] doc target fix --- doc/source/whatsnew/v2.2.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e8750d1c9b883..52cb474118cbe 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -138,7 +138,7 @@ listing of types available in different databases and pandas backends: +-----------------+-----------------------+----------------+---------+ |object |string |TEXT |TEXT | +-----------------+-----------------------+----------------+---------+ -|bool |bool_ |BOOLEAN | | +|bool |``bool_`` |BOOLEAN | | +-----------------+-----------------------+----------------+---------+ |datetime64[ns] |timestamp(us) |TIMESTAMP | | +-----------------+-----------------------+----------------+---------+ @@ -154,7 +154,7 @@ listing of types available in different databases and pandas backends: +-----------------+-----------------------+----------------+---------+ | |list |ARRAY [#f1]_ | | +-----------------+-----------------------+----------------+---------+ -| |struct* |COMPOSITE TYPE | | +| |struct |COMPOSITE TYPE | | | | | [#f1]_ | | +-----------------+-----------------------+----------------+---------+ From 1de848816f02169680d21bf7654af4100a5301de Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 14 Nov 2023 00:24:19 -0500 Subject: [PATCH 58/61] pyarrow warning compat --- pandas/tests/io/test_sql.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 129a27c74ed0a..2978b348224d2 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,7 +19,10 @@ import pytest from pandas._libs import lib -from pandas.compat import pa_version_under13p0 +from pandas.compat import ( + pa_version_under13p0, + pa_version_under14p1, +) from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -1012,8 +1015,12 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): if "adbc" in conn: if conn == "sqlite_adbc_conn": df = df.drop(columns=["timedelta"]) - exp_warning = None # warning thrown from pyarrow - msg = "" + if pa_version_under14p1: + exp_warning = FutureWarning + msg = "is_sparse is deprecated" + else: + exp_warning = None + msg = "" else: exp_warning = UserWarning msg = "the 'timedelta'" @@ -1875,7 +1882,10 @@ def test_api_timedelta(conn, request): ) if "adbc" in conn_name: - exp_warning = None + if pa_version_under14p1: + exp_warning = FutureWarning + else: + exp_warning = None else: exp_warning = UserWarning From 
2d077e95bb856f3909cefbdfbd699a532337e487 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 16 Nov 2023 22:12:14 -0500 Subject: [PATCH 59/61] feedback --- doc/source/user_guide/io.rst | 15 +++++++++------ pandas/io/sql.py | 4 ++-- pandas/tests/io/test_sql.py | 6 ++++-- pyproject.toml | 4 ++-- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d23a04c52d48a..cce93cc1e6471 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5567,12 +5567,15 @@ SQL queries The :mod:`pandas.io.sql` module provides a collection of query wrappers to both facilitate data retrieval and to reduce dependency on DB-specific API. -Starting in pandas 2.2 users have the option to use `Apache Arrow ADBC -`_ drivers. Where available, -these drivers should provide the best performance, null handling, and type -detection. +Where available, users may first want to opt for `Apache Arrow ADBC +`_ drivers. These drivers +should provide the best performance, null handling, and type detection. -In earlier versions of pandas and in instances where an ADBC driver is not available, + .. versionadded:: 2.2.0 + + Added native support for ADBC drivers + +Where an ADBC driver is not available or may be missing functionality, users should opt for installing SQLAlchemy alongside their database driver library. Examples of such drivers are `psycopg2 `__ for PostgreSQL or `pymysql `__ for MySQL. @@ -5617,7 +5620,7 @@ to connect to your database. # Create the connection with sqlite_dbapi.connect("sqlite:///:memory:") as conn: - ... + df = pd.read_sql_table("data", conn) To connect with SQLAlchemy you use the :func:`create_engine` function to create an engine object from database URI. You only need to create the engine once per database you are diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 3a542b97d6d5c..21520f3ab24a8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -213,7 +213,7 @@ def _wrap_result_adbc( parse_dates=None, dtype: DtypeArg | None = None, dtype_backend: DtypeBackend | Literal["numpy"] = "numpy", -): +) -> DataFrame: """Wrap result set of a SQLAlchemy query in a DataFrame.""" if dtype: df = df.astype(dtype) @@ -676,7 +676,7 @@ def read_sql( 0 0 2012-11-10 1 1 2010-11-12 - .. versionadded:: 2.1.0 + .. 
versionadded:: 2.2.0 pandas now supports reading via ADBC drivers diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 8872fdbf4cc49..144210166d1a6 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -948,8 +948,8 @@ def sqlite_buildin_types(sqlite_buildin, types_data): ) adbc_connectable = [ + "sqlite_adbc_conn", pytest.param("postgresql_adbc_conn", marks=pytest.mark.db), - pytest.param("sqlite_adbc_conn", marks=pytest.mark.db), ] adbc_connectable_iris = [ @@ -3540,7 +3540,9 @@ def test_read_sql_dtype_backend( if "adbc" in conn_name: # adbc does not support chunksize argument - return + request.applymarker( + pytest.mark.xfail(reason="adbc does not support chunksize argument") + ) with pd.option_context("mode.string_storage", string_storage): iterator = getattr(pd, func)( diff --git a/pyproject.toml b/pyproject.toml index 153c96b94b9fc..88a3907b14ee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,9 +76,9 @@ hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/i #'blosc>=1.20.1', 'tables>=3.8.0'] spss = ['pyreadstat>=1.2.0'] -postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6'] +postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.8.0'] mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.0.2'] -sql-other = ['SQLAlchemy>=2.0.0'] +sql-other = ['SQLAlchemy>=2.0.0', 'adbc-driver-postgresql>=0.8.0', 'adbc-driver-sqlite>=0.8.0'] html = ['beautifulsoup4>=4.11.2', 'html5lib>=1.1', 'lxml>=4.9.2'] xml = ['lxml>=4.9.2'] plot = ['matplotlib>=3.6.3'] From accbd49e18b2df33c673a55f72a17b608131048b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 17 Nov 2023 11:34:51 -0500 Subject: [PATCH 60/61] updated io documentation --- doc/source/user_guide/io.rst | 75 ++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cce93cc1e6471..21e07e8d00ad6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5575,6 +5575,10 @@ should provide the best performance, null handling, and type detection. Added native support for ADBC drivers +For a full list of ADBC drivers and their development status, see the `ADBC Driver +Implementation Status `_ +documentation. + Where an ADBC driver is not available or may be missing functionality, users should opt for installing SQLAlchemy alongside their database driver library. Examples of such drivers are `psycopg2 `__ @@ -5697,9 +5701,74 @@ writes ``data`` to the database in batches of 1000 rows at a time: SQL data types ++++++++++++++ -:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate -SQL data type based on the dtype of the data. When you have columns of dtype -``object``, pandas will try to infer the data type. +Ensuring consistent data type management across SQL databases is challenging. +Not every SQL database offers the same types, and even when they do the implementation +of a given type can vary in ways that have subtle effects on how types can be +preserved. + +For the best odds at preserving database types users are advised to use +ADBC drivers when available. The Arrow type system offers a wider array of +types that more closely match database types than the historical pandas/NumPy +type system. 
To illustrate, note this (non-exhaustive) listing of types
+available in different databases and pandas backends:
+
++-----------------+-----------------------+----------------+---------+
+|numpy/pandas     |arrow                  |postgres        |sqlite   |
++=================+=======================+================+=========+
+|int16/Int16      |int16                  |SMALLINT        |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int32/Int32      |int32                  |INTEGER         |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|int64/Int64      |int64                  |BIGINT          |INTEGER  |
++-----------------+-----------------------+----------------+---------+
+|float32          |float32                |REAL            |REAL     |
++-----------------+-----------------------+----------------+---------+
+|float64          |float64                |DOUBLE PRECISION|REAL     |
++-----------------+-----------------------+----------------+---------+
+|object           |string                 |TEXT            |TEXT     |
++-----------------+-----------------------+----------------+---------+
+|bool             |``bool_``              |BOOLEAN         |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns]   |timestamp(us)          |TIMESTAMP       |         |
++-----------------+-----------------------+----------------+---------+
+|datetime64[ns,tz]|timestamp(us,tz)       |TIMESTAMPTZ     |         |
++-----------------+-----------------------+----------------+---------+
+|                 |date32                 |DATE            |         |
++-----------------+-----------------------+----------------+---------+
+|                 |month_day_nano_interval|INTERVAL        |         |
++-----------------+-----------------------+----------------+---------+
+|                 |binary                 |BINARY          |BLOB     |
++-----------------+-----------------------+----------------+---------+
+|                 |decimal128             |DECIMAL [#f1]_  |         |
++-----------------+-----------------------+----------------+---------+
+|                 |list                   |ARRAY [#f1]_    |         |
++-----------------+-----------------------+----------------+---------+
+|                 |struct                 |COMPOSITE TYPE  |         |
+|                 |                       | [#f1]_         |         |
++-----------------+-----------------------+----------------+---------+
+
+.. rubric:: Footnotes
+
+.. [#f1] Not implemented as of writing, but theoretically possible
+
+If you are interested in preserving database types as best as possible
+throughout the lifecycle of your DataFrame, you are encouraged to
+leverage the ``dtype_backend="pyarrow"`` argument of :func:`~pandas.read_sql`
+
+.. code-block:: ipython
+
+    # for roundtripping
+    with pg_dbapi.connect(uri) as conn:
+        df2 = pd.read_sql("pandas_table", conn, dtype_backend="pyarrow")
+
+This will prevent your data from being converted to the traditional pandas/NumPy
+type system, which often converts SQL types in ways that make them impossible to
+round-trip.
+
+In case an ADBC driver is not available, :func:`~pandas.DataFrame.to_sql`
+will try to map your data to an appropriate SQL data type based on the dtype of
+the data. When you have columns of dtype ``object``, pandas will try to infer
+the data type.
 
 You can always override the default type by specifying the
 desired SQL type of any of the columns by using the ``dtype`` argument.
This argument needs a From f84f63a07bc444594868b2a5af319f3cf9d7c77a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 17 Nov 2023 18:32:06 -0800 Subject: [PATCH 61/61] install updates --- doc/source/getting_started/install.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 5f8ccabea530a..27131dd113f1f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -335,8 +335,7 @@ lxml 4.9.2 xml XML parser for read SQL databases ^^^^^^^^^^^^^ -Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"``. ADBC drivers -must be installed separately. +Traditional drivers are installable with ``pip install "pandas[postgresql, mysql, sql-other]"`` ========================= ================== =============== ============================================================= Dependency Minimum Version pip extra Notes @@ -346,8 +345,8 @@ SQLAlchemy 2.0.0 postgresql, SQL support for dat sql-other psycopg2 2.9.6 postgresql PostgreSQL engine for sqlalchemy pymysql 1.0.2 mysql MySQL engine for sqlalchemy -adbc-driver-postgresql 0.8.0 ADBC Driver for PostgreSQL -adbc-driver-sqlite 0.8.0 ADBC Driver for SQLite +adbc-driver-postgresql 0.8.0 postgresql ADBC Driver for PostgreSQL +adbc-driver-sqlite 0.8.0 sql-other ADBC Driver for SQLite ========================= ================== =============== ============================================================= Other data sources
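With the extras above wired up, a quick post-install check confirms the pinned minimums are satisfied. A small sketch; it leans on the standard library plus the ``VERSIONS`` table that patch 49 in this series bumped, and assumes both driver packages are installed:

.. code-block:: python

    from importlib.metadata import version

    from pandas.compat._optional import VERSIONS

    # e.g. after `pip install "pandas[postgresql, sql-other]"`
    for pkg in ("adbc-driver-postgresql", "adbc-driver-sqlite"):
        print(pkg, "installed:", version(pkg), "minimum:", VERSIONS[pkg])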