diff --git a/ci/deps/actions-310-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml
index 3e32665d5433f..f96deaaddf9fe 100644
--- a/ci/deps/actions-310-numpydev.yaml
+++ b/ci/deps/actions-310-numpydev.yaml
@@ -14,6 +14,7 @@ dependencies:
   - python-dateutil
   - pytz
   - pip
+  - python-duckdb
   - pip:
     - cython==0.29.24  # GH#34014
     - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple"
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index 9829380620f86..95cc6143d0f52 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -43,6 +43,7 @@ dependencies:
   - s3fs
   - scipy
   - sqlalchemy
+  - python-duckdb
   - tabulate
   - xarray
   - xlrd
diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml
index af4f7dee851d5..f537260c1b569 100644
--- a/ci/deps/actions-38-downstream_compat.yaml
+++ b/ci/deps/actions-38-downstream_compat.yaml
@@ -45,6 +45,7 @@ dependencies:
   - xlrd
   - xlsxwriter
   - xlwt
+  - python-duckdb
 
   # downstream packages
   - aiobotocore<2.0.0  # GH#44311 pinned to fix docbuild
diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml
index 467402bb6ef7f..e2f3de76b436c 100644
--- a/ci/deps/actions-38-minimum_versions.yaml
+++ b/ci/deps/actions-38-minimum_versions.yaml
@@ -50,3 +50,4 @@ dependencies:
   - xlsxwriter=1.2.2
   - xlwt=1.3.0
   - zstandard=0.15.2
+  - python-duckdb=0.3.1
diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml
index b23f686d845e9..f04ed18feb910 100644
--- a/ci/deps/actions-38.yaml
+++ b/ci/deps/actions-38.yaml
@@ -42,6 +42,7 @@ dependencies:
   - s3fs
   - scipy
   - sqlalchemy
+  - python-duckdb
   - tabulate
   - xarray
   - xlrd
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 631ef40b02e33..bb2dc031b314b 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -42,6 +42,7 @@ dependencies:
   - s3fs
   - scipy
   - sqlalchemy
+  - python-duckdb
   - tabulate
   - xarray
   - xlrd
diff --git a/ci/deps/azure-macos-310.yaml b/ci/deps/azure-macos-310.yaml
index 312fac8091db6..3903fccb56cbc 100644
--- a/ci/deps/azure-macos-310.yaml
+++ b/ci/deps/azure-macos-310.yaml
@@ -34,3 +34,4 @@ dependencies:
   - xlsxwriter
   - xlwt
   - zstandard
+  - python-duckdb
diff --git a/ci/deps/azure-macos-38.yaml b/ci/deps/azure-macos-38.yaml
index 422aa86c57fc7..d335fe7cd668f 100644
--- a/ci/deps/azure-macos-38.yaml
+++ b/ci/deps/azure-macos-38.yaml
@@ -34,3 +34,4 @@ dependencies:
   - xlsxwriter
   - xlwt
   - zstandard
+  - python-duckdb=0.3.1
diff --git a/ci/deps/azure-macos-39.yaml b/ci/deps/azure-macos-39.yaml
index 140d67796452c..f005ef1f81275 100644
--- a/ci/deps/azure-macos-39.yaml
+++ b/ci/deps/azure-macos-39.yaml
@@ -34,3 +34,4 @@ dependencies:
   - xlsxwriter
   - xlwt
   - zstandard
+  - python-duckdb
diff --git a/ci/deps/azure-windows-310.yaml b/ci/deps/azure-windows-310.yaml
index 8e6f4deef6057..136f5d28a4e24 100644
--- a/ci/deps/azure-windows-310.yaml
+++ b/ci/deps/azure-windows-310.yaml
@@ -33,6 +33,7 @@ dependencies:
   - s3fs>=0.4.2
   - scipy
   - sqlalchemy
+  - python-duckdb
   - xlrd
   - xlsxwriter
   - xlwt
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index eb533524147d9..5562f40516eed 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -33,3 +33,4 @@ dependencies:
   - xlsxwriter
   - xlwt
   - zstandard
+  - python-duckdb
diff --git a/ci/deps/azure-windows-39.yaml b/ci/deps/azure-windows-39.yaml
index 6f820b1c2aedb..729ec1e9b70c4 100644
--- a/ci/deps/azure-windows-39.yaml
+++ b/ci/deps/azure-windows-39.yaml
@@ -32,6 +32,7 @@ dependencies:
   - s3fs>=0.4.2
   - scipy
   - sqlalchemy
+  - python-duckdb
   - xlrd
   - xlsxwriter
   - xlwt
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index df9c258f4aa6d..cdff578624ce2 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -348,6 +348,7 @@ Dependency                Minimum Version    Notes
 SQLAlchemy                1.4.0              SQL support for databases other than sqlite
 psycopg2                  2.8.4              PostgreSQL engine for sqlalchemy
 pymysql                   0.10.1             MySQL engine for sqlalchemy
+duckdb                    0.3.1              High-performance analytical database system
 ========================= ================== =============================================================
 
 Other data sources
diff --git a/environment.yml b/environment.yml
index a168e691821c3..170b56a568d9a 100644
--- a/environment.yml
+++ b/environment.yml
@@ -89,6 +89,7 @@ dependencies:
   - numexpr>=2.7.1
   - scipy>=1.4.1
   - numba>=0.50.1
+  - python-duckdb>=0.3.1
 
   # optional for io
   # ---------------
@@ -123,3 +124,4 @@ dependencies:
     - pydata-sphinx-theme
     - pandas-dev-flaker==0.2.0
     - pytest-cython
+    - duckdb
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index a26bc94ab883e..833584c55cf15 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -13,6 +13,7 @@
     "bs4": "4.8.2",
     "blosc": "1.20.1",
     "bottleneck": "1.3.1",
+    "duckdb": "0.3.1",
     "fastparquet": "0.4.0",
     "fsspec": "0.7.4",
     "html5lib": "1.1",
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index fcb3f5177ae3f..c6b69b43b852a 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -733,6 +733,13 @@ def pandasSQL_builder(con, schema: str | None = None):
     import sqlite3
     import warnings
 
+    # duckdb is optional: errors="ignore" yields None when it is missing, so
+    # connections of other kinds still reach the branches below.
+    duckdb = import_optional_dependency("duckdb", errors="ignore")
+
+    if duckdb is not None and isinstance(con, duckdb.DuckDBPyConnection):
+        return DuckDBDatabase(con)
+
     if isinstance(con, sqlite3.Connection) or con is None:
         return SQLiteDatabase(con)
 
@@ -2226,3 +2233,104 @@ def get_schema(
     return pandas_sql._create_sql_schema(
         frame, name, keys=keys, dtype=dtype, schema=schema
     )
+
+
+class DuckDBDatabase(PandasSQL):
+    """
+    Version of SQLDatabase to support DuckDB connections (fallback without
+    SQLAlchemy). This should only be used internally.
+
+    Parameters
+    ----------
+    con : duckdb connection object
+
+    """
+
+    def __init__(self, con):
+        self.con = con
+
+    def to_sql(
+        self,
+        frame,
+        name,
+        if_exists="fail",
+        index=True,
+        index_label=None,
+        schema=None,
+        chunksize=None,
+        dtype: DtypeArg | None = None,
+        method=None,
+        **kwargs,
+    ) -> int | None:
+        """
+        Write records stored in a DataFrame to a DuckDB table.
+
+        Parameters
+        ----------
+        frame : DataFrame
+        name : string
+            Name of SQL table. It is quoted, so it is always treated as a
+            single identifier.
+        if_exists : {'fail', 'replace', 'append'}, default 'fail'
+            fail: If table exists, raise a ValueError.
+            replace: If table exists, drop it, recreate it, and insert data.
+            append: If table exists, insert data. Create if it does not exist.
+        index : bool, default True
+            Ignored parameter included for compatibility with SQLAlchemy
+            and SQLite version of ``to_sql``.
+        index_label : string or sequence, default None
+            Ignored parameter included for compatibility with SQLAlchemy
+            and SQLite version of ``to_sql``.
+        schema : string, default None
+            Ignored parameter included for compatibility with SQLAlchemy
+            version of ``to_sql``.
+        chunksize : int, default None
+            Ignored parameter included for compatibility with SQLAlchemy
+            and SQLite version of ``to_sql``.
+        dtype : optional
+            Ignored parameter included for compatibility with SQLAlchemy
+            and SQLite version of ``to_sql``.
+        method : {None, 'multi', callable}, default None
+            Ignored parameter included for compatibility with SQLAlchemy
+            and SQLite version of ``to_sql``.
+
+        Returns
+        -------
+        int
+            First value of the row DuckDB returns for the write statement
+            (presumably the number of rows written).
+
+        Raises
+        ------
+        ValueError
+            If ``if_exists`` is not one of the accepted options, or if the
+            table already exists and ``if_exists`` is 'fail'.
+        """
+        # Reject bad options up front so the outcome does not depend on
+        # whether the table happens to exist yet.
+        if if_exists not in ("fail", "replace", "append"):
+            raise ValueError(f"'{if_exists}' is not valid for if_exists")
+
+        # Quote the identifier (doubling embedded quotes) and bind the
+        # lookup name as a parameter instead of splicing raw text into SQL.
+        table = '"' + str(name).replace('"', '""') + '"'
+        table_exists = bool(
+            self.con.execute(
+                "SELECT name FROM sqlite_master WHERE name = ?", [str(name)]
+            ).fetchall()
+        )
+
+        # NOTE: ``frame`` in the statements below is not a table; the SQL relies
+        # on DuckDB resolving it to the local DataFrame of that name.
+        if table_exists:
+            if if_exists == "fail":
+                raise ValueError(f"Table '{name}' already exists.")
+            if if_exists == "append":
+                return self.con.execute(
+                    f"INSERT INTO {table} SELECT * FROM frame"
+                ).fetchone()[0]
+            self.con.execute(f"DROP TABLE {table}")
+
+        return self.con.execute(
+            f"CREATE TABLE {table} AS SELECT * FROM frame"
+        ).fetchone()[0]
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 741af4324c1a6..a70031bb1f63a 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -71,6 +71,13 @@ except ImportError:
     SQLALCHEMY_INSTALLED = False
 
 
+try:
+    import duckdb
+
+    DUCKDB_INSTALLED = True
+except ImportError:
+    DUCKDB_INSTALLED = False
+
 SQL_STRINGS = {
     "read_parameters": {
         "sqlite": "SELECT * FROM iris WHERE Name=? AND SepalLength=?",
@@ -2936,3 +2943,56 @@ def test_if_exists(self):
             (5, "E"),
         ]
         self.drop_table(table_name)
+
+
+@pytest.mark.skipif(not DUCKDB_INSTALLED, reason="duckdb is not installed")
+class TestDuckDB:
+    @pytest.fixture
+    def con(self):
+        # Close the in-memory database even when an assertion fails.
+        connection = duckdb.connect()
+        yield connection
+        connection.close()
+
+    @pytest.fixture
+    def df(self):
+        return DataFrame(
+            [[None, 10, 1.0], ["nick", None, 1.5], ["juli", 14, None]],
+            columns=["Name", "Age", "Numeric"],
+        )
+
+    def test_to_sql_duck(self, con, df):
+        df.to_sql("ages", con)
+        result = con.execute(
+            'SELECT count(*), sum("Age"), sum("Numeric") FROM ages'
+        ).fetchone()
+        assert result == (3, 24, 2.5)
+
+    def test_to_sql_duck_all_exist_options(self, con, df):
+        con.execute("CREATE TABLE ages (a INTEGER)")
+
+        msg = "Table 'ages' already exists."
+        with pytest.raises(ValueError, match=msg):
+            df.to_sql("ages", con)
+
+        df.to_sql("ages", con, if_exists="replace")
+        result = con.execute(
+            'SELECT count(*), sum("Age"), sum("Numeric") FROM ages'
+        ).fetchone()
+        assert result == (3, 24, 2.5)
+
+        df.to_sql("ages", con, if_exists="append")
+        result = con.execute(
+            'SELECT count(*), sum("Age"), sum("Numeric") FROM ages'
+        ).fetchone()
+        assert result == (6, 48, 5)
+
+        msg = "'flark' is not valid for if_exists"
+        with pytest.raises(ValueError, match=msg):
+            df.to_sql("ages", con, if_exists="flark")
+
+    def test_to_sql_duck_quotes_table_name(self, con, df):
+        # A name that is not a bare identifier must still round-trip.
+        df.to_sql("age table", con)
+        result = con.execute('SELECT count(*) FROM "age table"').fetchone()
+        assert result == (3,)
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 2434428101285..c2136e4fbd04c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -61,6 +61,7 @@ matplotlib>=3.3.2
 numexpr>=2.7.1
 scipy>=1.4.1
 numba>=0.50.1
+python-duckdb>=0.3.1
 beautifulsoup4>=4.8.2
 html5lib
 lxml
@@ -86,4 +87,5 @@ natsort
 pydata-sphinx-theme
 pandas-dev-flaker==0.2.0
 pytest-cython
+duckdb
 setuptools>=51.0.0