ENH: to_sql() add parameter "method" to control insertions method (pandas-dev#8953)

schettino72 · schettino72 · commit a77cdfd63df7 · 2018-06-09T18:49:08.000+08:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -16,6 +16,7 @@ Other Enhancements
 - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
 - :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. (:issue:`21227`)
 - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`)
+- :func:`~pandas.DataFrame.to_sql` add parameter ``method`` to control SQL insertion clause (:8953:)
 
 
 .. _whatsnew_0240.api_breaking:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2014,7 +2014,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
                                   **kwargs)
 
     def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
-               index_label=None, chunksize=None, dtype=None):
+               index_label=None, chunksize=None, dtype=None, method='default'):
         """
         Write records stored in a DataFrame to a SQL database.
 
@@ -2052,6 +2052,8 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
             Specifying the datatype for columns. The keys should be the column
             names and the values should be the SQLAlchemy types or strings for
             the sqlite3 legacy mode.
+        method : {'default', 'multi', callable}, default 'default'
+            Controls the SQL insertion clause used.
 
         Raises
         ------
@@ -2120,11 +2122,59 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
 
         >>> engine.execute("SELECT * FROM integers").fetchall()
         [(1,), (None,), (2,)]
+
+        Insertion method:
+
+        .. versionadded:: 0.24.0
+
+        The parameter ``method`` controls the SQL insertion clause used.
+        Possible values are:
+
+         - `'default'`: Uses standard SQL `INSERT` clause
+         - `'multi'`: Pass multiple values in a single `INSERT` clause.
+            It uses a **special** SQL syntax not supported by all backends.
+            This usually provides a big performance for Analytic databases
+            like *Presto* and *Redshit*, but has worse performance for
+            traditional SQL backend if the table contains many columns.
+            For more information check SQLAlchemy `documention <http://docs.sqlalchemy.org/en/latest/core/dml.html?highlight=multivalues#sqlalchemy.sql.expression.Insert.values.params.*args>`__.
+         - callable: with signature `(pd_table, conn, keys, data_iter)`.
+            This can be used to implement more performant insertion based on
+            specific backend dialect features.
+            I.e. using *Postgresql* `COPY clause
+            <https://www.postgresql.org/docs/current/static/sql-copy.html>`__.
+            Check API for details and a sample implementation
+            :func:`~pandas.DataFrame.to_sql`.
+
+
+        Example of callable for Postgresql *COPY*::
+
+          # Alternative to_sql() *method* for DBs that support COPY FROM
+          import csv
+          from io import StringIO
+
+          def psql_insert_copy(table, conn, keys, data_iter):
+              # gets a DBAPI connection that can provide a cursor
+              dbapi_conn = conn.connection
+              with dbapi_conn.cursor() as cur:
+                  s_buf = StringIO()
+                  writer = csv.writer(s_buf)
+                  writer.writerows(data_iter)
+                  s_buf.seek(0)
+
+                  columns = ', '.join('"{}"'.format(k) for k in keys)
+                  if table.schema:
+                      table_name = '{}.{}'.format(table.schema, table.name)
+                  else:
+                      table_name = table.name
+
+                  sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
+                      table_name, columns)
+                  cur.copy_expert(sql=sql, file=s_buf)
         """
         from pandas.io import sql
         sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,
                    index=index, index_label=index_label, chunksize=chunksize,
-                   dtype=dtype)
+                   dtype=dtype, method=method)
 
     def to_pickle(self, path, compression='infer',
                   protocol=pkl.HIGHEST_PROTOCOL):
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -6,6 +6,7 @@
 
 from __future__ import print_function, division
 from datetime import datetime, date, time
+from functools import partial
 
 import warnings
 import re
@@ -398,7 +399,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
 
 
 def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
-           index_label=None, chunksize=None, dtype=None):
+           index_label=None, chunksize=None, dtype=None, method='default'):
     """
     Write records stored in a DataFrame to a SQL database.
 
@@ -432,6 +433,8 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
         Optional specifying the datatype for columns. The SQL type should
         be a SQLAlchemy type, or a string for sqlite3 fallback connection.
         If all columns are of the same type, one single value can be used.
+    method : {'default', 'multi', callable}, default 'default'
+        Controls the SQL insertion clause used.
 
     """
     if if_exists not in ('fail', 'replace', 'append'):
@@ -447,7 +450,7 @@ def to_sql(frame, name, con, schema=None, if_exists='fail', index=True,
 
     pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
                       index_label=index_label, schema=schema,
-                      chunksize=chunksize, dtype=dtype)
+                      chunksize=chunksize, dtype=dtype, method=method)
 
 
 def has_table(table_name, con, schema=None):
@@ -572,8 +575,25 @@ def create(self):
         else:
             self._execute_create()
 
-    def insert_statement(self):
-        return self.table.insert()
+    def _execute_insert(self, conn, keys, data_iter):
+        """Execute SQL statement inserting data
+
+        Parameters
+        ----------
+        data : list of list
+           of values to be inserted
+        """
+        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+        conn.execute(self.table.insert(), data)
+
+    def _execute_insert_multi(self, conn, keys, data_iter):
+        """Alternative to _exec_insert for DBs that support multivalue INSERT.
+
+        Note: multi-value insert is usually faster for a few columns
+        but performance degrades quickly with increase of columns.
+        """
+        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+        conn.execute(self.table.insert(data))
 
     def insert_data(self):
         if self.index is not None:
@@ -611,11 +631,18 @@ def insert_data(self):
 
         return column_names, data_list
 
-    def _execute_insert(self, conn, keys, data_iter):
-        data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
-        conn.execute(self.insert_statement(), data)
+    def insert(self, chunksize=None, method=None):
+
+        # set insert method
+        if method in (None, 'default'):
+            exec_insert = self._execute_insert
+        elif method == 'multi':
+            exec_insert = self._execute_insert_multi
+        elif callable(method):
+            exec_insert = partial(method, self)
+        else:
+            raise ValueError('Invalid parameter `method`: {}'.format(method))
 
-    def insert(self, chunksize=None):
         keys, data_list = self.insert_data()
 
         nrows = len(self.frame)
@@ -638,7 +665,7 @@ def insert(self, chunksize=None):
                     break
 
                 chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list])
-                self._execute_insert(conn, keys, chunk_iter)
+                exec_insert(conn, keys, chunk_iter)
 
     def _query_iterator(self, result, chunksize, columns, coerce_float=True,
                         parse_dates=None):
@@ -1078,7 +1105,8 @@ def read_query(self, sql, index_col=None, coerce_float=True,
     read_sql = read_query
 
     def to_sql(self, frame, name, if_exists='fail', index=True,
-               index_label=None, schema=None, chunksize=None, dtype=None):
+               index_label=None, schema=None, chunksize=None, dtype=None,
+               method='default'):
         """
         Write records stored in a DataFrame to a SQL database.
 
@@ -1108,7 +1136,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
             Optional specifying the datatype for columns. The SQL type should
             be a SQLAlchemy type. If all columns are of the same type, one
             single value can be used.
-
+        method : {'default', 'multi', callable}, default 'default'
+            Controls the SQL insertion clause used.
         """
         if dtype and not is_dict_like(dtype):
             dtype = {col_name: dtype for col_name in frame}
@@ -1124,7 +1153,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
                          if_exists=if_exists, index_label=index_label,
                          schema=schema, dtype=dtype)
         table.create()
-        table.insert(chunksize)
+        table.insert(chunksize, method=method)
         if (not name.isdigit() and not name.islower()):
             # check for potentially case sensitivity issues (GH7815)
             # Only check when name is not a number and name is not lower case
@@ -1434,7 +1463,8 @@ def _fetchall_as_list(self, cur):
         return result
 
     def to_sql(self, frame, name, if_exists='fail', index=True,
-               index_label=None, schema=None, chunksize=None, dtype=None):
+               index_label=None, schema=None, chunksize=None, dtype=None,
+               method='default'):
         """
         Write records stored in a DataFrame to a SQL database.
 
@@ -1463,7 +1493,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
             Optional specifying the datatype for columns. The SQL type should
             be a string. If all columns are of the same type, one single value
             can be used.
-
+        method : {'default', 'multi', callable}, default 'default'
+            Controls the SQL insertion clause used.
         """
         if dtype and not is_dict_like(dtype):
             dtype = {col_name: dtype for col_name in frame}
@@ -1478,7 +1509,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
                             if_exists=if_exists, index_label=index_label,
                             dtype=dtype)
         table.create()
-        table.insert(chunksize)
+        table.insert(chunksize, method)
 
     def has_table(self, name, schema=None):
         # TODO(wesm): unused?
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -372,12 +372,16 @@ def _read_sql_iris_named_parameter(self):
         iris_frame = self.pandasSQL.read_query(query, params=params)
         self._check_iris_loaded_frame(iris_frame)
 
-    def _to_sql(self):
+    def _to_sql(self, method=None):
         self.drop_table('test_frame1')
 
-        self.pandasSQL.to_sql(self.test_frame1, 'test_frame1')
+        self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=method)
         assert self.pandasSQL.has_table('test_frame1')
 
+        num_entries = len(self.test_frame1)
+        num_rows = self._count_rows('test_frame1')
+        assert num_rows == num_entries
+
         # Nuke table
         self.drop_table('test_frame1')
 
@@ -431,6 +435,25 @@ def _to_sql_append(self):
         assert num_rows == num_entries
         self.drop_table('test_frame1')
 
+    def _to_sql_method_callable(self):
+        check = []  # used to double check function below is really being used
+
+        def sample(pd_table, conn, keys, data_iter):
+            check.append(1)
+            data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
+            conn.execute(pd_table.table.insert(), data)
+        self.drop_table('test_frame1')
+
+        self.pandasSQL.to_sql(self.test_frame1, 'test_frame1', method=sample)
+        assert self.pandasSQL.has_table('test_frame1')
+
+        assert check == [1]
+        num_entries = len(self.test_frame1)
+        num_rows = self._count_rows('test_frame1')
+        assert num_rows == num_entries
+        # Nuke table
+        self.drop_table('test_frame1')
+
     def _roundtrip(self):
         self.drop_table('test_frame_roundtrip')
         self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip')
@@ -1180,7 +1203,7 @@ def setup_connect(self):
             pytest.skip(
                 "Can't connect to {0} server".format(self.flavor))
 
-    def test_aread_sql(self):
+    def test_read_sql(self):
         self._read_sql_iris()
 
     def test_read_sql_parameter(self):
@@ -1204,6 +1227,12 @@ def test_to_sql_replace(self):
     def test_to_sql_append(self):
         self._to_sql_append()
 
+    def test_to_sql_method_multi(self):
+        self._to_sql(method='multi')
+
+    def test_to_sql_method_callable(self):
+        self._to_sql_method_callable()
+
     def test_create_table(self):
         temp_conn = self.connect()
         temp_frame = DataFrame(