Commit 638fb5b

ENH: refactor to_sql insert_data - performance improvement (GH8208)
1 parent 019a932 commit 638fb5b

2 files changed: +37 -28 lines changed


doc/source/v0.15.0.txt (+1 -1)

@@ -576,7 +576,7 @@ Performance
 - Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
 - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
 - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
-
+- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
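The entry above claims a speed-up of up to 50% for ``to_sql``. A rough way to check that on a given setup is to time a write into an in-memory SQLite database. The sketch below is not part of the commit; the frame size, table name and engine URL are arbitrary choices for illustration.

import timeit

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# an arbitrary mixed-dtype frame, so both the numeric and datetime paths
# of insert_data() are exercised
df = pd.DataFrame({'a': np.random.randn(10000),
                   'b': np.arange(10000),
                   'c': pd.date_range('2014-01-01', periods=10000, freq='min')})

engine = create_engine('sqlite://')  # in-memory SQLite

def write():
    df.to_sql('bench_table', engine, if_exists='replace', index=False)

print(timeit.timeit(write, number=5))

Running this once against a build from before the commit and once after gives a rough before/after comparison of the write path touched below.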

pandas/io/sql.py (+36 -27)
@@ -15,7 +15,7 @@
 import pandas.core.common as com
 from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
 from pandas.core.api import DataFrame, Series
-from pandas.core.common import notnull
+from pandas.core.common import notnull, isnull
 from pandas.core.base import PandasObject
 from pandas.tseries.tools import to_datetime
@@ -599,12 +599,6 @@ def create(self):
     def insert_statement(self):
         return self.table.insert()

-    def maybe_asscalar(self, i):
-        try:
-            return np.asscalar(i)
-        except AttributeError:
-            return i
-
     def insert_data(self):
         if self.index is not None:
             temp = self.frame.copy()
@@ -617,17 +611,35 @@ def insert_data(self):
         else:
             temp = self.frame

-        temp = temp.astype(object)
-        temp = temp.where(notnull(temp), None)
-        return temp
+        column_names = list(map(str, temp.columns))
+        ncols = len(column_names)
+        data_list = [None] * ncols
+        blocks = temp._data.blocks
+
+        for i in range(len(blocks)):
+            b = blocks[i]
+            if b.is_datetime:
+                # convert to microsecond resolution so this yields datetime.datetime
+                d = b.values.astype('M8[us]').astype(object)
+            else:
+                d = np.array(b.values, dtype=object)
+
+            # replace NaN with None
+            if b._can_hold_na:
+                mask = isnull(d)
+                d[mask] = None
+
+            for col_loc, col in zip(b.mgr_locs, d):
+                data_list[col_loc] = col
+
+        return column_names, data_list

     def insert(self, chunksize=None):

         ins = self.insert_statement()
-        temp = self.insert_data()
-        keys = list(map(str, temp.columns))
+        keys, data_list = self.insert_data()

-        nrows = len(temp)
+        nrows = len(self.frame)
         if chunksize is None:
             chunksize = nrows
         chunks = int(nrows / chunksize) + 1
@@ -639,12 +651,11 @@ def insert(self, chunksize=None):
                 end_i = min((i + 1) * chunksize, nrows)
                 if start_i >= end_i:
                     break
-                data_list = []
-                for t in temp.iloc[start_i:end_i].itertuples():
-                    data = dict((k, self.maybe_asscalar(v))
-                                for k, v in zip(keys, t[1:]))
-                    data_list.append(data)
-                con.execute(ins, data_list)
+
+                chunk_list = [arr[start_i:end_i] for arr in data_list]
+                insert_list = [dict((k, v) for k, v in zip(keys, row))
+                               for row in zip(*chunk_list)]
+                con.execute(ins, insert_list)

     def read(self, coerce_float=True, parse_dates=None, columns=None):
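The heart of the change is ``insert_data``: instead of casting the whole frame to ``object`` and building one dict per row via ``itertuples`` and ``maybe_asscalar``, the values are pulled out column-wise as object ndarrays (through the internal block manager), NaN/NaT are replaced by ``None`` per block, and the row dicts are only formed with ``zip`` when a chunk is actually inserted. The standalone sketch below shows the same column-wise idea using only public DataFrame APIs; the helper name ``columns_to_insert_rows`` is invented for the example and is not part of the commit.

import numpy as np
import pandas as pd
from pandas import isnull


def columns_to_insert_rows(frame):
    # column-wise equivalent of the old row-wise loop (illustrative only)
    keys = [str(c) for c in frame.columns]

    data_list = []
    for col in frame.columns:
        values = frame[col].values
        if values.dtype.kind == 'M':
            # datetime64 -> datetime.datetime at microsecond resolution
            d = values.astype('M8[us]').astype(object)
        else:
            d = np.asarray(values, dtype=object)
        # replace NaN/NaT with None so the DB driver writes NULL
        d[isnull(d)] = None
        data_list.append(d)

    # rows are built lazily, one zip per row, instead of DataFrame.itertuples
    rows = [dict(zip(keys, row)) for row in zip(*data_list)]
    return keys, rows


frame = pd.DataFrame({'a': [1.0, np.nan],
                      'b': pd.to_datetime(['2014-01-01', None])})
keys, rows = columns_to_insert_rows(frame)
print(rows)

Working column by column avoids creating millions of Python scalar objects up front, which is where the reported savings come from.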

@@ -1011,9 +1022,9 @@ def insert_statement(self):
     def insert(self, chunksize=None):

         ins = self.insert_statement()
-        temp = self.insert_data()
+        keys, data_list = self.insert_data()

-        nrows = len(temp)
+        nrows = len(self.frame)
         if chunksize is None:
             chunksize = nrows
         chunks = int(nrows / chunksize) + 1
@@ -1024,13 +1035,11 @@ def insert(self, chunksize=None):
             end_i = min((i + 1) * chunksize, nrows)
             if start_i >= end_i:
                 break
-            data_list = []
-            for t in temp.iloc[start_i:end_i].itertuples():
-                data = tuple((self.maybe_asscalar(v) for v in t[1:]))
-                data_list.append(data)
-
+            chunk_list = [arr[start_i:end_i] for arr in data_list]
+            insert_list = [tuple((v for v in row))
+                           for row in zip(*chunk_list)]
             cur = self.pd_sql.con.cursor()
-            cur.executemany(ins, data_list)
+            cur.executemany(ins, insert_list)
             cur.close()

     def _create_table_setup(self):
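The legacy DBAPI fallback gets the same treatment, except each row ends up as a plain tuple passed to ``cursor.executemany``. As an illustration of that final step only (the table, columns and values below are invented for the example, not taken from the commit):

import sqlite3

# rows as the patched legacy insert() builds them: one tuple per row,
# with None standing in for NaN so it is stored as SQL NULL
keys = ['a', 'b']
rows = [(1.0, 'x'), (None, 'y')]

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE demo (a REAL, b TEXT)')

ins = 'INSERT INTO demo (a, b) VALUES (?, ?)'
cur = con.cursor()
cur.executemany(ins, rows)   # one executemany call per chunk
cur.close()
con.commit()

print(con.execute('SELECT * FROM demo').fetchall())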
