Skip to content

Commit b8c36be

Browse files
Merge pull request #8062 from artemyk/to_sql_chunksize
Added chunksize argument to to_sql
2 parents 4411ab6 + 3c43c95 commit b8c36be

File tree

5 files changed

+72
-26
lines changed

5 files changed

+72
-26
lines changed

doc/source/io.rst

+6
Original file line numberDiff line numberDiff line change
@@ -3267,6 +3267,12 @@ the database using :func:`~pandas.DataFrame.to_sql`.
32673267
32683268
data.to_sql('data', engine)
32693269
3270+
With some databases, writing large DataFrames can result in errors due to packet size limitations being exceeded. This can be avoided by setting the ``chunksize`` parameter when calling ``to_sql``. For example, the following writes ``data`` to the database in batches of 1000 rows at a time:
3271+
3272+
.. ipython:: python
3273+
3274+
data.to_sql('data', engine, chunksize=1000)
3275+
32703276
.. note::
32713277

32723278
Due to the limited support for timedelta's in the different database

doc/source/v0.15.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,9 @@ Known Issues
425425

426426
Enhancements
427427
~~~~~~~~~~~~
428+
429+
- Added support for a ``chunksize`` parameter to ``to_sql`` function. This allows a DataFrame to be written in chunks, avoiding packet-size overflow errors (:issue:`8062`)
430+
428431
- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
429432

430433
- Added ``layout`` keyword to ``DataFrame.plot`` (:issue:`6667`)

pandas/core/generic.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -916,7 +916,7 @@ def to_msgpack(self, path_or_buf=None, **kwargs):
916916
return packers.to_msgpack(path_or_buf, self, **kwargs)
917917

918918
def to_sql(self, name, con, flavor='sqlite', if_exists='fail', index=True,
919-
index_label=None):
919+
index_label=None, chunksize=None):
920920
"""
921921
Write records stored in a DataFrame to a SQL database.
922922
@@ -942,12 +942,15 @@ def to_sql(self, name, con, flavor='sqlite', if_exists='fail', index=True,
942942
Column label for index column(s). If None is given (default) and
943943
`index` is True, then the index names are used.
944944
A sequence should be given if the DataFrame uses MultiIndex.
945+
chunksize : int, default None
946+
If not None, then rows will be written in batches of this size at a
947+
time. If None, all rows will be written at once.
945948
946949
"""
947950
from pandas.io import sql
948951
sql.to_sql(
949952
self, name, con, flavor=flavor, if_exists=if_exists, index=index,
950-
index_label=index_label)
953+
index_label=index_label, chunksize=chunksize)
951954

952955
def to_pickle(self, path):
953956
"""

pandas/io/sql.py

+50-24
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None,
432432

433433

434434
def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True,
435-
index_label=None):
435+
index_label=None, chunksize=None):
436436
"""
437437
Write records stored in a DataFrame to a SQL database.
438438
@@ -459,6 +459,9 @@ def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True,
459459
Column label for index column(s). If None is given (default) and
460460
`index` is True, then the index names are used.
461461
A sequence should be given if the DataFrame uses MultiIndex.
462+
chunksize : int, default None
463+
If not None, then rows will be written in batches of this size at a
464+
time. If None, all rows will be written at once.
462465
463466
"""
464467
if if_exists not in ('fail', 'replace', 'append'):
@@ -472,7 +475,7 @@ def to_sql(frame, name, con, flavor='sqlite', if_exists='fail', index=True,
472475
raise NotImplementedError
473476

474477
pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index,
475-
index_label=index_label)
478+
index_label=index_label, chunksize=chunksize)
476479

477480

478481
def has_table(table_name, con, flavor='sqlite'):
@@ -597,18 +600,30 @@ def insert_data(self):
597600

598601
return temp
599602

600-
def insert(self):
603+
def insert(self, chunksize=None):
604+
601605
ins = self.insert_statement()
602-
data_list = []
603606
temp = self.insert_data()
604607
keys = list(map(str, temp.columns))
605608

606-
for t in temp.itertuples():
607-
data = dict((k, self.maybe_asscalar(v))
608-
for k, v in zip(keys, t[1:]))
609-
data_list.append(data)
610-
611-
self.pd_sql.execute(ins, data_list)
609+
nrows = len(temp)
610+
if chunksize is None:
611+
chunksize = nrows
612+
chunks = int(nrows / chunksize) + 1
613+
614+
con = self.pd_sql.engine.connect()
615+
with con.begin() as trans:
616+
for i in range(chunks):
617+
start_i = i * chunksize
618+
end_i = min((i + 1) * chunksize, nrows)
619+
if start_i >= end_i:
620+
break
621+
data_list = []
622+
for t in temp.iloc[start_i:end_i].itertuples():
623+
data = dict((k, self.maybe_asscalar(v))
624+
for k, v in zip(keys, t[1:]))
625+
data_list.append(data)
626+
con.execute(ins, data_list)
612627

613628
def read(self, coerce_float=True, parse_dates=None, columns=None):
614629

@@ -843,11 +858,11 @@ def read_sql(self, sql, index_col=None, coerce_float=True,
843858
return data_frame
844859

845860
def to_sql(self, frame, name, if_exists='fail', index=True,
           index_label=None, chunksize=None):
    # Wrap the frame in a PandasSQLTable, then delegate the actual row
    # writes (optionally batched by ``chunksize``) to its insert method.
    table = PandasSQLTable(name, self, frame=frame, index=index,
                           if_exists=if_exists, index_label=index_label)
    table.insert(chunksize)
851866

852867
@property
853868
def tables(self):
@@ -948,19 +963,30 @@ def insert_statement(self):
948963
self.name, col_names, wildcards)
949964
return insert_statement
950965

def insert(self, chunksize=None):
    """Write this table's DataFrame rows to the database via DBAPI.

    Parameters
    ----------
    chunksize : int, default None
        If not None, rows are written in batches of this size, each batch
        as one ``executemany`` call.  If None, all rows are written at
        once.

    Raises
    ------
    ValueError
        If ``chunksize`` is 0.
    """
    ins = self.insert_statement()
    temp = self.insert_data()

    nrows = len(temp)
    # An empty frame needs no INSERT; without this guard the default
    # chunksize below would become 0 and the division would raise
    # ZeroDivisionError.
    if nrows == 0:
        return
    if chunksize is None:
        chunksize = nrows
    elif chunksize == 0:
        raise ValueError("chunksize argument should be non-zero")
    chunks = int(nrows / chunksize) + 1

    # Using the connection as a context manager commits on success and
    # rolls back on error (DBAPI connection-context protocol).
    with self.pd_sql.con:
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            # chunks is rounded up, so the last index may be empty.
            if start_i >= end_i:
                break
            data_list = []
            for t in temp.iloc[start_i:end_i].itertuples():
                data = tuple((self.maybe_asscalar(v) for v in t[1:]))
                data_list.append(data)

            cur = self.pd_sql.con.cursor()
            cur.executemany(ins, data_list)
            cur.close()
964990

965991
def _create_table_statement(self):
966992
"Return a CREATE TABLE statement to suit the contents of a DataFrame."
@@ -1069,7 +1095,7 @@ def _fetchall_as_list(self, cur):
10691095
return result
10701096

10711097
def to_sql(self, frame, name, if_exists='fail', index=True,
1072-
index_label=None):
1098+
index_label=None, chunksize=None):
10731099
"""
10741100
Write records stored in a DataFrame to a SQL database.
10751101
@@ -1087,7 +1113,7 @@ def to_sql(self, frame, name, if_exists='fail', index=True,
10871113
table = PandasSQLTableLegacy(
10881114
name, self, frame=frame, index=index, if_exists=if_exists,
10891115
index_label=index_label)
1090-
table.insert()
1116+
table.insert(chunksize)
10911117

10921118
def has_table(self, name):
10931119
flavor_map = {

pandas/io/tests/test_sql.py

+8
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,14 @@ def test_roundtrip(self):
455455
result.index.name = None
456456
tm.assert_frame_equal(result, self.test_frame1)
457457

def test_roundtrip_chunksize(self):
    """Write a frame in batches of 2 rows and check it round-trips."""
    sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn,
               index=False, flavor='sqlite', chunksize=2)
    fetched = sql.read_sql_query('SELECT * FROM test_frame_roundtrip',
                                 con=self.conn)
    tm.assert_frame_equal(fetched, self.test_frame1)
465+
458466
def test_execute_sql(self):
459467
# drop_sql = "DROP TABLE IF EXISTS test" # should already be done
460468
iris_results = sql.execute("SELECT * FROM iris", con=self.conn)

0 commit comments

Comments
 (0)