Skip to content

Commit e1cd966

Browse files
Merge pull request #8208 from jorisvandenbossche/sql-2754-nan
ENH: sql support for writing NaN + datetime64 values (GH2754, GH7103)
2 parents 41cc8cc + 638fb5b commit e1cd966

File tree

3 files changed

+49
-61
lines changed

3 files changed

+49
-61
lines changed

doc/source/v0.15.0.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,9 @@ Enhancements
494494
df.to_sql('table', engine, schema='other_schema')
495495
pd.read_sql_table('table', engine, schema='other_schema')
496496

497+
- Added support for writing ``NaN`` values with ``to_sql`` (:issue:`2754`).
498+
- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`).
499+
497500
- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
498501

499502
- Added ``layout`` keyword to ``DataFrame.plot`` (:issue:`6667`)
@@ -573,7 +576,7 @@ Performance
573576
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
574577
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
575578
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
576-
579+
- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
577580

578581

579582

pandas/io/sql.py

+40-28
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pandas.core.common as com
1616
from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
1717
from pandas.core.api import DataFrame, Series
18+
from pandas.core.common import notnull, isnull
1819
from pandas.core.base import PandasObject
1920
from pandas.tseries.tools import to_datetime
2021

@@ -598,12 +599,6 @@ def create(self):
598599
def insert_statement(self):
599600
return self.table.insert()
600601

601-
def maybe_asscalar(self, i):
602-
try:
603-
return np.asscalar(i)
604-
except AttributeError:
605-
return i
606-
607602
def insert_data(self):
608603
if self.index is not None:
609604
temp = self.frame.copy()
@@ -615,16 +610,36 @@ def insert_data(self):
615610
"duplicate name in index/columns: {0}".format(err))
616611
else:
617612
temp = self.frame
613+
614+
column_names = list(map(str, temp.columns))
615+
ncols = len(column_names)
616+
data_list = [None] * ncols
617+
blocks = temp._data.blocks
618+
619+
for i in range(len(blocks)):
620+
b = blocks[i]
621+
if b.is_datetime:
622+
# convert to microsecond resolution so this yields datetime.datetime
623+
d = b.values.astype('M8[us]').astype(object)
624+
else:
625+
d = np.array(b.values, dtype=object)
626+
627+
# replace NaN with None
628+
if b._can_hold_na:
629+
mask = isnull(d)
630+
d[mask] = None
618631

619-
return temp
632+
for col_loc, col in zip(b.mgr_locs, d):
633+
data_list[col_loc] = col
634+
635+
return column_names, data_list
620636

621637
def insert(self, chunksize=None):
622638

623639
ins = self.insert_statement()
624-
temp = self.insert_data()
625-
keys = list(map(str, temp.columns))
640+
keys, data_list = self.insert_data()
626641

627-
nrows = len(temp)
642+
nrows = len(self.frame)
628643
if chunksize is None:
629644
chunksize = nrows
630645
chunks = int(nrows / chunksize) + 1
@@ -636,12 +651,11 @@ def insert(self, chunksize=None):
636651
end_i = min((i + 1) * chunksize, nrows)
637652
if start_i >= end_i:
638653
break
639-
data_list = []
640-
for t in temp.iloc[start_i:end_i].itertuples():
641-
data = dict((k, self.maybe_asscalar(v))
642-
for k, v in zip(keys, t[1:]))
643-
data_list.append(data)
644-
con.execute(ins, data_list)
654+
655+
chunk_list = [arr[start_i:end_i] for arr in data_list]
656+
insert_list = [dict((k, v) for k, v in zip(keys, row))
657+
for row in zip(*chunk_list)]
658+
con.execute(ins, insert_list)
645659

646660
def read(self, coerce_float=True, parse_dates=None, columns=None):
647661

@@ -758,12 +772,12 @@ def _harmonize_columns(self, parse_dates=None):
758772

759773
elif col_type is float:
760774
# floats support NA, can always convert!
761-
self.frame[col_name].astype(col_type, copy=False)
775+
self.frame[col_name] = df_col.astype(col_type, copy=False)
762776

763777
elif len(df_col) == df_col.count():
764778
# No NA values, can convert ints and bools
765-
if col_type is int or col_type is bool:
766-
self.frame[col_name].astype(col_type, copy=False)
779+
if col_type is np.dtype('int64') or col_type is bool:
780+
self.frame[col_name] = df_col.astype(col_type, copy=False)
767781

768782
# Handle date parsing
769783
if col_name in parse_dates:
@@ -813,7 +827,7 @@ def _numpy_type(self, sqltype):
813827
return float
814828
if isinstance(sqltype, Integer):
815829
# TODO: Refine integer size.
816-
return int
830+
return np.dtype('int64')
817831
if isinstance(sqltype, DateTime):
818832
# Caution: np.datetime64 is also a subclass of np.number.
819833
return datetime
@@ -1008,9 +1022,9 @@ def insert_statement(self):
10081022
def insert(self, chunksize=None):
10091023

10101024
ins = self.insert_statement()
1011-
temp = self.insert_data()
1025+
keys, data_list = self.insert_data()
10121026

1013-
nrows = len(temp)
1027+
nrows = len(self.frame)
10141028
if chunksize is None:
10151029
chunksize = nrows
10161030
chunks = int(nrows / chunksize) + 1
@@ -1021,13 +1035,11 @@ def insert(self, chunksize=None):
10211035
end_i = min((i + 1) * chunksize, nrows)
10221036
if start_i >= end_i:
10231037
break
1024-
data_list = []
1025-
for t in temp.iloc[start_i:end_i].itertuples():
1026-
data = tuple((self.maybe_asscalar(v) for v in t[1:]))
1027-
data_list.append(data)
1028-
1038+
chunk_list = [arr[start_i:end_i] for arr in data_list]
1039+
insert_list = [tuple((v for v in row))
1040+
for row in zip(*chunk_list)]
10291041
cur = self.pd_sql.con.cursor()
1030-
cur.executemany(ins, data_list)
1042+
cur.executemany(ins, insert_list)
10311043
cur.close()
10321044

10331045
def _create_table_setup(self):

pandas/io/tests/test_sql.py

+5-32
Original file line numberDiff line numberDiff line change
@@ -952,9 +952,6 @@ def test_date_parsing(self):
952952
"IntDateCol loaded with incorrect type")
953953

954954
def test_datetime(self):
955-
if self.driver == 'pymysql':
956-
raise nose.SkipTest('writing datetime not working with pymysql')
957-
958955
df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
959956
'B': np.arange(3.0)})
960957
df.to_sql('test_datetime', self.conn)
@@ -975,17 +972,6 @@ def test_datetime(self):
975972
tm.assert_frame_equal(result, df)
976973

977974
def test_datetime_NaT(self):
978-
# status:
979-
# - postgresql: gives error on inserting "0001-255-255T00:00:00"
980-
# - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001'
981-
982-
if self.driver == 'pymysql':
983-
raise nose.SkipTest('writing datetime not working with pymysql')
984-
if self.driver == 'psycopg2':
985-
raise nose.SkipTest('writing datetime NaT not working with psycopg2')
986-
if self.flavor == 'sqlite':
987-
raise nose.SkipTest('reading datetime NaT not working with sqlite')
988-
989975
df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
990976
'B': np.arange(3.0)})
991977
df.loc[1, 'A'] = np.nan
@@ -1032,9 +1018,6 @@ def test_mixed_dtype_insert(self):
10321018
tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)
10331019

10341020
def test_nan_numeric(self):
1035-
if self.driver == 'pymysql':
1036-
raise nose.SkipTest('writing NaNs not working with pymysql')
1037-
10381021
# NaNs in numeric float column
10391022
df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]})
10401023
df.to_sql('test_nan', self.conn, index=False)
@@ -1048,37 +1031,27 @@ def test_nan_numeric(self):
10481031
tm.assert_frame_equal(result, df)
10491032

10501033
def test_nan_fullcolumn(self):
1051-
if self.driver == 'pymysql':
1052-
raise nose.SkipTest('writing NaNs not working with pymysql')
1053-
10541034
# full NaN column (numeric float column)
10551035
df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]})
10561036
df.to_sql('test_nan', self.conn, index=False)
10571037

1058-
if self.flavor == 'sqlite':
1059-
df['B'] = df['B'].astype('object')
1060-
df['B'] = None
1061-
10621038
# with read_table
10631039
result = sql.read_sql_table('test_nan', self.conn)
10641040
tm.assert_frame_equal(result, df)
10651041

1066-
# with read_sql
1042+
# with read_sql -> no type info from table -> stays None
1043+
df['B'] = df['B'].astype('object')
1044+
df['B'] = None
10671045
result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
10681046
tm.assert_frame_equal(result, df)
10691047

10701048
def test_nan_string(self):
1071-
if self.driver == 'pymysql':
1072-
raise nose.SkipTest('writing NaNs not working with pymysql')
1073-
10741049
# NaNs in string column
10751050
df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]})
10761051
df.to_sql('test_nan', self.conn, index=False)
10771052

1078-
if self.flavor == 'sqlite':
1079-
df.loc[2, 'B'] = None
1080-
elif self.flavor == 'postgresql':
1081-
df = df.fillna('NaN')
1053+
# NaNs are coming back as None
1054+
df.loc[2, 'B'] = None
10821055

10831056
# with read_table
10841057
result = sql.read_sql_table('test_nan', self.conn)

0 commit comments

Comments
 (0)