ENH: sql support for writing NaN + datetime64 values (GH2754, GH7103) #8208

Merged (2 commits) on Sep 13, 2014
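As a quick illustration of the behaviour this PR enables (a minimal sketch; the in-memory SQLite engine and the table name are assumptions made purely for demonstration), a frame containing both NaN and datetime64 values can now be written with to_sql and read back:

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# hypothetical in-memory database, purely for illustration
engine = create_engine('sqlite:///:memory:')

df = pd.DataFrame({'A': pd.date_range('2013-01-01 09:00:00', periods=3),
                   'B': [0.2, np.nan, 5.6]})

# NaN is written as NULL and datetime64 values as datetime.datetime
df.to_sql('demo', engine, index=False)

result = pd.read_sql_table('demo', engine)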
5 changes: 4 additions & 1 deletion doc/source/v0.15.0.txt
@@ -494,6 +494,9 @@ Enhancements
df.to_sql('table', engine, schema='other_schema')
pd.read_sql_table('table', engine, schema='other_schema')

- Added support for writing ``NaN`` values with ``to_sql`` (:issue:`2754`).
- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`).

- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)

- Added ``layout`` keyword to ``DataFrame.plot`` (:issue:`6667`)
@@ -573,7 +576,7 @@ Performance
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)

- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).



68 changes: 40 additions & 28 deletions pandas/io/sql.py
@@ -15,6 +15,7 @@
import pandas.core.common as com
from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
from pandas.core.api import DataFrame, Series
from pandas.core.common import notnull, isnull
from pandas.core.base import PandasObject
from pandas.tseries.tools import to_datetime

@@ -598,12 +599,6 @@ def create(self):
def insert_statement(self):
return self.table.insert()

def maybe_asscalar(self, i):
try:
return np.asscalar(i)
except AttributeError:
return i

def insert_data(self):
if self.index is not None:
temp = self.frame.copy()
Member Author:

@jreback making a copy just to reset the index, so that the index can also be inserted into the SQL table, seems a bit stupid.
It seems more logical if I could add these separately to the data_list. E.g. I could do something like:

if self.index:
    b_index = self.frame.index.to_series()._data.blocks[0]
    blocks = b_index + blocks

(and then of course taking possibly different levels into account, but you get the idea)

But is there a better way to get the index values into a block? (That is needed for the datetime and NaN handling, as both can also occur in the index.)

Contributor:

I don't think making a copy is a big deal; it's quite cheap to do (and much more time is spent on the conversions to tuples anyhow). You could reset only if self.index is True, I guess. Too complicated otherwise.

Member Author:

Yes, that is what happens now: the copy is only made if index=True.
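For reference, a minimal sketch of that path (illustrative frame and index name, not the actual pandas internals): the frame is only copied when the index is to be written, and the index level(s) are lifted into ordinary columns so they go through the same block-wise conversion as the data columns.

import numpy as np
import pandas as pd

frame = pd.DataFrame({'A': [1.0, np.nan]},
                     index=pd.Index(['x', 'y'], name='idx'))
write_index = True   # corresponds to index=True in to_sql

if write_index:
    # reset_index returns a new frame, so the caller's frame is untouched,
    # and the index level(s) become ordinary columns
    temp = frame.reset_index()
else:
    temp = frame

print(list(temp.columns))   # ['idx', 'A']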

@@ -615,16 +610,36 @@ def insert_data(self):
"duplicate name in index/columns: {0}".format(err))
else:
temp = self.frame

column_names = list(map(str, temp.columns))
ncols = len(column_names)
data_list = [None] * ncols
blocks = temp._data.blocks

for i in range(len(blocks)):
b = blocks[i]
if b.is_datetime:
# convert to microsecond resolution so this yields datetime.datetime
d = b.values.astype('M8[us]').astype(object)
else:
d = np.array(b.values, dtype=object)

# replace NaN with None
if b._can_hold_na:
mask = isnull(d)
d[mask] = None

return temp
for col_loc, col in zip(b.mgr_locs, d):
data_list[col_loc] = col

return column_names, data_list

def insert(self, chunksize=None):

ins = self.insert_statement()
temp = self.insert_data()
keys = list(map(str, temp.columns))
keys, data_list = self.insert_data()

nrows = len(temp)
nrows = len(self.frame)
if chunksize is None:
chunksize = nrows
chunks = int(nrows / chunksize) + 1
@@ -636,12 +651,11 @@
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break
data_list = []
for t in temp.iloc[start_i:end_i].itertuples():
data = dict((k, self.maybe_asscalar(v))
for k, v in zip(keys, t[1:]))
data_list.append(data)
con.execute(ins, data_list)

chunk_list = [arr[start_i:end_i] for arr in data_list]
insert_list = [dict((k, v) for k, v in zip(keys, row))
for row in zip(*chunk_list)]
con.execute(ins, insert_list)

def read(self, coerce_float=True, parse_dates=None, columns=None):
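A standalone sketch of what the block-wise conversion in insert_data above produces (illustrative values, not the pandas internals): datetime64 data is cast to microsecond resolution and then to object, so each element becomes a datetime.datetime, and missing values are replaced by None so the driver writes NULL.

import numpy as np
import pandas as pd

dt_values = pd.date_range('2013-01-01 09:00:00', periods=3).values  # datetime64[ns]
d = dt_values.astype('M8[us]').astype(object)   # elements are datetime.datetime

floats = np.array([0.2, np.nan, 5.6], dtype=object)
mask = pd.isnull(floats)
floats[mask] = None                             # NaN -> None -> SQL NULL

print(type(d[0]), floats.tolist())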

@@ -758,12 +772,12 @@ def _harmonize_columns(self, parse_dates=None):

elif col_type is float:
# floats support NA, can always convert!
self.frame[col_name].astype(col_type, copy=False)
self.frame[col_name] = df_col.astype(col_type, copy=False)

elif len(df_col) == df_col.count():
# No NA values, can convert ints and bools
if col_type is int or col_type is bool:
self.frame[col_name].astype(col_type, copy=False)
if col_type is np.dtype('int64') or col_type is bool:
self.frame[col_name] = df_col.astype(col_type, copy=False)

# Handle date parsing
if col_name in parse_dates:
@@ -813,7 +827,7 @@ def _numpy_type(self, sqltype):
return float
if isinstance(sqltype, Integer):
# TODO: Refine integer size.
return int
return np.dtype('int64')
if isinstance(sqltype, DateTime):
# Caution: np.datetime64 is also a subclass of np.number.
return datetime
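The guard on len(df_col) == df_col.count() above, together with _numpy_type returning np.dtype('int64'), can be sketched as follows (made-up values): a column coming back from the driver is only cast back to an integer dtype when it has no missing values, because an int column cannot hold NaN.

import numpy as np
import pandas as pd

df_col = pd.Series([1, None, 3], dtype=object)   # as a DB driver might return it

if len(df_col) == df_col.count():
    # no missing values: safe to cast to the target integer dtype
    df_col = df_col.astype(np.dtype('int64'))
else:
    # missing values present: leave the column alone (int cannot hold NaN)
    pass

print(df_col.dtype)   # stays object here; an NA-free column would become int64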
@@ -1008,9 +1022,9 @@ def insert_statement(self):
def insert(self, chunksize=None):

ins = self.insert_statement()
temp = self.insert_data()
keys, data_list = self.insert_data()

nrows = len(temp)
nrows = len(self.frame)
if chunksize is None:
chunksize = nrows
chunks = int(nrows / chunksize) + 1
@@ -1021,13 +1035,11 @@
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break
data_list = []
for t in temp.iloc[start_i:end_i].itertuples():
data = tuple((self.maybe_asscalar(v) for v in t[1:]))
data_list.append(data)

chunk_list = [arr[start_i:end_i] for arr in data_list]
insert_list = [tuple((v for v in row))
for row in zip(*chunk_list)]
cur = self.pd_sql.con.cursor()
cur.executemany(ins, data_list)
cur.executemany(ins, insert_list)
cur.close()

def _create_table_setup(self):
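Both insert paths now share the same shape of data, sketched here with made-up values: insert_data returns one object array per column, each chunk slices those arrays, and zip(*chunk_list) transposes the column slices into the per-row tuples (or dicts, for the SQLAlchemy path) that execute/executemany expect.

import numpy as np

# per-column object arrays, as insert_data would return them
data_list = [np.array([1, 2, 3, 4], dtype=object),
             np.array(['a', 'b', None, 'd'], dtype=object)]

start_i, end_i = 0, 2                        # one chunk of two rows
chunk_list = [arr[start_i:end_i] for arr in data_list]
insert_list = [tuple(row) for row in zip(*chunk_list)]

print(insert_list)                           # [(1, 'a'), (2, 'b')]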
37 changes: 5 additions & 32 deletions pandas/io/tests/test_sql.py
@@ -952,9 +952,6 @@ def test_date_parsing(self):
"IntDateCol loaded with incorrect type")

def test_datetime(self):
if self.driver == 'pymysql':
raise nose.SkipTest('writing datetime not working with pymysql')

df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
'B': np.arange(3.0)})
df.to_sql('test_datetime', self.conn)
@@ -975,17 +972,6 @@ def test_datetime(self):
tm.assert_frame_equal(result, df)

def test_datetime_NaT(self):
# status:
# - postgresql: gives error on inserting "0001-255-255T00:00:00"
# - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001'

if self.driver == 'pymysql':
raise nose.SkipTest('writing datetime not working with pymysql')
if self.driver == 'psycopg2':
raise nose.SkipTest('writing datetime NaT not working with psycopg2')
if self.flavor == 'sqlite':
raise nose.SkipTest('reading datetime NaT not working with sqlite')

df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
'B': np.arange(3.0)})
df.loc[1, 'A'] = np.nan
@@ -1032,9 +1018,6 @@ def test_mixed_dtype_insert(self):
tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)

def test_nan_numeric(self):
if self.driver == 'pymysql':
raise nose.SkipTest('writing NaNs not working with pymysql')

# NaNs in numeric float column
df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]})
df.to_sql('test_nan', self.conn, index=False)
@@ -1048,37 +1031,27 @@ def test_nan_fullcolumn(self):
tm.assert_frame_equal(result, df)

def test_nan_fullcolumn(self):
if self.driver == 'pymysql':
raise nose.SkipTest('writing NaNs not working with pymysql')

# full NaN column (numeric float column)
df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]})
df.to_sql('test_nan', self.conn, index=False)

if self.flavor == 'sqlite':
df['B'] = df['B'].astype('object')
df['B'] = None

# with read_table
result = sql.read_sql_table('test_nan', self.conn)
tm.assert_frame_equal(result, df)

# with read_sql
# with read_sql -> no type info from table -> stays None
df['B'] = df['B'].astype('object')
df['B'] = None
result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
tm.assert_frame_equal(result, df)

def test_nan_string(self):
if self.driver == 'pymysql':
raise nose.SkipTest('writing NaNs not working with pymysql')

# NaNs in string column
df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]})
df.to_sql('test_nan', self.conn, index=False)

if self.flavor == 'sqlite':
df.loc[2, 'B'] = None
elif self.flavor == 'postgresql':
df = df.fillna('NaN')
# NaNs are coming back as None
df.loc[2, 'B'] = None

# with read_table
result = sql.read_sql_table('test_nan', self.conn)