From 019a932a3b2def047f6b86d6b2724cd19d6e876b Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sun, 7 Sep 2014 23:07:28 +0200
Subject: [PATCH 1/2] ENH: SQL support for writing NaN + datetime64 values
 (GH2754, GH7103)

Casting the values to object dtype converts them to native Python types.
For datetime64 columns these are converted to datetime.datetime, which also
fixes the datetime64 issue (supersedes PR GH8205).
The NaN issue is solved by converting all NaN values to None.
---
 doc/source/v0.15.0.txt      |  3 +++
 pandas/io/sql.py            | 13 ++++++++-----
 pandas/io/tests/test_sql.py | 37 +++++--------------------------------
 3 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index f0c3c0e6bc508..3596dfe9c2240 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -494,6 +494,9 @@ Enhancements
   df.to_sql('table', engine, schema='other_schema')
   pd.read_sql_table('table', engine, schema='other_schema')
 
+- Added support for writing ``NaN`` values with ``to_sql`` (:issue:`2754`).
+- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`).
+
 - Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
 
 - Added ``layout`` keyword to ``DataFrame.plot`` (:issue:`6667`)
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index c960a73bb0f88..05db26a815a2e 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -15,6 +15,7 @@
 import pandas.core.common as com
 from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
 from pandas.core.api import DataFrame, Series
+from pandas.core.common import notnull
 from pandas.core.base import PandasObject
 from pandas.tseries.tools import to_datetime
 
@@ -615,7 +616,9 @@ def insert_data(self):
                     "duplicate name in index/columns: {0}".format(err))
         else:
             temp = self.frame
-
+
+        temp = temp.astype(object)
+        temp = temp.where(notnull(temp), None)
         return temp
 
     def insert(self, chunksize=None):
@@ -758,12 +761,12 @@ def _harmonize_columns(self, parse_dates=None):
 
                 elif col_type is float:
                     # floats support NA, can always convert!
-                    self.frame[col_name].astype(col_type, copy=False)
+                    self.frame[col_name] = df_col.astype(col_type, copy=False)
 
                 elif len(df_col) == df_col.count():
                     # No NA values, can convert ints and bools
-                    if col_type is int or col_type is bool:
-                        self.frame[col_name].astype(col_type, copy=False)
+                    if col_type is np.dtype('int64') or col_type is bool:
+                        self.frame[col_name] = df_col.astype(col_type, copy=False)
 
                 # Handle date parsing
                 if col_name in parse_dates:
@@ -813,7 +816,7 @@ def _numpy_type(self, sqltype):
             return float
         if isinstance(sqltype, Integer):
             # TODO: Refine integer size.
-            return int
+            return np.dtype('int64')
         if isinstance(sqltype, DateTime):
             # Caution: np.datetime64 is also a subclass of np.number.
             return datetime
diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py
index 3ad9669abb883..53ddd5c42a1d7 100644
--- a/pandas/io/tests/test_sql.py
+++ b/pandas/io/tests/test_sql.py
@@ -952,9 +952,6 @@ def test_date_parsing(self):
                 "IntDateCol loaded with incorrect type")
 
     def test_datetime(self):
-        if self.driver == 'pymysql':
-            raise nose.SkipTest('writing datetime not working with pymysql')
-
         df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
                         'B': np.arange(3.0)})
         df.to_sql('test_datetime', self.conn)
@@ -975,17 +972,6 @@ def test_datetime(self):
         tm.assert_frame_equal(result, df)
 
     def test_datetime_NaT(self):
-        # status:
-        #  - postgresql: gives error on inserting "0001-255-255T00:00:00"
-        #  - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001'
-
-        if self.driver == 'pymysql':
-            raise nose.SkipTest('writing datetime not working with pymysql')
-        if self.driver == 'psycopg2':
-            raise nose.SkipTest('writing datetime NaT not working with psycopg2')
-        if self.flavor == 'sqlite':
-            raise nose.SkipTest('reading datetime NaT not working with sqlite')
-
         df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
                         'B': np.arange(3.0)})
         df.loc[1, 'A'] = np.nan
@@ -1032,9 +1018,6 @@ def test_mixed_dtype_insert(self):
         tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)
 
     def test_nan_numeric(self):
-        if self.driver == 'pymysql':
-            raise nose.SkipTest('writing NaNs not working with pymysql')
-
         # NaNs in numeric float column
         df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]})
         df.to_sql('test_nan', self.conn, index=False)
@@ -1048,37 +1031,27 @@ def test_nan_numeric(self):
         tm.assert_frame_equal(result, df)
 
     def test_nan_fullcolumn(self):
-        if self.driver == 'pymysql':
-            raise nose.SkipTest('writing NaNs not working with pymysql')
-
         # full NaN column (numeric float column)
         df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]})
         df.to_sql('test_nan', self.conn, index=False)
 
-        if self.flavor == 'sqlite':
-            df['B'] = df['B'].astype('object')
-            df['B'] = None
-
         # with read_table
         result = sql.read_sql_table('test_nan', self.conn)
         tm.assert_frame_equal(result, df)
 
-        # with read_sql
+        # with read_sql -> no type info from table -> stays None
+        df['B'] = df['B'].astype('object')
+        df['B'] = None
         result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
         tm.assert_frame_equal(result, df)
 
     def test_nan_string(self):
-        if self.driver == 'pymysql':
-            raise nose.SkipTest('writing NaNs not working with pymysql')
-
         # NaNs in string column
         df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]})
         df.to_sql('test_nan', self.conn, index=False)
 
-        if self.flavor == 'sqlite':
-            df.loc[2, 'B'] = None
-        elif self.flavor == 'postgresql':
-            df = df.fillna('NaN')
+        # NaNs are coming back as None
+        df.loc[2, 'B'] = None
 
         # with read_table
         result = sql.read_sql_table('test_nan', self.conn)

From 638fb5b3e4199dfdb9a89c6e535300f55446c8fb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Wed, 10 Sep 2014 22:32:22 +0200
Subject: [PATCH 2/2] ENH: refactor to_sql insert_data - performance
 improvement (GH8208)

---
 doc/source/v0.15.0.txt |  2 +-
 pandas/io/sql.py       | 63 ++++++++++++++++++++++++------------------
 2 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index 3596dfe9c2240..49c431d8071e8 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -576,7 +576,7 @@ Performance
 - Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
 - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
 - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
-
+- Performance improvement in writing to SQL (``to_sql``) of up to 50% (:issue:`8208`).
 
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 05db26a815a2e..462179b442ac0 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -15,7 +15,7 @@
 import pandas.core.common as com
 from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
 from pandas.core.api import DataFrame, Series
-from pandas.core.common import notnull
+from pandas.core.common import notnull, isnull
 from pandas.core.base import PandasObject
 from pandas.tseries.tools import to_datetime
 
@@ -599,12 +599,6 @@ def create(self):
     def insert_statement(self):
         return self.table.insert()
 
-    def maybe_asscalar(self, i):
-        try:
-            return np.asscalar(i)
-        except AttributeError:
-            return i
-
     def insert_data(self):
         if self.index is not None:
             temp = self.frame.copy()
@@ -617,17 +611,35 @@ def insert_data(self):
         else:
             temp = self.frame
 
-        temp = temp.astype(object)
-        temp = temp.where(notnull(temp), None)
-        return temp
+        column_names = list(map(str, temp.columns))
+        ncols = len(column_names)
+        data_list = [None] * ncols
+        blocks = temp._data.blocks
+
+        for i in range(len(blocks)):
+            b = blocks[i]
+            if b.is_datetime:
+                # convert to microsecond resolution so this yields datetime.datetime
+                d = b.values.astype('M8[us]').astype(object)
+            else:
+                d = np.array(b.values, dtype=object)
+
+            # replace NaN with None
+            if b._can_hold_na:
+                mask = isnull(d)
+                d[mask] = None
+
+            for col_loc, col in zip(b.mgr_locs, d):
+                data_list[col_loc] = col
+
+        return column_names, data_list
 
     def insert(self, chunksize=None):
         ins = self.insert_statement()
-        temp = self.insert_data()
-        keys = list(map(str, temp.columns))
+        keys, data_list = self.insert_data()
 
-        nrows = len(temp)
+        nrows = len(self.frame)
         if chunksize is None:
             chunksize = nrows
         chunks = int(nrows / chunksize) + 1
 
@@ -639,12 +651,11 @@ def insert(self, chunksize=None):
                 end_i = min((i + 1) * chunksize, nrows)
                 if start_i >= end_i:
                     break
-                data_list = []
-                for t in temp.iloc[start_i:end_i].itertuples():
-                    data = dict((k, self.maybe_asscalar(v))
-                                for k, v in zip(keys, t[1:]))
-                    data_list.append(data)
-                con.execute(ins, data_list)
+
+                chunk_list = [arr[start_i:end_i] for arr in data_list]
+                insert_list = [dict((k, v) for k, v in zip(keys, row))
+                               for row in zip(*chunk_list)]
+                con.execute(ins, insert_list)
 
     def read(self, coerce_float=True, parse_dates=None, columns=None):
 
@@ -1011,9 +1022,9 @@ def insert_statement(self):
 
     def insert(self, chunksize=None):
         ins = self.insert_statement()
-        temp = self.insert_data()
+        keys, data_list = self.insert_data()
 
-        nrows = len(temp)
+        nrows = len(self.frame)
         if chunksize is None:
             chunksize = nrows
         chunks = int(nrows / chunksize) + 1
@@ -1024,13 +1035,11 @@ def insert(self, chunksize=None):
                 end_i = min((i + 1) * chunksize, nrows)
                 if start_i >= end_i:
                     break
-                data_list = []
-                for t in temp.iloc[start_i:end_i].itertuples():
-                    data = tuple((self.maybe_asscalar(v) for v in t[1:]))
-                    data_list.append(data)
-
+                chunk_list = [arr[start_i:end_i] for arr in data_list]
+                insert_list = [tuple((v for v in row))
+                               for row in zip(*chunk_list)]
                 cur = self.pd_sql.con.cursor()
-                cur.executemany(ins, data_list)
+                cur.executemany(ins, insert_list)
                 cur.close()
 
     def _create_table_setup(self):
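
For reference, a minimal standalone sketch (not part of the patches above; the frame, column names and values are made up for illustration) of the two conversions both commits rely on: datetime64 values cast through microsecond resolution to object dtype come out as datetime.datetime instances, and missing values in an object-dtype array can be replaced by None so the database driver writes NULL.

# Illustrative only: one float column containing NaN and one datetime64 column.
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [0.2, np.nan, 5.6],
                   'B': pd.date_range('2013-01-01 09:00:00', periods=3)})

# datetime64 block: casting through microsecond resolution to object dtype
# yields plain datetime.datetime instances, which DBAPI drivers can insert.
datetimes = df['B'].values.astype('M8[us]').astype(object)
print(type(datetimes[0]))        # -> datetime.datetime

# float block: cast to object dtype and replace missing values with None,
# which is written to the database as NULL.
floats = np.array(df['A'].values, dtype=object)
floats[pd.isnull(floats)] = None
print(floats)                    # -> [0.2 None 5.6]

The second patch applies these same conversions block by block via temp._data.blocks inside insert_data and builds the rows to insert by slicing the per-column object arrays instead of iterating with itertuples(); this is the change the "up to 50%" note for GH8208 in the whatsnew entry refers to.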