diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index f0c3c0e6bc508..49c431d8071e8 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -494,6 +494,9 @@ Enhancements df.to_sql('table', engine, schema='other_schema') pd.read_sql_table('table', engine, schema='other_schema') +- Added support for writing ``NaN`` values with ``to_sql`` (:issue:`2754`). +- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`). + - Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`) - Added ``layout`` keyword to ``DataFrame.plot`` (:issue:`6667`) @@ -573,7 +576,7 @@ Performance - Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`) - Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`) - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`) - +- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`). 
diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c960a73bb0f88..462179b442ac0 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -15,6 +15,7 @@ import pandas.core.common as com from pandas.compat import lzip, map, zip, raise_with_traceback, string_types from pandas.core.api import DataFrame, Series +from pandas.core.common import notnull, isnull from pandas.core.base import PandasObject from pandas.tseries.tools import to_datetime @@ -598,12 +599,6 @@ def create(self): def insert_statement(self): return self.table.insert() - def maybe_asscalar(self, i): - try: - return np.asscalar(i) - except AttributeError: - return i - def insert_data(self): if self.index is not None: temp = self.frame.copy() @@ -615,16 +610,36 @@ def insert_data(self): "duplicate name in index/columns: {0}".format(err)) else: temp = self.frame + + column_names = list(map(str, temp.columns)) + ncols = len(column_names) + data_list = [None] * ncols + blocks = temp._data.blocks + + for i in range(len(blocks)): + b = blocks[i] + if b.is_datetime: + # convert to microsecond resolution so this yields datetime.datetime + d = b.values.astype('M8[us]').astype(object) + else: + d = np.array(b.values, dtype=object) + + # replace NaN with None + if b._can_hold_na: + mask = isnull(d) + d[mask] = None - return temp + for col_loc, col in zip(b.mgr_locs, d): + data_list[col_loc] = col + + return column_names, data_list def insert(self, chunksize=None): ins = self.insert_statement() - temp = self.insert_data() - keys = list(map(str, temp.columns)) + keys, data_list = self.insert_data() - nrows = len(temp) + nrows = len(self.frame) if chunksize is None: chunksize = nrows chunks = int(nrows / chunksize) + 1 @@ -636,12 +651,11 @@ def insert(self, chunksize=None): end_i = min((i + 1) * chunksize, nrows) if start_i >= end_i: break - data_list = [] - for t in temp.iloc[start_i:end_i].itertuples(): - data = dict((k, self.maybe_asscalar(v)) - for k, v in zip(keys, t[1:])) - data_list.append(data) - 
con.execute(ins, data_list) + + chunk_list = [arr[start_i:end_i] for arr in data_list] + insert_list = [dict((k, v) for k, v in zip(keys, row)) + for row in zip(*chunk_list)] + con.execute(ins, insert_list) def read(self, coerce_float=True, parse_dates=None, columns=None): @@ -758,12 +772,12 @@ def _harmonize_columns(self, parse_dates=None): elif col_type is float: # floats support NA, can always convert! - self.frame[col_name].astype(col_type, copy=False) + self.frame[col_name] = df_col.astype(col_type, copy=False) elif len(df_col) == df_col.count(): # No NA values, can convert ints and bools - if col_type is int or col_type is bool: - self.frame[col_name].astype(col_type, copy=False) + if col_type is np.dtype('int64') or col_type is bool: + self.frame[col_name] = df_col.astype(col_type, copy=False) # Handle date parsing if col_name in parse_dates: @@ -813,7 +827,7 @@ def _numpy_type(self, sqltype): return float if isinstance(sqltype, Integer): # TODO: Refine integer size. - return int + return np.dtype('int64') if isinstance(sqltype, DateTime): # Caution: np.datetime64 is also a subclass of np.number. 
return datetime @@ -1008,9 +1022,9 @@ def insert_statement(self): def insert(self, chunksize=None): ins = self.insert_statement() - temp = self.insert_data() + keys, data_list = self.insert_data() - nrows = len(temp) + nrows = len(self.frame) if chunksize is None: chunksize = nrows chunks = int(nrows / chunksize) + 1 @@ -1021,13 +1035,11 @@ def insert(self, chunksize=None): end_i = min((i + 1) * chunksize, nrows) if start_i >= end_i: break - data_list = [] - for t in temp.iloc[start_i:end_i].itertuples(): - data = tuple((self.maybe_asscalar(v) for v in t[1:])) - data_list.append(data) - + chunk_list = [arr[start_i:end_i] for arr in data_list] + insert_list = [tuple((v for v in row)) + for row in zip(*chunk_list)] cur = self.pd_sql.con.cursor() - cur.executemany(ins, data_list) + cur.executemany(ins, insert_list) cur.close() def _create_table_setup(self): diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 3ad9669abb883..53ddd5c42a1d7 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -952,9 +952,6 @@ def test_date_parsing(self): "IntDateCol loaded with incorrect type") def test_datetime(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing datetime not working with pymysql') - df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3), 'B': np.arange(3.0)}) df.to_sql('test_datetime', self.conn) @@ -975,17 +972,6 @@ def test_datetime(self): tm.assert_frame_equal(result, df) def test_datetime_NaT(self): - # status: - # - postgresql: gives error on inserting "0001-255-255T00:00:00" - # - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001' - - if self.driver == 'pymysql': - raise nose.SkipTest('writing datetime not working with pymysql') - if self.driver == 'psycopg2': - raise nose.SkipTest('writing datetime NaT not working with psycopg2') - if self.flavor == 'sqlite': - raise nose.SkipTest('reading datetime NaT not working with sqlite') - df = DataFrame({'A': 
date_range('2013-01-01 09:00:00', periods=3), 'B': np.arange(3.0)}) df.loc[1, 'A'] = np.nan @@ -1032,9 +1018,6 @@ def test_mixed_dtype_insert(self): tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) def test_nan_numeric(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing NaNs not working with pymysql') - # NaNs in numeric float column df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]}) df.to_sql('test_nan', self.conn, index=False) @@ -1048,37 +1031,27 @@ tm.assert_frame_equal(result, df) def test_nan_fullcolumn(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing NaNs not working with pymysql') - # full NaN column (numeric float column) df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]}) df.to_sql('test_nan', self.conn, index=False) - if self.flavor == 'sqlite': - df['B'] = df['B'].astype('object') - df['B'] = None - # with read_table result = sql.read_sql_table('test_nan', self.conn) tm.assert_frame_equal(result, df) - # with read_sql + # with read_sql -> no type info from table -> stays None + df['B'] = df['B'].astype('object') + df['B'] = None result = sql.read_sql_query('SELECT * FROM test_nan', self.conn) tm.assert_frame_equal(result, df) def test_nan_string(self): - if self.driver == 'pymysql': - raise nose.SkipTest('writing NaNs not working with pymysql') - # NaNs in string column df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]}) df.to_sql('test_nan', self.conn, index=False) - if self.flavor == 'sqlite': - df.loc[2, 'B'] = None - elif self.flavor == 'postgresql': - df = df.fillna('NaN') + # NaNs are coming back as None + df.loc[2, 'B'] = None # with read_table result = sql.read_sql_table('test_nan', self.conn)