Skip to content

Commit e1cd966

Browse files
Merge pull request #8208 from jorisvandenbossche/sql-2754-nan
ENH: sql support for writing NaN + datetime64 values (GH2754, GH7103)
2 parents 41cc8cc + 638fb5b commit e1cd966

File tree

3 files changed

+49
-61
lines changed

3 files changed

+49
-61
lines changed

doc/source/v0.15.0.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,9 @@ Enhancements
494494
df.to_sql('table', engine, schema='other_schema')
495495
pd.read_sql_table('table', engine, schema='other_schema')
496496

497+
- Added support for writing ``NaN`` values with ``to_sql`` (:issue:`2754`).
498+
- Added support for writing datetime64 columns with ``to_sql`` for all database flavors (:issue:`7103`).
499+
497500
- Added support for bool, uint8, uint16 and uint32 datatypes in ``to_stata`` (:issue:`7097`, :issue:`7365`)
498501

499502
- Added ``layout`` keyword to ``DataFrame.plot`` (:issue:`6667`)
@@ -573,7 +576,7 @@ Performance
573576
- Performance improvements in ``StataWriter`` when writing large files (:issue:`8079`)
574577
- Performance and memory usage improvements in multi-key ``groupby`` (:issue:`8128`)
575578
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
576-
579+
- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
577580

578581

579582

pandas/io/sql.py

+40-28
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pandas.core.common as com
1616
from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
1717
from pandas.core.api import DataFrame, Series
18+
from pandas.core.common import notnull, isnull
1819
from pandas.core.base import PandasObject
1920
from pandas.tseries.tools import to_datetime
2021

@@ -598,12 +599,6 @@ def create(self):
598599
def insert_statement(self):
599600
return self.table.insert()
600601

601-
def maybe_asscalar(self, i):
602-
try:
603-
return np.asscalar(i)
604-
except AttributeError:
605-
return i
606-
607602
def insert_data(self):
608603
if self.index is not None:
609604
temp = self.frame.copy()
@@ -615,16 +610,36 @@ def insert_data(self):
615610
"duplicate name in index/columns: {0}".format(err))
616611
else:
617612
temp = self.frame
613+
614+
column_names = list(map(str, temp.columns))
615+
ncols = len(column_names)
616+
data_list = [None] * ncols
617+
blocks = temp._data.blocks
618+
619+
for i in range(len(blocks)):
620+
b = blocks[i]
621+
if b.is_datetime:
622+
# convert to microsecond resolution so this yields datetime.datetime
623+
d = b.values.astype('M8[us]').astype(object)
624+
else:
625+
d = np.array(b.values, dtype=object)
626+
627+
# replace NaN with None
628+
if b._can_hold_na:
629+
mask = isnull(d)
630+
d[mask] = None
618631

619-
return temp
632+
for col_loc, col in zip(b.mgr_locs, d):
633+
data_list[col_loc] = col
634+
635+
return column_names, data_list
620636

621637
def insert(self, chunksize=None):
622638

623639
ins = self.insert_statement()
624-
temp = self.insert_data()
625-
keys = list(map(str, temp.columns))
640+
keys, data_list = self.insert_data()
626641

627-
nrows = len(temp)
642+
nrows = len(self.frame)
628643
if chunksize is None:
629644
chunksize = nrows
630645
chunks = int(nrows / chunksize) + 1
@@ -636,12 +651,11 @@ def insert(self, chunksize=None):
636651
end_i = min((i + 1) * chunksize, nrows)
637652
if start_i >= end_i:
638653
break
639-
data_list = []
640-
for t in temp.iloc[start_i:end_i].itertuples():
641-
data = dict((k, self.maybe_asscalar(v))
642-
for k, v in zip(keys, t[1:]))
643-
data_list.append(data)
644-
con.execute(ins, data_list)
654+
655+
chunk_list = [arr[start_i:end_i] for arr in data_list]
656+
insert_list = [dict((k, v) for k, v in zip(keys, row))
657+
for row in zip(*chunk_list)]
658+
con.execute(ins, insert_list)
645659

646660
def read(self, coerce_float=True, parse_dates=None, columns=None):
647661

@@ -758,12 +772,12 @@ def _harmonize_columns(self, parse_dates=None):
758772

759773
elif col_type is float:
760774
# floats support NA, can always convert!
761-
self.frame[col_name].astype(col_type, copy=False)
775+
self.frame[col_name] = df_col.astype(col_type, copy=False)
762776

763777
elif len(df_col) == df_col.count():
764778
# No NA values, can convert ints and bools
765-
if col_type is int or col_type is bool:
766-
self.frame[col_name].astype(col_type, copy=False)
779+
if col_type is np.dtype('int64') or col_type is bool:
780+
self.frame[col_name] = df_col.astype(col_type, copy=False)
767781

768782
# Handle date parsing
769783
if col_name in parse_dates:
@@ -813,7 +827,7 @@ def _numpy_type(self, sqltype):
813827
return float
814828
if isinstance(sqltype, Integer):
815829
# TODO: Refine integer size.
816-
return int
830+
return np.dtype('int64')
817831
if isinstance(sqltype, DateTime):
818832
# Caution: np.datetime64 is also a subclass of np.number.
819833
return datetime
@@ -1008,9 +1022,9 @@ def insert_statement(self):
10081022
def insert(self, chunksize=None):
10091023

10101024
ins = self.insert_statement()
1011-
temp = self.insert_data()
1025+
keys, data_list = self.insert_data()
10121026

1013-
nrows = len(temp)
1027+
nrows = len(self.frame)
10141028
if chunksize is None:
10151029
chunksize = nrows
10161030
chunks = int(nrows / chunksize) + 1
@@ -1021,13 +1035,11 @@ def insert(self, chunksize=None):
10211035
end_i = min((i + 1) * chunksize, nrows)
10221036
if start_i >= end_i:
10231037
break
1024-
data_list = []
1025-
for t in temp.iloc[start_i:end_i].itertuples():
1026-
data = tuple((self.maybe_asscalar(v) for v in t[1:]))
1027-
data_list.append(data)
1028-
1038+
chunk_list = [arr[start_i:end_i] for arr in data_list]
1039+
insert_list = [tuple((v for v in row))
1040+
for row in zip(*chunk_list)]
10291041
cur = self.pd_sql.con.cursor()
1030-
cur.executemany(ins, data_list)
1042+
cur.executemany(ins, insert_list)
10311043
cur.close()
10321044

10331045
def _create_table_setup(self):

pandas/io/tests/test_sql.py

+5-32
Original file line numberDiff line numberDiff line change
@@ -952,9 +952,6 @@ def test_date_parsing(self):
952952
"IntDateCol loaded with incorrect type")
953953

954954
def test_datetime(self):
955-
if self.driver == 'pymysql':
956-
raise nose.SkipTest('writing datetime not working with pymysql')
957-
958955
df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
959956
'B': np.arange(3.0)})
960957
df.to_sql('test_datetime', self.conn)
@@ -975,17 +972,6 @@ def test_datetime(self):
975972
tm.assert_frame_equal(result, df)
976973

977974
def test_datetime_NaT(self):
978-
# status:
979-
# - postgresql: gives error on inserting "0001-255-255T00:00:00"
980-
# - sqlite3: works, but reading it with query returns '-001--1--1 -1:-1:-1.-00001'
981-
982-
if self.driver == 'pymysql':
983-
raise nose.SkipTest('writing datetime not working with pymysql')
984-
if self.driver == 'psycopg2':
985-
raise nose.SkipTest('writing datetime NaT not working with psycopg2')
986-
if self.flavor == 'sqlite':
987-
raise nose.SkipTest('reading datetime NaT not working with sqlite')
988-
989975
df = DataFrame({'A': date_range('2013-01-01 09:00:00', periods=3),
990976
'B': np.arange(3.0)})
991977
df.loc[1, 'A'] = np.nan
@@ -1032,9 +1018,6 @@ def test_mixed_dtype_insert(self):
10321018
tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)
10331019

10341020
def test_nan_numeric(self):
1035-
if self.driver == 'pymysql':
1036-
raise nose.SkipTest('writing NaNs not working with pymysql')
1037-
10381021
# NaNs in numeric float column
10391022
df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]})
10401023
df.to_sql('test_nan', self.conn, index=False)
@@ -1048,37 +1031,27 @@ def test_nan_numeric(self):
10481031
tm.assert_frame_equal(result, df)
10491032

10501033
def test_nan_fullcolumn(self):
1051-
if self.driver == 'pymysql':
1052-
raise nose.SkipTest('writing NaNs not working with pymysql')
1053-
10541034
# full NaN column (numeric float column)
10551035
df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]})
10561036
df.to_sql('test_nan', self.conn, index=False)
10571037

1058-
if self.flavor == 'sqlite':
1059-
df['B'] = df['B'].astype('object')
1060-
df['B'] = None
1061-
10621038
# with read_table
10631039
result = sql.read_sql_table('test_nan', self.conn)
10641040
tm.assert_frame_equal(result, df)
10651041

1066-
# with read_sql
1042+
# with read_sql -> no type info from table -> stays None
1043+
df['B'] = df['B'].astype('object')
1044+
df['B'] = None
10671045
result = sql.read_sql_query('SELECT * FROM test_nan', self.conn)
10681046
tm.assert_frame_equal(result, df)
10691047

10701048
def test_nan_string(self):
1071-
if self.driver == 'pymysql':
1072-
raise nose.SkipTest('writing NaNs not working with pymysql')
1073-
10741049
# NaNs in string column
10751050
df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]})
10761051
df.to_sql('test_nan', self.conn, index=False)
10771052

1078-
if self.flavor == 'sqlite':
1079-
df.loc[2, 'B'] = None
1080-
elif self.flavor == 'postgresql':
1081-
df = df.fillna('NaN')
1053+
# NaNs are coming back as None
1054+
df.loc[2, 'B'] = None
10821055

10831056
# with read_table
10841057
result = sql.read_sql_table('test_nan', self.conn)

0 commit comments

Comments
 (0)