From 6680b6b37edf53437fd3ea1402a94fe2e4e206c8 Mon Sep 17 00:00:00 2001 From: Jonathan Chambers Date: Thu, 20 Feb 2014 14:18:29 +0000 Subject: [PATCH 1/2] ENH #6416: performance improvements on write - trade off higher memory use for faster writes. --- pandas/io/sql.py | 75 +++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 989f6983b28d3..4d2fce596bba4 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2,13 +2,13 @@ Collection of query wrappers / abstractions to both facilitate data retrieval and to reduce dependency on DB-specific API. """ -from __future__ import print_function -from datetime import datetime, date +from __future__ import print_function, division +from datetime import datetime, date, timedelta import warnings from pandas.compat import lzip, map, zip, raise_with_traceback, string_types import numpy as np - +import pandas.core.common as com from pandas.core.api import DataFrame from pandas.core.base import PandasObject from pandas.tseries.tools import to_datetime @@ -360,7 +360,7 @@ def pandasSQL_builder(con, flavor=None, meta=None): class PandasSQLTable(PandasObject): - """ + """ For mapping Pandas tables to SQL tables. Uses fact that table is reflected by SQLAlchemy to do better type convertions. 
@@ -419,13 +419,21 @@ def maybe_asscalar(self, i): def insert(self): ins = self.insert_statement() - - for t in self.frame.iterrows(): - data = dict((k, self.maybe_asscalar(v)) - for k, v in t[1].iteritems()) - if self.index is not None: + data_list = [] + # to avoid if check for every row + if self.index is not None: + for t in self.frame.iterrows(): + data = dict((k, self.maybe_asscalar(v)) + for k, v in t[1].iteritems()) data[self.index] = self.maybe_asscalar(t[0]) - self.pd_sql.execute(ins, **data) + data_list.append(data) + else: + for t in self.frame.iterrows(): + data = dict((k, self.maybe_asscalar(v)) + for k, v in t[1].iteritems()) + data_list.append(data) + #self.pd_sql.execute(ins, **data) + self.pd_sql.execute(ins, data_list) def read(self, coerce_float=True, parse_dates=None, columns=None): @@ -480,7 +488,7 @@ def _create_table_statement(self): if self.index is not None: columns.insert(0, Column(self.index, self._sqlalchemy_type( - self.frame.index.dtype), + self.frame.index), index=True)) return Table(self.name, self.pd_sql.meta, *columns) @@ -537,22 +545,33 @@ def _harmonize_columns(self, parse_dates=None): except KeyError: pass # this column not in results - def _sqlalchemy_type(self, dtype): - from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date + def _sqlalchemy_type(self, arr_or_dtype): + from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval - pytype = dtype.type + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype + elif isinstance(arr_or_dtype, type): + tipo = np.dtype(arr_or_dtype) + else: + tipo = arr_or_dtype.dtype - if pytype is date: + if arr_or_dtype is date: return Date - if issubclass(pytype, np.datetime64) or pytype is datetime: - # Caution: np.datetime64 is also a subclass of np.number. 
- return DateTime - if issubclass(pytype, np.floating): + if com.is_datetime64_dtype(arr_or_dtype): + try: + tz = arr_or_dtype.tzinfo + return DateTime(timezone=True) + except: + print('no tzinfo') + return DateTime + if com.is_timedelta64_dtype(arr_or_dtype): + return Interval + if com.is_float_dtype(arr_or_dtype): return Float - if issubclass(pytype, np.integer): + if com.is_integer_dtype(arr_or_dtype): # TODO: Refine integer size. return Integer - if issubclass(pytype, np.bool_): + if isinstance(tipo, np.bool_): return Boolean return Text @@ -638,14 +657,18 @@ def to_sql(self, frame, name, if_exists='fail', index=True): name, self, frame=frame, index=index, if_exists=if_exists) table.insert() + @property + def tables(self): + return self.meta.tables + def has_table(self, name): - return self.engine.has_table(name) + if self.meta.tables.get(name) is not None: + return True + else: + return False def get_table(self, table_name): - if self.engine.has_table(table_name): - return self.meta.tables[table_name] - else: - return None + return self.meta.tables.get(table_name) def read_table(self, table_name, index_col=None, coerce_float=True, parse_dates=None, columns=None): From c67ae75d37b7c269766381c10ef783d91ef1a62d Mon Sep 17 00:00:00 2001 From: Jonathan Chambers Date: Thu, 20 Feb 2014 17:45:52 +0000 Subject: [PATCH 2/2] ENH #6416 cleanup for PR --- pandas/io/sql.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 4d2fce596bba4..9f4b642afc2d1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -4,11 +4,13 @@ """ from __future__ import print_function, division from datetime import datetime, date, timedelta + import warnings -from pandas.compat import lzip, map, zip, raise_with_traceback, string_types +import itertools import numpy as np import pandas.core.common as com +from pandas.compat import lzip, map, zip, raise_with_traceback, string_types from pandas.core.api import DataFrame 
from pandas.core.base import PandasObject from pandas.tseries.tools import to_datetime @@ -432,7 +434,6 @@ def insert(self): data = dict((k, self.maybe_asscalar(v)) for k, v in t[1].iteritems()) data_list.append(data) - #self.pd_sql.execute(ins, **data) self.pd_sql.execute(ins, data_list) def read(self, coerce_float=True, parse_dates=None, columns=None): @@ -548,13 +549,6 @@ def _harmonize_columns(self, parse_dates=None): def _sqlalchemy_type(self, arr_or_dtype): from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval - if isinstance(arr_or_dtype, np.dtype): - tipo = arr_or_dtype - elif isinstance(arr_or_dtype, type): - tipo = np.dtype(arr_or_dtype) - else: - tipo = arr_or_dtype.dtype - if arr_or_dtype is date: return Date if com.is_datetime64_dtype(arr_or_dtype): @@ -562,16 +556,15 @@ def _sqlalchemy_type(self, arr_or_dtype): tz = arr_or_dtype.tzinfo return DateTime(timezone=True) except: - print('no tzinfo') return DateTime if com.is_timedelta64_dtype(arr_or_dtype): return Interval - if com.is_float_dtype(arr_or_dtype): + elif com.is_float_dtype(arr_or_dtype): return Float - if com.is_integer_dtype(arr_or_dtype): + elif com.is_integer_dtype(arr_or_dtype): # TODO: Refine integer size. return Integer - if isinstance(tipo, np.bool_): + elif com.is_bool(arr_or_dtype): return Boolean return Text @@ -769,8 +762,6 @@ def insert(self): data = [self.maybe_asscalar(v) for v in r[1].values] if self.index is not None: data.insert(0, self.maybe_asscalar(r[0])) - print(type(data[2])) - print(type(r[0])) cur.execute(ins, tuple(data)) cur.close()