Skip to content

Commit 1033d5f

Browse files
mangecoeur authored and jreback committed
ENH #4163 Fix bug in index + parse date interaction, added test case for problem
1 parent de64af1 commit 1033d5f

File tree

2 files changed

+120
-80
lines changed

2 files changed

+120
-80
lines changed

pandas/io/sql.py

+87-72
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ class DatabaseError(IOError):
2323

2424

2525
#------------------------------------------------------------------------------
26-
# Helper execution functions
26+
# Helper functions
2727

2828
def _convert_params(sql, params):
2929
"""convert sql and params args to DBAPI2.0 compliant format"""
@@ -33,6 +33,47 @@ def _convert_params(sql, params):
3333
return args
3434

3535

36+
def _safe_col_name(col_name):
37+
#TODO: probably want to forbid database reserved names, such as "database"
38+
return col_name.strip().replace(' ', '_')
39+
40+
41+
def _handle_date_column(col, format=None):
42+
if isinstance(format, dict):
43+
return to_datetime(col, **format)
44+
else:
45+
if format in ['D', 's', 'ms', 'us', 'ns']:
46+
return to_datetime(col, coerce=True, unit=format)
47+
elif issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer):
48+
# parse dates as timestamp
49+
format = 's' if format is None else format
50+
return to_datetime(col, coerce=True, unit=format)
51+
else:
52+
return to_datetime(col, coerce=True, format=format)
53+
54+
55+
def _parse_date_columns(data_frame, parse_dates):
56+
""" Force non-datetime columns to be read as such.
57+
Supports both string formatted and integer timestamp columns
58+
"""
59+
# handle non-list entries for parse_dates gracefully
60+
if parse_dates is True or parse_dates is None or parse_dates is False:
61+
parse_dates = []
62+
63+
if not hasattr(parse_dates, '__iter__'):
64+
parse_dates = [parse_dates]
65+
66+
for col_name in parse_dates:
67+
df_col = data_frame[col_name]
68+
try:
69+
fmt = parse_dates[col_name]
70+
except TypeError:
71+
fmt = None
72+
data_frame[col_name] = _handle_date_column(df_col, format=fmt)
73+
74+
return data_frame
75+
76+
3677
def execute(sql, con, cur=None, params=None, flavor='sqlite'):
3778
"""
3879
Execute the given SQL query using the provided connection object.
@@ -44,7 +85,7 @@ def execute(sql, con, cur=None, params=None, flavor='sqlite'):
4485
con: SQLAlchemy engine or DBAPI2 connection (legacy mode)
4586
Using SQLAlchemy makes it possible to use any DB supported by that
4687
library.
47-
If a DBAPI2 object is given, a supported SQL flavor must also be provided
88+
If a DBAPI2 object, a supported SQL flavor must also be provided
4889
cur: deprecated, cursor is obtained from connection
4990
params: list or tuple, optional
5091
List of parameters to pass to execute method.
@@ -283,9 +324,11 @@ def pandasSQL_builder(con, flavor=None, meta=None):
283324
return PandasSQLAlchemy(con, meta=meta)
284325
else:
285326
warnings.warn(
286-
"Not an SQLAlchemy engine, attempting to use as legacy DBAPI connection")
327+
"""Not an SQLAlchemy engine,
328+
attempting to use as legacy DBAPI connection""")
287329
if flavor is None:
288-
raise ValueError("""PandasSQL must be created with an SQLAlchemy engine
330+
raise ValueError(
331+
"""PandasSQL must be created with an SQLAlchemy engine
289332
or a DBAPI2 connection and SQL flavour""")
290333
else:
291334
return PandasSQLLegacy(con, flavor)
@@ -298,36 +341,16 @@ def pandasSQL_builder(con, flavor=None, meta=None):
298341
return PandasSQLLegacy(con, flavor)
299342

300343

301-
def _safe_col_name(col_name):
302-
return col_name.strip().replace(' ', '_')
303-
304-
305-
def _parse_date_column(col, format=None):
306-
if isinstance(format, dict):
307-
return to_datetime(col, **format)
308-
else:
309-
if format in ['D', 's', 'ms', 'us', 'ns']:
310-
return to_datetime(col, coerce=True, unit=format)
311-
elif issubclass(col.dtype.type, np.floating) or issubclass(col.dtype.type, np.integer):
312-
# parse dates as timestamp
313-
format = 's' if format is None else format
314-
return to_datetime(col, coerce=True, unit=format)
315-
else:
316-
return to_datetime(col, coerce=True, format=format)
317-
318-
319-
def _frame_from_data_and_columns(data, columns, index_col=None,
320-
coerce_float=True):
321-
df = DataFrame.from_records(
322-
data, columns=columns, coerce_float=coerce_float)
323-
if index_col is not None:
324-
df.set_index(index_col, inplace=True)
325-
return df
326-
327-
328344
class PandasSQLTable(PandasObject):
329-
330-
def __init__(self, name, pandas_sql_engine, frame=None, index=True, if_exists='fail', prefix='pandas'):
345+
""" For mapping Pandas tables to SQL tables.
346+
Uses fact that table is reflected by SQLAlchemy to
347+
do better type conversions.
348+
Also holds various flags needed to avoid having to
349+
pass them between functions all the time.
350+
"""
351+
# TODO: support for multiIndex
352+
def __init__(self, name, pandas_sql_engine, frame=None, index=True,
353+
if_exists='fail', prefix='pandas'):
331354
self.name = name
332355
self.pd_sql = pandas_sql_engine
333356
self.prefix = prefix
@@ -400,13 +423,15 @@ def read(self, coerce_float=True, parse_dates=None, columns=None):
400423
data = result.fetchall()
401424
column_names = result.keys()
402425

403-
self.frame = _frame_from_data_and_columns(data, column_names,
404-
index_col=self.index,
405-
coerce_float=coerce_float)
426+
self.frame = DataFrame.from_records(
427+
data, columns=column_names, coerce_float=coerce_float)
406428

407429
self._harmonize_columns(parse_dates=parse_dates)
408430

409-
# Assume that if the index was in prefix_index format, we gave it a name
431+
if self.index is not None:
432+
self.frame.set_index(self.index, inplace=True)
433+
434+
# Assume that if the index is in prefix_index format, we gave it a name
410435
# and should return it nameless
411436
if self.index == self.prefix + '_index':
412437
self.frame.index.name = None
@@ -442,13 +467,14 @@ def _create_table_statement(self):
442467
return Table(self.name, self.pd_sql.meta, *columns)
443468

444469
def _harmonize_columns(self, parse_dates=None):
445-
""" Make a data_frame's column type align with an sql_table column types
470+
""" Make a data_frame's column type align with an sql_table
471+
column types
446472
Need to work around limited NA value support.
447473
Floats are always fine, ints must always
448474
be floats if there are Null values.
449475
Booleans are hard because converting bool column with None replaces
450-
all Nones with false. Therefore only convert bool if there are no NA
451-
values.
476+
all Nones with false. Therefore only convert bool if there are no
477+
NA values.
452478
Datetimes should already be converted
453479
to np.datetime if supported, but here we also force conversion
454480
if required
@@ -469,7 +495,7 @@ def _harmonize_columns(self, parse_dates=None):
469495

470496
if col_type is datetime or col_type is date:
471497
if not issubclass(df_col.dtype.type, np.datetime64):
472-
self.frame[col_name] = _parse_date_column(df_col)
498+
self.frame[col_name] = _handle_date_column(df_col)
473499

474500
elif col_type is float:
475501
# floats support NA, can always convert!
@@ -486,7 +512,7 @@ def _harmonize_columns(self, parse_dates=None):
486512
fmt = parse_dates[col_name]
487513
except TypeError:
488514
fmt = None
489-
self.frame[col_name] = _parse_date_column(
515+
self.frame[col_name] = _handle_date_column(
490516
df_col, format=fmt)
491517

492518
except KeyError:
@@ -543,27 +569,6 @@ def to_sql(self, *args, **kwargs):
543569
raise ValueError(
544570
"PandasSQL must be created with an SQLAlchemy engine or connection+sql flavor")
545571

546-
def _parse_date_columns(self, data_frame, parse_dates):
547-
""" Force non-datetime columns to be read as such.
548-
Supports both string formatted and integer timestamp columns
549-
"""
550-
# handle non-list entries for parse_dates gracefully
551-
if parse_dates is True or parse_dates is None or parse_dates is False:
552-
parse_dates = []
553-
554-
if not hasattr(parse_dates, '__iter__'):
555-
parse_dates = [parse_dates]
556-
557-
for col_name in parse_dates:
558-
df_col = data_frame[col_name]
559-
try:
560-
fmt = parse_dates[col_name]
561-
except TypeError:
562-
fmt = None
563-
data_frame[col_name] = _parse_date_column(df_col, format=fmt)
564-
565-
return data_frame
566-
567572

568573
class PandasSQLAlchemy(PandasSQL):
569574

@@ -593,17 +598,23 @@ def uquery(self, *args, **kwargs):
593598
result = self.execute(*args, **kwargs)
594599
return result.rowcount
595600

596-
def read_sql(self, sql, index_col=None, coerce_float=True, parse_dates=None, params=None):
601+
def read_sql(self, sql, index_col=None, coerce_float=True,
602+
parse_dates=None, params=None):
597603
args = _convert_params(sql, params)
604+
598605
result = self.execute(*args)
599606
data = result.fetchall()
600607
columns = result.keys()
601608

602-
data_frame = _frame_from_data_and_columns(data, columns,
603-
index_col=index_col,
604-
coerce_float=coerce_float)
609+
data_frame = DataFrame.from_records(
610+
data, columns=columns, coerce_float=coerce_float)
611+
612+
_parse_date_columns(data_frame, parse_dates)
613+
614+
if index_col is not None:
615+
data_frame.set_index(index_col, inplace=True)
605616

606-
return self._parse_date_columns(data_frame, parse_dates)
617+
return data_frame
607618

608619
def to_sql(self, frame, name, if_exists='fail', index=True):
609620
table = PandasSQLTable(
@@ -818,10 +829,14 @@ def read_sql(self, sql, index_col=None, coerce_float=True, params=None,
818829
data = self._fetchall_as_list(cursor)
819830
cursor.close()
820831

821-
data_frame = _frame_from_data_and_columns(data, columns,
822-
index_col=index_col,
823-
coerce_float=coerce_float)
824-
return self._parse_date_columns(data_frame, parse_dates=parse_dates)
832+
data_frame = DataFrame.from_records(
833+
data, columns=columns, coerce_float=coerce_float)
834+
835+
_parse_date_columns(data_frame, parse_dates)
836+
837+
if index_col is not None:
838+
data_frame.set_index(index_col, inplace=True)
839+
return data_frame
825840

826841
def _fetchall_as_list(self, cur):
827842
result = cur.fetchall()

pandas/io/tests/test_sql.py

+33-8
Original file line number | Diff line number | Diff line change
@@ -215,7 +215,7 @@ def _roundtrip(self):
215215
result = self.pandasSQL.read_sql('SELECT * FROM test_frame_roundtrip')
216216

217217
result.set_index('pandas_index', inplace=True)
218-
#result.index.astype(int)
218+
# result.index.astype(int)
219219

220220
result.index.name = None
221221

@@ -327,7 +327,9 @@ def test_roundtrip(self):
327327
sql.to_sql(self.test_frame1, 'test_frame_roundtrip',
328328
con=self.conn, flavor='sqlite')
329329
result = sql.read_sql(
330-
'SELECT * FROM test_frame_roundtrip', con=self.conn, flavor='sqlite')
330+
'SELECT * FROM test_frame_roundtrip',
331+
con=self.conn,
332+
flavor='sqlite')
331333

332334
# HACK!
333335
result.index = self.test_frame1.index
@@ -355,28 +357,51 @@ def test_date_parsing(self):
355357
df = sql.read_sql(
356358
"SELECT * FROM types_test_data", self.conn, flavor='sqlite')
357359
self.assertFalse(
358-
issubclass(df.DateCol.dtype.type, np.datetime64), "DateCol loaded with incorrect type")
360+
issubclass(df.DateCol.dtype.type, np.datetime64),
361+
"DateCol loaded with incorrect type")
359362

360363
df = sql.read_sql("SELECT * FROM types_test_data",
361364
self.conn, flavor='sqlite', parse_dates=['DateCol'])
362365
self.assertTrue(
363-
issubclass(df.DateCol.dtype.type, np.datetime64), "DateCol loaded with incorrect type")
366+
issubclass(df.DateCol.dtype.type, np.datetime64),
367+
"DateCol loaded with incorrect type")
364368

365369
df = sql.read_sql("SELECT * FROM types_test_data", self.conn,
366-
flavor='sqlite', parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'})
370+
flavor='sqlite',
371+
parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'})
367372
self.assertTrue(
368-
issubclass(df.DateCol.dtype.type, np.datetime64), "DateCol loaded with incorrect type")
373+
issubclass(df.DateCol.dtype.type, np.datetime64),
374+
"DateCol loaded with incorrect type")
369375

370376
df = sql.read_sql("SELECT * FROM types_test_data",
371-
self.conn, flavor='sqlite', parse_dates=['IntDateCol'])
377+
self.conn, flavor='sqlite',
378+
parse_dates=['IntDateCol'])
379+
372380
self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64),
373381
"IntDateCol loaded with incorrect type")
374382

375383
df = sql.read_sql("SELECT * FROM types_test_data",
376-
self.conn, flavor='sqlite', parse_dates={'IntDateCol': 's'})
384+
self.conn, flavor='sqlite',
385+
parse_dates={'IntDateCol': 's'})
386+
377387
self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64),
378388
"IntDateCol loaded with incorrect type")
379389

390+
def test_date_and_index(self):
391+
""" Test case where same column appears in parse_dates and index_col"""
392+
393+
df = sql.read_sql("SELECT * FROM types_test_data",
394+
self.conn, flavor='sqlite',
395+
parse_dates=['DateCol', 'IntDateCol'],
396+
index_col='DateCol')
397+
self.assertTrue(
398+
issubclass(df.index.dtype.type, np.datetime64),
399+
"DateCol loaded with incorrect type")
400+
401+
self.assertTrue(
402+
issubclass(df.IntDateCol.dtype.type, np.datetime64),
403+
"IntDateCol loaded with incorrect type")
404+
380405

381406
class TestSQLAlchemy(PandasSQLTest):
382407

0 commit comments

Comments
 (0)