Skip to content

Commit 65e7797

Browse files
Chang She, wesm
Chang She
authored and committed
ENH: use parsed date column as index #1251
1 parent a4bca8c commit 65e7797

File tree

2 files changed

+120
-14
lines changed

2 files changed

+120
-14
lines changed

pandas/io/parsers.py

+117-14
Original file line numberDiff line numberDiff line change
@@ -441,10 +441,17 @@ def __init__(self, f, delimiter=None, names=None, header=0,
441441
self.data = f
442442
self.columns = self._infer_columns()
443443

444+
# needs to be cleaned/refactored
445+
# multiple date column thing turning into a real spaghetti factory
446+
444447
# get popped off for index
445448
self.orig_columns = list(self.columns)
446449

447-
self.index_name = self._get_index_name()
450+
self.index_name = None
451+
self._name_processed = False
452+
if not self._has_complex_date_col:
453+
self.index_name = self._get_index_name()
454+
self._name_processed = True
448455
self._first_chunk = True
449456

450457
self.squeeze = squeeze
@@ -534,6 +541,8 @@ def _infer_columns(self):
534541
else:
535542
columns = names
536543

544+
545+
537546
return columns
538547

539548
def _next_line(self):
@@ -656,10 +665,14 @@ def _get_index_name(self):
656665
index_name = None
657666
elif np.isscalar(self.index_col):
658667
if isinstance(self.index_col, basestring):
668+
index_names = self.index_col
659669
for i, c in enumerate(list(columns)):
660670
if c == self.index_col:
661671
self.index_col = i
662-
index_name = columns.pop(self.index_col)
672+
columns.pop(i)
673+
break
674+
else:
675+
index_name = columns.pop(self.index_col)
663676

664677
if index_name is not None and 'Unnamed' in index_name:
665678
index_name = None
@@ -670,10 +683,12 @@ def _get_index_name(self):
670683
index_col = list(self.index_col)
671684
for i, c in enumerate(index_col):
672685
if isinstance(c, basestring):
673-
index_name = c
686+
index_name.append(c)
674687
for j, name in enumerate(cp_cols):
675-
if name == index_name:
688+
if name == c:
676689
index_col[i] = j
690+
columns.remove(name)
691+
break
677692
else:
678693
name = cp_cols[c]
679694
columns.remove(name)
@@ -710,8 +725,8 @@ def get_chunk(self, rows=None):
710725

711726
zipped_content = list(lib.to_object_array(content).T)
712727

713-
if self.index_col is not None:
714-
index = self._extract_index(zipped_content)
728+
if not self._has_complex_date_col and self.index_col is not None:
729+
index = self._get_index(zipped_content)
715730
else:
716731
index = Index(np.arange(len(content)))
717732

@@ -746,24 +761,110 @@ def get_chunk(self, rows=None):
746761
data = _convert_to_ndarrays(data, self.na_values, self.verbose)
747762

748763
df = DataFrame(data=data, columns=columns, index=index)
764+
if self._has_complex_date_col and self.index_col is not None:
765+
if not self._name_processed:
766+
self.index_name = self._get_index_name()
767+
self._name_processed = True
768+
data = dict(((k, v) for k, v in df.iteritems()))
769+
columns = list(columns)
770+
index = self._get_index(data, col_order=columns, parse_dates=False)
771+
data = dict(((k, v.values) for k, v in data.iteritems()))
772+
df = DataFrame(data=data, columns=columns, index=index)
773+
749774
if self.squeeze and len(df.columns) == 1:
750775
return df[df.columns[0]]
751776
return df
752777

753-
def _extract_index(self, zipped_content):
778+
@property
def _has_complex_date_col(self):
    """Report whether ``self.parse_dates`` is a dict or a list of lists.

    Returns True when ``self.parse_dates`` is a dict, or when it is a
    non-empty list whose first element is itself a list. Returns False
    for every other value (bool, flat list, empty list, ...).
    """
    spec = self.parse_dates
    if isinstance(spec, dict):
        return True
    if not isinstance(spec, list):
        return False
    # Only the first entry is inspected: [[1, 2]] qualifies,
    # [1, [2, 3]] does not.
    return len(spec) > 0 and isinstance(spec[0], list)
784+
785+
def _get_index(self, data, col_order=None, parse_dates=True):
    """Extract the index column(s) from ``data`` and aggregate them.

    Dispatches on the container type of ``data``: a dict goes through
    ``_get_index_from_dict``, anything else through
    ``_get_index_from_list``. The extracted value is then handed to
    ``_agg_index`` together with ``parse_dates``, and that result is
    returned.

    ``col_order`` and ``parse_dates`` are forwarded unchanged to the
    chosen extractor.
    """
    if isinstance(data, dict):
        extract = self._get_index_from_dict
    else:
        extract = self._get_index_from_list

    raw_index = extract(data, col_order, parse_dates)
    return self._agg_index(raw_index, parse_dates)
792+
793+
def _get_index_from_list(self, data, col_names=None, parse_dates=True):
    """Pull the index column(s) out of a positional list of columns.

    ``data`` (and ``col_names`` when given) are modified in place: the
    entries selected by ``self.index_col`` are removed from both.

    Parameters
    ----------
    data : list
        Columns, addressed by position.
    col_names : list, optional
        Column labels parallel to ``data``. Required to resolve a string
        entry of ``self.index_col`` to a position.
    parse_dates : bool, default True
        Accepted but not used in this method.

    Returns
    -------
    The removed column when ``self.index_col`` is a scalar; otherwise a
    list of the selected columns, in ``self.index_col`` order.

    Raises
    ------
    ValueError
        If an index column is given by name and ``col_names`` is None,
        or the name does not occur in ``col_names``.
    """
    def _get_ix(icol):
        # Non-string entries are already positions.
        if not isinstance(icol, basestring):
            return icol

        if col_names is None:
            raise ValueError(('Must supply column order to use %s as '
                              'index') % icol)

        for i, c in enumerate(col_names):
            if c == icol:
                return i

        # Falling through used to return None, which only surfaced later
        # as an unrelated TypeError from list.pop(None).
        raise ValueError('Index column %s not found in columns' % icol)

    if np.isscalar(self.index_col):
        ix = _get_ix(self.index_col)
        index = data.pop(ix)
        if col_names is not None:
            col_names.pop(ix)
        return index

    # given a list of index
    positions = [_get_ix(idx) for idx in self.index_col]
    index = [data[i] for i in positions]

    # remove index items from content and columns; pop from the highest
    # position down so earlier pops do not shift the remaining ones
    for i in sorted(positions, reverse=True):
        data.pop(i)
        if col_names is not None:
            col_names.pop(i)

    return index
828+
829+
def _get_index_from_dict(self, data, col_names=None, parse_dates=True):
    """Pull the index column(s) out of a name-keyed dict of columns.

    ``data`` (and ``col_names`` when given) are modified in place: the
    entries selected by ``self.index_col`` are removed from both.

    Parameters
    ----------
    data : dict
        Columns keyed by column name.
    col_names : list, optional
        Column names in order. Required to resolve a positional
        (non-string) entry of ``self.index_col`` to a name.
    parse_dates : bool, default True
        Accepted but not used in this method.

    Returns
    -------
    The removed column when ``self.index_col`` is a scalar; otherwise a
    list of the selected columns, in ``self.index_col`` order.

    Raises
    ------
    ValueError
        If an index column is given by position and ``col_names`` is
        None, or no column sits at that position.
    """
    def _get_name(icol):
        # String entries are already column names.
        if isinstance(icol, basestring):
            return icol

        if col_names is None:
            raise ValueError(('Must supply column order to use %s as '
                              'index') % str(icol))

        for i, c in enumerate(col_names):
            if i == icol:
                return c

        # Falling through used to return None, which only surfaced later
        # as ``KeyError: None`` from the dict lookup.
        raise ValueError('Index column position %s not found in columns'
                         % str(icol))

    if np.isscalar(self.index_col):
        name = _get_name(self.index_col)
        index = data.pop(name)
        if col_names is not None:
            col_names.remove(name)
        return index

    # given a list of index
    to_remove = [_get_name(idx) for idx in self.index_col]
    index = [data[c] for c in to_remove]

    # remove index items from content and columns after collecting them;
    # removal is by name, so no particular order is needed
    for c in to_remove:
        data.pop(c)
        if col_names is not None:
            col_names.remove(c)

    return index
864+
865+
def _agg_index(self, index, parse_dates):
765866
if np.isscalar(self.index_col):
766-
if self._should_parse_dates(self.index_col):
867+
if parse_dates and self._should_parse_dates(self.index_col):
767868
index = self._conv_date(index)
768869
index, na_count = _convert_types(index, self.na_values)
769870
index = Index(index, name=self.index_name)
@@ -772,7 +873,7 @@ def _extract_index(self, zipped_content):
772873
else:
773874
arrays = []
774875
for i, arr in enumerate(index):
775-
if self._should_parse_dates(self.index_col[i]):
876+
if parse_dates and self._should_parse_dates(self.index_col[i]):
776877
arr = self._conv_date(arr)
777878
arr, _ = _convert_types(arr, self.na_values)
778879
arrays.append(arr)
@@ -801,11 +902,13 @@ def _should_parse_dates(self, i):
801902
if isinstance(self.parse_dates, bool):
802903
return self.parse_dates
803904
else:
804-
to_parse = self.parse_dates
905+
to_parse = self.parse_dates # int/string or list of int or string
906+
805907
if np.isscalar(self.index_col):
806908
name = self.index_name
807909
else:
808910
name = self.index_name[i]
911+
809912
return i in to_parse or name in to_parse
810913

811914
def _conv_date(self, *date_cols):

pandas/io/tests/test_parsers.py

+3
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,9 @@ def test_multiple_date_cols_index(self):
215215
index_col=0)
216216
assert_frame_equal(df2, df)
217217

218+
df3 = read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0)
219+
assert_frame_equal(df3, df)
220+
218221
def test_index_col_named(self):
219222
data = """\
220223
ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir

0 commit comments

Comments
 (0)