Skip to content

Commit 513267d

Browse files
author
Chang She
committed
BUG: converters and index_col on same column #1835. Also minor refactor of parsers
1 parent 3187a90 commit 513267d

File tree

2 files changed

+110
-60
lines changed

2 files changed

+110
-60
lines changed

pandas/io/parsers.py

Lines changed: 100 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -502,17 +502,16 @@ def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
502502
else:
503503
self.data = f
504504
self.columns = self._infer_columns()
505-
506505
# needs to be cleaned/refactored
507506
# multiple date column thing turning into a real spaghetti factory
508507

509508
# get popped off for index
510509
self.orig_columns = list(self.columns)
511-
512510
self.index_name = None
513511
self._name_processed = False
514512
if not self._has_complex_date_col:
515-
self.index_name = self._get_index_name()
513+
self.index_name, self.orig_columns, _ = (
514+
self._get_index_name(self.columns))
516515
self._name_processed = True
517516
self._first_chunk = True
518517

@@ -679,9 +678,9 @@ def __iter__(self):
679678

680679
_implicit_index = False
681680

682-
def _get_index_name(self, columns=None):
683-
if columns is None:
684-
columns = self.columns
681+
def _get_index_name(self, columns):
682+
orig_columns = list(columns)
683+
columns = list(columns)
685684

686685
try:
687686
line = self._next_line()
@@ -701,10 +700,13 @@ def _get_index_name(self, columns=None):
701700
implicit_first_cols = len(line) - len(columns)
702701
if next_line is not None:
703702
if len(next_line) == len(line) + len(columns):
703+
# column and index names on different rows
704704
implicit_first_cols = 0
705705
self.index_col = range(len(line))
706706
self.buf = self.buf[1:]
707-
return line
707+
for c in reversed(line):
708+
columns.insert(0, c)
709+
return line, columns, orig_columns
708710

709711
if implicit_first_cols > 0:
710712
self._implicit_index = True
@@ -714,7 +716,15 @@ def _get_index_name(self, columns=None):
714716
else:
715717
self.index_col = range(implicit_first_cols)
716718
index_name = None
717-
elif np.isscalar(self.index_col):
719+
720+
else:
721+
index_name = self._explicit_index_names(columns)
722+
723+
return index_name, orig_columns, columns
724+
725+
def _explicit_index_names(self, columns):
726+
index_name = None
727+
if np.isscalar(self.index_col):
718728
if isinstance(self.index_col, basestring):
719729
index_name = self.index_col
720730
for i, c in enumerate(list(columns)):
@@ -723,7 +733,7 @@ def _get_index_name(self, columns=None):
723733
columns.pop(i)
724734
break
725735
else:
726-
index_name = columns.pop(self.index_col)
736+
index_name = columns[self.index_col]
727737

728738
if index_name is not None and 'Unnamed' in index_name:
729739
index_name = None
@@ -745,9 +755,37 @@ def _get_index_name(self, columns=None):
745755
columns.remove(name)
746756
index_name.append(name)
747757
self.index_col = index_col
748-
749758
return index_name
750759

760+
def _rows_to_cols(self, content):
761+
zipped_content = list(lib.to_object_array(content).T)
762+
763+
col_len = len(self.orig_columns)
764+
zip_len = len(zipped_content)
765+
766+
if self._implicit_index:
767+
if np.isscalar(self.index_col):
768+
col_len += 1
769+
else:
770+
col_len += len(self.index_col)
771+
772+
if col_len != zip_len:
773+
row_num = -1
774+
for (i, l) in enumerate(content):
775+
if len(l) != col_len:
776+
break
777+
778+
footers = 0
779+
if self.skip_footer:
780+
footers = self.skip_footer
781+
row_num = self.pos - (len(content) - i + footers)
782+
783+
msg = ('Expecting %d columns, got %d in row %d' %
784+
(col_len, zip_len, row_num))
785+
raise ValueError(msg)
786+
787+
return zipped_content
788+
751789
def get_chunk(self, rows=None):
752790
if rows is not None and self.skip_footer:
753791
raise ValueError('skip_footer not supported for iteration')
@@ -775,91 +813,96 @@ def get_chunk(self, rows=None):
775813

776814
return DataFrame(index=index, columns=self.columns)
777815

778-
zipped_content = list(lib.to_object_array(content).T)
779-
780-
if not self._has_complex_date_col and self.index_col is not None:
781-
index = self._get_simple_index(zipped_content)
782-
index = self._agg_index(index)
783-
else:
784-
index = Index(np.arange(len(content)))
785-
786-
col_len, zip_len = len(self.columns), len(zipped_content)
787-
if col_len != zip_len:
788-
row_num = -1
789-
for (i, l) in enumerate(content):
790-
if len(l) != col_len:
791-
break
792-
793-
footers = 0
794-
if self.skip_footer:
795-
footers = self.skip_footer
796-
row_num = self.pos - (len(content) - i + footers)
797-
798-
msg = ('Expecting %d columns, got %d in row %d' %
799-
(col_len, zip_len, row_num))
800-
raise ValueError(msg)
801-
802-
data = dict((k, v) for k, v in izip(self.columns, zipped_content))
816+
alldata = self._rows_to_cols(content)
817+
data = self._exclude_implicit_index(alldata)
803818

804819
# apply converters
805820
for col, f in self.converters.iteritems():
806-
if isinstance(col, int) and col not in self.columns:
807-
col = self.columns[col]
821+
if isinstance(col, int) and col not in self.orig_columns:
822+
col = self.orig_columns[col]
808823
data[col] = lib.map_infer(data[col], f)
809824

810825
data = _convert_to_ndarrays(data, self.na_values, self.verbose)
811826

812-
columns = list(self.columns)
827+
columns = list(self.orig_columns)
813828
if self.parse_dates is not None:
814829
data, columns = self._process_date_conversion(data)
815830

816-
df = DataFrame(data=data, columns=columns, index=index)
817-
if self._has_complex_date_col and self.index_col is not None:
831+
if self.index_col is None:
832+
numrows = len(content)
833+
index = Index(np.arange(numrows))
834+
835+
elif not self._has_complex_date_col:
836+
index = self._get_simple_index(alldata, columns)
837+
index = self._agg_index(index)
838+
839+
elif self._has_complex_date_col:
818840
if not self._name_processed:
819-
self.index_name = self._get_index_name(list(columns))
841+
self.index_name = self._explicit_index_names(list(columns))
820842
self._name_processed = True
821-
data = dict(((k, v) for k, v in df.iteritems()))
822-
index = self._get_complex_date_index(data, col_names=columns,
823-
parse_dates=False)
843+
index = self._get_complex_date_index(data, columns)
824844
index = self._agg_index(index, False)
825-
data = dict(((k, v.values) for k, v in data.iteritems()))
826-
df = DataFrame(data=data, columns=columns, index=index)
845+
846+
df = DataFrame(data=data, columns=columns, index=index)
827847

828848
if self.squeeze and len(df.columns) == 1:
829849
return df[df.columns[0]]
830850
return df
831851

852+
def _exclude_implicit_index(self, alldata):
853+
854+
if self._implicit_index:
855+
if np.isscalar(self.index_col):
856+
excl_indices = [self.index_col]
857+
else:
858+
excl_indices = self.index_col
859+
data = {}
860+
offset = 0
861+
for i, col in enumerate(self.orig_columns):
862+
while i + offset in excl_indices:
863+
offset += 1
864+
data[col] = alldata[i + offset]
865+
else:
866+
data = dict((k, v) for k, v in izip(self.orig_columns, alldata))
867+
868+
return data
869+
832870
@property
833871
def _has_complex_date_col(self):
834872
return (isinstance(self.parse_dates, dict) or
835873
(isinstance(self.parse_dates, list) and
836874
len(self.parse_dates) > 0 and
837875
isinstance(self.parse_dates[0], list)))
838876

839-
def _get_simple_index(self, data):
877+
def _get_simple_index(self, data, columns):
840878
def ix(col):
841879
if not isinstance(col, basestring):
842880
return col
843881
raise ValueError('Index %s invalid' % col)
844882
index = None
845883
if np.isscalar(self.index_col):
846-
index = data.pop(ix(self.index_col))
884+
i = ix(self.index_col)
885+
index = data.pop(i)
886+
if not self._implicit_index:
887+
columns.pop(i)
847888
else: # given a list of index
848889
to_remove = []
849890
index = []
850891
for idx in self.index_col:
851892
i = ix(idx)
852893
to_remove.append(i)
853-
index.append(data[idx])
894+
index.append(data[i])
854895

855896
# remove index items from content and columns, don't pop in
856897
# loop
857898
for i in reversed(sorted(to_remove)):
858899
data.pop(i)
900+
if not self._implicit_index:
901+
columns.pop(i)
859902

860903
return index
861904

862-
def _get_complex_date_index(self, data, col_names=None, parse_dates=True):
905+
def _get_complex_date_index(self, data, col_names):
863906
def _get_name(icol):
864907
if isinstance(icol, basestring):
865908
return icol
@@ -876,22 +919,20 @@ def _get_name(icol):
876919
if np.isscalar(self.index_col):
877920
name = _get_name(self.index_col)
878921
index = data.pop(name)
879-
if col_names is not None:
880-
col_names.remove(name)
922+
col_names.remove(name)
881923
else: # given a list of index
882924
to_remove = []
883925
index = []
884926
for idx in self.index_col:
885-
c = _get_name(idx)
886-
to_remove.append(c)
887-
index.append(data[c])
927+
name = _get_name(idx)
928+
to_remove.append(name)
929+
index.append(data[name])
888930

889931
# remove index items from content and columns, don't pop in
890932
# loop
891933
for c in reversed(sorted(to_remove)):
892934
data.pop(c)
893-
if col_names is not None:
894-
col_names.remove(c)
935+
col_names.remove(c)
895936

896937
return index
897938

@@ -955,7 +996,7 @@ def _conv_date(self, *date_cols):
955996
def _process_date_conversion(self, data_dict):
956997
new_cols = []
957998
new_data = {}
958-
columns = self.columns
999+
columns = list(self.orig_columns)
9591000
date_cols = set()
9601001

9611002
if self.parse_dates is None or isinstance(self.parse_dates, bool):
@@ -1126,7 +1167,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
11261167

11271168
def _concat_date_cols(date_cols):
11281169
if len(date_cols) == 1:
1129-
return date_cols[0]
1170+
return np.array([str(x) for x in date_cols[0]], dtype=object)
11301171

11311172
# stripped = [map(str.strip, x) for x in date_cols]
11321173
rs = np.array([' '.join([str(y) for y in x])

pandas/io/tests/test_parsers.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,15 @@ def test_index_col_named(self):
383383
assert_frame_equal(xp, rs)
384384
self.assert_(xp.index.name == rs.index.name)
385385

386+
def test_converter_index_col_bug(self):
387+
#1835
388+
data = "A;B\n1;2\n3;4"
389+
rs = read_csv(StringIO(data), sep=';', index_col='A',
390+
converters={'A' : lambda x: x})
391+
xp = DataFrame({'B' : [2, 4]}, index=Index([1, 3], name='A'))
392+
assert_frame_equal(rs, xp)
393+
self.assert_(rs.index.name == xp.index.name)
394+
386395
def test_multiple_skts_example(self):
387396
data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11."
388397
pass
@@ -1182,7 +1191,7 @@ def test_verbose_import(self):
11821191
try:
11831192
# it works!
11841193
df = read_csv(StringIO(text), verbose=True, index_col=0)
1185-
self.assert_(buf.getvalue() == 'Found 1 NA values in the index\n')
1194+
self.assert_(buf.getvalue() == 'Filled 1 NA values in column a\n')
11861195
finally:
11871196
sys.stdout = sys.__stdout__
11881197

0 commit comments

Comments
 (0)