Skip to content

Commit e161fe8

Browse files
committed
Merge remote-tracking branch 'chang/conv-index-col'
* chang/conv-index-col: BUG: fixed empty case #1835; BUG: converters and index_col on same column #1835. Also minor refactor of parsers
2 parents ddcc30a + 2f917e8 commit e161fe8

File tree

2 files changed

+114
-61
lines changed

2 files changed

+114
-61
lines changed

pandas/io/parsers.py

Lines changed: 104 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -502,17 +502,16 @@ def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
502502
else:
503503
self.data = f
504504
self.columns = self._infer_columns()
505-
506505
# needs to be cleaned/refactored
507506
# multiple date column thing turning into a real spaghetti factory
508507

509508
# get popped off for index
510509
self.orig_columns = list(self.columns)
511-
512510
self.index_name = None
513511
self._name_processed = False
514512
if not self._has_complex_date_col:
515-
self.index_name = self._get_index_name()
513+
self.index_name, self.orig_columns, _ = (
514+
self._get_index_name(self.columns))
516515
self._name_processed = True
517516
self._first_chunk = True
518517

@@ -679,9 +678,9 @@ def __iter__(self):
679678

680679
_implicit_index = False
681680

682-
def _get_index_name(self, columns=None):
683-
if columns is None:
684-
columns = self.columns
681+
def _get_index_name(self, columns):
682+
orig_columns = list(columns)
683+
columns = list(columns)
685684

686685
try:
687686
line = self._next_line()
@@ -701,10 +700,13 @@ def _get_index_name(self, columns=None):
701700
implicit_first_cols = len(line) - len(columns)
702701
if next_line is not None:
703702
if len(next_line) == len(line) + len(columns):
703+
# column and index names on different rows
704704
implicit_first_cols = 0
705705
self.index_col = range(len(line))
706706
self.buf = self.buf[1:]
707-
return line
707+
for c in reversed(line):
708+
columns.insert(0, c)
709+
return line, columns, orig_columns
708710

709711
if implicit_first_cols > 0:
710712
self._implicit_index = True
@@ -714,7 +716,15 @@ def _get_index_name(self, columns=None):
714716
else:
715717
self.index_col = range(implicit_first_cols)
716718
index_name = None
717-
elif np.isscalar(self.index_col):
719+
720+
else:
721+
index_name = self._explicit_index_names(columns)
722+
723+
return index_name, orig_columns, columns
724+
725+
def _explicit_index_names(self, columns):
726+
index_name = None
727+
if np.isscalar(self.index_col):
718728
if isinstance(self.index_col, basestring):
719729
index_name = self.index_col
720730
for i, c in enumerate(list(columns)):
@@ -723,7 +733,7 @@ def _get_index_name(self, columns=None):
723733
columns.pop(i)
724734
break
725735
else:
726-
index_name = columns.pop(self.index_col)
736+
index_name = columns[self.index_col]
727737

728738
if index_name is not None and 'Unnamed' in index_name:
729739
index_name = None
@@ -745,9 +755,37 @@ def _get_index_name(self, columns=None):
745755
columns.remove(name)
746756
index_name.append(name)
747757
self.index_col = index_col
748-
749758
return index_name
750759

760+
def _rows_to_cols(self, content):
761+
zipped_content = list(lib.to_object_array(content).T)
762+
763+
col_len = len(self.orig_columns)
764+
zip_len = len(zipped_content)
765+
766+
if self._implicit_index:
767+
if np.isscalar(self.index_col):
768+
col_len += 1
769+
else:
770+
col_len += len(self.index_col)
771+
772+
if col_len != zip_len:
773+
row_num = -1
774+
for (i, l) in enumerate(content):
775+
if len(l) != col_len:
776+
break
777+
778+
footers = 0
779+
if self.skip_footer:
780+
footers = self.skip_footer
781+
row_num = self.pos - (len(content) - i + footers)
782+
783+
msg = ('Expecting %d columns, got %d in row %d' %
784+
(col_len, zip_len, row_num))
785+
raise ValueError(msg)
786+
787+
return zipped_content
788+
751789
def get_chunk(self, rows=None):
752790
if rows is not None and self.skip_footer:
753791
raise ValueError('skip_footer not supported for iteration')
@@ -763,103 +801,111 @@ def get_chunk(self, rows=None):
763801
# done with first read, next time raise StopIteration
764802
self._first_chunk = False
765803

804+
columns = list(self.orig_columns)
766805
if len(content) == 0: # pragma: no cover
767806
if self.index_col is not None:
768807
if np.isscalar(self.index_col):
769808
index = Index([], name=self.index_name)
809+
columns.pop(self.index_col)
770810
else:
771811
index = MultiIndex.from_arrays([[]] * len(self.index_col),
772812
names=self.index_name)
813+
for n in self.index_col:
814+
columns.pop(n)
773815
else:
774816
index = Index([])
775817

776-
return DataFrame(index=index, columns=self.columns)
818+
return DataFrame(index=index, columns=columns)
777819

778-
zipped_content = list(lib.to_object_array(content).T)
779-
780-
if not self._has_complex_date_col and self.index_col is not None:
781-
index = self._get_simple_index(zipped_content)
782-
index = self._agg_index(index)
783-
else:
784-
index = Index(np.arange(len(content)))
785-
786-
col_len, zip_len = len(self.columns), len(zipped_content)
787-
if col_len != zip_len:
788-
row_num = -1
789-
for (i, l) in enumerate(content):
790-
if len(l) != col_len:
791-
break
792-
793-
footers = 0
794-
if self.skip_footer:
795-
footers = self.skip_footer
796-
row_num = self.pos - (len(content) - i + footers)
797-
798-
msg = ('Expecting %d columns, got %d in row %d' %
799-
(col_len, zip_len, row_num))
800-
raise ValueError(msg)
801-
802-
data = dict((k, v) for k, v in izip(self.columns, zipped_content))
820+
alldata = self._rows_to_cols(content)
821+
data = self._exclude_implicit_index(alldata)
803822

804823
# apply converters
805824
for col, f in self.converters.iteritems():
806-
if isinstance(col, int) and col not in self.columns:
807-
col = self.columns[col]
825+
if isinstance(col, int) and col not in self.orig_columns:
826+
col = self.orig_columns[col]
808827
data[col] = lib.map_infer(data[col], f)
809828

810829
data = _convert_to_ndarrays(data, self.na_values, self.verbose)
811830

812-
columns = list(self.columns)
813831
if self.parse_dates is not None:
814832
data, columns = self._process_date_conversion(data)
815833

816-
df = DataFrame(data=data, columns=columns, index=index)
817-
if self._has_complex_date_col and self.index_col is not None:
834+
if self.index_col is None:
835+
numrows = len(content)
836+
index = Index(np.arange(numrows))
837+
838+
elif not self._has_complex_date_col:
839+
index = self._get_simple_index(alldata, columns)
840+
index = self._agg_index(index)
841+
842+
elif self._has_complex_date_col:
818843
if not self._name_processed:
819-
self.index_name = self._get_index_name(list(columns))
844+
self.index_name = self._explicit_index_names(list(columns))
820845
self._name_processed = True
821-
data = dict(((k, v) for k, v in df.iteritems()))
822-
index = self._get_complex_date_index(data, col_names=columns,
823-
parse_dates=False)
846+
index = self._get_complex_date_index(data, columns)
824847
index = self._agg_index(index, False)
825-
data = dict(((k, v.values) for k, v in data.iteritems()))
826-
df = DataFrame(data=data, columns=columns, index=index)
848+
849+
df = DataFrame(data=data, columns=columns, index=index)
827850

828851
if self.squeeze and len(df.columns) == 1:
829852
return df[df.columns[0]]
830853
return df
831854

855+
def _exclude_implicit_index(self, alldata):
856+
857+
if self._implicit_index:
858+
if np.isscalar(self.index_col):
859+
excl_indices = [self.index_col]
860+
else:
861+
excl_indices = self.index_col
862+
data = {}
863+
offset = 0
864+
for i, col in enumerate(self.orig_columns):
865+
while i + offset in excl_indices:
866+
offset += 1
867+
data[col] = alldata[i + offset]
868+
else:
869+
data = dict((k, v) for k, v in izip(self.orig_columns, alldata))
870+
871+
return data
872+
832873
@property
833874
def _has_complex_date_col(self):
834875
return (isinstance(self.parse_dates, dict) or
835876
(isinstance(self.parse_dates, list) and
836877
len(self.parse_dates) > 0 and
837878
isinstance(self.parse_dates[0], list)))
838879

839-
def _get_simple_index(self, data):
880+
def _get_simple_index(self, data, columns):
840881
def ix(col):
841882
if not isinstance(col, basestring):
842883
return col
843884
raise ValueError('Index %s invalid' % col)
844885
index = None
845886
if np.isscalar(self.index_col):
846-
index = data.pop(ix(self.index_col))
887+
i = ix(self.index_col)
888+
index = data.pop(i)
889+
if not self._implicit_index:
890+
columns.pop(i)
847891
else: # given a list of index
848892
to_remove = []
849893
index = []
850894
for idx in self.index_col:
851895
i = ix(idx)
852896
to_remove.append(i)
853-
index.append(data[idx])
897+
index.append(data[i])
854898

855899
# remove index items from content and columns, don't pop in
856900
# loop
857901
for i in reversed(sorted(to_remove)):
858902
data.pop(i)
903+
if not self._implicit_index:
904+
columns.pop(i)
859905

860906
return index
861907

862-
def _get_complex_date_index(self, data, col_names=None, parse_dates=True):
908+
def _get_complex_date_index(self, data, col_names):
863909
def _get_name(icol):
864910
if isinstance(icol, basestring):
865911
return icol
@@ -876,22 +922,20 @@ def _get_name(icol):
876922
if np.isscalar(self.index_col):
877923
name = _get_name(self.index_col)
878924
index = data.pop(name)
879-
if col_names is not None:
880-
col_names.remove(name)
925+
col_names.remove(name)
881926
else: # given a list of index
882927
to_remove = []
883928
index = []
884929
for idx in self.index_col:
885-
c = _get_name(idx)
886-
to_remove.append(c)
887-
index.append(data[c])
930+
name = _get_name(idx)
931+
to_remove.append(name)
932+
index.append(data[name])
888933

889934
# remove index items from content and columns, don't pop in
890935
# loop
891936
for c in reversed(sorted(to_remove)):
892937
data.pop(c)
893-
if col_names is not None:
894-
col_names.remove(c)
938+
col_names.remove(c)
895939

896940
return index
897941

@@ -955,7 +999,7 @@ def _conv_date(self, *date_cols):
955999
def _process_date_conversion(self, data_dict):
9561000
new_cols = []
9571001
new_data = {}
958-
columns = self.columns
1002+
columns = list(self.orig_columns)
9591003
date_cols = set()
9601004

9611005
if self.parse_dates is None or isinstance(self.parse_dates, bool):
@@ -1126,7 +1170,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
11261170

11271171
def _concat_date_cols(date_cols):
11281172
if len(date_cols) == 1:
1129-
return date_cols[0]
1173+
return np.array([str(x) for x in date_cols[0]], dtype=object)
11301174

11311175
# stripped = [map(str.strip, x) for x in date_cols]
11321176
rs = np.array([' '.join([str(y) for y in x])

pandas/io/tests/test_parsers.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,15 @@ def test_index_col_named(self):
398398
assert_frame_equal(xp, rs)
399399
self.assert_(xp.index.name == rs.index.name)
400400

401+
def test_converter_index_col_bug(self):
402+
#1835
403+
data = "A;B\n1;2\n3;4"
404+
rs = read_csv(StringIO(data), sep=';', index_col='A',
405+
converters={'A' : lambda x: x})
406+
xp = DataFrame({'B' : [2, 4]}, index=Index([1, 3], name='A'))
407+
assert_frame_equal(rs, xp)
408+
self.assert_(rs.index.name == xp.index.name)
409+
401410
def test_multiple_skts_example(self):
402411
data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11."
403412
pass
@@ -1197,7 +1206,7 @@ def test_verbose_import(self):
11971206
try:
11981207
# it works!
11991208
df = read_csv(StringIO(text), verbose=True, index_col=0)
1200-
self.assert_(buf.getvalue() == 'Found 1 NA values in the index\n')
1209+
self.assert_(buf.getvalue() == 'Filled 1 NA values in column a\n')
12011210
finally:
12021211
sys.stdout = sys.__stdout__
12031212

0 commit comments

Comments
 (0)