@@ -502,17 +502,16 @@ def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
502
502
else :
503
503
self .data = f
504
504
self .columns = self ._infer_columns ()
505
-
506
505
# needs to be cleaned/refactored
507
506
# multiple date column thing turning into a real spaghetti factory
508
507
509
508
# get popped off for index
510
509
self .orig_columns = list (self .columns )
511
-
512
510
self .index_name = None
513
511
self ._name_processed = False
514
512
if not self ._has_complex_date_col :
515
- self .index_name = self ._get_index_name ()
513
+ self .index_name , self .orig_columns , _ = (
514
+ self ._get_index_name (self .columns ))
516
515
self ._name_processed = True
517
516
self ._first_chunk = True
518
517
@@ -679,9 +678,9 @@ def __iter__(self):
679
678
680
679
_implicit_index = False
681
680
682
- def _get_index_name (self , columns = None ):
683
- if columns is None :
684
- columns = self . columns
681
+ def _get_index_name (self , columns ):
682
+ orig_columns = list ( columns )
683
+ columns = list ( columns )
685
684
686
685
try :
687
686
line = self ._next_line ()
@@ -701,10 +700,13 @@ def _get_index_name(self, columns=None):
701
700
implicit_first_cols = len (line ) - len (columns )
702
701
if next_line is not None :
703
702
if len (next_line ) == len (line ) + len (columns ):
703
+ # column and index names on diff rows
704
704
implicit_first_cols = 0
705
705
self .index_col = range (len (line ))
706
706
self .buf = self .buf [1 :]
707
- return line
707
+ for c in reversed (line ):
708
+ columns .insert (0 , c )
709
+ return line , columns , orig_columns
708
710
709
711
if implicit_first_cols > 0 :
710
712
self ._implicit_index = True
@@ -714,7 +716,15 @@ def _get_index_name(self, columns=None):
714
716
else :
715
717
self .index_col = range (implicit_first_cols )
716
718
index_name = None
717
- elif np .isscalar (self .index_col ):
719
+
720
+ else :
721
+ index_name = self ._explicit_index_names (columns )
722
+
723
+ return index_name , orig_columns , columns
724
+
725
+ def _explicit_index_names (self , columns ):
726
+ index_name = None
727
+ if np .isscalar (self .index_col ):
718
728
if isinstance (self .index_col , basestring ):
719
729
index_name = self .index_col
720
730
for i , c in enumerate (list (columns )):
@@ -723,7 +733,7 @@ def _get_index_name(self, columns=None):
723
733
columns .pop (i )
724
734
break
725
735
else :
726
- index_name = columns . pop ( self .index_col )
736
+ index_name = columns [ self .index_col ]
727
737
728
738
if index_name is not None and 'Unnamed' in index_name :
729
739
index_name = None
@@ -745,9 +755,37 @@ def _get_index_name(self, columns=None):
745
755
columns .remove (name )
746
756
index_name .append (name )
747
757
self .index_col = index_col
748
-
749
758
return index_name
750
759
760
def _rows_to_cols(self, content):
    """
    Turn a list of parsed rows into a list of columns and check its width.

    Parameters
    ----------
    content : list
        Parsed rows; each row must support ``len()``.

    Returns
    -------
    list
        The transposed result of ``lib.to_object_array(content)``, one
        entry per column.

    Raises
    ------
    ValueError
        If the number of columns found differs from the number expected,
        i.e. ``len(self.orig_columns)`` plus the implicit index columns.
    """
    zipped_content = list(lib.to_object_array(content).T)
    found = len(zipped_content)

    # Implicit index columns are present in the data but have no entry in
    # orig_columns, so they are added to the expected width.
    expected = len(self.orig_columns)
    if self._implicit_index:
        if np.isscalar(self.index_col):
            expected += 1
        else:
            expected += len(self.index_col)

    if expected == found:
        return zipped_content

    # Find the first row whose length is off; ``i`` keeps its last value
    # when no individual row differs.
    for (i, row) in enumerate(content):
        if len(row) != expected:
            break

    footers = self.skip_footer if self.skip_footer else 0
    # NOTE(review): presumably self.pos is the reader position after the
    # last row of ``content`` -- confirm against _next_line.
    row_num = self.pos - (len(content) - i + footers)

    raise ValueError('Expecting %d columns, got %d in row %d' %
                     (expected, found, row_num))
788
+
751
789
def get_chunk (self , rows = None ):
752
790
if rows is not None and self .skip_footer :
753
791
raise ValueError ('skip_footer not supported for iteration' )
@@ -763,103 +801,111 @@ def get_chunk(self, rows=None):
763
801
# done with first read, next time raise StopIteration
764
802
self ._first_chunk = False
765
803
804
+ columns = list (self .orig_columns )
766
805
if len (content ) == 0 : # pragma: no cover
767
806
if self .index_col is not None :
768
807
if np .isscalar (self .index_col ):
769
808
index = Index ([], name = self .index_name )
809
+ columns .pop (self .index_col )
770
810
else :
771
811
index = MultiIndex .from_arrays ([[]] * len (self .index_col ),
772
812
names = self .index_name )
813
+ for n in self .index_col :
814
+ columns .pop (n )
773
815
else :
774
816
index = Index ([])
775
817
776
- return DataFrame (index = index , columns = self . columns )
818
+ return DataFrame (index = index , columns = columns )
777
819
778
- zipped_content = list (lib .to_object_array (content ).T )
779
-
780
- if not self ._has_complex_date_col and self .index_col is not None :
781
- index = self ._get_simple_index (zipped_content )
782
- index = self ._agg_index (index )
783
- else :
784
- index = Index (np .arange (len (content )))
785
-
786
- col_len , zip_len = len (self .columns ), len (zipped_content )
787
- if col_len != zip_len :
788
- row_num = - 1
789
- for (i , l ) in enumerate (content ):
790
- if len (l ) != col_len :
791
- break
792
-
793
- footers = 0
794
- if self .skip_footer :
795
- footers = self .skip_footer
796
- row_num = self .pos - (len (content ) - i + footers )
797
-
798
- msg = ('Expecting %d columns, got %d in row %d' %
799
- (col_len , zip_len , row_num ))
800
- raise ValueError (msg )
801
-
802
- data = dict ((k , v ) for k , v in izip (self .columns , zipped_content ))
820
+ alldata = self ._rows_to_cols (content )
821
+ data = self ._exclude_implicit_index (alldata )
803
822
804
823
# apply converters
805
824
for col , f in self .converters .iteritems ():
806
- if isinstance (col , int ) and col not in self .columns :
807
- col = self .columns [col ]
825
+ if isinstance (col , int ) and col not in self .orig_columns :
826
+ col = self .orig_columns [col ]
808
827
data [col ] = lib .map_infer (data [col ], f )
809
828
810
829
data = _convert_to_ndarrays (data , self .na_values , self .verbose )
811
830
812
- columns = list (self .columns )
813
831
if self .parse_dates is not None :
814
832
data , columns = self ._process_date_conversion (data )
815
833
816
- df = DataFrame (data = data , columns = columns , index = index )
817
- if self ._has_complex_date_col and self .index_col is not None :
834
+ if self .index_col is None :
835
+ numrows = len (content )
836
+ index = Index (np .arange (numrows ))
837
+
838
+ elif not self ._has_complex_date_col :
839
+ index = self ._get_simple_index (alldata , columns )
840
+ index = self ._agg_index (index )
841
+
842
+ elif self ._has_complex_date_col :
818
843
if not self ._name_processed :
819
- self .index_name = self ._get_index_name (list (columns ))
844
+ self .index_name = self ._explicit_index_names (list (columns ))
820
845
self ._name_processed = True
821
- data = dict (((k , v ) for k , v in df .iteritems ()))
822
- index = self ._get_complex_date_index (data , col_names = columns ,
823
- parse_dates = False )
846
+ index = self ._get_complex_date_index (data , columns )
824
847
index = self ._agg_index (index , False )
825
- data = dict ((( k , v . values ) for k , v in data . iteritems ()))
826
- df = DataFrame (data = data , columns = columns , index = index )
848
+
849
+ df = DataFrame (data = data , columns = columns , index = index )
827
850
828
851
if self .squeeze and len (df .columns ) == 1 :
829
852
return df [df .columns [0 ]]
830
853
return df
831
854
855
+ def _exclude_implicit_index (self , alldata ):
856
+
857
+ if self ._implicit_index :
858
+ if np .isscalar (self .index_col ):
859
+ excl_indices = [self .index_col ]
860
+ else :
861
+ excl_indices = self .index_col
862
+ data = {}
863
+ offset = 0
864
+ for i , col in enumerate (self .orig_columns ):
865
+ while i + offset in excl_indices :
866
+ offset += 1
867
+ data [col ] = alldata [i + offset ]
868
+ else :
869
+ data = dict ((k , v ) for k , v in izip (self .orig_columns , alldata ))
870
+
871
+ return data
872
+
832
873
@property
833
874
def _has_complex_date_col (self ):
834
875
return (isinstance (self .parse_dates , dict ) or
835
876
(isinstance (self .parse_dates , list ) and
836
877
len (self .parse_dates ) > 0 and
837
878
isinstance (self .parse_dates [0 ], list )))
838
879
839
- def _get_simple_index (self , data ):
880
+ def _get_simple_index (self , data , columns ):
840
881
def ix (col ):
841
882
if not isinstance (col , basestring ):
842
883
return col
843
884
raise ValueError ('Index %s invalid' % col )
844
885
index = None
845
886
if np .isscalar (self .index_col ):
846
- index = data .pop (ix (self .index_col ))
887
+ i = ix (self .index_col )
888
+ index = data .pop (i )
889
+ if not self ._implicit_index :
890
+ columns .pop (i )
847
891
else : # given a list of index
848
892
to_remove = []
849
893
index = []
850
894
for idx in self .index_col :
851
895
i = ix (idx )
852
896
to_remove .append (i )
853
- index .append (data [idx ])
897
+ index .append (data [i ])
854
898
855
899
# remove index items from content and columns, don't pop in
856
900
# loop
857
901
for i in reversed (sorted (to_remove )):
858
902
data .pop (i )
903
+ if not self ._implicit_index :
904
+ columns .pop (i )
859
905
860
906
return index
861
907
862
- def _get_complex_date_index (self , data , col_names = None , parse_dates = True ):
908
+ def _get_complex_date_index (self , data , col_names ):
863
909
def _get_name (icol ):
864
910
if isinstance (icol , basestring ):
865
911
return icol
@@ -876,22 +922,20 @@ def _get_name(icol):
876
922
if np .isscalar (self .index_col ):
877
923
name = _get_name (self .index_col )
878
924
index = data .pop (name )
879
- if col_names is not None :
880
- col_names .remove (name )
925
+ col_names .remove (name )
881
926
else : # given a list of index
882
927
to_remove = []
883
928
index = []
884
929
for idx in self .index_col :
885
- c = _get_name (idx )
886
- to_remove .append (c )
887
- index .append (data [c ])
930
+ name = _get_name (idx )
931
+ to_remove .append (name )
932
+ index .append (data [name ])
888
933
889
934
# remove index items from content and columns, don't pop in
890
935
# loop
891
936
for c in reversed (sorted (to_remove )):
892
937
data .pop (c )
893
- if col_names is not None :
894
- col_names .remove (c )
938
+ col_names .remove (c )
895
939
896
940
return index
897
941
@@ -955,7 +999,7 @@ def _conv_date(self, *date_cols):
955
999
def _process_date_conversion (self , data_dict ):
956
1000
new_cols = []
957
1001
new_data = {}
958
- columns = self .columns
1002
+ columns = list ( self .orig_columns )
959
1003
date_cols = set ()
960
1004
961
1005
if self .parse_dates is None or isinstance (self .parse_dates , bool ):
@@ -1126,7 +1170,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
1126
1170
1127
1171
def _concat_date_cols (date_cols ):
1128
1172
if len (date_cols ) == 1 :
1129
- return date_cols [0 ]
1173
+ return np . array ([ str ( x ) for x in date_cols [0 ]], dtype = object )
1130
1174
1131
1175
# stripped = [map(str.strip, x) for x in date_cols]
1132
1176
rs = np .array ([' ' .join ([str (y ) for y in x ])
0 commit comments