@@ -502,17 +502,16 @@ def __init__(self, f, delimiter=None, dialect=None, names=None, header=0,
502
502
else :
503
503
self .data = f
504
504
self .columns = self ._infer_columns ()
505
-
506
505
# needs to be cleaned/refactored
507
506
# multiple date column thing turning into a real spaghetti factory
508
507
509
508
# get popped off for index
510
509
self .orig_columns = list (self .columns )
511
-
512
510
self .index_name = None
513
511
self ._name_processed = False
514
512
if not self ._has_complex_date_col :
515
- self .index_name = self ._get_index_name ()
513
+ self .index_name , self .orig_columns , _ = (
514
+ self ._get_index_name (self .columns ))
516
515
self ._name_processed = True
517
516
self ._first_chunk = True
518
517
@@ -679,9 +678,9 @@ def __iter__(self):
679
678
680
679
_implicit_index = False
681
680
682
- def _get_index_name (self , columns = None ):
683
- if columns is None :
684
- columns = self . columns
681
+ def _get_index_name (self , columns ):
682
+ orig_columns = list ( columns )
683
+ columns = list ( columns )
685
684
686
685
try :
687
686
line = self ._next_line ()
@@ -701,10 +700,13 @@ def _get_index_name(self, columns=None):
701
700
implicit_first_cols = len (line ) - len (columns )
702
701
if next_line is not None :
703
702
if len (next_line ) == len (line ) + len (columns ):
703
+ # column and index names on diff rows
704
704
implicit_first_cols = 0
705
705
self .index_col = range (len (line ))
706
706
self .buf = self .buf [1 :]
707
- return line
707
+ for c in reversed (line ):
708
+ columns .insert (0 , c )
709
+ return line , columns , orig_columns
708
710
709
711
if implicit_first_cols > 0 :
710
712
self ._implicit_index = True
@@ -714,7 +716,15 @@ def _get_index_name(self, columns=None):
714
716
else :
715
717
self .index_col = range (implicit_first_cols )
716
718
index_name = None
717
- elif np .isscalar (self .index_col ):
719
+
720
+ else :
721
+ index_name = self ._explicit_index_names (columns )
722
+
723
+ return index_name , orig_columns , columns
724
+
725
+ def _explicit_index_names (self , columns ):
726
+ index_name = None
727
+ if np .isscalar (self .index_col ):
718
728
if isinstance (self .index_col , basestring ):
719
729
index_name = self .index_col
720
730
for i , c in enumerate (list (columns )):
@@ -723,7 +733,7 @@ def _get_index_name(self, columns=None):
723
733
columns .pop (i )
724
734
break
725
735
else :
726
- index_name = columns . pop ( self .index_col )
736
+ index_name = columns [ self .index_col ]
727
737
728
738
if index_name is not None and 'Unnamed' in index_name :
729
739
index_name = None
@@ -745,9 +755,37 @@ def _get_index_name(self, columns=None):
745
755
columns .remove (name )
746
756
index_name .append (name )
747
757
self .index_col = index_col
748
-
749
758
return index_name
750
759
760
+ def _rows_to_cols (self , content ):
761
+ zipped_content = list (lib .to_object_array (content ).T )
762
+
763
+ col_len = len (self .orig_columns )
764
+ zip_len = len (zipped_content )
765
+
766
+ if self ._implicit_index :
767
+ if np .isscalar (self .index_col ):
768
+ col_len += 1
769
+ else :
770
+ col_len += len (self .index_col )
771
+
772
+ if col_len != zip_len :
773
+ row_num = - 1
774
+ for (i , l ) in enumerate (content ):
775
+ if len (l ) != col_len :
776
+ break
777
+
778
+ footers = 0
779
+ if self .skip_footer :
780
+ footers = self .skip_footer
781
+ row_num = self .pos - (len (content ) - i + footers )
782
+
783
+ msg = ('Expecting %d columns, got %d in row %d' %
784
+ (col_len , zip_len , row_num ))
785
+ raise ValueError (msg )
786
+
787
+ return zipped_content
788
+
751
789
def get_chunk (self , rows = None ):
752
790
if rows is not None and self .skip_footer :
753
791
raise ValueError ('skip_footer not supported for iteration' )
@@ -775,91 +813,96 @@ def get_chunk(self, rows=None):
775
813
776
814
return DataFrame (index = index , columns = self .columns )
777
815
778
- zipped_content = list (lib .to_object_array (content ).T )
779
-
780
- if not self ._has_complex_date_col and self .index_col is not None :
781
- index = self ._get_simple_index (zipped_content )
782
- index = self ._agg_index (index )
783
- else :
784
- index = Index (np .arange (len (content )))
785
-
786
- col_len , zip_len = len (self .columns ), len (zipped_content )
787
- if col_len != zip_len :
788
- row_num = - 1
789
- for (i , l ) in enumerate (content ):
790
- if len (l ) != col_len :
791
- break
792
-
793
- footers = 0
794
- if self .skip_footer :
795
- footers = self .skip_footer
796
- row_num = self .pos - (len (content ) - i + footers )
797
-
798
- msg = ('Expecting %d columns, got %d in row %d' %
799
- (col_len , zip_len , row_num ))
800
- raise ValueError (msg )
801
-
802
- data = dict ((k , v ) for k , v in izip (self .columns , zipped_content ))
816
+ alldata = self ._rows_to_cols (content )
817
+ data = self ._exclude_implicit_index (alldata )
803
818
804
819
# apply converters
805
820
for col , f in self .converters .iteritems ():
806
- if isinstance (col , int ) and col not in self .columns :
807
- col = self .columns [col ]
821
+ if isinstance (col , int ) and col not in self .orig_columns :
822
+ col = self .orig_columns [col ]
808
823
data [col ] = lib .map_infer (data [col ], f )
809
824
810
825
data = _convert_to_ndarrays (data , self .na_values , self .verbose )
811
826
812
- columns = list (self .columns )
827
+ columns = list (self .orig_columns )
813
828
if self .parse_dates is not None :
814
829
data , columns = self ._process_date_conversion (data )
815
830
816
- df = DataFrame (data = data , columns = columns , index = index )
817
- if self ._has_complex_date_col and self .index_col is not None :
831
+ if self .index_col is None :
832
+ numrows = len (content )
833
+ index = Index (np .arange (numrows ))
834
+
835
+ elif not self ._has_complex_date_col :
836
+ index = self ._get_simple_index (alldata , columns )
837
+ index = self ._agg_index (index )
838
+
839
+ elif self ._has_complex_date_col :
818
840
if not self ._name_processed :
819
- self .index_name = self ._get_index_name (list (columns ))
841
+ self .index_name = self ._explicit_index_names (list (columns ))
820
842
self ._name_processed = True
821
- data = dict (((k , v ) for k , v in df .iteritems ()))
822
- index = self ._get_complex_date_index (data , col_names = columns ,
823
- parse_dates = False )
843
+ index = self ._get_complex_date_index (data , columns )
824
844
index = self ._agg_index (index , False )
825
- data = dict ((( k , v . values ) for k , v in data . iteritems ()))
826
- df = DataFrame (data = data , columns = columns , index = index )
845
+
846
+ df = DataFrame (data = data , columns = columns , index = index )
827
847
828
848
if self .squeeze and len (df .columns ) == 1 :
829
849
return df [df .columns [0 ]]
830
850
return df
831
851
852
+ def _exclude_implicit_index (self , alldata ):
853
+
854
+ if self ._implicit_index :
855
+ if np .isscalar (self .index_col ):
856
+ excl_indices = [self .index_col ]
857
+ else :
858
+ excl_indices = self .index_col
859
+ data = {}
860
+ offset = 0
861
+ for i , col in enumerate (self .orig_columns ):
862
+ while i + offset in excl_indices :
863
+ offset += 1
864
+ data [col ] = alldata [i + offset ]
865
+ else :
866
+ data = dict ((k , v ) for k , v in izip (self .orig_columns , alldata ))
867
+
868
+ return data
869
+
832
870
@property
833
871
def _has_complex_date_col (self ):
834
872
return (isinstance (self .parse_dates , dict ) or
835
873
(isinstance (self .parse_dates , list ) and
836
874
len (self .parse_dates ) > 0 and
837
875
isinstance (self .parse_dates [0 ], list )))
838
876
839
- def _get_simple_index (self , data ):
877
+ def _get_simple_index (self , data , columns ):
840
878
def ix (col ):
841
879
if not isinstance (col , basestring ):
842
880
return col
843
881
raise ValueError ('Index %s invalid' % col )
844
882
index = None
845
883
if np .isscalar (self .index_col ):
846
- index = data .pop (ix (self .index_col ))
884
+ i = ix (self .index_col )
885
+ index = data .pop (i )
886
+ if not self ._implicit_index :
887
+ columns .pop (i )
847
888
else : # given a list of index
848
889
to_remove = []
849
890
index = []
850
891
for idx in self .index_col :
851
892
i = ix (idx )
852
893
to_remove .append (i )
853
- index .append (data [idx ])
894
+ index .append (data [i ])
854
895
855
896
# remove index items from content and columns, don't pop in
856
897
# loop
857
898
for i in reversed (sorted (to_remove )):
858
899
data .pop (i )
900
+ if not self ._implicit_index :
901
+ columns .pop (i )
859
902
860
903
return index
861
904
862
- def _get_complex_date_index (self , data , col_names = None , parse_dates = True ):
905
+ def _get_complex_date_index (self , data , col_names ):
863
906
def _get_name (icol ):
864
907
if isinstance (icol , basestring ):
865
908
return icol
@@ -876,22 +919,20 @@ def _get_name(icol):
876
919
if np .isscalar (self .index_col ):
877
920
name = _get_name (self .index_col )
878
921
index = data .pop (name )
879
- if col_names is not None :
880
- col_names .remove (name )
922
+ col_names .remove (name )
881
923
else : # given a list of index
882
924
to_remove = []
883
925
index = []
884
926
for idx in self .index_col :
885
- c = _get_name (idx )
886
- to_remove .append (c )
887
- index .append (data [c ])
927
+ name = _get_name (idx )
928
+ to_remove .append (name )
929
+ index .append (data [name ])
888
930
889
931
# remove index items from content and columns, don't pop in
890
932
# loop
891
933
for c in reversed (sorted (to_remove )):
892
934
data .pop (c )
893
- if col_names is not None :
894
- col_names .remove (c )
935
+ col_names .remove (c )
895
936
896
937
return index
897
938
@@ -955,7 +996,7 @@ def _conv_date(self, *date_cols):
955
996
def _process_date_conversion (self , data_dict ):
956
997
new_cols = []
957
998
new_data = {}
958
- columns = self .columns
999
+ columns = list ( self .orig_columns )
959
1000
date_cols = set ()
960
1001
961
1002
if self .parse_dates is None or isinstance (self .parse_dates , bool ):
@@ -1126,7 +1167,7 @@ def _try_convert_dates(parser, colspec, data_dict, columns):
1126
1167
1127
1168
def _concat_date_cols (date_cols ):
1128
1169
if len (date_cols ) == 1 :
1129
- return date_cols [0 ]
1170
+ return np . array ([ str ( x ) for x in date_cols [0 ]], dtype = object )
1130
1171
1131
1172
# stripped = [map(str.strip, x) for x in date_cols]
1132
1173
rs = np .array ([' ' .join ([str (y ) for y in x ])
0 commit comments