From abbf818de345b1ad306e84ad35267b28a6c64f70 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 00:44:12 +0200 Subject: [PATCH 01/12] FIX: Fix is_potential_multi_index --- pandas/io/parsers.py | 11 +++++++---- pandas/tests/io/data/excel/df_empty.xlsx | Bin 0 -> 5595 bytes pandas/tests/io/data/excel/df_equals.xlsx | Bin 0 -> 5595 bytes pandas/tests/io/excel/test_readers.py | 5 +++++ 4 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 pandas/tests/io/data/excel/df_empty.xlsx create mode 100644 pandas/tests/io/data/excel/df_equals.xlsx diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2df81ba0aa51a..7a444976024d4 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1168,7 +1168,7 @@ def _is_index_col(col): return col is not None and col is not False -def _is_potential_multi_index(columns): +def _is_potential_multi_index(columns, index_col=None): """ Check whether or not the `columns` parameter could be converted into a MultiIndex. @@ -1182,10 +1182,13 @@ def _is_potential_multi_index(columns): ------- boolean : Whether or not columns could become a MultiIndex """ + if index_col is None: + index_col = [] + columns_to_check = [col for col in columns if col not in list(index_col)] return ( len(columns) and not isinstance(columns, MultiIndex) - and all(isinstance(c, tuple) for c in columns) + and all(isinstance(c, tuple) for c in columns_to_check) ) @@ -1570,7 +1573,7 @@ def _maybe_dedup_names(self, names): if self.mangle_dupe_cols: names = list(names) # so we can index counts = defaultdict(int) - is_potential_mi = _is_potential_multi_index(names) + is_potential_mi = _is_potential_multi_index(names, self.index_col) for i, col in enumerate(names): cur_count = counts[col] @@ -1591,7 +1594,7 @@ def _maybe_dedup_names(self, names): def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here - if _is_potential_multi_index(columns): + if _is_potential_multi_index(columns, self.index_col): columns = MultiIndex.from_tuples(columns, names=col_names) return columns diff --git a/pandas/tests/io/data/excel/df_empty.xlsx b/pandas/tests/io/data/excel/df_empty.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d65a92b10e2932a10438c1580972698bff213420 GIT binary patch literal 5595 zcmaJ_by(DGv!+2p8cC%Wkl2-4Km?>@>28o%8kUkqI;Esr2}$Yh6iJCix^xwkR6s)N z?D~G^$m?}JXRm92`^U~a*K^Nr?zv~w6fvD!si9pn0_1N~SEz#JN zN>1D>Kz2owC~f^Hj1G1S|Kk3UOBZiGp|Sj@1EgSi4VW~SYrHDtaP))JaG|ez0tEld zfwJNx`i8}}C7QTogsZW(=eM|!W zc??373jB(}?PNsIaD49-sDI1I>R$3q8baVBWzsaJ_1!@#cvBqHMe@kk^mOvpmUUNW z^Fy4mcCt{QwY#q}i~l2~-4~P${HBm+2D#@W>*Q*RxOici0-3u=5Z+;+p{e~>m`ITS zu!LEtxx$>?_{^N0dA%GRauQTkkyyRi^NZ$MXi)enCs-svj^~nz6%pL5&E(Sb@>~<4 zvfKm+tlt?O8z28U|I|-R%#hwpiTxyXX)2mxO?zTs&v1W6Jk4H%HQ7Xw?tc9ztSE;! zL;(!ap`=8G%;bJ{L+BtUYQ;@774}TW7dcEZD(0z(+%PjBer|9ky_e+^Z9zgStSYIr z?}V%Y{bQt81n;FTmeN+*G)M3Eh&fL}uzKcSZDT+dX>dR8J~$qamgu*MO-yGY6*~~= z9of4%Gr?v&nW^(E=R15U;B{7~KBX>;|K&#lidtxcS9Z*M=-9kqT%DrxM>-T#S7un- zJ4hh>(f?8?g2F#?mk^IkT}qPVc5s zkZu{CSTK+U3HcET@;}#nBwtBu%)g~!Z2sm1a2Flm;umIL{rYjL8H~EDhm?Vb;DKIQ zY^7*OutmvB99trG(wt5xJMN=VwN#DT&N>T8AwIBv#-$yPN#32p%79?g4dU_T zlDvec+VpA%_RL#-1sS)cyOEJVm;c{%X8xqCZ6-B5|z(I0n$1L->tC{p*hZBsqyApEg; z%)B<{IiaB>uH=B4^||K=Cm!Ku=Q$R}?>Cp$y2a{Yp$tv7Q*X;PpsvDeTt83S1h`HA*j%!@8N^EjZ%sB{^T3Mf5m% z44WE_-bz?h@;v{@a0pg0$0~xxstoXvSyn?(*5PwTH^- z^jLm_g^B79yNyi3@VA>u(%p;iYiXF6hhyhY?2Fr{b{{jpqw*G=Kqmq)8NA?sv zB2Yd*JI779(nhVicJt=6AMJJ=bunq3+~|OPIVn8SwqL-U7L*m~a@_PaYs%3vh1Rot z8oXGzZj;q7ATf1yB=0lE+2m5To~-T9Ccqb&;wkM3*X>iUx3hZ^C*drz9ayvFXumq% z_;yjoW1+!Ur7@y%txcSOBJj->Y(6up@gP^f&cD)?S!dTfqBPH@b#hef| z8P^Sd9{UvMVK~JY!f=PyEPw~Sy>2zFP$hP{B}IS&qwcYh>rFfQV_ubt8M9bVXSzK? z0`XwB&n0rUt8y8BS$;28l)99jD#O?%g7`N?n+$Z~FM+3`r=SagizP1ob$U)Ac&M+3Cy)OeYld zO3YYcFC40KsH%zP_AX?T6umgEgk~4P9~`0=c6`~MUq*Z$Ye`+(xQiQD{(v8lv-0ova___`L*_wuTk0iab$%YQo!u1CkT-&?Q_?eK<02k> zCdP|RR)7x9i{8f0-rn=F7wtE6?0=>+_?cJOU^qQb#H5^5o#M2Mxy?h3_nau~yM!zeRY2dtPl9`8mEG93Y@om38P;f3#$ z`)kQCIf&43NPG=uoC)=a?c&|AgGn+268jkOdZ})`JDJQEtI|l!+zl8~h#r*$Fc!0| zGsH&D#~;IOGDnLh84V%|q^O+$A;bmWEfbvJ$?RzkiQYex+&`<`I#d+Bq$2WXJyzT0 ztzUE-ZQP*$wBWMOS#jn$eQrWt{)>|ZGUJWTkeuxN*9J=QA5OM0bA?)Jy1UvsS^uKu z*aQ$kFOVe6GeEZ47%m!_%$gs(R{6}(W?X2($c~rt;#f{>x5a4-vKW%GrI0RBDc9?q0OmaBQK60gq z-r}xzqz&CJNz&fuS`M8lLwUwcpvF`+1}L#Dw)x7cV2&g z(Xg$K-fCY>M1XsAwG0_B57jwp!3m>O@Y|W z$bM-;oU(kZu%0&_cpX6(yE`9fbjI>m)@<0~tM8DNx0BBW-csN*s5=1b^P2Z2h#40S zK~zd-hgw5+)cHpC72w(EW0aZ|hM4tEch%b5datT2W{GZ^7WlpZ4ieoMbG`kXe^Mjf zeC;mZJ={ab)6e3cC72oJSca=YUT6q;-gs|nb$ic;0Cr=sLhb4HIqSiGQ-*9s)ylFT z1_~L~Z>K&}$a}N>wM!#u2Gylen7G+Mq3&*czaAhIrF7^j!xq6L9eY(I?p2*T4Va6v zi0MV_sn!xW&jcNhIu&X3bYe2Y*{tUyNbN_yZKh>K8b2Dr7{iEJ;SAq;%hq zf?u@W>234Q7mw+az}mBf?F`sp>x;O;qEXO~3ZBcitG!F;-$`D%qgJfCsX(XC<~<9G7)8x5R_b>$7U>*xAe z?n~VQ!Dbs!ZFP(x^e>(I1Ax%oAWR75{6m|HElADsFJDdx# z9BL;`?{@!z;lsG|CfkD)Ad$JmyNg)x@hybj{w-=YVQRKn7GuUAF7DzK+fT@5JHAvI z3i6Gbg}^5xaYGM6EX&ecSzrk=h!0=JrHZ*Hz5`y1ND^2OClJ3}otffDHMD*3k*8=4 zc5>*5_L%ZRSMLjD$?&Q3nc%{bM1y_hiTc!pr_!U*t&IBN^Rjv&`uVq5>Jxznx@8?y z4XMRMNnV(mFPdayX#2k~fBYuQI6!3XhYmU#=xdTE46&IAjKfWXh#fsFFkd~I7Wj$I zDUgNN=@$Am0=sEv?^}Ib{v5%0q~4y6UFsH1OC2GTOKgz@+Eid~+s}cycfnCz@6T!D zeYZ$|-0z*eeJ~?kjIHlpDkH3MvNuSPFYB*}<(utd~bOiZzEyp(n zJV%$8jQB4n8F+Q1(hYR17+vwHL5~x~;#ji~U{|}Kni8=eMhTeK_>&?^`g(_*q(CmN z#9Y{kN8`_!{%|y)ptR!J*O7;FwTrmT5aswCY^&6;jF9?1Mnd_XmuD_H*=|p~NA|2* zT7$)Edd+2Tew(C`&hbS+R6@0UI8HUv?8%23+)NKr2(sSp(z-D)cyJOH#O%#!2F;Vz z;`HEQC}gR-ZR!|t1mP`p+DA`q5ObfYle~{jRAE%H^=PjqDY%U5Fi-9^x9|E2KYwbD z>TIoo&f}bkGD(ZHO$T)*gD!We+FQ}8 zQtZo3#}fsdJKdGX&qZ6)8pL9+yr7E&HUu7p#siqdA7%p#-*NzU_{2|fP-(g&X?9|T z?3vTps5GHtP@w%@qg~$|q1I^swrfkZKOL{{QBaGe-v&S)_2-)DPw(qH0Mz^O+cc0Z z?Ek!fZ(jZkaD6UCtqOkIG*T7)4)CAF!JqEer8jDL{cS-=kp4Bw{^@#MeWA4ZZ(GLu zzyA8aO8rmg>k+?h>%r?jPPHC&!2Iw*DgvJ{5Aj* nkpCtV{)}?{)%tfHbCFs3f2v+h5gP|}6h873h)lKYFR%Xr|F=qU literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/df_equals.xlsx b/pandas/tests/io/data/excel/df_equals.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d65a92b10e2932a10438c1580972698bff213420 GIT binary patch literal 5595 zcmaJ_by(DGv!+2p8cC%Wkl2-4Km?>@>28o%8kUkqI;Esr2}$Yh6iJCix^xwkR6s)N z?D~G^$m?}JXRm92`^U~a*K^Nr?zv~w6fvD!si9pn0_1N~SEz#JN zN>1D>Kz2owC~f^Hj1G1S|Kk3UOBZiGp|Sj@1EgSi4VW~SYrHDtaP))JaG|ez0tEld zfwJNx`i8}}C7QTogsZW(=eM|!W zc??373jB(}?PNsIaD49-sDI1I>R$3q8baVBWzsaJ_1!@#cvBqHMe@kk^mOvpmUUNW z^Fy4mcCt{QwY#q}i~l2~-4~P${HBm+2D#@W>*Q*RxOici0-3u=5Z+;+p{e~>m`ITS zu!LEtxx$>?_{^N0dA%GRauQTkkyyRi^NZ$MXi)enCs-svj^~nz6%pL5&E(Sb@>~<4 zvfKm+tlt?O8z28U|I|-R%#hwpiTxyXX)2mxO?zTs&v1W6Jk4H%HQ7Xw?tc9ztSE;! zL;(!ap`=8G%;bJ{L+BtUYQ;@774}TW7dcEZD(0z(+%PjBer|9ky_e+^Z9zgStSYIr z?}V%Y{bQt81n;FTmeN+*G)M3Eh&fL}uzKcSZDT+dX>dR8J~$qamgu*MO-yGY6*~~= z9of4%Gr?v&nW^(E=R15U;B{7~KBX>;|K&#lidtxcS9Z*M=-9kqT%DrxM>-T#S7un- zJ4hh>(f?8?g2F#?mk^IkT}qPVc5s zkZu{CSTK+U3HcET@;}#nBwtBu%)g~!Z2sm1a2Flm;umIL{rYjL8H~EDhm?Vb;DKIQ zY^7*OutmvB99trG(wt5xJMN=VwN#DT&N>T8AwIBv#-$yPN#32p%79?g4dU_T zlDvec+VpA%_RL#-1sS)cyOEJVm;c{%X8xqCZ6-B5|z(I0n$1L->tC{p*hZBsqyApEg; z%)B<{IiaB>uH=B4^||K=Cm!Ku=Q$R}?>Cp$y2a{Yp$tv7Q*X;PpsvDeTt83S1h`HA*j%!@8N^EjZ%sB{^T3Mf5m% z44WE_-bz?h@;v{@a0pg0$0~xxstoXvSyn?(*5PwTH^- z^jLm_g^B79yNyi3@VA>u(%p;iYiXF6hhyhY?2Fr{b{{jpqw*G=Kqmq)8NA?sv zB2Yd*JI779(nhVicJt=6AMJJ=bunq3+~|OPIVn8SwqL-U7L*m~a@_PaYs%3vh1Rot z8oXGzZj;q7ATf1yB=0lE+2m5To~-T9Ccqb&;wkM3*X>iUx3hZ^C*drz9ayvFXumq% z_;yjoW1+!Ur7@y%txcSOBJj->Y(6up@gP^f&cD)?S!dTfqBPH@b#hef| z8P^Sd9{UvMVK~JY!f=PyEPw~Sy>2zFP$hP{B}IS&qwcYh>rFfQV_ubt8M9bVXSzK? z0`XwB&n0rUt8y8BS$;28l)99jD#O?%g7`N?n+$Z~FM+3`r=SagizP1ob$U)Ac&M+3Cy)OeYld zO3YYcFC40KsH%zP_AX?T6umgEgk~4P9~`0=c6`~MUq*Z$Ye`+(xQiQD{(v8lv-0ova___`L*_wuTk0iab$%YQo!u1CkT-&?Q_?eK<02k> zCdP|RR)7x9i{8f0-rn=F7wtE6?0=>+_?cJOU^qQb#H5^5o#M2Mxy?h3_nau~yM!zeRY2dtPl9`8mEG93Y@om38P;f3#$ z`)kQCIf&43NPG=uoC)=a?c&|AgGn+268jkOdZ})`JDJQEtI|l!+zl8~h#r*$Fc!0| zGsH&D#~;IOGDnLh84V%|q^O+$A;bmWEfbvJ$?RzkiQYex+&`<`I#d+Bq$2WXJyzT0 ztzUE-ZQP*$wBWMOS#jn$eQrWt{)>|ZGUJWTkeuxN*9J=QA5OM0bA?)Jy1UvsS^uKu z*aQ$kFOVe6GeEZ47%m!_%$gs(R{6}(W?X2($c~rt;#f{>x5a4-vKW%GrI0RBDc9?q0OmaBQK60gq z-r}xzqz&CJNz&fuS`M8lLwUwcpvF`+1}L#Dw)x7cV2&g z(Xg$K-fCY>M1XsAwG0_B57jwp!3m>O@Y|W z$bM-;oU(kZu%0&_cpX6(yE`9fbjI>m)@<0~tM8DNx0BBW-csN*s5=1b^P2Z2h#40S zK~zd-hgw5+)cHpC72w(EW0aZ|hM4tEch%b5datT2W{GZ^7WlpZ4ieoMbG`kXe^Mjf zeC;mZJ={ab)6e3cC72oJSca=YUT6q;-gs|nb$ic;0Cr=sLhb4HIqSiGQ-*9s)ylFT z1_~L~Z>K&}$a}N>wM!#u2Gylen7G+Mq3&*czaAhIrF7^j!xq6L9eY(I?p2*T4Va6v zi0MV_sn!xW&jcNhIu&X3bYe2Y*{tUyNbN_yZKh>K8b2Dr7{iEJ;SAq;%hq zf?u@W>234Q7mw+az}mBf?F`sp>x;O;qEXO~3ZBcitG!F;-$`D%qgJfCsX(XC<~<9G7)8x5R_b>$7U>*xAe z?n~VQ!Dbs!ZFP(x^e>(I1Ax%oAWR75{6m|HElADsFJDdx# z9BL;`?{@!z;lsG|CfkD)Ad$JmyNg)x@hybj{w-=YVQRKn7GuUAF7DzK+fT@5JHAvI z3i6Gbg}^5xaYGM6EX&ecSzrk=h!0=JrHZ*Hz5`y1ND^2OClJ3}otffDHMD*3k*8=4 zc5>*5_L%ZRSMLjD$?&Q3nc%{bM1y_hiTc!pr_!U*t&IBN^Rjv&`uVq5>Jxznx@8?y z4XMRMNnV(mFPdayX#2k~fBYuQI6!3XhYmU#=xdTE46&IAjKfWXh#fsFFkd~I7Wj$I zDUgNN=@$Am0=sEv?^}Ib{v5%0q~4y6UFsH1OC2GTOKgz@+Eid~+s}cycfnCz@6T!D zeYZ$|-0z*eeJ~?kjIHlpDkH3MvNuSPFYB*}<(utd~bOiZzEyp(n zJV%$8jQB4n8F+Q1(hYR17+vwHL5~x~;#ji~U{|}Kni8=eMhTeK_>&?^`g(_*q(CmN z#9Y{kN8`_!{%|y)ptR!J*O7;FwTrmT5aswCY^&6;jF9?1Mnd_XmuD_H*=|p~NA|2* zT7$)Edd+2Tew(C`&hbS+R6@0UI8HUv?8%23+)NKr2(sSp(z-D)cyJOH#O%#!2F;Vz z;`HEQC}gR-ZR!|t1mP`p+DA`q5ObfYle~{jRAE%H^=PjqDY%U5Fi-9^x9|E2KYwbD z>TIoo&f}bkGD(ZHO$T)*gD!We+FQ}8 zQtZo3#}fsdJKdGX&qZ6)8pL9+yr7E&HUu7p#siqdA7%p#-*NzU_{2|fP-(g&X?9|T z?3vTps5GHtP@w%@qg~$|q1I^swrfkZKOL{{QBaGe-v&S)_2-)DPw(qH0Mz^O+cc0Z z?Ek!fZ(jZkaD6UCtqOkIG*T7)4)CAF!JqEer8jDL{cS-=kp4Bw{^@#MeWA4ZZ(GLu zzyA8aO8rmg>k+?h>%r?jPPHC&!2Iw*DgvJ{5Aj* nkpCtV{)}?{)%tfHbCFs3f2v+h5gP|}6h873h)lKYFR%Xr|F=qU literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 99447c03e89af..249764caf5dfb 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1084,3 +1084,8 @@ def test_excel_high_surrogate(self, engine): # should not produce a segmentation violation actual = pd.read_excel("high_surrogate.xlsx") tm.assert_frame_equal(expected, actual) + + @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) + def test_header_empty_cells(self, engine, filename): + pd.read_excel(filename, sheet_name='Sheet1', index_col=0, header=[0, 1]) + From 463ef45873b019d41b05b9e4edb801b662186994 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 00:47:23 +0200 Subject: [PATCH 02/12] Syntax --- pandas/io/parsers.py | 2 +- pandas/tests/io/excel/test_readers.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7a444976024d4..028e0529718b9 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1168,7 +1168,7 @@ def _is_index_col(col): return col is not None and col is not False -def _is_potential_multi_index(columns, index_col=None): +def _is_potential_multi_index(columns, index_col): """ Check whether or not the `columns` parameter could be converted into a MultiIndex. diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 249764caf5dfb..df19df5035483 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1087,5 +1087,4 @@ def test_excel_high_surrogate(self, engine): @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_empty_cells(self, engine, filename): - pd.read_excel(filename, sheet_name='Sheet1', index_col=0, header=[0, 1]) - + pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1]) From 34187083ea231a14e2210382e58f4619e5b74406 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 01:58:24 +0200 Subject: [PATCH 03/12] Fix handling bool and int values --- pandas/io/parsers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 028e0529718b9..7d7672766a946 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1182,9 +1182,12 @@ def _is_potential_multi_index(columns, index_col): ------- boolean : Whether or not columns could become a MultiIndex """ - if index_col is None: + if index_col is None or isinstance(index_col, bool): index_col = [] + elif isinstance(index_col, int): + index_col = [index_col] columns_to_check = [col for col in columns if col not in list(index_col)] + return ( len(columns) and not isinstance(columns, MultiIndex) From be99190c38a6c081ed18958c2f4679d2c0cd7c6c Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 03:36:44 +0200 Subject: [PATCH 04/12] Fix --- pandas/io/parsers.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7d7672766a946..9c36697d7ea88 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1168,7 +1168,7 @@ def _is_index_col(col): return col is not None and col is not False -def _is_potential_multi_index(columns, index_col): +def _is_potential_multi_index(columns, index_col=None): """ Check whether or not the `columns` parameter could be converted into a MultiIndex. @@ -1177,17 +1177,22 @@ def _is_potential_multi_index(columns, index_col): ---------- columns : array-like Object which may or may not be convertible into a MultiIndex + index_col : None, bool or list, optional + Column or columns to use as the (possibly hierarchical) index Returns ------- boolean : Whether or not columns could become a MultiIndex """ + print(columns, index_col) if index_col is None or isinstance(index_col, bool): index_col = [] - elif isinstance(index_col, int): - index_col = [index_col] columns_to_check = [col for col in columns if col not in list(index_col)] - + print(( + len(columns) + and not isinstance(columns, MultiIndex) + and all(isinstance(c, tuple) for c in columns_to_check) + )) return ( len(columns) and not isinstance(columns, MultiIndex) @@ -1597,7 +1602,7 @@ def _maybe_dedup_names(self, names): def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here - if _is_potential_multi_index(columns, self.index_col): + if _is_potential_multi_index(columns): columns = MultiIndex.from_tuples(columns, names=col_names) return columns From 4115fd332771c6debf5573beafb6a58db6bb6916 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Tue, 5 May 2020 12:11:17 +0200 Subject: [PATCH 05/12] Fix --- pandas/io/parsers.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9c36697d7ea88..d0779f251983d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1184,19 +1184,13 @@ def _is_potential_multi_index(columns, index_col=None): ------- boolean : Whether or not columns could become a MultiIndex """ - print(columns, index_col) if index_col is None or isinstance(index_col, bool): index_col = [] - columns_to_check = [col for col in columns if col not in list(index_col)] - print(( - len(columns) - and not isinstance(columns, MultiIndex) - and all(isinstance(c, tuple) for c in columns_to_check) - )) + return ( len(columns) and not isinstance(columns, MultiIndex) - and all(isinstance(c, tuple) for c in columns_to_check) + and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) ) From c54fed3a85d5cdcf09aee7e42cce1da2df5000cc Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 16:54:28 +0200 Subject: [PATCH 06/12] Add types and csv test --- pandas/io/parsers.py | 8 +++++--- pandas/tests/io/parser/test_index_col.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d0779f251983d..a70ebd62d99bf 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -10,7 +10,7 @@ import re import sys from textwrap import fill -from typing import Any, Dict, Iterable, List, Set +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set import warnings import numpy as np @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -1168,7 +1168,9 @@ def _is_index_col(col): return col is not None and col is not False -def _is_potential_multi_index(columns, index_col=None): +def _is_potential_multi_index( + columns, index_col: Optional[Union[bool, Sequence[int]]] = None +): """ Check whether or not the `columns` parameter could be converted into a MultiIndex. diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index f67a658cadfa2..998d1dd84329c 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -184,3 +184,17 @@ def test_no_multi_index_level_names_empty(all_parsers): expected.to_csv(path) result = parser.read_csv(path, index_col=[0, 1, 2]) tm.assert_frame_equal(result, expected) + + +def test_header_empty_cells(all_parsers): + parser = all_parsers + data = """ +I11,A,A +I12,B,B +I2,1,3 +""" + midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"]) + idx = Index(["I2"]) + expected = DataFrame([[1, 3]], index=idx, columns=midx) + result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1]) + tm.assert_frame_equal(result, expected) From 02706f01d5289bdeb05139aedd4b7125b79b4be9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 18:41:37 +0200 Subject: [PATCH 07/12] Change test name --- pandas/tests/io/excel/test_readers.py | 2 +- pandas/tests/io/parser/test_index_col.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index df19df5035483..bed124febb36b 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1086,5 +1086,5 @@ def test_excel_high_surrogate(self, engine): tm.assert_frame_equal(expected, actual) @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) - def test_header_empty_cells(self, engine, filename): + def test_header_with_index_col(self, engine, filename): pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1]) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 998d1dd84329c..989f87e0ac867 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -186,7 +186,7 @@ def test_no_multi_index_level_names_empty(all_parsers): tm.assert_frame_equal(result, expected) -def test_header_empty_cells(all_parsers): +def test_header_with_index_col(all_parsers): parser = all_parsers data = """ I11,A,A @@ -198,3 +198,6 @@ def test_header_empty_cells(all_parsers): expected = DataFrame([[1, 3]], index=idx, columns=midx) result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1]) tm.assert_frame_equal(result, expected) + + result = parser.read_csv(StringIO(data), index_col="I11", header=0) + From be6981ba798ab60d40276d2ddbc816ca6cd61298 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Sun, 10 May 2020 18:44:55 +0200 Subject: [PATCH 08/12] Run black --- pandas/tests/io/parser/test_index_col.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 989f87e0ac867..f6ffc9547867b 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -200,4 +200,3 @@ def test_header_with_index_col(all_parsers): tm.assert_frame_equal(result, expected) result = parser.read_csv(StringIO(data), index_col="I11", header=0) - From f1381dc2020b25a172ac79478016180934f73a19 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Fri, 15 May 2020 13:18:01 +0200 Subject: [PATCH 09/12] Add assert in test --- pandas/tests/io/parser/test_index_col.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index f6ffc9547867b..f06aed79905e7 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -196,7 +196,13 @@ def test_header_with_index_col(all_parsers): midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"]) idx = Index(["I2"]) expected = DataFrame([[1, 3]], index=idx, columns=midx) + result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1]) tm.assert_frame_equal(result, expected) + col_idx = Index(["A", "A.1"]) + idx = Index(["I12", "I2"], name="I11") + expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, columns=col_idx) + result = parser.read_csv(StringIO(data), index_col="I11", header=0) + tm.assert_frame_equal(result, expected) From ccedc7f4fcd18480a526cede5a52f0672660fa08 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Thu, 28 May 2020 16:45:22 +0200 Subject: [PATCH 10/12] Link issue to tests --- pandas/tests/io/excel/test_readers.py | 1 + pandas/tests/io/parser/test_index_col.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index bed124febb36b..96ca66c0ba2ca 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1087,4 +1087,5 @@ def test_excel_high_surrogate(self, engine): @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_with_index_col(self, engine, filename): + # GH 33476 pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1]) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index f06aed79905e7..9f425168540ba 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -187,6 +187,7 @@ def test_no_multi_index_level_names_empty(all_parsers): def test_header_with_index_col(all_parsers): + # GH 33476 parser = all_parsers data = """ I11,A,A From d4036359b648d586f200fd672c4462424e83a39f Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 01:34:07 +0200 Subject: [PATCH 11/12] Add assert in test --- pandas/tests/io/excel/test_readers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 96ca66c0ba2ca..60e530448d01c 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1088,4 +1088,8 @@ def test_excel_high_surrogate(self, engine): @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) def test_header_with_index_col(self, engine, filename): # GH 33476 - pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1]) + idx = pd.Index(["Z"], name="I2") + cols = pd.MultiIndex.from_tuples([("A","B"),("A","B.1")], names=["I11","I12"]) + expected = pd.DataFrame([[1,3]], index=idx, columns=cols, dtype="int64") + result = pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1]) + tm.assert_frame_equal(expected, result) From 41d88e8422242d7a0900df4b69373131a99ebdb9 Mon Sep 17 00:00:00 2001 From: mproszewska Date: Mon, 1 Jun 2020 02:42:01 +0200 Subject: [PATCH 12/12] Run black --- pandas/tests/io/excel/test_readers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 60e530448d01c..5401c4bea79f4 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1089,7 +1089,11 @@ def test_excel_high_surrogate(self, engine): def test_header_with_index_col(self, engine, filename): # GH 33476 idx = pd.Index(["Z"], name="I2") - cols = pd.MultiIndex.from_tuples([("A","B"),("A","B.1")], names=["I11","I12"]) - expected = pd.DataFrame([[1,3]], index=idx, columns=cols, dtype="int64") - result = pd.read_excel(filename, sheet_name="Sheet1", index_col=0, header=[0, 1]) + cols = pd.MultiIndex.from_tuples( + [("A", "B"), ("A", "B.1")], names=["I11", "I12"] + ) + expected = pd.DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64") + result = pd.read_excel( + filename, sheet_name="Sheet1", index_col=0, header=[0, 1] + ) tm.assert_frame_equal(expected, result)