From 2bcf35b80ba765e8aee002bb0a19acd2799a5cc5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Jan 2021 13:54:17 -0500 Subject: [PATCH 1/9] BUG: read_excel with openpyxl and missing dimension --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_openpyxl.py | 6 ++++++ pandas/tests/io/data/excel/no_dimension.xlsx | Bin 0 -> 4875 bytes pandas/tests/io/excel/test_openpyxl.py | 21 +++++++++++++++++++ 4 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/excel/no_dimension.xlsx diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index baa0cc2ac9e18..d1418492a44c8 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -26,7 +26,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing dimension information (:issue:`38956`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 71e1bf6b43ad5..56778426f1eef 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -527,4 +527,10 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) + # openpyxl may not have the correct padding if the dimension tag is + # not specified or is incorrect + max_width = max(len(row) for row in data) + if min(len(row) for row in data) < max_width: + data = [row + (max_width - len(row)) * [""] for row in data] + return data diff --git a/pandas/tests/io/data/excel/no_dimension.xlsx b/pandas/tests/io/data/excel/no_dimension.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..9274896689a72c83138e7ba580c12a66ac59bcf6 GIT binary patch literal 4875 
zcmZ`-bzD?y*PQ{$0qKxZ8bn$V>5?8|7+Shx=t~VDjkJi;A)$bT2qGaVAPk*@j5J8d z+#v-);2ZRNulMracg}Cl`D^X-oV}mD*4mGb2JRJV0DzDX(EC^hVqYHqZUF!QNa6qh zAnen^%g)tXP~hTTnL4bFD@-1P364vF`_Gt{ zw_=W?d~bBgurP)HkVjci^gV_4u#%pB0fSbvtDxU^xE3MLlxfXEtY>Zmv^F85?GLHO z7_VGesy9cv*NLP>3ZgQ+ku6ddT>XwL% z`hK0n>=|23;t<&#o-H{N)ugUDgTflux3{7qUskau-yL)?r$R^e=X`%&?&8PZ^z?DpN?wZ4M;^4SzmFv4-LcEX;x|a7rIZ5`*bAANY zhE@aii&EAsyIaqhp#xk%X}+kM^*C{Atz4$C&9}67R@ydt(G{}NtN5Gd(fs-n7*P3^ zPdK#%F`tN9!k~gq`)9hxid$BtvpHV`$w^kZj|*ZIroBXTLdNjqD~Qkxp=Lvqa}SlY zIAY|<>tB2HB-73Y=%-HjoNIs~w)hgK;=IBbOQnI3_xZ}>wql9+qY_M{o!h=5M?wtR z_#3dSv4vEyh<0dd1=D7Dh?>fL;dA#DAOam_0RNMt-Zw1>v3bTU+Gg0bN%zk^LUv;U zzJhI(4LSgT^w%Eo_6cyc^S;<2Xd_eiY$5vK`Rdm{S{24Pz}y9!62fS^@+oN6ZFC0^ z*C1HFemwf@_)`YCW476tz6STM^-1mU_p1ab@vB2q3E`%+d-J@q?QU+SoWH{kb)-28RIx4o)k0g3uw6_|QVl@5pr=TPkb%M{he_p2#+K{MwV1+`y z8HDjsKvi$-*%2;JSBIW*(8Z>faA?0!Bj$LT2~5Jgqe?cH@RZd{PratRAB0G!u%Pq3 zN~MmvClxI~j>kF1puv)j@3p{No~G;BShVL989S|`Uz@s^tTG$ekkmcs;Of0Ho*jLtyCspVU$S8vtt z-a*N~o9>sFY>A-0l7hm(jrjvwi*p$Gd>J_S*^f;prrsj_ACRXg2j z*|=cuYRX*g05?+^bKI0@Iqr-Msk-M%dm!eIZ_glKDz7OPH7!=@Wjk#Lb|nx9OLlFw z)Z+_BWXKQPuqUrg&gk1=4d&%8ihzjTEUPTu*tRkK@_ z=b#=6b2!d}J&~(GN+dr_c9b3oFO^`&%d^C&9Ii218~=LdZZBP`V(6+Q=$2iy=(%{m}ms^tu^_^ zMy2oBox1O2;1V<+@9g$o8=Vw$_@Oa+6zwJVN;kmiJ>AErmKHifc?I>$*V_?L{QHNz zB8_2P6DiSK_3bxA$lo2pygmr^ed9$D8Q^xMq~u)F0FD#seNetC*+sCjt_Pem+mM;|QkpbIe|Z zF6``a)L5jXMqw!GSPHDhE8;8j&dP4owDwO{I3;$wg;q7y`o`cko6zEdb|QMX?mHLB zixa6Vx-w0bboK-RO?`cm367XUdZMAv-ryCM9*{CUw0gfhQEwHkV7(nK*SD5M|0aje zbN@AS{dA)k8(;V)wHV4@ik__|tw(r!MlbR_`f*VNv#pXdT)7;V1SsC$1pOfj^5{wSmxn6dI=Z!v3P-Kf+ z|0r%?v!02|@a;NaAJU2k%^N+Krb2G?uf0~0-irOIl9Nw+7jENOo7bSG^0`7Kc!~m< z+^`-CGPxT8P0LR7iTI@9sjwgf<3hA%a+!g&sy9QlqY8(^89NQc(eihiww`p4lwt_u zD2@_KrMtTZ26Gk{wtn2YMW9>T^=d{spmWFgG~($zQACN%s@iOU3xJYU(}j6r`EI&6 zDMu-PFuosnV`@dMbgXA69VLa5_k=g;n$51QzG-a_9`1BY>t~K<4^<6we~sghL*0$j zur}P8u5j%>&rlvS`{*)Fa%l0y#Q6M&k6JD`D&mg}*lIC6uoxZ{(<-Z3J0z_8ZLxo9 zMM^nAA}3{PcymS#?fth?Rm$d?Pp+JFC4O+{(fS1z|O@c$w$XjxI}$!+uxmyRryi 
zxeXN-J^g;?IKMgM!hdRSe-9kPN+~B+RvCUt>MslWck}hjjy?vvd=Mgu#+=e9A9kw| z*d&OzB*<5v9szrFmqxn8Qm5XYhX^p565DN!JQ+dmwW=O$-cH<%?NTJcFE6|j)zR%s z*b;X7CwlRk!C>Lsf+J`0>+6)S-cwCKq*D_#sofJd7O?J2ku)prQ`ag(_BBPIbp`u( z)y8vxgEV@Trnq%(d^*kEo#(G@kfc{Fis1P|y`)5gUM!DAMtJ15o zY|PW_i0X)Zh?pPIq=h{TCJ^LKgsRjD+4{k;xBD*her5ICY|y1l$ODkKtFRf6#F|j7 zKfhp8Z$~RHJ6k;;FDEyLpB^J|xP!P;hy;J3SzFf&sYOBIBnjOn+6N+N&8)xYM!p%D4{!@Kx zP;T$*(X8yNMQg?kDTtLEuYHJht0RlZ@iR%KM=j>iNp+R$gwQ9-lpqZq_NcG1C)&Qz z5u%tlQnFUw!Bu>lR@Mfa6qZCw^&a`#qY+m`BSmq^LQ|u9R}$WPMB6H8_o$AeVfG z7DLZ3bAU|>2sp0$I`q^wn>*CDRD?6r^Ep3dr|jkdbUO<`GmFwErQR!s{Q>nfnI!h| z0mR#P>-YE|oN(^d1}3+H?0S%!B%JxPrlBmZU#B4SMZ(`axSPb z&07*Bd2RBzkt#|{6c%wh6qU?nm5VNrDB@A3)n+JN$*;SbP)8@|Gmr@(_8algl}ITH zbnoVI&Ej%S)!IDaK7#GkZ7GvRIp@a#Z@#f%=M|M>-fm|s4sHw!0JE?ddP6mrh=^<` zxpv|>%4Ct_ha~4s^9U<2Xa2TA*1krgzd9xB54;&ZYazeeG4GF^WK2^0@vf+M! zCoL5jdSipQovNHBB;F|2z?DX62+4c;JBb-Ndm4?m6jQU~;b06qi zN_yIW|IADBw^cD?Y)%aE001g%UQ89;-F)oad@P^=9(LYl7pZB1N@`>G>f8w*TQ!ew z|3h;1nXr$9PkBT|b~uF-ufLfcJA)7LLiLGwV`KZ?U|Q3)eZ^3vT*eW5W9vDZ>lqvc zAok)auid%^4Z%KF2?VVvNT$Oa?G1^stft?)PZ|ezzU5ev*2Bx<@%5&CK+Oux>%E<$ za2jQT5mwU25t_A(Jm1TS`q22veochjeFs&7>Z_5asNQgbTeU`Zx#9$g7+}>~aj*0c z@uFB)co2W3?1_2D=BBD#dB&|gKX!9|AH^9QYnJV%TekY$D+%_0*09?sArk>`pIipJ6Yvi>e-C|&~}9x z7jE|SUjB9ssZhs{0!K!m*vu>zEL!FPcF1IY|lxBvhE literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..b4442ceec8df5 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -116,3 +116,24 @@ def test_to_excel_with_openpyxl_engine(ext): ).highlight_max() styled.to_excel(filename, engine="openpyxl") + + +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +def test_read_with_missing_dimension(datapath, ext, header, 
expected_data): + path = datapath("io", "data", "excel", f"no_dimension{ext}") + result = pd.read_excel(path, header=header) + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) From ea18d610bbb62bb5c097b022472680600da6bffd Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 30 Jan 2021 15:49:40 -0500 Subject: [PATCH 2/9] fixups --- pandas/io/excel/_openpyxl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 56778426f1eef..c935eda7e4172 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -529,8 +529,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: # openpyxl may not have the correct padding if the dimension tag is # not specified or is incorrect - max_width = max(len(row) for row in data) - if min(len(row) for row in data) < max_width: - data = [row + (max_width - len(row)) * [""] for row in data] + if len(data) > 0: + max_width = max(len(data_row) for data_row in data) + if min(len(data_row) for data_row in data) < max_width: + data = [ + data_row + (max_width - len(data_row)) * [""] for data_row in data + ] return data From d5215f776393292d794e7e34d8aad779494b1604 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 31 Jan 2021 09:27:12 -0500 Subject: [PATCH 3/9] Added fixes for incorrect dimension information --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_openpyxl.py | 8 ++++++-- pandas/tests/io/data/excel/dimension_large.xlsx | Bin 0 -> 4894 bytes ...{no_dimension.xlsx => dimension_missing.xlsx} | Bin pandas/tests/io/data/excel/dimension_small.xlsx | Bin 0 -> 4894 bytes pandas/tests/io/excel/test_openpyxl.py | 8 ++++++-- 6 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/data/excel/dimension_large.xlsx rename pandas/tests/io/data/excel/{no_dimension.xlsx => dimension_missing.xlsx} (100%) create mode 100644 
pandas/tests/io/data/excel/dimension_small.xlsx diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index d1418492a44c8..50abb0e872331 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -26,7 +26,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing dimension information (:issue:`38956`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`) - .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index c935eda7e4172..3f7d6382314bb 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -523,12 +523,16 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + # GH 39001 + # Reading of excel file depends on dimension data being correct but + # writers sometimes omit or get it wrong + sheet.reset_dimensions() + data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) - # openpyxl may not have the correct padding if the dimension tag is - # not specified or is incorrect + # With dimension reset, openpyxl no longer pads rows if len(data) > 0: max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6a8daf607f299df20579bfc31ff97bea919ae1bd GIT binary patch literal 4894 zcmZ`-cRXBO*BxZE8NJu&J!**Pq7yTW76f7Ro*{_QA|yi)#1kc>OM)OoNz~|%5xoSX 
zMvEFXO0;hz-}mN;?|JY2-FyF9`<}D**=w(J4D<*t(Em@b@zS06-oe z007~>HV6lIA2HGMcX48`E`bE)pd*>x<^<(=L}Lq@qR>dX~C-I zWB++;e9i0qvt~K*SM~)o#iZ(}puOrwCRxmoT6Z!3LnMS);In1f4C-$B2jlXXl#llm zsvjfLm1(|4l$`op|3Iwy^t3KffBBR1aAQBevogTo4|oU(pj_|lZe6>C?P=LBYP@qkIn*O=5qO?tOz+K#Q~IcHN; z4lf0}!WH2wZQQ2M>vWLS=Nfqr+w&GH=&co;(!extcX`nW_ zh{(gsV5{d`k7Ii%qy^Sgz}j)GQ>Hm(?v)~u5wA-)pI+;BvZh&&_=G-uV}`YfP2LI$ zb%u4unPXl8T`LA&i;ekrB*@LeB;U>ue^FrE7YtQ#&buk3*7JFbXU0DCZ_7c>DHm?* ziW{QutNUT?;O;LLv8bGmGrF;=hk^5{>*-&evPpxZnTnUT7ef#uPKI|hK8I((@meJJySo6&DCstRG!5k=pcn*4Fld}$NKjj|&7VXsicx|W_d8*^LU->k*3IRJXL;4L7gk7Y*HW1T2~nUTqAqf zGxH@}p&T&)0A#<`h>!1mcL$&I6|#P7(({ElWBs(ztp*!Tm&8{z2KUla{G?3Wg!I$= zzJ%euIaRG1e57>Zev`POyYS_k=aBjKgYw?PIEt#ftmtAU;QL!bTmi&dT1uBv__=r$ zD878>3Q>Nn;?ioQ4*Z+5s5IQEzgA^AdiD)IoD580#wk)EsvMA^OWGALN6owVikBvl z)1y#w-&jyW_b;HtN^EfGqJ+9^41R;1#2_h;U0Y>B|D!PBhCDvm7b-JLqUx@BLG7%1 z>e&oHYc)z>>WpQ^b>9a92KzJghHA6p-&iHqR=y_6J_1YJ5N2QdnZSzdtk<~ST3z*TB_bBs<1x*i#HUBsil>a zMymN+)lh1abON<@YrwXT@|#P^f~oz3XkzfAr94o)7?7o_Vc>Ihx2CQZLb#Fq;$Bl8 zRQ#oxU+!Usht>wmz`eZy_{_bwH5*gp&QeMmsgRyUTXW(NO&?I{vsGZWk1nET^I=>v z%o)8Dm|4_f7$(D}%E?oGZDbz}tFD?;-Ve3KV#J62Y8It!P(Ag~ zqso*f2<@n4C1psZe^Ui~0-=%5H!)oIcNnON5xaX?9~m~64%lot8w+<->|4|6)79;I zR@jE#6@NBu8(6)lIDs+xpeI7JLvGRGLNV*PTjiM6xrbHy0sLJOYdXp>7ktPf-X#FQ z2iH6p0j&Qgn?BAC4!%BOKR*)Z)nQ^x0w4&grN49j+zS8!@GdRkl=kPV-gMk^T$VC; z!;Cm^Y4CGm5LBpE8|_%D!c24*i+`l~BuOKJ%RKSji=Y$of+utsfpEJ7fKF>_|CdxL zTBlj)N5tZK)PV73%g2kV z6A!Zeh(5{YTU)oMghy1ZWpFJwe?;{t_j*Kt4j>rS_Zf=mQ7zJ&!i)pbXsOd;fre~! 
z*l>Q1t zCd^nIF4*XIV|Y(d-1=am69a&0*S1BAI}YyFx3-@W91NCiD1s8as9RO?1i+T7mivJt zt_yXl&>Zh~#@0s??vVi%d{0=gT-72)b<|@NukF4diHnf1chxc2)ol~#nJ*B}#z+h_ z5mKV(vkn0#)xhwL4^+!AVP;oTZZ?l8FhiKrvxIC=!>ki6ctNAR2C-P!vY2kRS4W08 z%0B`tGn(Bc7ND9n>k|pKXxC*`$d5Fs>C>EdFes0l^pItZP!Y*xq?ron+?*Ci_+|xx zbv!7vgI8#3Gp`i;jW>T{E;@2xBtmATHHyW%ro@J{h%lQwoZ8rNYq^hd(Mz}9Zgo($ zJCL*+e#)2*@jPzF#IltoaJNJG%rs*xspcSv6V%x9Jf9bOA}{&&YtiIq?B8|xt3z6k zeG!A#Af8w~&mkR8Y_8He&&X5RRUU>4Pr?F$Yi`RmmC35eFmj|je=S^i*&^%Za59pQ zHkf{bqbi&#EFW&|7bkIbSw8%oScrzJmMwgd86um~?ewe=#Fp$0t-bCniI!H*IyPmD z|B=M+WA%+Vp48T2HES6Y8DYv1h3#{iEfJh_l@~O@jxHiB=PhAV86}}3`}Zr|MZKrX zOWhXs+GiUtOE!;W0nh*qRlqfxD?x%uhNv5JW0FY<*51e5xv!W=6jH)0)~`K~si3ZA zE2&t|Vs($?8?o}6gx%I5<;U>BMrNFKBJx_Ty0SD1K35KQv|QEChD^vFbHBvawIA%9 z223TFHGW_fdcI*Os$4QLy{`FSS%@f$C8@(&-?P4vod)gi zD_3JzP_1fU!or+OkxgxW#PttOzO8>@GVRWpnsMfRT6&rKO&!f>7=wR$wg+0caV{EBiIw9 z`e9G)W`xBgiFQ2>7Zewix!6XzJKsLcgvt1m4BI%eZv=w8NXCl|Le`FXcDCQAY81bn znhe7KlL=M8$kh@Y1M)auinHwJ;_BmUi*T?v@C&EjCfnOdkJVgy1( z6pg=3>_F)qrTu9y$aBA%Gskz%qY=9>y9Q@=$sf<<)uZaIjfZu$ zE)yF+)1(IJ8E{9gqK=?`j~+^GM3Yf82zD791hm2mfF*$2!RVDw#a+m2M$0NZ9glgs z&S$Gw?Uf)`-6FxG9Nrln*6BMWaflTGPck({xtv^Hr`(rQX@ycVU^O6hQhAE{rOXQQ zmy^`sehlhpB4p)=|G;CbVoj4Q(ls+0Pv{+-TTn`cZL^Uj zFSz=_eSLOzGauudvm_*NYQC*#o&x2O!F}*q%{24gU61JJio8mTcT%aahirTE6pdf$ zOcp*X2jB?>nfoPDxtf_9uF{7ryWt(5&M1oBQPbK+orLDO=z8{QGJ|V63H@?Tkig~W zIG6Ft$&es?pA&8PES&!~P*EWw^c22D_Gd<#3~D-8aG40iWrgye8L{>DK2Jz`JoLQc z+BS}8n|9M=L7YjmABvLFNyj-^YJt6`78GYKrn*1Yzk2lT)xF5bIM4}nuf1i2=Hy$q zg0jC!qn*h!!?Gc_WfYR#U_7)Ck~y~BdHg8%Ljq*1l8QQ6LC2-!K*S9&i|#Xp2XqCU zRiAq1bLmnX6ky8w7R^(i#=_OGL-(|7RlP}#nMzc)wIa>!Pt#Sr!4|$r8&<8o&!qf$a&PYmNO*GX>)QfjhZ_klgW^OkW(gDe{9D&ky} z)Go;Gy(;8*zighabZ6@zoKBOdkCQCn;mvXufx|))%sSyy{~BU`%=Z#8-GvCt@s4mJ zk@8y(>9Rzz8+av^vWVmm*<6%6GU!UN@{x7Z%8Is1VTwqGKezQ2Uo|Yg9s6cYI&FMc zHE92~Eder4hmlPdnmq8z)~o%BO^o@#soREul!G9M^3^TD2m>!R?=Nm)S1FJp zghU^&ZKcZR7(G#+317Q>Vog4@B<_cRiU%r$ z|M~B~IA-84$9%ypUmV!QcKLf?&q#kp|1Zt+Vw{W3=69TQoVWkC)^jn!MLGW+AsGBG 
z;(if+QG0&FDaii;|6e7#h`v~%ztQ}-_k_ny(!#Kofl00;>IO?M&sjwRvsvj6}<4hH}r z!9J~F4(>i;qUZPW)FBOA35uAFkho-cKpm~JeRXT>@DsI=u&n-5+pF2)jD8{T2tu>s zB7!9K(4B#+N;%V1jz8iMs&Oh_C1yhLiN_4^f;_VQRpwSVlL4+lV-0R{XkW>Ar~B@u zR8Qa=X7NuJdL4JHWan&(SQ=yOr|sd99+uX)XZA~bvBi5BXUtK8am;>XW>xX3N=(4K z6^_cakvWt6WRZP-6)~w6N}VBP1EXh5+Rg4_0f%sHLjFnf*R!#Aa~lk+>DM z^YU^N4fJqNFzEF}i&M1j3*lirM66tyGjL53lL6BdFMG37Rp{K}XnM9X^3O1o4n+be zC35L&Br`49_slQ|w`JD$JI{^!J8a*|>o$r;qPo`5P}wYrAx;xerkP^DsLj;C%Jg>O zz>Tq>jS;v5)R;q>WTyTe=t{lPbXdMxvDQG!?dly5Xc&DR&ResedK9-$a!9&lBcl%4 z!Il}EF$sg@SNPWyz-oz|7~}lc?zO^EkuNJalCBOoSy8V?en1?SnxL!`GPi>7JL~r) zn)VifTx&*OicJOdq{zf9jd)EH`d-E|4j0=}l%}vqz z#u5E)@W2`ZL? z9lyxiv2CwCXEp5S1zs15dc71UL!+I`@?fQw?#g`Ia?g4h^>zX41Jr|Mwc%L!Fr} zVH@R$9snTyy+(X|@3}kpoUf4eJJX&|#2H#=4DKMUIb2d-Q0w~TX8B7Qxe03J_p8U%X z&QM5!XaEE3JH9wLEN7#OU1MJf`>Yl!=prU z-%vn8<1t8LB_ZVgqJ*+cJPy)MVvLyEuB$d>Bsxq8S;#B%L}7MGRN1vKxSK^&`8hqv zN{IrLJ!}5-n(qUC-ThfQJ*7GHcNU4Ym2ass(O`+2Li8+X1^r9K{Yxxm+H|f@rCFf*Ctx_ExPSy`X%1f!;Kir*}O}=0@E3zB{exOTLS>(ms@( zTkRNhz3C!X`hFOc|4i8o+6y%R4(#L$4vfw^mdoZe%}BNSJS-UrL;=U6dYfq^ufUZ8 zESo6Qi26XPyG>x*=vN(8q#;xR!PN0MaZ({hU2E3H@SUZs9AZIDv#yTRaq3~vjY-R(=RO*+!Oe)oOnqm> zQczymJH0SzRz(i(#;fCdpv3{CeZ7$u6*KH>l)~5AQ{KxkVU@95<%JGBb7~@;)y@}M zw5gj?f0eSC%XGUvi|7ie>Cv<8p)rt6BH;EYHRlv`unSIGW-TU4+3gn!fG73?$ z9r}BYTW+xqYB%T(2>^o}jjRra5Yt3Y8naEgW$ttr&NgDOpE`vBUhp+kn#jluZ1p4g z92vM4{pe<8&^2NS5Rujtt9oh?oYBGppSJ@)_~ra7*`Y*@uDhhi#LH5-Y86LAh{0ZJ`OInw{^lKOWklbR^#Ye8wCZ%Gw5#v| zX?V1^nTe}%lWR`^@tLAmVUl4qjZ=2F7^Ga9wB7C60H?t#vO_oR(F_t40-Gm;KPQox zdZue$!)g3w!jC@&gXK8b5_)HUzEG1U;qprCk1=@d&5AdPD==jNn)W*^DF{V{TYx-T z!QT6gq64U>4E%mYc&|1i>~e7`Z7x4Kd{k{C@8%m`63naOr#AM$kT@S3RHmZKcAg1w9FTAp|GLVR{lwx|>}sRsTq z6>GHZoLZh=#J&tp{#=;(IanbCqv^?T7o$12`Wf6);aD6}3<*uQPN9z?d zOU3pL)qV;$sJMf;04W|;j)^0e*<@SZh=tyQ>DWRYnG+#q2oXmkde=;E9rQJ4X{*bS zXUE#k$v@NiYAq{3YLUaawI6rqIb)2IsT+%iI;{*!#{8)SA4?A`)b;omwp{l(tJv6a zowAs%fZALg{hrX30x692QUJ;TQ^NuNyk<>}$!46y2~O{Ij{|$RlG0ZkxxH<0R%&J# zP{SJbjt%-RB4Aatab-tqKZ_a$1rXe6YL*d`RF4o;aES=IA~Uzrln9qph1`a^1Cb(B 
zOZ&lw5*$;YYhyu1I|5H%U^m&@ILeC}W@ooS=naa*3+{6&9aqBN#F*a7&}OEOUvK*` z>}CtLQ(rveVA5ha3U;y;DjQy1B>LcJk%&1V_(Lh@mQ-i+FlZF3t~^+!W%{kMzg_7+ z{n&4Znxf_QP8=M)aYC!I-=$6fjhAVSSFAia0Cwxm4R=bXPS&1HBqm9)>v6bneF4JB`iZNrG9pi3I)G@x+L3J|h{THsU9KCtcFeuAU7vlc zyb?1VjPrLU6agPES70+BhxMgc%YII-KF+o<2YUlwn2V>=FWZqY)K1(X4#uBt*3pA~ z)TX3#ku%&R+6DH}S=wqjr9DKp8c}%7ZSh%eQya`cD?PKWalaAIGkTVsGmhVMM~0Pr z>4U7v|;dBL?+?Is9_n`X994iBq)f(57rW{Q5$k zhNl?s6`vqSEn(-tC-gXlT*kWe`pj2uEwg6?-0c2N-PO%jPSvet;j9cou8+B>y7&QI zuA+w71(_4_ElLmk4DXsv5W{={j~u&0dO!ZdjLYLiu$EUN`MQxN+w_{K47rJP84yiDKoBHtm^H1QQ z<+wx_bnSF#FpkfO8gvfIX92n{A9?>2x<&eTMcQ?n`c|-&2*Or{;-3|<_4Yn5NN%#u zdB=5a9MQDz=E#t6q|Y^!rRNf(IhdP4L&j$0XYY(Pey)G>7~X4n^7QF}dXRhb9Rmah z@4BT9TWJo>Y@rE?6}~MkpXsL0O`}cpm34O>=b`%{E?W95BTfDR zZE;`Shr#&*+AIfoeTbG>2j;_6xDsmIPt{h@o7j-CLSb9=Gh(YvcVS^DWI3cEvj3|) zfH_`aHXhU0ULYP(_jdI&l6a3Md_>GCBRJwrl!r zmh}P7!aP(>3GaLhSs9)HsIfRN!nYGHpS+|1Oxz3WAkFJ;OT2^c+cCx z$MQTit%hCsd8pn#zfkw9#0@0$%| zrjEZf`W3Hf#Lnmv;`UM&_cIc%!V1`SO@pRbue)p?ojJ*kcB^$C{TQ1{hMilaad1~* z=d$!}0XFYdLC1Tq=V_~UwhqE+RS1STNK+ziS2Ob;mJs!><0l0)5%Tr^s1Vawh%`s{ zgcAr?-*L#5AxPK&R@BPCGDBqwV%_1vBIS@HtM-)@HHDHa;imyyRwBMiC>%Ss&8A$M zv zE!J5&s`7YCUXDpUy>yRcv&@ng_sOT7yv^6l`(Id2U3QU+WSvm&10+G-TaJJ{^3I{$ z*KGIk3EHl1Wy|FoJW`$wUt>S9qAWf`FwL3MUqZSjNlu2Qj|J#z0&!^0`R~6RSzycY zkKaEIFfIZwZt?%1h_I!-u;IT5zPR`M0~W#h*mGw7&ra|h%7*)U!2jPPGyFngKM1IK zpn(6M@BWKp2K{!-7u@p2fnAi#KLbl6{uTYdi08#P7scjJoLsE8|2OHm7~vwF|A`O+ z{ugq;2*1cZf8gX~|A7B56J11K?9e}GKJ0PMzx1oFCJ38-0DuU4nP3Iz^*P@G{s-D? 
BQXv2U literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index b4442ceec8df5..fa67ce95488ad 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -132,8 +132,12 @@ def test_to_excel_with_openpyxl_engine(ext): (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), ], ) -def test_read_with_missing_dimension(datapath, ext, header, expected_data): - path = datapath("io", "data", "excel", f"no_dimension{ext}") +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +def test_read_with_missing_dimension(datapath, ext, header, expected_data, filename): + # GH 38956, 39001, 39181 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") result = pd.read_excel(path, header=header) expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) From d6c3af1ea172f723d72d9eedc6c264ab1f704c08 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 31 Jan 2021 10:14:09 -0500 Subject: [PATCH 4/9] Return "" for null date columns, trim empty trailing rows --- pandas/io/excel/_openpyxl.py | 19 ++++++++++++------ .../tests/io/data/excel/dimension_large.xlsx | Bin 4894 -> 4920 bytes pandas/tests/io/excel/test_openpyxl.py | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 3f7d6382314bb..6f29f31441c73 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -503,14 +503,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC - if cell.is_date: + if cell.value is None: + return "" # compat with xlrd + elif cell.is_date: return cell.value elif cell.data_type == TYPE_ERROR: return np.nan elif cell.data_type == TYPE_BOOL: return bool(cell.value) - elif cell.value is None: - return "" # compat with 
xlrd elif cell.data_type == TYPE_NUMERIC: # GH5394 if convert_float: @@ -529,11 +529,18 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] - for row in sheet.rows: - data.append([self._convert_cell(cell, convert_float) for cell in row]) + last_row_with_data = -1 + for row_number, row in enumerate(sheet.rows): + converted_row = [self._convert_cell(cell, convert_float) for cell in row] + if any(cell != "" for cell in converted_row): + last_row_with_data = row_number + data.append(converted_row) - # With dimension reset, openpyxl no longer pads rows if len(data) > 0: + # Trim trailing rows that have no data + data = data[: last_row_with_data + 1] + + # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: data = [ diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx index 6a8daf607f299df20579bfc31ff97bea919ae1bd..3aa085289eeb830a48653c4b9a650b2a345be065 100644 GIT binary patch delta 1160 zcmV;31b6$MCb%ZBN(K`tQ9n`z^K)4n1ONcw3jhEZ034H#5gQ2!00SP#7GNI8lYs^z ze^ZjmE)u8SE|A7SoVLKe(IRCLp-7da>?U8oL+TpID1u^<07=A|;o&(mlxCt&zcS_= zh*VVYg&T}~*C9L>nDYI?{r0f_aOOHnBaRUhoGjcYqTEl*chOPEN0kzy9RT2J;ig*W z7oMl$lw?SaL{2z(5+O6BVJY{X$|XVAe{RZ{H}?Ixmm$jC65&F=216tXjmfo$i;Qqx zf=I%U2Kp+cxvJ5A;aA~diH;z(=F_2gU3wb4U^>9dXe@;ii5|rwtEdfCo_psnbY}5u zh-FAV7WoGN&OzLcGOC{}M|T-n@H^=wlxZTr3Ji5g5K{zvIogvg(cg0G*ALYET7@tu;R>O4Ct6~{FYEErNXo;Ch0DZZhcfK|() zGTw{hO0X}G3AkEa$G-$pRvRSUr$F(BB)WmrXtyPd#G2slzD=PS9I}1Nc7inkm?WrR z+E9SwgsklcV7zd-$tMFyk((n{e}plk({*AK`w!sHr>^rXM7E6)gDeHU-&x<3RVE+{KPc8I@Crt=nIE17<5`m@sLB^&at`kC2;S^xs77t6fATNNMBk7D zVGwT#b!~7$sn2^spyG3lG+IVd937~I$#kK#$jYGR2y@zaJo159N--w9@s#1eVPxq* z`54NTxghnmkIi|ew8|a}n-y3OOF!}sKsAP6Ak{FN{DxUDF`4;)iA^lAafevY5VH|i z!-(vqe@WAL=QGLKD6)FG7=b<1I+S0%JPpZ`}h6NLgz5{`Yov#=Fb;{ZGqx@j<9Zoe^R@0N{GzXU53$Akaxha-;`B 
z5)PhJs2mwss$HO^A_%*GmvR;i!*Ckph;qL~I9IR15NS#iawU=?C)|`EiZEn=zRsxB zHQF!yDm<*v0aC5`v?yMco(3-(_waI>D4|7a28qZkYW*zFg0mNS=E-Y__w$IH+D;#+ZVH zw(OoF>B7H=&Ogs>gbVmi$w9ZCJ)MceHSjAK(lwkj{;ep!p`3u#sG>67i^EE=FCYY* zthVD{0*ciJMRyr6ydkM+APw4X2qTFhxc%O`Fbp=?Jmp)#8URdER4`*Hz;;5__B}9O z_}t3L08&W1#fmV0X4CY&#K!&u_|viPJqwX<62w5HC=6TcTlad`YUR8^Poi+LR9^cr zY=wBVCW{-|BDpNAyb{?Dr()rI2&{Wjvbl`f>+)4k|8xZbUsEp&+g;l!HM&xBsvxVJlI2?pPETb3`-gwIJ-!O7?pnMETZ4XF&>Jz)4 zEvm(PnZU(6+H*4v-wU Date: Sun, 31 Jan 2021 10:30:40 -0500 Subject: [PATCH 5/9] whatsnew --- doc/source/whatsnew/v1.2.2.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 50abb0e872331..ab952485d29fe 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -26,7 +26,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) - .. 
--------------------------------------------------------------------------- From 8cd7aade9a3410656227b01b3af4458e65820b12 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 1 Feb 2021 17:30:58 -0500 Subject: [PATCH 6/9] Removed fix for 39181 --- doc/source/whatsnew/v1.2.2.rst | 1 - pandas/io/excel/_openpyxl.py | 6 ------ .../tests/io/data/excel/dimension_large.xlsx | Bin 4920 -> 4895 bytes pandas/tests/io/excel/test_openpyxl.py | 2 +- 4 files changed, 1 insertion(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index ab952485d29fe..ab2838cd7918e 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -27,7 +27,6 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) -- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6f29f31441c73..1b706e062a3b2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -529,17 +529,11 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] - last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] - if any(cell != "" for cell in converted_row): - last_row_with_data = row_number data.append(converted_row) if len(data) > 0: - # Trim trailing rows that have no data - data = data[: last_row_with_data + 1] - # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/data/excel/dimension_large.xlsx b/pandas/tests/io/data/excel/dimension_large.xlsx index 3aa085289eeb830a48653c4b9a650b2a345be065..d57abdf2fbbaea74548d94ff72dcfc1468fa24a7 100644 GIT binary patch delta 1170 zcmV;D1a14cCZ8s-N(K$Ei$PK;MdH-k1=2W((-zn_TBIx@6v>j5-Q??cNL{Q* zMiCT?1V|#zj1JG7p*RzN`jsJte zAV-w@CBlVz4TeZlnviRe6glCh1W|+`1N3!9rLNI_;aA~dg$|Hv&8J22y7V-7(YS|~ z(?kg^QZq6*B(!Ds6iMg)Wpwd*9NG{U@tu-`ZasTC z6NeQLEEv)?o-_WfD88YbfYqp?GTw{BQm`+e1RSll<6i=j)dod(8Bn|-sc9e$+HMFV ze~BTu{ocAT3^v(3mec0$$mJusg8-0H~yQb@bSk}ziT^u5H!{sZ{a zvF|+#k#7>jK&2=QTkBi*de>_8yg^T*aI#ch`!#HZc(f*q8`>(lEUdl~*)OMJ;d=tGB=`Tv5Z^)i7htHxv5_OjPbODJj9U@oPICQ- z+3DmaZ=Bqumt3bVSCby~Nhi5}#M$ZOrf;0ww3pndliajNecDN`A93DEE@<5hQc-T! 
z(zRr%`YgCXYzZ>}QOCUOAY*9KcLZf){EF0$!p6)s zoNzEZn@y@NbX>wc?FOOAaMt{Q@>^j*>QkP~b-5!zDuijRgQh7L5?X;IYD1riyQXxj z?W9y`W1Guuk#9+Pd%6Z*-{;k`xxjtT!mgFu8;b*f$OvD90&oHp1p(Ype>C@{P(~pN zjc3bH@_5PdkBpjTCh%YcYPH`4+R#$ucDmBGgL1GHnyk9)x}~_56i$*&q^`lyk_btO zyO=?XQmvKK$ip;Ep-$X5{3E5>yvF$R{&?OOvB0<-Di-fiEMJHU4lr$kOulHrMarne&JW)VTq0)wdT{I zcwKrLykI)O%V;cx5{VwgBCDtkRi1n2FLY+{Ylvk?J{EuZ2LR4N+>SD;pDagr8Cmc< z=_HhCBEJd@bx9CY1bjK#lP%HTa_iR*;tmY8+gnCnwRss~3Mn*YcO*#`?qzWCdFq=G z8}Xfzqv||6Diy~y5G)u{HJ&y8Eh)aCoPbr!qB7o#<4UkEkO{b2UB|x!QdS!z-KRkD zh9tUy)M$UVC5*(H;O@Rnp&1;qead!%H2|0-s9@Sqfa8R$?FV4IaJk7R14xmZBUXel zqtkU_6Z;R~&!?{QEJU`A5rZrRzTa8j+U$Av5`|4m}RM(ZK?7v%0%Cg17Q$v33Y97LaEPtL7?JujWk+DQXCzq zg~@cGw8+Y!<_L4zcs%lfSV}P_yz!LbzhPwQK=~NTmAN4GwU5ntrnJf)3!4>K4@*Dt z4nTi3hF>7nFq`~_Suin~`G1K`EU|HiSkMr&5m&>A@ym$SFk*6A-1LpbO)asBCB{#R zD4}>UsLy-ih7sqd z#Rc>F+^eA66q@f_iPE*a+!eOF!ojYvzsrB?e1&pJx!&YtuNQBUxwbu*)MGkEzCvr=liTxz1PK!&ZRN6GM+alYM@=I-pg}Q%t zRLiDWxQ>C{NP9LGNB)o!zJUzDZIcudUzu-^HA3ndw3b-pl-MQ!xhS)>wmEVzO%kXR*A~B} zRGT*#e?GMD)-n^Axu@O1~tqPN{3LKLO5-I_=lQa@R0hg0@5<{9 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index a9ad4229cf949..fed8e2923382b 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -136,7 +136,7 @@ def test_to_excel_with_openpyxl_engine(ext): "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): - # GH 38956, 39001, 39181 - no/incorrect dimension information + # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}") result = pd.read_excel(path, header=header) expected = DataFrame(expected_data) From f7fbf7fdafdc4fee1134f3a3db621c92d2324e01 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 17:58:36 -0500 Subject: [PATCH 7/9] Updated minimum version of openpyxl to 2.6.1 --- ci/deps/azure-37-locale_slow.yaml | 2 +- 
ci/deps/azure-37-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.2.2.rst | 9 +++++++++ pandas/compat/_optional.py | 2 +- 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml index 7f658fe62d268..e4e356155b25c 100644 --- a/ci/deps/azure-37-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -18,7 +18,7 @@ dependencies: - lxml - matplotlib=3.0.0 - numpy=1.16.* - - openpyxl=2.6.0 + - openpyxl=2.6.1 - python-dateutil - python-blosc - pytz=2017.3 diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index f184ea87c89fe..2e0f35ba199bf 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -19,7 +19,7 @@ dependencies: - numba=0.46.0 - numexpr=2.6.8 - numpy=1.16.5 - - openpyxl=2.6.0 + - openpyxl=2.6.1 - pytables=3.5.1 - python-dateutil=2.7.3 - pytz=2017.3 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 49039f05b889a..215780272139b 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -274,7 +274,7 @@ html5lib 1.0.1 HTML parser for read_html (see :ref lxml 4.3.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 2.6.0 Reading / writing for xlsx files +openpyxl 2.6.1 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 095d44bb84590..4a853b9d02ffe 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -10,6 +10,15 @@ including other versions of pandas. .. 
--------------------------------------------------------------------------- +.. _whatsnew_120.api_breaking.deps: + +Increased minimum versions for dependencies +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The minimum version of ``openpyxl`` has been increased from 2.6.0 to 2.6.1 (:issue:`39001`) + +.. --------------------------------------------------------------------------- + .. _whatsnew_122.regressions: Fixed regressions diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 35c7b6547431f..f4e11183624c8 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,7 +17,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "2.5.7", + "openpyxl": "2.6.1", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", From 21ab3849aa19ace9af3cd142c5e1d3494fea8db5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 20:41:56 -0500 Subject: [PATCH 8/9] Changed openpyxl minimum version to 3.0.0, mypy/whatsnew fixups --- ci/deps/azure-37-locale_slow.yaml | 2 +- ci/deps/azure-37-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.2.2.rst | 4 ++-- pandas/compat/_optional.py | 2 +- pandas/io/excel/_openpyxl.py | 4 +++- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml index e4e356155b25c..0c47b1a72774f 100644 --- a/ci/deps/azure-37-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -18,7 +18,7 @@ dependencies: - lxml - matplotlib=3.0.0 - numpy=1.16.* - - openpyxl=2.6.1 + - openpyxl=3.0.0 - python-dateutil - python-blosc - pytz=2017.3 diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index 2e0f35ba199bf..9cc158b76cd41 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -19,7 +19,7 @@ dependencies: - numba=0.46.0 - numexpr=2.6.8 - numpy=1.16.5 - - openpyxl=2.6.1 
+ - openpyxl=3.0.0 - pytables=3.5.1 - python-dateutil=2.7.3 - pytz=2017.3 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 215780272139b..06e1af75053d3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -274,7 +274,7 @@ html5lib 1.0.1 HTML parser for read_html (see :ref lxml 4.3.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 2.6.1 Reading / writing for xlsx files +openpyxl 3.0.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 6d67429e6d40c..c67143f5beec4 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -10,12 +10,12 @@ including other versions of pandas. .. --------------------------------------------------------------------------- -.. _whatsnew_120.api_breaking.deps: +.. _whatsnew_122.api_breaking.deps: Increased minimum versions for dependencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- The minimum version of ``openpyxl`` has been increased from 2.6.0 to 2.6.1 (:issue:`39001`) +- The minimum version of ``openpyxl`` has been increased from 2.6.0 to 3.0.0 (:issue:`39001`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index f4e11183624c8..eb2b4caddb7a6 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,7 +17,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "2.6.1", + "openpyxl": "3.0.0", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index fc567d5028777..951a88c678394 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -539,8 +539,10 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: + empty_cell: List[Scalar] = [""] data = [ - data_row + (max_width - len(data_row)) * [""] for data_row in data + data_row + (max_width - len(data_row)) * empty_cell + for data_row in data ] return data From 1b1998bfef291cdf751560a253de5e7430154b12 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 4 Feb 2021 18:08:57 -0500 Subject: [PATCH 9/9] Reverted changes to minimum version, apply patch only to version >= 3.0.0 --- ci/deps/azure-37-locale_slow.yaml | 2 +- ci/deps/azure-37-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.2.2.rst | 11 +---------- pandas/compat/_optional.py | 2 +- pandas/io/excel/_openpyxl.py | 12 +++++++++--- pandas/tests/io/excel/test_openpyxl.py | 8 ++++++++ 7 files changed, 22 insertions(+), 17 deletions(-) diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml index 0c47b1a72774f..7f658fe62d268 100644 --- a/ci/deps/azure-37-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -18,7 +18,7 @@ dependencies: - lxml - matplotlib=3.0.0 - numpy=1.16.* - - openpyxl=3.0.0 + - openpyxl=2.6.0
- python-dateutil - python-blosc - pytz=2017.3 diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index 9cc158b76cd41..f184ea87c89fe 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -19,7 +19,7 @@ dependencies: - numba=0.46.0 - numexpr=2.6.8 - numpy=1.16.5 - - openpyxl=3.0.0 + - openpyxl=2.6.0 - pytables=3.5.1 - python-dateutil=2.7.3 - pytz=2017.3 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 06e1af75053d3..49039f05b889a 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -274,7 +274,7 @@ html5lib 1.0.1 HTML parser for read_html (see :ref lxml 4.3.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 3.0.0 Reading / writing for xlsx files +openpyxl 2.6.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index c67143f5beec4..cc5653fe2f360 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -10,15 +10,6 @@ including other versions of pandas. .. --------------------------------------------------------------------------- -.. _whatsnew_122.api_breaking.deps: - -Increased minimum versions for dependencies -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- The minimum version of ``openpyxl`` has been increased from 2.6.0 to 3.0.0 (:issue:`39001`) - -.. --------------------------------------------------------------------------- - .. 
_whatsnew_122.regressions: Fixed regressions @@ -40,7 +31,7 @@ Bug fixes ~~~~~~~~~ - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) -- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`) - .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index eb2b4caddb7a6..bcad9f1ddab09 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -17,7 +17,7 @@ "matplotlib": "2.2.3", "numexpr": "2.6.8", "odfpy": "1.3.0", - "openpyxl": "3.0.0", + "openpyxl": "2.6.0", "pandas_gbq": "0.12.0", "pyarrow": "0.15.0", "pytest": "5.0.1", diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 951a88c678394..64c64b5009b0c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,11 +1,12 @@ from __future__ import annotations +from distutils.version import LooseVersion from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions -from pandas.compat._optional import import_optional_dependency +from pandas.compat._optional import get_version, import_optional_dependency from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes @@ -528,14 +529,19 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: # GH 39001 # Reading of excel file depends on dimension data 
being correct but # writers sometimes omit or get it wrong - sheet.reset_dimensions() + import openpyxl + + version = LooseVersion(get_version(openpyxl)) + + if version >= "3.0.0": + sheet.reset_dimensions() data: List[List[Scalar]] = [] for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] data.append(converted_row) - if len(data) > 0: + if version >= "3.0.0" and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index fed8e2923382b..640501baffc62 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -1,6 +1,10 @@ +from distutils.version import LooseVersion + import numpy as np import pytest +from pandas.compat._optional import get_version + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -135,6 +139,10 @@ def test_to_excel_with_openpyxl_engine(ext): @pytest.mark.parametrize( "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) +@pytest.mark.xfail( + LooseVersion(get_version(openpyxl)) < "3.0.0", + reason="openpyxl read-only sheet is incorrect when dimension data is wrong", +) def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}")