From 74842a59b31caaf8e241bf8286c5e1e70bac8290 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 8 Feb 2021 10:14:24 -0500 Subject: [PATCH] Backport PR #39547: BUG: read_excel with openpyxl produces trailing rows of nan --- doc/source/whatsnew/v1.2.2.rst | 1 + pandas/io/excel/_openpyxl.py | 6 +++ .../io/data/excel/empty_trailing_rows.xlsx | Bin 0 -> 4900 bytes .../io/data/excel/empty_with_blank_row.xlsx | Bin 0 -> 4301 bytes pandas/tests/io/excel/test_openpyxl.py | 40 ++++++++++++++++++ 5 files changed, 47 insertions(+) create mode 100644 pandas/tests/io/data/excel/empty_trailing_rows.xlsx create mode 100644 pandas/tests/io/data/excel/empty_with_blank_row.xlsx diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 46023e2ce08f5..88bceb7a11cb9 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -35,6 +35,7 @@ Bug fixes - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`) - Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`) +- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 205d248bf6768..be1587dbc010c 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -544,10 +544,16 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: sheet.reset_dimensions() data: List[List[Scalar]] = [] + last_row_with_data = -1 for row_number, row in enumerate(sheet.rows): converted_row = [self._convert_cell(cell, convert_float) for cell in row] + if not all(cell == "" for cell in converted_row): + last_row_with_data = row_number data.append(converted_row) + # Trim trailing empty rows + data = data[: last_row_with_data + 1] + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) diff --git a/pandas/tests/io/data/excel/empty_trailing_rows.xlsx b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..920b03915a3c8f1800bc41e76dfdb75e783762d1 GIT binary patch literal 4900 zcmZ`-1yodByB=cbE`cGWOIiVG1c^bqL2~FAaA<~ZkrIP&Xhd3+PH6!dq*X>>NRg0~ z5P>`By}zH_@1L{goVCtc&wKWM-~B$%v$w7$4lWGuFW&Yue&Ro~eg~XAGU$N@j{M_=dov2+T{L z!3Q%mosYf6~JXbIA_ZL2Yg(NRDA+Fef==uxz5oegA42 z6zerekP%1)JwgO>y=!_V8#AOKY5S`_+T^(t}y&*^?yWxWnl5>e6-9IR7Op zY}K2i3nm4rf``1S!s4xzI`35sjPe+@Tik{HkKx(`yi*ph(DC=O8x3m`Gv4^hHpRiR z6{x>NSDks^Y$Q-kIcrVR`ufg!vSXCTSplH?S0Ww+h;JTZ0RT`O0Dv5mhyYAXPnesX zr>C1xfQNgM!2lE`LeX|8fII6UXypoeier+T3YejIF_4+2O6Qh9)4%hy;CzD~<#L^?zCJ-Z1o-CSu%$mSDrZDzL! zDL)>xH41lt7_&(b&ou-BZ!{>+gcqonY9SxqsoC>@gkP!0?lAwMM{(ygi?~-hI_8*_ z#PTV7T+%T44c-k!67}SsS>u9N?seB%DZZfpoc2?FRM1l9X7(4Se4>y9hRW65l~C9RCq0|n#E4|a#aVxRI}_U> z$9Y*hmfekfX2T(FEIEOgSE~t9G}_rr5o>jHH2LG2PHHpi%|lCS#c8@Z@Z`iZ*AQ z5=DKrS6>RbSi`65gmZ6KU7BpV*NIqYmCz zkE}6tnwFSOSXu?sT4bn(+FU`t=MvW7W{ffX8)v=WJQ|+a`qnh`}$>jC-m=Ki=zy?!(_5cDkHF?|&9(FES^7-%V zp$c(|E!)`Q5C5~>~=fRaUxE;-nxgQvEc&kMch))6w#|fDy~JreN37v`B#8e z$`rs%v_;NMp9o&vLo}V9@&f7`ljz3Umo%wZ646@%SC~+W`d3SbR+%ca7fKGU?vsiw9ts%fGhi3 zHdCk*4g%Hon@MbAOS-B_L#X_NsS~ji#69A<28b{++du&#>Di+@B{~--Z)+(ogTRYQe)7vjN-P=bs`SiASzu$DgR_QC`TI3^W8PWMsLyyzM`jT#qU%fN@N)MnFHiR`>|gsCbB29}(g$}WyS zzM62iG8<$?A&&Kg_vN@Dd3T{sXAN=7apH61vFc!4b&0*)i~|MaoYr9;i|=5@4u#@c zW-j9+Yut!rgQt`;Q3#EF&O9+YuhRO6VOov}k3p3Y^6q=7ltQOe^S9dYaMGgRZZIye zL&hFoO(3*OZPyO0a;vr8HwrL(C|7L0=c3mtBu8p>eFN+D896!Cdm)9{T6) z9J8N{b_{wcx6iO5yzfA}fs#7Og$lXS+cL{scj=4Uv3r|2Rh(nn5$>O|GCiC2p5Zqt zD{qpmjOR_u=ISR?O}~aGaEYIbH6O{6c*H+A8vbO}kRzAf6?z|+Sji{pOJbOSwr8Wg z^Nq5S#KUUq@S{jDdeT+q%z@X;LpS&tdoSo|2HsdIGK-K7q>4H*j-d?b|G|d}cLwlk zsE`&N>r$O zUC?YQ4=>`{lMn5rpl~&}*Nyp=Y45t$o>8By7+wob$@6^+mN3zSTSC9@>zGjP-vHKh zY<_9JuIcYkJ@tgDq>EfzE1|Vu)|p0oxZtd0l(2j^#tb<+vo-(hG;y*-@X47jQAoGn z9h+8;iTZg$htN)9Y}#BmyZ+!Zn=M+T=z z+$BtNgDb7Mv7+|eUFu}$B;9Jit;}El$TmqI_Rz4~eH-LB+QGkfP6d*(JEeP7!nVR* z)l7MsPWb@ZgZ6=SOKL+`<4KuB1I3!e>^>&Yvv7S#%0OsC_xlEtqz9NMmt80F<)y(@ zpnHDOO*fHMrflPB0kz^GS)|+o%{fA6rkD1cTAcR&~!A8n0R} zZ0u(qH%NrHi%(~*c^&I2mPn=V2no*ZWIlPu2-8!XJJ?z!Mha^h1B`Uox6OG5V`RZB+Lhq|8si8oB&|WE1x_tgK)1!a>%1EBft1X;5S}a2 zPfCb{u{;zX)lacQ3kZwh?|U3C%PlLgGk@e5d>NIiFX2x(Y3<0e6-4Ywh^o{L-T2A5 zx7(0;yYl7iOfdG}nNR|JxL$?HfDFc+VvPF*#d3t_rmk0?S zxs@MtImk2xZ$sPkj+#U`M38MmJp*%4>=SC3{rBP;x3#+W*|gn;eq+qFBhi2Zmf zqjJB^%5V~_#!6uLNR^6MQ3DJdldU*>&JYf#Mytn>4?QRi@);-#X zTlW`>WofudaZ7mk+3E;-kRMSK6f#dYB{t_4Ikn95@Htuiocd~7tek4vDk7Qb1zewS zQ1$WxdR>JKGYiuvWm}aad=2lJPZGg=01q5{jrx2BU2)zQPuU;C*b2fy@ur*#Nsj{4OK_Qn@;q%$7GI2DCGLY(V zO@<0?ZY^12?G<64p-g=u{}C@e>D0m?&t6{lEN<5{?X?r0BafZh4OP+@*W3gw{?`x= zK5<3n?G8{;NK-_h77L4sx8a=yLP7`?_f7(5sltcxLz466ImUx~9tlt7xa3!CGAXBz znGcr8JHF5wEl()~VDSf=`K3|1nwaUW(}jL@!}@uKmJ_m3R@;p~4J&d1L*J`1k~9zE z`4ybPfnO7nT~Kv1p~2YRr|OUe2+v)hoNRR18Dxj_?}~KlHV>|0DiMUK3dKJwV(aB~ zQIPCZor{j^-8!M^+|QCCW_!HQSn)WU2*n0!0lqgjCqM5t2LIUn;xTg2`Yb2sNF&I- z#nb@7#=U8&!&06_gDx_eW`^%d$fmpLbJA!NE;8>gVc&N@#(^ivE2@x|_T#1p!ETX& zz>j1jXiEp{-wiJn(q=fw>MLlOcg?>06sbHt;j3n=zT1L_%NTL5g}iE z{J`t2_H-RT2~-KUS=8RxD&o#+24@~IM^P1Qx3*DJc)(qHkj{cwzSC;cTR+a`CH>xQ z(ge8cHRp=QeS9ol>->&^ua;=5_jZmVX;tw@*hn8n-Khcb9+wdgY~rQ(Hxuv-e6JD) zFGpLT`XlkL)tEYDOW`MNVO7;h!O}ye3gg}3!Ge_vCsv(nYwC(+8P{_BIjjVIl&7)n zShkz9X;O!ph!5}D;=q$N=$WN-(#MKzJ^KW$6V1lX+_rSZ9r%bTicR^Vbv>EA=H0^c z7^kZY@0IGTo>Y5ukXO#8om~wi-mb9Z#(DN}FL(PD=y0Ct%w@lQg{%kSb<~cZ+rj3a zOWyN7`xQ$V9{wB2olKbmg9j?;$PLz0E6UPy1ml9mm8jxjC+FK_h!pa?Ogy|nGW48FVr`~w!mxY-M4{?C5!0?LB(d%*wSDbxQ# zVm=6{jZnn<&v*aDHv@nB=1ZRW^1v?3<)49N68(z)U&QlroXcYKCr%;8;s2ZTT#j&= z&i_P+BKa3`zYM?3J%8Y|WdDHwFB4rxU+&O9XhF=O&cF1lt|kzZeE@(EbD3ZSsNsU| G0RICPPjXKH literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/empty_with_blank_row.xlsx b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..fe3bcfcc269d77e56e9bd99a4225e66a38bb6f6e GIT binary patch literal 4301 zcmZ`-by!qe`yCiM1ZC)uMnpnD8cFGy!I4Hf9lAuiJEc>}OCyMsbV+weH`3im{D$lA zd-U>s_nhaP{hUA6KKotgeb;{1QIrRwfB*mt48VEG3z%`fcjY_)01!k50I(6Gj;)co z9Xs3Yy(p?*4)_#5VAm};(#5fYP|~ohE^r`O#?3RK=UR_9k%QF1&BX^pBQG00Le~AT zmscWb=7I5XaEf%WlvTb4OKA8diA(0kd#9V^b!Mu)bbTsv^mqnc`6I37N7PZ4NT10$ z#$Uf|R?`+()Xk=-4ltZCbn&&&(gEHWuAKztoy?lf{tQzIIQ5M$$=j3)a9q|#mf{~= zRLzKFJ$)p_&Q*IK+AsM^IgJcbW6th)?gGJhG^SZPA83NT&zc{@L=5R?K^9$s5pcAiziBM+^qgTjwv`XL124)a_wO)W%9rB%W zPMq)@ZFz?aWTIx-AmfyCD(8?Rwc9qRweUoI2A&KB5W)rJpG=!R7;v`YO^JrvM1sge z1Qg&)XzM=qxRtO}Lm$^fdX}Af(nx4RVJ=ElhMZSyW}D9VU&Vrcf|-3lxQIbhXM`|} zTJC!q@!p8dJm6!c&h_v$d^$)&@rPovW-VB?;pA~Z)$mBb41P{$Vw4n-Sum($KQ7~D_P$Xz^P~hG z4?p%?ZK9LLY>L@BWs_REa!<2fqmW`Xo9|@X)|7!j!qa|QRn&M5u^u+viQbKwgRI`? z!_K>dE=C3_G(6aIl}@P7DkWw-Gi36>z3&yu4lN8ki7SvBH7;J_D_kXEw+Z%)8YXiF4FCcJf4C8N~5{E44$PBBdcZ?B0q!M1OA%6Wn6q)u4;zN?2RtSel- zGaD8DWhjSz!m;DMz#@$E(>%t45XmXChp1_m!efcPv3>e^LyvaNVVc=DblP9U*($3C zVc&3i7c{~GGdgYvCn|iLmlL;BS54Bf+=5Aq)((ES+kQ8Bsrv#uCER#v))8G_RjA}vsS+?G?(eF#;o5ObT%L#uCe)>9HfA0UQb zQDXfqf@sD`IcmiIMji|^L=(8;V}81;E7{{-`%!AxkS7dnP=ErrdG9Uf83!2@ZQUYi zXg&(e3H697q}cFwmzAE&NVi%>>e}{GaoM4-cpKx|wQ)_!)u7tCLHy4g35EKDd=Tnj za327`{goqj_D<$TcDDtwtp?{r z%k=E%CqHD^x0%r)fH03~ioN0V@?b0XZFETN{MvhXOH9hb+N0LME`=3^I4;+k1P`H> z4mDc4=GZK*X+i?dsjeBhr=2Z`;E)`iCbpfyKewZVXdM z<{qnco6gymq8mxPb}N#F+C%rwrsG1>t2siQfGi)QrY#>R;5!OeANkH z?{`L)IG}?fX<;z2tY30UhL+uiYdsDe120;`%IDO;*p}naKQXnT$S3hXr&dpE`_ChJ zQfZeJ##t8LXg6EC>yN!CElV{bJ$QZlgT?J<%B9<`YpV~78hp6i243)2Q6HB=_5m+; z^7&ilW>S;!qH1(FEfbG2b(5{LeW^Du)(j>2+@n3+{TrNz>?M=xNs<`znqx3tye<-{n!GHQ-|S04^>V9 z&r;-3S_soHA95@mF}xwLVXj!GA&hP_VNoH&`-LkW&vljSd&0N9GuEvvT`u69Hjm}!X2!Va+gT_Wn983CcV0LNV9D;+FG)<|fXuz8c)_oiLT{Un@JL3LwU z7pFuUF@__H2pmwznb7gh{FQX$VE6|*%|#pVGQ86~fH3r9*nGS=l6q>M<`OQI>Z1ll zLKziVnAhgWQ_aGTF;9HmUO+&{Z!@B(J}!QXYEv<8Y?DbZ z;N2>i%5g&w^m(s*vHkAjlt4U!$NQBMkVy4zgP%@%? zrF{f397F<-O8pqKqrpK^LTR=N#inJRq~7PpYm(O^@41&w!V>G!8aZX01o&NCc&v_v zDP)oK>iSGft^&GBC8W=9l5=nlxd5O*OMA%0%TLdO6nrLv(b)~624n{OIL4eNv`0t# zbf8PdtoCwmBI>voJ6bFi0(`BO%0tvLZTvB#jv${ecV6#5%(lMz zE6B7o98HkP>r=^r;MB#0l3@XB;lnbQ>0oSDqWvIf)@GA#=S%FlVGd?+`B=Ktec~5l zZZNiaycmH)|3heim~S@@zt!020Ykx2fbyC9d4bgFLR>{wSg|SQtZv�r~ha5qB6@ zZG2yIzf&9z2URJ};SIhUIAhP1QdhI^3F)eA`?b!y^F_Sh1$^6(?ktsTDI!{uK*W}V z*l_@cRtAc;R@Qdx23EF4x0P!F)3oyD0J+b7qIL@)>iHoKh_={&!ze#uKB4)Ys+2tD zFX7w7pcP^jLRgnP+dwO&jgrkT4ewl+^}^{rp-~IS|%@vVunv-xcvOy zbY+W-1dN1Xe#BEGCAp2L&t+m|XgXQuFG;^y%Y9(B3hR2gK11hroL)f*E!6Y@A z=m__vWIxEP~eWq}N`CMNRnBE7{?pPAZohwD4<(~eJS2@kM{q?9*jdKHY7@qNXE zgng^9$b37bd3E{JO;d>dXMw+*Bb31tH1Q@n$Pl6%RQ2OgPh{&(TX_W<28+d*k;9DJ5NLa|_hhv_i#IY>wAW*aHjB6Pi-_nnrDu zY?nj4Sj#PgCa)KSjPyQzjewQRK5Z`cn{spYZcM+ya1Fy^Az8CK-#rBVp`W+(Hz|JY z+Cb3mjQ9ljf6}jKZGFprY9!RGg#)K;_a~_7D2Wf7CT6j!FeVjiiiW%fwO>U8@1{jX z?qYk@V&J4U`NM}ZS!eSawO1)LjN4jJs)8iYe3t4YrON@2P`nw89t6Q$qC8qg4l+Ln zx`aI!mBh{KK#6y@6~`f$ix%=C%B4r1iwCv6M)Q zMGvH>Q}!DaS62Gn*4?UndzQ=r~>g38ip|dR{b=lteFz zZqZ~iUf?D>iNA$wP!AWgfrg=m#ZL0I0o}j)t84c<0k>@VYV~6Wk?cDpQpCQsQI5C{ z`fa)UO_8&8$-b>w56O+PHWO6e_cS6ZC{$fdJCyd7XFZu-(KG?Y=CxN2U4*ot6eJ6K zDm_)7-c_wtUyIK|hJV;|6lA7Owk4*jZbmzK5Y6HIL&z;lxzewWoY&)-F{JeTW24Ud zoIwSxZYiRNtRC&CSGlWM$LIwf;-J>Gt|Ks?> zvEBvV^>n{cn21mL?+)+oJa=us+L=tl^X@jt3dQ63c$GvdDm;v_`u4H@@tpZ*V(X?ZLF literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 04a484c3edc0d..0962b719efd4d 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -189,3 +189,43 @@ def test_append_mode_file(ext): second = data.find(b"docProps/app.xml", first + 1) third = data.find(b"docProps/app.xml", second + 1) assert second != -1 and third == -1 + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_with_empty_trailing_rows(datapath, ext, read_only, request): + # GH 39181 + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker(pytest.mark.xfail(reason=msg)) + path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame( + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + } + ) + tm.assert_frame_equal(result, expected) + + +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) +def test_read_empty_with_blank_row(datapath, ext, read_only): + # GH 39547 - empty excel file with a row that has no data + path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}") + if read_only is None: + result = pd.read_excel(path) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = DataFrame() + tm.assert_frame_equal(result, expected)