From 3fcbc01a3ade2fa058faec79eb15b99b3b9e127a Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 5 Sep 2020 02:00:01 -0400 Subject: [PATCH 1/7] ods monkeypatch --- pandas/io/excel/_odfreader.py | 46 ++++++++++++++++------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6cbca59aed97e..434ea0b63dc1b 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -28,8 +28,28 @@ def __init__( storage_options: StorageOptions = None, ): import_optional_dependency("odf") + self._monkeypatch_odf_element_str() super().__init__(filepath_or_buffer, storage_options=storage_options) + def _monkeypatch_odf_element_str(self): + from odf.element import Element + from odf.namespaces import TEXTNS + from odf.text import S + + text_s = S().qname + + def element_str(self): + value = [] + for c in self.childNodes: + if isinstance(c, Element) and c.qname == text_s: + spaces = int(c.attributes.get((TEXTNS, "c"), 1)) + value.append(" " * spaces) + else: + value.append(str(c)) + return "".join(value) + + Element.__str__ = element_str + @property def _workbook_class(self): from odf.opendocument import OpenDocument @@ -178,7 +198,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "string": - return self._get_cell_string_value(cell) + return str(cell) elif cell_type == "currency": cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) @@ -192,27 +212,3 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: else: raise ValueError(f"Unrecognized type {cell_type}") - def _get_cell_string_value(self, cell) -> str: - """ - Find and decode OpenDocument text:s tags that represent - a run length encoded sequence of space characters. - """ - from odf.element import Element, Text - from odf.namespaces import TEXTNS - from odf.text import P, S - - text_p = P().qname - text_s = S().qname - - p = cell.childNodes[0] - - value = [] - if p.qname == text_p: - for k, fragment in enumerate(p.childNodes): - if isinstance(fragment, Text): - value.append(fragment.data) - elif isinstance(fragment, Element): - if fragment.qname == text_s: - spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) - value.append(" " * spaces) - return "".join(value) From 3738fd1bd06bd855489ca85aba4730bc519a22c5 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sun, 6 Sep 2020 18:31:15 -0400 Subject: [PATCH 2/7] add test - ods only --- pandas/tests/io/excel/test_readers.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 431a50477fccc..cc70ba663b45e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -499,6 +499,17 @@ def test_reader_spaces(self, read_ext): ) tm.assert_frame_equal(actual, expected) + def test_read_excel_ods_nested_xml(self, read_ext): + # see gh-35802 + engine = pd.read_excel.keywords["engine"] + if engine != "odf": + pytest.skip(f"Skipped for engine: {engine}") + basename = "test_nested" + + actual = pd.read_excel(basename + read_ext) + expected = DataFrame({"COLUMN": ["Test (1)"]}) + tm.assert_frame_equal(actual, expected) + def test_reading_all_sheets(self, read_ext): # Test reading all sheet names by setting sheet_name to None, # Ensure a dict is returned. From c4edbd2c435b5b26e8433536dbf2dfc488ce76d2 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sun, 6 Sep 2020 18:48:41 -0400 Subject: [PATCH 3/7] add files to tests --- pandas/tests/io/data/excel/gh-35802.ods | Bin 0 -> 12692 bytes pandas/tests/io/data/excel/gh-36122.ods | Bin 0 -> 8974 bytes pandas/tests/io/excel/test_readers.py | 12 +++++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100755 pandas/tests/io/data/excel/gh-35802.ods create mode 100755 pandas/tests/io/data/excel/gh-36122.ods diff --git a/pandas/tests/io/data/excel/gh-35802.ods b/pandas/tests/io/data/excel/gh-35802.ods new file mode 100755 index 0000000000000000000000000000000000000000..f3ad061f1d995488bf029f926dc3eba60fdded2f GIT binary patch literal 12692 zcmdseWmp|c)-DoULU4C?cMnc*cXx-4ySozzPH=a3cZZ+>g1ft4nEB4+%(-Xg`R?Dl z_S3zqclUa$YgKiv)vMkuD+&4m83+gx2q@G5MJ~{iBa9XZ2L-YWd9BG4d!2h z{#N>IV{L3^;%NU5GzUgHfQ^l%zMehZ|C^SvrHvlI=>Jvg?M@y4dtI1+bBDGzwvM*% zs{iF4|4z>Vpa*bt_+1|)B;-G={4aC770CXkK0N~iBTJ(<1KHTq893V8znj;^$`TYv zRuba9SAam?cW?5aO@{q;_OFJvwYM>`H*)yX4gJyW{{Z`+-QP9jy^C!1tc@)HivjTd zrbaU>Jrg4bIzcmlm7cA`|0w^R6=2_#t@NzTjEx)swDyL^WAS6w{`3d|r`&-?;|u(B zU|{y-!%gOyoh(7BV00HLh^y^U@yc?!#Rr+Qo8=-*Yf-hS$9mX<;njs*ip&NSB*S8+ zb^BDDEwoK}%bZIK^;CE%IOpd6d6|^W&eoJ1I*EZaY_OISHQD4YDVN+bt%O}a*zLq% z`)i{hp>QJw&91jRFgeCIpo~@I0EbyIuDAF+;VYa)S=G>b_3~}O_4u*A{E<*!h64cp z*v-J27x`oasAX zBTCRsV?2>LXyI~w-F=~*c*WSLsC5#u^^(7Fb7+$r72iC)-jP-cZ$2kp(?V1j@PAH2s;WS}92kX;!lb!>h{-nVYW&Lna{ zc~oYce_}WQXhXtrNK0;;9D%%vF!Cy+;H50`)=CWu&-Q6==Ew5tK6WqRE?$=74hWiVtNIdhWtK?kbFS-bbi)0iEqq;EyeLdmU9UXsiLtP!P-ss5qx zpdIGq<)tbesXjnO0O^P5X9s$8I!ZrVkjb-ri!??-3{ttcsa&xd?#xF`#^IovWC;X* z4t$qL0}=b6OGqR3oPZs1D6|D;CFUO5W-uum&x!-dv^E{23*Tv(1g~_mHbJepdz*40 z6%>wcI!Fg2zXm~=LzK0VGL+0VclyYUbpp0q`P+f9Tke%yJ6rIDqq2L7Mt_E8*Llew zK#U(c5T< z=(h*V0pMzBTUz2fA1weO(S zW31pHXH8cVo*B<|(UsBFGSzX7==8e4Lu2I%I9iVUh{-znym`4ot*(#u^kgy%*RadY znG(($06&_sIXDYvzUSqn=x+0eoAu1qh2^sE>yg$L1EBRDU6(piM^a)&Bg^cxH-o}k zHLOwPbycw@E?6))l?~^0k;l4J#myWQHG0u-PR=!S z9blr)C2hNysS7kG~LH@ubo$g`B6DY^Z9ffmzj79Hp10 zF@Yz)7^oqCO%BIdzRsCebX^Ues;00i8fv`G3yiP(xFFj4)ojsl1#PC%Yoey8R=dLF z$XXL{iZ{BEePyi(u(%w)+(_=8qpi$r+qUXojdQcNbf!U_;bzU0*q}PZX??@nj{h$LWZ6iucf|5d{%ln zVQ#m9%3VFXJsS7E1)3_Y)C%Wg^ul&MMAfsJ4Usu-($ec93nWPBhPs)Zd3J*R_Dfwx zm7pBlO!+!KL2)RmJizbCQ-Dop%!`C=@zv{jdvQ$0gwIYF z7;3p@j88(sAA|1Q_?~e&>TebDHEr!U#aX0AV7M$ljUeEiXTVKJezk?wA{auH0?!~c-(49#^?oLpar6_E0Kco zs=5v0bB)f733SSpu2M0AcJiAbTW$cYX?+~={=(Zql|ZY3@TFNzP6n<5g1{y>FsfnC zG|1O>)}EZ4NJgM1pCdAiYfzXAjl;c*U=!8J(_d(!i$*@BR-eny+J(1zuTl7$K@pC0 z%#`d&D%(bd-xs2f9Bjo|95e?XMsuW6Kd7%+H4xqpQBrJCxoZ4^Sip%va;BDK^9y}IVvN2E z#_L$S=UW+|4aXjUl_JR1bekKEiF0Yvm=*lBONWN=%cmVJ$ux-6rJ<4LjG!6JVSglu z8aI(xy!|Hyy7Grj^}SG|z0gQ4HfI3!54M~ndip;8pMugY!bUMWY&*!8;F_%Tu>4OZ ztCQXdiO&Vw=atRvvbl^~RU&44$On-eflHu8u#4Wa#xFz;0jF*B)@nYB{A!^kHM1#t zQ819nFd?C``>~~xJACvjmHZwn1Bu=OeC@dMO9ET;-*!!hTYo0>&MUfR8tsui96_G`-rwInx1j&&gx5XJ{OnTH$ zY=Z5l9*qS47U;^)QFGT1ybH>~2P}T%d{u>(9FNkPnNnXjf@v3jDvX<-jmARH6(9Jy zRrpA&H-`r9UbT{@%d7ijF8AJ1!_W;%bZuF5eiO+lWQj5zgGg;t226`l^okKzUZ&eU zMSwG>iiP=mnp!TSajZnLQATsP4)$`k+A$O+yn57WWq`u;chw)iR_B~pN;4_l`^U}O zZ8>%!PcWvv1!N6?Yy#l!dcTZhR_fJt*4x_s1wL&?22UtuN<|bOKRMop-+ZY)~exh&w}vsu?QI2^ZpU+#f5Dm7@~1cUF$? zL5fuF`Fv1;-UM|m?WURWNR!argon5LQ_^|lT3%~n=t@6t*x|mXW3*4Hh=oz(R>@cu zrY<$BoxPVfGco}VeZ=~+ThUrfdm7|C&``w!=S0WT^6q{QpAK=SwpjU8XVnRBo1@{G zUj))&K2&KKb5Lze&+E%aS>#eNoQ|l;kK733eVRF4wG!)M{>OMNhEI!=bvrz#n=F$U zy*ilrZIswGbp-kDKQfqlUW4IKwAvAW}|==ytjFgNLtabl4!sZvHe4ic)5o|nU%$)^4yhcwc31- z1{;e9G@!h_S%v-FVxVen4BJAj9ut=zvClblt(!}uhZ)T6fo9s{C}Y^L{G%DM&xNd? zhF!iQ2*o;W>c&^z48SJBm&9L9H_*K?=V=NL#FTU{Jpph*?K)yD-5hEPF8d{WB<(G}V;xW=fm#CSeH|3trp z-?@T7(hcHlkp_uMT3+44--9m<7KMz0FRw@&r@x+9Mm|*FWyb-`T_=zV?4D?Yi_><2 zLcLg57T?Ct9B*OjL~AXOv@sDvZki=92RUW6nrY2KqCIXiWN4c(leuZnn!5H7O?KU0(0Y75&MF&GmKtpK+<69^d4txzehb*zgvI ztdVQ=WUoy9!@<_(iRhAn?eEPSNr7%wBrG)iv-ZxLn|il*@)T0z{Ml{X0xjj9lyXvj za$}6RsS-_wO)DJufnS9ZQ))?FN+ME&Qz-&o?Lu{t!=M^)ZzleyD?CEYOa*M30 zy|V)6pA@N6UPG5<_o9TTjcI@;rZXsSreaNJMEiLN|9!KEUL{8bS4iA9t>Xqas5?+p z;v52j6E4~?<2@M`}ReHC0#@~$!?dI#y&Z) z)#Q{~^sFk@guR?C>r@y|WmB#W^Rtk+=4Azdnc_&^{)g4Sn#?56wr2f^&O2-0&p&VjKF2&xR z=Ho41P%RIT6D^Z}KKO?=A-v~|5o}kv=TV_0!Si(+w$or&n-XL}gxT`eO*ePijwPaFB0BS(V5L>xCI-GycQ|8&Sq}50ZC9lHy=zHzJ{#f9 zID@F0csRxN*1!N>l=32xRmyG>7Plva4=5ZqUf$wcv8}N(KSomLZnIZ`_~-=AiaRrV3prNm|dvV_S9O-A+)H1ZR&{{dIdAyW}wq(E1 zU&GlnocMfZ;ur(3Ve&wKl&r3yy>L`hSsO@c^YLdQnsUmF*7CHV2K@3-0O(S;t1M~NM3`|DEX)-lGDqJ@I0r+7X1#nXLAJikeQXnI_yvnufyyyR zJ%7^F)ywjA!4lW>ww&=M>A)>cl!(JwV@;KP)#P;t9Mh1>!VB}&#vypjh7tp=&Pm!^ z+3fFADlPen+SZ>?pP}QSuj(AZ45ssAUx80!Ujvo4m==wl@I9Cw%+kOtl zjX~iA9?@b!-2NiqT>c_Eoc{SgANiGEDxyB?eI}2g2`C|wGmU^4$&Oh!K5i%p63mH7 z#e}JpiWKgVf&(*D*gW8kg2Eu9z;`E7xy6vu4GEUWZmYoP4D6OzS;_it+X&L6kn3}d zOabQ{h8&ZgU9xFANiPMwXy43<7~Fq{72eQsk?IaPTxg1IkMsq2`GW$cwI2n(31I|4 z8V(b#Ty)?EQ2=^G@uT*-5+zRK6xwbX0f!A-zw4oKS^)?xUqtz5KX93YJ7tXi5W9&| z2z2mbgTZ4(0+S=kg$OMxb_10Xk8d@SOIXX@c)<3SrEG#Bk0foYyQ{X}^VN67zK(OK z_-<)}ML2v7N`v`EPtPgn1LxnLgi0?GVUy-a@Og!ym8D3K6unRv3fx@WPeAq*`|{Q$ zJSrcLrBxWBhj{N{FP6DXZ-&}Kn75Q))`SppkO?97y3GRXVT-A{bx71OC0gv!Gld$9 zn}G6}$KLX^AdRytvNeUvQg41T8?{wsVjx&iVBoCI6YvVv1g>Q4Mp|i%hmF+nQ--e@ zeI|I9C$dDASYESHwy;r%8nsTPo4qgbfh_{iZK`8*eyHMleZ8l5ThjW1$s@$ZJ?*Iz zsJK4#v34PvWDf3>eb8vvfsC8-0yc^WQRs##FX>x(#yNx;zc#zq2By`Pm-!;CUdqh2 ziRIDF%k$L~>$SS7*fx+@9e}R8nH6=q7zB0wn$vV}+jJ0A;Is9oNIIbu5qLf->&N)x zO?4}#@A|D9m>9F<=Tn1y(}6SlM9XFQ>7`*#^}-?tq6%kPVUd9oC-+-U9?|vYEu5ri zy)7jBu}BlFP>Ws-i2IrG)g&TmHJMCAVKkWMIaZITunT=|fe2@NMt6hE%s%B05?l0e zhquZoIiB#F65x`x;yfZwXrF!j> zW7$$pPt6%syxsks~7A-4Gf`FX`VX^w7f=|Rh^iMNcB2AYj60#lZB#e;-d{dCZ z5NbKw5eTPZ=@gG~$rhmVaS1H1jd9_0OAyv>pqE>b>s{w7_m6^gV%I8F+C64N;4TEu z=7^MTWN%LM(-3_WCaz~X4+cf|uX!InmBz8Nrq%SIlg@i1A|Uo0QFqlumx~nJL?#v5 zp>d)2N!CoUK^s&J)l7{GkMkPbDCFEK91Jbc91jdhmneU0{h{t)UYpemR%8+1*;Ppc ztj0z8qP~T8QPCeiX1WdI3G_U`wOm)3D#KWM6@ZeSJ0c)cGQR!5r8Uz|tz4}7DDTN! zS)`LlsZ{D&v`T#xr7_?f#4ULlFZ6Ct_jjc1BO7le9dpX)_bw>!vr{9jA7 z&y&>9^#oFe`x!xLjV;_QK{6sN>$-}rR^pb~^;CTS!NH{=!bIp6p0qs<;rja_#~8&AK$esF`$T6?k)U64u| zEF{K=0#p9LU`-cKC#lziiRGOO;)}>OygdhQ?w)U`g$bjORh}%}S3M{2EfP!bo~Yx* z&s3QLSb-c|oMqoZl!Un&>D})L?viBu>xHI%-QrCEiSsZxhF<4IAYVPI1zuV)hDXCd zQ4<6M801jF>_`TClZB>J_q?;wB?Mr2pUe=UJQ{y8Kl-UJ)YdMDjf{CXvL&Ix2rJKm zdG-c9qfw>r2Ae&*0mcUnYYf2>+pjCN$*4ZetK09W@p=m-Icjj>ZVb-2-+8~M^PWhx zlar8ufH-i0|0liwcL`sZw{)I?jWytZRtDNsHEfnyk-gR`=$UmFa1HyUxtO&#vL%(X zG^T+ixsGVE@`ysINT7+gCB2{FDPr>thm_{a=<0hN9^oefgPXmY=oTzt1m(3?KZM^4 z)B7{-K+)8s{(5eiUl0Ga{g7&rkhw?HZ+ni7$;nT~O9TBIVKqXE5 zwImK1UxiZp99n{^h4ZT#M$4ER^>$t}lzm>Y67sQA%l(P{xG-wlBhhrNY?rohA{UZ) zcTi&z8d{PZDv406t)$*<+1G{LV)O)6O)V|qpaWGB9J#iT7q>?52nTX8G07q^41WiF zGC`>ou%4q9x`stXHN!r3WEAW5Am0k&Pljwt;X8f0j~m&X;QW%zT@CGg(H|5w$w$gI z8Gd}*Id&<5hb4V1N%q_aK~APDb@g22$=MD*XP#6XNx3Kzxrqy;z=ML8DJWj27^cyp zH-M-q24ljHX9odS7ZM4-@BRw9`}AciLwM|wL5enh-^@PwZZ%gE`B8ZR!{(TpxJv7LY*!cM(0n!I|~Dlivos%h9=- z6Ha!N!!FVdCF2)mzjNo1F_`{3kP1mFwC;U>&~6zvC&fd*QLW@b~4(-P>e(BN$5i& zg(Pe0u2Ft z!7CZQ3tBCiTP`{?PmQ^8=)5$B!1%BEJHRhrQ0cqBY&cYgR(T?M%!39|@fHW=qB&%# zMePpvJuP06VDf!zOhmYlnDVQfOj6Dc+HXX`=(j?HNh9G8KpDcX)nX=LQQ#@y4oy^kPV?@Xy6}0~| z2Ut{J;kVY6Q)y=`oR9rZT{R`!G)G;RW5?*vAwj5UUpZIG5eG0gy3I@VboqY;!XGIo zwFZ2OV)YTpb3Hl3K>cb^TEndvJiLFz)2H^JRuNgbIj}`;4$UJ}#QNI>_g8cB#C- zy;}dymh}a%m#3w0WW@!t;ApFJaeGSYm>$<1jG~{S;c{ z*Yja;U2@*r&`C04<#1XBgwx6DU z=x^gD2{U)Ff9j=U7j3?Fi@zarH2HM+<~M~8Xq zWU!js$zM8MG+i1Y(qmoc74+SM=&aSVLV*JT5xy<9|8s%(?>xx+imQVW0AOZq^3Nq9 zbt~s=PJ|aPZ{cX2Q!}L{Hx-#f+}d`4N*PG|ISqe8DZx6iKyAkh`0Q#qRawbojr+-i z09c$!(2(IyvTw@-)khs_g3XY)N6qu%m=>Sje0ajk$Dfm?+Frw+9zpfRIDUS#AF{L6v=;us6d|KS3tH1dWt#CP*#Yl=o;kR52!eQrpTiMB|3^UKG)5YZ5*sYloI_ z{o|4FkPyu}+oab|TLnH`wxrbpxJ06j!C5GIiriRja8o~d3SFCQJ*gk^Af67*~ADRmafOC0P6#zESTiJ!0cP zvv^+X=CcbguIHCT&#~|sond*AUQ6(wgIAV>rZCya$7*#hWTE_UEw9NLFFz4OUPP<* zffIy%L3D*&o&5*6E_}|f2U53vA={Naz3}hPjaG7D1o%zGy1?9a4hR~UNSxrBp=3jp z#@~W)*dcB>#phskGw_c(hvu6o_U+SzqE-g%+{6cUmbvYnvT88;7C@+D>5y$k$RzGJ zr(RtF#95@FGuacmLOjxEjRw8nVy0BRGhS{&&)gLa!P3VvqGb<}6`Uru5~l02e&SPJcSEn8^NiXv^1CKuCOZSD*Wl zJsW|Mn^g>%18HSHLh5GF5yZpa+y|!dRM8A_0cga130Wgqd=fut{Hv&Il9*VQ_L%iJwF>0#a*d(Jx^0*N;*pXru7 z)Ueal@}^kOo^e)uO?A||920N&i8J^*9414Cc@ix(Z+>gL5`J`Wmsrr`6OoawEVltW zyyAxLgsZr!w8?>})8nL(38k zK%MX09!;JXIx8U#mQHXPDT62vo?Fj!Cwif}JF>)RYC+%e^hHoHZ#!IisIh99{W_Gz z;l5#2*l0GZo2P+dNo2})zp3J(Igzt5> zW_B*k=GV&-`Q=hc-5ve zxe%@gKUz%cAUa@+Ruk4!wAt(e3z&?zd$@T3kK@4f(rX5($Y8iew=em&6z^VTQ0ynr zAw(}KF#DPMXG}lV5^RoAU>?5I&Z|MZ`ik8BX;&uKkSu-b70M+)okAsu-XO9MMZN9j zFTreE7viVz%CBJWPQsfkulx`U2*{KC4=3>l7pR}4rcdVW%=`9E*HScfw9>cMGqZG{ z1N>2>wY4@0m6aBSgT{O(7Ql&%3CX{GzrJn1Am4ZhK#$7&^=~vKZP_o1!q6Xp;o(s~ zBI6;!!F+;8MnXVBM8L+v!a~C!#>2(OC8a|lWI`q3AVGyD!9u0NK_Vl>W5!3}AVuRQ z!{DVrB_YHoC#NK&;i8~p!lL5Gp%*5i6Chv_qhS-I;#FYc7Uty<;1d$(mDc5w)#p_( z6=fojWF-*eW0w?QQ~gY*EI_Xy#I7j8ryCc~g|E6zfUaYhkz2I6qrbC$m{t*PgZw~H202G_5P^p zTdW;g?iihI9^Y)A+3qg;Hdve4Szj_-_kE&+X&I-HX-3+l~2= zt%aeDg~_9}>62d*+rQ?H*QXA**Uq=5uMSt&_f9tt&vwpkk1sBdt{yH9H_lE^FV1(L zFLxd;4=*k+&u$*C?w_6?FJE3>-Ur<4tIoKD(c4HX6&DgvbXhu{Qb`5?`jEBO zndJU*Nk5d_phN0l%$@D*ayEb0roe|$)3yrAc$+J>7BXR9YMD>RrO3S9LPl8E^7Ak|N8huR#O=+sFsX%IA%2FuxPz(oIL<(4dAAyv9%nU89>MMZbyv^!r!Q|*A z^m0vF!-W(x*}3E{s^z)Qq|epyyW-a3nFCmHwQDq{??Ii0GIKa4~ zfSTLYL*=WYk{P324l5t8$;zC+Qu|e*F9VOKYy8ibS00J_<%>-Z&kD`*63KSE@?vMv ziYbnTt+dL#vh?S-^?PZrLmm4L??V0StK)R<%>4uje}NLK$Xe~0+*dk6LYd;j;}e%| z%C#6Jw8^m-D&S=`GmFhr9sPP%+F@=f9C?UJ>WQ$LFNVvaR!?+f!JlbXP`DUbhBIrp zfMZRQc%OAFdA*&7!dUQ{%FQLM?osITF8py{Y-k9hd!TBF(kdf7&0nb8BV_3l)W7}! zyn3#rj(o(bU3oo~`l`LW^w8vezBv$$$LIBQ_y29-wcrjEm;`a#+0g3bZ3F{}3rh=? z3+np)=SUsp?J6l@MFARdX%V`AktE+(TxAJT*4^|7fwyUd7xiH))y32xFu55%xSe$a zU2#nHPyWAd90@^+C=tvST{q5eS5te{tfJD(;1ROz7Gl^QBq;g!6T0dNGFze_(k~sD6rx{?a8u3v4tUG$%fJ)6s33I>CZEtJ!G2|kw7KwMKmA=VzJbD z{c1-Fg(z1{2-w{q^Vbj)xym9x@DjjE911%|W<>@z*9Oo0@EM<*)6Vt$ydb4BPvFCt zElu_j?$fFU5ya;spD=i)teHLL2Oz;d9)3d0sIO(8t(LIC-82~qNL0!5V7}}ixVeP7 z^(?;D;znRG-JqJXDoR6L_70dZlRx{|?z`wK!{oZ50kg{+6+~(DFAK z^{;w=&R@LexqnL<#=m8~|El=snD*ZrxcX+iKPNl>(ELv^@9!!>N&j;M{0GV(X8Q9) i@;+(*mJq7{FonxXg1u#GfPg-|eQe+6A9$MgqyGa7Jc#%J literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/gh-36122.ods b/pandas/tests/io/data/excel/gh-36122.ods new file mode 100755 index 0000000000000000000000000000000000000000..3dfdaf976da4589b48754606c8136196e68132c4 GIT binary patch literal 8974 zcmdUVby$>Z_wFE|ASu#FmvnbXhvd*8HFVdIBLdRW-Cfe%Fd*Hapn!CDOGy7*@3_v+}#v?jb+>h=73b*LJ!a%zq8}u4WCgF*h*=J2E;LnG8h_ z+4wM{`|k7m84phiF~P$-JnwI?NdLg$uK~|=l!P(Y5*Don%qiST|GZo(!8RXOoxBTr z)*D=z-=@NDNJH5#WmdCE$J@l%kUPygHCanXoJ4qN;gg$A+vs9L%cB?P$H0YbHBy!J z+%@SNzd|!v+ZS$oDdg_zFa$)RP!aQ!6?c4|;Uz>9b)e&RMugiLi9qlyZ$U=oV^FOU zG@zCw@~u!PqHMpfqtCNOd6_P@xNkt7w#?n79)0MTPZq!a0bSV0x`fO1J3w;+hD7WOZ za*mpX+7<)Z>8tS)L&=DxE?UA2KOu<_whI!HQ)m*6kv>R<8{CysvWd3Ytj~b!XQ98tQ5gWN4 z-KfMFwZ%WVLp+N53=#T@ZY;g_MZdwTl~3^cukD;acknd4-ieJ19NwEhrgc9-J}A*E zy@9=7>-RZV1GRT+UH-3Y{rK)t0Xw=`8H4Xvdsllie3}#cw#CzL5l;L;u|$ACbOqZo zNwSU0wDm*r=q?X>ZVbw&v0`ttdf2YF<8x4z-3VgVTiC{gC z0Ma3Lu6!I!A)9dioX(0Hv+}o>C)B%;3(!d&!A@hZzs#fQWPra)@MbjHGNK{svay){ zAU+HhD<_4v0eZG8o;RRd#)8Qnt|P=rb5%146z$9xW_s-0n8&WQb>Z9uog8LlK0GdK z-ELOMK8W1nIl)ISj@&}SJjm${K_4e&~hdi!j}ZbM_RYQyA37xLoDXI^B@;$^W~EbkBGu8jnnIGL8vR9+7>DR#8H* zDep2YrWqQ7jozvjXa)ltEYL95wJ-<_Z%b5`;$E0;$|uuJTV|Z|m5)k)$NwEy*xME7 z0DiiXBr^rGget***xW&nY{L|LCC90QHB$@S@_A0gf*FlH&YS-8FW0mZ(cfw6)fKRW z>A?ZX3q%;kp3L*=m7$3Z)+e`dB>rAvHg!>89IOz_@G?KfKvq;@7la5*RcVpT6hNbm z)Hlm(wr$MY^Q&#fuhJPo&)+j(3MOYrHCR!7_S5ec7TrtY&HlImtMGl3qDCW0wq%nw znMY^PL=I><*dJW!ZG_OO4^sQad+aQgd$M5A^yqHUYOOMbOzrT6MX#Px-qglf7h}<1OdVs4? zWkV@~wVGz|*hg?`78C@J8LiNIWqP!#ob7Df-c~)9EW%)|Fm9EIh3eSWt)T9ZxS|4# zyiP)S-V)GAj`;J`fRk^6G|CCnQO_Epja*!8-y+7yBaKx!fnDOuR5e&0%-gCePgDqAxjCV`{TWeR!Hgq!`bJe9+4;TA127s;i;5EOR}S+-)`%SeYH>> z-+t2ku2bGG3MD3#VJg?F$efex)%dWL^6X4Dgef3lt(qoV4DvmJfls%gw9UgkM}_LW z4z#ZaD}H3Kta8g%CT{1@jW285Rw6DRZPFAgkm;&3e*`5fjX5Ed*&VIE<3VXjuf~}7 zhPyYiP>cdjrcV?f4cdqK4_vmM2OF2u{4a$u&o=|LNQ1r@ogZHd0xyGAlKrZ#5h)ej zb%=>}hdHNGtdW;?qLS|#8)efh%tEgS3h1Fj&Vx)Abs zA2w9f#Th}3BvW4Xkcu^e%|mv64WuQU30GUDw=^pD3i%9U|P z1=7@HB2ST79yOiQEJ;d#PRm){kdbZ@mFshPJVJrNJaTd~tbaaps*x4g8LVI?!;z_n z7W-+uj}=nigBQo4DDeu)sw2@}fRTcf!5$r0{`U}8bv@YTh#=M%n&OFQ>cBNv)tK|SAQ2#7 z`)v%-7R#;m^AI+d$L-Sv;i6?boyKo4yCRC_D1^#3vNGmWiFD{wLn|RD7QHxrKk{^rei^m%GrGwc6N8FkJD2oyk+FAG--1{DjnX$M!Jd`*_|)SKnQ+ zQ$9Axpn+7l1UY}7QG1lrHeWaLOyla5)H=vV+@D z6M{$S=QM|#`QKKnPR)ZqS?8{9<-i}=IqSwYN}eo`QmJ>rP16p)pdwyaCB9aD@%D3a z;w=iy3z}>{p51qVzVM-sL9leqTl#_N-ybTu$KYnSCIBpnfCUV1NjfPO<`GPXUIjLD zKvU-o=OYFAW?)}hOc2K%O16}{C=OyLDQLYlFmSFE=qRb$T|0rPeZQ+yTsKuxCvODh z$U4zWA`*0Jz|4l@48cMm+SPT6|c3{_AD|^-=f!IUk z8!On~_)U-z>uEsxy1sU^As*Meitv=aB1@lJ^P5ghhb7r7T6uN}$0Y^aF+1Gy@$pmO0nDp8jzc}j(M)T;TF&ewOm4&K=2bcco~9*oZ8D02cpk0^ zQ9`?c_3TXe@-;-w4yjTN{_9GR5ZWR|H5xph`A2ozsN3wo3s!T7>)HoWpiYr!soW^+ zD|rup%op_PX(juUQ_rbKJHe1RkiW56w+P$X7}f}*+?8C-f^Si)0;^vlmVX7NCpdSz zJwJbb(|}G?J07pID&va*DI0|~c~_Qh`ZeSlxL9=i*Ez9KN=fJd5dg5z{F%6p zXlvtmAAa8*6rmb2;Au|umNRv`7;{@@N}j#R;u2ZvM4~C~fT`3?EV6itZjv?_A>dZo z{(}YB9I;sIv9*PS4<1ePg5^lx4?8Lr+F)`FvfQ<-ZN`fL{TdX-gK285&>OKmbz zULrI2c5r|`HLI&qf%hf(;B0+_$sQG+vREZ4j^!A2MP(}&Sl-@pxfQ|CmGyirGoRV_ zHEK(g*r-VXHhFO0>0%{5hCVvudFsAPrteFsS?OeVdJYpK$+@kk0bMuvAYvV_Jef9$ z@g5=`y7{$%4?E6T1sWDB`P|Z7vPxo&Dxp$O>UN{?&e)5z9AsGEi7x1q(9X^ix~8xk$j0jLMDBfVHS}U^bkH9tl-A4 z5;b_C8y(artDRh*G4~-W(tlI99zS}Bmz@==j_g`FNGJT5!>KQ$hlcC>s(IZwe{h@xD{Metvw{GS|jE*C9PiIcCF;_Qf;`SHTx+-3+(qtFtc7UD^V z*f7IQ&u6h=#{)#v>Kr_SwFWu3_y|54l&)6LnYbB9^oje`dHllaGJR9Qg1RBjaI997 zyd}dyU{!*Kz#EjQbaFj|ip|iP8SDKv(C~Jawv&`+q{`%_u*E}KDe>xfq4{2-1EWk4 z!^W}6ATD~AS(DdQ(57&tf%$B<5xHUe=&VHa4y^RpY1Uw#a{8XNs+NXnkMQoYr=?iv zKyx)kdH3o#jw`~s>9XF->)BJ!rIO86SC?RT)Rz%CS$OCH-{ELV8q&B|c~<BgY~+|s3pwbB7E4hZ*A9g&gRu+;E!Ju(NnFX-^)DyAhQ}ffm1CchYMyD6KdK6^ za#vbq&=L(Hwe@_PV@mq|#I=d60 zhJS&`M1exN)xn!xkDlb~@rEt9p{8o_zyLFCe%zWirUH*&3r z7!>UpU7S?ZIvo6_j~``A>Xyepje6A8-b#Sl;t{$@CC@5Ts`n|=`x+t;r!=~2dp@Oz zCp!ZxUf9`Jl~mV_<3)BcLrU;Mn58T&#d2Udcg+dNU-QB3^H=W_%W(~5zw4-&)pZBx zj%*VrmaZA>mObv$7S6}7^J%)Y9U1bk^2wQ<3|?W^eoMO}`>r46c<&RT8+f^-p@#xn*+YjX@QL$9nEb_{}ZZf>VT(s(ckQs z^?px1i1eR!#o+bj#L=2!PRN12d3yRFsaGqU!f>Ok%t+K`X$lHQpO@O8p#O@uNtZBn zHkN6=RK&Rwj+Ewz?!DWToxIm2uA*S8!yXk^4cGAWaFI;PP(GTCm&daQ>KuqKH%X7g z6xK&iB1iJm(2@4YB2^kwd`Waxn{Z*~<-zG;&dP>gQ?atI7W*}yV>O&WqA1??eWeL< zgSy$<0~OM30?3w@g?v)o;!tfk@XNNpo-A)1Td4PN+9vYWp5IciGrX?~?KnN*i%)jx z^RQ46HZuTMHdu$~@Jt!INRzsSBq_7+pQpU0-HZ`d2PEpRSP>UG-mvus zdj&8nJ4dvB6k-UHxGgg+Ou`#TZRRfH{$UTS>cH3@xJeC zh`*1|2XR;;@i8jLz}gpm*<;w^p!Ea!eV9k@cqdU1^XcJnS~+)k67Q7N4goCdqwJDa zG24B_JqG6US09_L6rSLr?>WW`tRTxq=di7&8hSiWjKXEO+RwUjq;W6lM?htW~e-0~1#=o;RaNkwjH z${KyOtt3YBettmn!gPWsxwOUA3$Es}K+c&QyR&?gvWIKGgU0L-jr<{<&iY5G*^Q(g z@4;OE#wZM%`9Z~iMFbNYo>%kI2p;CyB2Nl6D8r&$iwpg_+E<_Z`E^-*vKe9>-Hp{$ zQ?$RC)PG4)t*56f-#JxqOO-foQlQ_IyCDIjWm_Tyt{@Op_iHV>Zn_6j;BLPO(u!uF zci>W+Bp(dN1LjijGN(?yBq!2rH+mAFEdTs6`eHP+-+dgXYn_j;ci^F+7#TX(gAS3t zLXuu6{FBa!I9Uctp`=um^M>JwqkyL8A2I1)e0gyxK;R_We!25>m0_PHhBIdpvdj}f zEPPNMMA$W>&O-B@9q5;76^jC*&aed2WwAx(f_V!~%s z_|Iq<>Dbtq*m>Dl`8oJ`xdr(I#QCK(BzU-G1bJjc1;m9!#6(|8NlL0o2nzH zloe$)On`dUnmUF$7H$?AFYUEu?e!FG^pzoIY97|QHl~L9md@r5u3#%;4{Lo-7h7{k zupPwL-8;}TFx<;h&DQ}G_S!ty*FE0H&iAeNJ0JhBu(05;=$NR;gqY}r#Kge3tl*U5 z#MI1)%<{PW`mFe%_i6FDnaM?2@kO~AWo2dWODalB%kwK5tBbR%%ZkctTbi32p=}+F zJtK|19~+04TSk}q+v+CYu3KfAKMI@z}}^J#H*>S%3pX?J^V=V<-(a`*Ur_x$E?XXET> z`|{iR)$!)R@yYSk^~LGo_0`GE&CUHh-`@6X6B6D{@{qKcu!`%{?iU#aWgw>4r^kWM zRovnHo_lA625J%H1Mv;vwN^;Yl%KFK1qZy&d8gC%fCam>7+uKfJ9y5TxI7hht{ z3;bmMx-&Z~hx}P+%-pjx&jQak>NJ%0Dk*s1*_}Ku4Age1amX$_PsNez9*)^MxEDdj zQD3LEDAaOV%t|}&QmZRnN)@?U@HeVm6SPh|Ow(iFfS^@gnA(gCW1ae&t~0qXLf^SE ztCa2En7_D4%Kq|g2uCNe#`cRYR`l?IX;Z}ouR*Cc<9r2n+qMiTWtRZEd>yh5ZA-0qH-4X+&#pT6HMfAP@EXZKp-6bckBFrEyFTwO*Ve2jo zD~Xn~=^#OuIOFl_s%5R#RDCXw9SQM127E?F#UMgQ20C0C(8+77MjFuYdHA0RJgoSd z`dU4eglfA2M0}O1>Qyp<-Q@p5pFmBKP%J5==D^4yf;?n&&G5N0ZC z^)J_tyLt0v+?r?LWt~e=gThdrqCxlM0M@E<@c^ywnh&T>5^&ZYVA`I`s#PBCF_ zPLbtK7@@y&jEXWa4=@3L7Nzgk=coOUsQ-!jc?SS+ujBqSmAmS{i_m{>|0favxR;@S zTG(CnpG^Ntl>Rf>y-57i2JWi=B>PpQ{xi?thX;E%jsN8NyJ-DqroZ<{;5SUair9bV zxqo~8G^gM2+>h)(g7Ne^`F# zxW6jA_aVbiQ@CUKukW}M#eZz#AIM+z!g~eyr-j|U>+k-X@s}R_Pt>p3qx(a`PlMcr zG(VNaf8u@(dhT;eKP?OGZ<(fl0)IWy{C$ArcW(062OJoL@=qhadrkk=ODoF2KfJ%~$=%2J&gn!G-Cz9=7`lA` literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index cc70ba663b45e..33467be42dfd9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -499,15 +499,21 @@ def test_reader_spaces(self, read_ext): ) tm.assert_frame_equal(actual, expected) - def test_read_excel_ods_nested_xml(self, read_ext): + # gh-36122, gh-35802 + @pytest.mark.parametrize( + "basename,expected", + [ + ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})), + ("gh-36122", DataFrame(columns=["got 2nd sa"])), + ], + ) + def test_read_excel_ods_nested_xml(self, read_ext, basename, expected): # see gh-35802 engine = pd.read_excel.keywords["engine"] if engine != "odf": pytest.skip(f"Skipped for engine: {engine}") - basename = "test_nested" actual = pd.read_excel(basename + read_ext) - expected = DataFrame({"COLUMN": ["Test (1)"]}) tm.assert_frame_equal(actual, expected) def test_reading_all_sheets(self, read_ext): From 9f77b562597a5d925478b1009f259d093e4ecdcb Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sun, 6 Sep 2020 18:53:16 -0400 Subject: [PATCH 4/7] flake8 --- pandas/io/excel/_odfreader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 34224d2e3bcbb..a23921af9d974 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -47,7 +47,7 @@ def element_str(self): else: value.append(str(c)) return "".join(value) - + Element.__str__ = element_str @property @@ -211,4 +211,3 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: return result.time() else: raise ValueError(f"Unrecognized type {cell_type}") - From 29b8cda2dbd22b8c23a2211b0964b0f2959ca639 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Tue, 8 Sep 2020 19:23:31 -0400 Subject: [PATCH 5/7] revert monkeypatch and change impl --- pandas/io/excel/_odfreader.py | 46 +++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index a23921af9d974..1a3cc55ede083 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -28,28 +28,8 @@ def __init__( storage_options: StorageOptions = None, ): import_optional_dependency("odf") - self._monkeypatch_odf_element_str() super().__init__(filepath_or_buffer, storage_options=storage_options) - def _monkeypatch_odf_element_str(self): - from odf.element import Element - from odf.namespaces import TEXTNS - from odf.text import S - - text_s = S().qname - - def element_str(self): - value = [] - for c in self.childNodes: - if isinstance(c, Element) and c.qname == text_s: - spaces = int(c.attributes.get((TEXTNS, "c"), 1)) - value.append(" " * spaces) - else: - value.append(str(c)) - return "".join(value) - - Element.__str__ = element_str - @property def _workbook_class(self): from odf.opendocument import OpenDocument @@ -198,7 +178,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) elif cell_type == "string": - return str(cell) + return self._get_cell_string_value(cell) elif cell_type == "currency": cell_value = cell.attributes.get((OFFICENS, "value")) return float(cell_value) @@ -211,3 +191,27 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: return result.time() else: raise ValueError(f"Unrecognized type {cell_type}") + + def _get_cell_string_value(self, cell) -> str: + """ + Find and decode OpenDocument text:s tags that represent + a run length encoded sequence of space characters. + """ + from odf.element import Element + from odf.namespaces import TEXTNS + from odf.text import S + + text_s = S().qname + + value = [] + + for fragment in cell.childNodes: + if isinstance(fragment, Element): + if fragment.qname == text_s: + spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) + value.append(" " * spaces) + else: + value.append(self._get_cell_string_value(fragment)) + else: + value.append(str(fragment)) + return "".join(value) From 21cc458d21ab57c3afbee1b1a06dafa5b9db9357 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Wed, 9 Sep 2020 09:41:15 -0400 Subject: [PATCH 6/7] add comment linking discussion + add whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/excel/_odfreader.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..bde0fe1a930fc 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -296,6 +296,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- Bug in :meth:`read_excel` with `engine="odf"` caused UnboundLocalError in some cases where cells had nested child nodes (:issue:`36122`, and :issue:`35802`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 1a3cc55ede083..09a4bfb9accc3 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -211,6 +211,9 @@ def _get_cell_string_value(self, cell) -> str: spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) value.append(" " * spaces) else: + # recursive impl needed in case of nested fragments + # with multiple spaces + # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704 value.append(self._get_cell_string_value(fragment)) else: value.append(str(fragment)) From 0e6b2e1809f2a96414530dc4004656d175e781ac Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Fri, 11 Sep 2020 16:24:55 -0400 Subject: [PATCH 7/7] whatsnew formatting fix --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bde0fe1a930fc..b4289973a26cd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -296,7 +296,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) -- Bug in :meth:`read_excel` with `engine="odf"` caused UnboundLocalError in some cases where cells had nested child nodes (:issue:`36122`, and :issue:`35802`) +- Bug in :meth:`read_excel` with `engine="odf"` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, and :issue:`35802`) Plotting ^^^^^^^^