From d0597b8475de62ce678c8cbe8466e153b4347932 Mon Sep 17 00:00:00 2001 From: Fabio Zanini Date: Mon, 13 Oct 2014 09:49:21 +0200 Subject: [PATCH 1/2] BUG: "converters" in read_excel with missing data --- doc/source/io.rst | 24 +++++++++++++++++++++ pandas/io/excel.py | 9 +++++++- pandas/io/parsers.py | 9 ++++++-- pandas/io/tests/data/test_converters.xls | Bin 0 -> 6144 bytes pandas/io/tests/data/test_converters.xlsx | Bin 0 -> 4810 bytes pandas/io/tests/test_excel.py | 25 ++++++++++++++++++++++ 6 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 pandas/io/tests/data/test_converters.xls create mode 100644 pandas/io/tests/data/test_converters.xlsx diff --git a/doc/source/io.rst b/doc/source/io.rst index 1d83e06a13567..4655898d67724 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1992,6 +1992,30 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) +.. note:: + + It is possible to transform the contents of Excel cells via the `converters` + option. It accepts a dictionary of functions: the keys are the names or + indices of columns to be transformed, the values are functions that take one + input argument, the Excel cell content, and return the transformed content. + For instance, to convert a column to boolean: + + .. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + + This option handles missing values and treats exceptions in the converters + as missing data. Transformations are applied cell by cell rather than to the + column as a whole, so the array dtype is not guaranteed. For instance, a + column of integers with missing values cannot be transformed to an array + with integer dtype, because NaN is strictly a float. You can manually mask + missing data to recover integer dtype: + + .. 
code-block:: python + + cfun = lambda x: int(x) if x else -1 + read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + To write a DataFrame object to a sheet of an Excel file, you can use the ``to_excel`` instance method. The arguments are largely the same as ``to_csv`` described above, the first argument being the name of the excel file, and the diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 424518cbde4f8..c49744749a143 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -83,6 +83,9 @@ def read_excel(io, sheetname=0, **kwds): Rows to skip at the beginning (0-indexed) skip_footer : int, default 0 Rows at the end to skip (0-indexed) + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels index_col : int, default None Column to use as the row labels of the DataFrame. Pass None if there is no such column @@ -175,7 +178,7 @@ def __init__(self, io, **kwds): def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, - convert_float=True, has_index_names=False, **kwds): + convert_float=True, has_index_names=False, converters=None, **kwds): """Read an Excel table into DataFrame Parameters @@ -188,6 +191,9 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, Rows to skip at the beginning (0-indexed) skip_footer : int, default 0 Rows at the end to skip (0-indexed) + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels index_col : int, default None Column to use as the row labels of the DataFrame. 
Pass None if there is no such column @@ -235,6 +241,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, thousands=thousands, chunksize=chunksize, skip_footer=skip_footer, convert_float=convert_float, + converters=converters, **kwds) def _should_parse(self, i, parse_cols): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8f8e3151d56e6..efb3b7cdcd0d0 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -127,7 +127,7 @@ class ParserWarning(Warning): Return TextFileReader object for iteration skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c') -converters : dict. optional +converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels verbose : boolean, default False @@ -983,8 +983,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, na_fvalues) coerce_type = True if conv_f is not None: - values = lib.map_infer(values, conv_f) + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = lib.ismember(values, na_values).view(np.uin8) + values = lib.map_infer_mask(values, conv_f, mask) coerce_type = False + cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) result[c] = cvals diff --git a/pandas/io/tests/data/test_converters.xls b/pandas/io/tests/data/test_converters.xls new file mode 100644 index 0000000000000000000000000000000000000000..c0aa9d903adadade636e3abc1420a2e563b23527 GIT binary patch literal 6144 zcmeHLU2IfE6h5=vEw?{(w}logP}eH7w1wE<4~9UuEnCG@F>Q&)kVsp$P+7WI*90Hb zMd5)&5(qKAkSIyn`0|VuI2Dwmf zArD-0#>zGHViA1+*L)FR0pu4u^({25g`wu@fi8*RFD^j|%Mj{ec^<2sf2l&mxJ~7w z={_&=k)KUw!Tz)dX7d%>-TvqK&2zi}$hQA^-gEz#0HwfUpbYQ>NC8p~Q~*nWr9dT6 z1uO%W1GfRyzzSd`unMRFZU=(E9RSC3r&E(&1G)~V2i7|LMyH;h=>l~#JWym|6$^dx z>Ef?>9ba!Wq2C-Sap6>PBY>#33*G7>n1R=$h1qZL}gI=a1IVi zGa~mZNTgm;2~a-Msx6e7Piu|0R7RL%9}GVl?rQC<@m9+j?w#76t{vgpP3fwmwX-{1 
zJE1#kM1D^NF@|&*4U*5HEC09r`1EHs`CNSVnoa)y%O8PHgPfjI(td8V{lJjXlIE|? z!EegJZ^*$f%E}kSiH?Vdv&-%jpO>^5{4@WMKk0#TLQ<>)i2|+56NOq=B)nQLNi5QO zX~LJvl-azd^kvE07!uQ0zf}ohoolP3Op0 zly_L4Yo_!swbZ%j`YAfuverePmRgKvH|%gVJh0YeO3v=W|3pi=HnIaP#qxqqTg=g7 zW^5D>v$uWD+Zo^T(DS65;4c*87xMU$$>XYK^6Z<#OkKuFrW`}b&AX=e3i~D@V!s)9 zns)6skNQ3RR)9C!`W*KP%J?-``DGF>l}dY5&p@gQmPB5}s&d4BwPe3}99iQjL>nd9 zAN%#oA4sTv`3WkOs%Q6Cpi&$?d*IA44k)k(FXWb~Rj*3%mlLHZ&6vTx3jX$~&THq_ zTq;>_w%G0Iad4(S)n)ptUs+OKUuyxu-*sr0$_21AwCzdhKFdt7{M1>QW^5qHh}ioxx# z0Nt?1fJKjqjOtT5cxQZMuzxuAY-}JN3?J-|M1v39zpkgZFA|MLbjjNW2L|l4Z|ltY z3GdoQ=d5WfT!$aH2Qv#-ZQ{rqzh2$GH}K90OB(Aw`hlw#Km+g4Y(D@{^B}+oJPa^y zj{{t}Hvul(y8xr(6M*aRCBTLG9^eX00KA`H0eHy$3DAbuncUCU>^nDYvhe=xrfKWp z$iP6hHn*Ic&eY*NIKL1(aY`snpeCJoX7a?@@pDG$p(oxtdj6crnO;_LB3Z5^6w-dbO0LMlI0P1qix%g;aD2@ki=I?*?f5slp=4i&c1}v(_@%3R? z_Tk3O?-2X4iQkY{fz{@EbKj&-r5h?Zjcv|>W~3c<_hT<^#~7Z@M|Fk_?Q=e~%`&4g z$4f2v+nhE7yHSbcker>(3};)mpR;R-+yx;X?kqm(mgg}4{rKg>Z$I1c6=*=$gq%2j z1;xP0UFseCJP-1n|7ysFoWsN?TVbJJ#}LskP;(p4#(y#Y2H211p3v K%c3wJ`+oz*DjZS( literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/test_converters.xlsx b/pandas/io/tests/data/test_converters.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e21bc5fbf9ee23292a431f52ec400cc3ef231c86 GIT binary patch literal 4810 zcmaJ_2{@E**Qc?Jow5^IvS!~2r7(y_GWN(I%P`ill{FDEwnUUYA(6FY$k-Z%k;pRk zEZN4sWcz0NfA5>G?|nOSUC%Smbv@_IoZq?6xqrVKYyco*CIJG0Bp+ag+9W58hVb3i z+Y#m?B}u%NCHLq9WEi9I_hJ)+{cBmSI#i)zu$iEHVdaTt~?z*?X(q}r5mdQFR>JlPdH!?&d;XbS>!^wnRN+tL-Yk1pq(XOTVb2$l08T( zxugfJjnU?}RHy6;x!*-Qj5-A0cek+x96QW!#}sdmIgjl>f<*7!e^ODrs1xn~{njZR zmEI|{yhMo|5gjSH2BzyhS4~WFIE~)Jr2O}SjesHpR&OR^yq>+iSrwn&Zs|FcRKyfJ4FF za8Dnp+n%110q(G8Sar7+X-0|(R1-SpRqf4=bVi4$v@fDqdXFDY%s;YTn)8C@CJy#X zB;+&Hc<^hhV{4=KMN_mg9C%f4TbUbSCbzi!mp)n@TYG?#z^W3$_#6Q?kc05buG=Ol zKPO0=)q3G0l?+VOaDfc+LaaklI3EeI=b29W=_OD7yy2;Ya)AO=B8t4(1?#vAz?8pL zpAI!%?)Pun9JAHhCc~?D7AvumO>e5MMl%ZK3(iirjdAlg95`1OFO-xGl%l*gWbeX; zbKPSzqxk9$$)c)@GaO4hhOX6H`vfg|1sx{so+1KNY2E{xCZz97`Zs`>P67mldpi;l 
zib}S1iaaI_ev~r{dCvMzx%UECJ;&`t7pF zxHiu@W?)5!Rf;&;Axk-viVJ5;*gg%&Jqe8P60aw2f_ z=@Lh$x94vAxdCg_{Uake@FYU6vr(Y|(p6@)gSXTd;W0Wh4Iih3zZh%6OeeOg@Wpp? z6q;eA!o2FkBe@&Rb=q?qxl6NoMfZPI1JD$_b)8U&)W0rQPC_*TU{e3IvOnD0%^nVS zBUa>}woWo6EL_?L_(K+*oi;s6XuL8i{&M-@E@`{*6t-0{d7$PvM3T#j+Hnb+g8jD9 z2-;eZf3y(OdWDvvB#$hrxeWnCg&(dgPjW!I^2R2d&m${%PnR~F84P37ld`PZP_~e? zM<=S;6kzm?ioap%BbSY(KCSEZWs!EVn3P!osy)QP?|sn670_>V^tSoI+0q~BsJgs) zlH_`CAnhl_rDQ2L-S=kHXEgcG4hnRZM`qtp^rz~#b>hJX(ZH!j%D^E@2Z9^xbs5@a zybuZ`(A@B2lMTYqwWPF@&dpkOq_j-+B6oFKz>O8op6f&&{{q&;zyH!AngVh=Gb9wXtG7Xm@P+p9;2 z;foPWkK|iF-{*NP;t5nTx@xF)eoeZ>==y-Zrv1ap8LDChUTcG|tcydPH)ZH$`CT!H zaX*II)~j|w&-K)+dF2XwIFO+#%6iQ%0etG-2uNg==6Y$-XRog8j1B50SDW6^(;iPe z>+bJ|R=9_)(h=Kpvi8}%8bVSUe{E0pf6^u{1s`WeM_(VQUpEL`dYd`g)yq{)_P z4<$&7Mu^+hM0p3_@GnkSDf5L3A!1BBx`*gt`-^axFrT6FvU)gUWyf^JUgMiJrQ^-2 ziRL7+t^)I%0Os7V*qf(?=xH_&1ATnrPPtc{OE$O z8`}9^`m2i(%`4r?f>RLLVrrOO34N|4OIxeESb%auo`t!cP$JT^=;%6ahU0}ow+sk5 zOFN};DikoYnDv-bVotIpg<7_q&eZmNc%|HDg(!u{9dS${_1D`?mok4wqGuXfDeLoR zgMT2N-q|C2;3V)W+HY_sOZubx9yN0p>%7n1y5|-QbE|Cm!lN9N!a^|@)kR&LUAZwD z4mm!7!z9(Q(8KQC_x9hDsHh?NCBwT#7cWoRhFzLB^`C^7IJkkA60iI~%FDTf!W#0q zlTE&i)|Jsl3%>tmFBQlYhNU&b4W8z{{^$@0W(5rzc>+ZZGVQfh>UND2Eyp#_9l>HF zltklfVpS^GbdB#Eb-ClMBzlm$rjUg4pnjfmRI1c~!?aFqIepX&s&!jrz%+*>atm}% zfx*jvrHA9%H!c@|e}`d@;z~s`$IA?ou_o{*G3K-!C*A3qEP(fNhy*3~li^Bh zxokPD`FE4#M=rgO&ale{4=r^C)yP|Xh)>5!B9qrTEcX4#_qiq4IJp|K!~>~QX%ZpB zAJI>Ix)MPMaYr)8V zEc9Rus=k?C&BHh0(Tpso4#9~USVtg2o1yYXLAxlEgYnw~V^CbPCMrJE+w>rXwEosK zB*j)M^D7q3ll;Rp=^7=-LkI<-x0&j7?{QH$JF#pNz2%GT{O&ED;&$3I+xpVZn{h+D z{2SD|T{3C{-Gf0p@$r$Hj)fEzW_YQzUW0M`I7TQ~AkA@i(8ANbD zcW1mgcjw46WNM!kd#)=#)1YmvKsvqr(h|5o(DKq7jx>5USH2CFQb1%Yjw;-Fldrhu zvRrswU+$;@VDtU$mFg%z(A2J?{xAczEY-ry)E5h7)AJWUoi|n6);Sy*NFw1d86J#2 zvNBz=5&W(_fc!aBTC{8ucZv&9YOS4<-Rfi1#tLqk1k+xEmKW+5$D?VlUMl>NGLuzO z6Z)onaLhA{(iuPiMKgezwx$Ntko1%rekw*o}k^aI|1`H)jzzLyA5~ znba{kab9FD`3^1sNJ!QQT<^d09lHM=lfHp4$6xbtIlb3tM4I^leup_t3ZayoR{H@I 
zK`qcHc%PBk9}90(eNyam!29FNCr;FQ0c2&*{of?9r3cYB1vP75)v1#>y;;Oo+vM8)<)BDDRt_d z$5-ezR|5}D)qD3vQ8K|ZZj~WHW>=nI`f9p56UHLigZrxsN+M$;UTb-r)4$+(J}Zsy ztc$*0r*{KW!1=f6B(b_6QA(Z?z~y%Gz^~47;i69F^MML*3T=LYp}D9_lZExO)E2SZ zo9DnXefB0-(NnT-8inO9501u17rZO`bW>iDWTy9Qf?Ep5!!0bNpz4~U*v#ui`N3MF z$30zIH8Y691;FNzt zoUTUVx24BbJCL>N?NTX4-6J7K*q>HveLIf(bB_DL``?#M-EmU2Y>&H&Z~2!Y*}F>N zr>w-Jvm@^YxM;GQU-jP~JknJTX%(et}Z00 z&ff_Pf`RpK?ZNPSJN*~4dUV4OJ|azv!kYo>m1nksqgKyI6s4Q=Gv4D*^gd+1O zP$N5a)@2R=Nj@(3mesPOsHcNW%ED2k=!MFQ+4@Jhz}CmO2Px1g%v9ph6f~U!uT|X? zDeTFhN->vO@K)0gn(MH2oEYr|MuLk(_ts%3s>tMqyl?(He74zTxoKdEz4qT;QF~NwvP941--u&2(V36Q=wg0&RT|>z}aR{laATBKDw?vv55S{?_M6 zxE0$hWTgh-HcJZ+fX`-l<=uY&bPi4>@Xa>*(QYdo$3}(Z}W&LrQHnXpv@a z$M1{lzy9)K;~f7og7M8*poz8jZK?c zQo%(Vi5J?UvYx|Sivu+Q78>8{>@_lPesU-2d^=z)@sz#gS!w~-JPV6P*z~i<9k;ZP z+G=R^>s;y9>o33ywk0Ft0#|vW0#H$64PtIe7=y< zkB68?a!t;A38|@2It4{63mb4x2@`<2#0#qQN?I^N@AY@O+97wIpC_k%C2zO>AuX&I z?FCzAGK$9EF=j^&o`+ literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 6d3f0b5475298..4f97cef3d46d3 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -399,6 +399,31 @@ def test_reader_special_dtypes(self): convert_float=False) tm.assert_frame_equal(actual, no_convert_float) + # GH8212 - support for converters and missing values + def test_reader_converters(self): + _skip_if_no_xlrd() + + expected = DataFrame.from_items([ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), + ("StrCol", ['1', np.nan, '3', '4', '5']), + ]) + + converters = {'IntCol': lambda x: int(x) if x != '' else -1000, + 'FloatCol': lambda x: 10 * x if x else np.nan, + 2: lambda x: 'Found' if x != '' else 'Not found', + 3: lambda x: str(x) if x else '', + } + + xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx') + xls_path = os.path.join(self.dirpath, 'test_converters.xls') + + # should read in correctly and 
set types of single cells (not array dtypes) + for path in (xls_path, xlsx_path): + actual = read_excel(path, 'Sheet1', converters=converters) + tm.assert_frame_equal(actual, expected) + def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. _skip_if_no_xlrd() From 89d48716906dd43c6e5aab919e513f6aec0248a5 Mon Sep 17 00:00:00 2001 From: Fabio Zanini Date: Fri, 14 Nov 2014 14:37:36 +0100 Subject: [PATCH 2/2] docs fix (?) --- doc/source/io.rst | 5 +---- pandas/io/excel.py | 4 +++- pandas/io/parsers.py | 5 +++++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4655898d67724..00e86d971182d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1995,10 +1995,7 @@ indices to be parsed. .. note:: It is possible to transform the contents of Excel cells via the `converters` - option. It accepts a dictionary of functions: the keys are the names or - indices of columns to be transformed, the values are functions that take one - input argument, the Excel cell content, and return the transformed content. - For instance, to convert a column to boolean: + option. For instance, to convert a column to boolean: .. code-block:: python diff --git a/pandas/io/excel.py b/pandas/io/excel.py index c49744749a143..2ece91b5dea11 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -85,7 +85,9 @@ def read_excel(io, sheetname=0, **kwds): Rows at the end to skip (0-indexed) converters : dict, default None Dict of functions for converting values in certain columns. Keys can - either be integers or column labels + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. index_col : int, default None Column to use as the row labels of the DataFrame. 
Pass None if there is no such column diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index efb3b7cdcd0d0..b23aa017138e1 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1274,6 +1274,11 @@ def TextParser(*args, **kwds): Row numbers to skip skip_footer : int Number of line at bottom of file to skip + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. encoding : string, default None Encoding to use for UTF when reading/writing (ex. 'utf-8') squeeze : boolean, default False