Skip to content

Commit 3a832df

Browse files
nbonnotte authored and jreback committed
ENH in .to_latex() support for utf-8 encoding in Python 2, #7061
1 parent 2213e18 commit 3a832df

File tree

4 files changed

+153
-95
lines changed

4 files changed

+153
-95
lines changed

doc/source/whatsnew/v0.18.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,9 @@ Other API Changes
303303

304304
- ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`)
305305

306+
- ``DataFrame.to_latex()`` now supports non-ascii encodings (e.g. utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`)
307+
308+
306309
Changes to eval
307310
^^^^^^^^^^^^^^^
308311

pandas/core/format.py

+125-92
Original file line numberDiff line numberDiff line change
@@ -619,105 +619,20 @@ def _join_multiline(self, *strcols):
619619
st = ed
620620
return '\n\n'.join(str_lst)
621621

622-
def to_latex(self, column_format=None, longtable=False):
622+
def to_latex(self, column_format=None, longtable=False, encoding=None):
    """
    Write the frame to ``self.buf`` as a LaTeX tabular/longtable environment.

    The rendering itself is delegated to ``LatexFormatter``; this method
    only decides where the output goes.

    Parameters
    ----------
    column_format : str, default None
        Forwarded unchanged to ``LatexFormatter``.
    longtable : boolean, default False
        Forwarded unchanged to ``LatexFormatter``.
    encoding : str, default None
        Passed to ``codecs.open`` when ``self.buf`` is a file name; not
        used when ``self.buf`` is already a writable object.

    Raises
    ------
    TypeError
        If ``self.buf`` is neither a file name nor an object exposing a
        ``write`` method.
    """
    renderer = LatexFormatter(self, column_format=column_format,
                              longtable=longtable)
    target = self.buf

    # Anything file-like is written to directly.
    if hasattr(target, 'write'):
        renderer.write_result(target)
        return

    if not isinstance(target, compat.string_types):
        raise TypeError('buf is not a file name and it has no write '
                        'method')

    # ``target`` is a path: open it ourselves with the requested encoding.
    import codecs
    with codecs.open(target, 'w', encoding=encoding) as handle:
        renderer.write_result(handle)
@@ -851,6 +766,124 @@ def _get_column_name_list(self):
851766
return names
852767

853768

769+
class LatexFormatter(TableFormatter):
    """ Used to render a DataFrame to a LaTeX tabular/longtable environment
    output.

    Parameters
    ----------
    formatter : `DataFrameFormatter`
    column_format : str, default None
        The columns format as specified in `LaTeX table format
        <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
        columns
    longtable : boolean, default False
        Use a longtable environment instead of tabular.

    See also
    --------
    HTMLFormatter
    """

    # (character, replacement) pairs applied in this exact order.
    # Backslashes go first so the backslashes introduced by the later
    # replacements are not escaped again.  The word-like control sequences
    # end with a space so that a following letter is not absorbed into the
    # command name (``a\b`` must not become ``a\textbackslashb``).
    _ESCAPES = (
        ('\\', '\\textbackslash '),
        ('_', '\\_'),
        ('%', '\\%'),
        ('$', '\\$'),
        ('#', '\\#'),
        ('{', '\\{'),
        ('}', '\\}'),
        ('~', '\\textasciitilde '),
        ('^', '\\textasciicircum '),
        ('&', '\\&'),
    )

    def __init__(self, formatter, column_format=None, longtable=False):
        self.fmt = formatter
        self.frame = self.fmt.frame
        self.column_format = column_format
        self.longtable = longtable

    def write_result(self, buf):
        """
        Render a DataFrame to a LaTeX tabular/longtable environment output.

        Parameters
        ----------
        buf : object with a ``write`` method
            Receives the generated LaTeX source piece by piece.

        Raises
        ------
        AssertionError
            If ``column_format`` was given but is not a string.
        """
        strcols = self._get_strcols()
        column_format = self._get_column_format()

        if not self.longtable:
            buf.write('\\begin{tabular}{%s}\n' % column_format)
            buf.write('\\toprule\n')
        else:
            buf.write('\\begin{longtable}{%s}\n' % column_format)
            buf.write('\\toprule\n')

        # Number of header rows: one per column level, plus one more row
        # when the index carries names.
        nlevels = self.frame.columns.nlevels
        if any(self.frame.index.names):
            nlevels += 1

        # Loop-invariant, so look it up once instead of once per row.
        escape = self.fmt.kwds.get('escape', True)

        for i, row in enumerate(zip(*strcols)):
            if i == nlevels and self.fmt.header:
                buf.write('\\midrule\n')  # End of header
                if self.longtable:
                    buf.write('\\endhead\n')
                    buf.write('\\midrule\n')
                    # Span the full table width instead of assuming
                    # exactly three columns.
                    buf.write('\\multicolumn{%d}{r}{{Continued on next '
                              'page}} \\\\\n' % len(row))
                    buf.write('\\midrule\n')
                    buf.write('\\endfoot\n\n')
                    buf.write('\\bottomrule\n')
                    buf.write('\\endlastfoot\n')
            if escape:
                crow = [self._escape(x) if x else '{}' for x in row]
            else:
                crow = [x if x else '{}' for x in row]
            buf.write(' & '.join(crow))
            buf.write(' \\\\\n')

        if not self.longtable:
            buf.write('\\bottomrule\n')
            buf.write('\\end{tabular}\n')
        else:
            buf.write('\\end{longtable}\n')

    def _get_strcols(self):
        """Return the table as a list of columns, each a list of strings."""
        frame = self.frame
        if len(frame.columns) == 0 or len(frame.index) == 0:
            info_line = (u('Empty %s\nColumns: %s\nIndex: %s')
                         % (type(frame).__name__,
                            frame.columns, frame.index))
            strcols = [[info_line]]
        else:
            strcols = self.fmt._to_str_columns()

        if self.fmt.index and isinstance(frame.index, MultiIndex):
            # Replace the single combined index column with one column per
            # index level, blanking out repeated labels within a run.
            clevels = frame.columns.nlevels
            strcols.pop(0)
            name = any(frame.index.names)
            for i, lev in enumerate(frame.index.levels):
                lev2 = lev.format()
                blank = ' ' * len(lev2[0])
                lev3 = [blank] * clevels
                if name:
                    lev3.append(lev.name)
                for level_idx, group in itertools.groupby(
                        frame.index.labels[i]):
                    count = len(list(group))
                    lev3.extend([lev2[level_idx]] + [blank] * (count - 1))
                strcols.insert(i, lev3)
        return strcols

    def _get_column_format(self):
        """Return the column spec, deriving it from dtypes when unset."""
        column_format = self.column_format
        if column_format is None:
            dtypes = self.frame.dtypes._values
            column_format = ''.join(map(self._get_col_type, dtypes))
            if self.fmt.index:
                index_format = 'l' * self.frame.index.nlevels
                column_format = index_format + column_format
        elif not isinstance(column_format,
                            compat.string_types):  # pragma: no cover
            raise AssertionError('column_format must be str or unicode, not %s'
                                 % type(column_format))
        return column_format

    @staticmethod
    def _get_col_type(dtype):
        """Right-align numeric dtypes, left-align everything else."""
        if issubclass(dtype.type, np.number):
            return 'r'
        else:
            return 'l'

    @classmethod
    def _escape(cls, text):
        """Escape the LaTeX special characters in a single cell."""
        for char, replacement in cls._ESCAPES:
            text = text.replace(char, replacement)
        return text
885+
886+
854887
class HTMLFormatter(TableFormatter):
855888

856889
indent_delta = 2

pandas/core/frame.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1547,7 +1547,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
15471547
header=True, index=True, na_rep='NaN', formatters=None,
15481548
float_format=None, sparsify=None, index_names=True,
15491549
bold_rows=True, column_format=None,
1550-
longtable=None, escape=None):
1550+
longtable=None, escape=None, encoding=None):
15511551
"""
15521552
Render a DataFrame to a tabular environment table. You can splice
15531553
this into a LaTeX document. Requires \\usepackage{booktabs}.
@@ -1567,7 +1567,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
15671567
default: True
15681568
When set to False prevents from escaping latex special
15691569
characters in column names.
1570-
1570+
encoding : str, default None
1571+
Default encoding is ascii in Python 2 and utf-8 in Python 3
15711572
"""
15721573

15731574
if colSpace is not None: # pragma: no cover
@@ -1589,7 +1590,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
15891590
sparsify=sparsify,
15901591
index_names=index_names,
15911592
escape=escape)
1592-
formatter.to_latex(column_format=column_format, longtable=longtable)
1593+
formatter.to_latex(column_format=column_format, longtable=longtable,
1594+
encoding=encoding)
15931595

15941596
if buf is None:
15951597
return formatter.buf.getvalue()

pandas/tests/test_format.py

+20
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
from numpy.random import randn
1616
import numpy as np
1717

18+
import codecs
19+
1820
div_style = ''
1921
try:
2022
import IPython
@@ -2554,6 +2556,24 @@ def test_to_latex_filename(self):
25542556
with open(path, 'r') as f:
25552557
self.assertEqual(self.frame.to_latex(), f.read())
25562558

2559+
# test with utf-8 and encoding option (GH 7061)
2560+
df = DataFrame([[u'au\xdfgangen']])
2561+
with tm.ensure_clean('test.tex') as path:
2562+
df.to_latex(path, encoding='utf-8')
2563+
with codecs.open(path, 'r', encoding='utf-8') as f:
2564+
self.assertEqual(df.to_latex(), f.read())
2565+
2566+
# test with utf-8 without encoding option
2567+
if compat.PY3: # python3 default encoding is utf-8
2568+
with tm.ensure_clean('test.tex') as path:
2569+
df.to_latex(path)
2570+
with codecs.open(path, 'r') as f:
2571+
self.assertEqual(df.to_latex(), f.read())
2572+
else:
2573+
# python2 default encoding is ascii, so an error should be raised
2574+
with tm.ensure_clean('test.tex') as path:
2575+
self.assertRaises(UnicodeEncodeError, df.to_latex, path)
2576+
25572577
def test_to_latex(self):
25582578
# it works!
25592579
self.frame.to_latex()

0 commit comments

Comments
 (0)