Skip to content

Commit ae70acc

Browse files
committed
ENH: handling of UTF-8 strings in DataFrame columns, close #1620
1 parent 22e3802 commit ae70acc

File tree

3 files changed

+27
-3
lines changed

3 files changed

+27
-3
lines changed

RELEASE.rst

+1
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,7 @@ pandas 0.8.1
102102
- Handle None values in dict passed to concat (#1649)
103103
- Fix Series.interpolate with method='values' and DatetimeIndex (#1646)
104104
- Fix IndexError in left merges on a DataFrame with 0-length (#1628)
105+
- Fix DataFrame column width display with UTF-8 encoded characters (#1620)
105106

106107
pandas 0.8.0
107108
============

pandas/core/format.py

+14-3
Original file line number | Diff line number | Diff line change
@@ -136,6 +136,12 @@ def to_string(self):
136136
def _encode_diff(x):
137137
return len(x) - len(x.decode('utf-8'))
138138

139+
def _strlen(x):
140+
try:
141+
return len(x.decode('utf-8'))
142+
except UnicodeError:
143+
return len(x)
144+
139145
class DataFrameFormatter(object):
140146
"""
141147
Render a DataFrame
@@ -204,7 +210,7 @@ def to_string(self, force_unicode=False):
204210
if self.header:
205211
fmt_values = self._format_col(i)
206212
cheader = str_columns[i]
207-
max_len = max(max(len(x) for x in fmt_values),
213+
max_len = max(max(_strlen(x) for x in fmt_values),
208214
max(len(x) for x in cheader))
209215
if self.justify == 'left':
210216
cheader = [x.ljust(max_len) for x in cheader]
@@ -606,7 +612,7 @@ def _make_fixed_width(strings, justify='right'):
606612
if len(strings) == 0:
607613
return strings
608614

609-
max_len = max(len(x) for x in strings)
615+
max_len = max(_strlen(x) for x in strings)
610616
conf_max = print_config.max_colwidth
611617
if conf_max is not None and max_len > conf_max:
612618
max_len = conf_max
@@ -617,7 +623,12 @@ def _make_fixed_width(strings, justify='right'):
617623
justfunc = lambda self, x: self.rjust(x)
618624

619625
def just(x):
620-
return justfunc(x[:max_len], max_len)
626+
try:
627+
eff_len = max_len + _encode_diff(x)
628+
except UnicodeError:
629+
eff_len = max_len
630+
631+
return justfunc(x[:eff_len], eff_len)
621632

622633
return [just(x) for x in strings]
623634

pandas/tests/test_frame.py

+12
Original file line number | Diff line number | Diff line change
@@ -2574,6 +2574,18 @@ def test_repr(self):
25742574

25752575
fmt.reset_printoptions()
25762576

2577+
def test_repr_unicode(self):
2578+
df = DataFrame({'A': ['\xc3\xa4\xc3\xa4\xc3\xa4\xc3\xa4',
2579+
'\xc3\xbc\xc3\xbc\xc3\xbc\xc3\xbc']})
2580+
2581+
result = repr(df)
2582+
ex_top = ' A'
2583+
self.assertEqual(result.split('\n')[0].rstrip(), ex_top)
2584+
2585+
df = DataFrame({'A': [u'\xe4\xe4\xe4\xe4', u'\xfc\xfc\xfc\xfc']})
2586+
result = repr(df)
2587+
self.assertEqual(result.split('\n')[0].rstrip(), ex_top)
2588+
25772589
def test_very_wide_info_repr(self):
25782590
df = DataFrame(np.random.randn(10, 20),
25792591
columns=[tm.rands(10) for _ in xrange(20)])

0 commit comments

Comments
 (0)