Skip to content

Commit 381ef5d

Browse files
committed
BUG: encoding woes on iso8859-2 encoded movielens file, close #795
1 parent 8ec6236 commit 381ef5d

File tree

5 files changed

+56
-5
lines changed

5 files changed

+56
-5
lines changed

pandas/core/common.py

+5
Original file line number | Diff line number | Diff line change
@@ -520,6 +520,11 @@ def _stringify(col):
520520
except UnicodeError:
521521
return console_encode(col)
522522

523+
def _stringify_seq(values):
    """Return a list holding one string per element of ``values``.

    When at least one element is a ``unicode`` instance, every element is
    converted with ``_stringify``; otherwise every element is converted
    with the builtin ``str``.
    """
    has_unicode = any(isinstance(item, unicode) for item in values)
    # Choose the converter once so the whole sequence is handled uniformly.
    convert = _stringify if has_unicode else str
    return [convert(item) for item in values]
527+
523528
def _maybe_make_list(obj):
524529
if obj is not None and not isinstance(obj, (tuple, list)):
525530
return [obj]

pandas/core/format.py

+19-4
Original file line number | Diff line number | Diff line change
@@ -329,7 +329,7 @@ def is_numeric_dtype(dtype):
329329
fmt_columns = zip(*fmt_columns)
330330
dtypes = self.frame.dtypes.values
331331
need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
332-
str_columns = zip(*[[u' %s' % y
332+
str_columns = zip(*[[' ' + y
333333
if y not in self.formatters and need_leadsp[x]
334334
else y for y in x]
335335
for x in fmt_columns])
@@ -341,7 +341,7 @@ def is_numeric_dtype(dtype):
341341
fmt_columns = self.columns.format()
342342
dtypes = self.frame.dtypes
343343
need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
344-
str_columns = [[u' %s' % x
344+
str_columns = [[' ' + x
345345
if col not in self.formatters and need_leadsp[x]
346346
else x]
347347
for col, x in zip(self.columns, fmt_columns)]
@@ -439,6 +439,18 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
439439
self.justify = justify
440440

441441
def get_result(self):
    """Format ``self.values`` and return the fixed-width result.

    The values are formatted by ``_format_strings``, with ``use_unicode``
    set according to ``_have_unicode()``, and the resulting strings are
    handed to ``_make_fixed_width`` together with ``self.justify``.
    """
    # bool() keeps the flag a plain True/False, as in the explicit branches.
    wants_unicode = bool(self._have_unicode())
    formatted = self._format_strings(use_unicode=wants_unicode)
    return _make_fixed_width(formatted, self.justify)
448+
449+
def _have_unicode(self):
    """Report whether any element of ``self.values`` is a ``unicode`` object.

    Returns the result of ``.any()`` on the per-element mask produced by
    ``lib.map_infer``.
    """
    def _is_unicode(obj):
        return isinstance(obj, unicode)

    flags = lib.map_infer(self.values, _is_unicode)
    return flags.any()
452+
453+
def _format_strings(self, use_unicode=False):
442454
if self.float_format is None:
443455
float_format = print_config.float_format
444456
if float_format is None:
@@ -447,7 +459,10 @@ def get_result(self):
447459
else:
448460
float_format = self.float_format
449461

450-
formatter = _stringify if self.formatter is None else self.formatter
462+
if use_unicode:
463+
formatter = _stringify if self.formatter is None else self.formatter
464+
else:
465+
formatter = str if self.formatter is None else self.formatter
451466

452467
def _format(x):
453468
if self.na_rep is not None and lib.checknull(x):
@@ -472,7 +487,7 @@ def _format(x):
472487
else:
473488
fmt_values.append(' %s' % _format(v))
474489

475-
return _make_fixed_width(fmt_values, self.justify)
490+
return fmt_values
476491

477492
class FloatArrayFormatter(GenericArrayFormatter):
478493
"""

pandas/core/index.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -292,7 +292,7 @@ def format(self, name=False):
292292
values = lib.maybe_convert_objects(values, safe=1)
293293

294294
if values.dtype == np.object_:
295-
result = [com._stringify(x) for x in values]
295+
result = com._stringify_seq(values)
296296
else:
297297
result = _trim_front(format_array(values, None, justify='left'))
298298
return header + result

pandas/tests/test_format.py

+13
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from StringIO import StringIO
2+
import os
23
import sys
34
import unittest
45

@@ -9,9 +10,14 @@
910
from pandas import DataFrame, Series, Index
1011
import pandas.core.format as fmt
1112
import pandas.util.testing as tm
13+
import pandas
1214

1315
_frame = DataFrame(tm.getSeriesData())
1416

17+
def curpath():
    """Return the absolute path of the directory containing this module."""
    return os.path.dirname(os.path.abspath(__file__))
20+
1521
class TestDataFrameFormatting(unittest.TestCase):
1622

1723
def setUp(self):
@@ -126,6 +132,13 @@ def test_unicode_problem_decoding_as_ascii(self):
126132
dm = DataFrame({u'c/\u03c3': Series({'test':np.NaN})})
127133
unicode(dm.to_string())
128134

135+
def test_string_repr_encoding(self):
    # Regression check for issue #795: repr() of a frame, and of one of
    # its columns, loaded from the non-ASCII 'unicode_series.csv' fixture
    # must not raise.
    csv_path = os.path.join(curpath(), 'unicode_series.csv')
    frame = pandas.read_csv(csv_path, header=None)
    repr(frame)
    repr(frame['X.2'])
141+
129142
def test_repr_corner(self):
130143
# representing infs poses no problems
131144
df = DataFrame({'foo' : np.inf * np.empty(10)})

pandas/tests/unicode_series.csv

+18
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,18 @@
1+
1617,King of New York (1990)
2+
1618,All Things Fair (1996)
3+
1619,"Sixth Man, The (1997)"
4+
1620,Butterfly Kiss (1995)
5+
1621,"Paris, France (1993)"
6+
1622,"Cérémonie, La (1995)"
7+
1623,Hush (1998)
8+
1624,Nightwatch (1997)
9+
1625,Nobody Loves Me (Keiner liebt mich) (1994)
10+
1626,"Wife, The (1995)"
11+
1627,Lamerica (1994)
12+
1628,Nico Icon (1995)
13+
1629,"Silence of the Palace, The (Saimt el Qusur) (1994)"
14+
1630,"Slingshot, The (1993)"
15+
1631,Land and Freedom (Tierra y libertad) (1995)
16+
1632,Á köldum klaka (Cold Fever) (1994)
17+
1633,Etz Hadomim Tafus (Under the Domin Tree) (1994)
18+
1634,Two Friends (1986)

0 commit comments

Comments
 (0)