BUG: fix repr encoding errors on ISO-8859-2 encoded MovieLens file, closes #795

wesm · wesm · commit 381ef5d48f76 · 2012-03-13T16:16:17.000-04:00
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -520,6 +520,11 @@ def _stringify(col):
     except UnicodeError:
         return console_encode(col)
 
+def _stringify_seq(values):
+    # _stringify every item if any item is unicode; otherwise plain str().
+    has_unicode = any(isinstance(item, unicode) for item in values)
+    return [(_stringify if has_unicode else str)(item) for item in values]
+
 def _maybe_make_list(obj):
     if obj is not None and not isinstance(obj, (tuple, list)):
         return [obj]
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -329,7 +329,7 @@ def is_numeric_dtype(dtype):
             fmt_columns = zip(*fmt_columns)
             dtypes = self.frame.dtypes.values
             need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
-            str_columns = zip(*[[u' %s' % y
+            str_columns = zip(*[[' ' + y
                                 if y not in self.formatters and need_leadsp[x]
                                 else y for y in x]
                                for x in fmt_columns])
@@ -341,7 +341,7 @@ def is_numeric_dtype(dtype):
             fmt_columns = self.columns.format()
             dtypes = self.frame.dtypes
             need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes)))
-            str_columns = [[u' %s' % x
+            str_columns = [[' ' + x
                             if col not in self.formatters and need_leadsp[x]
                             else x]
                            for col, x in zip(self.columns, fmt_columns)]
@@ -439,6 +439,18 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN',
         self.justify = justify
 
     def get_result(self):
+        # Take the unicode formatting path only when the values contain
+        # unicode objects, then pad the formatted strings to a fixed width.
+        use_unicode = bool(self._have_unicode())
+        fmt_values = self._format_strings(use_unicode=use_unicode)
+
+        return _make_fixed_width(fmt_values, self.justify)
+
+    def _have_unicode(self):
+        flags = lib.map_infer(self.values, lambda v: isinstance(v, unicode))
+        return flags.any()  # any element flagged as a unicode instance?
+
+    def _format_strings(self, use_unicode=False):
         if self.float_format is None:
             float_format = print_config.float_format
             if float_format is None:
@@ -447,7 +459,10 @@ def get_result(self):
         else:
             float_format = self.float_format
 
-        formatter = _stringify if self.formatter is None else self.formatter
+        if use_unicode:
+            formatter = _stringify if self.formatter is None else self.formatter
+        else:
+            formatter = str if self.formatter is None else self.formatter
 
         def _format(x):
             if self.na_rep is not None and lib.checknull(x):
@@ -472,7 +487,7 @@ def _format(x):
             else:
                 fmt_values.append(' %s' % _format(v))
 
-        return _make_fixed_width(fmt_values, self.justify)
+        return fmt_values
 
 class FloatArrayFormatter(GenericArrayFormatter):
     """
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -292,7 +292,7 @@ def format(self, name=False):
             values = lib.maybe_convert_objects(values, safe=1)
 
         if values.dtype == np.object_:
-            result = [com._stringify(x) for x in values]
+            result = com._stringify_seq(values)
         else:
             result = _trim_front(format_array(values, None, justify='left'))
         return header + result
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
@@ -1,4 +1,5 @@
 from StringIO import StringIO
+import os
 import sys
 import unittest
 
@@ -9,9 +10,14 @@
 from pandas import DataFrame, Series, Index
 import pandas.core.format as fmt
 import pandas.util.testing as tm
+import pandas
 
 _frame = DataFrame(tm.getSeriesData())
 
+def curpath():
+    # Absolute path of the directory that contains this test module.
+    return os.path.dirname(os.path.abspath(__file__))
+
 class TestDataFrameFormatting(unittest.TestCase):
 
     def setUp(self):
@@ -126,6 +132,13 @@ def test_unicode_problem_decoding_as_ascii(self):
         dm = DataFrame({u'c/\u03c3': Series({'test':np.NaN})})
         unicode(dm.to_string())
 
+    def test_string_repr_encoding(self):
+        # Smoke test: repr of data read from a non-ASCII CSV must not raise.
+        csv_path = os.path.join(curpath(), 'unicode_series.csv')
+        frame = pandas.read_csv(csv_path, header=None)
+        repr(frame)
+        repr(frame['X.2'])
+
     def test_repr_corner(self):
         # representing infs poses no problems
         df = DataFrame({'foo' : np.inf * np.empty(10)})
diff --git a/pandas/tests/unicode_series.csv b/pandas/tests/unicode_series.csv
@@ -0,0 +1,18 @@
+1617,King of New York (1990)
+1618,All Things Fair (1996)
+1619,"Sixth Man, The (1997)"
+1620,Butterfly Kiss (1995)
+1621,"Paris, France (1993)"
+1622,"C�r�monie, La (1995)"
+1623,Hush (1998)
+1624,Nightwatch (1997)
+1625,Nobody Loves Me (Keiner liebt mich) (1994)
+1626,"Wife, The (1995)"
+1627,Lamerica (1994)
+1628,Nico Icon (1995)
+1629,"Silence of the Palace, The (Saimt el Qusur) (1994)"
+1630,"Slingshot, The (1993)"
+1631,Land and Freedom (Tierra y libertad) (1995)
+1632,� k�ldum klaka (Cold Fever) (1994)
+1633,Etz Hadomim Tafus (Under the Domin Tree) (1994)
+1634,Two Friends (1986)