pandas-dev · jreback · Oct 3, 2015 · Aug 9, 2014 · jreback · Sep 27, 2015
diff --git a/doc/source/options.rst b/doc/source/options.rst
@@ -440,3 +440,56 @@ For instance:
    pd.reset_option('^display\.')
 
 To round floats on a case-by-case basis, you can also use :meth:`~pandas.Series.round` and :meth:`~pandas.DataFrame.round`.
+
+.. _options.east_asian_width:
+
+Unicode Formatting
+------------------
+
+.. warning::
+
+   Enabling this option will affect the performance for printing of DataFrame and Series (about 2 times slower).
+   Use only when it is actually required.
+
+Some East Asian countries use Unicode characters whose width corresponds to two Latin characters.
+If a DataFrame or Series contains these characters, the default output cannot be aligned properly.
+
+.. ipython:: python
+
+   df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']})
+   df
+
+Enabling ``display.unicode.east_asian_width`` allows pandas to check each character's "East Asian Width" property.
+These characters can then be aligned properly, but checking this property takes longer than the standard ``len`` function.
+
+.. ipython:: python
+
+   pd.set_option('display.unicode.east_asian_width', True)
+   df
+
+In addition, Unicode contains characters whose width is "Ambiguous". The width of these characters is either 1 or 2 depending on the terminal setting or encoding. Because this cannot be determined from within Python, the ``display.unicode.ambiguous_as_wide`` option is provided to handle it.
+
+By default, the width of an "Ambiguous" character, such as "¡" (inverted exclamation) in the example below, is regarded as 1.
+
+.. note::
+
+   This should be aligned properly in a terminal which uses a monospaced font.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': ['xxx', u'¡¡'], 'b': ['yyy', u'¡¡']})
+   df
+
+Enabling ``display.unicode.ambiguous_as_wide`` makes pandas regard the width of these characters as 2. Note that this option is effective only when ``display.unicode.east_asian_width`` is enabled. Observe that the starting position of each value has changed, but the output is still not aligned properly because the setting does not match this environment.
+
+.. ipython:: python
+
+   pd.set_option('display.unicode.ambiguous_as_wide', True)
+   df
+
+.. ipython:: python
+   :suppress:
+
+   pd.set_option('display.unicode.east_asian_width', False)
+   pd.set_option('display.unicode.ambiguous_as_wide', False)
+
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -49,6 +49,7 @@ Highlights include:
 - Support for reading SAS xport files, see :ref:`here <whatsnew_0170.enhancements.sas_xport>`
 - Documentation comparing SAS to *pandas*, see :ref:`here <compare_with_sas>`
 - Removal of the automatic TimeSeries broadcasting, deprecated since 0.8.0, see :ref:`here <whatsnew_0170.prior_deprecations>`
+- Display format with plain text can optionally align with Unicode East Asian Width, see :ref:`here <whatsnew_0170.east_asian_width>`
 - Compatibility with Python 3.5 (:issue:`11097`)
 - Compatibility with matplotlib 1.5.0 (:issue:`11111`)
 
@@ -334,6 +335,36 @@ Google BigQuery Enhancements
 - The ``generate_bq_schema()`` function is now deprecated and will be removed in a future version (:issue:`11121`)
 - Update the gbq module to support Python 3 (:issue:`11094`).
 
+.. _whatsnew_0170.east_asian_width:
+
+Display Alignment with Unicode East Asian Width
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. warning::
+
+   Enabling this option will affect the performance for printing of DataFrame and Series (about 2 times slower).
+   Use only when it is actually required.
+
+Some East Asian countries use Unicode characters whose width corresponds to two Latin characters. If a DataFrame or Series contains these characters, the default output cannot be aligned properly. The following options are added to enable precise handling of these characters.
+
+- ``display.unicode.east_asian_width``: Whether to use the Unicode East Asian Width to calculate the display text width. (:issue:`2612`)
+- ``display.unicode.ambiguous_as_wide``: Whether to treat Unicode characters belonging to the Ambiguous category as Wide. (:issue:`11102`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']})
+   df
+
+   pd.set_option('display.unicode.east_asian_width', True)
+   df
+
+For further details, see :ref:`here <options.east_asian_width>`
+
+.. ipython:: python
+   :suppress:
+
+   pd.set_option('display.unicode.east_asian_width', False)
+
 .. _whatsnew_0170.enhancements.other:
 
 Other enhancements

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -35,6 +35,7 @@
 from itertools import product
 import sys
 import types
+from unicodedata import east_asian_width
 
 PY2 = sys.version_info[0] == 2
 PY3 = (sys.version_info[0] >= 3)
@@ -90,6 +91,7 @@ def lmap(*args, **kwargs):
 
     def lfilter(*args, **kwargs):
         return list(filter(*args, **kwargs))
+
 else:
     # Python 2
     import re
@@ -176,6 +178,11 @@ class to receive bound method
 # The license for this library can be found in LICENSES/SIX and the code can be
 # found at https://bitbucket.org/gutworth/six
 
+# Display width for each Unicode East Asian Width class, see
+# http://unicode.org/reports/tr11/
+# 'A' (Ambiguous) has no entry here: its width can be changed by option
+_EAW_MAP = {'Na': 1, 'N': 1, 'W': 2, 'F': 2, 'H': 1}
+
 if PY3:
     string_types = str,
     integer_types = int,
@@ -188,6 +195,20 @@ def u(s):
 
     def u_safe(s):
         return s
+
+    def strlen(data, encoding=None):
+        # encoding is unused here; it is accepted for signature compat with PY2
+        return len(data)
+
+    def east_asian_len(data, encoding=None, ambiguous_width=1):
+        """
+        Return the display width of ``data`` per Unicode East Asian Width.
+        Non-text input is measured with ``len``; ``encoding`` is not used here.
+        """
+        if not isinstance(data, text_type):
+            return len(data)
+        return sum(_EAW_MAP.get(east_asian_width(ch), ambiguous_width) for ch in data)
+
 else:
     string_types = basestring,
     integer_types = (int, long)
@@ -204,6 +225,25 @@ def u_safe(s):
         except:
             return s
 
+    def strlen(data, encoding=None):
+        try:
+            data = data.decode(encoding)  # measure the decoded text if possible
+        except UnicodeError:
+            pass  # decoding failed: measure data unchanged
+        return len(data)
+
+    def east_asian_len(data, encoding=None, ambiguous_width=1):
+        """
+        Return the display width of ``data`` per Unicode East Asian Width.
+        NOTE(review): non-``text_type`` input is measured with plain ``len``.
+        """
+        if not isinstance(data, text_type):
+            return len(data)
+        try:
+            data = data.decode(encoding)
+        except UnicodeError:
+            pass
+        return sum(_EAW_MAP.get(east_asian_width(ch), ambiguous_width) for ch in data)
 
 string_and_binary_types = string_types + (binary_type,)
 

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2149,28 +2149,50 @@ def _count_not_none(*args):
 
 
 
-def adjoin(space, *lists):
+def adjoin(space, *lists, **kwargs):
     """
     Glues together two sets of strings using the amount of space requested.
     The idea is to prettify.
-    """
+
+    Parameters
+    ----------
+    space : int
+        number of spaces for padding
+    lists : list of str
+        lists of str to be joined
+    strlen : callable, default ``len``
+        function used to calculate the display length of each str
+    justfunc : callable, default ``_justify``
+        function used to justify str. Needed for unicode handling.
+    """
+    strlen = kwargs.pop('strlen', len)
+    justfunc = kwargs.pop('justfunc', _justify)
+
     out_lines = []
     newLists = []
-    lengths = [max(map(len, x)) + space for x in lists[:-1]]
-
+    lengths = [max(map(strlen, x)) + space for x in lists[:-1]]
     # not the last one
     lengths.append(max(map(len, lists[-1])))
-
     maxLen = max(map(len, lists))
     for i, lst in enumerate(lists):
-        nl = [x.ljust(lengths[i]) for x in lst]
+        nl = justfunc(lst, lengths[i], mode='left')
         nl.extend([' ' * lengths[i]] * (maxLen - len(lst)))
         newLists.append(nl)
     toJoin = zip(*newLists)
     for lines in toJoin:
         out_lines.append(_join_unicode(lines))
     return _join_unicode(out_lines, sep='\n')
 
+def _justify(texts, max_len, mode='right'):
+    """
+    Pad each string in ``texts`` to width ``max_len`` and return a new list.
+
+    ``mode`` of 'left' uses ``ljust``, 'center' uses ``center``; any other
+    value (including the default 'right') uses ``rjust``.
+    """
+    method = ('ljust' if mode == 'left' else
+              'center' if mode == 'center' else 'rjust')
+    return [getattr(x, method)(max_len) for x in texts]
 
 def _join_unicode(lines, sep=''):
     try:

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -144,6 +144,17 @@
     Deprecated.
 """
 
+pc_east_asian_width_doc = """
+: boolean
+    Whether to use the Unicode East Asian Width to calculate the display text width
+    Enabling this may affect to the performance (default: False)
+"""  # help text for the 'unicode.east_asian_width' option
+pc_ambiguous_as_wide_doc = """
+: boolean
+    Whether to handle Unicode characters belong to Ambiguous as Wide (width=2)
+    (default: False)
+"""  # intended help text for the 'unicode.ambiguous_as_wide' option
+
 pc_line_width_deprecation_warning = """\
 line_width has been deprecated, use display.width instead (currently both are
 identical)
@@ -282,6 +293,10 @@ def mpl_style_cb(key):
                        pc_line_width_doc)
     cf.register_option('memory_usage', True, pc_memory_usage_doc,
                         validator=is_instance_factory([type(None), bool]))
+    cf.register_option('unicode.east_asian_width', False,
+                       pc_east_asian_width_doc, validator=is_bool)
+    cf.register_option('unicode.ambiguous_as_wide', False,
+                       pc_ambiguous_as_wide_doc, validator=is_bool)
 
 cf.deprecate_option('display.line_width',
                     msg=pc_line_width_deprecation_warning,