BUG: coerce pd.wide_to_long suffixes to ints (#17628)

tdpetrou · jorisvandenbossche · commit a259b64fe582 · 2017-12-10T22:36:15.000+01:00
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -191,6 +191,7 @@ Other API Changes
 - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
 - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`)
 - :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`)
+- :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`)
 
 .. _whatsnew_0220.deprecations:
 
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
@@ -13,6 +13,7 @@
 
 import re
 from pandas.core.dtypes.missing import notna
+from pandas.core.tools.numeric import to_numeric
 
 
 @Appender(_shared_docs['melt'] %
@@ -199,6 +200,9 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
 
         .. versionadded:: 0.20.0
 
+        .. versionchanged:: 0.22.0
+            When all suffixes are numeric, they are cast to int64/float64.
+
     Returns
     -------
     DataFrame
@@ -278,8 +282,8 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
 
     Going from long back to wide just takes some creative use of `unstack`
 
-    >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
-    >>> w.columns = pd.Index(w.columns).str.join('')
+    >>> w = l.unstack()
+    >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
     >>> w.reset_index()
        famid  birth  ht1  ht2
     0      1      1  2.8  3.4
@@ -333,26 +337,76 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
     >>> list(stubnames)
     ['A(quarterly)', 'B(quarterly)']
 
+    All of the above examples have integers as suffixes. It is possible to
+    have non-integers as suffixes.
+
+    >>> df = pd.DataFrame({
+    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+    ...     'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+    ...     'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+    ... })
+    >>> df
+       birth  famid  ht_one  ht_two
+    0      1      1     2.8     3.4
+    1      2      1     2.9     3.8
+    2      3      1     2.2     2.9
+    3      1      2     2.0     3.2
+    4      2      2     1.8     2.8
+    5      3      2     1.9     2.4
+    6      1      3     2.2     3.3
+    7      2      3     2.3     3.4
+    8      3      3     2.1     2.9
+
+    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
+    ...                     sep='_', suffix=r'\w+')
+    >>> l
+    ... # doctest: +NORMALIZE_WHITESPACE
+                      ht
+    famid birth age
+    1     1     one  2.8
+                two  3.4
+          2     one  2.9
+                two  3.8
+          3     one  2.2
+                two  2.9
+    2     1     one  2.0
+                two  3.2
+          2     one  1.8
+                two  2.8
+          3     one  1.9
+                two  2.4
+    3     1     one  2.2
+                two  3.3
+          2     one  2.3
+                two  3.4
+          3     one  2.1
+                two  2.9
+
     Notes
     -----
     All extra variables are left untouched. This simply uses
     `pandas.melt` under the hood, but is hard-coded to "do the right thing"
-    in a typicaly case.
+    in a typical case.
     """
     def get_var_names(df, stub, sep, suffix):
-        regex = "^{stub}{sep}{suffix}".format(
+        regex = r'^{stub}{sep}{suffix}$'.format(
             stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
-        return df.filter(regex=regex).columns.tolist()
+        pattern = re.compile(regex)
+        return [col for col in df.columns if pattern.match(col)]
 
     def melt_stub(df, stub, i, j, value_vars, sep):
         newdf = melt(df, id_vars=i, value_vars=value_vars,
                      value_name=stub.rstrip(sep), var_name=j)
         newdf[j] = Categorical(newdf[j])
         newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
 
+        # GH17627 Cast numeric suffixes to int/float
+        newdf[j] = to_numeric(newdf[j], errors='ignore')
+
         return newdf.set_index(i + [j])
 
-    if any(map(lambda s: s in df.columns.tolist(), stubnames)):
+    if any([col in stubnames for col in df.columns]):
         raise ValueError("stubname can't be identical to a column name")
 
     if not is_list_like(stubnames):
@@ -368,8 +422,7 @@ def melt_stub(df, stub, i, j, value_vars, sep):
     if df[i].duplicated().any():
         raise ValueError("the id variables need to uniquely identify each row")
 
-    value_vars = list(map(lambda stub:
-                          get_var_names(df, stub, sep, suffix), stubnames))
+    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
 
     value_vars_flattened = [e for sublist in value_vars for e in sublist]
     id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py