ENH/DOC: wide_to_long performance and functionality improvements (#14779)

erikcs · erikcs · commit 5747a251e62e · 2016-12-10T20:58:54.000+01:00
Speed up by avoiding big copies, and regex on categorical column

Add functionality to deal with "pathological" input

Add docstring examples and more test cases
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
@@ -1,5 +1,5 @@
 from .pandas_vb_common import *
-from pandas.core.reshape import melt
+from pandas.core.reshape import melt, wide_to_long
 
 
 class melt_dataframe(object):
@@ -74,3 +74,28 @@ def setup(self):
 
     def time_unstack_sparse_keyspace(self):
         self.idf.unstack()
+
+
+class wide_to_long_big(object):
+    goal_time = 0.2
+
+    def setup(self):
+            vars = 'ABCD'
+            nyrs = 20
+            nidvars = 20
+            N = 5000
+            yrvars = []
+            for var in vars:
+                for yr in range(1, nyrs + 1):
+                    yrvars.append(var + str(yr))
+
+            yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N)))
+            idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N)))
+
+            self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)],
+                                axis=1)
+            self.vars = vars
+
+    def time_wide_to_long_big(self):
+        self.df['id'] = self.df.index
+        wide_to_long(self.df, list(self.vars), i='id', j='year')
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -156,6 +156,7 @@ Data manipulations
    concat
    get_dummies
    factorize
+   wide_to_long
 
 Top-level missing data
 ~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -88,6 +88,7 @@ Removal of prior version deprecations/changes
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
+- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
 
 
 
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -3,6 +3,7 @@
 from pandas.compat import range, zip
 from pandas import compat
 import itertools
+import re
 
 import numpy as np
 
@@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None):
     return DataFrame(mdata, columns=id_cols + pivot_cols)
 
 
-def wide_to_long(df, stubnames, i, j):
+def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
     """
     Wide panel to long format. Less flexible but more user-friendly than melt.
 
+    With stubnames ['A', 'B'], this function expects to find one or more groups
+    of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
+    You specify what you want to call this suffix in the resulting long format
+    with `j` (for example `j`='year')
+
+    Each row of these wide variables is assumed to be uniquely identified by
+    `i` (can be a single column name or a list of column names)
+
+    All remaining variables in the data frame are left intact.
+
     Parameters
     ----------
     df : DataFrame
         The wide-format DataFrame
-    stubnames : list
-        A list of stub names. The wide format variables are assumed to
+    stubnames : list or string
+        The stub name(s). The wide format variables are assumed to
         start with the stub names.
-    i : str
-        The name of the id variable.
+    i : list or string
+        Column(s) to use as id variable(s)
     j : str
-        The name of the subobservation variable.
-    stubend : str
-        Regex to match for the end of the stubs.
+        The name of the subobservation variable. What you wish to name your
+        suffix in the long format.
+    sep : str, default ""
+        A character indicating the separation of the variable names
+        in the wide format, to be stripped from the names in the long format.
+        For example, if your column names are A-suffix1, A-suffix2, you
+        can strip the hyphen by specifying `sep`='-'
+    numeric_suffix : bool, default True
+        Whether the stub suffix is assumed to be numeric or not.
 
     Returns
     -------
     DataFrame
-        A DataFrame that contains each stub name as a variable as well as
-        variables for i and j.
+        A DataFrame that contains each stub name as a variable, with new index
+        (i, j)
 
     Examples
     --------
@@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j):
     0     a     d    2.5    3.2 -1.085631   0
     1     b     e    1.2    1.3  0.997345   1
     2     c     f    0.7    0.1  0.282978   2
-    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
+    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
                     X  A    B
     id year
     0  1970 -1.085631  a  2.5
@@ -926,38 +943,166 @@ def wide_to_long(df, stubnames, i, j):
     1  1980  0.997345  e  1.3
     2  1980  0.282978  f  0.1
 
+    With multiple id columns
+
+    >>> df = pd.DataFrame({
+    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+    ... })
+    >>> df
+       birth  famid  ht1  ht2
+    0      1      1  2.8  3.4
+    1      2      1  2.9  3.8
+    2      3      1  2.2  2.9
+    3      1      2  2.0  3.2
+    4      2      2  1.8  2.8
+    5      3      2  1.9  2.4
+    6      1      3  2.2  3.3
+    7      2      3  2.3  3.4
+    8      3      3  2.1  2.9
+    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
+    >>> l
+                      ht
+    famid birth age
+    1     1     1    2.8
+                2    3.4
+          2     1    2.9
+                2    3.8
+          3     1    2.2
+                2    2.9
+    2     1     1    2.0
+                2    3.2
+          2     1    1.8
+                2    2.8
+          3     1    1.9
+                2    2.4
+    3     1     1    2.2
+                2    3.3
+          2     1    2.3
+                2    3.4
+          3     1    2.1
+                2    2.9
+
+    Going from long back to wide just takes some creative use of `unstack`
+
+    >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
+    >>> w.columns = [name + suffix for name, suffix in w.columns.tolist()]
+    >>> w.reset_index()
+       famid  birth  ht1  ht2
+    0      1      1  2.8  3.4
+    1      1      2  2.9  3.8
+    2      1      3  2.2  2.9
+    3      2      1  2.0  3.2
+    4      2      2  1.8  2.8
+    5      2      3  1.9  2.4
+    6      3      1  2.2  3.3
+    7      3      2  2.3  3.4
+    8      3      3  2.1  2.9
+
+    Unwieldy column names are also handled
+
+    >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
+    ...                    'A(quarterly)-2011': np.random.rand(3),
+    ...                    'B(quarterly)-2010': np.random.rand(3),
+    ...                    'B(quarterly)-2011': np.random.rand(3),
+    ...                    'X' : np.random.randint(3, size=3)})
+    >>> df['id'] = df.index
+    >>> df
+      A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
+    0          0.531828          0.724455          0.322959          0.293714
+    1          0.634401          0.611024          0.361789          0.630976
+    2          0.849432          0.722443          0.228263          0.092105
+    \
+       X  id
+    0  0   0
+    1  1   1
+    2  2   2
+    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
+    ...                 i='id', j='year', sep='-')
+             X     A(quarterly)  B(quarterly)
+    id year
+    0  2010  0       0.531828       0.322959
+    1  2010  1       0.634401       0.361789
+    2  2010  2       0.849432       0.228263
+    0  2011  0       0.724455       0.293714
+    1  2011  1       0.611024       0.630976
+    2  2011  2       0.722443       0.092105
+
+    If we have many columns, we could also use a regex to find our
+    stubnames and pass that list on to wide_to_long
+
+    >>> stubnames = set([match[0] for match in
+    ...                 df.columns.str.findall('[A-B]\(.*\)').values
+    ...                 if match != [] ])
+    >>> list(stubnames)
+    ['B(quarterly)', 'A(quarterly)']
+
     Notes
     -----
-    All extra variables are treated as extra id variables. This simply uses
+    All extra variables are left untouched. This simply uses
     `pandas.melt` under the hood, but is hard-coded to "do the right thing"
     in a typicaly case.
     """
 
     def get_var_names(df, regex):
         return df.filter(regex=regex).columns.tolist()
 
-    def melt_stub(df, stub, i, j):
-        varnames = get_var_names(df, "^" + stub)
-        newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
-                     var_name=j)
-        newdf_j = newdf[j].str.replace(stub, "")
-        try:
-            newdf_j = newdf_j.astype(int)
-        except ValueError:
-            pass
-        newdf[j] = newdf_j
-        return newdf
-
-    id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
-    if i not in id_vars:
-        id_vars += [i]
-
-    newdf = melt_stub(df, stubnames[0], id_vars, j)
-
-    for stub in stubnames[1:]:
-        new = melt_stub(df, stub, id_vars, j)
-        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
-    return newdf.set_index([i, j])
+    def melt_stub(df, stub, i, j, value_vars, sep):
+        newdf = melt(df, id_vars=i, value_vars=value_vars,
+                     value_name=stub.rstrip(sep), var_name=j)
+        newdf[j] = Categorical(newdf[j])
+        newdf[j] = newdf[j].str.replace(re.escape(stub), "")
+
+        return newdf.set_index(i + [j])
+
+    if any(map(lambda s: s in df.columns.tolist(), stubnames)):
+        raise ValueError("stubname can't be identical to a column name")
+
+    if not isinstance(stubnames, list):
+        stubnames = [stubnames]
+
+    if not isinstance(i, list):
+        i = [i]
+
+    stubs = list(map(lambda x: x + sep, stubnames))
+
+    # This regex is needed to avoid multiple "greedy" matches with stubs
+    # that have overlapping substrings
+    # For example A2011, A2012 are separate from AA2011, AA2012
+    # And BBone, BBtwo is different from Bone, Btwo, and BBBrating
+    value_vars = list(map(lambda x: get_var_names(
+        df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs))
+
+    value_vars_flattened = [e for sublist in value_vars for e in sublist]
+    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
+
+    # If we know the stub end type is a number we can disambiguate potential
+    # misclassified value_vars, for ex, with stubname A: A2011, A2012 and
+    # Arating would all be found as value_vars. If the suffix is numeric we
+    # know the last one should be an id_var. (Note the converse disambiguation
+    # is not possible)
+    if numeric_suffix:
+        for s, v in zip(stubs, value_vars):
+            for vname in v[:]:
+                end = vname.replace(s, "")
+                if not end.isdigit():
+                    v.remove(vname)
+                    id_vars.append(vname)
+
+    melted = []
+    for s, v in zip(stubs, value_vars):
+        melted.append(melt_stub(df, s, i, j, v, sep))
+    melted = melted[0].join(melted[1:], how='outer')
+
+    if len(i) == 1:
+        new = df[id_vars].set_index(i).join(melted)
+        return new
+
+    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
+
+    return new
 
 
 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py

Original file line number	Diff line number	Diff line change
`@@ -88,6 +88,7 @@ Removal of prior version deprecations/changes`
`88`	`88`	`Performance Improvements`
`89`	`89`	`~~~~~~~~~~~~~~~~~~~~~~~~`
`90`	`90`
	`91`	+- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
`91`	`92`
`92`	`93`
`93`	`94`