From 5747a251e62e20e6d659c851b2f2457266722ad2 Mon Sep 17 00:00:00 2001 From: nuffe Date: Sat, 10 Dec 2016 20:58:54 +0100 Subject: [PATCH 1/6] ENH/DOC: wide_to_long performance and functionality improvements (#14779) Speed up by avoiding big copies, and regex on categorical column Add functionality to deal with "pathological" input Add docstring examples and more test cases --- asv_bench/benchmarks/reshape.py | 27 +++- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/reshape.py | 213 +++++++++++++++++++++++++++----- pandas/tests/test_reshape.py | 200 +++++++++++++++++++++++++++++- 5 files changed, 406 insertions(+), 36 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index ab235e085986c..6ebf414de52f3 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,5 +1,5 @@ from .pandas_vb_common import * -from pandas.core.reshape import melt +from pandas.core.reshape import melt, wide_to_long class melt_dataframe(object): @@ -74,3 +74,28 @@ def setup(self): def time_unstack_sparse_keyspace(self): self.idf.unstack() + + +class wide_to_long_big(object): + goal_time = 0.2 + + def setup(self): + vars = 'ABCD' + nyrs = 20 + nidvars = 20 + N = 5000 + yrvars = [] + for var in vars: + for yr in range(1, nyrs + 1): + yrvars.append(var + str(yr)) + + yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N))) + idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N))) + + self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)], + axis=1) + self.vars = vars + + def time_wide_to_long_big(self): + self.df['id'] = self.df.index + wide_to_long(self.df, list(self.vars), i='id', j='year') diff --git a/doc/source/api.rst b/doc/source/api.rst index a510f663d19ee..3766093cabed0 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -156,6 +156,7 @@ Data manipulations concat get_dummies factorize + wide_to_long Top-level missing data ~~~~~~~~~~~~~~~~~~~~~~ diff --git 
a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index ff086380fdb05..5d99308abfb63 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -88,6 +88,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 055a0041b181a..e231aabfce8ae 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -3,6 +3,7 @@ from pandas.compat import range, zip from pandas import compat import itertools +import re import numpy as np @@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None): return DataFrame(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j): +def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True): """ Wide panel to long format. Less flexible but more user-friendly than melt. + With stubnames ['A', 'B'], this function expects to find one or more + group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... + You specify what you want to call this suffix in the resulting long format + with `j` (for example `j`='year') + + Each row of these wide variables are assumed to be uniquely identified by + `i` (can be a single column name or a list of column names) + + All remaining variables in the data frame are left intact. + Parameters ---------- df : DataFrame The wide-format DataFrame - stubnames : list - A list of stub names. The wide format variables are assumed to + stubnames : list or string + The stub name(s). The wide format variables are assumed to start with the stub names. - i : str - The name of the id variable. + i : list or string + Column(s) to use as id variable(s) j : str - The name of the subobservation variable. - stubend : str - Regex to match for the end of the stubs. + The name of the subobservation variable. 
What you wish to name your + suffix in the long format. + sep : str, default "" + A character indicating the separation of the variable names + in the wide format, to be stripped from the names in the long format. + For example, if your column names are A-suffix1, A-suffix2, you + can strip the hypen by specifying `sep`='-' + numeric_suffix : bool, default True + Whether the stub suffix is assumed to be numeric or not. Returns ------- DataFrame - A DataFrame that contains each stub name as a variable as well as - variables for i and j. + A DataFrame that contains each stub name as a variable, with new index + (i, j) Examples -------- @@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j): 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 - >>> wide_to_long(df, ["A", "B"], i="id", j="year") + >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") X A B id year 0 1970 -1.085631 a 2.5 @@ -926,9 +943,105 @@ def wide_to_long(df, stubnames, i, j): 1 1980 0.997345 e 1.3 2 1980 0.282978 f 0.1 + With multiple id columns + + >>> df = pd.DataFrame({ + ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + ...
}) + >>> df + birth famid ht1 ht2 + 0 1 1 2.8 3.4 + 1 2 1 2.9 3.8 + 2 3 1 2.2 2.9 + 3 1 2 2.0 3.2 + 4 2 2 1.8 2.8 + 5 3 2 1.9 2.4 + 6 1 3 2.2 3.3 + 7 2 3 2.3 3.4 + 8 3 3 2.1 2.9 + >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') + >>> l + ht + famid birth age + 1 1 1 2.8 + 2 3.4 + 2 1 2.9 + 2 3.8 + 3 1 2.2 + 2 2.9 + 2 1 1 2.0 + 2 3.2 + 2 1 1.8 + 2 2.8 + 3 1 1.9 + 2 2.4 + 3 1 1 2.2 + 2 3.3 + 2 1 2.3 + 2 3.4 + 3 1 2.1 + 2 2.9 + + Going from long back to wide just takes some creative use of `unstack` + + >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() + >>> w.columns = [name + suffix for name, suffix in wide.columns.tolist()] + >>> w.reset_index() + famid birth ht1 ht2 + 0 1 1 2.8 3.4 + 1 1 2 2.9 3.8 + 2 1 3 2.2 2.9 + 3 2 1 2.0 3.2 + 4 2 2 1.8 2.8 + 5 2 3 1.9 2.4 + 6 3 1 2.2 3.3 + 7 3 2 2.3 3.4 + 8 3 3 2.1 2.9 + + Less wieldy column names are also handled + + >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), + ... 'A(quarterly)-2011': np.random.rand(3), + ... 'B(quarterly)-2010': np.random.rand(3), + ... 'B(quarterly)-2011': np.random.rand(3), + ... 
'X' : np.random.randint(3, size=3)}) + >>> df['id'] = df.index + >>> df + A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 + 0 0.531828 0.724455 0.322959 0.293714 + 1 0.634401 0.611024 0.361789 0.630976 + 2 0.849432 0.722443 0.228263 0.092105 + \ + X id + 0 0 0 + 1 1 1 + 2 2 2 + >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], + i='id', j='year', sep='-') + X A(quarterly) B(quarterly) + id year + 0 2010 0 0.531828 0.322959 + 1 2010 1 0.634401 0.361789 + 2 2010 2 0.849432 0.228263 + 0 2011 0 0.724455 0.293714 + 1 2011 1 0.611024 0.630976 + 2 2011 2 0.722443 0.092105 + + If we have many columns, we could also use a regex to find our + stubnames and pass that list on to wide_to_long + + >>> stubnames = set([match[0] for match in + df.columns.str.findall('[A-B]\(.*\)').values + if match != [] ]) + >>> list(stubnames) + ['B(quarterly)', 'A(quarterly)'] + Notes ----- - All extra variables are treated as extra id variables. This simply uses + All extra variables are left untouched. This simply uses `pandas.melt` under the hood, but is hard-coded to "do the right thing" in a typicaly case.
""" @@ -936,28 +1049,60 @@ def wide_to_long(df, stubnames, i, j): def get_var_names(df, regex): return df.filter(regex=regex).columns.tolist() - def melt_stub(df, stub, i, j): - varnames = get_var_names(df, "^" + stub) - newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub, - var_name=j) - newdf_j = newdf[j].str.replace(stub, "") - try: - newdf_j = newdf_j.astype(int) - except ValueError: - pass - newdf[j] = newdf_j - return newdf - - id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames)) - if i not in id_vars: - id_vars += [i] - - newdf = melt_stub(df, stubnames[0], id_vars, j) - - for stub in stubnames[1:]: - new = melt_stub(df, stub, id_vars, j) - newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) - return newdf.set_index([i, j]) + def melt_stub(df, stub, i, j, value_vars, sep): + newdf = melt(df, id_vars=i, value_vars=value_vars, + value_name=stub.rstrip(sep), var_name=j) + newdf[j] = Categorical(newdf[j]) + newdf[j] = newdf[j].str.replace(re.escape(stub), "") + + return newdf.set_index(i + [j]) + + if any(map(lambda s: s in df.columns.tolist(), stubnames)): + raise ValueError("stubname can't be identical to a column name") + + if not isinstance(stubnames, list): + stubnames = [stubnames] + + if not isinstance(i, list): + i = [i] + + stubs = list(map(lambda x: x + sep, stubnames)) + + # This regex is needed to avoid multiple "greedy" matches with stubs + # that have overlapping substrings + # For example A2011, A2012 are separate from AA2011, AA2012 + # And BBone, BBtwo is different from Bone, Btwo, and BBBrating + value_vars = list(map(lambda x: get_var_names( + df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs)) + + value_vars_flattened = [e for sublist in value_vars for e in sublist] + id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) + + # If we know the stub end type is a number we can disambiguate potential + # misclassified value_vars, for ex, with stubname A: A2011, A2012 and + # 
Arating would all be found as value_vars. If the suffix is numeric we + # know the last one should be an id_var. (Note the converse disambiguation + # is not possible) + if numeric_suffix: + for s, v in zip(stubs, value_vars): + for vname in v[:]: + end = vname.replace(s, "") + if not end.isdigit(): + v.remove(vname) + id_vars.append(vname) + + melted = [] + for s, v in zip(stubs, value_vars): + melted.append(melt_stub(df, s, i, j, v, sep)) + melted = melted[0].join(melted[1:], how='outer') + + if len(i) == 1: + new = df[id_vars].set_index(i).join(melted) + return new + + new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) + + return new def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 80d1f5f76e5a9..a7d7dbb2e0a24 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -698,7 +698,7 @@ def test_simple(self): exp_data = {"X": x.tolist() + x.tolist(), "A": ['a', 'b', 'c', 'd', 'e', 'f'], "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], - "year": [1970, 1970, 1970, 1980, 1980, 1980], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], "id": [0, 1, 2, 0, 1, 2]} exp_frame = DataFrame(exp_data) exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] @@ -716,6 +716,204 @@ def test_stubs(self): self.assertEqual(stubs, ['inc', 'edu']) + def test_separating_character(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A.1970": {0: "a", + 1: "b", + 2: "c"}, + "A.1980": {0: "d", + 1: "e", + 2: "f"}, + "B.1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B.1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A": ['a', 'b', 'c', 'd', 'e', 'f'], + "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = 
exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_escapable_characters(self): + np.random.seed(123) + x = np.random.randn(3) + df = pd.DataFrame({"A(quarterly)1970": {0: "a", + 1: "b", + 2: "c"}, + "A(quarterly)1980": {0: "d", + 1: "e", + 2: "f"}, + "B(quarterly)1970": {0: 2.5, + 1: 1.2, + 2: .7}, + "B(quarterly)1980": {0: 3.2, + 1: 1.3, + 2: .1}, + "X": dict(zip( + range(3), x))}) + df["id"] = df.index + exp_data = {"X": x.tolist() + x.tolist(), + "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'], + "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1], + "year": ['1970', '1970', '1970', '1980', '1980', '1980'], + "id": [0, 1, 2, 0, 1, 2]} + exp_frame = DataFrame(exp_data) + exp_frame = exp_frame.set_index( + ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]] + long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], + i="id", j="year") + tm.assert_frame_equal(long_frame, exp_frame) + + def test_unbalanced(self): + # test that we can have a varying amount of time variables + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': ['X1', 'X1', 'X2', 'X2'], + 'A': [1.0, 3.0, 2.0, 4.0], + 'B': [5.0, np.nan, 6.0, np.nan], + 'id': [0, 0, 1, 1], + 'year': ['2010', '2011', '2010', '2011']} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]] + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame, exp_frame) + + def test_character_overlap(self): + # Test we handle overlapping characters in both id_vars and value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'BBBX': [91, 92, 93], + 'BBBZ': [91, 92, 93] + }) 
+ df['id'] = df.index + exp_frame = pd.DataFrame({ + 'BBBX': [91, 92, 93, 91, 92, 93], + 'BBBZ': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', '12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['BBBX', 'BBBZ', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_separator(self): + # if an invalid separator is supplied a empty data frame is returned + sep = 'nope!' + df = pd.DataFrame({'A2010': [1.0, 2.0], + 'A2011': [3.0, 4.0], + 'B2010': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'A2010': [], + 'A2011': [], + 'B2010': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'A2010', 'A2011', 'B2010', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep) + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_num_string_disambiguation(self): + # Test that we can disambiguate number value_vars from + # string value_vars + df = pd.DataFrame({ + 'A11': ['a11', 'a22', 'a33'], + 'A12': ['a21', 'a22', 'a23'], + 'B11': ['b11', 'b12', 'b13'], + 'B12': ['b21', 'b22', 'b23'], + 'BB11': [1, 2, 3], + 'BB12': [4, 5, 6], + 'Arating': [91, 92, 93], + 'Arating_old': [91, 92, 93] + }) + df['id'] = df.index + exp_frame = pd.DataFrame({ + 'Arating': [91, 92, 93, 91, 92, 93], + 'Arating_old': [91, 92, 93, 91, 92, 93], + 'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'], + 'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'], + 'BB': [1, 2, 3, 4, 5, 6], + 'id': [0, 1, 2, 0, 1, 2], + 'year': ['11', '11', '11', '12', 
'12', '12']}) + exp_frame = exp_frame.set_index(['id', 'year'])[ + ['Arating', 'Arating_old', 'A', 'B', 'BB']] + long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_invalid_suffixtype(self): + # If all stubs names end with a string, but a numeric suffix is + # assumed, an empty data frame is returned + df = pd.DataFrame({'Aone': [1.0, 2.0], + 'Atwo': [3.0, 4.0], + 'Bone': [5.0, 6.0], + 'X': ['X1', 'X2']}) + df['id'] = df.index + exp_data = {'X': '', + 'Aone': [], + 'Atwo': [], + 'Bone': [], + 'id': [], + 'year': [], + 'A': [], + 'B': []} + exp_frame = pd.DataFrame(exp_data) + exp_frame = exp_frame.set_index(['id', 'year'])[[ + 'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']] + exp_frame.index.set_levels([[0, 1], []], inplace=True) + long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year') + tm.assert_frame_equal(long_frame.sort_index(axis=1), + exp_frame.sort_index(axis=1)) + + def test_multiple_id_columns(self): + # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm + df = pd.DataFrame({ + 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], + 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] + }) + exp_frame = pd.DataFrame({ + 'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, + 2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9], + 'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3], + 'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3], + 'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1', + '2', '1', '2', '1', '2', '1', '2', '1', '2'] + }) + exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] + long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') + tm.assert_frame_equal(long_frame, exp_frame) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 
54c5920d57e50e31a16c211fb50f38b3b73c2a49 Mon Sep 17 00:00:00 2001 From: nuffe Date: Sun, 11 Dec 2016 00:32:41 +0100 Subject: [PATCH 2/6] Specify the suffix with a regex --- pandas/core/reshape.py | 46 +++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index e231aabfce8ae..744c0fb59e74f 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -876,7 +876,7 @@ def lreshape(data, groups, dropna=True, label=None): return DataFrame(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True): +def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): """ Wide panel to long format. Less flexible but more user-friendly than melt. @@ -907,8 +907,10 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True): in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you can strip the hypen by specifying `sep`='-' - numeric_suffix : bool, default True - Whether the stub suffix is assumed to be numeric or not. + suffix : str default '\d+' + A regular expression capturing the wanted suffixes. '\d+' captures + numeric suffixes. Suffixes with no numbers could be specified with the + negated character class '\D+'. Returns ------- @@ -1045,15 +1047,24 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True): `pandas.melt` under the hood, but is hard-coded to "do the right thing" in a typicaly case. """ + def get_var_names(df, stub, sep, suffix): + # The first part of this regex is needed to avoid multiple "greedy" + # matches with stubs that have overlapping substrings. For example + # A2011, A2012 are separate from AA2011, AA2012. And BBone, BBtwo is + # different from Bone, Btwo, and BBBrating + # The last part lets us disambiguate suffixes. 
For example, with + # stubname A: (A2011, A2012) would be captured while Arating would + # be ignored by the numeric class \d+ + regex = "^{0}(?!{1}){2}{3}".format( + re.escape(stub), re.escape(stub[-1]), re.escape(sep), suffix) - def get_var_names(df, regex): return df.filter(regex=regex).columns.tolist() def melt_stub(df, stub, i, j, value_vars, sep): newdf = melt(df, id_vars=i, value_vars=value_vars, value_name=stub.rstrip(sep), var_name=j) newdf[j] = Categorical(newdf[j]) - newdf[j] = newdf[j].str.replace(re.escape(stub), "") + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") return newdf.set_index(i + [j]) @@ -1066,33 +1077,14 @@ def melt_stub(df, stub, i, j, value_vars, sep): if not isinstance(i, list): i = [i] - stubs = list(map(lambda x: x + sep, stubnames)) - - # This regex is needed to avoid multiple "greedy" matches with stubs - # that have overlapping substrings - # For example A2011, A2012 are separate from AA2011, AA2012 - # And BBone, BBtwo is different from Bone, Btwo, and BBBrating - value_vars = list(map(lambda x: get_var_names( - df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs)) + value_vars = list(map(lambda stub: + get_var_names(df, stub, sep, suffix), stubnames)) value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - # If we know the stub end type is a number we can disambiguate potential - # misclassified value_vars, for ex, with stubname A: A2011, A2012 and - # Arating would all be found as value_vars. If the suffix is numeric we - # know the last one should be an id_var. 
(Note the converse disambiguation - # is not possible) - if numeric_suffix: - for s, v in zip(stubs, value_vars): - for vname in v[:]: - end = vname.replace(s, "") - if not end.isdigit(): - v.remove(vname) - id_vars.append(vname) - melted = [] - for s, v in zip(stubs, value_vars): + for s, v in zip(stubnames, value_vars): melted.append(melt_stub(df, s, i, j, v, sep)) melted = melted[0].join(melted[1:], how='outer') From 1c49291d165ed920654b7d307063402f10437a3b Mon Sep 17 00:00:00 2001 From: nuffe Date: Sun, 11 Dec 2016 17:09:53 +0100 Subject: [PATCH 3/6] Can of course get rid negative lookahead now that suffix is a regex --- pandas/core/reshape.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 744c0fb59e74f..685ac253ce15b 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -907,10 +907,13 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you can strip the hypen by specifying `sep`='-' - suffix : str default '\d+' + suffix : str, default '\d+' A regular expression capturing the wanted suffixes. '\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the - negated character class '\D+'. + negated character class '\D+'. You can also further disambiguate + suffixes, for example, if your wide variables are of the form + Aone, Btwo,.., and you have an unrelated column Arating, you can + ignore the last one by specyfing `suffix`='(!?one|two)' Returns ------- @@ -1048,16 +1051,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): in a typicaly case. """ def get_var_names(df, stub, sep, suffix): - # The first part of this regex is needed to avoid multiple "greedy" - # matches with stubs that have overlapping substrings. For example - # A2011, A2012 are separate from AA2011, AA2012. 
And BBone, BBtwo is - # different from Bone, Btwo, and BBBrating - # The last part lets us disambiguate suffixes. For example, with - # stubname A: (A2011, A2012) would be captured while Arating would - # be ignored by the numeric class \d+ - regex = "^{0}(?!{1}){2}{3}".format( - re.escape(stub), re.escape(stub[-1]), re.escape(sep), suffix) - + regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix) return df.filter(regex=regex).columns.tolist() def melt_stub(df, stub, i, j, value_vars, sep): From 295d1e67738655813626075c6d682b174fd07acd Mon Sep 17 00:00:00 2001 From: nuffe Date: Sun, 11 Dec 2016 18:58:47 +0100 Subject: [PATCH 4/6] Use pd.Index in doc example Use is_list_like Add GH ticket # --- pandas/core/reshape.py | 14 +++++++++----- pandas/tests/test_reshape.py | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 685ac253ce15b..7582ea86dd8b4 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -894,10 +894,10 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): ---------- df : DataFrame The wide-format DataFrame - stubnames : list or string + stubnames : str or list-like The stub name(s). The wide format variables are assumed to start with the stub names. - i : list or string + i : str or list-like Column(s) to use as id variable(s) j : str The name of the subobservation variable. 
What you wish to name your @@ -993,7 +993,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): Going from long back to wide just takes some creative use of `unstack` >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() - >>> w.columns = [name + suffix for name, suffix in wide.columns.tolist()] + >>> w.columns = pd.Index(w.columns).str.join('') >>> w.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 @@ -1065,11 +1065,15 @@ def melt_stub(df, stub, i, j, value_vars, sep): if any(map(lambda s: s in df.columns.tolist(), stubnames)): raise ValueError("stubname can't be identical to a column name") - if not isinstance(stubnames, list): + if not is_list_like(stubnames): stubnames = [stubnames] + else: + stubnames = list(stubnames) - if not isinstance(i, list): + if not is_list_like(i): i = [i] + else: + i = list(i) value_vars = list(map(lambda stub: get_var_names(df, stub, sep, suffix), stubnames)) diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index a7d7dbb2e0a24..603674ac01bc0 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -717,6 +717,7 @@ def test_stubs(self): self.assertEqual(stubs, ['inc', 'edu']) def test_separating_character(self): + # GH14779 np.random.seed(123) x = np.random.randn(3) df = pd.DataFrame({"A.1970": {0: "a", From dc13064b687c083b9058ad24aa65bb8ff643fbe6 Mon Sep 17 00:00:00 2001 From: nuffe Date: Sun, 11 Dec 2016 21:33:10 +0100 Subject: [PATCH 5/6] Set docstring to raw literal to allow backslashes to be printed (still had to escape them) --- pandas/core/reshape.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7582ea86dd8b4..71f32f4b21e29 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -877,13 +877,13 @@ def lreshape(data, groups, dropna=True, label=None): def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): - """ + r""" Wide panel to long format. 
Less flexible but more user-friendly than melt. With stubnames ['A', 'B'], this function expects to find one or more group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... You specify what you want to call this suffix in the resulting long format - with `j` (for example `j`='year') + with `j` (for example `j='year'`) Each row of these wide variables are assumed to be uniquely identified by `i` (can be a single column name or a list of column names) @@ -906,14 +906,14 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): A character indicating the separation of the variable names in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you - can strip the hypen by specifying `sep`='-' - suffix : str, default '\d+' - A regular expression capturing the wanted suffixes. '\d+' captures + can strip the hyphen by specifying `sep='-'` + suffix : str, default '\\d+' + A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the - negated character class '\D+'. You can also further disambiguate + negated character class '\\D+'.
You can also further disambiguate suffixes, for example, if your wide variables are of the form Aone, Btwo,.., and you have an unrelated column Arating, you can - ignore the last one by specyfing `suffix`='(!?one|two)' + ignore the last one by specifying `suffix='(!?one|two)'` Returns ------- From df1edf8e182047f330df02e1697ad6feaff1dbf4 Mon Sep 17 00:00:00 2001 From: nuffe Date: Sun, 11 Dec 2016 22:23:15 +0100 Subject: [PATCH 6/6] asv_bench: fix indentation and simplify --- asv_bench/benchmarks/reshape.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 6ebf414de52f3..a3ecfff52c794 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -80,21 +80,18 @@ class wide_to_long_big(object): goal_time = 0.2 def setup(self): - vars = 'ABCD' - nyrs = 20 - nidvars = 20 - N = 5000 - yrvars = [] - for var in vars: - for yr in range(1, nyrs + 1): - yrvars.append(var + str(yr)) - - yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N))) - idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N))) - - self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)], - axis=1) - self.vars = vars + vars = 'ABCD' + nyrs = 20 + nidvars = 20 + N = 5000 + yrvars = [] + for var in vars: + for yr in range(1, nyrs + 1): + yrvars.append(var + str(yr)) + + self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)), + columns=list(range(nidvars)) + yrvars) + self.vars = vars def time_wide_to_long_big(self): self.df['id'] = self.df.index