Skip to content

Commit 86233e1

Browse files
erikcsjreback
authored and committed
ENH/DOC: wide_to_long performance and docstring clarification
closes #14778 Please see regex search on long columns by first converting to Categorical, avoid melting all dataframes with all the id variables, and wait with trying to convert the "time" variable to `int` until last), and clear up the docstring. Author: nuffe <[email protected]> Closes #14779 from nuffe/wide2longfix and squashes the following commits: df1edf8 [nuffe] asv_bench: fix indentation and simplify dc13064 [nuffe] Set docstring to raw literal to allow backslashes to be printed (still had to escape them) 295d1e6 [nuffe] Use pd.Index in doc example 1c49291 [nuffe] Can of course get rid negative lookahead now that suffix is a regex 54c5920 [nuffe] Specify the suffix with a regex 5747a25 [nuffe] ENH/DOC: wide_to_long performance and functionality improvements (#14779)
1 parent 7d8bc0d commit 86233e1

File tree

5 files changed

+402
-39
lines changed

5 files changed

+402
-39
lines changed

asv_bench/benchmarks/reshape.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .pandas_vb_common import *
2-
from pandas.core.reshape import melt
2+
from pandas.core.reshape import melt, wide_to_long
33

44

55
class melt_dataframe(object):
@@ -74,3 +74,25 @@ def setup(self):
7474

7575
def time_unstack_sparse_keyspace(self):
7676
self.idf.unstack()
77+
78+
79+
class wide_to_long_big(object):
80+
goal_time = 0.2
81+
82+
def setup(self):
83+
vars = 'ABCD'
84+
nyrs = 20
85+
nidvars = 20
86+
N = 5000
87+
yrvars = []
88+
for var in vars:
89+
for yr in range(1, nyrs + 1):
90+
yrvars.append(var + str(yr))
91+
92+
self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)),
93+
columns=list(range(nidvars)) + yrvars)
94+
self.vars = vars
95+
96+
def time_wide_to_long_big(self):
97+
self.df['id'] = self.df.index
98+
wide_to_long(self.df, list(self.vars), i='id', j='year')

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ Data manipulations
157157
concat
158158
get_dummies
159159
factorize
160+
wide_to_long
160161

161162
Top-level missing data
162163
~~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ Removal of prior version deprecations/changes
113113
Performance Improvements
114114
~~~~~~~~~~~~~~~~~~~~~~~~
115115

116+
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
116117

117118

118119

pandas/core/reshape.py

+177-37
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from pandas.compat import range, zip
44
from pandas import compat
55
import itertools
6+
import re
67

78
import numpy as np
89

@@ -877,29 +878,55 @@ def lreshape(data, groups, dropna=True, label=None):
877878
return DataFrame(mdata, columns=id_cols + pivot_cols)
878879

879880

880-
def wide_to_long(df, stubnames, i, j):
881-
"""
881+
def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
882+
r"""
882883
Wide panel to long format. Less flexible but more user-friendly than melt.
883884
885+
With stubnames ['A', 'B'], this function expects to find one or more
886+
groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
887+
You specify what you want to call this suffix in the resulting long format
888+
with `j` (for example `j='year'`)
889+
890+
Each row of these wide variables is assumed to be uniquely identified by
891+
`i` (can be a single column name or a list of column names)
892+
893+
All remaining variables in the data frame are left intact.
894+
884895
Parameters
885896
----------
886897
df : DataFrame
887898
The wide-format DataFrame
888-
stubnames : list
889-
A list of stub names. The wide format variables are assumed to
899+
stubnames : str or list-like
900+
The stub name(s). The wide format variables are assumed to
890901
start with the stub names.
891-
i : str
892-
The name of the id variable.
902+
i : str or list-like
903+
Column(s) to use as id variable(s)
893904
j : str
894-
The name of the subobservation variable.
895-
stubend : str
896-
Regex to match for the end of the stubs.
905+
The name of the subobservation variable. What you wish to name your
906+
suffix in the long format.
907+
sep : str, default ""
908+
A character indicating the separation of the variable names
909+
in the wide format, to be stripped from the names in the long format.
910+
For example, if your column names are A-suffix1, A-suffix2, you
911+
can strip the hyphen by specifying `sep='-'`
912+
913+
.. versionadded:: 0.20.0
914+
915+
suffix : str, default '\\d+'
916+
A regular expression capturing the wanted suffixes. '\\d+' captures
917+
numeric suffixes. Suffixes with no numbers could be specified with the
918+
negated character class '\\D+'. You can also further disambiguate
919+
suffixes, for example, if your wide variables are of the form
920+
Aone, Btwo,.., and you have an unrelated column Arating, you can
921+
ignore the last one by specifying `suffix='(one|two)'`
922+
923+
.. versionadded:: 0.20.0
897924
898925
Returns
899926
-------
900927
DataFrame
901-
A DataFrame that contains each stub name as a variable as well as
902-
variables for i and j.
928+
A DataFrame that contains each stub name as a variable, with new index
929+
(i, j)
903930
904931
Examples
905932
--------
@@ -918,7 +945,7 @@ def wide_to_long(df, stubnames, i, j):
918945
0 a d 2.5 3.2 -1.085631 0
919946
1 b e 1.2 1.3 0.997345 1
920947
2 c f 0.7 0.1 0.282978 2
921-
>>> wide_to_long(df, ["A", "B"], i="id", j="year")
948+
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
922949
X A B
923950
id year
924951
0 1970 -1.085631 a 2.5
@@ -928,38 +955,151 @@ def wide_to_long(df, stubnames, i, j):
928955
1 1980 0.997345 e 1.3
929956
2 1980 0.282978 f 0.1
930957
958+
With multiple id columns
959+
960+
>>> df = pd.DataFrame({
961+
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
962+
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
963+
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
964+
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
965+
... })
966+
>>> df
967+
birth famid ht1 ht2
968+
0 1 1 2.8 3.4
969+
1 2 1 2.9 3.8
970+
2 3 1 2.2 2.9
971+
3 1 2 2.0 3.2
972+
4 2 2 1.8 2.8
973+
5 3 2 1.9 2.4
974+
6 1 3 2.2 3.3
975+
7 2 3 2.3 3.4
976+
8 3 3 2.1 2.9
977+
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
978+
>>> l
979+
ht
980+
famid birth age
981+
1 1 1 2.8
982+
2 3.4
983+
2 1 2.9
984+
2 3.8
985+
3 1 2.2
986+
2 2.9
987+
2 1 1 2.0
988+
2 3.2
989+
2 1 1.8
990+
2 2.8
991+
3 1 1.9
992+
2 2.4
993+
3 1 1 2.2
994+
2 3.3
995+
2 1 2.3
996+
2 3.4
997+
3 1 2.1
998+
2 2.9
999+
1000+
Going from long back to wide just takes some creative use of `unstack`
1001+
1002+
>>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
1003+
>>> w.columns = pd.Index(w.columns).str.join('')
1004+
>>> w.reset_index()
1005+
famid birth ht1 ht2
1006+
0 1 1 2.8 3.4
1007+
1 1 2 2.9 3.8
1008+
2 1 3 2.2 2.9
1009+
3 2 1 2.0 3.2
1010+
4 2 2 1.8 2.8
1011+
5 2 3 1.9 2.4
1012+
6 3 1 2.2 3.3
1013+
7 3 2 2.3 3.4
1014+
8 3 3 2.1 2.9
1015+
1016+
Unwieldy column names are also handled
1017+
1018+
>>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
1019+
... 'A(quarterly)-2011': np.random.rand(3),
1020+
... 'B(quarterly)-2010': np.random.rand(3),
1021+
... 'B(quarterly)-2011': np.random.rand(3),
1022+
... 'X' : np.random.randint(3, size=3)})
1023+
>>> df['id'] = df.index
1024+
>>> df
1025+
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
1026+
0 0.531828 0.724455 0.322959 0.293714
1027+
1 0.634401 0.611024 0.361789 0.630976
1028+
2 0.849432 0.722443 0.228263 0.092105
1029+
\
1030+
X id
1031+
0 0 0
1032+
1 1 1
1033+
2 2 2
1034+
>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
1035+
... i='id', j='year', sep='-')
1036+
X A(quarterly) B(quarterly)
1037+
id year
1038+
0 2010 0 0.531828 0.322959
1039+
1 2010 1 0.634401 0.361789
1040+
2 2010 2 0.849432 0.228263
1041+
0 2011 0 0.724455 0.293714
1042+
1 2011 1 0.611024 0.630976
1043+
2 2011 2 0.722443 0.092105
1044+
1045+
If we have many columns, we could also use a regex to find our
1046+
stubnames and pass that list on to wide_to_long
1047+
1048+
>>> stubnames = set([match[0] for match in
1049+
... df.columns.str.findall('[A-B]\(.*\)').values
1050+
... if match != [] ])
1051+
>>> list(stubnames)
1052+
['B(quarterly)', 'A(quarterly)']
1053+
9311054
Notes
9321055
-----
933-
All extra variables are treated as extra id variables. This simply uses
1056+
All extra variables are left untouched. This simply uses
9341057
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
9351058
in a typical case.
9361059
"""
937-
938-
def get_var_names(df, regex):
1060+
def get_var_names(df, stub, sep, suffix):
1061+
regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix)
9391062
return df.filter(regex=regex).columns.tolist()
9401063

941-
def melt_stub(df, stub, i, j):
942-
varnames = get_var_names(df, "^" + stub)
943-
newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
944-
var_name=j)
945-
newdf_j = newdf[j].str.replace(stub, "")
946-
try:
947-
newdf_j = newdf_j.astype(int)
948-
except ValueError:
949-
pass
950-
newdf[j] = newdf_j
951-
return newdf
952-
953-
id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
954-
if i not in id_vars:
955-
id_vars += [i]
956-
957-
newdf = melt_stub(df, stubnames[0], id_vars, j)
958-
959-
for stub in stubnames[1:]:
960-
new = melt_stub(df, stub, id_vars, j)
961-
newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
962-
return newdf.set_index([i, j])
1064+
def melt_stub(df, stub, i, j, value_vars, sep):
1065+
newdf = melt(df, id_vars=i, value_vars=value_vars,
1066+
value_name=stub.rstrip(sep), var_name=j)
1067+
newdf[j] = Categorical(newdf[j])
1068+
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
1069+
1070+
return newdf.set_index(i + [j])
1071+
1072+
if any(map(lambda s: s in df.columns.tolist(), stubnames)):
1073+
raise ValueError("stubname can't be identical to a column name")
1074+
1075+
if not is_list_like(stubnames):
1076+
stubnames = [stubnames]
1077+
else:
1078+
stubnames = list(stubnames)
1079+
1080+
if not is_list_like(i):
1081+
i = [i]
1082+
else:
1083+
i = list(i)
1084+
1085+
value_vars = list(map(lambda stub:
1086+
get_var_names(df, stub, sep, suffix), stubnames))
1087+
1088+
value_vars_flattened = [e for sublist in value_vars for e in sublist]
1089+
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
1090+
1091+
melted = []
1092+
for s, v in zip(stubnames, value_vars):
1093+
melted.append(melt_stub(df, s, i, j, v, sep))
1094+
melted = melted[0].join(melted[1:], how='outer')
1095+
1096+
if len(i) == 1:
1097+
new = df[id_vars].set_index(i).join(melted)
1098+
return new
1099+
1100+
new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
1101+
1102+
return new
9631103

9641104

9651105
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,

0 commit comments

Comments
 (0)