Skip to content

Commit 5747a25

Browse files
committed
ENH/DOC: wide_to_long performance and functionality improvements (#14779)
Speed up by avoiding big copies, and regex on categorical column. Add functionality to deal with "pathological" input. Add docstring examples and more test cases.
1 parent 06f26b5 commit 5747a25

File tree

5 files changed

+406
-36
lines changed

5 files changed

+406
-36
lines changed

asv_bench/benchmarks/reshape.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .pandas_vb_common import *
2-
from pandas.core.reshape import melt
2+
from pandas.core.reshape import melt, wide_to_long
33

44

55
class melt_dataframe(object):
@@ -74,3 +74,28 @@ def setup(self):
7474

7575
def time_unstack_sparse_keyspace(self):
7676
self.idf.unstack()
77+
78+
79+
class wide_to_long_big(object):
    """Benchmark ``wide_to_long`` on a wide frame with many stub columns.

    The frame built in ``setup`` has 5000 rows, 20 integer-named columns
    that are not stub columns, 80 stub columns (stubs ``A``-``D`` with
    suffixes ``1``-``20``) and an ``id`` column copied from the index.
    """
    goal_time = 0.2

    def setup(self):
        """Build the wide input frame outside of the timed region."""
        stubs = 'ABCD'
        nyrs = 20
        nidvars = 20
        N = 5000
        # Stub-major order: A1..A20, B1..B20, C1..C20, D1..D20
        yrvars = [stub + str(yr)
                  for stub in stubs
                  for yr in range(1, nyrs + 1)]

        yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N)))
        idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N)))

        self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)],
                            axis=1)
        # Create the id column here rather than in the timed method, so the
        # benchmark measures only the wide_to_long call itself.
        self.df['id'] = self.df.index
        self.vars = stubs

    def time_wide_to_long_big(self):
        """Time reshaping the frame to long format for all four stubs."""
        wide_to_long(self.df, list(self.vars), i='id', j='year')

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ Data manipulations
156156
concat
157157
get_dummies
158158
factorize
159+
wide_to_long
159160

160161
Top-level missing data
161162
~~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ Removal of prior version deprecations/changes
8888
Performance Improvements
8989
~~~~~~~~~~~~~~~~~~~~~~~~
9090

91+
- Improved performance of ``pd.wide_to_long()`` (:issue:`14779`)
9192

9293

9394

pandas/core/reshape.py

+179-34
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from pandas.compat import range, zip
44
from pandas import compat
55
import itertools
6+
import re
67

78
import numpy as np
89

@@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None):
875876
return DataFrame(mdata, columns=id_cols + pivot_cols)
876877

877878

878-
def wide_to_long(df, stubnames, i, j):
879+
def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
879880
"""
880881
Wide panel to long format. Less flexible but more user-friendly than melt.
881882
883+
With stubnames ['A', 'B'], this function expects to find one or more
884+
groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
885+
You specify what you want to call this suffix in the resulting long format
886+
with `j` (for example `j`='year')
887+
888+
Each row of these wide variables is assumed to be uniquely identified by
889+
`i` (can be a single column name or a list of column names)
890+
891+
All remaining variables in the data frame are left intact.
892+
882893
Parameters
883894
----------
884895
df : DataFrame
885896
The wide-format DataFrame
886-
stubnames : list
887-
A list of stub names. The wide format variables are assumed to
897+
stubnames : list or string
898+
The stub name(s). The wide format variables are assumed to
888899
start with the stub names.
889-
i : str
890-
The name of the id variable.
900+
i : list or string
901+
Column(s) to use as id variable(s)
891902
j : str
892-
The name of the subobservation variable.
893-
stubend : str
894-
Regex to match for the end of the stubs.
903+
The name of the subobservation variable. What you wish to name your
904+
suffix in the long format.
905+
sep : str, default ""
906+
A character indicating the separation of the variable names
907+
in the wide format, to be stripped from the names in the long format.
908+
For example, if your column names are A-suffix1, A-suffix2, you
909+
can strip the hyphen by specifying `sep`='-'
910+
numeric_suffix : bool, default True
911+
Whether the stub suffix is assumed to be numeric or not.
895912
896913
Returns
897914
-------
898915
DataFrame
899-
A DataFrame that contains each stub name as a variable as well as
900-
variables for i and j.
916+
A DataFrame that contains each stub name as a variable, with new index
917+
(i, j)
901918
902919
Examples
903920
--------
@@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j):
916933
0 a d 2.5 3.2 -1.085631 0
917934
1 b e 1.2 1.3 0.997345 1
918935
2 c f 0.7 0.1 0.282978 2
919-
>>> wide_to_long(df, ["A", "B"], i="id", j="year")
936+
>>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
920937
X A B
921938
id year
922939
0 1970 -1.085631 a 2.5
@@ -926,38 +943,166 @@ def wide_to_long(df, stubnames, i, j):
926943
1 1980 0.997345 e 1.3
927944
2 1980 0.282978 f 0.1
928945
946+
With multiple id columns
947+
948+
>>> df = pd.DataFrame({
949+
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
950+
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
951+
... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
952+
... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
953+
... })
954+
>>> df
955+
birth famid ht1 ht2
956+
0 1 1 2.8 3.4
957+
1 2 1 2.9 3.8
958+
2 3 1 2.2 2.9
959+
3 1 2 2.0 3.2
960+
4 2 2 1.8 2.8
961+
5 3 2 1.9 2.4
962+
6 1 3 2.2 3.3
963+
7 2 3 2.3 3.4
964+
8 3 3 2.1 2.9
965+
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
966+
>>> l
967+
ht
968+
famid birth age
969+
1 1 1 2.8
970+
2 3.4
971+
2 1 2.9
972+
2 3.8
973+
3 1 2.2
974+
2 2.9
975+
2 1 1 2.0
976+
2 3.2
977+
2 1 1.8
978+
2 2.8
979+
3 1 1.9
980+
2 2.4
981+
3 1 1 2.2
982+
2 3.3
983+
2 1 2.3
984+
2 3.4
985+
3 1 2.1
986+
2 2.9
987+
988+
Going from long back to wide just takes some creative use of `unstack`
989+
990+
>>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
991+
>>> w.columns = [name + suffix for name, suffix in w.columns.tolist()]
992+
>>> w.reset_index()
993+
famid birth ht1 ht2
994+
0 1 1 2.8 3.4
995+
1 1 2 2.9 3.8
996+
2 1 3 2.2 2.9
997+
3 2 1 2.0 3.2
998+
4 2 2 1.8 2.8
999+
5 2 3 1.9 2.4
1000+
6 3 1 2.2 3.3
1001+
7 3 2 2.3 3.4
1002+
8 3 3 2.1 2.9
1003+
1004+
Unwieldy column names are also handled
1005+
1006+
>>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
1007+
... 'A(quarterly)-2011': np.random.rand(3),
1008+
... 'B(quarterly)-2010': np.random.rand(3),
1009+
... 'B(quarterly)-2011': np.random.rand(3),
1010+
... 'X' : np.random.randint(3, size=3)})
1011+
>>> df['id'] = df.index
1012+
>>> df
1013+
A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
1014+
0 0.531828 0.724455 0.322959 0.293714
1015+
1 0.634401 0.611024 0.361789 0.630976
1016+
2 0.849432 0.722443 0.228263 0.092105
1017+
\
1018+
X id
1019+
0 0 0
1020+
1 1 1
1021+
2 2 2
1022+
>>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
1023+
i='id', j='year', sep='-')
1024+
X A(quarterly) B(quarterly)
1025+
id year
1026+
0 2010 0 0.531828 0.322959
1027+
1 2010 2 0.634401 0.361789
1028+
2 2010 2 0.849432 0.228263
1029+
0 2011 0 0.724455 0.293714
1030+
1 2011 2 0.611024 0.630976
1031+
2 2011 2 0.722443 0.092105
1032+
1033+
If we have many columns, we could also use a regex to find our
1034+
stubnames and pass that list on to wide_to_long
1035+
1036+
>>> stubnames = set([match[0] for match in
1037+
df.columns.str.findall('[A-B]\(.*\)').values
1038+
if match != [] ])
1039+
>>> list(stubnames)
1040+
['B(quarterly)', 'A(quarterly)']
1041+
9291042
Notes
9301043
-----
931-
All extra variables are treated as extra id variables. This simply uses
1044+
All extra variables are left untouched. This simply uses
9321045
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
9331046
in a typical case.
9341047
"""
9351048

9361049
def get_var_names(df, regex):
9371050
return df.filter(regex=regex).columns.tolist()
9381051

939-
def melt_stub(df, stub, i, j):
940-
varnames = get_var_names(df, "^" + stub)
941-
newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
942-
var_name=j)
943-
newdf_j = newdf[j].str.replace(stub, "")
944-
try:
945-
newdf_j = newdf_j.astype(int)
946-
except ValueError:
947-
pass
948-
newdf[j] = newdf_j
949-
return newdf
950-
951-
id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
952-
if i not in id_vars:
953-
id_vars += [i]
954-
955-
newdf = melt_stub(df, stubnames[0], id_vars, j)
956-
957-
for stub in stubnames[1:]:
958-
new = melt_stub(df, stub, id_vars, j)
959-
newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
960-
return newdf.set_index([i, j])
1052+
def melt_stub(df, stub, i, j, value_vars, sep):
1053+
newdf = melt(df, id_vars=i, value_vars=value_vars,
1054+
value_name=stub.rstrip(sep), var_name=j)
1055+
newdf[j] = Categorical(newdf[j])
1056+
newdf[j] = newdf[j].str.replace(re.escape(stub), "")
1057+
1058+
return newdf.set_index(i + [j])
1059+
1060+
if any(map(lambda s: s in df.columns.tolist(), stubnames)):
1061+
raise ValueError("stubname can't be identical to a column name")
1062+
1063+
if not isinstance(stubnames, list):
1064+
stubnames = [stubnames]
1065+
1066+
if not isinstance(i, list):
1067+
i = [i]
1068+
1069+
stubs = list(map(lambda x: x + sep, stubnames))
1070+
1071+
# This regex is needed to avoid multiple "greedy" matches with stubs
1072+
# that have overlapping substrings
1073+
# For example A2011, A2012 are separate from AA2011, AA2012
1074+
# And BBone, BBtwo is different from Bone, Btwo, and BBBrating
1075+
value_vars = list(map(lambda x: get_var_names(
1076+
df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs))
1077+
1078+
value_vars_flattened = [e for sublist in value_vars for e in sublist]
1079+
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
1080+
1081+
# If we know the stub end type is a number we can disambiguate potential
1082+
# misclassified value_vars, for ex, with stubname A: A2011, A2012 and
1083+
# Arating would all be found as value_vars. If the suffix is numeric we
1084+
# know the last one should be an id_var. (Note the converse disambiguation
1085+
# is not possible)
1086+
if numeric_suffix:
1087+
for s, v in zip(stubs, value_vars):
1088+
for vname in v[:]:
1089+
end = vname.replace(s, "")
1090+
if not end.isdigit():
1091+
v.remove(vname)
1092+
id_vars.append(vname)
1093+
1094+
melted = []
1095+
for s, v in zip(stubs, value_vars):
1096+
melted.append(melt_stub(df, s, i, j, v, sep))
1097+
melted = melted[0].join(melted[1:], how='outer')
1098+
1099+
if len(i) == 1:
1100+
new = df[id_vars].set_index(i).join(melted)
1101+
return new
1102+
1103+
new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
1104+
1105+
return new
9611106

9621107

9631108
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,

0 commit comments

Comments
 (0)