ENH/DOC: wide_to_long performance and docstring clarification #14779
@@ -1,5 +1,5 @@
 from .pandas_vb_common import *
-from pandas.core.reshape import melt
+from pandas.core.reshape import melt, wide_to_long


 class melt_dataframe(object):
@@ -74,3 +74,28 @@ def setup(self):

     def time_unstack_sparse_keyspace(self):
         self.idf.unstack()
+
+
+class wide_to_long_big(object):
+    goal_time = 0.2
+
+    def setup(self):
+        vars = 'ABCD'
+        nyrs = 20
+        nidvars = 20
+        N = 5000
+        yrvars = []
+        for var in vars:
+            for yr in range(1, nyrs + 1):
+                yrvars.append(var + str(yr))
+
+        yearobs = dict(zip(yrvars, np.random.randn(len(yrvars), N)))
+        idobs = dict(zip(range(nidvars), np.random.rand(nidvars, N)))
+
+        self.df = pd.concat([pd.DataFrame(idobs), pd.DataFrame(yearobs)],
+                            axis=1)
Review comment: I think you can also do something like …
+
+        self.vars = vars
+
+    def time_wide_to_long_big(self):
+        self.df['id'] = self.df.index
+        wide_to_long(self.df, list(self.vars), i='id', j='year')
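The review suggestion above is cut off in this view. As a rough sketch of one more concise setup (an assumption on my part, not necessarily what the reviewer had in mind), the random columns could be built directly from 2-D arrays instead of zipping rows into dicts:

```python
import numpy as np
import pandas as pd

# Hypothetical alternative benchmark setup: build the id and year columns
# in single DataFrame calls rather than via dict(zip(...)).
vars, nyrs, nidvars, N = 'ABCD', 20, 20, 5000
yrvars = [v + str(yr) for v in vars for yr in range(1, nyrs + 1)]
yeardf = pd.DataFrame(np.random.randn(N, len(yrvars)), columns=yrvars)
iddf = pd.DataFrame(np.random.rand(N, nidvars), columns=range(nidvars))
df = pd.concat([iddf, yeardf], axis=1)  # same shape as self.df above
```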
@@ -3,6 +3,7 @@
 from pandas.compat import range, zip
 from pandas import compat
 import itertools
+import re

 import numpy as np

@@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None):
     return DataFrame(mdata, columns=id_cols + pivot_cols)


-def wide_to_long(df, stubnames, i, j):
+def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
     """
     Wide panel to long format. Less flexible but more user-friendly than melt.
+
+    With stubnames ['A', 'B'], this function expects to find one or more
+    groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
+    You specify what you want to call this suffix in the resulting long format
+    with `j` (for example `j`='year').
+
+    Each row of these wide variables is assumed to be uniquely identified by
+    `i` (can be a single column name or a list of column names).
+
+    All remaining variables in the data frame are left intact.
+
     Parameters
     ----------
     df : DataFrame
         The wide-format DataFrame
-    stubnames : list
-        A list of stub names. The wide format variables are assumed to
+    stubnames : list or string
+        The stub name(s). The wide format variables are assumed to
         start with the stub names.
-    i : str
-        The name of the id variable.
+    i : list or string
+        Column(s) to use as id variable(s)
     j : str
-        The name of the subobservation variable.
-    stubend : str
-        Regex to match for the end of the stubs.
+        The name of the subobservation variable. What you wish to name your
+        suffix in the long format.
+    sep : str, default ""
+        A character indicating the separation of the variable names
+        in the wide format, to be stripped from the names in the long format.
+        For example, if your column names are A-suffix1, A-suffix2, you
+        can strip the hyphen by specifying `sep`='-'
+    numeric_suffix : bool, default True
+        Whether the stub suffix is assumed to be numeric or not.

Review comment: I would rather call this …
Reply: Yes, that makes more sense

     Returns
     -------
     DataFrame
-        A DataFrame that contains each stub name as a variable as well as
-        variables for i and j.
+        A DataFrame that contains each stub name as a variable, with new index
+        (i, j)

     Examples
     --------
@@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j):
     0  a  d  2.5  3.2 -1.085631   0
     1  b  e  1.2  1.3  0.997345   1
     2  c  f  0.7  0.1  0.282978   2
-    >>> wide_to_long(df, ["A", "B"], i="id", j="year")
+    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
                     X  A    B
     id year
     0  1970 -1.085631  a  2.5
@@ -926,38 +943,166 @@
     1  1980  0.997345  e  1.3
     2  1980  0.282978  f  0.1

+    With multiple id columns
+
+    >>> df = pd.DataFrame({
+    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
+    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
+    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
+    ... })
+    >>> df
+       birth  famid  ht1  ht2
+    0      1      1  2.8  3.4
+    1      2      1  2.9  3.8
+    2      3      1  2.2  2.9
+    3      1      2  2.0  3.2
+    4      2      2  1.8  2.8
+    5      3      2  1.9  2.4
+    6      1      3  2.2  3.3
+    7      2      3  2.3  3.4
+    8      3      3  2.1  2.9
+    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
+    >>> l
+                      ht
+    famid birth age
+    1     1     1    2.8
+                2    3.4
+          2     1    2.9
+                2    3.8
+          3     1    2.2
+                2    2.9
+    2     1     1    2.0
+                2    3.2
+          2     1    1.8
+                2    2.8
+          3     1    1.9
+                2    2.4
+    3     1     1    2.2
+                2    3.3
+          2     1    2.3
+                2    3.4
+          3     1    2.1
+                2    2.9
+
+    Going from long back to wide just takes some creative use of `unstack`
+
+    >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
+    >>> w.columns = [name + suffix for name, suffix in w.columns.tolist()]
Review comment: use this: …
+
+    >>> w.reset_index()
+       famid  birth  ht1  ht2
+    0      1      1  2.8  3.4
+    1      1      2  2.9  3.8
+    2      1      3  2.2  2.9
+    3      2      1  2.0  3.2
+    4      2      2  1.8  2.8
+    5      2      3  1.9  2.4
+    6      3      1  2.2  3.3
+    7      3      2  2.3  3.4
+    8      3      3  2.1  2.9
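The reviewer's suggested snippet above is not visible in this view. One possible flattening of the unstacked columns (an assumption on my part, not necessarily the reviewer's proposal) uses Index.map instead of a list comprehension:

```python
# Hypothetical alternative for flattening the ('ht', <age>) column
# MultiIndex produced by unstack(); assumes `l` from the example above.
w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
w.columns = w.columns.map('{0[0]}{0[1]}'.format)  # ('ht', 1) -> 'ht1'
w = w.reset_index()
```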
+
+    Less wieldy column names are also handled
+
+    >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
+    ...                    'A(quarterly)-2011': np.random.rand(3),
+    ...                    'B(quarterly)-2010': np.random.rand(3),
+    ...                    'B(quarterly)-2011': np.random.rand(3),
+    ...                    'X': np.random.randint(3, size=3)})
+    >>> df['id'] = df.index
+    >>> df
+       A(quarterly)-2010  A(quarterly)-2011  B(quarterly)-2010  B(quarterly)-2011  \
+    0           0.531828           0.724455           0.322959           0.293714
+    1           0.634401           0.611024           0.361789           0.630976
+    2           0.849432           0.722443           0.228263           0.092105
+
+       X  id
+    0  0   0
+    1  1   1
+    2  2   2
+    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
+    ...                 i='id', j='year', sep='-')
+             X  A(quarterly)  B(quarterly)
+    id year
+    0  2010  0      0.531828      0.322959
+    1  2010  2      0.634401      0.361789
+    2  2010  2      0.849432      0.228263
+    0  2011  0      0.724455      0.293714
+    1  2011  2      0.611024      0.630976
+    2  2011  2      0.722443      0.092105
+
+    If we have many columns, we could also use a regex to find our
+    stubnames and pass that list on to wide_to_long
+
+    >>> stubnames = set([match[0] for match in
+    ...                  df.columns.str.findall('[A-B]\(.*\)').values
+    ...                  if match != []])
+    >>> list(stubnames)
+    ['B(quarterly)', 'A(quarterly)']

     Notes
     -----
-    All extra variables are treated as extra id variables. This simply uses
+    All extra variables are left untouched. This simply uses
     `pandas.melt` under the hood, but is hard-coded to "do the right thing"
     in a typical case.
     """

     def get_var_names(df, regex):
         return df.filter(regex=regex).columns.tolist()

-    def melt_stub(df, stub, i, j):
-        varnames = get_var_names(df, "^" + stub)
-        newdf = melt(df, id_vars=i, value_vars=varnames, value_name=stub,
-                     var_name=j)
-        newdf_j = newdf[j].str.replace(stub, "")
-        try:
-            newdf_j = newdf_j.astype(int)
-        except ValueError:
-            pass
-        newdf[j] = newdf_j
-        return newdf
-
-    id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
-    if i not in id_vars:
-        id_vars += [i]
-
-    newdf = melt_stub(df, stubnames[0], id_vars, j)
-
-    for stub in stubnames[1:]:
-        new = melt_stub(df, stub, id_vars, j)
-        newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
-    return newdf.set_index([i, j])
+    def melt_stub(df, stub, i, j, value_vars, sep):
+        newdf = melt(df, id_vars=i, value_vars=value_vars,
+                     value_name=stub.rstrip(sep), var_name=j)
+        newdf[j] = Categorical(newdf[j])
+        newdf[j] = newdf[j].str.replace(re.escape(stub), "")
+
+        return newdf.set_index(i + [j])
+
+    if any(map(lambda s: s in df.columns.tolist(), stubnames)):
+        raise ValueError("stubname can't be identical to a column name")
+
+    if not isinstance(stubnames, list):
+        stubnames = [stubnames]

Review comment: we usually use …
+
+    if not isinstance(i, list):
+        i = [i]
+
+    stubs = list(map(lambda x: x + sep, stubnames))
+
+    # This regex is needed to avoid multiple "greedy" matches with stubs
+    # that have overlapping substrings
+    # For example A2011, A2012 are separate from AA2011, AA2012
+    # And BBone, BBtwo is different from Bone, Btwo, and BBBrating
+    value_vars = list(map(lambda x: get_var_names(
+        df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs))

Review comment: ideally you would just look for a match of a letter followed by a non-letter (or vice versa), I think that is more robust.
Reply: But in the case of string stems, the three groups here will not be captured: … A negative lookahead …
+
+    value_vars_flattened = [e for sublist in value_vars for e in sublist]
+    id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
+
+    # If we know the stub end type is a number we can disambiguate potential
+    # misclassified value_vars, for ex, with stubname A: A2011, A2012 and
+    # Arating would all be found as value_vars. If the suffix is numeric we
+    # know the last one should be an id_var. (Note the converse disambiguation
+    # is not possible)
+    if numeric_suffix:
+        for s, v in zip(stubs, value_vars):
+            for vname in v[:]:
+                end = vname.replace(s, "")
+                if not end.isdigit():
+                    v.remove(vname)
+                    id_vars.append(vname)
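To make the classification above concrete, here is a small illustration (an editor's sketch with made-up column names, not part of the patch) of what the generated pattern and the numeric-suffix pass do for stub 'A' with the default sep='':

```python
import re

# For stub 'A' and sep='' the pattern built above is "^A(?!A)": it keeps
# 'A2011' and 'Arating' but excludes columns of the overlapping stub 'AA'.
stub = "A"
pattern = "^{0}(?!{1})".format(re.escape(stub), re.escape(stub[-1]))
cols = ["A2011", "A2012", "AA2011", "AA2012", "Arating"]
value_vars = [c for c in cols if re.match(pattern, c)]
print(value_vars)  # ['A2011', 'A2012', 'Arating']

# The numeric_suffix pass then moves 'Arating' back to the id variables,
# because its suffix is not a number.
id_vars = [c for c in value_vars if not c.replace(stub, "").isdigit()]
value_vars = [c for c in value_vars if c.replace(stub, "").isdigit()]
print(value_vars, id_vars)  # ['A2011', 'A2012'] ['Arating']
```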
+
+    melted = []
+    for s, v in zip(stubs, value_vars):
+        melted.append(melt_stub(df, s, i, j, v, sep))
+    melted = melted[0].join(melted[1:], how='outer')
+
+    if len(i) == 1:
+        new = df[id_vars].set_index(i).join(melted)
+        return new
+
+    new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
+
+    return new
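None of the docstring examples exercise numeric_suffix=False; here is a short usage sketch of the two new keywords together (an editor's illustration with made-up data, assuming the patch as written):

```python
import pandas as pd

# Sketch: string suffixes need numeric_suffix=False, and sep='_' strips
# the underscore so the 'period' values become 'one' and 'two'.
df = pd.DataFrame({'famid': [1, 2, 3],
                   'ht_one': [2.8, 2.9, 2.2],
                   'ht_two': [3.4, 3.8, 2.9],
                   'wt_one': [40, 45, 41],
                   'wt_two': [55, 60, 52]})
long_df = pd.wide_to_long(df, ['ht', 'wt'], i='famid', j='period',
                          sep='_', numeric_suffix=False)
# long_df is indexed by (famid, period) with columns 'ht' and 'wt'.
```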


 def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
Review comment: Can you fix up the indentation here?