Skip to content

Commit 54c5920

Browse files
committed
Specify the suffix with a regex
1 parent 5747a25 commit 54c5920

File tree

1 file changed

+19
-27
lines changed

1 file changed

+19
-27
lines changed

pandas/core/reshape.py

+19-27
Original file line numberDiff line numberDiff line change
@@ -876,7 +876,7 @@ def lreshape(data, groups, dropna=True, label=None):
876876
return DataFrame(mdata, columns=id_cols + pivot_cols)
877877

878878

879-
def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
879+
def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'):
880880
"""
881881
Wide panel to long format. Less flexible but more user-friendly than melt.
882882
@@ -907,8 +907,10 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
907907
in the wide format, to be stripped from the names in the long format.
908908
For example, if your column names are A-suffix1, A-suffix2, you
909909
can strip the hyphen by specifying `sep`='-'
910-
numeric_suffix : bool, default True
911-
Whether the stub suffix is assumed to be numeric or not.
910+
suffix : str, default '\d+'
911+
A regular expression capturing the wanted suffixes. '\d+' captures
912+
numeric suffixes. Suffixes with no numbers could be specified with the
913+
negated character class '\D+'.
912914
913915
Returns
914916
-------
@@ -1045,15 +1047,24 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
10451047
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
10461048
in a typical case.
10471049
"""
1050+
def get_var_names(df, stub, sep, suffix):
1051+
# The first part of this regex is needed to avoid multiple "greedy"
1052+
# matches with stubs that have overlapping substrings. For example
1053+
# A2011, A2012 are separate from AA2011, AA2012. And BBone, BBtwo is
1054+
# different from Bone, Btwo, and BBBrating
1055+
# The last part lets us disambiguate suffixes. For example, with
1056+
# stubname A: (A2011, A2012) would be captured while Arating would
1057+
# be ignored by the numeric class \d+
1058+
regex = "^{0}(?!{1}){2}{3}".format(
1059+
re.escape(stub), re.escape(stub[-1]), re.escape(sep), suffix)
10481060

1049-
def get_var_names(df, regex):
10501061
return df.filter(regex=regex).columns.tolist()
10511062

10521063
def melt_stub(df, stub, i, j, value_vars, sep):
10531064
newdf = melt(df, id_vars=i, value_vars=value_vars,
10541065
value_name=stub.rstrip(sep), var_name=j)
10551066
newdf[j] = Categorical(newdf[j])
1056-
newdf[j] = newdf[j].str.replace(re.escape(stub), "")
1067+
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
10571068

10581069
return newdf.set_index(i + [j])
10591070

@@ -1066,33 +1077,14 @@ def melt_stub(df, stub, i, j, value_vars, sep):
10661077
if not isinstance(i, list):
10671078
i = [i]
10681079

1069-
stubs = list(map(lambda x: x + sep, stubnames))
1070-
1071-
# This regex is needed to avoid multiple "greedy" matches with stubs
1072-
# that have overlapping substrings
1073-
# For example A2011, A2012 are separate from AA2011, AA2012
1074-
# And BBone, BBtwo is different from Bone, Btwo, and BBBrating
1075-
value_vars = list(map(lambda x: get_var_names(
1076-
df, "^{0}(?!{1})".format(re.escape(x), re.escape(x[-1]))), stubs))
1080+
value_vars = list(map(lambda stub:
1081+
get_var_names(df, stub, sep, suffix), stubnames))
10771082

10781083
value_vars_flattened = [e for sublist in value_vars for e in sublist]
10791084
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
10801085

1081-
# If we know the stub end type is a number we can disambiguate potential
1082-
# misclassified value_vars, for ex, with stubname A: A2011, A2012 and
1083-
# Arating would all be found as value_vars. If the suffix is numeric we
1084-
# know the last one should be an id_var. (Note the converse disambiguation
1085-
# is not possible)
1086-
if numeric_suffix:
1087-
for s, v in zip(stubs, value_vars):
1088-
for vname in v[:]:
1089-
end = vname.replace(s, "")
1090-
if not end.isdigit():
1091-
v.remove(vname)
1092-
id_vars.append(vname)
1093-
10941086
melted = []
1095-
for s, v in zip(stubs, value_vars):
1087+
for s, v in zip(stubnames, value_vars):
10961088
melted.append(melt_stub(df, s, i, j, v, sep))
10971089
melted = melted[0].join(melted[1:], how='outer')
10981090

0 commit comments

Comments
 (0)