@@ -876,7 +876,7 @@ def lreshape(data, groups, dropna=True, label=None):
876
876
return DataFrame (mdata , columns = id_cols + pivot_cols )
877
877
878
878
879
- def wide_to_long (df , stubnames , i , j , sep = "" , numeric_suffix = True ):
879
+ def wide_to_long (df , stubnames , i , j , sep = "" , suffix = '\d+' ):
880
880
"""
881
881
Wide panel to long format. Less flexible but more user-friendly than melt.
882
882
@@ -907,8 +907,10 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
907
907
in the wide format, to be stripped from the names in the long format.
908
908
For example, if your column names are A-suffix1, A-suffix2, you
909
909
can strip the hyphen by specifying `sep`='-'
910
- numeric_suffix : bool, default True
911
- Whether the stub suffix is assumed to be numeric or not.
910
+ suffix : str, default '\d+'
911
+ A regular expression capturing the wanted suffixes. '\d+' captures
912
+ numeric suffixes. Suffixes with no numbers could be specified with the
913
+ negated character class '\D+'.
912
914
913
915
Returns
914
916
-------
@@ -1045,15 +1047,24 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
1045
1047
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
1046
1048
in a typical case.
1047
1049
"""
1050
def get_var_names(df, stub, sep, suffix):
    """Return the columns of ``df`` named ``<stub><sep><suffix>``.

    Parameters
    ----------
    df : DataFrame
        Wide-format frame whose column labels are searched.
    stub : str
        Stub name the column label must start with (matched literally).
    sep : str
        Separator expected between stub and suffix (matched literally,
        may be empty).
    suffix : str
        Regular expression that the remainder of the column label must
        match in full.

    Returns
    -------
    list
        The matching column labels.
    """
    # The negative lookahead is needed to avoid multiple "greedy" matches
    # with stubs that have overlapping substrings. For example A2011, A2012
    # are separate from AA2011, AA2012. And BBone, BBtwo is different from
    # Bone, Btwo, and BBBrating. An empty stub has no last character to
    # guard against, so the lookahead is simply omitted in that case.
    guard = "(?!{0})".format(re.escape(stub[-1])) if stub else ""

    # The suffix part lets us disambiguate columns. For example, with
    # stubname A: (A2011, A2012) are captured while Arating is ignored by
    # the numeric class \d+.
    # The suffix is wrapped in a non-capturing group so that an alternation
    # such as 'one|two' stays local to the suffix instead of splitting the
    # whole pattern, and the trailing `$` forces the entire label to match
    # so that e.g. A2011_note is not mistaken for a value column.
    regex = "^{0}{1}{2}(?:{3})$".format(
        re.escape(stub), guard, re.escape(sep), suffix)

    return df.filter(regex=regex).columns.tolist()
1051
1062
1052
1063
def melt_stub(df, stub, i, j, value_vars, sep):
    """Melt the columns belonging to one stub into a single long column.

    Parameters
    ----------
    df : DataFrame
        Wide-format frame.
    stub : str
        Stub name shared by ``value_vars``.
    i : list
        Column(s) kept as identifier variables.
    j : str
        Name given to the column holding the suffixes.
    value_vars : list
        Columns to unpivot. Every label is expected to start with
        ``stub + sep``.
    sep : str
        Separator between stub and suffix in the wide column names.

    Returns
    -------
    DataFrame
        Melted frame indexed by ``i + [j]``.
    """
    # NOTE: str.rstrip treats `sep` as a set of characters, not a suffix.
    newdf = melt(df, id_vars=i, value_vars=value_vars,
                 value_name=stub.rstrip(sep), var_name=j)
    newdf[j] = Categorical(newdf[j])

    # Drop only the leading "<stub><sep>" from each label. An unanchored
    # replace would also delete later occurrences of the same text inside
    # the suffix (stub "a" would turn "abanana" into "bnn"); slicing by the
    # prefix length avoids that and needs no regex handling at all.
    newdf[j] = newdf[j].str.slice(len(stub + sep))

    return newdf.set_index(i + [j])
1059
1070
@@ -1066,33 +1077,14 @@ def melt_stub(df, stub, i, j, value_vars, sep):
1066
1077
if not isinstance (i , list ):
1067
1078
i = [i ]
1068
1079
1069
- stubs = list (map (lambda x : x + sep , stubnames ))
1070
-
1071
- # This regex is needed to avoid multiple "greedy" matches with stubs
1072
- # that have overlapping substrings
1073
- # For example A2011, A2012 are separate from AA2011, AA2012
1074
- # And BBone, BBtwo is different from Bone, Btwo, and BBBrating
1075
- value_vars = list (map (lambda x : get_var_names (
1076
- df , "^{0}(?!{1})" .format (re .escape (x ), re .escape (x [- 1 ]))), stubs ))
1080
+ value_vars = list (map (lambda stub :
1081
+ get_var_names (df , stub , sep , suffix ), stubnames ))
1077
1082
1078
1083
value_vars_flattened = [e for sublist in value_vars for e in sublist ]
1079
1084
id_vars = list (set (df .columns .tolist ()).difference (value_vars_flattened ))
1080
1085
1081
- # If we know the stub end type is a number we can disambiguate potential
1082
- # misclassified value_vars, for ex, with stubname A: A2011, A2012 and
1083
- # Arating would all be found as value_vars. If the suffix is numeric we
1084
- # know the last one should be an id_var. (Note the converse disambiguation
1085
- # is not possible)
1086
- if numeric_suffix :
1087
- for s , v in zip (stubs , value_vars ):
1088
- for vname in v [:]:
1089
- end = vname .replace (s , "" )
1090
- if not end .isdigit ():
1091
- v .remove (vname )
1092
- id_vars .append (vname )
1093
-
1094
1086
melted = []
1095
- for s , v in zip (stubs , value_vars ):
1087
+ for s , v in zip (stubnames , value_vars ):
1096
1088
melted .append (melt_stub (df , s , i , j , v , sep ))
1097
1089
melted = melted [0 ].join (melted [1 :], how = 'outer' )
1098
1090
0 commit comments