3
3
from pandas .compat import range , zip
4
4
from pandas import compat
5
5
import itertools
6
+ import re
6
7
7
8
import numpy as np
8
9
@@ -877,29 +878,55 @@ def lreshape(data, groups, dropna=True, label=None):
877
878
return DataFrame (mdata , columns = id_cols + pivot_cols )
878
879
879
880
880
def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
    r"""
    Wide panel to long format. Less flexible but more user-friendly than melt.

    With stubnames ['A', 'B'], this function expects to find one or more
    groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
    You specify what you want to call this suffix in the resulting long format
    with `j` (for example `j='year'`)

    Each row of these wide variables is assumed to be uniquely identified by
    `i` (can be a single column name or a list of column names)

    All remaining variables in the data frame are left intact.

    Parameters
    ----------
    df : DataFrame
        The wide-format DataFrame
    stubnames : str or list-like
        The stub name(s). The wide format variables are assumed to
        start with the stub names.
    i : str or list-like
        Column(s) to use as id variable(s)
    j : str
        The name of the subobservation variable. What you wish to name your
        suffix in the long format.
    sep : str, default ""
        A character indicating the separation of the variable names
        in the wide format, to be stripped from the names in the long format.
        For example, if your column names are A-suffix1, A-suffix2, you
        can strip the hyphen by specifying `sep='-'`

        .. versionadded:: 0.20.0

    suffix : str, default '\\d+'
        A regular expression capturing the wanted suffixes. '\\d+' captures
        numeric suffixes. Suffixes with no numbers could be specified with the
        negated character class '\\D+'. You can also further disambiguate
        suffixes, for example, if your wide variables are of the form
        Aone, Btwo,.., and you have an unrelated column Arating, you can
        ignore the last one by specifying `suffix='(!?one|two)'`

        .. versionadded:: 0.20.0

    Returns
    -------
    DataFrame
        A DataFrame that contains each stub name as a variable, with new index
        (i, j)

    Raises
    ------
    ValueError
        If any stub name is identical to an existing column name.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"A1970": ["a", "b", "c"],
    ...                    "A1980": ["d", "e", "f"],
    ...                    "B1970": [2.5, 1.2, 0.7],
    ...                    "B1980": [3.2, 1.3, 0.1],
    ...                    "X": [-1.085631, 0.997345, 0.282978]})
    >>> df["id"] = df.index
    >>> df
      A1970 A1980  B1970  B1980         X  id
    0     a     d    2.5    3.2 -1.085631   0
    1     b     e    1.2    1.3  0.997345   1
    2     c     f    0.7    0.1  0.282978   2
    >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
                    X  A    B
    id year
    0  1970 -1.085631  a  2.5
    1  1970  0.997345  b  1.2
    2  1970  0.282978  c  0.7
    0  1980 -1.085631  d  3.2
    1  1980  0.997345  e  1.3
    2  1980  0.282978  f  0.1

    With multiple id columns

    >>> df = pd.DataFrame({
    ...     'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    ...     'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...     'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
    ...     'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    ... })
    >>> df
       birth  famid  ht1  ht2
    0      1      1  2.8  3.4
    1      2      1  2.9  3.8
    2      3      1  2.2  2.9
    3      1      2  2.0  3.2
    4      2      2  1.8  2.8
    5      3      2  1.9  2.4
    6      1      3  2.2  3.3
    7      2      3  2.3  3.4
    8      3      3  2.1  2.9
    >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
    >>> l
                      ht
    famid birth age
    1     1     1    2.8
                2    3.4
          2     1    2.9
                2    3.8
          3     1    2.2
                2    2.9
    2     1     1    2.0
                2    3.2
          2     1    1.8
                2    2.8
          3     1    1.9
                2    2.4
    3     1     1    2.2
                2    3.3
          2     1    2.3
                2    3.4
          3     1    2.1
                2    2.9

    Going from long back to wide just takes some creative use of `unstack`

    >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
    >>> w.columns = pd.Index(w.columns).str.join('')
    >>> w.reset_index()
       famid  birth  ht1  ht2
    0      1      1  2.8  3.4
    1      1      2  2.9  3.8
    2      1      3  2.2  2.9
    3      2      1  2.0  3.2
    4      2      2  1.8  2.8
    5      2      3  1.9  2.4
    6      3      1  2.2  3.3
    7      3      2  2.3  3.4
    8      3      3  2.1  2.9

    Less wieldy column names are also handled

    >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
    ...                    'A(quarterly)-2011': np.random.rand(3),
    ...                    'B(quarterly)-2010': np.random.rand(3),
    ...                    'B(quarterly)-2011': np.random.rand(3),
    ...                    'X' : np.random.randint(3, size=3)})
    >>> df['id'] = df.index
    >>> df
       A(quarterly)-2010  A(quarterly)-2011  B(quarterly)-2010  B(quarterly)-2011  \
    0           0.531828           0.724455           0.322959           0.293714
    1           0.634401           0.611024           0.361789           0.630976
    2           0.849432           0.722443           0.228263           0.092105

       X  id
    0  0   0
    1  1   1
    2  2   2
    >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
    ...                 i='id', j='year', sep='-')
             X  A(quarterly)  B(quarterly)
    id year
    0  2010  0      0.531828      0.322959
    1  2010  1      0.634401      0.361789
    2  2010  2      0.849432      0.228263
    0  2011  0      0.724455      0.293714
    1  2011  1      0.611024      0.630976
    2  2011  2      0.722443      0.092105

    If we have many columns, we could also use a regex to find our
    stubnames and pass that list on to wide_to_long

    >>> stubnames = set([match[0] for match in
    ...                  df.columns.str.findall('[A-B]\(.*\)').values
    ...                  if match != [] ])
    >>> list(stubnames)
    ['B(quarterly)', 'A(quarterly)']

    Notes
    -----
    All extra variables are left untouched. This simply uses
    `pandas.melt` under the hood, but is hard-coded to "do the right thing"
    in a typical case.

    The values of `j` in the result are the column-name suffixes kept as
    strings; no numeric conversion is attempted.
    """
    def get_var_names(df, stub, sep, suffix):
        # Columns belonging to a stub: the literal stub and separator
        # (escaped so regex metacharacters in them are harmless) followed
        # by the caller-supplied suffix pattern.
        regex = "^{0}{1}{2}".format(re.escape(stub), re.escape(sep), suffix)
        return df.filter(regex=regex).columns.tolist()

    def melt_stub(df, stub, i, j, value_vars, sep):
        # Melt one stub's columns into a single value column indexed by
        # the id column(s) plus the suffix.
        newdf = melt(df, id_vars=i, value_vars=value_vars,
                     value_name=stub.rstrip(sep), var_name=j)
        newdf[j] = Categorical(newdf[j])
        # Every name in value_vars starts with stub + sep (the regex is
        # anchored), so drop exactly that prefix.  Replacing the stub text
        # wherever it occurs would also mangle suffixes that happen to
        # contain it (stub 'A', column 'ABA' -> 'B' instead of 'BA').
        newdf[j] = newdf[j].str.slice(len(stub + sep))

        return newdf.set_index(i + [j])

    # Normalise before validating: iterating a bare string would compare
    # its individual characters against the column names.
    if not is_list_like(stubnames):
        stubnames = [stubnames]
    else:
        stubnames = list(stubnames)

    columns = df.columns.tolist()
    if any(stub in columns for stub in stubnames):
        raise ValueError("stubname can't be identical to a column name")

    if not is_list_like(i):
        i = [i]
    else:
        i = list(i)

    value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

    # Everything that is not a wide variable is carried along untouched.
    # Walk the columns in frame order (dropping repeats) so the result's
    # column order is deterministic rather than set-iteration order.
    seen = set(e for sublist in value_vars for e in sublist)
    id_vars = []
    for col in columns:
        if col not in seen:
            seen.add(col)
            id_vars.append(col)

    melted = [melt_stub(df, s, i, j, v, sep)
              for s, v in zip(stubnames, value_vars)]
    melted = melted[0].join(melted[1:], how='outer')

    if len(i) == 1:
        # A single id column can be joined directly on the shared index
        # level of the melted frame.
        return df[id_vars].set_index(i).join(melted)

    return df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
963
1103
964
1104
965
1105
def get_dummies (data , prefix = None , prefix_sep = '_' , dummy_na = False ,
0 commit comments