3
3
from pandas .compat import range , zip
4
4
from pandas import compat
5
5
import itertools
6
+ import re
6
7
7
8
import numpy as np
8
9
@@ -875,29 +876,45 @@ def lreshape(data, groups, dropna=True, label=None):
875
876
return DataFrame (mdata , columns = id_cols + pivot_cols )
876
877
877
878
878
- def wide_to_long (df , stubnames , i , j ):
879
+ def wide_to_long (df , stubnames , i , j , sep = "" , numeric_suffix = True ):
879
880
"""
880
881
Wide panel to long format. Less flexible but more user-friendly than melt.
881
882
883
+ With stubnames ['A', 'B'], this function expects to find one or more
884
+ groups of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,...
885
+ You specify what you want to call this suffix in the resulting long format
886
+ with `j` (for example `j`='year')
887
+
888
+ Each row of these wide variables is assumed to be uniquely identified by
889
+ `i` (can be a single column name or a list of column names)
890
+
891
+ All remaining variables in the data frame are left intact.
892
+
882
893
Parameters
883
894
----------
884
895
df : DataFrame
885
896
The wide-format DataFrame
886
- stubnames : list
887
- A list of stub names . The wide format variables are assumed to
897
+ stubnames : list or string
898
+ The stub name(s) . The wide format variables are assumed to
888
899
start with the stub names.
889
- i : str
890
- The name of the id variable.
900
+ i : list or string
901
+ Column(s) to use as id variable(s)
891
902
j : str
892
- The name of the subobservation variable.
893
- stubend : str
894
- Regex to match for the end of the stubs.
903
+ The name of the subobservation variable. What you wish to name your
904
+ suffix in the long format.
905
+ sep : str, default ""
906
+ A character indicating the separation of the variable names
907
+ in the wide format, to be stripped from the names in the long format.
908
+ For example, if your column names are A-suffix1, A-suffix2, you
909
+ can strip the hyphen by specifying `sep`='-'
910
+ numeric_suffix : bool, default True
911
+ Whether the stub suffix is assumed to be numeric or not.
895
912
896
913
Returns
897
914
-------
898
915
DataFrame
899
- A DataFrame that contains each stub name as a variable as well as
900
- variables for i and j.
916
+ A DataFrame that contains each stub name as a variable, with new index
917
+ (i, j)
901
918
902
919
Examples
903
920
--------
@@ -916,7 +933,7 @@ def wide_to_long(df, stubnames, i, j):
916
933
0 a d 2.5 3.2 -1.085631 0
917
934
1 b e 1.2 1.3 0.997345 1
918
935
2 c f 0.7 0.1 0.282978 2
919
- >>> wide_to_long(df, ["A", "B"], i="id", j="year")
936
+ >>> pd. wide_to_long(df, ["A", "B"], i="id", j="year")
920
937
X A B
921
938
id year
922
939
0 1970 -1.085631 a 2.5
@@ -926,38 +943,166 @@ def wide_to_long(df, stubnames, i, j):
926
943
1 1980 0.997345 e 1.3
927
944
2 1980 0.282978 f 0.1
928
945
946
+ With multiple id columns
947
+
948
+ >>> df = pd.DataFrame({
949
+ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
950
+ ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
951
+ ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
952
+ ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
953
+ ... })
954
+ >>> df
955
+ birth famid ht1 ht2
956
+ 0 1 1 2.8 3.4
957
+ 1 2 1 2.9 3.8
958
+ 2 3 1 2.2 2.9
959
+ 3 1 2 2.0 3.2
960
+ 4 2 2 1.8 2.8
961
+ 5 3 2 1.9 2.4
962
+ 6 1 3 2.2 3.3
963
+ 7 2 3 2.3 3.4
964
+ 8 3 3 2.1 2.9
965
+ >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
966
+ >>> l
967
+ ht
968
+ famid birth age
969
+ 1 1 1 2.8
970
+ 2 3.4
971
+ 2 1 2.9
972
+ 2 3.8
973
+ 3 1 2.2
974
+ 2 2.9
975
+ 2 1 1 2.0
976
+ 2 3.2
977
+ 2 1 1.8
978
+ 2 2.8
979
+ 3 1 1.9
980
+ 2 2.4
981
+ 3 1 1 2.2
982
+ 2 3.3
983
+ 2 1 2.3
984
+ 2 3.4
985
+ 3 1 2.1
986
+ 2 2.9
987
+
988
+ Going from long back to wide just takes some creative use of `unstack`
989
+
990
+ >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
991
+ >>> w.columns = [name + suffix for name, suffix in w.columns.tolist()]
992
+ >>> w.reset_index()
993
+ famid birth ht1 ht2
994
+ 0 1 1 2.8 3.4
995
+ 1 1 2 2.9 3.8
996
+ 2 1 3 2.2 2.9
997
+ 3 2 1 2.0 3.2
998
+ 4 2 2 1.8 2.8
999
+ 5 2 3 1.9 2.4
1000
+ 6 3 1 2.2 3.3
1001
+ 7 3 2 2.3 3.4
1002
+ 8 3 3 2.1 2.9
1003
+
1004
+ Less wieldy column names are also handled
1005
+
1006
+ >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3),
1007
+ ... 'A(quarterly)-2011': np.random.rand(3),
1008
+ ... 'B(quarterly)-2010': np.random.rand(3),
1009
+ ... 'B(quarterly)-2011': np.random.rand(3),
1010
+ ... 'X' : np.random.randint(3, size=3)})
1011
+ >>> df['id'] = df.index
1012
+ >>> df
1013
+ A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011
1014
+ 0 0.531828 0.724455 0.322959 0.293714
1015
+ 1 0.634401 0.611024 0.361789 0.630976
1016
+ 2 0.849432 0.722443 0.228263 0.092105
1017
+ \
1018
+ X id
1019
+ 0 0 0
1020
+ 1 1 1
1021
+ 2 2 2
1022
+ >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'],
1023
+ i='id', j='year', sep='-')
1024
+ X A(quarterly) B(quarterly)
1025
+ id year
1026
+ 0 2010 0 0.531828 0.322959
1027
+ 1 2010 2 0.634401 0.361789
1028
+ 2 2010 2 0.849432 0.228263
1029
+ 0 2011 0 0.724455 0.293714
1030
+ 1 2011 2 0.611024 0.630976
1031
+ 2 2011 2 0.722443 0.092105
1032
+
1033
+ If we have many columns, we could also use a regex to find our
1034
+ stubnames and pass that list on to wide_to_long
1035
+
1036
+ >>> stubnames = set([match[0] for match in
1037
+ df.columns.str.findall('[A-B]\(.*\)').values
1038
+ if match != [] ])
1039
+ >>> list(stubnames)
1040
+ ['B(quarterly)', 'A(quarterly)']
1041
+
929
1042
Notes
930
1043
-----
931
- All extra variables are treated as extra id variables . This simply uses
1044
+ All extra variables are left untouched . This simply uses
932
1045
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
933
1046
in a typical case.
934
1047
"""
935
1048
936
1049
def get_var_names (df , regex ):
937
1050
return df .filter (regex = regex ).columns .tolist ()
938
1051
939
- def melt_stub (df , stub , i , j ):
940
- varnames = get_var_names (df , "^" + stub )
941
- newdf = melt (df , id_vars = i , value_vars = varnames , value_name = stub ,
942
- var_name = j )
943
- newdf_j = newdf [j ].str .replace (stub , "" )
944
- try :
945
- newdf_j = newdf_j .astype (int )
946
- except ValueError :
947
- pass
948
- newdf [j ] = newdf_j
949
- return newdf
950
-
951
- id_vars = get_var_names (df , "^(?!%s)" % "|" .join (stubnames ))
952
- if i not in id_vars :
953
- id_vars += [i ]
954
-
955
- newdf = melt_stub (df , stubnames [0 ], id_vars , j )
956
-
957
- for stub in stubnames [1 :]:
958
- new = melt_stub (df , stub , id_vars , j )
959
- newdf = newdf .merge (new , how = "outer" , on = id_vars + [j ], copy = False )
960
- return newdf .set_index ([i , j ])
1052
+ def melt_stub (df , stub , i , j , value_vars , sep ):
1053
+ newdf = melt (df , id_vars = i , value_vars = value_vars ,
1054
+ value_name = stub .rstrip (sep ), var_name = j )
1055
+ newdf [j ] = Categorical (newdf [j ])
1056
+ newdf [j ] = newdf [j ].str .replace (re .escape (stub ), "" )
1057
+
1058
+ return newdf .set_index (i + [j ])
1059
+
1060
+ if any (map (lambda s : s in df .columns .tolist (), stubnames )):
1061
+ raise ValueError ("stubname can't be identical to a column name" )
1062
+
1063
+ if not isinstance (stubnames , list ):
1064
+ stubnames = [stubnames ]
1065
+
1066
+ if not isinstance (i , list ):
1067
+ i = [i ]
1068
+
1069
+ stubs = list (map (lambda x : x + sep , stubnames ))
1070
+
1071
+ # This regex is needed to avoid multiple "greedy" matches with stubs
1072
+ # that have overlapping substrings
1073
+ # For example A2011, A2012 are separate from AA2011, AA2012
1074
+ # And BBone, BBtwo is different from Bone, Btwo, and BBBrating
1075
+ value_vars = list (map (lambda x : get_var_names (
1076
+ df , "^{0}(?!{1})" .format (re .escape (x ), re .escape (x [- 1 ]))), stubs ))
1077
+
1078
+ value_vars_flattened = [e for sublist in value_vars for e in sublist ]
1079
+ id_vars = list (set (df .columns .tolist ()).difference (value_vars_flattened ))
1080
+
1081
+ # If we know the stub end type is a number we can disambiguate potential
1082
+ # misclassified value_vars, for ex, with stubname A: A2011, A2012 and
1083
+ # Arating would all be found as value_vars. If the suffix is numeric we
1084
+ # know the last one should be an id_var. (Note the converse disambiguation
1085
+ # is not possible)
1086
+ if numeric_suffix :
1087
+ for s , v in zip (stubs , value_vars ):
1088
+ for vname in v [:]:
1089
+ end = vname .replace (s , "" )
1090
+ if not end .isdigit ():
1091
+ v .remove (vname )
1092
+ id_vars .append (vname )
1093
+
1094
+ melted = []
1095
+ for s , v in zip (stubs , value_vars ):
1096
+ melted .append (melt_stub (df , s , i , j , v , sep ))
1097
+ melted = melted [0 ].join (melted [1 :], how = 'outer' )
1098
+
1099
+ if len (i ) == 1 :
1100
+ new = df [id_vars ].set_index (i ).join (melted )
1101
+ return new
1102
+
1103
+ new = df [id_vars ].merge (melted .reset_index (), on = i ).set_index (i + [j ])
1104
+
1105
+ return new
961
1106
962
1107
963
1108
def get_dummies (data , prefix = None , prefix_sep = '_' , dummy_na = False ,
0 commit comments