13
13
14
14
import re
15
15
from pandas .core .dtypes .missing import notna
16
+ from pandas .core .tools .numeric import to_numeric
16
17
17
18
18
19
@Appender (_shared_docs ['melt' ] %
@@ -199,6 +200,9 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
199
200
200
201
.. versionadded:: 0.20.0
201
202
203
+ .. versionchanged:: 0.22.0
204
+ When all suffixes are numeric, they are cast to int64/float64.
205
+
202
206
Returns
203
207
-------
204
208
DataFrame
@@ -278,8 +282,8 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
278
282
279
283
Going from long back to wide just takes some creative use of `unstack`
280
284
281
- >>> w = l.reset_index().set_index(['famid', 'birth', 'age']). unstack()
282
- >>> w.columns = pd.Index( w.columns).str.join('' )
285
+ >>> w = l.unstack()
286
+ >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format )
283
287
>>> w.reset_index()
284
288
famid birth ht1 ht2
285
289
0 1 1 2.8 3.4
@@ -333,26 +337,76 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
333
337
>>> list(stubnames)
334
338
['A(quarterly)', 'B(quarterly)']
335
339
340
+ All of the above examples have integers as suffixes. It is possible to
341
+ have non-integers as suffixes.
342
+
343
+ >>> df = pd.DataFrame({
344
+ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
345
+ ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
346
+ ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
347
+ ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
348
+ ... })
349
+ >>> df
350
+ birth famid ht_one ht_two
351
+ 0 1 1 2.8 3.4
352
+ 1 2 1 2.9 3.8
353
+ 2 3 1 2.2 2.9
354
+ 3 1 2 2.0 3.2
355
+ 4 2 2 1.8 2.8
356
+ 5 3 2 1.9 2.4
357
+ 6 1 3 2.2 3.3
358
+ 7 2 3 2.3 3.4
359
+ 8 3 3 2.1 2.9
360
+
361
+ >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
362
+ ...                 sep='_', suffix='\w+')
363
+ >>> l
364
+ ... # doctest: +NORMALIZE_WHITESPACE
365
+ ht
366
+ famid birth age
367
+ 1 1 one 2.8
368
+ two 3.4
369
+ 2 one 2.9
370
+ two 3.8
371
+ 3 one 2.2
372
+ two 2.9
373
+ 2 1 one 2.0
374
+ two 3.2
375
+ 2 one 1.8
376
+ two 2.8
377
+ 3 one 1.9
378
+ two 2.4
379
+ 3 1 one 2.2
380
+ two 3.3
381
+ 2 one 2.3
382
+ two 3.4
383
+ 3 one 2.1
384
+ two 2.9
385
+
336
386
Notes
337
387
-----
338
388
All extra variables are left untouched. This simply uses
339
389
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
340
- in a typicaly case.
390
+ in a typical case.
341
391
"""
342
392
def get_var_names (df , stub , sep , suffix ):
343
- regex = " ^{stub}{sep}{suffix}" .format (
393
+ regex = r' ^{stub}{sep}{suffix}$' .format (
344
394
stub = re .escape (stub ), sep = re .escape (sep ), suffix = suffix )
345
- return df .filter (regex = regex ).columns .tolist ()
395
+ pattern = re .compile (regex )
396
+ return [col for col in df .columns if pattern .match (col )]
346
397
347
398
def melt_stub (df , stub , i , j , value_vars , sep ):
348
399
newdf = melt (df , id_vars = i , value_vars = value_vars ,
349
400
value_name = stub .rstrip (sep ), var_name = j )
350
401
newdf [j ] = Categorical (newdf [j ])
351
402
newdf [j ] = newdf [j ].str .replace (re .escape (stub + sep ), "" )
352
403
404
+ # GH17627 Cast numeric suffixes to int/float
405
+ newdf [j ] = to_numeric (newdf [j ], errors = 'ignore' )
406
+
353
407
return newdf .set_index (i + [j ])
354
408
355
- if any (map ( lambda s : s in df .columns . tolist (), stubnames ) ):
409
+ if any ([ col in stubnames for col in df .columns ] ):
356
410
raise ValueError ("stubname can't be identical to a column name" )
357
411
358
412
if not is_list_like (stubnames ):
@@ -368,8 +422,7 @@ def melt_stub(df, stub, i, j, value_vars, sep):
368
422
if df [i ].duplicated ().any ():
369
423
raise ValueError ("the id variables need to uniquely identify each row" )
370
424
371
- value_vars = list (map (lambda stub :
372
- get_var_names (df , stub , sep , suffix ), stubnames ))
425
+ value_vars = [get_var_names (df , stub , sep , suffix ) for stub in stubnames ]
373
426
374
427
value_vars_flattened = [e for sublist in value_vars for e in sublist ]
375
428
id_vars = list (set (df .columns .tolist ()).difference (value_vars_flattened ))
0 commit comments