Skip to content

Commit a259b64

Browse files
tdpetrou authored and jorisvandenbossche committed
BUG: coerce pd.wide_to_long suffixes to ints (#17628)
1 parent 2db1cc0 commit a259b64

File tree

3 files changed

+198
-60
lines changed

3 files changed

+198
-60
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ Other API Changes
191191
- Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
192192
- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`)
193193
- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`)
194+
- :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`)
194195

195196
.. _whatsnew_0220.deprecations:
196197

pandas/core/reshape/melt.py

+61-8
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import re
1515
from pandas.core.dtypes.missing import notna
16+
from pandas.core.tools.numeric import to_numeric
1617

1718

1819
@Appender(_shared_docs['melt'] %
@@ -199,6 +200,9 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
199200
200201
.. versionadded:: 0.20.0
201202
203+
.. versionchanged:: 0.22.0
204+
When all suffixes are numeric, they are cast to int64/float64.
205+
202206
Returns
203207
-------
204208
DataFrame
@@ -278,8 +282,8 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
278282
279283
Going from long back to wide just takes some creative use of `unstack`
280284
281-
>>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack()
282-
>>> w.columns = pd.Index(w.columns).str.join('')
285+
>>> w = l.unstack()
286+
>>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
283287
>>> w.reset_index()
284288
famid birth ht1 ht2
285289
0 1 1 2.8 3.4
@@ -333,26 +337,76 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
333337
>>> list(stubnames)
334338
['A(quarterly)', 'B(quarterly)']
335339
340+
All of the above examples have integers as suffixes. It is possible to
341+
have non-integers as suffixes.
342+
343+
>>> df = pd.DataFrame({
344+
... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
345+
... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
346+
... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
347+
... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
348+
... })
349+
>>> df
350+
birth famid ht_one ht_two
351+
0 1 1 2.8 3.4
352+
1 2 1 2.9 3.8
353+
2 3 1 2.2 2.9
354+
3 1 2 2.0 3.2
355+
4 2 2 1.8 2.8
356+
5 3 2 1.9 2.4
357+
6 1 3 2.2 3.3
358+
7 2 3 2.3 3.4
359+
8 3 3 2.1 2.9
360+
361+
>>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
362+
...                     sep='_', suffix=r'\w+')
363+
>>> l
364+
... # doctest: +NORMALIZE_WHITESPACE
365+
ht
366+
famid birth age
367+
1 1 one 2.8
368+
two 3.4
369+
2 one 2.9
370+
two 3.8
371+
3 one 2.2
372+
two 2.9
373+
2 1 one 2.0
374+
two 3.2
375+
2 one 1.8
376+
two 2.8
377+
3 one 1.9
378+
two 2.4
379+
3 1 one 2.2
380+
two 3.3
381+
2 one 2.3
382+
two 3.4
383+
3 one 2.1
384+
two 2.9
385+
336386
Notes
337387
-----
338388
All extra variables are left untouched. This simply uses
339389
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
340-
in a typicaly case.
390+
in a typical case.
341391
"""
342392
def get_var_names(df, stub, sep, suffix):
343-
regex = "^{stub}{sep}{suffix}".format(
393+
regex = r'^{stub}{sep}{suffix}$'.format(
344394
stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
345-
return df.filter(regex=regex).columns.tolist()
395+
pattern = re.compile(regex)
396+
return [col for col in df.columns if pattern.match(col)]
346397

347398
def melt_stub(df, stub, i, j, value_vars, sep):
348399
newdf = melt(df, id_vars=i, value_vars=value_vars,
349400
value_name=stub.rstrip(sep), var_name=j)
350401
newdf[j] = Categorical(newdf[j])
351402
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
352403

404+
# GH17627 Cast numeric suffixes to int/float
405+
newdf[j] = to_numeric(newdf[j], errors='ignore')
406+
353407
return newdf.set_index(i + [j])
354408

355-
if any(map(lambda s: s in df.columns.tolist(), stubnames)):
409+
if any([col in stubnames for col in df.columns]):
356410
raise ValueError("stubname can't be identical to a column name")
357411

358412
if not is_list_like(stubnames):
@@ -368,8 +422,7 @@ def melt_stub(df, stub, i, j, value_vars, sep):
368422
if df[i].duplicated().any():
369423
raise ValueError("the id variables need to uniquely identify each row")
370424

371-
value_vars = list(map(lambda stub:
372-
get_var_names(df, stub, sep, suffix), stubnames))
425+
value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
373426

374427
value_vars_flattened = [e for sublist in value_vars for e in sublist]
375428
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

0 commit comments

Comments
 (0)