Skip to content

Commit ad359f6

Browse files
committed
BUG: coerce pd.wide_to_long suffixes to numeric
1 parent 45a795e commit ad359f6

File tree

3 files changed

+151
-59
lines changed

3 files changed

+151
-59
lines changed

doc/source/whatsnew/v0.21.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -482,6 +482,8 @@ Other API Changes
482482
- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`)
483483
- Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`)
484484
- Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index. This will now raise a ValueError (:issue:`16877`)
485+
- :func:`wide_to_long` previously kept numeric-like suffixes as ``object`` dtype. Now they are cast to numeric if possible (:issue:`17627`)
486+
485487

486488
.. _whatsnew_0210.deprecations:
487489

pandas/core/reshape/reshape.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pandas.core.dtypes.cast import maybe_promote
1515
from pandas.core.dtypes.missing import notna
1616
import pandas.core.dtypes.concat as _concat
17+
from pandas.core.tools.numeric import to_numeric
1718

1819
from pandas.core.series import Series
1920
from pandas.core.frame import DataFrame
@@ -895,6 +896,10 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
895896
896897
.. versionadded:: 0.20.0
897898
899+
When all suffixes are numeric, they are cast to int64/float64.
900+
901+
.. versionadded:: 0.21.0
902+
898903
Returns
899904
-------
900905
DataFrame
@@ -1033,22 +1038,24 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
10331038
-----
10341039
All extra variables are left untouched. This simply uses
10351040
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
1036-
in a typicaly case.
1041+
in a typical case.
10371042
"""
10381043
def get_var_names(df, stub, sep, suffix):
1039-
regex = "^{stub}{sep}{suffix}".format(
1040-
stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
1041-
return df.filter(regex=regex).columns.tolist()
1044+
regex = '^{0}{1}{2}$'.format(re.escape(stub), re.escape(sep), suffix)
1045+
return [col for col in df.columns if re.match(regex, col)]
10421046

10431047
def melt_stub(df, stub, i, j, value_vars, sep):
10441048
newdf = melt(df, id_vars=i, value_vars=value_vars,
10451049
value_name=stub.rstrip(sep), var_name=j)
10461050
newdf[j] = Categorical(newdf[j])
10471051
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
10481052

1053+
# GH17627 Cast numeric suffixes to int/float
1054+
newdf[j] = to_numeric(newdf[j], errors='ignore')
1055+
10491056
return newdf.set_index(i + [j])
10501057

1051-
if any(map(lambda s: s in df.columns.tolist(), stubnames)):
1058+
if any([col in stubnames for col in df.columns]):
10521059
raise ValueError("stubname can't be identical to a column name")
10531060

10541061
if not is_list_like(stubnames):
@@ -1064,8 +1071,7 @@ def melt_stub(df, stub, i, j, value_vars, sep):
10641071
if df[i].duplicated().any():
10651072
raise ValueError("the id variables need to uniquely identify each row")
10661073

1067-
value_vars = list(map(lambda stub:
1068-
get_var_names(df, stub, sep, suffix), stubnames))
1074+
value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
10691075

10701076
value_vars_flattened = [e for sublist in value_vars for e in sublist]
10711077
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))

pandas/tests/reshape/test_reshape.py

+136-52
Original file line numberDiff line numberDiff line change
@@ -764,12 +764,12 @@ def test_simple(self):
764764
exp_data = {"X": x.tolist() + x.tolist(),
765765
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
766766
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
767-
"year": ['1970', '1970', '1970', '1980', '1980', '1980'],
767+
"year": [1970, 1970, 1970, 1980, 1980, 1980],
768768
"id": [0, 1, 2, 0, 1, 2]}
769-
exp_frame = DataFrame(exp_data)
770-
exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
771-
long_frame = wide_to_long(df, ["A", "B"], i="id", j="year")
772-
tm.assert_frame_equal(long_frame, exp_frame)
769+
expected = DataFrame(exp_data)
770+
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
771+
result = wide_to_long(df, ["A", "B"], i="id", j="year")
772+
tm.assert_frame_equal(result, expected)
773773

774774
def test_stubs(self):
775775
# GH9204
@@ -804,12 +804,12 @@ def test_separating_character(self):
804804
exp_data = {"X": x.tolist() + x.tolist(),
805805
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
806806
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
807-
"year": ['1970', '1970', '1970', '1980', '1980', '1980'],
807+
"year": [1970, 1970, 1970, 1980, 1980, 1980],
808808
"id": [0, 1, 2, 0, 1, 2]}
809-
exp_frame = DataFrame(exp_data)
810-
exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
811-
long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
812-
tm.assert_frame_equal(long_frame, exp_frame)
809+
expected = DataFrame(exp_data)
810+
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
811+
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
812+
tm.assert_frame_equal(result, expected)
813813

814814
def test_escapable_characters(self):
815815
np.random.seed(123)
@@ -832,14 +832,14 @@ def test_escapable_characters(self):
832832
exp_data = {"X": x.tolist() + x.tolist(),
833833
"A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
834834
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
835-
"year": ['1970', '1970', '1970', '1980', '1980', '1980'],
835+
"year": [1970, 1970, 1970, 1980, 1980, 1980],
836836
"id": [0, 1, 2, 0, 1, 2]}
837-
exp_frame = DataFrame(exp_data)
838-
exp_frame = exp_frame.set_index(
837+
expected = DataFrame(exp_data)
838+
expected = expected.set_index(
839839
['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
840-
long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
841-
i="id", j="year")
842-
tm.assert_frame_equal(long_frame, exp_frame)
840+
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
841+
i="id", j="year")
842+
tm.assert_frame_equal(result, expected)
843843

844844
def test_unbalanced(self):
845845
# test that we can have a varying amount of time variables
@@ -852,11 +852,11 @@ def test_unbalanced(self):
852852
'A': [1.0, 3.0, 2.0, 4.0],
853853
'B': [5.0, np.nan, 6.0, np.nan],
854854
'id': [0, 0, 1, 1],
855-
'year': ['2010', '2011', '2010', '2011']}
856-
exp_frame = pd.DataFrame(exp_data)
857-
exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
858-
long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year')
859-
tm.assert_frame_equal(long_frame, exp_frame)
855+
'year': [2010, 2011, 2010, 2011]}
856+
expected = pd.DataFrame(exp_data)
857+
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
858+
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
859+
tm.assert_frame_equal(result, expected)
860860

861861
def test_character_overlap(self):
862862
# Test we handle overlapping characters in both id_vars and value_vars
@@ -871,19 +871,19 @@ def test_character_overlap(self):
871871
'BBBZ': [91, 92, 93]
872872
})
873873
df['id'] = df.index
874-
exp_frame = pd.DataFrame({
874+
expected = pd.DataFrame({
875875
'BBBX': [91, 92, 93, 91, 92, 93],
876876
'BBBZ': [91, 92, 93, 91, 92, 93],
877877
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
878878
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
879879
'BB': [1, 2, 3, 4, 5, 6],
880880
'id': [0, 1, 2, 0, 1, 2],
881-
'year': ['11', '11', '11', '12', '12', '12']})
882-
exp_frame = exp_frame.set_index(['id', 'year'])[
881+
'year': [11, 11, 11, 12, 12, 12]})
882+
expected = expected.set_index(['id', 'year'])[
883883
['BBBX', 'BBBZ', 'A', 'B', 'BB']]
884-
long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
885-
tm.assert_frame_equal(long_frame.sort_index(axis=1),
886-
exp_frame.sort_index(axis=1))
884+
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
885+
tm.assert_frame_equal(result.sort_index(axis=1),
886+
expected.sort_index(axis=1))
887887

888888
def test_invalid_separator(self):
889889
# if an invalid separator is supplied, an empty data frame is returned
@@ -901,13 +901,13 @@ def test_invalid_separator(self):
901901
'year': [],
902902
'A': [],
903903
'B': []}
904-
exp_frame = pd.DataFrame(exp_data)
905-
exp_frame = exp_frame.set_index(['id', 'year'])[[
904+
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
905+
expected = expected.set_index(['id', 'year'])[[
906906
'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
907-
exp_frame.index.set_levels([[0, 1], []], inplace=True)
908-
long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
909-
tm.assert_frame_equal(long_frame.sort_index(axis=1),
910-
exp_frame.sort_index(axis=1))
907+
expected.index.set_levels([0, 1], level=0, inplace=True)
908+
result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
909+
tm.assert_frame_equal(result.sort_index(axis=1),
910+
expected.sort_index(axis=1))
911911

912912
def test_num_string_disambiguation(self):
913913
# Test that we can disambiguate number value_vars from
@@ -923,19 +923,19 @@ def test_num_string_disambiguation(self):
923923
'Arating_old': [91, 92, 93]
924924
})
925925
df['id'] = df.index
926-
exp_frame = pd.DataFrame({
926+
expected = pd.DataFrame({
927927
'Arating': [91, 92, 93, 91, 92, 93],
928928
'Arating_old': [91, 92, 93, 91, 92, 93],
929929
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
930930
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
931931
'BB': [1, 2, 3, 4, 5, 6],
932932
'id': [0, 1, 2, 0, 1, 2],
933-
'year': ['11', '11', '11', '12', '12', '12']})
934-
exp_frame = exp_frame.set_index(['id', 'year'])[
933+
'year': [11, 11, 11, 12, 12, 12]})
934+
expected = expected.set_index(['id', 'year'])[
935935
['Arating', 'Arating_old', 'A', 'B', 'BB']]
936-
long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
937-
tm.assert_frame_equal(long_frame.sort_index(axis=1),
938-
exp_frame.sort_index(axis=1))
936+
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
937+
tm.assert_frame_equal(result.sort_index(axis=1),
938+
expected.sort_index(axis=1))
939939

940940
def test_invalid_suffixtype(self):
941941
# If all stub names end with a string, but a numeric suffix is
@@ -953,13 +953,13 @@ def test_invalid_suffixtype(self):
953953
'year': [],
954954
'A': [],
955955
'B': []}
956-
exp_frame = pd.DataFrame(exp_data)
957-
exp_frame = exp_frame.set_index(['id', 'year'])[[
958-
'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']]
959-
exp_frame.index.set_levels([[0, 1], []], inplace=True)
960-
long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year')
961-
tm.assert_frame_equal(long_frame.sort_index(axis=1),
962-
exp_frame.sort_index(axis=1))
956+
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
957+
958+
expected = expected.set_index(['id', 'year'])
959+
expected.index.set_levels([0, 1], level=0, inplace=True)
960+
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
961+
tm.assert_frame_equal(result.sort_index(axis=1),
962+
expected.sort_index(axis=1))
963963

964964
def test_multiple_id_columns(self):
965965
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
@@ -969,17 +969,17 @@ def test_multiple_id_columns(self):
969969
'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
970970
'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
971971
})
972-
exp_frame = pd.DataFrame({
972+
expected = pd.DataFrame({
973973
'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
974974
2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
975975
'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
976976
'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
977-
'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1',
978-
'2', '1', '2', '1', '2', '1', '2', '1', '2']
977+
'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
978+
2, 1, 2, 1, 2, 1, 2, 1, 2]
979979
})
980-
exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']]
981-
long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
982-
tm.assert_frame_equal(long_frame, exp_frame)
980+
expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
981+
result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
982+
tm.assert_frame_equal(result, expected)
983983

984984
def test_non_unique_idvars(self):
985985
# GH16382
@@ -991,3 +991,87 @@ def test_non_unique_idvars(self):
991991
})
992992
with pytest.raises(ValueError):
993993
wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
994+
995+
def test_cast_j_int(self):
996+
df = pd.DataFrame({
997+
'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
998+
'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
999+
'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
1000+
'actor_fb_likes_2': [936.0, 5000.0, 393.0],
1001+
'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
1002+
1003+
expected = pd.DataFrame({
1004+
'actor': ['CCH Pounder',
1005+
'Johnny Depp',
1006+
'Christoph Waltz',
1007+
'Joel David Moore',
1008+
'Orlando Bloom',
1009+
'Rory Kinnear'],
1010+
'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
1011+
'num': [1, 1, 1, 2, 2, 2],
1012+
'title': ['Avatar',
1013+
'Pirates of the Caribbean',
1014+
'Spectre',
1015+
'Avatar',
1016+
'Pirates of the Caribbean',
1017+
'Spectre']}).set_index(['title', 'num'])
1018+
result = wide_to_long(df, ['actor', 'actor_fb_likes'],
1019+
i='title', j='num', sep='_')
1020+
1021+
tm.assert_frame_equal(result, expected)
1022+
1023+
def test_identical_stubnames(self):
1024+
df = pd.DataFrame({'A2010': [1.0, 2.0],
1025+
'A2011': [3.0, 4.0],
1026+
'B2010': [5.0, 6.0],
1027+
'A': ['X1', 'X2']})
1028+
with pytest.raises(ValueError):
1029+
wide_to_long(df, ['A', 'B'], i='A', j='colname')
1030+
1031+
def test_nonnumeric_suffix(self):
1032+
df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
1033+
'treatment_test': [3.0, 4.0],
1034+
'result_placebo': [5.0, 6.0],
1035+
'A': ['X1', 'X2']})
1036+
expected = pd.DataFrame({
1037+
'A': ['X1', 'X1', 'X2', 'X2'],
1038+
'colname': ['placebo', 'test', 'placebo', 'test'],
1039+
'result': [5.0, np.nan, 6.0, np.nan],
1040+
'treatment': [1.0, 3.0, 2.0, 4.0]})
1041+
expected = expected.set_index(['A', 'colname'])
1042+
result = wide_to_long(df, ['result', 'treatment'],
1043+
i='A', j='colname', suffix='[a-z]+', sep='_')
1044+
tm.assert_frame_equal(result, expected)
1045+
1046+
def test_mixed_type_suffix(self):
1047+
df = pd.DataFrame({
1048+
'treatment_1': [1.0, 2.0],
1049+
'treatment_foo': [3.0, 4.0],
1050+
'result_foo': [5.0, 6.0],
1051+
'result_1': [0, 9],
1052+
'A': ['X1', 'X2']})
1053+
expected = pd.DataFrame({
1054+
'A': ['X1', 'X2', 'X1', 'X2'],
1055+
'colname': ['1', '1', 'foo', 'foo'],
1056+
'result': [0.0, 9.0, 5.0, 6.0],
1057+
'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
1058+
result = wide_to_long(df, ['result', 'treatment'],
1059+
i='A', j='colname', suffix='.+', sep='_')
1060+
tm.assert_frame_equal(result, expected)
1061+
1062+
def test_float_suffix(self):
1063+
df = pd.DataFrame({
1064+
'treatment_1.1': [1.0, 2.0],
1065+
'treatment_2.1': [3.0, 4.0],
1066+
'result_1.2': [5.0, 6.0],
1067+
'result_1': [0, 9],
1068+
'A': ['X1', 'X2']})
1069+
expected = pd.DataFrame({
1070+
'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
1071+
'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
1072+
'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
1073+
'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
1074+
expected = expected.set_index(['A', 'colname'])
1075+
result = wide_to_long(df, ['result', 'treatment'],
1076+
i='A', j='colname', suffix='[0-9.]+', sep='_')
1077+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)