Skip to content

ENH melt uses column name if available #4144

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 6, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,14 @@ pandas 0.12
- Simplified the API and added a describe method to Categorical
- ``melt`` now accepts the optional parameters ``var_name`` and ``value_name``
to specify custom column names of the returned DataFrame (:issue:`3649`),
thanks @hoechenberger
thanks @hoechenberger. If ``var_name`` is not specified and ``dataframe.columns.name``
is not None, then this will be used as the ``var_name`` (:issue:`4144`).
- clipboard functions use pyperclip (no dependencies on Windows, alternative
dependencies offered for Linux) (:issue:`3837`).
- Plotting functions now raise a ``TypeError`` before trying to plot anything
if the associated objects have a dtype of ``object`` (:issue:`1818`,
:issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object arrays to
numeric arrays if possible so that you can still plot, for example, an
:issue:`3572`, :issue:`3911`, :issue:`3912`), but they will try to convert object
arrays to numeric arrays if possible so that you can still plot, for example, an
object array with floats. This happens before any drawing takes place which
eliminates any spurious plots from showing up.
- Added Faq section on repr display options, to help users customize their setup.
Expand Down
10 changes: 7 additions & 3 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ def _stack_multi_columns(frame, level=-1, dropna=True):


def melt(frame, id_vars=None, value_vars=None,
var_name='variable', value_name='value'):
var_name=None, value_name='value'):
"""
"Unpivots" a DataFrame from wide format to long format, optionally leaving
id variables set
Expand All @@ -611,8 +611,8 @@ def melt(frame, id_vars=None, value_vars=None,
frame : DataFrame
id_vars : tuple, list, or ndarray
value_vars : tuple, list, or ndarray
var_name : scalar
value_name : scalar
var_name : scalar, if None uses frame.columns.name or 'variable'
value_name : scalar, default 'value'

Examples
--------
Expand All @@ -634,6 +634,7 @@ def melt(frame, id_vars=None, value_vars=None,
a B 1
b B 3
c B 5

"""
# TODO: what about the existing index?
if id_vars is not None:
Expand All @@ -651,6 +652,9 @@ def melt(frame, id_vars=None, value_vars=None,
else:
frame = frame.copy()

if var_name is None:
var_name = frame.columns.name if frame.columns.name is not None else 'variable'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

u could do frame.columns.name or 'variable' too :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...but just in case they had named their column False(y) :)


N, K = frame.shape
K -= len(id_vars)

Expand Down
234 changes: 135 additions & 99 deletions pandas/tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,105 +20,141 @@
_multiprocess_can_split_ = True


def test_melt():
    """Exercise ``melt`` with default and custom output column names.

    For each naming combination (defaults, custom ``var_name``, custom
    ``value_name``, both custom) the same five ``melt`` calls are made.
    The first four only need to run without raising; the last one, melting
    'A' and 'B' against both id columns, is compared to a hand-built frame.
    """
    frame = tm.makeTimeDataFrame()[:10]
    frame['id1'] = (frame['A'] > 0).astype(np.int64)
    frame['id2'] = (frame['B'] > 0).astype(np.int64)

    # Keyword overrides for the output column names, in the order the
    # scenarios are exercised.
    naming_cases = [
        {},
        {'var_name': 'var'},
        {'value_name': 'val'},
        {'var_name': 'var', 'value_name': 'val'},
    ]

    for names in naming_cases:
        # Smoke calls: results are intentionally discarded.
        melt(frame, **names)
        melt(frame, id_vars=['id1'], **names)
        melt(frame, id_vars=['id1', 'id2'], **names)
        melt(frame, id_vars=['id1', 'id2'], value_vars='A', **names)

        melted = melt(frame, id_vars=['id1', 'id2'],
                      value_vars=['A', 'B'], **names)

        # Column labels the result should carry for this scenario.
        var_col = names.get('var_name', 'variable')
        val_col = names.get('value_name', 'value')

        wanted = DataFrame(
            {'id1': frame['id1'].tolist() * 2,
             'id2': frame['id2'].tolist() * 2,
             var_col: ['A'] * 10 + ['B'] * 10,
             val_col: frame['A'].tolist() + frame['B'].tolist()},
            columns=['id1', 'id2', var_col, val_col])
        tm.assert_frame_equal(melted, wanted)

def test_convert_dummies():
    """Check ``convert_dummies`` output for the default and '.' separators."""
    a_labels = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
    b_labels = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']
    frame = DataFrame({'A': a_labels,
                       'B': b_labels,
                       'C': np.random.randn(8),
                       'D': np.random.randn(8)})

    underscored = convert_dummies(frame, ['A', 'B'])
    dotted = convert_dummies(frame, ['A', 'B'], prefix_sep='.')

    # One indicator column per category; 'C' and 'D' pass through unchanged.
    indicator_data = {
        'A_foo': [1, 0, 1, 0, 1, 0, 1, 1],
        'A_bar': [0, 1, 0, 1, 0, 1, 0, 0],
        'B_one': [1, 1, 0, 0, 0, 0, 1, 0],
        'B_two': [0, 0, 1, 0, 1, 1, 0, 0],
        'B_three': [0, 0, 0, 1, 0, 0, 0, 1],
        'C': frame['C'].values,
        'D': frame['D'].values,
    }
    want_underscored = DataFrame(indicator_data,
                                 columns=underscored.columns, dtype=float)
    # The '.'-separated variant differs only in its column labels.
    want_dotted = want_underscored.rename(
        columns=lambda label: label.replace('_', '.'))

    tm.assert_frame_equal(underscored, want_underscored)
    tm.assert_frame_equal(dotted, want_dotted)


class Test_lreshape(unittest.TestCase):
class TestMelt(unittest.TestCase):
    """Tests for ``melt`` output column naming and ``value_vars`` handling."""

    def setUp(self):
        # Ten-row fixture from ``tm.makeTimeDataFrame`` plus two integer id
        # columns derived from the sign of columns 'A' and 'B'.
        self.df = tm.makeTimeDataFrame()[:10]
        self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
        self.df['id2'] = (self.df['B'] > 0).astype(np.int64)

        self.var_name = 'var'
        self.value_name = 'val'

    def test_default_col_names(self):
        """Without overrides the new columns are 'variable' and 'value'."""
        result = melt(self.df)
        self.assertEqual(result.columns.tolist(), ['variable', 'value'])

        result1 = melt(self.df, id_vars=['id1'])
        self.assertEqual(result1.columns.tolist(),
                         ['id1', 'variable', 'value'])

        result2 = melt(self.df, id_vars=['id1', 'id2'])
        self.assertEqual(result2.columns.tolist(),
                         ['id1', 'id2', 'variable', 'value'])

    def test_value_vars(self):
        """``value_vars`` accepts a scalar or a list of column labels."""
        result3 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A')
        self.assertEqual(len(result3), 10)

        result4 = melt(self.df, id_vars=['id1', 'id2'], value_vars=['A', 'B'])
        expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               'variable': ['A']*10 + ['B']*10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', 'variable', 'value'])
        tm.assert_frame_equal(result4, expected4)

    def test_custom_var_name(self):
        """A supplied ``var_name`` replaces the 'variable' column label."""
        result5 = melt(self.df, var_name=self.var_name)
        self.assertEqual(result5.columns.tolist(), ['var', 'value'])

        result6 = melt(self.df, id_vars=['id1'], var_name=self.var_name)
        self.assertEqual(result6.columns.tolist(), ['id1', 'var', 'value'])

        result7 = melt(self.df, id_vars=['id1', 'id2'],
                       var_name=self.var_name)
        self.assertEqual(result7.columns.tolist(),
                         ['id1', 'id2', 'var', 'value'])

        result8 = melt(self.df, id_vars=['id1', 'id2'],
                       value_vars='A', var_name=self.var_name)
        self.assertEqual(result8.columns.tolist(),
                         ['id1', 'id2', 'var', 'value'])

        result9 = melt(self.df, id_vars=['id1', 'id2'],
                       value_vars=['A', 'B'], var_name=self.var_name)
        expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               self.var_name: ['A']*10 + ['B']*10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', self.var_name, 'value'])
        tm.assert_frame_equal(result9, expected9)

    def test_custom_value_name(self):
        """A supplied ``value_name`` replaces the 'value' column label."""
        result10 = melt(self.df, value_name=self.value_name)
        self.assertEqual(result10.columns.tolist(), ['variable', 'val'])

        result11 = melt(self.df, id_vars=['id1'], value_name=self.value_name)
        self.assertEqual(result11.columns.tolist(),
                         ['id1', 'variable', 'val'])

        result12 = melt(self.df, id_vars=['id1', 'id2'],
                        value_name=self.value_name)
        self.assertEqual(result12.columns.tolist(),
                         ['id1', 'id2', 'variable', 'val'])

        result13 = melt(self.df, id_vars=['id1', 'id2'],
                        value_vars='A', value_name=self.value_name)
        self.assertEqual(result13.columns.tolist(),
                         ['id1', 'id2', 'variable', 'val'])

        result14 = melt(self.df, id_vars=['id1', 'id2'],
                        value_vars=['A', 'B'], value_name=self.value_name)
        expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                'variable': ['A']*10 + ['B']*10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', 'variable',
                                        self.value_name])
        tm.assert_frame_equal(result14, expected14)

    def test_custom_var_and_value_name(self):
        """Both ``var_name`` and ``value_name`` can be overridden together."""
        result15 = melt(self.df, var_name=self.var_name,
                        value_name=self.value_name)
        self.assertEqual(result15.columns.tolist(), ['var', 'val'])

        result16 = melt(self.df, id_vars=['id1'],
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result16.columns.tolist(), ['id1', 'var', 'val'])

        result17 = melt(self.df, id_vars=['id1', 'id2'],
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result17.columns.tolist(),
                         ['id1', 'id2', 'var', 'val'])

        # Fixed: the frame lives on the test case (``self.df``); there is no
        # local ``df`` in this method.
        result18 = melt(self.df, id_vars=['id1', 'id2'], value_vars='A',
                        var_name=self.var_name, value_name=self.value_name)
        self.assertEqual(result18.columns.tolist(),
                         ['id1', 'id2', 'var', 'val'])

        result19 = melt(self.df, id_vars=['id1', 'id2'],
                        value_vars=['A', 'B'],
                        var_name=self.var_name, value_name=self.value_name)
        # Fixed: the expected column labels come from the fixture attributes,
        # not from undefined module-level names.
        expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                self.var_name: ['A']*10 + ['B']*10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2',
                                        self.var_name, self.value_name])
        tm.assert_frame_equal(result19, expected19)

    def test_var_name_defaults_to_columns_name(self):
        """With no ``var_name``, ``frame.columns.name`` labels the column.

        Previously this method reused the name
        ``test_custom_var_and_value_name``, which silently replaced the test
        above so that it never ran.
        """
        self.df.columns.name = 'foo'
        result20 = melt(self.df)
        self.assertEqual(result20.columns.tolist(), ['foo', 'value'])

class TestConvertDummies(unittest.TestCase):
    """Tests for ``convert_dummies``."""

    def test_convert_dummies(self):
        """Indicator columns are produced for 'A' and 'B' with each separator."""
        source = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three',
                  'two', 'two', 'one', 'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8),
        })
        dummy_cols = ['A', 'B']

        with_underscore = convert_dummies(source, dummy_cols)
        with_dot = convert_dummies(source, dummy_cols, prefix_sep='.')

        # Expected 0/1 indicators per category; 'C' and 'D' are carried over.
        expected_data = {
            'A_foo': [1, 0, 1, 0, 1, 0, 1, 1],
            'A_bar': [0, 1, 0, 1, 0, 1, 0, 0],
            'B_one': [1, 1, 0, 0, 0, 0, 1, 0],
            'B_two': [0, 0, 1, 0, 1, 1, 0, 0],
            'B_three': [0, 0, 0, 1, 0, 0, 0, 1],
            'C': source['C'].values,
            'D': source['D'].values,
        }
        expected_underscore = DataFrame(expected_data,
                                        columns=with_underscore.columns,
                                        dtype=float)

        def _dotted(label):
            # Same labels as the default output, with '.' as the separator.
            return label.replace('_', '.')

        expected_dot = expected_underscore.rename(columns=_dotted)

        tm.assert_frame_equal(with_underscore, expected_underscore)
        tm.assert_frame_equal(with_dot, expected_dot)


class TestLreshape(unittest.TestCase):

def test_pairs(self):
data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
Expand Down