-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG in clipboard (linux, python2) with unicode and separator (GH13747) #14599
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
fb922d6
c83d000
edb8553
66d8ebf
14d94a0
d565b1f
f708c2e
c5a87d8
825bbe2
02f87b0
71d58d0
dd57ae3
d202fd0
0665fd4
ed1375f
9946fb7
c0aafd7
b03ed56
ac8ae60
7af95da
cedb690
98b61e8
1dca292
9db42d8
b74fbc1
2aafb66
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
""" io on the clipboard """ | ||
from pandas import compat, get_option, option_context, DataFrame | ||
from pandas.compat import StringIO | ||
from pandas.compat import StringIO, PY2 | ||
|
||
|
||
def read_clipboard(sep='\s+', **kwargs): # pragma: no cover | ||
|
@@ -18,6 +18,14 @@ def read_clipboard(sep='\s+', **kwargs): # pragma: no cover | |
------- | ||
parsed : DataFrame | ||
""" | ||
encoding = kwargs.pop('encoding', 'utf-8') | ||
|
||
# only utf-8 is valid for passed value because that's what clipboard | ||
# supports | ||
if encoding is not None and encoding.lower().replace('-', '') != 'utf8': | ||
raise NotImplementedError( | ||
'reading from clipboard only supports utf-8 encoding') | ||
|
||
from pandas.util.clipboard import clipboard_get | ||
from pandas.io.parsers import read_table | ||
text = clipboard_get() | ||
|
@@ -78,6 +86,12 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover | |
- Windows: | ||
- OS X: | ||
""" | ||
encoding = kwargs.pop('encoding', 'utf-8') | ||
|
||
# testing if an invalid encoding is passed to clipboard | ||
if encoding is not None and encoding.lower().replace('-', '') != 'utf8': | ||
raise ValueError('clipboard only supports utf-8 encoding') | ||
|
||
from pandas.util.clipboard import clipboard_set | ||
if excel is None: | ||
excel = True | ||
|
@@ -87,8 +101,12 @@ def to_clipboard(obj, excel=None, sep=None, **kwargs): # pragma: no cover | |
if sep is None: | ||
sep = '\t' | ||
buf = StringIO() | ||
obj.to_csv(buf, sep=sep, **kwargs) | ||
clipboard_set(buf.getvalue()) | ||
# clipboard_set (pyperclip) expects unicode | ||
obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so Then check for encoding, if its not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback the changes that i have made earlier (show above) are in the to_clipboard method. I am trying to understand your review comment, are you suggesting that instead of explicitly passing encoding = 'utf-8' to obj.to_csv i should check before hand if the encoding is None or utf-8 and raise an exception if it is not one of them There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so you still should pass There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback oh ok, i get it now, let me make that change |
||
text = buf.getvalue() | ||
if PY2: | ||
text = text.decode('utf-8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't we always decode here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
clipboard_set(text) | ||
return | ||
except: | ||
pass | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ | |
from pandas import read_clipboard | ||
from pandas import get_option | ||
from pandas.util import testing as tm | ||
from pandas.util.testing import makeCustomDataframe as mkdf, disabled | ||
from pandas.util.testing import makeCustomDataframe as mkdf | ||
|
||
|
||
try: | ||
|
@@ -18,7 +18,6 @@ | |
raise nose.SkipTest("no clipboard found") | ||
|
||
|
||
@disabled | ||
class TestClipboard(tm.TestCase): | ||
|
||
@classmethod | ||
|
@@ -52,20 +51,24 @@ def setUpClass(cls): | |
# Test for non-ascii text: GH9263 | ||
cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(), | ||
'es': 'en español'.split()}) | ||
# unicode round trip test for GH 13747 | ||
cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], | ||
'b': ['øπ∆˚¬', 'œ∑´®']}) | ||
cls.data_types = list(cls.data.keys()) | ||
|
||
@classmethod | ||
def tearDownClass(cls): | ||
super(TestClipboard, cls).tearDownClass() | ||
del cls.data_types, cls.data | ||
|
||
def check_round_trip_frame(self, data_type, excel=None, sep=None): | ||
def check_round_trip_frame(self, data_type, excel=None, sep=None, | ||
encoding=None): | ||
data = self.data[data_type] | ||
data.to_clipboard(excel=excel, sep=sep) | ||
data.to_clipboard(excel=excel, sep=sep, encoding=encoding) | ||
if sep is not None: | ||
result = read_clipboard(sep=sep, index_col=0) | ||
result = read_clipboard(sep=sep, index_col=0, encoding=encoding) | ||
else: | ||
result = read_clipboard() | ||
result = read_clipboard(encoding=encoding) | ||
tm.assert_frame_equal(data, result, check_dtype=False) | ||
|
||
def test_round_trip_frame_sep(self): | ||
|
@@ -115,3 +118,16 @@ def test_read_clipboard_infer_excel(self): | |
exp = pd.read_clipboard() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add tests for other issues that we mentioned (or attach the issue number to an existing tests as appropriate) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
tm.assert_frame_equal(res, exp) | ||
|
||
# test case for testing invalid encoding | ||
def test_invalid_encoding(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. put comments inside the test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. implemented the suggested change |
||
data = self.data['string'] | ||
with tm.assertRaises(ValueError): | ||
data.to_clipboard(encoding='ascii') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i have not modified read_clipboard, so that check for encoding is not present in read_clipboard. Should i add the check in read_clipboard and the test backing it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also read_clipboard later on uses the encoding from kwargs**, if i pop the kwargs in read clipboard before this code, will that not cause any issues? if compat.PY3: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. right in reading you still need to strip the passed encoding (and read from the clipboard as utc8) then i guess we could reencode as the encoding (if it's not None) put a test for this but for now I would just raise NotImplementError if an encoding is passed in and it's not utf8 or none |
||
with tm.assertRaises(NotImplementedError): | ||
pd.read_clipboard(encoding='ascii') | ||
|
||
def test_round_trip_valid_encodings(self): | ||
for enc in ['UTF-8', 'utf-8', 'utf8']: | ||
for dt in self.data_types: | ||
self.check_round_trip_frame(dt, encoding=enc) |
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
list all of the isues this is expected to close
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
implemented this change now