-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
BUG: read_csv throws UnicodeDecodeError with unicode aliases #13571
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
d485c4a
ae62350
36bcdd8
285ccf9
173c38b
78d46d6
35dfb13
71f084e
da8fce4
1825486
1d30333
4f680d7
b582195
e26c92a
d14b69e
eeb7011
b8d78c4
75869f4
9c88919
6725536
671ad41
3c4a798
5675b82
ff6117e
b983957
451c054
33278a9
181cecd
a2e5d54
6c8b21b
5d99cff
8e7904f
a07b5d3
ff2a335
1f8cc7f
f743eb3
e161699
5765b92
ac18b36
1fc6b90
6b0e2ca
41a6fae
f730e60
05a2d04
c4e93bd
430273d
1fa91b9
e379e9f
a35521e
6c09821
5584dff
9463dee
5198179
3c30cd0
e77ac2d
69ab536
1eb478d
a2f178f
8e05f7e
ab153d5
0c1de9f
77ec966
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1339,8 +1339,8 @@ def test_compact_ints_use_unsigned(self): | |
'b': np.array([9], dtype=np.int64), | ||
'c': np.array([258], dtype=np.int64), | ||
}) | ||
out = self.read_csv(StringIO(data)) | ||
tm.assert_frame_equal(out, expected) | ||
result = self.read_csv(StringIO(data)) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = DataFrame({ | ||
'a': np.array([1], dtype=np.int8), | ||
|
@@ -1351,14 +1351,14 @@ def test_compact_ints_use_unsigned(self): | |
# default behaviour for 'use_unsigned' | ||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
out = self.read_csv(StringIO(data), compact_ints=True) | ||
tm.assert_frame_equal(out, expected) | ||
result = self.read_csv(StringIO(data), compact_ints=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
out = self.read_csv(StringIO(data), compact_ints=True, | ||
use_unsigned=False) | ||
tm.assert_frame_equal(out, expected) | ||
result = self.read_csv(StringIO(data), compact_ints=True, | ||
use_unsigned=False) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = DataFrame({ | ||
'a': np.array([1], dtype=np.uint8), | ||
|
@@ -1368,9 +1368,9 @@ def test_compact_ints_use_unsigned(self): | |
|
||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
out = self.read_csv(StringIO(data), compact_ints=True, | ||
use_unsigned=True) | ||
tm.assert_frame_equal(out, expected) | ||
result = self.read_csv(StringIO(data), compact_ints=True, | ||
use_unsigned=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_compact_ints_as_recarray(self): | ||
data = ('0,1,0,0\n' | ||
|
@@ -1399,27 +1399,28 @@ def test_as_recarray(self): | |
data = 'a,b\n1,a\n2,b' | ||
expected = np.array([(1, 'a'), (2, 'b')], | ||
dtype=[('a', '<i8'), ('b', 'O')]) | ||
out = self.read_csv(StringIO(data), as_recarray=True) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv(StringIO(data), as_recarray=True) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# index_col ignored | ||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
data = 'a,b\n1,a\n2,b' | ||
expected = np.array([(1, 'a'), (2, 'b')], | ||
dtype=[('a', '<i8'), ('b', 'O')]) | ||
out = self.read_csv(StringIO(data), as_recarray=True, index_col=0) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv( | ||
StringIO(data), as_recarray=True, index_col=0) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# respects names | ||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
data = '1,a\n2,b' | ||
expected = np.array([(1, 'a'), (2, 'b')], | ||
dtype=[('a', '<i8'), ('b', 'O')]) | ||
out = self.read_csv(StringIO(data), names=['a', 'b'], | ||
header=None, as_recarray=True) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv(StringIO(data), names=['a', 'b'], | ||
header=None, as_recarray=True) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# header order is respected even though it conflicts | ||
# with the natural ordering of the column names | ||
|
@@ -1428,16 +1429,17 @@ def test_as_recarray(self): | |
data = 'b,a\n1,a\n2,b' | ||
expected = np.array([(1, 'a'), (2, 'b')], | ||
dtype=[('b', '<i8'), ('a', 'O')]) | ||
out = self.read_csv(StringIO(data), as_recarray=True) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv(StringIO(data), as_recarray=True) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# overrides the squeeze parameter | ||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
data = 'a\n1' | ||
expected = np.array([(1,)], dtype=[('a', '<i8')]) | ||
out = self.read_csv(StringIO(data), as_recarray=True, squeeze=True) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv( | ||
StringIO(data), as_recarray=True, squeeze=True) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# does data conversions before doing recarray conversion | ||
with tm.assert_produces_warning( | ||
|
@@ -1446,18 +1448,18 @@ def test_as_recarray(self): | |
conv = lambda x: int(x) + 1 | ||
expected = np.array([(2, 'a'), (3, 'b')], | ||
dtype=[('a', '<i8'), ('b', 'O')]) | ||
out = self.read_csv(StringIO(data), as_recarray=True, | ||
converters={'a': conv}) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv(StringIO(data), as_recarray=True, | ||
converters={'a': conv}) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# filters by usecols before doing recarray conversion | ||
with tm.assert_produces_warning( | ||
FutureWarning, check_stacklevel=False): | ||
data = 'a,b\n1,a\n2,b' | ||
expected = np.array([(1,), (2,)], dtype=[('a', '<i8')]) | ||
out = self.read_csv(StringIO(data), as_recarray=True, | ||
usecols=['a']) | ||
tm.assert_numpy_array_equal(out, expected) | ||
result = self.read_csv(StringIO(data), as_recarray=True, | ||
usecols=['a']) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_memory_map(self): | ||
mmap_file = os.path.join(self.dirpath, 'test_mmap.csv') | ||
|
@@ -1467,5 +1469,23 @@ def test_memory_map(self): | |
'c': ['I', 'II', 'III'] | ||
}) | ||
|
||
out = self.read_csv(mmap_file, memory_map=True) | ||
tm.assert_frame_equal(out, expected) | ||
result = self.read_csv(mmap_file, memory_map=True) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_read_csv_utf_aliases(self): | ||
# see gh issue 13549 | ||
path = 'test.csv' | ||
expected = DataFrame({'A': [0, 1], 'B': [2, 3], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suppose we could do one row, as in `expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']})`. I used BytesIO because I don't think StringIO can support different encodings (I tried and wasn't able to get StringIO to work). |
||
'multibyte_test': ['testing123', 'bananabis'], | ||
'mb_nums': [154.868, 457.8798]}) | ||
with tm.ensure_clean(path) as path: | ||
for byte in [8, 16]: | ||
expected.to_csv(path, encoding='utf-' + str(byte), index=False) | ||
for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']: | ||
encoding = fmt.format(byte) | ||
for engine in ['c', 'python', None]: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not necessary (nor is the
# 'path' can most likely be changed as I referenced above
result = self.read_csv(path, encoding=encoding)
tm.assert_frame_equal(result, expected)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Alright. |
||
result = self.read_csv( | ||
path, | ||
engine=engine, | ||
encoding=encoding) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the context manager and remove `os.remove(..)`.