-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
read_excel with dtype=str converts empty cells to np.nan #20429
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
dd53df8
6f771fb
37f00ad
f194b70
eb8f4c5
ac6a409
6994bb0
40a563f
9858259
5f71a99
0a93b60
f296f9a
7c0af1f
f0fd0a7
61e0519
9fdac27
ddb904f
694849d
5ba95a1
d3ceec3
ea1d73a
c1376a5
3103811
7d5f6b2
478d08d
edb26d7
c3ab9cb
69f6c95
97a345a
8b2fb0b
c9f5120
fab0b27
571d5c4
0712392
47bc105
3740dfe
7d453bb
bcd739d
7341cd1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -461,11 +461,17 @@ cpdef ndarray[object] astype_unicode(ndarray arr): | |
cdef: | ||
Py_ssize_t i, n = arr.size | ||
ndarray[object] result = np.empty(n, dtype=object) | ||
object arr_i | ||
|
||
for i in range(n): | ||
# we can use the unsafe version because we know `result` is mutable | ||
# since it was created from `np.empty` | ||
util.set_value_at_unsafe(result, i, unicode(arr[i])) | ||
arr_i = arr[i] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is arr_i in the cdef? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. d'oh! |
||
util.set_value_at_unsafe( | ||
result, | ||
i, | ||
unicode(arr_i) if not checknull(arr_i) else np.nan | ||
) | ||
|
||
return result | ||
|
||
|
@@ -474,11 +480,17 @@ cpdef ndarray[object] astype_str(ndarray arr): | |
cdef: | ||
Py_ssize_t i, n = arr.size | ||
ndarray[object] result = np.empty(n, dtype=object) | ||
object arr_i | ||
|
||
for i in range(n): | ||
# we can use the unsafe version because we know `result` is mutable | ||
# since it was created from `np.empty` | ||
util.set_value_at_unsafe(result, i, str(arr[i])) | ||
arr_i = arr[i] | ||
util.set_value_at_unsafe( | ||
result, | ||
i, | ||
str(arr_i) if not checknull(arr_i) else np.nan | ||
) | ||
|
||
return result | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4153,4 +4153,8 @@ def _try_cast(arr, take_fast_path): | |
data = np.array(data, dtype=dtype, copy=False) | ||
subarr = np.array(data, dtype=object, copy=copy) | ||
|
||
# GH 20377 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. huh? this should not be necessary (not to mention non-performant) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, this fixes an error I got in this test https://github.com/pandas-dev/pandas/blob/master/pandas/tests/frame/test_dtypes.py#L533 This line https://github.com/nikoskaragiannakis/pandas/blob/7341cd17e11461728969afa159250e882b32dee0/pandas/core/series.py#L4154 does the damage by turning a Am I missing something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how's that an error? that test is correct. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that with the changes in the .pyx files,
leaves So I have to ask: do we want calls like the one above to turn There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback ^ |
||
# Turn all 'nan' to np.nan | ||
subarr[subarr == 'nan'] = np.nan | ||
|
||
return subarr |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -529,7 +529,7 @@ def test_astype_str(self): | |
# consistency in astype(str) | ||
for tt in set([str, compat.text_type]): | ||
result = DataFrame([np.NaN]).astype(tt) | ||
expected = DataFrame(['nan']) | ||
expected = DataFrame([np.NaN], dtype=object) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no leave this alone. turning a float nan into string should still work |
||
assert_frame_equal(result, expected) | ||
|
||
result = DataFrame([1.12345678901234567890]).astype(tt) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -369,3 +369,27 @@ def test_no_na_filter_on_index(self): | |
expected = DataFrame({"a": [1, 4], "c": [3, 6]}, | ||
index=Index([np.nan, 5.0], name="b")) | ||
tm.assert_frame_equal(out, expected) | ||
|
||
def test_na_values_with_dtype_str_and_na_filter_true(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you parameterize this on na_filter (you will need to provide the nan_value as well in the parameterize as they are different) |
||
# see gh-20377 | ||
data = "a,b,c\n1,,3\n4,5,6" | ||
|
||
out = self.read_csv(StringIO(data), na_filter=True, dtype=str) | ||
|
||
# missing data turn to np.nan, which stays as it is after dtype=str | ||
expected = DataFrame({"a": ["1", "4"], | ||
"b": [np.nan, "5"], | ||
"c": ["3", "6"]}) | ||
tm.assert_frame_equal(out, expected) | ||
|
||
def test_na_values_with_dtype_str_and_na_filter_false(self): | ||
# see gh-20377 | ||
data = "a,b,c\n1,,3\n4,5,6" | ||
|
||
out = self.read_csv(StringIO(data), na_filter=False, dtype=str) | ||
|
||
# missing data turn to empty string | ||
expected = DataFrame({"a": ["1", "4"], | ||
"b": ["", "5"], | ||
"c": ["3", "6"]}) | ||
tm.assert_frame_equal(out, expected) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -360,6 +360,33 @@ def test_reader_dtype(self, ext): | |
with pytest.raises(ValueError): | ||
actual = self.get_exceldf(basename, ext, dtype={'d': 'int64'}) | ||
|
||
def test_reader_dtype_str(self, ext): | ||
# GH 20377 | ||
basename = 'testdtype' | ||
actual = self.get_exceldf(basename, ext) | ||
|
||
expected = DataFrame({ | ||
'a': [1, 2, 3, 4], | ||
'b': [2.5, 3.5, 4.5, 5.5], | ||
'c': [1, 2, 3, 4], | ||
'd': [1.0, 2.0, np.nan, 4.0]}).reindex( | ||
columns=['a', 'b', 'c', 'd']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just specify |
||
|
||
tm.assert_frame_equal(actual, expected) | ||
|
||
actual = self.get_exceldf(basename, ext, | ||
dtype={'a': 'float64', | ||
'b': 'float32', | ||
'c': str, | ||
'd': str}) | ||
|
||
expected['a'] = expected['a'].astype('float64') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. move this higher (by the expected), you can simply construct things directly by using e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'm not sure what you mean here.
First of all, this is copy-paste from the previous test, which was added for #8212 Do you mean to do expected = DataFrame({'a': Series([1,2,3,4], dtype='float64'),
'b': Series([2.5,3.5,4.5,5.5], dtype='float32'),
...}) ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes exactly. If you need things like '001', then just do it that way, e.g. |
||
expected['b'] = expected['b'].astype('float32') | ||
expected['c'] = ['001', '002', '003', '004'] | ||
expected['d'] = ['1', '2', np.nan, '4'] | ||
|
||
tm.assert_frame_equal(actual, expected) | ||
|
||
def test_reading_all_sheets(self, ext): | ||
# Test reading all sheetnames by setting sheetname to None, | ||
# Ensure a dict is returned. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,6 +149,7 @@ def test_astype_str_map(self, dtype, series): | |
# see gh-4405 | ||
result = series.astype(dtype) | ||
expected = series.map(compat.text_type) | ||
expected = expected.replace('nan', np.nan) # see gh-20377 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove this, this is equiv to |
||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.parametrize("dtype", [str, compat.text_type]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i mean make this a separate sub-section showing the previous and the new