|
4 | 4 | these tests out of this module as soon as the C parser can accept further
|
5 | 5 | arguments when parsing.
|
6 | 6 | """
|
| 7 | +from __future__ import annotations |
7 | 8 |
|
8 | 9 | import csv
|
9 | 10 | from io import (
|
|
13 | 14 |
|
14 | 15 | import pytest
|
15 | 16 |
|
16 |
| -from pandas.errors import ParserError |
| 17 | +from pandas.errors import ( |
| 18 | + ParserError, |
| 19 | + ParserWarning, |
| 20 | +) |
17 | 21 |
|
18 | 22 | from pandas import (
|
19 | 23 | DataFrame,
|
@@ -329,3 +333,128 @@ def readline(self):
|
329 | 333 | return self.data
|
330 | 334 |
|
331 | 335 | parser.read_csv(NoNextBuffer("a\n1"))
|
| 336 | + |
| 337 | + |
@pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
def test_on_bad_lines_callable(python_parser_only, bad_line_func):
    """Check that a callable ``on_bad_lines`` can repair an over-long row.

    The input has one row with five fields under a two-column header. Each
    parametrized callable yields a two-field replacement, and the parsed
    frame is expected to contain all three rows.
    """
    # GH 5686
    csv_text = """a,b
1,2
2,3,4,5,6
3,4
"""
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})

    result = python_parser_only.read_csv(
        StringIO(csv_text), on_bad_lines=bad_line_func
    )

    tm.assert_frame_equal(result, expected)
| 351 | + |
| 352 | + |
def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
    """Check that the ``on_bad_lines`` callable is handed the raw bad row.

    The callable records each row it receives in an outer list and returns a
    fixed two-field replacement. Both the parsed frame and the recorded rows
    are verified.
    """
    # GH 5686
    csv_text = """a,b
1,2
2,3,4,5,6
3,4
"""
    seen_rows = []

    def bad_line_func(bad_line: list[str]) -> list[str]:
        # Keep the offending row so it can be inspected after parsing.
        seen_rows.append(bad_line)
        return ["2", "3"]

    result = python_parser_only.read_csv(
        StringIO(csv_text), on_bad_lines=bad_line_func
    )

    tm.assert_frame_equal(result, DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}))
    assert seen_rows == [["2", "3", "4", "5", "6"]]
| 372 | + |
| 373 | + |
@pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
@pytest.mark.parametrize("sep", [",", "111"])
def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
    """Check a callable ``on_bad_lines`` when reading in chunks.

    The file is read with ``iterator=True`` and ``chunksize=1``. The row with
    three fields is passed through ``bad_line_func``, and every chunk is
    compared with the corresponding one-row expected frame.
    """
    # GH 5686
    # iterator=True has a separate code path than iterator=False
    parser = python_parser_only
    data = f"""
0{sep}1
hi{sep}there
foo{sep}bar{sep}baz
good{sep}bye
"""
    bad_sio = StringIO(data)
    expecteds = [
        {"0": "hi", "1": "there"},
        {"0": "foo", "1": "bar"},
        {"0": "good", "1": "bye"},
    ]

    # Use the reader as a context manager so it is closed even if reading fails.
    with parser.read_csv(
        bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
    ) as result_iter:
        results = list(result_iter)

    # zip() stops at the shorter input, so without this check a reader that
    # yields too few (or zero) chunks would let the test pass vacuously.
    assert len(results) == len(expecteds)

    for i, (result, expected) in enumerate(zip(results, expecteds)):
        # With chunksize=1, chunk i holds the single row labelled i.
        expected = DataFrame(expected, index=range(i, i + 1))
        tm.assert_frame_equal(result, expected)
| 398 | + |
| 399 | + |
def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
    """Check that an exception raised inside the callable reaches the caller."""
    # GH 5686
    csv_text = """a,b
1,2
2,3,4,5,6
3,4
"""
    buffer = StringIO(csv_text)
    msg = "This function is buggy."

    def bad_line_func(bad_line):
        # Always fail; the parser is expected to let this error surface.
        raise ValueError(msg)

    with pytest.raises(ValueError, match=msg):
        python_parser_only.read_csv(buffer, on_bad_lines=bad_line_func)
| 416 | + |
| 417 | + |
def test_on_bad_lines_callable_not_expected_length(python_parser_only):
    """Check the warning raised when the callable returns a row of wrong length.

    The callable hands the five-field row back unchanged. The read is expected
    to emit a ``ParserWarning`` matching "Length of header or names", and the
    resulting frame is expected to keep only the first two fields of that row.
    """
    # GH 5686
    csv_text = """a,b
1,2
2,3,4,5,6
3,4
"""
    buffer = StringIO(csv_text)
    expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})

    with tm.assert_produces_warning(ParserWarning, match="Length of header or names"):
        result = python_parser_only.read_csv(buffer, on_bad_lines=lambda x: x)

    tm.assert_frame_equal(result, expected)
| 432 | + |
| 433 | + |
def test_on_bad_lines_callable_returns_none(python_parser_only):
    """Check that a callable returning ``None`` drops the bad row.

    The expected frame contains only the two well-formed rows.
    """
    # GH 5686
    csv_text = """a,b
1,2
2,3,4,5,6
3,4
"""
    expected = DataFrame({"a": [1, 3], "b": [2, 4]})

    result = python_parser_only.read_csv(
        StringIO(csv_text), on_bad_lines=lambda x: None
    )

    tm.assert_frame_equal(result, expected)
| 447 | + |
| 448 | + |
def test_on_bad_lines_index_col_inferred(python_parser_only):
    """Check ``on_bad_lines`` when every row has one more field than the header.

    The expected frame uses the first field of each row as the index and
    contains none of the replacement values the callable would return.
    """
    # GH 5686
    csv_text = """a,b
1,2,3
4,5,6
"""
    expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])

    result = python_parser_only.read_csv(
        StringIO(csv_text), on_bad_lines=lambda x: ["99", "99"]
    )

    tm.assert_frame_equal(result, expected)
0 commit comments