Skip to content

Commit 5d39744

Browse files
committed
BUG: Parse NULL char as null value
Fixes a bug in the C parser in which the NULL character ('\x00') was being interpreted as a true line terminator, escape character, or comment character, because '\x00' was used internally to indicate that the user had not specified these values. As a result, if the data contained this character, it was incorrectly parsed. It should instead be parsed as NULL. Closes pandas-dev gh-14012.
1 parent 5d791cc commit 5d39744

File tree

3 files changed

+32
-8
lines changed

3 files changed

+32
-8
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,7 @@ Bug Fixes
882882

883883
- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`)
884884

885+
- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`)
885886
- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
886887
- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`)
887888
- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`)

pandas/io/tests/parser/c_parser_only.py

+18
Original file line numberDiff line numberDiff line change
@@ -543,3 +543,21 @@ def test_parse_trim_buffers(self):
543543

544544
# Check for data corruption if there was no segfault
545545
tm.assert_frame_equal(result, expected)
546+
547+
def test_internal_null_byte(self):
548+
# see gh-14012
549+
#
550+
# The null byte ('\x00') should not be used as a
551+
# true line terminator, escape character, or comment
552+
# character, only as a placeholder to indicate that
553+
# none was specified.
554+
#
555+
# This test should be moved to common.py ONLY when
556+
# Python's csv class supports parsing '\x00'.
557+
names = ['a', 'b', 'c']
558+
data = "1,2,3\n4,\x00,6\n7,8,9"
559+
expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
560+
[7, 8, 9]], columns=names)
561+
562+
result = self.read_csv(StringIO(data), names=names)
563+
tm.assert_frame_equal(result, expected)

pandas/src/parser/tokenizer.c

+13-8
Original file line numberDiff line numberDiff line change
@@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
684684

685685
#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
686686

687-
#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \
688-
c == '\n') || c == self->lineterminator)
687+
#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \
688+
(self->lineterminator != '\0' && \
689+
c == self->lineterminator))
689690

690691
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
691692

692693
// don't parse '\r' with a custom line terminator
693694
#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))
694695

696+
#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar))
697+
698+
#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
699+
695700
#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \
696701
self->skipinitialspace))
697702

@@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
866871
self->state = EAT_CRNL;
867872
}
868873
break;
869-
} else if (c == self->commentchar) {
874+
} else if (IS_COMMENT_CHAR(c)) {
870875
self->state = EAT_LINE_COMMENT;
871876
break;
872877
} else if (IS_WHITESPACE(c)) {
@@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
899904
} else if (IS_QUOTE(c)) {
900905
// start quoted field
901906
self->state = IN_QUOTED_FIELD;
902-
} else if (c == self->escapechar) {
907+
} else if (IS_ESCAPE_CHAR(c)) {
903908
// possible escaped character
904909
self->state = ESCAPED_CHAR;
905910
} else if (IS_SKIPPABLE_SPACE(c)) {
@@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
912917
// save empty field
913918
END_FIELD();
914919
}
915-
} else if (c == self->commentchar) {
920+
} else if (IS_COMMENT_CHAR(c)) {
916921
END_FIELD();
917922
self->state = EAT_COMMENT;
918923
} else {
@@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
950955
} else if (IS_CARRIAGE(c)) {
951956
END_FIELD();
952957
self->state = EAT_CRNL;
953-
} else if (c == self->escapechar) {
958+
} else if (IS_ESCAPE_CHAR(c)) {
954959
// possible escaped character
955960
self->state = ESCAPED_CHAR;
956961
} else if (IS_DELIMITER(c)) {
@@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
962967
} else {
963968
self->state = START_FIELD;
964969
}
965-
} else if (c == self->commentchar) {
970+
} else if (IS_COMMENT_CHAR(c)) {
966971
END_FIELD();
967972
self->state = EAT_COMMENT;
968973
} else {
@@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
973978

974979
case IN_QUOTED_FIELD:
975980
// in quoted field
976-
if (c == self->escapechar) {
981+
if (IS_ESCAPE_CHAR(c)) {
977982
// possible escape character
978983
self->state = ESCAPE_IN_QUOTED_FIELD;
979984
} else if (IS_QUOTE(c)) {

0 commit comments

Comments
 (0)