
Commit cb43b6c

gfyoung authored and jreback committed
BUG: Parse NULL char as null value
Fixes a bug in the C parser in which the `NULL` character (`'\x00'`) was interpreted as a true line terminator, escape character, or comment character, because `'\x00'` is also used internally to indicate that the user did not specify those options. As a result, data containing this character was parsed incorrectly; it should instead be parsed as `NULL` (a missing value). Closes pandas-dev#14012.

Author: gfyoung <[email protected]>

Closes pandas-dev#14019 from gfyoung/null-char-parse and squashes the following commits:

5d39744 [gfyoung] BUG: Parse NULL char as null value
1 parent 0780443 commit cb43b6c
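
A minimal reproduction of the behavior this commit fixes, mirroring the new test added in c_parser_only.py below (pandas with the default C engine is assumed; the '\x00' byte in the second field is read back as a missing value):

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm
    from io import StringIO

    # The second row contains an embedded NULL byte ('\x00').
    data = "1,2,3\n4,\x00,6\n7,8,9"

    # With this fix, the C engine parses the '\x00' byte as missing data
    # instead of mistaking it for a line terminator, escape character,
    # or comment character.
    result = pd.read_csv(StringIO(data), names=["a", "b", "c"])

    expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]],
                            columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)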

File tree: 3 files changed (+37, -13 lines)

doc/source/whatsnew/v0.19.0.txt (+6, -5)

@@ -957,7 +957,8 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
-- Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
+- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`)
+- Bug in ``pd.read_csv()`` with ``engine='c'`` in which NULL ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
 - Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
 - Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)
@@ -970,8 +971,8 @@ Bug Fixes

 - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)

-- Bug ``Series.isnull`` and ``Series.notnull`` ignore ``Period('NaT')`` (:issue:`13737`)
-- Bug ``Series.fillna`` and ``Series.dropna`` don't affect to ``Period('NaT')`` (:issue:`13737`)
+- Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`)
+- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`)

 - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`)
 - Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`)
@@ -1008,8 +1009,8 @@ Bug Fixes
 - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`)
 - Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`)

-- Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)
-- Bug in ``.merge`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)
+- Bug in ``pd.isnull()`` ``pd.notnull()`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)
+- Bug in ``pd.merge()`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)

 - Bug in ``HDFStore``/``read_hdf()`` discarded ``DatetimeIndex.name`` if ``tz`` was set (:issue:`13884`)


pandas/io/tests/parser/c_parser_only.py (+18)

@@ -543,3 +543,21 @@ def test_parse_trim_buffers(self):

         # Check for data corruption if there was no segfault
         tm.assert_frame_equal(result, expected)
+
+    def test_internal_null_byte(self):
+        # see gh-14012
+        #
+        # The null byte ('\x00') should not be used as a
+        # true line terminator, escape character, or comment
+        # character, only as a placeholder to indicate that
+        # none was specified.
+        #
+        # This test should be moved to common.py ONLY when
+        # Python's csv class supports parsing '\x00'.
+        names = ['a', 'b', 'c']
+        data = "1,2,3\n4,\x00,6\n7,8,9"
+        expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
+                                 [7, 8, 9]], columns=names)
+
+        result = self.read_csv(StringIO(data), names=names)
+        tm.assert_frame_equal(result, expected)

pandas/src/parser/tokenizer.c (+13, -8)

@@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {

 #define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))

-#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \
-                           c == '\n') || c == self->lineterminator)
+#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \
+                          (self->lineterminator != '\0' && \
+                           c == self->lineterminator))

 #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))

 // don't parse '\r' with a custom line terminator
 #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))

+#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar))
+
+#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
+
 #define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \
                                 self->skipinitialspace))

@@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
                    self->state = EAT_CRNL;
                }
                break;
-            } else if (c == self->commentchar) {
+            } else if (IS_COMMENT_CHAR(c)) {
                self->state = EAT_LINE_COMMENT;
                break;
            } else if (IS_WHITESPACE(c)) {
@@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
            } else if (IS_QUOTE(c)) {
                // start quoted field
                self->state = IN_QUOTED_FIELD;
-            } else if (c == self->escapechar) {
+            } else if (IS_ESCAPE_CHAR(c)) {
                // possible escaped character
                self->state = ESCAPED_CHAR;
            } else if (IS_SKIPPABLE_SPACE(c)) {
@@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
                    // save empty field
                    END_FIELD();
                }
-            } else if (c == self->commentchar) {
+            } else if (IS_COMMENT_CHAR(c)) {
                END_FIELD();
                self->state = EAT_COMMENT;
            } else {
@@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
            } else if (IS_CARRIAGE(c)) {
                END_FIELD();
                self->state = EAT_CRNL;
-            } else if (c == self->escapechar) {
+            } else if (IS_ESCAPE_CHAR(c)) {
                // possible escaped character
                self->state = ESCAPED_CHAR;
            } else if (IS_DELIMITER(c)) {
@@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
                } else {
                    self->state = START_FIELD;
                }
-            } else if (c == self->commentchar) {
+            } else if (IS_COMMENT_CHAR(c)) {
                END_FIELD();
                self->state = EAT_COMMENT;
            } else {
@@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

            case IN_QUOTED_FIELD:
                // in quoted field
-                if (c == self->escapechar) {
+                if (IS_ESCAPE_CHAR(c)) {
                    // possible escape character
                    self->state = ESCAPE_IN_QUOTED_FIELD;
                } else if (IS_QUOTE(c)) {

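The pattern the new macros encode, sketched in Python purely for illustration (the helper names below are hypothetical, not pandas or C parser API): an option left at the '\x00' sentinel means "not set", so a '\x00' byte coming from the data must never match it.

    SENTINEL = "\x00"  # value the C parser stores when an option is not set

    def is_comment_char(c, commentchar):
        # Only match when a comment character was actually configured.
        return commentchar != SENTINEL and c == commentchar

    def is_escape_char(c, escapechar):
        # Only match when an escape character was actually configured.
        return escapechar != SENTINEL and c == escapechar

    def is_terminator(c, lineterminator):
        # Default terminator is '\n'; a configured one replaces it.
        if lineterminator == SENTINEL:
            return c == "\n"
        return c == lineterminator

    # Before the fix, bare comparisons like `c == commentchar` made a '\x00'
    # data byte look like a comment or escape character whenever the option
    # was unset, corrupting the parse.
    assert not is_comment_char("\x00", SENTINEL)
    assert is_comment_char("#", "#")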