Skip to content

Commit 5d39744

Browse files
committed
BUG: Parse NULL char as null value
Fixes a bug in the C parser in which the NULL character ('\x00') was being interpreted as a true line terminator, escape character, or comment character, because '\x00' was used internally to indicate that the user had not specified these values. As a result, if the data contained this character, it was incorrectly parsed. It should instead be parsed as NULL. Closes pandas-dev gh-14012.
1 parent 5d791cc commit 5d39744

File tree

3 files changed

+32
-8
lines changed

3 files changed

+32
-8
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,7 @@ Bug Fixes
882882

883883
- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`)
884884

885+
- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`)
885886
- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
886887
- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`)
887888
- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`)

pandas/io/tests/parser/c_parser_only.py

+18
Original file line numberDiff line numberDiff line change
@@ -543,3 +543,21 @@ def test_parse_trim_buffers(self):
543543

544544
# Check for data corruption if there was no segfault
545545
tm.assert_frame_equal(result, expected)
546+
547+
def test_internal_null_byte(self):
548+
# see gh-14012
549+
#
550+
# The null byte ('\x00') should not be used as a
551+
# true line terminator, escape character, or comment
552+
# character, only as a placeholder to indicate that
553+
# none was specified.
554+
#
555+
# This test should be moved to common.py ONLY when
556+
# Python's csv class supports parsing '\x00'.
557+
names = ['a', 'b', 'c']
558+
data = "1,2,3\n4,\x00,6\n7,8,9"
559+
expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
560+
[7, 8, 9]], columns=names)
561+
562+
result = self.read_csv(StringIO(data), names=names)
563+
tm.assert_frame_equal(result, expected)

pandas/src/parser/tokenizer.c

+13-8
Original file line numberDiff line numberDiff line change
@@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
684684

685685
#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
686686

687-
#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \
688-
c == '\n') || c == self->lineterminator)
687+
#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \
688+
(self->lineterminator != '\0' && \
689+
c == self->lineterminator))
689690

690691
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
691692

692693
// don't parse '\r' with a custom line terminator
693694
#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))
694695

696+
#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar))
697+
698+
#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
699+
695700
#define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \
696701
self->skipinitialspace))
697702

@@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
866871
self->state = EAT_CRNL;
867872
}
868873
break;
869-
} else if (c == self->commentchar) {
874+
} else if (IS_COMMENT_CHAR(c)) {
870875
self->state = EAT_LINE_COMMENT;
871876
break;
872877
} else if (IS_WHITESPACE(c)) {
@@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
899904
} else if (IS_QUOTE(c)) {
900905
// start quoted field
901906
self->state = IN_QUOTED_FIELD;
902-
} else if (c == self->escapechar) {
907+
} else if (IS_ESCAPE_CHAR(c)) {
903908
// possible escaped character
904909
self->state = ESCAPED_CHAR;
905910
} else if (IS_SKIPPABLE_SPACE(c)) {
@@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
912917
// save empty field
913918
END_FIELD();
914919
}
915-
} else if (c == self->commentchar) {
920+
} else if (IS_COMMENT_CHAR(c)) {
916921
END_FIELD();
917922
self->state = EAT_COMMENT;
918923
} else {
@@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
950955
} else if (IS_CARRIAGE(c)) {
951956
END_FIELD();
952957
self->state = EAT_CRNL;
953-
} else if (c == self->escapechar) {
958+
} else if (IS_ESCAPE_CHAR(c)) {
954959
// possible escaped character
955960
self->state = ESCAPED_CHAR;
956961
} else if (IS_DELIMITER(c)) {
@@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
962967
} else {
963968
self->state = START_FIELD;
964969
}
965-
} else if (c == self->commentchar) {
970+
} else if (IS_COMMENT_CHAR(c)) {
966971
END_FIELD();
967972
self->state = EAT_COMMENT;
968973
} else {
@@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
973978

974979
case IN_QUOTED_FIELD:
975980
// in quoted field
976-
if (c == self->escapechar) {
981+
if (IS_ESCAPE_CHAR(c)) {
977982
// possible escape character
978983
self->state = ESCAPE_IN_QUOTED_FIELD;
979984
} else if (IS_QUOTE(c)) {

0 commit comments

Comments
 (0)