
Commit cb43b6c

gfyoung authored and jreback committed
BUG: Parse NULL char as null value
Fixes a bug in the C parser in which the `NULL` character (`'\x00'`) was interpreted as a true line terminator, escape character, or comment character, because `'\x00'` is also used internally to indicate that the user did not specify those options. As a result, data containing this character was parsed incorrectly; it should instead be parsed as `NULL` (a missing value). Closes pandas-dev#14012.

Author: gfyoung <[email protected]>

Closes pandas-dev#14019 from gfyoung/null-char-parse and squashes the following commits:

5d39744 [gfyoung] BUG: Parse NULL char as null value
1 parent 0780443 commit cb43b6c
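
A minimal reproduction of the behavior this commit fixes, mirroring the new test added in c_parser_only.py below (pandas with the default C engine is assumed; the '\x00' byte in the second field is read back as a missing value):

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm
    from io import StringIO

    # The second row contains an embedded NULL byte ('\x00').
    data = "1,2,3\n4,\x00,6\n7,8,9"

    # With this fix, the C engine parses the '\x00' byte as missing data
    # instead of mistaking it for a line terminator, escape character,
    # or comment character.
    result = pd.read_csv(StringIO(data), names=["a", "b", "c"])

    expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]],
                            columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)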

File tree: 3 files changed (+37, -13 lines)

doc/source/whatsnew/v0.19.0.txt (+6, -5)

@@ -957,7 +957,8 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`)
 - Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`)
-- Bug in ``pd.read_csv()`` with ``engine='c'`` in which null ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
+- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`)
+- Bug in ``pd.read_csv()`` with ``engine='c'`` in which NULL ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`)
 - Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`)
 - Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`)
 - Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`)
@@ -970,8 +971,8 @@ Bug Fixes

 - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)

-- Bug ``Series.isnull`` and ``Series.notnull`` ignore ``Period('NaT')`` (:issue:`13737`)
-- Bug ``Series.fillna`` and ``Series.dropna`` don't affect to ``Period('NaT')`` (:issue:`13737`)
+- Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`)
+- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`)

 - Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`)
 - Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`)
@@ -1008,8 +1009,8 @@ Bug Fixes
 - Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`)
 - Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`)

-- Bug in ``isnull`` ``notnull`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)
-- Bug in ``.merge`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)
+- Bug in ``pd.isnull()`` ``pd.notnull()`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)
+- Bug in ``pd.merge()`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`)

 - Bug in ``HDFStore``/``read_hdf()`` discarded ``DatetimeIndex.name`` if ``tz`` was set (:issue:`13884`)


pandas/io/tests/parser/c_parser_only.py (+18)

@@ -543,3 +543,21 @@ def test_parse_trim_buffers(self):

         # Check for data corruption if there was no segfault
         tm.assert_frame_equal(result, expected)
+
+    def test_internal_null_byte(self):
+        # see gh-14012
+        #
+        # The null byte ('\x00') should not be used as a
+        # true line terminator, escape character, or comment
+        # character, only as a placeholder to indicate that
+        # none was specified.
+        #
+        # This test should be moved to common.py ONLY when
+        # Python's csv class supports parsing '\x00'.
+        names = ['a', 'b', 'c']
+        data = "1,2,3\n4,\x00,6\n7,8,9"
+        expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6],
+                                 [7, 8, 9]], columns=names)
+
+        result = self.read_csv(StringIO(data), names=names)
+        tm.assert_frame_equal(result, expected)

pandas/src/parser/tokenizer.c (+13, -8)

@@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {

 #define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))

-#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \
-                           c == '\n') || c == self->lineterminator)
+#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \
+                          (self->lineterminator != '\0' && \
+                           c == self->lineterminator))

 #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))

 // don't parse '\r' with a custom line terminator
 #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))

+#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar))
+
+#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
+
 #define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \
                                 self->skipinitialspace))

@@ -866,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
                    self->state = EAT_CRNL;
                }
                break;
-            } else if (c == self->commentchar) {
+            } else if (IS_COMMENT_CHAR(c)) {
                self->state = EAT_LINE_COMMENT;
                break;
            } else if (IS_WHITESPACE(c)) {
@@ -899,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
            } else if (IS_QUOTE(c)) {
                // start quoted field
                self->state = IN_QUOTED_FIELD;
-            } else if (c == self->escapechar) {
+            } else if (IS_ESCAPE_CHAR(c)) {
                // possible escaped character
                self->state = ESCAPED_CHAR;
            } else if (IS_SKIPPABLE_SPACE(c)) {
@@ -912,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
                    // save empty field
                    END_FIELD();
                }
-            } else if (c == self->commentchar) {
+            } else if (IS_COMMENT_CHAR(c)) {
                END_FIELD();
                self->state = EAT_COMMENT;
            } else {
@@ -950,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
            } else if (IS_CARRIAGE(c)) {
                END_FIELD();
                self->state = EAT_CRNL;
-            } else if (c == self->escapechar) {
+            } else if (IS_ESCAPE_CHAR(c)) {
                // possible escaped character
                self->state = ESCAPED_CHAR;
            } else if (IS_DELIMITER(c)) {
@@ -962,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
                } else {
                    self->state = START_FIELD;
                }
-            } else if (c == self->commentchar) {
+            } else if (IS_COMMENT_CHAR(c)) {
                END_FIELD();
                self->state = EAT_COMMENT;
            } else {
@@ -973,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

            case IN_QUOTED_FIELD:
                // in quoted field
-                if (c == self->escapechar) {
+                if (IS_ESCAPE_CHAR(c)) {
                    // possible escape character
                    self->state = ESCAPE_IN_QUOTED_FIELD;
                } else if (IS_QUOTE(c)) {

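The pattern the new macros encode, sketched in Python purely for illustration (the helper names below are hypothetical, not pandas or C parser API): an option left at the '\x00' sentinel means "not set", so a '\x00' byte coming from the data must never match it.

    SENTINEL = "\x00"  # value the C parser stores when an option is not set

    def is_comment_char(c, commentchar):
        # Only match when a comment character was actually configured.
        return commentchar != SENTINEL and c == commentchar

    def is_escape_char(c, escapechar):
        # Only match when an escape character was actually configured.
        return escapechar != SENTINEL and c == escapechar

    def is_terminator(c, lineterminator):
        # Default terminator is '\n'; a configured one replaces it.
        if lineterminator == SENTINEL:
            return c == "\n"
        return c == lineterminator

    # Before the fix, bare comparisons like `c == commentchar` made a '\x00'
    # data byte look like a comment or escape character whenever the option
    # was unset, corrupting the parse.
    assert not is_comment_char("\x00", SENTINEL)
    assert is_comment_char("#", "#")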