Clean Up src/parsers (#26445)

WillAyd · jreback · commit 80bddaf79693 · 2019-05-18T12:15:54.000-04:00
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -149,9 +149,6 @@ cdef extern from "parser/tokenizer.h":
         int skipinitialspace       # ignore spaces following delimiter? */
         int quoting                # style of quoting to write */
 
-        # hmm =/
-        # int numeric_field
-
         char commentchar
         int allow_embedded_newline
         int strict                 # raise exception on bad CSV */
diff --git a/pandas/_libs/src/parser/io.h b/pandas/_libs/src/parser/io.h
@@ -10,7 +10,8 @@ The full license is in the LICENSE file, distributed with this software.
 #ifndef PANDAS__LIBS_SRC_PARSER_IO_H_
 #define PANDAS__LIBS_SRC_PARSER_IO_H_
 
-#include "Python.h"
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
 #include "tokenizer.h"
 
 typedef struct _file_source {
@@ -37,8 +38,6 @@ typedef struct _memory_map {
     size_t position;
 } memory_map;
 
-#define MM(src) ((memory_map *)src)
-
 void *new_mmap(char *fname);
 
 int del_mmap(void *src);
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -131,8 +131,6 @@ void parser_set_default_options(parser_t *self) {
     self->skip_footer = 0;
 }
 
-int get_parser_memory_footprint(parser_t *self) { return 0; }
-
 parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); }
 
 int parser_clear_data_buffers(parser_t *self) {
@@ -1426,21 +1424,6 @@ PANDAS_INLINE void uppercase(char *p) {
     for (; *p; ++p) *p = toupper_ascii(*p);
 }
 
-int PANDAS_INLINE to_longlong(char *item, long long *p_value) {
-    char *p_end;
-
-    // Try integer conversion.  We explicitly give the base to be 10. If
-    // we used 0, strtoll() would convert '012' to 10, because the leading 0 in
-    // '012' signals an octal number in C.  For a general purpose reader, that
-    // would be a bug, not a feature.
-    *p_value = strtoll(item, &p_end, 10);
-
-    // Allow trailing spaces.
-    while (isspace_ascii(*p_end)) ++p_end;
-
-    return (errno == 0) && (!*p_end);
-}
-
 int to_boolean(const char *item, uint8_t *val) {
     char *tmp;
     int i, status = 0;
@@ -1474,24 +1457,6 @@ int to_boolean(const char *item, uint8_t *val) {
     return status;
 }
 
-#ifdef TEST
-
-int main(int argc, char *argv[]) {
-    double x, y;
-    long long xi;
-    int status;
-    char *s;
-
-    s = "123,789";
-    status = to_longlong_thousands(s, &xi, ',');
-    printf("s = '%s'\n", s);
-    printf("status = %d\n", status);
-    printf("x = %d\n", (int)xi);
-
-    return 0;
-}
-#endif  // TEST
-
 // ---------------------------------------------------------------------------
 // Implementation of xstrtod
 
diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h
@@ -12,14 +12,8 @@ See LICENSE for the license
 #ifndef PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
 #define PANDAS__LIBS_SRC_PARSER_TOKENIZER_H_
 
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include "Python.h"
-
-#include <ctype.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
 
 #define ERROR_OK 0
 #define ERROR_NO_DIGITS 1
@@ -31,9 +25,6 @@ See LICENSE for the license
 
 #include "khash.h"
 
-#define CHUNKSIZE 1024 * 256
-#define KB 1024
-#define MB 1024 * KB
 #define STREAM_INIT_SIZE 32
 
 #define REACHED_EOF 1
@@ -50,25 +41,10 @@ See LICENSE for the license
 
  */
 
-#define FALSE 0
-#define TRUE 1
-
-// Maximum number of columns in a file.
-#define MAX_NUM_COLUMNS 2000
-
-// Maximum number of characters in single field.
-#define FIELD_BUFFER_SIZE 2000
-
 /*
  *  Common set of error types for the read_rows() and tokenize()
  *  functions.
  */
-#define ERROR_OUT_OF_MEMORY 1
-#define ERROR_INVALID_COLUMN_INDEX 10
-#define ERROR_CHANGED_NUMBER_OF_FIELDS 12
-#define ERROR_TOO_MANY_CHARS 21
-#define ERROR_TOO_MANY_FIELDS 22
-#define ERROR_NO_DATA 23
 
 // #define VERBOSE
 #if defined(VERBOSE)
@@ -84,12 +60,6 @@ See LICENSE for the license
  *      of some file I/O.
  */
 
-/*
- *  WORD_BUFFER_SIZE determines the maximum amount of non-delimiter
- *  text in a row.
- */
-#define WORD_BUFFER_SIZE 4000
-
 typedef enum {
     START_RECORD,
     START_FIELD,
@@ -164,9 +134,6 @@ typedef struct parser_t {
     int skipinitialspace; /* ignore spaces following delimiter? */
     int quoting;          /* style of quoting to write */
 
-    // krufty, hmm =/
-    int numeric_field;
-
     char commentchar;
     int allow_embedded_newline;
     int strict; /* raise exception on bad CSV */
@@ -191,7 +158,7 @@ typedef struct parser_t {
     void *skipset;
     PyObject *skipfunc;
     int64_t skip_first_N_rows;
-    int skip_footer;
+    int64_t skip_footer;
     // pick one, depending on whether the converter requires GIL
     double (*double_converter_nogil)(const char *, char **,
                                      char, char, char, int);
@@ -208,7 +175,7 @@ typedef struct parser_t {
 typedef struct coliter_t {
     char **words;
     int64_t *line_start;
-    int col;
+    int64_t col;
 } coliter_t;
 
 void coliter_setup(coliter_t *self, parser_t *parser, int i, int start);