Skip to content

Commit c17b84a

Browse files
authored
CLN: remove unused c_encoding (#40342)
1 parent 0e1a6c1 commit c17b84a

File tree

1 file changed

+16
-113
lines changed

1 file changed

+16
-113
lines changed

pandas/_libs/parsers.pyx

+16-113
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ from cpython.ref cimport (
3636
from cpython.unicode cimport (
3737
PyUnicode_AsUTF8String,
3838
PyUnicode_Decode,
39+
PyUnicode_DecodeUTF8,
3940
)
4041

4142

@@ -321,7 +322,6 @@ cdef class TextReader:
321322
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
322323
uint64_t parser_start
323324
list clocks
324-
char *c_encoding
325325
const char *encoding_errors
326326
kh_str_starts_t *false_set
327327
kh_str_starts_t *true_set
@@ -381,7 +381,6 @@ cdef class TextReader:
381381
encoding_errors=b"strict"):
382382

383383
# set encoding for native Python and C library
384-
self.c_encoding = NULL
385384
if isinstance(encoding_errors, str):
386385
encoding_errors = encoding_errors.encode("utf-8")
387386
Py_INCREF(encoding_errors)
@@ -638,7 +637,6 @@ cdef class TextReader:
638637
char *word
639638
object name, old_name
640639
uint64_t hr, data_line = 0
641-
StringPath path = _string_path(self.c_encoding)
642640
list header = []
643641
set unnamed_cols = set()
644642

@@ -678,8 +676,8 @@ cdef class TextReader:
678676
for i in range(field_count):
679677
word = self.parser.words[start + i]
680678

681-
name = PyUnicode_Decode(word, strlen(word),
682-
self.c_encoding, self.encoding_errors)
679+
name = PyUnicode_DecodeUTF8(word, strlen(word),
680+
self.encoding_errors)
683681

684682
# We use this later when collecting placeholder names.
685683
old_name = name
@@ -987,8 +985,7 @@ cdef class TextReader:
987985
f"for column {name} - only the converter will "
988986
f"be used"), ParserWarning,
989987
stacklevel=5)
990-
results[i] = _apply_converter(conv, self.parser, i, start, end,
991-
self.c_encoding)
988+
results[i] = _apply_converter(conv, self.parser, i, start, end)
992989
continue
993990

994991
# Collect the list of NaN values associated with the column.
@@ -1102,8 +1099,7 @@ cdef class TextReader:
11021099
# TODO: I suspect that _categorical_convert could be
11031100
# optimized when dtype is an instance of CategoricalDtype
11041101
codes, cats, na_count = _categorical_convert(
1105-
self.parser, i, start, end, na_filter,
1106-
na_hashset, self.c_encoding)
1102+
self.parser, i, start, end, na_filter, na_hashset)
11071103

11081104
# Method accepts list of strings, not encoded ones.
11091105
true_values = [x.decode() for x in self.true_values]
@@ -1199,14 +1195,8 @@ cdef class TextReader:
11991195
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
12001196
bint na_filter, kh_str_starts_t *na_hashset):
12011197

1202-
cdef StringPath path = _string_path(self.c_encoding)
1203-
1204-
if path == UTF8:
1205-
return _string_box_utf8(self.parser, i, start, end, na_filter,
1206-
na_hashset, self.encoding_errors)
1207-
elif path == ENCODED:
1208-
return _string_box_decode(self.parser, i, start, end,
1209-
na_filter, na_hashset, self.c_encoding)
1198+
return _string_box_utf8(self.parser, i, start, end, na_filter,
1199+
na_hashset, self.encoding_errors)
12101200

12111201
def _get_converter(self, i, name):
12121202
if self.converters is None:
@@ -1336,18 +1326,6 @@ def _maybe_upcast(arr):
13361326
return arr
13371327

13381328

1339-
cdef enum StringPath:
1340-
UTF8
1341-
ENCODED
1342-
1343-
1344-
# factored out logic to pick string converter
1345-
cdef inline StringPath _string_path(char *encoding):
1346-
if encoding != NULL and encoding != b"utf-8":
1347-
return ENCODED
1348-
return UTF8
1349-
1350-
13511329
# ----------------------------------------------------------------------
13521330
# Type conversions / inference support code
13531331

@@ -1406,68 +1384,10 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
14061384
return result, na_count
14071385

14081386

1409-
cdef _string_box_decode(parser_t *parser, int64_t col,
1410-
int64_t line_start, int64_t line_end,
1411-
bint na_filter, kh_str_starts_t *na_hashset,
1412-
char *encoding):
1413-
cdef:
1414-
int na_count = 0
1415-
Py_ssize_t i, size, lines
1416-
coliter_t it
1417-
const char *word = NULL
1418-
ndarray[object] result
1419-
1420-
int ret = 0
1421-
kh_strbox_t *table
1422-
1423-
char *errors = "strict"
1424-
1425-
object pyval
1426-
1427-
object NA = na_values[np.object_]
1428-
khiter_t k
1429-
1430-
table = kh_init_strbox()
1431-
lines = line_end - line_start
1432-
result = np.empty(lines, dtype=np.object_)
1433-
coliter_setup(&it, parser, col, line_start)
1434-
1435-
for i in range(lines):
1436-
COLITER_NEXT(it, word)
1437-
1438-
if na_filter:
1439-
if kh_get_str_starts_item(na_hashset, word):
1440-
# in the hash table
1441-
na_count += 1
1442-
result[i] = NA
1443-
continue
1444-
1445-
k = kh_get_strbox(table, word)
1446-
1447-
# in the hash table
1448-
if k != table.n_buckets:
1449-
# this increments the refcount, but need to test
1450-
pyval = <object>table.vals[k]
1451-
else:
1452-
# box it. new ref?
1453-
size = strlen(word)
1454-
pyval = PyUnicode_Decode(word, size, encoding, errors)
1455-
1456-
k = kh_put_strbox(table, word, &ret)
1457-
table.vals[k] = <PyObject *>pyval
1458-
1459-
result[i] = pyval
1460-
1461-
kh_destroy_strbox(table)
1462-
1463-
return result, na_count
1464-
1465-
14661387
@cython.boundscheck(False)
14671388
cdef _categorical_convert(parser_t *parser, int64_t col,
14681389
int64_t line_start, int64_t line_end,
1469-
bint na_filter, kh_str_starts_t *na_hashset,
1470-
char *encoding):
1390+
bint na_filter, kh_str_starts_t *na_hashset):
14711391
"Convert column data into codes, categories"
14721392
cdef:
14731393
int na_count = 0
@@ -1480,7 +1400,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
14801400
int64_t current_category = 0
14811401

14821402
char *errors = "strict"
1483-
StringPath path = _string_path(encoding)
14841403

14851404
int ret = 0
14861405
kh_str_t *table
@@ -1516,16 +1435,9 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
15161435

15171436
# parse and box categories to python strings
15181437
result = np.empty(table.n_occupied, dtype=np.object_)
1519-
if path == ENCODED:
1520-
for k in range(table.n_buckets):
1521-
if kh_exist_str(table, k):
1522-
size = strlen(table.keys[k])
1523-
result[table.vals[k]] = PyUnicode_Decode(
1524-
table.keys[k], size, encoding, errors)
1525-
elif path == UTF8:
1526-
for k in range(table.n_buckets):
1527-
if kh_exist_str(table, k):
1528-
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1438+
for k in range(table.n_buckets):
1439+
if kh_exist_str(table, k):
1440+
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
15291441

15301442
kh_destroy_str(table)
15311443
return np.asarray(codes), result, na_count
@@ -2064,13 +1976,11 @@ for k in list(na_values):
20641976

20651977

20661978
cdef _apply_converter(object f, parser_t *parser, int64_t col,
2067-
int64_t line_start, int64_t line_end,
2068-
char* c_encoding):
1979+
int64_t line_start, int64_t line_end):
20691980
cdef:
20701981
Py_ssize_t i, lines
20711982
coliter_t it
20721983
const char *word = NULL
2073-
char *errors = "strict"
20741984
ndarray[object] result
20751985
object val
20761986

@@ -2079,17 +1989,10 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
20791989

20801990
coliter_setup(&it, parser, col, line_start)
20811991

2082-
if c_encoding == NULL or c_encoding == b'utf-8':
2083-
for i in range(lines):
2084-
COLITER_NEXT(it, word)
2085-
val = PyUnicode_FromString(word)
2086-
result[i] = f(val)
2087-
else:
2088-
for i in range(lines):
2089-
COLITER_NEXT(it, word)
2090-
val = PyUnicode_Decode(word, strlen(word),
2091-
c_encoding, errors)
2092-
result[i] = f(val)
1992+
for i in range(lines):
1993+
COLITER_NEXT(it, word)
1994+
val = PyUnicode_FromString(word)
1995+
result[i] = f(val)
20931996

20941997
return lib.maybe_convert_objects(result)
20951998

0 commit comments

Comments
 (0)