Skip to content

Commit 43dffec

Browse files
committed
moved '_concat_date_cols' and 'convert_to_unicode' from lib.pyx to parsing.pyx
1 parent b45df3f commit 43dffec

File tree

5 files changed

+131
-125
lines changed

5 files changed

+131
-125
lines changed

asv_bench/benchmarks/io/parsers.py

+2-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
import numpy as np
22

3-
from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
4-
5-
from pandas.io.parsers import _concat_date_cols
3+
from pandas._libs.tslibs.parsing import (
4+
_concat_date_cols, _does_string_look_like_datetime)
65

76

87
class DoesStringLookLikeDatetime(object):

pandas/_libs/lib.pyx

-114
Original file line numberDiff line numberDiff line change
@@ -2311,117 +2311,3 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
23112311
output[i] = default
23122312

23132313
return maybe_convert_objects(output)
2314-
2315-
2316-
@cython.wraparound(False)
2317-
@cython.boundscheck(False)
2318-
cdef inline object convert_to_unicode(object item,
2319-
bint keep_trivial_numbers):
2320-
"""
2321-
Convert `item` to str.
2322-
2323-
Parameters
2324-
----------
2325-
item : object
2326-
keep_trivial_numbers : bool
2327-
if True, then conversion (to string from integer/float zero)
2328-
is not performed
2329-
2330-
Returns
2331-
-------
2332-
str or int or float
2333-
"""
2334-
cdef:
2335-
float64_t float_item
2336-
2337-
if keep_trivial_numbers:
2338-
if isinstance(item, int):
2339-
if <int>item == 0:
2340-
return item
2341-
elif isinstance(item, float):
2342-
float_item = item
2343-
if float_item == 0.0 or float_item != float_item:
2344-
return item
2345-
2346-
if not isinstance(item, str):
2347-
item = PyObject_Str(item)
2348-
2349-
return item
2350-
2351-
2352-
@cython.wraparound(False)
2353-
@cython.boundscheck(False)
2354-
def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True):
2355-
"""
2356-
Concatenates elements from numpy arrays in `date_cols` into strings.
2357-
2358-
Parameters
2359-
----------
2360-
date_cols : tuple of numpy arrays
2361-
keep_trivial_numbers : bool, default True
2362-
if True and len(date_cols) == 1, then
2363-
conversion (to string from integer/float zero) is not performed
2364-
2365-
Returns
2366-
-------
2367-
arr_of_rows : ndarray (dtype=object)
2368-
2369-
Examples
2370-
--------
2371-
>>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
2372-
>>> times=np.array(['11:20', '10:45'], dtype=object)
2373-
>>> result = _concat_date_cols((dates, times))
2374-
>>> result
2375-
array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
2376-
"""
2377-
cdef:
2378-
Py_ssize_t rows_count = 0, col_count = len(date_cols)
2379-
Py_ssize_t col_idx, row_idx
2380-
list list_to_join
2381-
cnp.ndarray[object] iters
2382-
object[::1] iters_view
2383-
flatiter it
2384-
cnp.ndarray[object] result
2385-
object[:] result_view
2386-
2387-
if col_count == 0:
2388-
return np.zeros(0, dtype=object)
2389-
2390-
if not all(util.is_array(array) for array in date_cols):
2391-
raise ValueError("not all elements from date_cols are numpy arrays")
2392-
2393-
rows_count = min(len(array) for array in date_cols)
2394-
result = np.zeros(rows_count, dtype=object)
2395-
result_view = result
2396-
2397-
if col_count == 1:
2398-
array = date_cols[0]
2399-
it = <flatiter>PyArray_IterNew(array)
2400-
for row_idx in range(rows_count):
2401-
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
2402-
result_view[row_idx] = convert_to_unicode(item,
2403-
keep_trivial_numbers)
2404-
PyArray_ITER_NEXT(it)
2405-
else:
2406-
# create fixed size list - more effecient memory allocation
2407-
list_to_join = [None] * col_count
2408-
iters = np.zeros(col_count, dtype=object)
2409-
2410-
# create memoryview of iters ndarray, that will contain some
2411-
# flatiter's for each array in `date_cols` - more effecient indexing
2412-
iters_view = iters
2413-
for col_idx, array in enumerate(date_cols):
2414-
iters_view[col_idx] = PyArray_IterNew(array)
2415-
2416-
# array elements that are on the same line are converted to one string
2417-
for row_idx in range(rows_count):
2418-
for col_idx, array in enumerate(date_cols):
2419-
# this cast is needed, because we did not find a way
2420-
# to efficiently store `flatiter` type objects in ndarray
2421-
it = <flatiter>iters_view[col_idx]
2422-
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
2423-
list_to_join[col_idx] = convert_to_unicode(item, False)
2424-
PyArray_ITER_NEXT(it)
2425-
result_view[row_idx] = PyUnicode_Join(' ', list_to_join)
2426-
2427-
return result

pandas/_libs/tslibs/parsing.pyx

+123-1
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,19 @@ from io import StringIO
77

88
from libc.string cimport strchr
99

10+
import cython
11+
12+
from cpython cimport PyObject_Str, PyUnicode_Join
13+
1014
from cpython.datetime cimport datetime, datetime_new, import_datetime
1115
from cpython.version cimport PY_VERSION_HEX
1216
import_datetime()
1317

1418
import numpy as np
19+
cimport numpy as cnp
20+
from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT,
21+
PyArray_IterNew, flatiter, float64_t)
22+
cnp.import_array()
1523

1624
# dateutil compat
1725
from dateutil.tz import (tzoffset,
@@ -26,7 +34,7 @@ from pandas._config import get_option
2634

2735
from pandas._libs.tslibs.ccalendar import MONTH_NUMBERS
2836
from pandas._libs.tslibs.nattype import nat_strings, NaT
29-
from pandas._libs.tslibs.util cimport get_c_string_buf_and_size
37+
from pandas._libs.tslibs.util cimport is_array, get_c_string_buf_and_size
3038

3139
cdef extern from "../src/headers/portable.h":
3240
int getdigit_ascii(char c, int default) nogil
@@ -880,3 +888,117 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse,
880888
return guessed_format
881889
else:
882890
return None
891+
892+
893+
@cython.wraparound(False)
894+
@cython.boundscheck(False)
895+
cdef inline object convert_to_unicode(object item,
896+
bint keep_trivial_numbers):
897+
"""
898+
Convert `item` to str.
899+
900+
Parameters
901+
----------
902+
item : object
903+
keep_trivial_numbers : bool
904+
if True, then conversion (to string from integer/float zero)
905+
is not performed
906+
907+
Returns
908+
-------
909+
str or int or float
910+
"""
911+
cdef:
912+
float64_t float_item
913+
914+
if keep_trivial_numbers:
915+
if isinstance(item, int):
916+
if <int>item == 0:
917+
return item
918+
elif isinstance(item, float):
919+
float_item = item
920+
if float_item == 0.0 or float_item != float_item:
921+
return item
922+
923+
if not isinstance(item, str):
924+
item = PyObject_Str(item)
925+
926+
return item
927+
928+
929+
@cython.wraparound(False)
930+
@cython.boundscheck(False)
931+
def _concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True):
932+
"""
933+
Concatenates elements from numpy arrays in `date_cols` into strings.
934+
935+
Parameters
936+
----------
937+
date_cols : tuple of numpy arrays
938+
keep_trivial_numbers : bool, default True
939+
if True and len(date_cols) == 1, then
940+
conversion (to string from integer/float zero) is not performed
941+
942+
Returns
943+
-------
944+
arr_of_rows : ndarray (dtype=object)
945+
946+
Examples
947+
--------
948+
>>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
949+
>>> times=np.array(['11:20', '10:45'], dtype=object)
950+
>>> result = _concat_date_cols((dates, times))
951+
>>> result
952+
array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
953+
"""
954+
cdef:
955+
Py_ssize_t rows_count = 0, col_count = len(date_cols)
956+
Py_ssize_t col_idx, row_idx
957+
list list_to_join
958+
cnp.ndarray[object] iters
959+
object[::1] iters_view
960+
flatiter it
961+
cnp.ndarray[object] result
962+
object[:] result_view
963+
964+
if col_count == 0:
965+
return np.zeros(0, dtype=object)
966+
967+
if not all(is_array(array) for array in date_cols):
968+
raise ValueError("not all elements from date_cols are numpy arrays")
969+
970+
rows_count = min(len(array) for array in date_cols)
971+
result = np.zeros(rows_count, dtype=object)
972+
result_view = result
973+
974+
if col_count == 1:
975+
array = date_cols[0]
976+
it = <flatiter>PyArray_IterNew(array)
977+
for row_idx in range(rows_count):
978+
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
979+
result_view[row_idx] = convert_to_unicode(item,
980+
keep_trivial_numbers)
981+
PyArray_ITER_NEXT(it)
982+
else:
983+
# create fixed size list - more efficient memory allocation
984+
list_to_join = [None] * col_count
985+
iters = np.zeros(col_count, dtype=object)
986+
987+
# create memoryview of iters ndarray, that will contain some
988+
# flatiters for each array in `date_cols` - more efficient indexing
989+
iters_view = iters
990+
for col_idx, array in enumerate(date_cols):
991+
iters_view[col_idx] = PyArray_IterNew(array)
992+
993+
# array elements that are on the same line are converted to one string
994+
for row_idx in range(rows_count):
995+
for col_idx, array in enumerate(date_cols):
996+
# this cast is needed, because we did not find a way
997+
# to efficiently store `flatiter` type objects in ndarray
998+
it = <flatiter>iters_view[col_idx]
999+
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
1000+
list_to_join[col_idx] = convert_to_unicode(item, False)
1001+
PyArray_ITER_NEXT(it)
1002+
result_view[row_idx] = PyUnicode_Join(' ', list_to_join)
1003+
1004+
return result

pandas/io/parsers.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
import numpy as np
1515

1616
import pandas._libs.lib as lib
17-
from pandas._libs.lib import _concat_date_cols
1817
import pandas._libs.ops as libops
1918
import pandas._libs.parsers as parsers
2019
from pandas._libs.tslibs import parsing
@@ -3187,7 +3186,7 @@ def _make_date_converter(date_parser=None, dayfirst=False,
31873186
infer_datetime_format=False, cache_dates=True):
31883187
def converter(*date_cols):
31893188
if date_parser is None:
3190-
strs = _concat_date_cols(date_cols)
3189+
strs = parsing._concat_date_cols(date_cols)
31913190

31923191
try:
31933192
return tools.to_datetime(
@@ -3217,10 +3216,10 @@ def converter(*date_cols):
32173216
except Exception:
32183217
try:
32193218
return tools.to_datetime(
3220-
parsing.try_parse_dates(_concat_date_cols(date_cols),
3221-
parser=date_parser,
3222-
dayfirst=dayfirst),
3223-
cache=cache_dates,
3219+
parsing.try_parse_dates(
3220+
parsing._concat_date_cols(date_cols),
3221+
parser=date_parser,
3222+
dayfirst=dayfirst),
32243223
errors='ignore')
32253224
except Exception:
32263225
return generic_parser(date_parser, *date_cols)

pandas/tests/io/parser/test_parse_dates.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def date_parser(*date_cols):
7676
-------
7777
parsed : Series
7878
"""
79-
return parsing.try_parse_dates(parsers._concat_date_cols(date_cols))
79+
return parsing.try_parse_dates(parsing._concat_date_cols(date_cols))
8080

8181
result = parser.read_csv(StringIO(data), header=None,
8282
date_parser=date_parser, prefix="X",

0 commit comments

Comments
 (0)