Skip to content

Commit 93dd57c

Browse files
committed
Merge remote-tracking branch 'upstream/master' into str_cat_set
2 parents 6f32d43 + 5e06c84 commit 93dd57c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+2379
-1200
lines changed

asv_bench/benchmarks/indexing.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,17 @@
22

33
import numpy as np
44
import pandas.util.testing as tm
5-
from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index,
6-
Float64Index, IntervalIndex, CategoricalIndex,
5+
from pandas import (Series, DataFrame, Panel, MultiIndex,
6+
Int64Index, UInt64Index, Float64Index,
7+
IntervalIndex, CategoricalIndex,
78
IndexSlice, concat, date_range)
89

910

1011
class NumericSeriesIndexing(object):
1112

1213
goal_time = 0.2
1314
params = [
14-
(Int64Index, Float64Index),
15+
(Int64Index, UInt64Index, Float64Index),
1516
('unique_monotonic_inc', 'nonunique_monotonic_inc'),
1617
]
1718
param_names = ['index_dtype', 'index_structure']
+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import numpy as np
2+
3+
from pandas._libs.index import (Int64Engine, UInt64Engine, Float64Engine,
4+
ObjectEngine)
5+
6+
7+
class NumericEngineIndexing(object):
8+
9+
goal_time = 0.2
10+
params = [[Int64Engine, UInt64Engine, Float64Engine],
11+
[np.int64, np.uint64, np.float64],
12+
['monotonic_incr', 'monotonic_decr', 'non_monotonic'],
13+
]
14+
param_names = ['engine', 'dtype', 'index_type']
15+
16+
def setup(self, engine, dtype, index_type):
17+
N = 10**5
18+
values = list([1] * N + [2] * N + [3] * N)
19+
arr = {
20+
'monotonic_incr': np.array(values, dtype=dtype),
21+
'monotonic_decr': np.array(list(reversed(values)),
22+
dtype=dtype),
23+
'non_monotonic': np.array([1, 2, 3] * N, dtype=dtype),
24+
}[index_type]
25+
26+
self.data = engine(lambda: arr, len(arr))
27+
# code below avoids populating the mapping etc. while timing.
28+
self.data.get_loc(2)
29+
30+
def time_get_loc(self, engine, dtype, index_type):
31+
self.data.get_loc(2)
32+
33+
34+
class ObjectEngineIndexing(object):
35+
36+
goal_time = 0.2
37+
params = [('monotonic_incr', 'monotonic_decr', 'non_monotonic')]
38+
param_names = ['index_type']
39+
40+
def setup(self, index_type):
41+
N = 10**5
42+
values = list('a' * N + 'b' * N + 'c' * N)
43+
arr = {
44+
'monotonic_incr': np.array(values, dtype=object),
45+
'monotonic_decr': np.array(list(reversed(values)), dtype=object),
46+
'non_monotonic': np.array(list('abc') * N, dtype=object),
47+
}[index_type]
48+
49+
self.data = ObjectEngine(lambda: arr, len(arr))
50+
# code below avoids populating the mapping etc. while timing.
51+
self.data.get_loc('b')
52+
53+
def time_get_loc(self, index_type):
54+
self.data.get_loc('b')

ci/azure-windows-36.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ channels:
55
dependencies:
66
- blosc
77
- bottleneck
8+
- boost-cpp<1.67
89
- fastparquet
910
- feather-format
1011
- matplotlib
1112
- numexpr
1213
- numpy=1.14*
1314
- openpyxl=2.5.5
15+
- parquet-cpp
1416
- pyarrow
1517
- pytables
1618
- python-dateutil

ci/code_checks.sh

+5
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
5656
cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime
5757
RET=$(($RET + $?)) ; echo $MSG "DONE"
5858

59+
# Imports - Check formatting using isort see setup.cfg for settings
60+
MSG='Check import format using isort ' ; echo $MSG
61+
isort --recursive --check-only pandas
62+
RET=$(($RET + $?)) ; echo $MSG "DONE"
63+
5964
fi
6065

6166
### PATTERNS ###

ci/environment-dev.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ dependencies:
88
- flake8
99
- flake8-comprehensions
1010
- hypothesis>=3.58.0
11+
- isort
1112
- moto
1213
- pytest>=3.6
1314
- python-dateutil>=2.5.0

ci/requirements_dev.txt

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ NumPy
55
flake8
66
flake8-comprehensions
77
hypothesis>=3.58.0
8+
isort
89
moto
910
pytest>=3.6
1011
python-dateutil>=2.5.0

ci/travis-36.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ dependencies:
1414
- geopandas
1515
- html5lib
1616
- ipython
17+
- isort
1718
- jinja2
1819
- lxml
1920
- matplotlib

doc/source/whatsnew/v0.24.0.txt

+43-2
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,8 @@ Other Enhancements
198198
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
199199
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
200200
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
201+
- :meth:`pandas.core.dtypes.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``,
202+
all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`)
201203
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
202204
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
203205
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
@@ -440,15 +442,15 @@ In addition to these API breaking changes, many :ref:`performance improvements a
440442
Raise ValueError in ``DataFrame.to_dict(orient='index')``
441443
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
442444

443-
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
445+
Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with
444446
``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`)
445447

446448
.. ipython:: python
447449
:okexcept:
448450

449451
df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A'])
450452
df
451-
453+
452454
df.to_dict(orient='index')
453455

454456
.. _whatsnew_0240.api.datetimelike.normalize:
@@ -626,6 +628,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
626628
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
627629
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
628630
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
631+
- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`)
629632
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
630633
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
631634
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
@@ -788,6 +791,7 @@ Categorical
788791
^^^^^^^^^^^
789792

790793
- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
794+
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
791795
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
792796
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
793797

@@ -923,6 +927,41 @@ MultiIndex
923927
I/O
924928
^^^
925929

930+
.. _whatsnew_0240.bug_fixes.nan_with_str_dtype:
931+
932+
Proper handling of `np.NaN` in a string data-typed column with the Python engine
933+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
934+
935+
There was bug in :func:`read_excel` and :func:`read_csv` with the Python
936+
engine, where missing values turned to ``'nan'`` with ``dtype=str`` and
937+
``na_filter=True``. Now, these missing values are converted to the string
938+
missing indicator, ``np.nan``. (:issue:`20377`)
939+
940+
.. ipython:: python
941+
:suppress:
942+
943+
from pandas.compat import StringIO
944+
945+
Previous Behavior:
946+
947+
.. code-block:: ipython
948+
949+
In [5]: data = 'a,b,c\n1,,3\n4,5,6'
950+
In [6]: df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
951+
In [7]: df.loc[0, 'b']
952+
Out[7]:
953+
'nan'
954+
955+
Current Behavior:
956+
957+
.. ipython:: python
958+
959+
data = 'a,b,c\n1,,3\n4,5,6'
960+
df = pd.read_csv(StringIO(data), engine='python', dtype=str, na_filter=True)
961+
df.loc[0, 'b']
962+
963+
Notice how we now output ``np.nan`` itself instead of a stringified form of it.
964+
926965
- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`)
927966
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
928967
- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`)
@@ -972,6 +1011,7 @@ Reshaping
9721011
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)
9731012
- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`)
9741013
- Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`)
1014+
- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`)
9751015

9761016
.. _whatsnew_0240.bug_fixes.sparse:
9771017

@@ -985,6 +1025,7 @@ Sparse
9851025
- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array.
9861026
- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`)
9871027
- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`)
1028+
- Bug in :meth:`SparseArray.unique` not returning the unique values (:issue:`19595`)
9881029

9891030
Build Changes
9901031
^^^^^^^^^^^^^

pandas/_libs/algos_common_helper.pxi.in

+14-17
Original file line numberDiff line numberDiff line change
@@ -16,33 +16,30 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1616

1717
{{py:
1818

19-
# name, c_type, dest_type, dest_dtype
20-
dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'),
21-
('float32', 'float32_t', 'float32_t', 'np.float32'),
22-
('int8', 'int8_t', 'float32_t', 'np.float32'),
23-
('int16', 'int16_t', 'float32_t', 'np.float32'),
24-
('int32', 'int32_t', 'float64_t', 'np.float64'),
25-
('int64', 'int64_t', 'float64_t', 'np.float64')]
19+
# name, c_type, dest_type
20+
dtypes = [('float64', 'float64_t', 'float64_t'),
21+
('float32', 'float32_t', 'float32_t'),
22+
('int8', 'int8_t', 'float32_t'),
23+
('int16', 'int16_t', 'float32_t'),
24+
('int32', 'int32_t', 'float64_t'),
25+
('int64', 'int64_t', 'float64_t')]
2626

2727
def get_dispatch(dtypes):
2828

29-
for name, c_type, dest_type, dest_dtype, in dtypes:
30-
31-
dest_type2 = dest_type
32-
dest_type = dest_type.replace('_t', '')
33-
34-
yield name, c_type, dest_type, dest_type2, dest_dtype
29+
for name, c_type, dest_type, in dtypes:
30+
dest_name = dest_type[:-2] # i.e. strip "_t"
31+
yield name, c_type, dest_type, dest_name
3532

3633
}}
3734

38-
{{for name, c_type, dest_type, dest_type2, dest_dtype
35+
{{for name, c_type, dest_type, dest_name
3936
in get_dispatch(dtypes)}}
4037

4138

4239
@cython.boundscheck(False)
4340
@cython.wraparound(False)
4441
def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
45-
ndarray[{{dest_type2}}, ndim=2] out,
42+
ndarray[{{dest_type}}, ndim=2] out,
4643
Py_ssize_t periods, int axis):
4744
cdef:
4845
Py_ssize_t i, j, sx, sy
@@ -84,9 +81,9 @@ def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
8481
out[i, j] = arr[i, j] - arr[i, j - periods]
8582

8683

87-
def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
84+
def put2d_{{name}}_{{dest_name}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
8885
ndarray[int64_t] indexer, Py_ssize_t loc,
89-
ndarray[{{dest_type2}}] out):
86+
ndarray[{{dest_type}}] out):
9087
cdef:
9188
Py_ssize_t i, j, k
9289

pandas/_libs/algos_rank_helper.pxi.in

+21-36
Original file line numberDiff line numberDiff line change
@@ -131,45 +131,20 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
131131
argsorted = _as.astype('i8')
132132

133133
{{if dtype == 'object'}}
134-
for i in range(n):
135-
sum_ranks += i + 1
136-
dups += 1
137-
isnan = sorted_mask[i]
138-
val = util.get_value_at(sorted_data, i)
139-
140-
if isnan and keep_na:
141-
ranks[argsorted[i]] = nan
142-
continue
143-
count += 1.0
144-
145-
if (i == n - 1 or
146-
are_diff(util.get_value_at(sorted_data, i + 1), val) or
147-
i == non_na_idx):
148-
if tiebreak == TIEBREAK_AVERAGE:
149-
for j in range(i - dups + 1, i + 1):
150-
ranks[argsorted[j]] = sum_ranks / dups
151-
elif tiebreak == TIEBREAK_MIN:
152-
for j in range(i - dups + 1, i + 1):
153-
ranks[argsorted[j]] = i - dups + 2
154-
elif tiebreak == TIEBREAK_MAX:
155-
for j in range(i - dups + 1, i + 1):
156-
ranks[argsorted[j]] = i + 1
157-
elif tiebreak == TIEBREAK_FIRST:
158-
raise ValueError('first not supported for non-numeric data')
159-
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
160-
for j in range(i - dups + 1, i + 1):
161-
ranks[argsorted[j]] = 2 * i - j - dups + 2
162-
elif tiebreak == TIEBREAK_DENSE:
163-
total_tie_count += 1
164-
for j in range(i - dups + 1, i + 1):
165-
ranks[argsorted[j]] = total_tie_count
166-
sum_ranks = dups = 0
134+
if True:
167135
{{else}}
168136
with nogil:
137+
{{endif}}
138+
# TODO: why does the 2d version not have a nogil block?
169139
for i in range(n):
170140
sum_ranks += i + 1
171141
dups += 1
142+
143+
{{if dtype == 'object'}}
144+
val = util.get_value_at(sorted_data, i)
145+
{{else}}
172146
val = sorted_data[i]
147+
{{endif}}
173148

174149
{{if dtype != 'uint64'}}
175150
isnan = sorted_mask[i]
@@ -180,8 +155,14 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
180155

181156
count += 1.0
182157

183-
if (i == n - 1 or sorted_data[i + 1] != val or
184-
i == non_na_idx):
158+
{{if dtype == 'object'}}
159+
if (i == n - 1 or
160+
are_diff(util.get_value_at(sorted_data, i + 1), val) or
161+
i == non_na_idx):
162+
{{else}}
163+
if (i == n - 1 or sorted_data[i + 1] != val or i == non_na_idx):
164+
{{endif}}
165+
185166
if tiebreak == TIEBREAK_AVERAGE:
186167
for j in range(i - dups + 1, i + 1):
187168
ranks[argsorted[j]] = sum_ranks / dups
@@ -192,8 +173,13 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
192173
for j in range(i - dups + 1, i + 1):
193174
ranks[argsorted[j]] = i + 1
194175
elif tiebreak == TIEBREAK_FIRST:
176+
{{if dtype == 'object'}}
177+
raise ValueError('first not supported for '
178+
'non-numeric data')
179+
{{else}}
195180
for j in range(i - dups + 1, i + 1):
196181
ranks[argsorted[j]] = j + 1
182+
{{endif}}
197183
elif tiebreak == TIEBREAK_FIRST_DESCENDING:
198184
for j in range(i - dups + 1, i + 1):
199185
ranks[argsorted[j]] = 2 * i - j - dups + 2
@@ -202,7 +188,6 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average',
202188
for j in range(i - dups + 1, i + 1):
203189
ranks[argsorted[j]] = total_tie_count
204190
sum_ranks = dups = 0
205-
{{endif}}
206191
if pct:
207192
if tiebreak == TIEBREAK_DENSE:
208193
return ranks / total_tie_count

0 commit comments

Comments
 (0)