Skip to content

CLN: Use defaultdict for minor optimization #32209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 35 additions & 56 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,83 +31,62 @@ def time_maybe_convert_objects(self):

class Factorize:

params = [[True, False], ["int", "uint", "float", "string"]]
param_names = ["sort", "dtype"]

def setup(self, sort, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N).repeat(5)),
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
"string": tm.makeStringIndex(N).repeat(5),
}
self.idx = data[dtype]

def time_factorize(self, sort, dtype):
self.idx.factorize(sort=sort)


class FactorizeUnique:

params = [[True, False], ["int", "uint", "float", "string"]]
param_names = ["sort", "dtype"]
params = [
[True, False],
[True, False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
]
param_names = ["unique", "sort", "dtype"]

def setup(self, sort, dtype):
def setup(self, unique, sort, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.arange(N)),
"float": pd.Float64Index(np.random.randn(N)),
"string": tm.makeStringIndex(N),
}
self.idx = data[dtype]
assert self.idx.is_unique

def time_factorize(self, sort, dtype):
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data

def time_factorize(self, unique, sort, dtype):
self.idx.factorize(sort=sort)


class Duplicated:

params = [["first", "last", False], ["int", "uint", "float", "string"]]
param_names = ["keep", "dtype"]

def setup(self, keep, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N).repeat(5)),
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
"string": tm.makeStringIndex(N).repeat(5),
}
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique

def time_duplicated(self, keep, dtype):
self.idx.duplicated(keep=keep)


class DuplicatedUniqueIndex:

params = ["int", "uint", "float", "string"]
param_names = ["dtype"]
params = [
[True, False],
["first", "last", False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
]
param_names = ["unique", "keep", "dtype"]

def setup(self, dtype):
def setup(self, unique, keep, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.random.randn(N)),
"string": tm.makeStringIndex(N),
}
self.idx = data[dtype]
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data
# cache is_unique
self.idx.is_unique

def time_duplicated_unique(self, dtype):
self.idx.duplicated()
def time_duplicated(self, unique, keep, dtype):
self.idx.duplicated(keep=keep)


class Hashing:
Expand Down
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,6 @@ def setup(self):
def time_get_loc(self):
self.index.get_loc(self.category)

def time_shape(self):
self.index.shape

def time_shallow_copy(self):
self.index._shallow_copy()

Expand Down
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/index_cached_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class IndexCache:

params = [
[
"CategoricalIndex",
"DatetimeIndex",
"Float64Index",
"IntervalIndex",
Expand Down Expand Up @@ -42,6 +43,8 @@ def setup(self, index_type):
self.idx = pd.Float64Index(range(N))
elif index_type == "UInt64Index":
self.idx = pd.UInt64Index(range(N))
elif index_type == "CategoricalIndex":
self.idx = pd.CategoricalIndex(range(N), range(N))
else:
raise ValueError
assert len(self.idx) == N
Expand Down
8 changes: 0 additions & 8 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,6 @@ def time_datetime_difference_disjoint(self):
self.datetime_left.difference(self.datetime_right)


class Datetime:
def setup(self):
self.dr = date_range("20000101", freq="D", periods=10000)

def time_is_dates_only(self):
self.dr._is_dates_only


class Range:
def setup(self):
self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)
Expand Down
5 changes: 5 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
These benchmarks are for Series and DataFrame indexing methods. For the
lower-level methods directly on Index and subclasses, see index_object.py,
indexing_engine.py, and index_cached.py
"""
import warnings

import numpy as np
Expand Down
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,6 @@ def setup(self):
def time_get_loc(self):
self.index.get_loc(self.period)

def time_shape(self):
self.index.shape

def time_shallow_copy(self):
self.index._shallow_copy()

Expand Down
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,6 @@ def setup(self):
def time_get_loc(self):
self.index.get_loc(self.timedelta)

def time_shape(self):
self.index.shape

def time_shallow_copy(self):
self.index._shallow_copy()

Expand Down
17 changes: 3 additions & 14 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def time_to_date(self, index_type):
def time_to_pydatetime(self, index_type):
self.index.to_pydatetime()

def time_is_dates_only(self, index_type):
self.index._is_dates_only


class TzLocalize:

Expand Down Expand Up @@ -91,20 +94,6 @@ def time_reest_datetimeindex(self, tz):
self.df.reset_index()


class Factorize:

params = [None, "Asia/Tokyo"]
param_names = "tz"

def setup(self, tz):
N = 100000
self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz)
self.dti = self.dti.repeat(5)

def time_factorize(self, tz):
self.dti.factorize()


class InferFreq:

params = [None, "D", "B"]
Expand Down
2 changes: 1 addition & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then

MSG='Doctests generic.py' ; echo $MSG
pytest -q --doctest-modules pandas/core/generic.py \
-k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard"
-k"-_set_axis_name -_xs -describe -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard"
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests groupby.py' ; echo $MSG
Expand Down
2 changes: 1 addition & 1 deletion ci/setup_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ echo
echo "update conda"
conda config --set ssl_verify false
conda config --set quiet true --set always_yes true --set changeps1 false
conda install pip # create conda to create a historical artifact for pip & setuptools
conda install pip conda # create conda to create a historical artifact for pip & setuptools
conda update -n base conda

echo "conda info -a"
Expand Down
5 changes: 5 additions & 0 deletions doc/source/ecosystem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ joining paths, replacing file extensions, and checking if files exist are also a
Statistics and machine learning
-------------------------------

`pandas-tfrecords <https://pypi.org/project/pandas-tfrecords/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Easily save pandas DataFrames to the TensorFlow TFRecords format, and read TFRecords back into pandas.

`Statsmodels <https://www.statsmodels.org/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
11 changes: 11 additions & 0 deletions doc/source/getting_started/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
s.value_counts()
pd.value_counts(data)

.. versionadded:: 1.1.0

The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns.
By default, all columns are used, but a subset can be selected using the ``subset`` argument.

.. ipython:: python

data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
frame = pd.DataFrame(data)
frame.value_counts()

Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:

.. ipython:: python
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ Computations / descriptive stats
DataFrame.std
DataFrame.var
DataFrame.nunique
DataFrame.value_counts

Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Other enhancements

- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
-
- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
-

.. ---------------------------------------------------------------------------
Expand All @@ -55,6 +55,7 @@ Other API changes

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
-
Expand Down Expand Up @@ -114,6 +115,7 @@ Datetimelike
- :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`)
- Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
- Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
- Bug in :class:`Timestamp` raising a confusing error message when year, month or day is missing (:issue:`31200`)

Timedelta
^^^^^^^^^
Expand Down
23 changes: 19 additions & 4 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -411,10 +411,25 @@ class Timestamp(_Timestamp):
)

elif ts_input is _no_input:
# User passed keyword arguments.
ts_input = datetime(year, month, day, hour or 0,
minute or 0, second or 0,
microsecond or 0)
# GH 31200
# When year, month or day is not given, we call the datetime
# constructor to make sure we get the same error message
# since Timestamp inherits datetime
datetime_kwargs = {
"hour": hour or 0,
"minute": minute or 0,
"second": second or 0,
"microsecond": microsecond or 0
}
if year is not None:
datetime_kwargs["year"] = year
if month is not None:
datetime_kwargs["month"] = month
if day is not None:
datetime_kwargs["day"] = day

ts_input = datetime(**datetime_kwargs)

elif is_integer_object(freq):
# User passed positional arguments:
# Timestamp(year, month, day[, hour[, minute[, second[,
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,8 +777,10 @@ def searchsorted(self, value, side="left", sorter=None):
if isinstance(value, str):
try:
value = self._scalar_from_string(value)
except ValueError:
raise TypeError("searchsorted requires compatible dtype or scalar")
except ValueError as e:
raise TypeError(
"searchsorted requires compatible dtype or scalar"
) from e

elif is_valid_nat_for_dtype(value, self.dtype):
value = NaT
Expand Down Expand Up @@ -1041,7 +1043,7 @@ def _validate_frequency(cls, index, freq, **kwargs):
raise ValueError(
f"Inferred frequency {inferred} from passed values "
f"does not conform to passed frequency {freq.freqstr}"
)
) from e

# monotonicity/uniqueness properties are called via frequencies.infer_freq,
# see GH#23789
Expand Down
Loading