Skip to content

CLN: Use defaultdict for minor optimization #32209

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 35 additions & 56 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,83 +31,62 @@ def time_maybe_convert_objects(self):

class Factorize:

params = [[True, False], ["int", "uint", "float", "string"]]
param_names = ["sort", "dtype"]

def setup(self, sort, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N).repeat(5)),
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
"string": tm.makeStringIndex(N).repeat(5),
}
self.idx = data[dtype]

def time_factorize(self, sort, dtype):
self.idx.factorize(sort=sort)


class FactorizeUnique:

params = [[True, False], ["int", "uint", "float", "string"]]
param_names = ["sort", "dtype"]
params = [
[True, False],
[True, False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
]
param_names = ["unique", "sort", "dtype"]

def setup(self, sort, dtype):
def setup(self, unique, sort, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.arange(N)),
"float": pd.Float64Index(np.random.randn(N)),
"string": tm.makeStringIndex(N),
}
self.idx = data[dtype]
assert self.idx.is_unique

def time_factorize(self, sort, dtype):
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data

def time_factorize(self, unique, sort, dtype):
self.idx.factorize(sort=sort)


class Duplicated:

params = [["first", "last", False], ["int", "uint", "float", "string"]]
param_names = ["keep", "dtype"]

def setup(self, keep, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N).repeat(5)),
"uint": pd.UInt64Index(np.arange(N).repeat(5)),
"float": pd.Float64Index(np.random.randn(N).repeat(5)),
"string": tm.makeStringIndex(N).repeat(5),
}
self.idx = data[dtype]
# cache is_unique
self.idx.is_unique

def time_duplicated(self, keep, dtype):
self.idx.duplicated(keep=keep)


class DuplicatedUniqueIndex:

params = ["int", "uint", "float", "string"]
param_names = ["dtype"]
params = [
[True, False],
["first", "last", False],
["int", "uint", "float", "string", "datetime64[ns]", "datetime64[ns, tz]"],
]
param_names = ["unique", "keep", "dtype"]

def setup(self, dtype):
def setup(self, unique, keep, dtype):
N = 10 ** 5
data = {
"int": pd.Int64Index(np.arange(N)),
"uint": pd.UInt64Index(np.arange(N)),
"float": pd.Float64Index(np.random.randn(N)),
"string": tm.makeStringIndex(N),
}
self.idx = data[dtype]
"datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
"datetime64[ns, tz]": pd.date_range(
"2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
),
}[dtype]
if not unique:
data = data.repeat(5)
self.idx = data
# cache is_unique
self.idx.is_unique

def time_duplicated_unique(self, dtype):
self.idx.duplicated()
def time_duplicated(self, unique, keep, dtype):
self.idx.duplicated(keep=keep)


class Hashing:
Expand Down
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,6 @@ def setup(self):
def time_get_loc(self):
self.index.get_loc(self.category)

def time_shape(self):
self.index.shape

def time_shallow_copy(self):
self.index._shallow_copy()

Expand Down
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/index_cached_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class IndexCache:

params = [
[
"CategoricalIndex",
"DatetimeIndex",
"Float64Index",
"IntervalIndex",
Expand Down Expand Up @@ -42,6 +43,8 @@ def setup(self, index_type):
self.idx = pd.Float64Index(range(N))
elif index_type == "UInt64Index":
self.idx = pd.UInt64Index(range(N))
elif index_type == "CategoricalIndex":
self.idx = pd.CategoricalIndex(range(N), range(N))
else:
raise ValueError
assert len(self.idx) == N
Expand Down
8 changes: 0 additions & 8 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,6 @@ def time_datetime_difference_disjoint(self):
self.datetime_left.difference(self.datetime_right)


class Datetime:
def setup(self):
self.dr = date_range("20000101", freq="D", periods=10000)

def time_is_dates_only(self):
self.dr._is_dates_only


class Range:
def setup(self):
self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3)
Expand Down
5 changes: 5 additions & 0 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
These benchmarks are for Series and DataFrame indexing methods. For the
lower-level methods directly on Index and subclasses, see index_object.py,
indexing_engine.py, and index_cached.py
"""
import warnings

import numpy as np
Expand Down
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,6 @@ def setup(self):
def time_get_loc(self):
self.index.get_loc(self.period)

def time_shape(self):
self.index.shape

def time_shallow_copy(self):
self.index._shallow_copy()

Expand Down
3 changes: 0 additions & 3 deletions asv_bench/benchmarks/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,6 @@ def setup(self):
def time_get_loc(self):
self.index.get_loc(self.timedelta)

def time_shape(self):
self.index.shape

def time_shallow_copy(self):
self.index._shallow_copy()

Expand Down
17 changes: 3 additions & 14 deletions asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def time_to_date(self, index_type):
def time_to_pydatetime(self, index_type):
self.index.to_pydatetime()

def time_is_dates_only(self, index_type):
self.index._is_dates_only


class TzLocalize:

Expand Down Expand Up @@ -91,20 +94,6 @@ def time_reest_datetimeindex(self, tz):
self.df.reset_index()


class Factorize:

params = [None, "Asia/Tokyo"]
param_names = "tz"

def setup(self, tz):
N = 100000
self.dti = date_range("2011-01-01", freq="H", periods=N, tz=tz)
self.dti = self.dti.repeat(5)

def time_factorize(self, tz):
self.dti.factorize()


class InferFreq:

params = [None, "D", "B"]
Expand Down
2 changes: 1 addition & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then

MSG='Doctests generic.py' ; echo $MSG
pytest -q --doctest-modules pandas/core/generic.py \
-k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard"
-k"-_set_axis_name -_xs -describe -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard"
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests groupby.py' ; echo $MSG
Expand Down
2 changes: 1 addition & 1 deletion ci/setup_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ echo
echo "update conda"
conda config --set ssl_verify false
conda config --set quiet true --set always_yes true --set changeps1 false
conda install pip # create conda to create a historical artifact for pip & setuptools
conda install pip conda # create conda to create a historical artifact for pip & setuptools
conda update -n base conda

echo "conda info -a"
Expand Down
5 changes: 5 additions & 0 deletions doc/source/ecosystem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ joining paths, replacing file extensions, and checking if files exist are also a
Statistics and machine learning
-------------------------------

`pandas-tfrecords <https://pypi.org/project/pandas-tfrecords/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Easily save pandas DataFrames to the TensorFlow TFRecords format, and read TFRecords back into pandas.

`Statsmodels <https://www.statsmodels.org/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
11 changes: 11 additions & 0 deletions doc/source/getting_started/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,17 @@ of a 1D array of values. It can also be used as a function on regular arrays:
s.value_counts()
pd.value_counts(data)

.. versionadded:: 1.1.0

The :meth:`~DataFrame.value_counts` method can be used to count combinations across multiple columns.
By default, all columns are used, but a subset can be selected using the ``subset`` argument.

.. ipython:: python

data = {"a": [1, 2, 3, 4], "b": ["x", "x", "y", "y"]}
frame = pd.DataFrame(data)
frame.value_counts()

Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame:

.. ipython:: python
Expand Down
1 change: 1 addition & 0 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ Computations / descriptive stats
DataFrame.std
DataFrame.var
DataFrame.nunique
DataFrame.value_counts

Reindexing / selection / label manipulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Other enhancements

- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
-
- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
-

.. ---------------------------------------------------------------------------
Expand All @@ -55,6 +55,7 @@ Other API changes

- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
-
Expand Down Expand Up @@ -114,6 +115,7 @@ Datetimelike
- :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`)
- Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`)
- Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`)
- Bug in :class:`Timestamp` raising a confusing error message when year, month or day is missing (:issue:`31200`)

Timedelta
^^^^^^^^^
Expand Down
23 changes: 19 additions & 4 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -411,10 +411,25 @@ class Timestamp(_Timestamp):
)

elif ts_input is _no_input:
# User passed keyword arguments.
ts_input = datetime(year, month, day, hour or 0,
minute or 0, second or 0,
microsecond or 0)
# GH 31200
# When year, month or day is not given, we call the datetime
# constructor to make sure we get the same error message
# since Timestamp inherits datetime
datetime_kwargs = {
"hour": hour or 0,
"minute": minute or 0,
"second": second or 0,
"microsecond": microsecond or 0
}
if year is not None:
datetime_kwargs["year"] = year
if month is not None:
datetime_kwargs["month"] = month
if day is not None:
datetime_kwargs["day"] = day

ts_input = datetime(**datetime_kwargs)

elif is_integer_object(freq):
# User passed positional arguments:
# Timestamp(year, month, day[, hour[, minute[, second[,
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,8 +777,10 @@ def searchsorted(self, value, side="left", sorter=None):
if isinstance(value, str):
try:
value = self._scalar_from_string(value)
except ValueError:
raise TypeError("searchsorted requires compatible dtype or scalar")
except ValueError as e:
raise TypeError(
"searchsorted requires compatible dtype or scalar"
) from e

elif is_valid_nat_for_dtype(value, self.dtype):
value = NaT
Expand Down Expand Up @@ -1041,7 +1043,7 @@ def _validate_frequency(cls, index, freq, **kwargs):
raise ValueError(
f"Inferred frequency {inferred} from passed values "
f"does not conform to passed frequency {freq.freqstr}"
)
) from e

# monotonicity/uniqueness properties are called via frequencies.infer_freq,
# see GH#23789
Expand Down
Loading