pandas-dev
diff --git a/‎.github/workflows/ci.yml
+2-2 b/‎.github/workflows/ci.yml
+2-2
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎.pre-commit-config.yaml
+1-1 b/‎.pre-commit-config.yaml
+1-1
diff --git a/‎.travis.yml
+1-6 b/‎.travis.yml
+1-6
diff --git a/‎Dockerfile
+1-1 b/‎Dockerfile
+1-1
diff --git a/‎Makefile
+1-1 b/‎Makefile
+1-1
diff --git a/‎README.md
+1-1 b/‎README.md
+1-1
diff --git a/‎asv_bench/benchmarks/algorithms.py
+12 b/‎asv_bench/benchmarks/algorithms.py
+12
diff --git a/‎asv_bench/benchmarks/categoricals.py
+43 b/‎asv_bench/benchmarks/categoricals.py
+43
diff --git a/‎asv_bench/benchmarks/groupby.py
+1-1 b/‎asv_bench/benchmarks/groupby.py
+1-1
diff --git a/‎asv_bench/benchmarks/hash_functions.py
+164 b/‎asv_bench/benchmarks/hash_functions.py
+164
diff --git a/‎asv_bench/benchmarks/join_merge.py
+6 b/‎asv_bench/benchmarks/join_merge.py
+6
diff --git a/‎asv_bench/benchmarks/reshape.py
+4-1 b/‎asv_bench/benchmarks/reshape.py
+4-1
diff --git a/‎asv_bench/benchmarks/rolling.py
+13 b/‎asv_bench/benchmarks/rolling.py
+13
@@ -18,7 +18,7 @@ jobs:
     steps:
 
     - name: Setting conda path
-      run: echo "::add-path::${HOME}/miniconda3/bin"
+      run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
 
     - name: Checkout
       uses: actions/checkout@v1
@@ -98,7 +98,7 @@ jobs:
     steps:
 
     - name: Setting conda path
-      run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}"
+      run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH
 
     - name: Checkout
       uses: actions/checkout@v1
 
@@ -12,6 +12,7 @@
 *.log
 *.swp
 *.pdb
+*.zip
 .project
 .pydevproject
 .settings
 
@@ -26,7 +26,7 @@ repos:
         name: isort (cython)
         types: [cython]
 -   repo: https://github.com/asottile/pyupgrade
-    rev: v2.7.3
+    rev: v2.7.4
     hooks:
     -   id: pyupgrade
         args: [--py37-plus]
 
@@ -35,11 +35,6 @@ matrix:
   fast_finish: true
 
   include:
-    - dist: bionic
-      python: 3.9-dev
-      env:
-        - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)"
-
     - env:
       - JOB="3.8, slow" ENV_FILE="ci/deps/travis-38-slow.yaml" PATTERN="slow" SQL="1"
       services:
@@ -94,7 +89,7 @@ install:
 script:
   - echo "script start"
   - echo "$JOB"
-  - if [ "$JOB" != "3.9-dev" ]; then source activate pandas-dev; fi
+  - source activate pandas-dev
   - ci/run_tests.sh
 
 after_script:
 
@@ -43,5 +43,5 @@ RUN conda env update -n base -f "$pandas_home/environment.yml"
 
 # Build C extensions and pandas
 RUN cd "$pandas_home" \
-    && python setup.py build_ext --inplace -j 4 \
+    && python setup.py build_ext -j 4 \
     && python -m pip install -e .
@@ -9,7 +9,7 @@ clean_pyc:
 	-find . -name '*.py[co]' -exec rm {} \;
 
 build: clean_pyc
-	python setup.py build_ext --inplace
+	python setup.py build_ext
 
 lint-diff:
 	git diff upstream/master --name-only -- "*.py" | xargs flake8
 
@@ -60,7 +60,7 @@ Here are just a few of the things that pandas does well:
     and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]
   - [**Time series**][timeseries]-specific functionality: date range
     generation and frequency conversion, moving window statistics,
-    date shifting and lagging.
+    date shifting and lagging
 
 
    [missing-data]: https://pandas.pydata.org/pandas-docs/stable/missing_data.html#working-with-missing-data
 
@@ -5,6 +5,7 @@
 from pandas._libs import lib
 
 import pandas as pd
+from pandas.core.algorithms import make_duplicates_of_left_unique_in_right
 
 from .pandas_vb_common import tm
 
@@ -174,4 +175,15 @@ def time_argsort(self, N):
         self.array.argsort()
 
 
+class RemoveDuplicates:
+    def setup(self):
+        N = 10 ** 5
+        na = np.arange(int(N / 2))
+        self.left = np.concatenate([na[: int(N / 4)], na[: int(N / 4)]])
+        self.right = np.concatenate([na, na])
+
+    def time_make_duplicates_of_left_unique_in_right(self):
+        make_duplicates_of_left_unique_in_right(self.left, self.right)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip
@@ -1,3 +1,5 @@
+import string
+import sys
 import warnings
 
 import numpy as np
@@ -67,6 +69,47 @@ def time_existing_series(self):
         pd.Categorical(self.series)
 
 
+class AsType:
+    def setup(self):
+        N = 10 ** 5
+
+        random_pick = np.random.default_rng().choice
+
+        categories = {
+            "str": list(string.ascii_letters),
+            "int": np.random.randint(2 ** 16, size=154),
+            "float": sys.maxsize * np.random.random((38,)),
+            "timestamp": [
+                pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578)
+            ],
+        }
+
+        self.df = pd.DataFrame(
+            {col: random_pick(cats, N) for col, cats in categories.items()}
+        )
+
+        for col in ("int", "float", "timestamp"):
+            self.df[col + "_as_str"] = self.df[col].astype(str)
+
+        for col in self.df.columns:
+            self.df[col] = self.df[col].astype("category")
+
+    def astype_str(self):
+        [self.df[col].astype("str") for col in "int float timestamp".split()]
+
+    def astype_int(self):
+        [self.df[col].astype("int") for col in "int_as_str timestamp".split()]
+
+    def astype_float(self):
+        [
+            self.df[col].astype("float")
+            for col in "float_as_str int int_as_str timestamp".split()
+        ]
+
+    def astype_datetime(self):
+        self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific"))
+
+
 class Concat:
     def setup(self):
         N = 10 ** 5
 
@@ -486,7 +486,7 @@ def setup(self):
         tmp2 = (np.random.random(10000) * 10.0).astype(np.float32)
         tmp = np.concatenate((tmp1, tmp2))
         arr = np.repeat(tmp, 10)
-        self.df = DataFrame(dict(a=arr, b=arr))
+        self.df = DataFrame({"a": arr, "b": arr})
 
     def time_sum(self):
         self.df.groupby(["a"])["b"].sum()
 
@@ -0,0 +1,164 @@
+import numpy as np
+
+import pandas as pd
+
+
+class IsinAlmostFullWithRandomInt:
+    params = [
+        [np.float64, np.int64, np.uint64, np.object],
+        range(10, 21),
+    ]
+    param_names = ["dtype", "exponent"]
+
+    def setup(self, dtype, exponent):
+        M = 3 * 2 ** (exponent - 2)
+        # 0.77-the maximal share of occupied buckets
+        np.random.seed(42)
+        self.s = pd.Series(np.random.randint(0, M, M)).astype(dtype)
+        self.values = np.random.randint(0, M, M).astype(dtype)
+        self.values_outside = self.values + M
+
+    def time_isin(self, dtype, exponent):
+        self.s.isin(self.values)
+
+    def time_isin_outside(self, dtype, exponent):
+        self.s.isin(self.values_outside)
+
+
+class IsinWithRandomFloat:
+    params = [
+        [np.float64, np.object],
+        [
+            1_300,
+            2_000,
+            7_000,
+            8_000,
+            70_000,
+            80_000,
+            750_000,
+            900_000,
+        ],
+    ]
+    param_names = ["dtype", "M"]
+
+    def setup(self, dtype, M):
+        np.random.seed(42)
+        self.values = np.random.rand(M)
+        self.s = pd.Series(self.values).astype(dtype)
+        np.random.shuffle(self.values)
+        self.values_outside = self.values + 0.1
+
+    def time_isin(self, dtype, M):
+        self.s.isin(self.values)
+
+    def time_isin_outside(self, dtype, M):
+        self.s.isin(self.values_outside)
+
+
+class IsinWithArangeSorted:
+    params = [
+        [np.float64, np.int64, np.uint64, np.object],
+        [
+            1_000,
+            2_000,
+            8_000,
+            100_000,
+            1_000_000,
+        ],
+    ]
+    param_names = ["dtype", "M"]
+
+    def setup(self, dtype, M):
+        self.s = pd.Series(np.arange(M)).astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M):
+        self.s.isin(self.values)
+
+
+class IsinWithArange:
+    params = [
+        [np.float64, np.int64, np.uint64, np.object],
+        [
+            1_000,
+            2_000,
+            8_000,
+        ],
+        [-2, 0, 2],
+    ]
+    param_names = ["dtype", "M", "offset_factor"]
+
+    def setup(self, dtype, M, offset_factor):
+        offset = int(M * offset_factor)
+        np.random.seed(42)
+        tmp = pd.Series(np.random.randint(offset, M + offset, 10 ** 6))
+        self.s = tmp.astype(dtype)
+        self.values = np.arange(M).astype(dtype)
+
+    def time_isin(self, dtype, M, offset_factor):
+        self.s.isin(self.values)
+
+
+class Float64GroupIndex:
+    # GH28303
+    def setup(self):
+        self.df = pd.date_range(
+            start="1/1/2018", end="1/2/2018", periods=1e6
+        ).to_frame()
+        self.group_index = np.round(self.df.index.astype(int) / 1e9)
+
+    def time_groupby(self):
+        self.df.groupby(self.group_index).last()
+
+
+class UniqueAndFactorizeArange:
+    params = range(4, 16)
+    param_names = ["exponent"]
+
+    def setup(self, exponent):
+        a = np.arange(10 ** 4, dtype="float64")
+        self.a2 = (a + 10 ** exponent).repeat(100)
+
+    def time_factorize(self, exponent):
+        pd.factorize(self.a2)
+
+    def time_unique(self, exponent):
+        pd.unique(self.a2)
+
+
+class NumericSeriesIndexing:
+
+    params = [
+        (pd.Int64Index, pd.UInt64Index, pd.Float64Index),
+        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
+    ]
+    param_names = ["index_dtype", "N"]
+
+    def setup(self, index, N):
+        vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
+        indices = index(vals)
+        self.data = pd.Series(np.arange(N), index=indices)
+
+    def time_loc_slice(self, index, N):
+        # trigger building of mapping
+        self.data.loc[:800]
+
+
+class NumericSeriesIndexingShuffled:
+
+    params = [
+        (pd.Int64Index, pd.UInt64Index, pd.Float64Index),
+        (10 ** 4, 10 ** 5, 5 * 10 ** 5, 10 ** 6, 5 * 10 ** 6),
+    ]
+    param_names = ["index_dtype", "N"]
+
+    def setup(self, index, N):
+        vals = np.array(list(range(55)) + [54] + list(range(55, N - 1)))
+        np.random.seed(42)
+        np.random.shuffle(vals)
+        indices = index(vals)
+        self.data = pd.Series(np.arange(N), index=indices)
+
+    def time_loc_slice(self, index, N):
+        # trigger building of mapping
+        self.data.loc[:800]
@@ -132,6 +132,9 @@ def time_join_dataframe_index_single_key_small(self, sort):
     def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort):
         self.df_shuf.join(self.df_key2, on="key2", sort=sort)
 
+    def time_join_dataframes_cross(self, sort):
+        self.df.loc[:2000].join(self.df_key1, how="cross", sort=sort)
+
 
 class JoinIndex:
     def setup(self):
@@ -205,6 +208,9 @@ def time_merge_dataframe_integer_2key(self, sort):
     def time_merge_dataframe_integer_key(self, sort):
         merge(self.df, self.df2, on="key1", sort=sort)
 
+    def time_merge_dataframes_cross(self, sort):
+        merge(self.left.loc[:2000], self.right.loc[:2000], how="cross", sort=sort)
+
 
 class I8Merge:
 
 
@@ -103,7 +103,10 @@ def setup(self):
         nidvars = 20
         N = 5000
         self.letters = list("ABCD")
-        yrvars = [l + str(num) for l, num in product(self.letters, range(1, nyrs + 1))]
+        yrvars = [
+            letter + str(num)
+            for letter, num in product(self.letters, range(1, nyrs + 1))
+        ]
         columns = [str(i) for i in range(nidvars)] + yrvars
         self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), columns=columns)
         self.df["id"] = self.df.index
 
@@ -225,4 +225,17 @@ def time_rolling_offset(self, method):
         getattr(self.groupby_roll_offset, method)()
 
 
+class GroupbyEWM:
+
+    params = ["cython", "numba"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        df = pd.DataFrame({"A": range(50), "B": range(50)})
+        self.gb_ewm = df.groupby("A").ewm(com=1.0)
+
+    def time_groupby_mean(self, engine):
+        self.gb_ewm.mean(engine=engine)
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip