Commit 2c3002f

ENH: gb.is_monotonic_increasing pandas-dev#17015 merged to current upstream/master
Merge commit with 2 parents: ee9fa7e + 2a0e54b

File tree

284 files changed (+13922 / -9880 lines)


.gitignore (-1)

@@ -106,4 +106,3 @@ doc/build/html/index.html
 doc/tmp.sv
 doc/source/styled.xlsx
 doc/source/templates/
-doc/source/savefig/
asv_bench/benchmarks/algorithms.py (+86 -82)

@@ -1,7 +1,6 @@
 from importlib import import_module
 
 import numpy as np
-
 import pandas as pd
 from pandas.util import testing as tm
 
@@ -12,113 +11,118 @@
     except:
         pass
 
-class Algorithms(object):
+
+class Factorize(object):
+
     goal_time = 0.2
 
-    def setup(self):
-        N = 100000
-        np.random.seed(1234)
+    params = [True, False]
+    param_names = ['sort']
 
-        self.int_unique = pd.Int64Index(np.arange(N * 5))
-        # cache is_unique
-        self.int_unique.is_unique
+    def setup(self, sort):
+        N = 10**5
+        np.random.seed(1234)
+        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
+        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
+        self.string_idx = tm.makeStringIndex(N)
 
-        self.int = pd.Int64Index(np.arange(N).repeat(5))
-        self.float = pd.Float64Index(np.random.randn(N).repeat(5))
+    def time_factorize_int(self, sort):
+        self.int_idx.factorize(sort=sort)
 
-        # Convenience naming.
-        self.checked_add = pd.core.algorithms.checked_add_with_arr
+    def time_factorize_float(self, sort):
+        self.float_idx.factorize(sort=sort)
 
-        self.arr = np.arange(1000000)
-        self.arrpos = np.arange(1000000)
-        self.arrneg = np.arange(-1000000, 0)
-        self.arrmixed = np.array([1, -1]).repeat(500000)
-        self.strings = tm.makeStringIndex(100000)
+    def time_factorize_string(self, sort):
+        self.string_idx.factorize(sort=sort)
 
-        self.arr_nan = np.random.choice([True, False], size=1000000)
-        self.arrmixed_nan = np.random.choice([True, False], size=1000000)
 
-        # match
-        self.uniques = tm.makeStringIndex(1000).values
-        self.all = self.uniques.repeat(10)
+class Duplicated(object):
 
-    def time_factorize_string(self):
-        self.strings.factorize()
+    goal_time = 0.2
 
-    def time_factorize_int(self):
-        self.int.factorize()
+    params = ['first', 'last', False]
+    param_names = ['keep']
 
-    def time_factorize_float(self):
-        self.int.factorize()
+    def setup(self, keep):
+        N = 10**5
+        np.random.seed(1234)
+        self.int_idx = pd.Int64Index(np.arange(N).repeat(5))
+        self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5))
+        self.string_idx = tm.makeStringIndex(N)
 
-    def time_duplicated_int_unique(self):
-        self.int_unique.duplicated()
+    def time_duplicated_int(self, keep):
+        self.int_idx.duplicated(keep=keep)
 
-    def time_duplicated_int(self):
-        self.int.duplicated()
+    def time_duplicated_float(self, keep):
+        self.float_idx.duplicated(keep=keep)
 
-    def time_duplicated_float(self):
-        self.float.duplicated()
+    def time_duplicated_string(self, keep):
+        self.string_idx.duplicated(keep=keep)
 
-    def time_match_strings(self):
-        pd.match(self.all, self.uniques)
 
-    def time_add_overflow_pos_scalar(self):
-        self.checked_add(self.arr, 1)
+class DuplicatedUniqueIndex(object):
 
-    def time_add_overflow_neg_scalar(self):
-        self.checked_add(self.arr, -1)
+    goal_time = 0.2
 
-    def time_add_overflow_zero_scalar(self):
-        self.checked_add(self.arr, 0)
+    def setup(self):
+        N = 10**5
+        self.idx_int_dup = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.idx_int_dup.is_unique
 
-    def time_add_overflow_pos_arr(self):
-        self.checked_add(self.arr, self.arrpos)
+    def time_duplicated_unique_int(self):
+        self.idx_int_dup.duplicated()
 
-    def time_add_overflow_neg_arr(self):
-        self.checked_add(self.arr, self.arrneg)
 
-    def time_add_overflow_mixed_arr(self):
-        self.checked_add(self.arr, self.arrmixed)
+class Match(object):
 
-    def time_add_overflow_first_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
+    goal_time = 0.2
 
-    def time_add_overflow_second_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
+    def setup(self):
+        np.random.seed(1234)
+        self.uniques = tm.makeStringIndex(1000).values
+        self.all = self.uniques.repeat(10)
 
-    def time_add_overflow_both_arg_nan(self):
-        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
-                         b_mask=self.arrmixed_nan)
+    def time_match_string(self):
+        pd.match(self.all, self.uniques)
 
 
 class Hashing(object):
+
     goal_time = 0.2
 
-    def setup(self):
-        N = 100000
-
-        self.df = pd.DataFrame(
-            {'A': pd.Series(tm.makeStringIndex(100).take(
-                np.random.randint(0, 100, size=N))),
-             'B': pd.Series(tm.makeStringIndex(10000).take(
-                 np.random.randint(0, 10000, size=N))),
-             'D': np.random.randn(N),
-             'E': np.arange(N),
-             'F': pd.date_range('20110101', freq='s', periods=N),
-             'G': pd.timedelta_range('1 day', freq='s', periods=N),
-             })
-        self.df['C'] = self.df['B'].astype('category')
-        self.df.iloc[10:20] = np.nan
-
-    def time_frame(self):
-        hashing.hash_pandas_object(self.df)
-
-    def time_series_int(self):
-        hashing.hash_pandas_object(self.df.E)
-
-    def time_series_string(self):
-        hashing.hash_pandas_object(self.df.B)
-
-    def time_series_categorical(self):
-        hashing.hash_pandas_object(self.df.C)
+    def setup_cache(self):
+        np.random.seed(1234)
+        N = 10**5
+
+        df = pd.DataFrame(
+            {'strings': pd.Series(tm.makeStringIndex(10000).take(
+                np.random.randint(0, 10000, size=N))),
+             'floats': np.random.randn(N),
+             'ints': np.arange(N),
+             'dates': pd.date_range('20110101', freq='s', periods=N),
+             'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)})
+        df['categories'] = df['strings'].astype('category')
+        df.iloc[10:20] = np.nan
+        return df
+
+    def time_frame(self, df):
+        hashing.hash_pandas_object(df)
+
+    def time_series_int(self, df):
+        hashing.hash_pandas_object(df['ints'])
+
+    def time_series_string(self, df):
+        hashing.hash_pandas_object(df['strings'])
+
+    def time_series_float(self, df):
+        hashing.hash_pandas_object(df['floats'])
+
+    def time_series_categorical(self, df):
+        hashing.hash_pandas_object(df['categories'])
+
+    def time_series_timedeltas(self, df):
+        hashing.hash_pandas_object(df['timedeltas'])
+
+    def time_series_dates(self, df):
+        hashing.hash_pandas_object(df['dates'])
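
The rewrite above follows the standard asv (airspeed velocity) benchmark layout: params and param_names declare a parameter grid that asv passes as extra arguments to setup and to every time_* method, while setup_cache builds an expensive fixture once per benchmark class and its return value is injected into each timing method instead of being rebuilt for every repeat. A minimal sketch of both conventions, using hypothetical Unique and HashIntsSketch classes that are not part of this commit:

# Sketch only: illustrates the asv conventions adopted above; these two
# classes are hypothetical and not taken from the commit.
import numpy as np
import pandas as pd


class Unique(object):

    goal_time = 0.2

    # asv runs every time_* method once per parameter value and passes the
    # current value as an extra argument to setup and the timing methods.
    params = [10**4, 10**6]
    param_names = ['N']

    def setup(self, N):
        np.random.seed(1234)  # seed so every run benchmarks identical data
        self.idx = pd.Index(np.random.randint(0, N, size=N))

    def time_unique(self, N):
        self.idx.unique()


class HashIntsSketch(object):

    goal_time = 0.2

    def setup_cache(self):
        # Built once per class rather than once per timing repeat; the
        # returned object is passed as an argument to every time_* method.
        np.random.seed(1234)
        return pd.DataFrame({'ints': np.arange(10**5)})

    def time_hash_ints(self, df):
        pd.util.hash_pandas_object(df['ints'])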

asv_bench/benchmarks/attrs_caching.py (+5 -1)

@@ -1,4 +1,5 @@
-from .pandas_vb_common import *
+import numpy as np
+from pandas import DataFrame
 
 try:
     from pandas.util import cache_readonly
@@ -7,9 +8,11 @@
 
 
 class DataFrameAttributes(object):
+
     goal_time = 0.2
 
     def setup(self):
+        np.random.seed(1234)
         self.df = DataFrame(np.random.randn(10, 6))
         self.cur_index = self.df.index
 
@@ -21,6 +24,7 @@ def time_set_index(self):
 
 
 class CacheReadonly(object):
+
    goal_time = 0.2
 
     def setup(self):
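
The same cleanup pattern recurs here: the pandas_vb_common star import is replaced with explicit imports, and setup seeds NumPy's random generator so each run benchmarks identical data. For reference, a self-contained sketch of the DataFrameAttributes class after this change; the bodies of the time_* methods are assumed for illustration, since only the lines visible in the hunks come from the commit:

# Sketch, assuming the setup shown in the diff above; timing-method bodies
# are illustrative and may differ from the file's actual contents.
import numpy as np
from pandas import DataFrame


class DataFrameAttributes(object):

    goal_time = 0.2

    def setup(self):
        np.random.seed(1234)  # deterministic data across benchmark runs
        self.df = DataFrame(np.random.randn(10, 6))
        self.cur_index = self.df.index

    def time_get_index(self):
        # Hypothetical body: read the index attribute repeatedly.
        self.foo = self.df.index

    def time_set_index(self):
        # The hunk header shows a time_set_index method exists near line 21.
        self.df.index = self.cur_index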

0 commit comments
