Skip to content

Commit 3479ed8

Browse files
authored
Merge branch 'master' into master
2 parents d4c7e3d + 7e15923 commit 3479ed8

File tree

281 files changed

+41841
-47360
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

281 files changed

+41841
-47360
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
.vagrant
1919
.noseids
2020
.ipynb_checkpoints
21+
.tags
2122

2223
# Compiled source #
2324
###################

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ env:
2424

2525
git:
2626
# for cloning
27-
depth: 500
27+
depth: 1000
2828

2929
matrix:
3030
fast_finish: true

appveyor.yml

+13-5
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@ environment:
1616
CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd"
1717

1818
matrix:
19-
- PYTHON: "C:\\Python34_64"
20-
PYTHON_VERSION: "3.4"
21-
PYTHON_ARCH: "64"
22-
CONDA_PY: "34"
23-
CONDA_NPY: "19"
19+
20+
# Python 3.4 is disabled for now: byte-compilation fails on Windows (see conda-build issue #1001)
21+
#- PYTHON: "C:\\Python34_64"
22+
# PYTHON_VERSION: "3.4"
23+
# PYTHON_ARCH: "64"
24+
# CONDA_PY: "34"
25+
# CONDA_NPY: "19"
2426

2527
- PYTHON: "C:\\Python27_64"
2628
PYTHON_VERSION: "2.7"
@@ -62,6 +64,12 @@ install:
6264
# install our build environment
6365
- cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false
6466
- cmd: conda update -q conda
67+
68+
# fix conda-build version
69+
# https://github.com/conda/conda-build/issues/1001
70+
# disabling 3.4 as windows complains upon compiling byte
71+
# code
72+
- cmd: conda install conda-build=1.21.7
6573
- cmd: conda config --set ssl_verify false
6674

6775
# add the pandas channel *before* defaults to have defaults take priority

asv_bench/asv.conf.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,11 @@
7777
// On conda install pytables, otherwise tables
7878
{"environment_type": "conda", "tables": ""},
7979
{"environment_type": "conda", "pytables": null},
80-
{"environment_type": "virtualenv", "tables": null},
81-
{"environment_type": "virtualenv", "pytables": ""},
80+
{"environment_type": "(?!conda).*", "tables": null},
81+
{"environment_type": "(?!conda).*", "pytables": ""},
8282
// On conda&win32, install libpython
8383
{"sys_platform": "(?!win32).*", "libpython": ""},
84-
{"sys_platform": "win32", "libpython": null},
84+
{"environment_type": "conda", "sys_platform": "win32", "libpython": null},
8585
{"environment_type": "(?!conda).*", "libpython": ""}
8686
],
8787
"include": [],

asv_bench/benchmarks/algorithms.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
5+
class algorithm(object):
    """Benchmarks for ``factorize`` and ``duplicated`` on numeric indexes.

    ``setup`` builds three indexes: a unique integer index, an integer index
    in which every value is repeated five times, and a float index in which
    every value is repeated five times.
    """

    goal_time = 0.2

    def setup(self):
        """Create the indexes shared by all timing methods."""
        N = 100000

        self.int_unique = pd.Int64Index(np.arange(N * 5))
        # Touch is_unique here so its cached value is already available
        # when the timed methods run.
        self.int_unique.is_unique

        self.int = pd.Int64Index(np.arange(N).repeat(5))
        self.float = pd.Float64Index(np.random.randn(N).repeat(5))

    def time_int_factorize(self):
        """Time ``factorize`` on the duplicated integer index."""
        self.int.factorize()

    def time_float_factorize(self):
        """Time ``factorize`` on the duplicated float index."""
        # Previously this factorized ``self.int``, which silently duplicated
        # ``time_int_factorize`` and never measured the float path.
        self.float.factorize()

    def time_int_unique_duplicated(self):
        """Time ``duplicated`` on the unique integer index."""
        self.int_unique.duplicated()

    def time_int_duplicated(self):
        """Time ``duplicated`` on the duplicated integer index."""
        self.int.duplicated()

    def time_float_duplicated(self):
        """Time ``duplicated`` on the duplicated float index."""
        self.float.duplicated()

asv_bench/benchmarks/frame_methods.py

+16
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .pandas_vb_common import *
2+
import string
23

34

45
class frame_apply_axis_1(object):
@@ -606,6 +607,21 @@ def time_frame_isnull(self):
606607
isnull(self.df)
607608

608609

610+
class frame_isnull_strings(object):
    """Time ``isnull`` on a 1000 x 1000 frame of one-character strings."""

    goal_time = 0.2

    def setup(self):
        """Draw the characters and wrap them in a DataFrame."""
        # Fixed seed so every run draws the same characters.
        np.random.seed(1234)
        alphabet = (string.ascii_lowercase +
                    string.ascii_uppercase +
                    string.whitespace)
        self.sample = np.array(list(alphabet))
        self.data = np.random.choice(self.sample, (1000, 1000))
        self.df = DataFrame(self.data)

    def time_frame_isnull(self):
        """Run ``isnull`` over the whole string frame."""
        isnull(self.df)
623+
624+
609625
class frame_isnull_obj(object):
610626
goal_time = 0.2
611627

asv_bench/benchmarks/index_object.py

+55
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,27 @@ def time_index_datetime_union(self):
6363
self.rng.union(self.rng2)
6464

6565

66+
class index_datetime_set_difference(object):
    """Time set differences between ``DatetimeIndex`` objects.

    ``idx1`` covers ``range(N)``; ``idx2`` covers ``range(A, B)`` and so
    overlaps ``idx1`` on ``[A, N)``; ``idx3`` covers ``range(N, B)`` and so
    shares no integer value with ``idx1``.
    """

    goal_time = 0.2

    def setup(self):
        """Build the three indexes from integer ranges."""
        size = 100000
        lower, upper = size - 20000, size + 20000
        self.N, self.A, self.B = size, lower, upper

        self.idx1 = DatetimeIndex(range(size))
        self.idx2 = DatetimeIndex(range(lower, upper))
        self.idx3 = DatetimeIndex(range(size, upper))

    def time_index_datetime_difference(self):
        """Difference against the partially overlapping index."""
        self.idx1.difference(self.idx2)

    def time_index_datetime_difference_disjoint(self):
        """Difference against the non-overlapping index."""
        self.idx1.difference(self.idx3)

    def time_index_datetime_symmetric_difference(self):
        """Symmetric difference against the partially overlapping index."""
        self.idx1.symmetric_difference(self.idx2)
85+
86+
6687
class index_float64_boolean_indexer(object):
6788
goal_time = 0.2
6889

@@ -183,6 +204,40 @@ def time_index_int64_union(self):
183204
self.left.union(self.right)
184205

185206

207+
class index_int64_set_difference(object):
    """Time set differences between two shuffled integer indexes."""

    goal_time = 0.2

    def setup(self):
        """Build two indexes, each holding a random half of ``options``."""
        self.N = 500000
        self.options = np.arange(self.N)
        half = self.N // 2

        # Draw the left permutation before the right one so the sequence of
        # random-number calls is the same as a straight top-to-bottom build.
        left_positions = np.random.permutation(self.N)[:half]
        self.left = Index(self.options.take(left_positions))

        right_positions = np.random.permutation(self.N)[:half]
        self.right = Index(self.options.take(right_positions))

    def time_index_int64_difference(self):
        """Difference of the two integer indexes."""
        self.left.difference(self.right)

    def time_index_int64_symmetric_difference(self):
        """Symmetric difference of the two integer indexes."""
        self.left.symmetric_difference(self.right)
223+
224+
225+
class index_str_set_difference(object):
    """Time set differences between two overlapping string indexes."""

    goal_time = 0.2

    def setup(self):
        """Slice one array of strings into two overlapping indexes."""
        self.N = 10000
        self.strs = tm.rands_array(10, self.N)

        # ``left`` takes the first two thirds and ``right`` the last two
        # thirds, so the two slices share the middle third of positions.
        upper_cut = self.N * 2 // 3
        lower_cut = self.N // 3
        self.left = Index(self.strs[:upper_cut])
        self.right = Index(self.strs[lower_cut:])

    def time_str_difference(self):
        """Difference of the two string indexes."""
        self.left.difference(self.right)

    def time_str_symmetric_difference(self):
        """Symmetric difference of the two string indexes."""
        self.left.symmetric_difference(self.right)
239+
240+
186241
class index_str_boolean_indexer(object):
187242
goal_time = 0.2
188243

asv_bench/benchmarks/indexing.py

-20
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,6 @@ def time_dataframe_getitem_scalar(self):
1919
self.df[self.col][self.idx]
2020

2121

22-
class datamatrix_getitem_scalar(object):
23-
goal_time = 0.2
24-
25-
def setup(self):
26-
try:
27-
self.klass = DataMatrix
28-
except:
29-
self.klass = DataFrame
30-
self.index = tm.makeStringIndex(1000)
31-
self.columns = tm.makeStringIndex(30)
32-
self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns)
33-
self.idx = self.index[100]
34-
self.col = self.columns[10]
35-
36-
def time_datamatrix_getitem_scalar(self):
37-
self.df[self.col][self.idx]
38-
39-
4022
class series_get_value(object):
4123
goal_time = 0.2
4224

@@ -498,5 +480,3 @@ def setup(self):
498480

499481
def time_float_loc(self):
500482
self.ind.get_loc(0)
501-
502-

asv_bench/benchmarks/inference.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,30 @@ def setup(self):
135135
self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B']))
136136

137137
def time_dtype_infer_uint32(self):
138-
(self.df_uint32['A'] + self.df_uint32['B'])
138+
(self.df_uint32['A'] + self.df_uint32['B'])
139+
140+
141+
class to_numeric(object):
    """Benchmark ``pd.to_numeric`` across input kinds and ``downcast`` options.

    The benchmark is parametrized by ``dtype`` (a key of ``data_dict``) and
    ``downcast`` (passed straight through to ``pd.to_numeric``).
    """

    param_names = ['dtype', 'downcast']
    params = [['string-float', 'string-int', 'string-nint', 'datetime64',
               'int-list', 'int32'],
              [None, 'integer', 'signed', 'unsigned', 'float']]

    N = 500000

    # Floor division keeps the repeat counts integral. On Python 3 ``N / 2``
    # is a float and ``list * float`` raises TypeError while the class body
    # is being evaluated, which would break the whole benchmark module.
    data_dict = {
        'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)),
        'string-nint': (['-1'] * (N // 2)) + ([2] * (N // 2)),
        'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'],
                                         dtype='datetime64[D]'), N),
        'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)),
        'int-list': ([1] * (N // 2)) + ([2] * (N // 2)),
        'int32': np.repeat(np.int32(1), N)
    }

    def setup(self, dtype, downcast):
        """Select the prepared input for the ``dtype`` parameter."""
        self.data = self.data_dict[dtype]

    def time_downcast(self, dtype, downcast):
        """Time ``pd.to_numeric`` with the given ``downcast`` option."""
        pd.to_numeric(self.data, downcast=downcast)

asv_bench/benchmarks/join_merge.py

+37-16
Original file line numberDiff line numberDiff line change
@@ -179,10 +179,6 @@ def setup(self):
179179
self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
180180
except:
181181
pass
182-
try:
183-
self.DataFrame = DataMatrix
184-
except:
185-
pass
186182
self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
187183
self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
188184
self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -210,10 +206,6 @@ def setup(self):
210206
self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
211207
except:
212208
pass
213-
try:
214-
self.DataFrame = DataMatrix
215-
except:
216-
pass
217209
self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
218210
self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
219211
self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -241,10 +233,6 @@ def setup(self):
241233
self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
242234
except:
243235
pass
244-
try:
245-
self.DataFrame = DataMatrix
246-
except:
247-
pass
248236
self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
249237
self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
250238
self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -272,10 +260,6 @@ def setup(self):
272260
self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D'])
273261
except:
274262
pass
275-
try:
276-
self.DataFrame = DataMatrix
277-
except:
278-
pass
279263
self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, })
280264
self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D'])
281265
self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D'])
@@ -309,6 +293,43 @@ def time_join_dataframe_integer_key(self):
309293
merge(self.df, self.df2, on='key1')
310294

311295

296+
class merge_asof_noby(object):
    """Benchmark ``merge_asof`` on a ``time`` key without a ``by`` column."""

    # Row counts of the left and right frames. They are class attributes so
    # a caller can shrink them; the defaults match the original benchmark.
    one_count = 200000
    two_count = 1000000

    def setup(self):
        """Build two frames of random data, each sorted by ``time``."""
        np.random.seed(0)
        one_count = self.one_count
        two_count = self.two_count

        # Floor division gives ``randint`` an integer upper bound; plain
        # ``/`` yields a float on Python 3.
        self.df1 = pd.DataFrame(
            {'time': np.random.randint(0, one_count // 20, one_count),
             'value1': np.random.randn(one_count)})
        self.df2 = pd.DataFrame(
            {'time': np.random.randint(0, two_count // 20, two_count),
             'value2': np.random.randn(two_count)})

        self.df1 = self.df1.sort_values('time')
        self.df2 = self.df2.sort_values('time')

    def time_merge_asof_noby(self):
        """Time ``merge_asof`` of the two frames on ``time``."""
        merge_asof(self.df1, self.df2, on='time')
311+
312+
313+
class merge_asof_by(object):
    """Benchmark ``merge_asof`` on a ``time`` key grouped by a ``key`` column."""

    # Row counts of the left and right frames. They are class attributes so
    # a caller can shrink them; the defaults match the original benchmark.
    one_count = 200000
    two_count = 1000000

    def setup(self):
        """Build two frames of random data, each sorted by ``time``."""
        import string

        np.random.seed(0)
        one_count = self.one_count
        two_count = self.two_count

        # ``string.uppercase`` only exists on Python 2; ``ascii_uppercase``
        # is available on both Python 2 and Python 3.
        letters = list(string.ascii_uppercase)

        # Floor division gives ``randint`` an integer upper bound; plain
        # ``/`` yields a float on Python 3.
        self.df1 = pd.DataFrame(
            {'time': np.random.randint(0, one_count // 20, one_count),
             'key': np.random.choice(letters, one_count),
             'value1': np.random.randn(one_count)})
        self.df2 = pd.DataFrame(
            {'time': np.random.randint(0, two_count // 20, two_count),
             'key': np.random.choice(letters, two_count),
             'value2': np.random.randn(two_count)})

        self.df1 = self.df1.sort_values('time')
        self.df2 = self.df2.sort_values('time')

    def time_merge_asof_by(self):
        """Time ``merge_asof`` of the two frames on ``time``, by ``key``."""
        merge_asof(self.df1, self.df2, on='time', by='key')
331+
332+
312333
class join_non_unique_equal(object):
313334
goal_time = 0.2
314335

asv_bench/benchmarks/parser_vb.py

+21
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,27 @@ def teardown(self):
114114
os.remove('test.csv')
115115

116116

117+
class read_csv_categorical(object):
    """Compare two ways of reading string columns as categoricals.

    ``setup`` writes ``strings.csv`` to the working directory and
    ``teardown`` removes it again.
    """

    goal_time = 0.2

    def setup(self):
        """Write a three-column CSV of strings drawn from five labels."""
        N = 100000
        group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee']

        # Columns are filled in a, b, c order so the random draws happen in
        # the same sequence as a literal dict with those keys.
        columns = {}
        for name in ('a', 'b', 'c'):
            columns[name] = np.random.choice(group1, N).astype('object')

        DataFrame(columns).to_csv('strings.csv', index=False)

    def time_read_csv_categorical_post(self):
        """Read as plain columns, then convert each one to ``Categorical``."""
        read_csv('strings.csv').apply(pd.Categorical)

    def time_read_csv_categorical_direct(self):
        """Ask the parser for ``category`` dtype directly."""
        read_csv('strings.csv', dtype='category')

    def teardown(self):
        """Delete the CSV written by ``setup``."""
        os.remove('strings.csv')
136+
137+
117138
class read_table_multiple_date(object):
118139
goal_time = 0.2
119140

0 commit comments

Comments
 (0)