Commit 4708db0

Author: harisbal
Merge commit: 2 parents f00a2d1 + 0d86742

155 files changed: +8465 -4641 lines

.github/PULL_REQUEST_TEMPLATE.md

+24

@@ -1,3 +1,27 @@
+Checklist for the pandas documentation sprint (ignore this if you are doing
+an unrelated PR):
+
+- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
+- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
+- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
+- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
+- [ ] It has been proofread on language by another sprint participant
+
+Please include the output of the validation script below between the "```" ticks:
+
+```
+# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
+# between the "```" (remove this comment, but keep the "```")
+
+```
+
+If the validation script still gives errors, but you think there is a good reason
+to deviate in this case (and there are certainly such cases), please state this
+explicitly.
+
+
+Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):
+
 - [ ] closes #xxxx
 - [ ] tests added / passed
 - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
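
For reference only (not part of this commit): a minimal sketch of the numpydoc-style docstring layout that `scripts/validate_docstrings.py` checks for during the sprint. The function below is a made-up example, not pandas API.

```python
# Hypothetical example: the short summary, Parameters, Returns and Examples
# sections are the structure the validation script looks for.
def add_one(x):
    """
    Return ``x`` incremented by one.

    Parameters
    ----------
    x : int or float
        Value to increment.

    Returns
    -------
    int or float
        The value ``x + 1``.

    Examples
    --------
    >>> add_one(1)
    2
    """
    return x + 1
```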

.gitignore

+3 -2

@@ -88,8 +88,9 @@ scikits
 *.c
 *.cpp
 
-# Performance Testing #
-#######################
+# Unit / Performance Testing #
+##############################
+.pytest_cache/
 asv_bench/env/
 asv_bench/html/
 asv_bench/results/

README.md

+4 -1

@@ -216,13 +216,16 @@ Further, general questions and discussions can also take place on the [pydata ma
 ## Discussion and Development
 Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions.
 
-## Contributing to pandas
+## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas)
+
 All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome.
 
 A detailed overview on how to contribute can be found in the **[contributing guide.](https://pandas.pydata.org/pandas-docs/stable/contributing.html)**
 
 If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out.
 
+You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).
+
 Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
 
 Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas).

asv_bench/benchmarks/groupby.py

+69 -78

@@ -11,6 +11,16 @@
 from .pandas_vb_common import setup  # noqa
 
 
+method_blacklist = {
+    'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
+               'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
+               'var', 'mad', 'describe', 'std'},
+    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
+                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
+                 'std'}
+}
+
+
 class ApplyDictReturn(object):
     goal_time = 0.2
 
@@ -83,45 +93,6 @@ def time_series_groups(self, data, key):
         self.ser.groupby(self.ser).groups
 
 
-class FirstLast(object):
-
-    goal_time = 0.2
-
-    param_names = ['dtype']
-    params = ['float32', 'float64', 'datetime', 'object']
-
-    def setup(self, dtype):
-        N = 10**5
-        # with datetimes (GH7555)
-        if dtype == 'datetime':
-            self.df = DataFrame({'values': date_range('1/1/2011',
-                                                      periods=N,
-                                                      freq='s'),
-                                 'key': range(N)})
-        elif dtype == 'object':
-            self.df = DataFrame({'values': ['foo'] * N,
-                                 'key': range(N)})
-        else:
-            labels = np.arange(N / 10).repeat(10)
-            data = Series(np.random.randn(len(labels)), dtype=dtype)
-            data[::3] = np.nan
-            data[1::3] = np.nan
-            labels = labels.take(np.random.permutation(len(labels)))
-            self.df = DataFrame({'values': data, 'key': labels})
-
-    def time_groupby_first(self, dtype):
-        self.df.groupby('key').first()
-
-    def time_groupby_last(self, dtype):
-        self.df.groupby('key').last()
-
-    def time_groupby_nth_all(self, dtype):
-        self.df.groupby('key').nth(0, dropna='all')
-
-    def time_groupby_nth_none(self, dtype):
-        self.df.groupby('key').nth(0)
-
-
 class GroupManyLabels(object):
 
     goal_time = 0.2
@@ -142,38 +113,40 @@ class Nth(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000, 2)))
-        df.iloc[1, 1] = np.nan
-        return df
-
-    def time_frame_nth_any(self, df):
-        df.groupby(0).nth(0, dropna='any')
-
-    def time_frame_nth(self, df):
-        df.groupby(0).nth(0)
+    param_names = ['dtype']
+    params = ['float32', 'float64', 'datetime', 'object']
 
-    def time_series_nth_any(self, df):
-        df[1].groupby(df[0]).nth(0, dropna='any')
+    def setup(self, dtype):
+        N = 10**5
+        # with datetimes (GH7555)
+        if dtype == 'datetime':
+            values = date_range('1/1/2011', periods=N, freq='s')
+        elif dtype == 'object':
+            values = ['foo'] * N
+        else:
+            values = np.arange(N).astype(dtype)
 
-    def time_series_nth(self, df):
-        df[1].groupby(df[0]).nth(0)
+        key = np.arange(N)
+        self.df = DataFrame({'key': key, 'values': values})
+        self.df.iloc[1, 1] = np.nan  # insert missing data
 
+    def time_frame_nth_any(self, dtype):
+        self.df.groupby('key').nth(0, dropna='any')
 
-class NthObject(object):
+    def time_groupby_nth_all(self, dtype):
+        self.df.groupby('key').nth(0, dropna='all')
 
-    goal_time = 0.2
+    def time_frame_nth(self, dtype):
+        self.df.groupby('key').nth(0)
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
-        df['obj'] = ['a'] * 5000 + ['b'] * 5000
-        return df
+    def time_series_nth_any(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
 
-    def time_nth(self, df):
-        df.groupby('g').nth(5)
+    def time_groupby_nth_all(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
 
-    def time_nth_last(self, df):
-        df.groupby('g').last()
+    def time_series_nth(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0)
 
 
 class DateAttributes(object):
@@ -235,7 +208,7 @@ def time_multi_count(self, df):
         df.groupby(['key1', 'key2']).count()
 
 
-class CountInt(object):
+class CountMultiInt(object):
 
     goal_time = 0.2
 
@@ -247,18 +220,18 @@ def setup_cache(self):
                         'ints2': np.random.randint(0, 1000, size=n)})
         return df
 
-    def time_int_count(self, df):
+    def time_multi_int_count(self, df):
         df.groupby(['key1', 'key2']).count()
 
-    def time_int_nunique(self, df):
+    def time_multi_int_nunique(self, df):
         df.groupby(['key1', 'key2']).nunique()
 
 
 class AggFunctions(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
+    def setup_cache():
         N = 10**5
         fac1 = np.array(['A', 'B', 'C'], dtype='O')
         fac2 = np.array(['one', 'two'], dtype='O')
@@ -353,9 +326,6 @@ def setup(self):
     def time_multi_size(self):
         self.df.groupby(['key1', 'key2']).size()
 
-    def time_dt_size(self):
-        self.df.groupby(['dates']).size()
-
     def time_dt_timegrouper_size(self):
         with warnings.catch_warnings(record=True):
             self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -368,30 +338,51 @@ class GroupByMethods(object):
 
     goal_time = 0.2
 
-    param_names = ['dtype', 'method']
-    params = [['int', 'float'],
+    param_names = ['dtype', 'method', 'application']
+    params = [['int', 'float', 'object', 'datetime'],
              ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
               'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
               'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
               'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
-              'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
+              'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
+             ['direct', 'transformation']]
 
-    def setup(self, dtype, method):
+    def setup(self, dtype, method, application):
+        if method in method_blacklist.get(dtype, {}):
+            raise NotImplementedError  # skip benchmark
         ngroups = 1000
         size = ngroups * 2
         rng = np.arange(ngroups)
         values = rng.take(np.random.randint(0, ngroups, size=size))
         if dtype == 'int':
             key = np.random.randint(0, size, size=size)
-        else:
+        elif dtype == 'float':
             key = np.concatenate([np.random.random(ngroups) * 0.1,
                                   np.random.random(ngroups) * 10.0])
+        elif dtype == 'object':
+            key = ['foo'] * size
+        elif dtype == 'datetime':
+            key = date_range('1/1/2011', periods=size, freq='s')
 
         df = DataFrame({'values': values, 'key': key})
-        self.df_groupby_method = getattr(df.groupby('key')['values'], method)
 
-    def time_method(self, dtype, method):
-        self.df_groupby_method()
+        if application == 'transform':
+            if method == 'describe':
+                raise NotImplementedError
+
+            self.as_group_method = lambda: df.groupby(
+                'key')['values'].transform(method)
+            self.as_field_method = lambda: df.groupby(
+                'values')['key'].transform(method)
+        else:
+            self.as_group_method = getattr(df.groupby('key')['values'], method)
+            self.as_field_method = getattr(df.groupby('values')['key'], method)
+
+    def time_dtype_as_group(self, dtype, method, application):
+        self.as_group_method()
+
+    def time_dtype_as_field(self, dtype, method, application):
+        self.as_field_method()
 
 
 class Float32(object):
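
For context on the pattern introduced above: asv skips a benchmark for a given parameter combination when `setup` raises `NotImplementedError`, which is how the new `method_blacklist` keeps unsupported dtype/method pairs out of the timings. A minimal standalone sketch of that convention, with a made-up blacklist, class and parameters (not part of this commit):

```python
# Minimal illustration of the asv "raise NotImplementedError in setup to
# skip a parameter combination" convention used by GroupByMethods above.
# The blacklist, class name and parameters here are invented for the example.
import numpy as np
from pandas import DataFrame

_unsupported = {'object': {'cumsum'}}  # hypothetical dtype -> skipped methods


class ExampleGroupByBenchmark(object):

    goal_time = 0.2

    param_names = ['dtype', 'method']
    params = [['int', 'object'], ['sum', 'cumsum']]

    def setup(self, dtype, method):
        # asv treats NotImplementedError raised in setup as "skip this
        # combination" rather than as a benchmark failure.
        if method in _unsupported.get(dtype, set()):
            raise NotImplementedError
        values = ['foo'] * 1000 if dtype == 'object' else np.arange(1000)
        self.df = DataFrame({'key': np.arange(1000) % 10, 'values': values})

    def time_method(self, dtype, method):
        getattr(self.df.groupby('key')['values'], method)()
```

Raising in `setup` keeps the parameter grid dense (every dtype is paired with every method name) while avoiding timings that would only error out.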

ci/environment-dev.yaml

+1

@@ -5,6 +5,7 @@ channels:
 dependencies:
   - Cython
   - NumPy
+  - flake8
   - moto
   - pytest>=3.1
   - python-dateutil>=2.5.0

ci/requirements-3.6_DOC.run

+1 -1

@@ -5,7 +5,7 @@ sphinx
 nbconvert
 nbformat
 notebook
-matplotlib
+matplotlib=2.1*
 seaborn
 scipy
 lxml

ci/requirements_dev.txt

+2 -1

@@ -2,9 +2,10 @@
 # Do not modify directly
 Cython
 NumPy
+flake8
 moto
 pytest>=3.1
 python-dateutil>=2.5.0
 pytz
 setuptools>=3.3
-sphinx
+sphinx
