
Commit 2d4c6ac

Merge remote-tracking branch 'upstream/master' into doctests-travis
2 parents: 5caf04b + 2431641

303 files changed: +22721 −11145 lines


.github/PULL_REQUEST_TEMPLATE.md (+24)

@@ -1,3 +1,27 @@
+Checklist for the pandas documentation sprint (ignore this if you are doing
+an unrelated PR):
+
+- [ ] PR title is "DOC: update the <your-function-or-method> docstring"
+- [ ] The validation script passes: `scripts/validate_docstrings.py <your-function-or-method>`
+- [ ] The PEP8 style check passes: `git diff upstream/master -u -- "*.py" | flake8 --diff`
+- [ ] The html version looks good: `python doc/make.py --single <your-function-or-method>`
+- [ ] It has been proofread on language by another sprint participant
+
+Please include the output of the validation script below between the "```" ticks:
+
+```
+# paste output of "scripts/validate_docstrings.py <your-function-or-method>" here
+# between the "```" (remove this comment, but keep the "```")
+
+```
+
+If the validation script still gives errors, but you think there is a good reason
+to deviate in this case (and there are certainly such cases), please state this
+explicitly.
+
+
+Checklist for other PRs (remove this part if you are doing a PR for the pandas documentation sprint):
+
 - [ ] closes #xxxx
 - [ ] tests added / passed
 - [ ] passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
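A note on the validation step: `scripts/validate_docstrings.py` checks that a docstring follows the numpydoc layout pandas uses (short summary, typed Parameters/Returns sections, doctest Examples). A rough sketch of the shape it expects; `add_one` is a made-up function for illustration, not part of pandas:

def add_one(x):
    """
    Return the input incremented by one.

    Parameters
    ----------
    x : int
        The number to increment.

    Returns
    -------
    int
        The input plus one.

    Examples
    --------
    >>> add_one(1)
    2
    """
    return x + 1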

.gitignore (+4 −2)

@@ -61,6 +61,7 @@ dist
 .coverage
 coverage.xml
 coverage_html_report
+*.pytest_cache
 
 # OS generated files #
 ######################
@@ -88,8 +89,8 @@ scikits
 *.c
 *.cpp
 
-# Performance Testing #
-#######################
+# Unit / Performance Testing #
+##############################
 asv_bench/env/
 asv_bench/html/
 asv_bench/results/
@@ -108,3 +109,4 @@ doc/tmp.sv
 doc/source/styled.xlsx
 doc/source/templates/
 env/
+doc/source/savefig/

.travis.yml (+2 −2)

@@ -56,7 +56,7 @@ matrix:
     # In allow_failures
     - dist: trusty
       env:
-        - JOB="2.7_SLOW" SLOW=true
+        - JOB="3.6_SLOW" SLOW=true
     # In allow_failures
     - dist: trusty
       env:
@@ -72,7 +72,7 @@ matrix:
   allow_failures:
     - dist: trusty
      env:
-        - JOB="2.7_SLOW" SLOW=true
+        - JOB="3.6_SLOW" SLOW=true
    - dist: trusty
      env:
        - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate"

asv_bench/benchmarks/frame_ctor.py (+12 −6)

@@ -16,11 +16,11 @@ class FromDicts(object):
 
     def setup(self):
         N, K = 5000, 50
-        index = tm.makeStringIndex(N)
-        columns = tm.makeStringIndex(K)
-        frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
+        self.index = tm.makeStringIndex(N)
+        self.columns = tm.makeStringIndex(K)
+        frame = DataFrame(np.random.randn(N, K), index=self.index,
+                          columns=self.columns)
         self.data = frame.to_dict()
-        self.some_dict = list(self.data.values())[0]
         self.dict_list = frame.to_dict(orient='records')
         self.data2 = {i: {j: float(j) for j in range(100)}
                       for i in range(2000)}
@@ -31,8 +31,14 @@ def time_list_of_dict(self):
     def time_nested_dict(self):
         DataFrame(self.data)
 
-    def time_dict(self):
-        Series(self.some_dict)
+    def time_nested_dict_index(self):
+        DataFrame(self.data, index=self.index)
+
+    def time_nested_dict_columns(self):
+        DataFrame(self.data, columns=self.columns)
+
+    def time_nested_dict_index_columns(self):
+        DataFrame(self.data, index=self.index, columns=self.columns)
 
     def time_nested_dict_int64(self):
         # nested dict, integer indexes, regression described in #621
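The three new `time_nested_dict_*` benchmarks separate out the cases where the index, the columns, or both are passed explicitly, so the constructor reindexes against known labels instead of inferring them from the dict keys. A standalone sketch of the operations being timed, with sizes shrunk from the benchmark's 5000×50 and assuming the same `tm.makeStringIndex` helper the benchmark module imports:

import numpy as np
from pandas import DataFrame
import pandas.util.testing as tm

N, K = 100, 10
index = tm.makeStringIndex(N)
columns = tm.makeStringIndex(K)
frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)
data = frame.to_dict()  # nested dict: {column -> {index label -> value}}

DataFrame(data)                                # labels inferred from keys
DataFrame(data, index=index)                   # index supplied up front
DataFrame(data, columns=columns)               # columns supplied up front
DataFrame(data, index=index, columns=columns)  # both supplied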

asv_bench/benchmarks/groupby.py (+58 −79)

@@ -14,7 +14,10 @@
 method_blacklist = {
     'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean',
                'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min',
-               'var', 'mad', 'describe', 'std'}
+               'var', 'mad', 'describe', 'std'},
+    'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew',
+                 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe',
+                 'std'}
 }
 
 
@@ -90,45 +93,6 @@ def time_series_groups(self, data, key):
         self.ser.groupby(self.ser).groups
 
 
-class FirstLast(object):
-
-    goal_time = 0.2
-
-    param_names = ['dtype']
-    params = ['float32', 'float64', 'datetime', 'object']
-
-    def setup(self, dtype):
-        N = 10**5
-        # with datetimes (GH7555)
-        if dtype == 'datetime':
-            self.df = DataFrame({'values': date_range('1/1/2011',
-                                                      periods=N,
-                                                      freq='s'),
-                                 'key': range(N)})
-        elif dtype == 'object':
-            self.df = DataFrame({'values': ['foo'] * N,
-                                 'key': range(N)})
-        else:
-            labels = np.arange(N / 10).repeat(10)
-            data = Series(np.random.randn(len(labels)), dtype=dtype)
-            data[::3] = np.nan
-            data[1::3] = np.nan
-            labels = labels.take(np.random.permutation(len(labels)))
-            self.df = DataFrame({'values': data, 'key': labels})
-
-    def time_groupby_first(self, dtype):
-        self.df.groupby('key').first()
-
-    def time_groupby_last(self, dtype):
-        self.df.groupby('key').last()
-
-    def time_groupby_nth_all(self, dtype):
-        self.df.groupby('key').nth(0, dropna='all')
-
-    def time_groupby_nth_none(self, dtype):
-        self.df.groupby('key').nth(0)
-
-
 class GroupManyLabels(object):
 
     goal_time = 0.2
@@ -149,39 +113,40 @@ class Nth(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000, 2)))
-        df.iloc[1, 1] = np.nan
-        return df
-
-    def time_frame_nth_any(self, df):
-        df.groupby(0).nth(0, dropna='any')
-
-    def time_frame_nth(self, df):
-        df.groupby(0).nth(0)
-
+    param_names = ['dtype']
+    params = ['float32', 'float64', 'datetime', 'object']
 
-    def time_series_nth_any(self, df):
-        df[1].groupby(df[0]).nth(0, dropna='any')
+    def setup(self, dtype):
+        N = 10**5
+        # with datetimes (GH7555)
+        if dtype == 'datetime':
+            values = date_range('1/1/2011', periods=N, freq='s')
+        elif dtype == 'object':
+            values = ['foo'] * N
+        else:
+            values = np.arange(N).astype(dtype)
 
-    def time_series_nth(self, df):
-        df[1].groupby(df[0]).nth(0)
+        key = np.arange(N)
+        self.df = DataFrame({'key': key, 'values': values})
+        self.df.iloc[1, 1] = np.nan  # insert missing data
 
+    def time_frame_nth_any(self, dtype):
+        self.df.groupby('key').nth(0, dropna='any')
 
-class NthObject(object):
+    def time_groupby_nth_all(self, dtype):
+        self.df.groupby('key').nth(0, dropna='all')
 
-    goal_time = 0.2
+    def time_frame_nth(self, dtype):
+        self.df.groupby('key').nth(0)
 
-    def setup_cache(self):
-        df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g'])
-        df['obj'] = ['a'] * 5000 + ['b'] * 5000
-        return df
+    def time_series_nth_any(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
 
-    def time_nth(self, df):
-        df.groupby('g').nth(5)
+    def time_groupby_nth_all(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
 
-    def time_nth_last(self, df):
-        df.groupby('g').last()
+    def time_series_nth(self, dtype):
+        self.df['values'].groupby(self.df['key']).nth(0)
 
 
 class DateAttributes(object):
@@ -243,7 +208,7 @@ def time_multi_count(self, df):
         df.groupby(['key1', 'key2']).count()
 
 
-class CountInt(object):
+class CountMultiInt(object):
 
     goal_time = 0.2
 
@@ -255,18 +220,18 @@ def setup_cache(self):
                         'ints2': np.random.randint(0, 1000, size=n)})
         return df
 
-    def time_int_count(self, df):
+    def time_multi_int_count(self, df):
         df.groupby(['key1', 'key2']).count()
 
-    def time_int_nunique(self, df):
+    def time_multi_int_nunique(self, df):
         df.groupby(['key1', 'key2']).nunique()
 
 
 class AggFunctions(object):
 
     goal_time = 0.2
 
-    def setup_cache(self):
+    def setup_cache():
         N = 10**5
         fac1 = np.array(['A', 'B', 'C'], dtype='O')
         fac2 = np.array(['one', 'two'], dtype='O')
@@ -361,9 +326,6 @@ def setup(self):
     def time_multi_size(self):
         self.df.groupby(['key1', 'key2']).size()
 
-    def time_dt_size(self):
-        self.df.groupby(['dates']).size()
-
     def time_dt_timegrouper_size(self):
         with warnings.catch_warnings(record=True):
             self.df.groupby(TimeGrouper(key='dates', freq='M')).size()
@@ -376,15 +338,16 @@ class GroupByMethods(object):
 
     goal_time = 0.2
 
-    param_names = ['dtype', 'method']
-    params = [['int', 'float', 'object'],
+    param_names = ['dtype', 'method', 'application']
+    params = [['int', 'float', 'object', 'datetime'],
               ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
                'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
                'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
                'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
-               'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]
+               'std', 'sum', 'tail', 'unique', 'value_counts', 'var'],
+              ['direct', 'transformation']]
 
-    def setup(self, dtype, method):
+    def setup(self, dtype, method, application):
         if method in method_blacklist.get(dtype, {}):
             raise NotImplementedError  # skip benchmark
         ngroups = 1000
@@ -398,12 +361,28 @@ def setup(self, dtype, method):
                                np.random.random(ngroups) * 10.0])
         elif dtype == 'object':
             key = ['foo'] * size
+        elif dtype == 'datetime':
+            key = date_range('1/1/2011', periods=size, freq='s')
 
         df = DataFrame({'values': values, 'key': key})
-        self.df_groupby_method = getattr(df.groupby('key')['values'], method)
 
-    def time_method(self, dtype, method):
-        self.df_groupby_method()
+        if application == 'transform':
+            if method == 'describe':
+                raise NotImplementedError
+
+            self.as_group_method = lambda: df.groupby(
+                'key')['values'].transform(method)
+            self.as_field_method = lambda: df.groupby(
+                'values')['key'].transform(method)
+        else:
+            self.as_group_method = getattr(df.groupby('key')['values'], method)
+            self.as_field_method = getattr(df.groupby('values')['key'], method)
+
+    def time_dtype_as_group(self, dtype, method, application):
+        self.as_group_method()
+
+    def time_dtype_as_field(self, dtype, method, application):
+        self.as_field_method()
 
 
 class Float32(object):
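Several of the classes above rely on asv's parameterization: asv runs each `time_*` method once per element of the Cartesian product of `params`, passing the values as arguments, and a `NotImplementedError` raised in `setup` marks that combination as skipped, which is how `method_blacklist` excludes unsupported dtype/method pairs. A trimmed-down sketch of the pattern (not the actual benchmark, and with a much shorter blacklist):

import numpy as np
from pandas import DataFrame

# Trimmed blacklist for illustration; the real one is much longer.
method_blacklist = {'object': {'mean'}}

class GroupByMethodsSketch(object):

    goal_time = 0.2

    # asv benchmarks every (dtype, method) pair in the product of params.
    param_names = ['dtype', 'method']
    params = [['int', 'object'], ['count', 'mean']]

    def setup(self, dtype, method):
        if method in method_blacklist.get(dtype, set()):
            raise NotImplementedError  # asv reports this pair as skipped
        size = 1000
        if dtype == 'int':
            key = np.random.randint(0, 10, size=size)
        else:
            key = ['foo'] * size
        df = DataFrame({'key': key, 'values': np.random.randn(size)})
        self.bound_method = getattr(df.groupby('key')['values'], method)

    def time_method(self, dtype, method):
        self.bound_method()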

asv_bench/benchmarks/io/csv.py (−32)

@@ -118,38 +118,6 @@ def time_read_uint64_na_values(self):
                  na_values=self.na_values)
 
 
-class S3(object):
-    # Make sure that we can read part of a file from S3 without
-    # needing to download the entire thing. Use the timeit.default_timer
-    # to measure wall time instead of CPU time -- we want to see
-    # how long it takes to download the data.
-    timer = timeit.default_timer
-    params = ([None, "gzip", "bz2"], ["python", "c"])
-    param_names = ["compression", "engine"]
-
-    def setup(self, compression, engine):
-        if compression == "bz2" and engine == "c" and PY2:
-            # The Python 2 C parser can't read bz2 from open files.
-            raise NotImplementedError
-        try:
-            import s3fs  # noqa
-        except ImportError:
-            # Skip these benchmarks if `boto` is not installed.
-            raise NotImplementedError
-
-        ext = ""
-        if compression == "gzip":
-            ext = ".gz"
-        elif compression == "bz2":
-            ext = ".bz2"
-        self.big_fname = "s3://pandas-test/large_random.csv" + ext
-
-    def time_read_csv_10_rows(self, compression, engine):
-        # Read a small number of rows from a huge (100,000 x 50) table.
-        read_csv(self.big_fname, nrows=10, compression=compression,
-                 engine=engine)
-
-
 class ReadCSVThousands(BaseIO):
 
     goal_time = 0.2
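As the comment in the removed `S3` class explains, it overrode `timer = timeit.default_timer` to measure wall time rather than CPU time, since CPU time barely advances while a process is blocked on network I/O. A small self-contained sketch of the difference, with `time.sleep` standing in for the download:

import time
import timeit

start_wall = timeit.default_timer()  # wall-clock time
start_cpu = time.process_time()      # CPU time of this process only
time.sleep(0.5)                      # stands in for waiting on the network
print('wall: %.2fs' % (timeit.default_timer() - start_wall))  # ~0.50
print('cpu:  %.2fs' % (time.process_time() - start_cpu))      # ~0.00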

asv_bench/benchmarks/timeseries.py (+1 −2)

@@ -75,8 +75,7 @@ def setup(self):
                            freq='S'))
 
     def time_infer_dst(self):
-        with warnings.catch_warnings(record=True):
-            self.index.tz_localize('US/Eastern', infer_dst=True)
+        self.index.tz_localize('US/Eastern', ambiguous='infer')
 
 
 class ResetIndex(object):
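Here `infer_dst=True` was the deprecated spelling (hence the `warnings.catch_warnings` wrapper being dropped); `ambiguous='infer'` is the current keyword and asks `tz_localize` to infer the UTC offset of wall times that occur twice when clocks fall back. A small sketch using a fall-back transition in US/Eastern:

import pandas as pd

# On 2011-11-06 US/Eastern clocks fall back, so 01:00 occurs twice;
# ambiguous='infer' resolves the repeats from their ordering.
idx = pd.DatetimeIndex(['2011-11-06 00:00', '2011-11-06 01:00',
                        '2011-11-06 01:00', '2011-11-06 02:00'])
print(idx.tz_localize('US/Eastern', ambiguous='infer'))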

ci/build_docs.sh (+1)

@@ -15,6 +15,7 @@ if [ "$DOC" ]; then
   source activate pandas
 
   mv "$TRAVIS_BUILD_DIR"/doc /tmp
+  mv "$TRAVIS_BUILD_DIR/LICENSE" /tmp  # included in the docs.
   cd /tmp/doc
 
   echo ###############################

ci/environment-dev.yaml (+1)

@@ -5,6 +5,7 @@ channels:
 dependencies:
   - Cython
   - NumPy
+  - flake8
   - moto
   - pytest>=3.1
   - python-dateutil>=2.5.0
