
Commit 6a1efaa

Merge branch 'wrong-sparseblock-initialization' of https://github.com/Licht-T/pandas into wrong-sparseblock-initialization
2 parents: c22000a + b9e9172


286 files changed: +14195, -7330 lines


.travis.yml (+2)

@@ -121,6 +121,8 @@ script:
   - ci/script_single.sh
   - ci/script_multi.sh
   - ci/lint.sh
+  - echo "checking imports"
+  - source activate pandas && python ci/check_imports.py
   - echo "script done"

 after_success:

MANIFEST.in (+1, -1)

@@ -1,7 +1,7 @@
 include MANIFEST.in
 include LICENSE
 include RELEASE.md
-include README.rst
+include README.md
 include setup.py
 include pyproject.toml

asv_bench/benchmarks/index_object.py (+20)

@@ -199,3 +199,23 @@ def time_datetime_level_values_full(self):

     def time_datetime_level_values_sliced(self):
         self.mi[:10].values
+
+
+class Range(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.idx_inc = RangeIndex(start=0, stop=10**7, step=3)
+        self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3)
+
+    def time_max(self):
+        self.idx_inc.max()
+
+    def time_max_trivial(self):
+        self.idx_dec.max()
+
+    def time_min(self):
+        self.idx_dec.min()
+
+    def time_min_trivial(self):
+        self.idx_inc.min()
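
For orientation (illustrative only, not part of the commit): the "trivial" cases are presumably the ones where the requested extremum is simply the start value of the range, so no work beyond reading an attribute should be needed. A minimal sketch with the same index definitions:

from pandas import RangeIndex

idx_inc = RangeIndex(start=0, stop=10**7, step=3)     # increasing
idx_dec = RangeIndex(start=10**7, stop=-1, step=-3)   # decreasing

idx_dec.max()   # 10000000, the start value: the "trivial" direction
idx_inc.min()   # 0, likewise trivial
idx_inc.max()   # 9999999, must be derived from start/stop/step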

asv_bench/benchmarks/io_bench.py (+30)

@@ -1,3 +1,4 @@
+import os
 from .pandas_vb_common import *
 from pandas import concat, Timestamp, compat
 try:

@@ -192,3 +193,32 @@ def time_read_nrows(self, compression, engine):
             ext = ".bz2"
         pd.read_csv(self.big_fname + ext, nrows=10,
                     compression=compression, engine=engine)
+
+
+class read_json_lines(object):
+    goal_time = 0.2
+    fname = "__test__.json"
+
+    def setup(self):
+        self.N = 100000
+        self.C = 5
+        self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]))
+        self.df.to_json(self.fname, orient="records", lines=True)
+
+    def teardown(self):
+        try:
+            os.remove(self.fname)
+        except:
+            pass
+
+    def time_read_json_lines(self):
+        pd.read_json(self.fname, lines=True)
+
+    def time_read_json_lines_chunk(self):
+        pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N // 4))
+
+    def peakmem_read_json_lines(self):
+        pd.read_json(self.fname, lines=True)
+
+    def peakmem_read_json_lines_chunk(self):
+        pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N // 4))
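
As a side note (a sketch only; the file name and sizes below are invented), the round trip this new benchmark times is the line-delimited JSON path: each row is written as one JSON object per line, and the file can be read back either whole or lazily in chunks and reassembled.

import pandas as pd

df = pd.DataFrame({'a': range(8), 'b': range(8)})
df.to_json('rows.json', orient='records', lines=True)          # one JSON object per line

whole = pd.read_json('rows.json', lines=True)                  # parse everything at once
chunks = pd.read_json('rows.json', lines=True, chunksize=2)    # iterator of DataFrames
combined = pd.concat(chunks)                                   # reassemble, as the benchmark does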

asv_bench/benchmarks/sparse.py (+66, -3)

@@ -1,8 +1,8 @@
-from itertools import repeat
+import itertools

 from .pandas_vb_common import *
 import scipy.sparse
-from pandas import SparseSeries, SparseDataFrame
+from pandas import SparseSeries, SparseDataFrame, SparseArray


 class sparse_series_to_frame(object):

@@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self):
         SparseDataFrame(self.series)


+class sparse_array_constructor(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1)
+        self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64)
+        self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64)
+
+        self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64)
+        self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64)
+
+        self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan)
+        self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan)
+
+        self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0)
+        self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0)
+
+    def make_numeric_array(self, length, dense_size, fill_value, dtype):
+        arr = np.array([fill_value] * length, dtype=dtype)
+        indexer = np.unique(np.random.randint(0, length, dense_size))
+        arr[indexer] = np.random.randint(0, 100, len(indexer))
+        return (arr, fill_value, dtype)
+
+    def make_object_array(self, length, dense_size, fill_value):
+        elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object)
+        arr = np.array([fill_value] * length, dtype=np.object)
+        indexer = np.unique(np.random.randint(0, length, dense_size))
+        arr[indexer] = np.random.choice(elems, len(indexer))
+        return (arr, fill_value, np.object)
+
+    def time_sparse_array_constructor_int64_10percent(self):
+        arr, fill_value, dtype = self.int64_10percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_int64_1percent(self):
+        arr, fill_value, dtype = self.int64_1percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_float64_10percent(self):
+        arr, fill_value, dtype = self.float64_10percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_float64_1percent(self):
+        arr, fill_value, dtype = self.float64_1percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_object_nan_fill_value_10percent(self):
+        arr, fill_value, dtype = self.object_nan_fill_value_10percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_object_nan_fill_value_1percent(self):
+        arr, fill_value, dtype = self.object_nan_fill_value_1percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self):
+        arr, fill_value, dtype = self.object_non_nan_fill_value_10percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+    def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self):
+        arr, fill_value, dtype = self.object_non_nan_fill_value_1percent
+        SparseArray(arr, fill_value=fill_value, dtype=dtype)
+
+
 class sparse_frame_constructor(object):
     goal_time = 0.2

@@ -33,7 +96,7 @@ def time_sparse_from_scipy(self):
         SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005))

     def time_sparse_from_dict(self):
-        SparseDataFrame(dict(zip(range(1000), repeat([0]))))
+        SparseDataFrame(dict(zip(range(1000), itertools.repeat([0]))))


 class sparse_series_from_coo(object):
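
For readers unfamiliar with the constructor being timed here, a minimal sketch (values invented for illustration): a SparseArray keeps only the entries that differ from fill_value, which is why the benchmarks vary both the dtype and the fraction of dense values.

import numpy as np
from pandas import SparseArray

dense = np.zeros(1000, dtype=np.int64)
dense[[10, 500, 999]] = 7                             # three non-fill entries

sp = SparseArray(dense, fill_value=0, dtype=np.int64)
sp.density                                            # 0.003: only values != fill_value are stored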

asv_bench/benchmarks/timeseries.py (+1, -1)

@@ -56,7 +56,7 @@ def setup(self):
         self.no_freq = self.rng7[:50000].append(self.rng7[50002:])
         self.d_freq = self.rng7[:50000].append(self.rng7[50000:])

-        self.rng8 = date_range(start='1/1/1700', freq='B', periods=100000)
+        self.rng8 = date_range(start='1/1/1700', freq='B', periods=75000)
         self.b_freq = self.rng8[:50000].append(self.rng8[50000:])

     def time_add_timedelta(self):

asv_bench/benchmarks/timestamp.py (+29)

@@ -1,5 +1,7 @@
 from .pandas_vb_common import *
 from pandas import to_timedelta, Timestamp
+import pytz
+import datetime


 class TimestampProperties(object):

@@ -58,3 +60,30 @@ def time_is_leap_year(self):

     def time_microsecond(self):
         self.ts.microsecond
+
+
+class TimestampOps(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.ts = Timestamp('2017-08-25 08:16:14')
+        self.ts_tz = Timestamp('2017-08-25 08:16:14', tz='US/Eastern')
+
+        dt = datetime.datetime(2016, 3, 27, 1)
+        self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo
+        self.ts2 = Timestamp(dt)
+
+    def time_replace_tz(self):
+        self.ts.replace(tzinfo=pytz.timezone('US/Eastern'))
+
+    def time_replace_across_dst(self):
+        self.ts2.replace(tzinfo=self.tzinfo)
+
+    def time_replace_None(self):
+        self.ts_tz.replace(tzinfo=None)
+
+    def time_to_pydatetime(self):
+        self.ts.to_pydatetime()
+
+    def time_to_pydatetime_tz(self):
+        self.ts_tz.to_pydatetime()
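
A quick illustration of the operations the new TimestampOps class exercises (example timestamp only, not part of the commit): Timestamp.replace(tzinfo=...) attaches or removes a zone without shifting the wall-clock value, and to_pydatetime converts back to a stdlib datetime.

import pytz
from pandas import Timestamp

ts = Timestamp('2017-08-25 08:16:14')

localized = ts.replace(tzinfo=pytz.timezone('US/Eastern'))  # attach a zone, same wall clock
naive = localized.replace(tzinfo=None)                      # drop it again
as_datetime = ts.to_pydatetime()                            # plain datetime.datetime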

ci/check_imports.py (+36)

@@ -0,0 +1,36 @@
+"""
+Check that certain modules are not loaded by `import pandas`
+"""
+import sys
+
+blacklist = {
+    'bs4',
+    'html5lib',
+    'ipython',
+    'jinja2',
+    'lxml',
+    'matplotlib',
+    'numexpr',
+    'openpyxl',
+    'py',
+    'pytest',
+    's3fs',
+    'scipy',
+    'tables',
+    'xlrd',
+    'xlsxwriter',
+    'xlwt',
+}
+
+
+def main():
+    import pandas  # noqa
+
+    modules = set(x.split('.')[0] for x in sys.modules)
+    imported = modules & blacklist
+    if modules & blacklist:
+        sys.exit("Imported {}".format(imported))
+
+
+if __name__ == '__main__':
+    main()
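
To see what the new Travis step guards against, the core of the check can be reproduced interactively (a sketch of the idea only, with a shortened blacklist; not a replacement for running ci/check_imports.py):

import sys

import pandas  # noqa: F401

loaded = {name.split('.')[0] for name in sys.modules}
offenders = loaded & {'matplotlib', 'scipy', 'numexpr'}   # a few of the blacklisted packages
if offenders:
    raise SystemExit("Imported {}".format(offenders))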

ci/requirements-2.7.run (+1, -1)

@@ -8,7 +8,7 @@ matplotlib
 openpyxl=1.6.2
 xlrd=0.9.2
 sqlalchemy=0.9.6
-lxml=3.2.1
+lxml
 scipy
 xlsxwriter=0.5.2
 s3fs

ci/requirements-2.7_LOCALE.run (+1, -1)

@@ -8,5 +8,5 @@ xlrd=0.9.2
 bottleneck=1.0.0
 matplotlib=1.4.3
 sqlalchemy=0.8.1
-lxml=3.2.1
+lxml
 scipy

ci/requirements-2.7_SLOW.run (+1, -1)

@@ -16,4 +16,4 @@ s3fs
 psycopg2
 pymysql
 html5lib
-beautiful-soup
+beautifulsoup4

ci/requirements-2.7_WIN.run (+2, -2)

@@ -8,11 +8,11 @@ matplotlib
 openpyxl
 xlrd
 sqlalchemy
-lxml=3.2.1
+lxml
 scipy
 xlsxwriter
 s3fs
 bottleneck
 html5lib
-beautiful-soup
+beautifulsoup4
 jinja2=2.8

ci/requirements-3.6_NUMPY_DEV.build (-1)

@@ -1,3 +1,2 @@
 python=3.6*
 pytz
-cython

ci/requirements-3.6_NUMPY_DEV.build.sh (+3)

@@ -14,4 +14,7 @@ pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
 # install dateutil from master
 pip install -U git+git://github.com/dateutil/dateutil.git

+# cython via pip
+pip install cython
+
 true

ci/requirements-3.6_WIN.run (-1)

@@ -8,7 +8,6 @@ xlrd
 xlwt
 scipy
 feather-format
-pyarrow
 numexpr
 pytables
 matplotlib

ci/requirements_all.txt (+1, -1)

@@ -13,7 +13,7 @@ xlrd
 xlwt
 html5lib
 patsy
-beautiful-soup
+beautifulsoup4
 numpy
 cython
 scipy

doc/README.rst (+5, -3)

@@ -3,9 +3,11 @@
 Contributing to the documentation
 =================================

-If you're not the developer type, contributing to the documentation is still
-of huge value. You don't even have to be an expert on
-*pandas* to do so! Something as simple as rewriting small passages for clarity
+Whether you are someone who loves writing, teaching, or development,
+contributing to the documentation is a huge value. If you don't see yourself
+as a developer type, please don't stress and know that we want you to
+contribute. You don't even have to be an expert on *pandas* to do so!
+Something as simple as rewriting small passages for clarity
 as you reference the docs is a simple but effective way to contribute. The
 next person to read that passage will be in your debt!

doc/source/10min.rst (+2, -12)

@@ -11,7 +11,7 @@
    np.random.seed(123456)
    np.set_printoptions(precision=4, suppress=True)
    import matplotlib
-   matplotlib.style.use('ggplot')
+   # matplotlib.style.use('default')
    pd.options.display.max_rows = 15

    #### portions of this were borrowed from the

@@ -95,17 +95,7 @@ will be completed:
    df2.append          df2.combine_first
    df2.apply           df2.compound
    df2.applymap        df2.consolidate
-   df2.as_blocks       df2.convert_objects
-   df2.asfreq          df2.copy
-   df2.as_matrix       df2.corr
-   df2.astype          df2.corrwith
-   df2.at              df2.count
-   df2.at_time         df2.cov
-   df2.axes            df2.cummax
-   df2.B               df2.cummin
-   df2.between_time    df2.cumprod
-   df2.bfill           df2.cumsum
-   df2.blocks          df2.D
+   df2.D

 As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
 tab completed. ``E`` is there as well; the rest of the attributes have been
tab completed. ``E`` is there as well; the rest of the attributes have been
