Skip to content

Commit 8942f76

Browse files
committed
Merge remote-tracking branch 'upstream/master' into nchmura4-nchmura-df-style-hide
2 parents 709ab50 + e2a0251 commit 8942f76

File tree

100 files changed

+3194
-1939
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

100 files changed

+3194
-1939
lines changed

appveyor.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ install:
5959

6060
# install our build environment
6161
- cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false
62-
- cmd: conda update -q conda
62+
# - cmd: conda update -q conda
6363
- cmd: conda config --set ssl_verify false
6464

6565
# add the pandas channel *before* defaults to have defaults take priority

asv_bench/benchmarks/io_bench.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from .pandas_vb_common import *
23
from pandas import concat, Timestamp, compat
34
try:
@@ -192,3 +193,32 @@ def time_read_nrows(self, compression, engine):
192193
ext = ".bz2"
193194
pd.read_csv(self.big_fname + ext, nrows=10,
194195
compression=compression, engine=engine)
196+
197+
198+
class read_json_lines(object):
199+
goal_time = 0.2
200+
fname = "__test__.json"
201+
202+
def setup(self):
203+
self.N = 100000
204+
self.C = 5
205+
self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]))
206+
self.df.to_json(self.fname,orient="records",lines=True)
207+
208+
def teardown(self):
209+
try:
210+
os.remove(self.fname)
211+
except:
212+
pass
213+
214+
def time_read_json_lines(self):
215+
pd.read_json(self.fname, lines=True)
216+
217+
def time_read_json_lines_chunk(self):
218+
pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))
219+
220+
def peakmem_read_json_lines(self):
221+
pd.read_json(self.fname, lines=True)
222+
223+
def peakmem_read_json_lines_chunk(self):
224+
pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4))

asv_bench/benchmarks/sparse.py

+64-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from .pandas_vb_common import *
44
import scipy.sparse
5-
from pandas import SparseSeries, SparseDataFrame
5+
from pandas import SparseSeries, SparseDataFrame, SparseArray
66

77

88
class sparse_series_to_frame(object):
@@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self):
2323
SparseDataFrame(self.series)
2424

2525

26+
class sparse_array_constructor(object):
27+
goal_time = 0.2
28+
29+
def setup(self):
30+
np.random.seed(1)
31+
self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64)
32+
self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64)
33+
34+
self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64)
35+
self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64)
36+
37+
self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan)
38+
self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan)
39+
40+
self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0)
41+
self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0)
42+
43+
def make_numeric_array(self, length, dense_size, fill_value, dtype):
44+
arr = np.array([fill_value] * length, dtype=dtype)
45+
indexer = np.unique(np.random.randint(0, length, dense_size))
46+
arr[indexer] = np.random.randint(0, 100, len(indexer))
47+
return (arr, fill_value, dtype)
48+
49+
def make_object_array(self, length, dense_size, fill_value):
50+
elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object)
51+
arr = np.array([fill_value] * length, dtype=np.object)
52+
indexer = np.unique(np.random.randint(0, length, dense_size))
53+
arr[indexer] = np.random.choice(elems, len(indexer))
54+
return (arr, fill_value, np.object)
55+
56+
def time_sparse_array_constructor_int64_10percent(self):
57+
arr, fill_value, dtype = self.int64_10percent
58+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
59+
60+
def time_sparse_array_constructor_int64_1percent(self):
61+
arr, fill_value, dtype = self.int64_1percent
62+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
63+
64+
def time_sparse_array_constructor_float64_10percent(self):
65+
arr, fill_value, dtype = self.float64_10percent
66+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
67+
68+
def time_sparse_array_constructor_float64_1percent(self):
69+
arr, fill_value, dtype = self.float64_1percent
70+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
71+
72+
def time_sparse_array_constructor_object_nan_fill_value_10percent(self):
73+
arr, fill_value, dtype = self.object_nan_fill_value_10percent
74+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
75+
76+
def time_sparse_array_constructor_object_nan_fill_value_1percent(self):
77+
arr, fill_value, dtype = self.object_nan_fill_value_1percent
78+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
79+
80+
def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self):
81+
arr, fill_value, dtype = self.object_non_nan_fill_value_10percent
82+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
83+
84+
def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self):
85+
arr, fill_value, dtype = self.object_non_nan_fill_value_1percent
86+
SparseArray(arr, fill_value=fill_value, dtype=dtype)
87+
88+
2689
class sparse_frame_constructor(object):
2790
goal_time = 0.2
2891

ci/install.ps1

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ $MINICONDA_URL = "http://repo.continuum.io/miniconda/"
77

88
function DownloadMiniconda ($python_version, $platform_suffix) {
99
$webclient = New-Object System.Net.WebClient
10-
$filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe"
10+
$filename = "Miniconda3-4.3.21-Windows-" + $platform_suffix + ".exe"
1111
$url = $MINICONDA_URL + $filename
1212

1313
$basedir = $pwd.Path + "\"
@@ -85,7 +85,7 @@ function UpdateConda ($python_home) {
8585

8686
function main () {
8787
InstallMiniconda "3.5" $env:PYTHON_ARCH $env:CONDA_ROOT
88-
UpdateConda $env:CONDA_ROOT
88+
# UpdateConda $env:CONDA_ROOT
8989
InstallCondaPackages $env:CONDA_ROOT "conda-build jinja2 anaconda-client"
9090
}
9191

ci/install_circle.sh

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,17 @@ echo "[Using clean Miniconda install]"
1010
rm -rf "$MINICONDA_DIR"
1111

1212
# install miniconda
13-
wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1
13+
# wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1
14+
# Pin miniconda
15+
wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -q -O miniconda.sh || exit 1
1416
bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
1517

1618
export PATH="$MINICONDA_DIR/bin:$PATH"
1719

1820
echo "[update conda]"
1921
conda config --set ssl_verify false || exit 1
2022
conda config --set always_yes true --set changeps1 false || exit 1
21-
conda update -q conda
23+
# conda update -q conda
2224

2325
# add the pandas channel to take priority
2426
# to add extra packages

ci/install_travis.sh

+7-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,13 @@ fi
3434

3535
# install miniconda
3636
if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
37-
time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1
37+
# temporarily pin miniconda
38+
# time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1
39+
time wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-MacOSX-x86_64.sh -O miniconda.sh || exit 1
3840
else
39-
time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1
41+
# temporarily pin miniconda
42+
# time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1
43+
time wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh || exit 1
4044
fi
4145
time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
4246

@@ -48,7 +52,7 @@ echo
4852
echo "[update conda]"
4953
conda config --set ssl_verify false || exit 1
5054
conda config --set quiet true --set always_yes true --set changeps1 false || exit 1
51-
conda update -q conda
55+
# conda update -q conda
5256

5357
echo
5458
echo "[add channels]"

ci/requirements-2.7_SLOW.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,4 @@ s3fs
1616
psycopg2
1717
pymysql
1818
html5lib
19-
beautiful-soup
19+
beautifulsoup4

ci/requirements-2.7_WIN.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ xlsxwriter
1414
s3fs
1515
bottleneck
1616
html5lib
17-
beautiful-soup
17+
beautifulsoup4
1818
jinja2=2.8

ci/requirements_all.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ xlrd
1313
xlwt
1414
html5lib
1515
patsy
16-
beautiful-soup
16+
beautifulsoup4
1717
numpy
1818
cython
1919
scipy

doc/source/10min.rst

+2-12
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
np.random.seed(123456)
1212
np.set_printoptions(precision=4, suppress=True)
1313
import matplotlib
14-
matplotlib.style.use('ggplot')
14+
# matplotlib.style.use('default')
1515
pd.options.display.max_rows = 15
1616
1717
#### portions of this were borrowed from the
@@ -95,17 +95,7 @@ will be completed:
9595
df2.append df2.combine_first
9696
df2.apply df2.compound
9797
df2.applymap df2.consolidate
98-
df2.as_blocks df2.convert_objects
99-
df2.asfreq df2.copy
100-
df2.as_matrix df2.corr
101-
df2.astype df2.corrwith
102-
df2.at df2.count
103-
df2.at_time df2.cov
104-
df2.axes df2.cummax
105-
df2.B df2.cummin
106-
df2.between_time df2.cumprod
107-
df2.bfill df2.cumsum
108-
df2.blocks df2.D
98+
df2.D
10999

110100
As you can see, the columns ``A``, ``B``, ``C``, and ``D`` are automatically
111101
tab completed. ``E`` is there as well; the rest of the attributes have been

doc/source/advanced.rst

+23
Original file line numberDiff line numberDiff line change
@@ -833,12 +833,21 @@ Of course if you need integer based selection, then use ``iloc``
833833
IntervalIndex
834834
~~~~~~~~~~~~~
835835
836+
:class:`IntervalIndex` together with its own dtype, ``interval`` as well as the
837+
:class:`Interval` scalar type, allow first-class support in pandas for interval
838+
notation.
839+
840+
The ``IntervalIndex`` allows some unique indexing and is also used as a
841+
return type for the categories in :func:`cut` and :func:`qcut`.
842+
836843
.. versionadded:: 0.20.0
837844
838845
.. warning::
839846
840847
These indexing behaviors are provisional and may change in a future version of pandas.
841848
849+
An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index.
850+
842851
.. ipython:: python
843852
844853
df = pd.DataFrame({'A': [1, 2, 3, 4]},
@@ -860,6 +869,20 @@ If you select a label *contained* within an interval, this will also select the
860869
df.loc[2.5]
861870
df.loc[[2.5, 3.5]]
862871
872+
``Interval`` and ``IntervalIndex`` are used by ``cut`` and ``qcut``:
873+
874+
.. ipython:: python
875+
876+
c = pd.cut(range(4), bins=2)
877+
c
878+
c.categories
879+
880+
Furthermore, ``IntervalIndex`` allows one to bin *other* data with these same
881+
bins, with ``NaN`` representing a missing value similar to other dtypes.
882+
883+
.. ipython:: python
884+
885+
pd.cut([0, 3, 5, 1], bins=c.categories)
863886
864887
Miscellaneous indexing FAQ
865888
--------------------------

doc/source/computation.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
np.set_printoptions(precision=4, suppress=True)
99
import pandas as pd
1010
import matplotlib
11-
matplotlib.style.use('ggplot')
11+
# matplotlib.style.use('default')
1212
import matplotlib.pyplot as plt
1313
plt.close('all')
1414
pd.options.display.max_rows=15

doc/source/cookbook.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
pd.options.display.max_rows=15
2121
2222
import matplotlib
23-
matplotlib.style.use('ggplot')
23+
# matplotlib.style.use('default')
2424
2525
np.set_printoptions(precision=4, suppress=True)
2626

doc/source/dsintro.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
pd.options.display.max_rows = 15
1111
1212
import matplotlib
13-
matplotlib.style.use('ggplot')
13+
# matplotlib.style.use('default')
1414
import matplotlib.pyplot as plt
1515
plt.close('all')
1616

doc/source/gotchas.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Frequently Asked Questions (FAQ)
1414
import pandas as pd
1515
pd.options.display.max_rows = 15
1616
import matplotlib
17-
matplotlib.style.use('ggplot')
17+
# matplotlib.style.use('default')
1818
import matplotlib.pyplot as plt
1919
plt.close('all')
2020

doc/source/groupby.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import pandas as pd
1111
pd.options.display.max_rows = 15
1212
import matplotlib
13-
matplotlib.style.use('ggplot')
13+
# matplotlib.style.use('default')
1414
import matplotlib.pyplot as plt
1515
plt.close('all')
1616
from collections import OrderedDict

doc/source/io.rst

+10
Original file line numberDiff line numberDiff line change
@@ -1845,6 +1845,7 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
18451845
seconds, milliseconds, microseconds or nanoseconds respectively.
18461846
- ``lines`` : reads file as one json object per line.
18471847
- ``encoding`` : The encoding to use to decode py3 bytes.
1848+
- ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
18481849

18491850
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.
18501851

@@ -2049,6 +2050,10 @@ Line delimited json
20492050
pandas is able to read and write line-delimited json files that are common in data processing pipelines
20502051
using Hadoop or Spark.
20512052

2053+
.. versionadded:: 0.21.0
2054+
2055+
For line-delimited json files, pandas can also return an iterator which reads in ``chunksize`` lines at a time. This can be useful for large files or to read from a stream.
2056+
20522057
.. ipython:: python
20532058
20542059
jsonl = '''
@@ -2059,6 +2064,11 @@ using Hadoop or Spark.
20592064
df
20602065
df.to_json(orient='records', lines=True)
20612066
2067+
# reader is an iterator that returns `chunksize` lines each iteration
2068+
reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1)
2069+
reader
2070+
for chunk in reader:
2071+
print(chunk)
20622072
20632073
.. _io.table_schema:
20642074

doc/source/missing_data.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pandas as pd
88
pd.options.display.max_rows=15
99
import matplotlib
10-
matplotlib.style.use('ggplot')
10+
# matplotlib.style.use('default')
1111
import matplotlib.pyplot as plt
1212
1313
.. _missing_data:

0 commit comments

Comments
 (0)