Skip to content

Commit 5bb72c7

Browse files
committed
Merge remote-tracking branch 'pandas-dev/master' into Bug13247
2 parents a1d5d40 + 026e748 commit 5bb72c7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+4290
-3003
lines changed

asv_bench/benchmarks/join_merge.py

+30-6
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pandas import ordered_merge as merge_ordered
77

88

9-
#----------------------------------------------------------------------
9+
# ----------------------------------------------------------------------
1010
# Append
1111

1212
class Append(object):
@@ -35,7 +35,7 @@ def time_append_mixed(self):
3535
self.mdf1.append(self.mdf2)
3636

3737

38-
#----------------------------------------------------------------------
38+
# ----------------------------------------------------------------------
3939
# Concat
4040

4141
class Concat(object):
@@ -120,7 +120,7 @@ def time_f_ordered_axis1(self):
120120
concat(self.frames_f, axis=1, ignore_index=True)
121121

122122

123-
#----------------------------------------------------------------------
123+
# ----------------------------------------------------------------------
124124
# Joins
125125

126126
class Join(object):
@@ -202,7 +202,7 @@ def time_join_non_unique_equal(self):
202202
(self.fracofday * self.temp[self.fracofday.index])
203203

204204

205-
#----------------------------------------------------------------------
205+
# ----------------------------------------------------------------------
206206
# Merges
207207

208208
class Merge(object):
@@ -257,7 +257,31 @@ def time_i8merge(self):
257257
merge(self.left, self.right, how='outer')
258258

259259

260-
#----------------------------------------------------------------------
260+
class MergeCategoricals(object):
261+
goal_time = 0.2
262+
263+
def setup(self):
264+
self.left_object = pd.DataFrame(
265+
{'X': np.random.choice(range(0, 10), size=(10000,)),
266+
'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
267+
268+
self.right_object = pd.DataFrame(
269+
{'X': np.random.choice(range(0, 10), size=(10000,)),
270+
'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
271+
272+
self.left_cat = self.left_object.assign(
273+
Y=self.left_object['Y'].astype('category'))
274+
self.right_cat = self.right_object.assign(
275+
Z=self.right_object['Z'].astype('category'))
276+
277+
def time_merge_object(self):
278+
merge(self.left_object, self.right_object, on='X')
279+
280+
def time_merge_cat(self):
281+
merge(self.left_cat, self.right_cat, on='X')
282+
283+
284+
# ----------------------------------------------------------------------
261285
# Ordered merge
262286

263287
class MergeOrdered(object):
@@ -332,7 +356,7 @@ def time_multiby(self):
332356
merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'])
333357

334358

335-
#----------------------------------------------------------------------
359+
# ----------------------------------------------------------------------
336360
# data alignment
337361

338362
class Align(object):

ci/build_docs.sh

-3
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ if [ x"$DOC_BUILD" != x"" ]; then
2323

2424
source activate pandas
2525

26-
# install sudo deps
27-
time sudo apt-get $APT_ARGS install dvipng texlive-latex-base texlive-latex-extra
28-
2926
mv "$TRAVIS_BUILD_DIR"/doc /tmp
3027
cd /tmp/doc
3128

ci/requirements_dev.txt

-1
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ numpy
44
cython
55
pytest
66
pytest-cov
7-
pytest-xdist
87
flake8

doc/make.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def html():
197197
print(e)
198198
print("Failed to convert %s" % nb)
199199

200-
if os.system('sphinx-build -j 2 -P -b html -d build/doctrees '
200+
if os.system('sphinx-build -P -b html -d build/doctrees '
201201
'source build/html'):
202202
raise SystemExit("Building HTML failed.")
203203
try:

doc/source/api.rst

+11-4
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ Google BigQuery
118118
:toctree: generated/
119119

120120
read_gbq
121-
to_gbq
122121

123122

124123
.. currentmodule:: pandas
@@ -712,8 +711,8 @@ Serialization / IO / Conversion
712711
Series.to_string
713712
Series.to_clipboard
714713

715-
Sparse methods
716-
~~~~~~~~~~~~~~
714+
Sparse
715+
~~~~~~
717716
.. autosummary::
718717
:toctree: generated/
719718

@@ -1031,6 +1030,13 @@ Serialization / IO / Conversion
10311030
DataFrame.to_string
10321031
DataFrame.to_clipboard
10331032

1033+
Sparse
1034+
~~~~~~
1035+
.. autosummary::
1036+
:toctree: generated/
1037+
1038+
SparseDataFrame.to_coo
1039+
10341040
.. _api.panel:
10351041

10361042
Panel
@@ -1237,7 +1243,7 @@ Serialization / IO / Conversion
12371243
Panel.to_frame
12381244
Panel.to_xarray
12391245
Panel.to_clipboard
1240-
1246+
12411247
.. _api.index:
12421248

12431249
Index
@@ -1405,6 +1411,7 @@ MultiIndex
14051411
:toctree: generated/
14061412

14071413
MultiIndex
1414+
IndexSlice
14081415

14091416
MultiIndex Components
14101417
~~~~~~~~~~~~~~~~~~~~~~

doc/source/categorical.rst

+3
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,9 @@ In this case the categories are not the same and so an error is raised:
646646
647647
The same applies to ``df.append(df_different)``.
648648

649+
See also the section on :ref:`merge dtypes<merging.dtypes>` for notes about preserving merge dtypes and performance.
650+
651+
649652
.. _categorical.union:
650653

651654
Unioning

doc/source/conf.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616
import inspect
1717
from pandas.compat import u, PY3
1818

19+
# https://github.com/sphinx-doc/sphinx/pull/2325/files
20+
# Workaround for sphinx-build recursion limit overflow:
21+
# pickle.dump(doctree, f, pickle.HIGHEST_PROTOCOL)
22+
# RuntimeError: maximum recursion depth exceeded while pickling an object
23+
#
24+
# Python's default allowed recursion depth is 1000.
25+
sys.setrecursionlimit(5000)
26+
1927
# If extensions (or modules to document with autodoc) are in another directory,
2028
# add these directories to sys.path here. If the directory is relative to the
2129
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -46,7 +54,7 @@
4654
'ipython_sphinxext.ipython_console_highlighting',
4755
'sphinx.ext.intersphinx',
4856
'sphinx.ext.coverage',
49-
'sphinx.ext.pngmath',
57+
'sphinx.ext.mathjax',
5058
'sphinx.ext.ifconfig',
5159
'sphinx.ext.linkcode',
5260
]

doc/source/index.rst.template

-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ See the package overview for more detail about what's in the library.
116116
whatsnew
117117
install
118118
contributing
119-
faq
120119
overview
121120
10min
122121
tutorials

doc/source/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ Optional Dependencies
260260
<http://www.vergenet.net/~conrad/software/xsel/>`__, or `xclip
261261
<https://github.com/astrand/xclip/>`__: necessary to use
262262
:func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
263-
* For Google BigQuery I/O - see :ref:`here <io.bigquery_deps>`.
263+
* For Google BigQuery I/O - see `here <https://pandas-gbq.readthedocs.io/en/latest/install.html#dependencies>`__
264264

265265
* `Backports.lzma <https://pypi.python.org/pypi/backports.lzma/>`__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library.
266266
* One of the following combinations of libraries is needed to use the

doc/source/io.rst

+61-4
Original file line numberDiff line numberDiff line change
@@ -2070,9 +2070,9 @@ by the Table Schema spec.
20702070
The full list of types supported are described in the Table Schema
20712071
spec. This table shows the mapping from pandas types:
20722072

2073-
============== =================
2073+
=============== =================
20742074
Pandas type Table Schema type
2075-
============== =================
2075+
=============== =================
20762076
int64 integer
20772077
float64 number
20782078
bool boolean
@@ -3042,9 +3042,66 @@ any pickled pandas object (or any other pickled object) from file:
30423042
See `this question <http://stackoverflow.com/questions/20444593/pandas-compiled-from-source-default-pickle-behavior-changed>`__
30433043
for a detailed explanation.
30443044

3045-
.. note::
3045+
.. _io.pickle.compression:
3046+
3047+
Compressed pickle files
3048+
'''''''''''''''''''''''
3049+
3050+
.. versionadded:: 0.20.0
3051+
3052+
:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read
3053+
and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing.
3054+
``zip`` files support read only and must contain only one data file
3055+
to be read in.
3056+
3057+
The compression type can be an explicit parameter or be inferred from the file extension.
3058+
If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
3059+
``'.xz'``, respectively.
3060+
3061+
.. ipython:: python
3062+
3063+
df = pd.DataFrame({
3064+
'A': np.random.randn(1000),
3065+
'B': 'foo',
3066+
'C': pd.date_range('20130101', periods=1000, freq='s')})
3067+
df
3068+
3069+
Using an explicit compression type
3070+
3071+
.. ipython:: python
3072+
3073+
df.to_pickle("data.pkl.compress", compression="gzip")
3074+
rt = pd.read_pickle("data.pkl.compress", compression="gzip")
3075+
rt
3076+
3077+
Inferring compression type from the extension
3078+
3079+
.. ipython:: python
3080+
3081+
df.to_pickle("data.pkl.xz", compression="infer")
3082+
rt = pd.read_pickle("data.pkl.xz", compression="infer")
3083+
rt
30463084
3047-
These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated.
3085+
The default is to ``'infer'``
3086+
3087+
.. ipython:: python
3088+
3089+
df.to_pickle("data.pkl.gz")
3090+
rt = pd.read_pickle("data.pkl.gz")
3091+
rt
3092+
3093+
df["A"].to_pickle("s1.pkl.bz2")
3094+
rt = pd.read_pickle("s1.pkl.bz2")
3095+
rt
3096+
3097+
.. ipython:: python
3098+
:suppress:
3099+
3100+
import os
3101+
os.remove("data.pkl.compress")
3102+
os.remove("data.pkl.xz")
3103+
os.remove("data.pkl.gz")
3104+
os.remove("s1.pkl.bz2")
30483105
30493106
.. _io.msgpack:
30503107

doc/source/merging.rst

+73
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,79 @@ The ``indicator`` argument will also accept string arguments, in which case the
746746
pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
747747
748748
749+
.. _merging.dtypes:
750+
751+
Merge Dtypes
752+
~~~~~~~~~~~~
753+
754+
.. versionadded:: 0.19.0
755+
756+
Merging will preserve the dtype of the join keys.
757+
758+
.. ipython:: python
759+
760+
left = pd.DataFrame({'key': [1], 'v1': [10]})
761+
left
762+
right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]})
763+
right
764+
765+
We are able to preserve the join keys
766+
767+
.. ipython:: python
768+
769+
pd.merge(left, right, how='outer')
770+
pd.merge(left, right, how='outer').dtypes
771+
772+
Of course if you have missing values that are introduced, then the
773+
resulting dtype will be upcast.
774+
775+
.. ipython:: python
776+
777+
pd.merge(left, right, how='outer', on='key')
778+
pd.merge(left, right, how='outer', on='key').dtypes
779+
780+
.. versionadded:: 0.20.0
781+
782+
Merging will preserve ``category`` dtypes of the merge operands.
783+
784+
The left frame.
785+
786+
.. ipython:: python
787+
788+
X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
789+
X = X.astype('category', categories=['foo', 'bar'])
790+
791+
left = DataFrame({'X': X,
792+
'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
793+
left
794+
left.dtypes
795+
796+
The right frame.
797+
798+
.. ipython:: python
799+
800+
right = DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']),
801+
'Z': [1, 2]})
802+
right
803+
right.dtypes
804+
805+
The merged result
806+
807+
.. ipython:: python
808+
809+
result = pd.merge(left, right, how='outer')
810+
result
811+
result.dtypes
812+
813+
.. note::
814+
815+
The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute.
816+
Otherwise the result will coerce to ``object`` dtype.
817+
818+
.. note::
819+
820+
Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging.
821+
749822
.. _merging.join.index:
750823

751824
Joining on index

0 commit comments

Comments
 (0)