Skip to content

Commit c4e7a2a

Browse files
committed
Merge remote-tracking branch 'upstream/master' into KeyError
2 parents 0947982 + 2466ecb commit c4e7a2a

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

46 files changed

+1791
-1474
lines changed

.travis.yml

+3-7
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,9 @@ cache:
1414

1515
env:
1616
global:
17-
# scatterci API key
18-
#- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ="
19-
# ironcache API key
20-
#- secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA="
21-
#- secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw="
22-
# pandas-docs-bot GH
23-
- secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ="
17+
18+
# pandas-docs-travis GH
19+
- secure: "UJK7kUtkcnV9PFP4IBXAvgmRQKdwARlfqF4UZQ5tBwrpnD1a3n7FLBijcuXQ3jkvwpEc/FZB9RJDXmsqYXJPvq3BC++2Cv2tFDvKr/c+y8KffszAyVk47jKEHMNmGgauwaNMggsE/rH8YHe4so9LsJHTRbzmLo8lXPNTldoIu5s="
2420

2521
git:
2622
# for cloning

asv_bench/benchmarks/frame_methods.py

+11
Original file line numberDiff line numberDiff line change
@@ -1012,3 +1012,14 @@ def setup(self):
10121012

10131013
def time_frame_quantile_axis1(self):
10141014
self.df.quantile([0.1, 0.5], axis=1)
1015+
1016+
1017+
class frame_nlargest(object):
1018+
goal_time = 0.2
1019+
1020+
def setup(self):
1021+
self.df = DataFrame(np.random.randn(1000, 3),
1022+
columns=list('ABC'))
1023+
1024+
def time_frame_nlargest(self):
1025+
self.df.nlargest(100, 'A')

asv_bench/benchmarks/replace.py

+24
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,30 @@ def time_replace_large_dict(self):
3232
self.s.replace(self.to_rep, inplace=True)
3333

3434

35+
class replace_convert(object):
36+
goal_time = 0.5
37+
38+
def setup(self):
39+
self.n = (10 ** 3)
40+
self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n)))
41+
self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n)))
42+
self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
43+
self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)),
44+
'B': np.random.randint(self.n, size=(10 ** 3))})
45+
46+
def time_replace_series_timestamp(self):
47+
self.s.replace(self.to_ts)
48+
49+
def time_replace_series_timedelta(self):
50+
self.s.replace(self.to_td)
51+
52+
def time_replace_frame_timestamp(self):
53+
self.df.replace(self.to_ts)
54+
55+
def time_replace_frame_timedelta(self):
56+
self.df.replace(self.to_td)
57+
58+
3559
class replace_replacena(object):
3660
goal_time = 0.2
3761

ci/build_docs.sh

+3-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ if [ x"$DOC_BUILD" != x"" ]; then
4343
cd /tmp/doc/build/html
4444
git config --global user.email "[email protected]"
4545
git config --global user.name "pandas-docs-bot"
46+
git config --global credential.helper cache
4647

48+
# create the repo
4749
git init
4850
touch README
4951
git add README
@@ -53,7 +55,7 @@ if [ x"$DOC_BUILD" != x"" ]; then
5355
touch .nojekyll
5456
git add --all .
5557
git commit -m "Version" --allow-empty
56-
git remote add origin https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis
58+
git remote add origin "https://$GH_TOKEN@github.com/pandas-docs/pandas-docs-travis"
5759
git push origin gh-pages -f
5860
fi
5961

ci/lint.sh

+12
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@ if [ "$LINT" ]; then
3535
done
3636
echo "Linting *.pxi.in DONE"
3737

38+
# readability/casting: Warnings about C casting instead of C++ casting
39+
# runtime/int: Warnings about using C number types instead of C++ ones
40+
# build/include_subdir: Warnings about prefacing included header files with directory
41+
pip install cpplint
42+
43+
echo "Linting *.c and *.h"
44+
cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/parser
45+
if [ $? -ne "0" ]; then
46+
RET=1
47+
fi
48+
echo "Linting *.c and *.h DONE"
49+
3850
echo "Check for invalid testing"
3951
grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas
4052
if [ $? = "0" ]; then

doc/source/api.rst

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Flat File
2727
read_table
2828
read_csv
2929
read_fwf
30+
read_msgpack
3031

3132
Clipboard
3233
~~~~~~~~~
@@ -691,6 +692,7 @@ Serialization / IO / Conversion
691692
Series.to_pickle
692693
Series.to_csv
693694
Series.to_dict
695+
Series.to_excel
694696
Series.to_frame
695697
Series.to_xarray
696698
Series.to_hdf

doc/source/cookbook.rst

+2-4
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,8 @@ Splitting
107107
df = pd.DataFrame(
108108
{'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df
109109
110-
dflow = df[df.AAA <= 5]
111-
dfhigh = df[df.AAA > 5]
112-
113-
dflow; dfhigh
110+
dflow = df[df.AAA <= 5]; dflow
111+
dfhigh = df[df.AAA > 5]; dfhigh
114112
115113
Building Criteria
116114
*****************

doc/source/io.rst

+44-6
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,23 @@ index_col : int or sequence or ``False``, default ``None``
126126
MultiIndex is used. If you have a malformed file with delimiters at the end of
127127
each line, you might consider ``index_col=False`` to force pandas to *not* use
128128
the first column as the index (row names).
129-
usecols : array-like, default ``None``
130-
Return a subset of the columns. All elements in this array must either
129+
usecols : array-like or callable, default ``None``
130+
Return a subset of the columns. If array-like, all elements must either
131131
be positional (i.e. integer indices into the document columns) or strings
132132
that correspond to column names provided either by the user in `names` or
133-
inferred from the document header row(s). For example, a valid `usecols`
134-
parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter
135-
results in much faster parsing time and lower memory usage.
133+
inferred from the document header row(s). For example, a valid array-like
134+
`usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz'].
135+
136+
If callable, the callable function will be evaluated against the column names,
137+
returning names where the callable function evaluates to True:
138+
139+
.. ipython:: python
140+
141+
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
142+
pd.read_csv(StringIO(data))
143+
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])
144+
145+
Using this parameter results in much faster parsing time and lower memory usage.
136146
as_recarray : boolean, default ``False``
137147
DEPRECATED: this argument will be removed in a future version. Please call
138148
``pd.read_csv(...).to_records()`` instead.
@@ -617,14 +627,17 @@ Filtering columns (``usecols``)
617627
+++++++++++++++++++++++++++++++
618628

619629
The ``usecols`` argument allows you to select any subset of the columns in a
620-
file, either using the column names or position numbers:
630+
file, either using the column names, position numbers or a callable:
631+
632+
.. versionadded:: 0.20.0 support for callable `usecols` arguments
621633

622634
.. ipython:: python
623635
624636
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
625637
pd.read_csv(StringIO(data))
626638
pd.read_csv(StringIO(data), usecols=['b', 'd'])
627639
pd.read_csv(StringIO(data), usecols=[0, 2, 3])
640+
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])
628641
629642
Comments and Empty Lines
630643
''''''''''''''''''''''''
@@ -1268,11 +1281,22 @@ is whitespace).
12681281
df = pd.read_fwf('bar.csv', header=None, index_col=0)
12691282
df
12701283
1284+
.. versionadded:: 0.20.0
1285+
1286+
``read_fwf`` supports the ``dtype`` parameter for specifying the types of
1287+
parsed columns to be different from the inferred type.
1288+
1289+
.. ipython:: python
1290+
1291+
pd.read_fwf('bar.csv', header=None, index_col=0).dtypes
1292+
pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes
1293+
12711294
.. ipython:: python
12721295
:suppress:
12731296
12741297
os.remove('bar.csv')
12751298
1299+
12761300
Indexes
12771301
'''''''
12781302

@@ -2527,6 +2551,20 @@ missing data to recover integer dtype:
25272551
cfun = lambda x: int(x) if x else -1
25282552
read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})
25292553
2554+
dtype Specifications
2555+
++++++++++++++++++++
2556+
2557+
.. versionadded:: 0.20.0
2558+
2559+
As an alternative to converters, the type for an entire column can
2560+
be specified using the `dtype` keyword, which takes a dictionary
2561+
mapping column names to types. To interpret data with
2562+
no type inference, use the type ``str`` or ``object``.
2563+
2564+
.. code-block:: python
2565+
2566+
read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str})
2567+
25302568
.. _io.excel_writer:
25312569
25322570
Writing Excel Files

doc/source/whatsnew/v0.19.2.txt

+8-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Highlights include:
2121
Performance Improvements
2222
~~~~~~~~~~~~~~~~~~~~~~~~
2323

24+
- Improved performance of ``.replace()`` (:issue:`12745`)
2425

2526
.. _whatsnew_0192.bug_fixes:
2627

@@ -32,9 +33,10 @@ Bug Fixes
3233
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
3334
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
3435
- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally.
36+
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`)
3537

3638

37-
39+
- Bug in ``.groupby(..., sort=True)`` of a non-lexsorted MultiIndex when grouping with multiple levels (:issue:`14776`)
3840

3941

4042

@@ -56,6 +58,10 @@ Bug Fixes
5658

5759

5860

61+
- Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`)
62+
- Bug in ``HDFStore.append()`` when writing a ``Series`` and passing a ``min_itemsize`` argument containing a value for the ``index`` (:issue:`11412`)
63+
- Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`)
64+
- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`)
5965

6066

6167

@@ -65,6 +71,7 @@ Bug Fixes
6571

6672

6773
- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
74+
- Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`)
6875

6976

7077

doc/source/whatsnew/v0.20.0.txt

+15-2
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ New features
2222
~~~~~~~~~~~~
2323

2424

25-
``read_csv`` supports ``dtype`` keyword for python engine
26-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
25+
``dtype`` keyword for data io
26+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2727

2828
The ``dtype`` keyword argument in the :func:`read_csv` function for specifying the types of parsed columns
2929
is now supported with the ``'python'`` engine (:issue:`14295`). See the :ref:`io docs <io.dtypes>` for more information.
@@ -34,17 +34,30 @@ The ``dtype`` keyword argument in the :func:`read_csv` function for specifying t
3434
pd.read_csv(StringIO(data), engine='python').dtypes
3535
pd.read_csv(StringIO(data), engine='python', dtype={'a':'float64', 'b':'object'}).dtypes
3636

37+
The ``dtype`` keyword argument is also now supported in the :func:`read_fwf` function for parsing
38+
fixed-width text files, and :func:`read_excel` for parsing Excel files.
39+
40+
.. ipython:: python
41+
42+
data = "a b\n1 2\n3 4"
43+
pd.read_fwf(StringIO(data)).dtypes
44+
pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes
45+
3746
.. _whatsnew_0200.enhancements.other:
3847

3948
Other enhancements
4049
^^^^^^^^^^^^^^^^^^
50+
- ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
4151

4252
- ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
4353

4454
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
4555
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
4656
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`
4757

58+
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`)
59+
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
60+
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
4861

4962
.. _whatsnew_0200.api_breaking:
5063

pandas/core/algorithms.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method):
684684
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
685685

686686

687-
def select_n(series, n, keep, method):
688-
"""Implement n largest/smallest.
687+
def select_n_series(series, n, keep, method):
688+
"""Implement n largest/smallest for pandas Series
689689
690690
Parameters
691691
----------
692+
series : pandas.Series object
692693
n : int
693694
keep : {'first', 'last'}, default 'first'
694695
method : str, {'nlargest', 'nsmallest'}
@@ -717,6 +718,31 @@ def select_n(series, n, keep, method):
717718
return dropped.iloc[inds]
718719

719720

721+
def select_n_frame(frame, columns, n, method, keep):
722+
"""Implement n largest/smallest for pandas DataFrame
723+
724+
Parameters
725+
----------
726+
frame : pandas.DataFrame object
727+
columns : list or str
728+
n : int
729+
keep : {'first', 'last'}, default 'first'
730+
method : str, {'nlargest', 'nsmallest'}
731+
732+
Returns
733+
-------
734+
nordered : DataFrame
735+
"""
736+
from pandas.core.series import Series
737+
if not is_list_like(columns):
738+
columns = [columns]
739+
columns = list(columns)
740+
ser = getattr(frame[columns[0]], method)(n, keep=keep)
741+
if isinstance(ser, Series):
742+
ser = ser.to_frame()
743+
return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]
744+
745+
720746
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
721747
ns, = np.nonzero(arr <= kth_val)
722748
inds = ns[arr[ns].argsort(kind='mergesort')][:n]

0 commit comments

Comments
 (0)