pandas-dev
diff --git a/‎.github/PULL_REQUEST_TEMPLATE.md
+1-1 b/‎.github/PULL_REQUEST_TEMPLATE.md
+1-1
diff --git a/‎.travis.yml
+5-4 b/‎.travis.yml
+5-4
diff --git a/‎asv_bench/benchmarks/reindex.py
+5 b/‎asv_bench/benchmarks/reindex.py
+5
diff --git a/‎ci/install_travis.sh
+29-21 b/‎ci/install_travis.sh
+29-21
diff --git a/‎codecov.yml
+3-1 b/‎codecov.yml
+3-1
diff --git a/‎doc/source/contributing.rst
+1-1 b/‎doc/source/contributing.rst
+1-1
diff --git a/‎doc/source/text.rst
-12 b/‎doc/source/text.rst
-12
diff --git a/‎doc/source/whatsnew/v0.20.0.txt
+46-1 b/‎doc/source/whatsnew/v0.20.0.txt
+46-1
diff --git a/‎pandas/_libs/algos.pxd
+13 b/‎pandas/_libs/algos.pxd
+13
@@ -1,4 +1,4 @@
  - [ ] closes #xxxx
  - [ ] tests added / passed
- - [ ] passes ``git diff upstream/master | flake8 --diff``
+ - [ ] passes ``git diff upstream/master -u -- '*.py' | flake8 --diff``
  - [ ] whatsnew entry
@@ -28,7 +28,11 @@ matrix:
       os: osx
       compiler: clang
       osx_image: xcode6.4
-      cache: ccache
+      cache:
+        ccache: true
+        directories:
+          - $HOME/.cache # cython cache
+          - $HOME/.ccache # compiler cache
       env:
         - PYTHON_VERSION=3.5
         - JOB_NAME: "35_osx"
@@ -175,13 +179,10 @@ matrix:
 before_install:
   - echo "before_install"
   - source ci/travis_process_gbq_encryption.sh
-  - echo $VIRTUAL_ENV
   - export PATH="$HOME/miniconda3/bin:$PATH"
   - df -h
-  - date
   - pwd
   - uname -a
-  - python -V
   - git --version
   - git tag
   - ci/before_install_travis.sh
 
@@ -132,6 +132,9 @@ def setup(self):
         self.K = 10000
         self.key1 = np.random.randint(0, self.K, size=self.N)
         self.df_int = DataFrame({'key1': self.key1})
+        self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
+                                                       dtype=bool)
+                                  for i in range(10)})
 
     def time_frame_drop_dups(self):
         # Timed body: drop rows of self.df that repeat an earlier row on the
         # ('key1', 'key2') column pair. self.df is built outside this hunk.
         self.df.drop_duplicates(['key1', 'key2'])
@@ -154,6 +157,8 @@ def time_series_drop_dups_string(self):
     def time_frame_drop_dups_int(self):
         # Timed body: de-duplicate self.df_int, the single-column ('key1')
         # frame of random integers in [0, self.K) created in setup().
         self.df_int.drop_duplicates()
 
+    def time_frame_drop_dups_bool(self):
+        # Timed body: de-duplicate self.df_bool, the 10-column frame of
+        # random booleans (self.K rows) created in setup().
+        self.df_bool.drop_duplicates()
 
 #----------------------------------------------------------------------
 # blog "pandas escaped the zoo"
 
@@ -1,18 +1,6 @@
 #!/bin/bash
 
-# There are 2 distinct pieces that get zipped and cached
-# - The venv site-packages dir including the installed dependencies
-# - The pandas build artifacts, using the build cache support via
-#   scripts/use_build_cache.py
-#
-# if the user opted in to use the cache and we're on a whitelisted fork
-# - if the server doesn't hold a cached version of venv/pandas build,
-#   do things the slow way, and put the results on the cache server
-#   for the next time.
-# -  if the cache files are available, instal some necessaries via apt
-#    (no compiling needed), then directly goto script and collect 200$.
-#
-
+# edit the locale file if needed
 function edit_init()
 {
     if [ -n "$LOCALE_OVERRIDE" ]; then
@@ -26,20 +14,18 @@ function edit_init()
     fi
 }
 
+echo
 echo "[install_travis]"
 edit_init
 
 home_dir=$(pwd)
-echo "[home_dir: $home_dir]"
-
-if [ "${TRAVIS_OS_NAME}" == "osx" ]; then
-    echo "[install ccache]"
-    time brew install ccache
-fi
+echo
+echo "[home_dir]: $home_dir"
 
 # install miniconda
 MINICONDA_DIR="$HOME/miniconda3"
 
+echo
 echo "[Using clean Miniconda install]"
 
 if [ -d "$MINICONDA_DIR" ]; then
@@ -54,14 +40,17 @@ else
 fi
 time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1
 
+echo
 echo "[show conda]"
 which conda
 
+echo
 echo "[update conda]"
 conda config --set ssl_verify false || exit 1
 conda config --set always_yes true --set changeps1 false || exit 1
 conda update -q conda
 
+echo
 echo "[add channels]"
 # add the pandas channel to take priority
 # to add extra packages
@@ -78,18 +67,28 @@ fi
 conda info -a || exit 1
 
 # set the compiler cache to work
+echo
 if [ "$USE_CACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then
     echo "[Using ccache]"
     export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH
     gcc=$(which gcc)
-    echo "[gcc: $gcc]"
+    echo "[gcc]: $gcc"
     ccache=$(which ccache)
-    echo "[ccache: $ccache]"
+    echo "[ccache]: $ccache"
     export CC='ccache gcc'
+elif [ "$USE_CACHE" ] && [ "${TRAVIS_OS_NAME}" == "osx" ]; then
+    echo "[Using ccache]"
+    time brew install ccache
+    export PATH=/usr/local/opt/ccache/libexec:$PATH
+    gcc=$(which gcc)
+    echo "[gcc]: $gcc"
+    ccache=$(which ccache)
+    echo "[ccache]: $ccache"
 else
     echo "[Not using ccache]"
 fi
 
+echo
 echo "[create env]"
 
 # may have installation instructions for this build
@@ -103,13 +102,15 @@ else
 fi
 
 # build deps
+echo
 echo "[build installs]"
 REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.build"
 if [ -e ${REQ} ]; then
     time conda install -n pandas --file=${REQ} || exit 1
 fi
 
 # may have addtl installation instructions for this build
+echo
 echo "[build addtl installs]"
 REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.build.sh"
 if [ -e ${REQ} ]; then
@@ -129,6 +130,7 @@ if [ "$COVERAGE" ]; then
     pip install coverage pytest-cov
 fi
 
+echo
 if [ "$BUILD_TEST" ]; then
 
     # build & install testing
@@ -148,20 +150,23 @@ else
 fi
 
 # we may have run installations
+echo
 echo "[conda installs]"
 REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.run"
 if [ -e ${REQ} ]; then
     time conda install -n pandas --file=${REQ} || exit 1
 fi
 
 # we may have additional pip installs
+echo
 echo "[pip installs]"
 REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.pip"
 if [ -e ${REQ} ]; then
    pip install -r $REQ
 fi
 
 # may have addtl installation instructions for this build
+echo
 echo "[addtl installs]"
 REQ="ci/requirements-${PYTHON_VERSION}${JOB_TAG}.sh"
 if [ -e ${REQ} ]; then
@@ -173,14 +178,17 @@ if [ -z "$BUILD_TEST" ]; then
 
     # remove any installed pandas package
     # w/o removing anything else
+    echo
     echo "[removing installed pandas]"
     conda remove pandas --force
 
     # install our pandas
+    echo
     echo "[running setup.py develop]"
     python setup.py develop  || exit 1
 
 fi
 
+echo
 echo "[done]"
 exit 0
@@ -1,3 +1,6 @@
+codecov:
+  branch: master
+
 coverage:
   status:
     project:
@@ -6,4 +9,3 @@ coverage:
     patch:
       default:
         target: '50'
-        branches: null
 
@@ -518,7 +518,7 @@ Travis-CI will run the `flake8 <http://pypi.python.org/pypi/flake8>`_ tool
 and report any stylistic errors in your code. Therefore, it is helpful before
 submitting code to run the check yourself on the diff::
 
-   git diff master | flake8 --diff
+   git diff master -u -- '*.py' | flake8 --diff
 
 This command will catch any stylistic errors in your changes specifically, but
 beware that it may not catch all of them. For example, if you delete the only
 
@@ -385,18 +385,6 @@ or match a pattern:
 The distinction between ``match`` and ``contains`` is strictness: ``match``
 relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
 
-.. warning::
-
-   In previous versions, ``match`` was for *extracting* groups,
-   returning a not-so-convenient Series of tuples. The new method ``extract``
-   (described in the previous section) is now preferred.
-
-   This old, deprecated behavior of ``match`` is still the default. As
-   demonstrated above, use the new behavior by setting ``as_indexer=True``.
-   In this mode, ``match`` is analogous to ``contains``, returning a boolean
-   Series. The new behavior will become the default behavior in a future
-   release.
-
 Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
 an extra ``na`` argument so missing values can be considered True or False:
 
 
@@ -291,6 +291,7 @@ Other enhancements
 - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
 - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`14154`)
 - The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`10882`)
+- The ``nrows`` and ``chunksize`` arguments in ``pd.read_csv()`` are supported if both are passed (:issue:`6774`, :issue:`15755`)
 - ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
 - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
 - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)
@@ -470,6 +471,38 @@ New Behavior:
 
    s.map(lambda x: x.hour)
 
+
+.. _whatsnew_0200.api_breaking.index_dt_field:
+
+Accessing datetime fields of Index now returns Index
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The datetime-related attributes (see :ref:`here <timeseries.components>`
+for an overview) of ``DatetimeIndex``, ``PeriodIndex`` and ``TimedeltaIndex`` previously
+returned numpy arrays. They will now return a new ``Index`` object, except
+in the case of a boolean field, where the result will still be a boolean ndarray. (:issue:`15022`)
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [1]: idx = pd.date_range("2015-01-01", periods=5, freq='10H')
+
+    In [2]: idx.hour
+    Out[2]: array([ 0, 10, 20,  6, 16], dtype=int32)
+
+New Behavior:
+
+.. ipython:: python
+
+    idx = pd.date_range("2015-01-01", periods=5, freq='10H')
+    idx.hour
+
+This has the advantage that specific ``Index`` methods are still available on the
+result. On the other hand, this might have backward incompatibilities: e.g.
+compared to numpy arrays, ``Index`` objects are not mutable. To get the original
+ndarray, you can always convert explicitly using ``np.asarray(idx.hour)``.
+
 .. _whatsnew_0200.api_breaking.s3:
 
 S3 File Handling
@@ -728,6 +761,12 @@ Other API Changes
 - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
 - ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
 - ``SparseDataFrame.default_fill_value`` will be 0, previously was ``nan`` in the return from ``pd.get_dummies(..., sparse=True)`` (:issue:`15594`)
+- The default behaviour of ``Series.str.match`` has changed from extracting
+  groups to matching the pattern. The extracting behaviour was deprecated
+  since pandas version 0.13.0 and can be done with the ``Series.str.extract``
+  method (:issue:`5224`). As a consequence, the ``as_indexer`` keyword is
+  ignored (no longer needed to specify the new behaviour) and is deprecated.
+
 
 .. _whatsnew_0200.deprecations:
 
@@ -744,6 +783,7 @@ Deprecations
 - ``Series.sortlevel`` and ``DataFrame.sortlevel`` have been deprecated in favor of ``Series.sort_index`` and ``DataFrame.sort_index`` (:issue:`15099`)
 - importing ``concat`` from ``pandas.tools.merge`` has been deprecated in favor of imports from the ``pandas`` namespace. This should only affect explicit imports (:issue:`15358`)
 - ``Series/DataFrame/Panel.consolidate()`` has been deprecated as a public method. (:issue:`15483`)
+- The ``as_indexer`` keyword of ``Series.str.match()`` has been deprecated (ignored keyword) (:issue:`15257`).
 - The following top-level pandas functions have been deprecated and will be removed in a future version (:issue:`13790`)
   * ``pd.pnow()``, replaced by ``Period.now()``
   * ``pd.Term``, is removed, as it is not applicable to user code. Instead use in-line string expressions in the where clause when searching in HDFStore
@@ -789,6 +829,7 @@ Performance Improvements
 - Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
 - Improved performance when using ``.unstack()`` (:issue:`15503`)
 - Improved performance of merge/join on ``category`` columns (:issue:`10409`)
+- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)
 
 
 .. _whatsnew_0200.bug_fixes:
@@ -813,6 +854,7 @@ Bug Fixes
 - Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
 - Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)
 - Bug in ``pd.read_csv()`` in which a file containing a row with many columns followed by rows with fewer columns would cause a crash (:issue:`14125`)
+- Added checks in ``pd.read_csv()`` ensuring that values for ``nrows`` and ``chunksize`` are valid (:issue:`15767`)
 - Bug in ``pd.tools.hashing.hash_pandas_object()`` in which hashing of categoricals depended on the ordering of categories, instead of just their values. (:issue:`15143`)
 - Bug in ``.groupby(..).resample()`` when passed the ``on=`` kwarg. (:issue:`15021`)
 - Bug in using ``__deepcopy__`` on empty NDFrame objects (:issue:`15370`)
@@ -823,9 +865,10 @@ Bug Fixes
 - Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`)
 - Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`)
 
+- Compat for 32-bit platforms for ``.qcut/cut``; bins will now be ``int64`` dtype (:issue:`14866`)
 
 - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)
-- Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`)
+- Bug in ``.replace()`` may result in incorrect dtypes. (:issue:`12747`, :issue:`15765`)
 
 - Bug in ``.asfreq()``, where frequency was not set for empty ``Series`` (:issue:`14320`)
 
@@ -916,6 +959,8 @@ Bug Fixes
 - Removed use of ``np.finfo()`` during ``import pandas`` to mitigate deadlock on Python GIL misuse (:issue:`14641`)
 
 - Bug in ``DataFrame.to_stata()`` and ``StataWriter`` which caused incorrectly formatted files to be produced for some locales (:issue:`13856`)
+- Bug in ``StataReader`` and ``StataWriter`` which allows invalid encodings (:issue:`15723`)
+
 - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
 - Bug in ``groupby.agg()`` incorrectly localizing timezone on ``datetime`` (:issue:`15426`, :issue:`10668`, :issue:`13046`)
 
 
@@ -0,0 +1,13 @@
+from util cimport numeric
+from numpy cimport float64_t, double_t
+
+cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil
+
+cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil:
+    # Exchange, in place, the values that ``a`` and ``b`` point to.
+    # Always returns 0.
+    cdef numeric t
+
+    # cython doesn't allow pointer dereference so use array syntax
+    t = a[0]
+    a[0] = b[0]
+    b[0] = t
+    return 0