Merge branch 'master' of https://github.com/pydata/pandas into fix-column-dtype-mixing

jbuyl · jbuyl · commit b80d69bb125c · 2015-08-05T12:33:36.000+02:00
Conflicts:
	doc/source/whatsnew/v0.17.0.txt
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -904,6 +904,8 @@ Reshaping, sorting, transposing
    DataFrame.sort
    DataFrame.sort_index
    DataFrame.sortlevel
+   DataFrame.nlargest
+   DataFrame.nsmallest
    DataFrame.swaplevel
    DataFrame.stack
    DataFrame.unstack
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1497,6 +1497,20 @@ faster than sorting the entire Series and calling ``head(n)`` on the result.
    s.nsmallest(3)
    s.nlargest(3)
 
+.. versionadded:: 0.17.0
+
+``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods.
+
+.. ipython:: python
+
+   df = DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
+                   'b': list('abdceff'),
+                   'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
+   df.nlargest(3, 'a')
+   df.nlargest(5, ['a', 'c'])
+   df.nsmallest(3, 'a')
+   df.nsmallest(5, ['a', 'c'])
+
 
 .. _basics.multi-index_sorting:
 
diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -267,11 +267,11 @@ Optional Dependencies
   installation.
 * Google's `python-gflags <http://code.google.com/p/python-gflags/>`__
   and `google-api-python-client <http://github.com/google/google-api-python-client>`__
-   * Needed for :mod:`~pandas.io.gbq`
+  * Needed for :mod:`~pandas.io.gbq`
 * `setuptools <https://pypi.python.org/pypi/setuptools/>`__
-   * Needed for :mod:`~pandas.io.gbq` (specifically, it utilizes `pkg_resources`)
+  * Needed for :mod:`~pandas.io.gbq` (specifically, it utilizes `pkg_resources`)
 * `httplib2 <http://pypi.python.org/pypi/httplib2>`__
-   * Needed for :mod:`~pandas.io.gbq`
+  * Needed for :mod:`~pandas.io.gbq`
 * One of the following combinations of libraries is needed to use the
   top-level :func:`~pandas.io.html.read_html` function:
 
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -3610,7 +3610,7 @@ below and the SQLAlchemy `documentation <http://docs.sqlalchemy.org/en/rel_0_9/c
 
 If you want to manage your own connections you can pass one of those instead:
 
-.. ipython:: python
+.. code-block:: python
 
    with engine.connect() as conn, conn.begin():
        data = pd.read_sql_table('data', conn)
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
@@ -208,21 +208,13 @@ Pass ``errors='coerce'`` to convert invalid data to ``NaT`` (not a time):
    :okexcept:
 
    # this is the default, raise when unparseable
-   to_datetime(['2009-07-31', 'asd'], errors='raise')
+   to_datetime(['2009/07/31', 'asd'], errors='raise')
 
    # return the original input when unparseable
-   to_datetime(['2009-07-31', 'asd'], errors='ignore')
+   to_datetime(['2009/07/31', 'asd'], errors='ignore')
 
    # return NaT for input when unparseable
-   to_datetime(['2009-07-31', 'asd'], errors='coerce')
-
-
-Take care, ``to_datetime`` may not act as you expect on mixed data:
-
-.. ipython:: python
-   :okexcept:
-
-   to_datetime([1, '1'])
+   to_datetime(['2009/07/31', 'asd'], errors='coerce')
 
 Epoch Timestamps
 ~~~~~~~~~~~~~~~~
diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst
@@ -1649,6 +1649,7 @@ values, the resulting grid has two columns and two rows. A histogram is
 displayed for each cell of the grid.
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
@@ -1680,6 +1681,7 @@ Example below is the same as previous except the plot is set to kernel density
 estimation. A ``seaborn`` example is included beneath.
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
@@ -1706,6 +1708,7 @@ The plot below shows that it is possible to have two or more plots for the same
 data displayed on the same Trellis grid cell.
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
@@ -1745,6 +1748,7 @@ Below is a similar plot but with 2D kernel density estimation plot superimposed,
 followed by a ``seaborn`` equivalent:
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
@@ -1774,6 +1778,7 @@ only uses 'sex' attribute. If the second grouping attribute is not specified,
 the plots will be arranged in a column.
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
@@ -1792,6 +1797,7 @@ the plots will be arranged in a column.
 If the first grouping attribute is not specified the plots will be arranged in a row.
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
@@ -1816,6 +1822,7 @@ scale objects to specify these mappings. The list of scale classes is
 given below with initialization arguments for quick reference.
 
 .. ipython:: python
+   :okwarning:
 
    plt.figure()
 
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -13,13 +13,13 @@ users upgrade to this version.
 
 Highlights include:
 
-  - Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
-  - The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
-    previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
-  - The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
-    if they are all ``NaN``, see :ref:`here <whatsnew_0170.api_breaking.hdf_dropna>`
-  - Support for ``Series.dt.strftime`` to generate formatted strings for datetime-likes, see :ref:`here <whatsnew_0170.strftime>`
-  - Development installed versions of pandas will now have ``PEP440`` compliant version strings (:issue:`9518`)
+- Release the Global Interpreter Lock (GIL) on some cython operations, see :ref:`here <whatsnew_0170.gil>`
+- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats,
+  previously this would return the original input, see :ref:`here <whatsnew_0170.api_breaking.to_datetime>`
+- The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even
+  if they are all ``NaN``, see :ref:`here <whatsnew_0170.api_breaking.hdf_dropna>`
+- Support for ``Series.dt.strftime`` to generate formatted strings for datetime-likes, see :ref:`here <whatsnew_0170.strftime>`
+- Development installed versions of pandas will now have ``PEP440`` compliant version strings (:issue:`9518`)
 
 Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsnew_0170.deprecations>` before updating.
 
@@ -32,6 +32,7 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
 New features
 ~~~~~~~~~~~~
 
+- ``DataFrame`` has the ``nlargest`` and ``nsmallest`` methods (:issue:`10393`)
 - SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
 - Enable writing complex values to HDF stores when using table format (:issue:`10447`)
 - Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
@@ -448,6 +449,7 @@ from ``7``.
 
 .. ipython:: python
   :suppress:
+
   pd.set_option('display.precision', 6)
 
 
@@ -457,6 +459,8 @@ Other API Changes
 ^^^^^^^^^^^^^^^^^
 
 - Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
+- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a
+  Series with a ``CategoricalIndex`` (:issue:`10704`)
 - Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
 - Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
 - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
@@ -479,9 +483,9 @@ Other API Changes
 - ``groupby`` using ``Categorical`` follows the same rule as ``Categorical.unique`` described above  (:issue:`10508`)
 - ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)
 
-   ===============================     ==============================================================
+   ===============================     ===============================================================
    Behavior                            Methods
-   ===============================     ==============================================================
+   ===============================     ===============================================================
    ``return np.nan``                   ``weekday``, ``isoweekday``
    ``return NaT``                      ``date``, ``now``, ``replace``, ``to_datetime``, ``today``
    ``return np.datetime64('NaT')``     ``to_datetime64`` (unchanged)
@@ -544,6 +548,8 @@ Performance Improvements
 Bug Fixes
 ~~~~~~~~~
 
+
+- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
 - Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
 - Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
 - Bug in ``Index.drop_duplicates`` dropping name(s) (:issue:`10115`)
@@ -606,4 +612,5 @@ Bug Fixes
 - Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`)
 
 - Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
-- Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10739`)
+- Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10739`)
+- Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead of the original dtype (:issue:`9431`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -36,7 +36,7 @@ def match(to_match, values, na_sentinel=-1):
         values = np.array(values, dtype='O')
 
     f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
-    result = _hashtable_algo(f, values.dtype)
+    result = _hashtable_algo(f, values.dtype, np.int64)
 
     if na_sentinel != -1:
 
@@ -66,14 +66,20 @@ def unique(values):
     return _hashtable_algo(f, values.dtype)
 
 
-def _hashtable_algo(f, dtype):
+def _hashtable_algo(f, dtype, return_dtype=None):
     """
     f(HashTable, type_caster) -> result
     """
     if com.is_float_dtype(dtype):
         return f(htable.Float64HashTable, com._ensure_float64)
     elif com.is_integer_dtype(dtype):
         return f(htable.Int64HashTable, com._ensure_int64)
+    elif com.is_datetime64_dtype(dtype):
+        return_dtype = return_dtype or 'M8[ns]'
+        return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype)
+    elif com.is_timedelta64_dtype(dtype):
+        return_dtype = return_dtype or 'm8[ns]'
+        return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype)
     else:
         return f(htable.PyObjectHashTable, com._ensure_object)
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1027,6 +1027,7 @@ def value_counts(self, dropna=True):
         """
         import pandas.hashtable as htable
         from pandas.core.series import Series
+        from pandas.core.index import CategoricalIndex
 
         cat = self.dropna() if dropna else self
         keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
@@ -1036,10 +1037,12 @@ def value_counts(self, dropna=True):
         if not dropna and -1 in keys:
             ix = np.append(ix, -1)
         result = result.reindex(ix, fill_value=0)
-        result.index = (np.append(cat.categories, np.nan)
+        index = (np.append(cat.categories, np.nan)
             if not dropna and -1 in keys
             else cat.categories)
 
+        result.index = CategoricalIndex(index, self.categories, self.ordered)
+
         return result
 
     def get_values(self):
diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -1037,7 +1037,7 @@ def _column_header():
             self.write_tr(col_row, indent, self.indent_delta, header=True,
                           align=align)
 
-        if self.fmt.has_index_names:
+        if self.fmt.has_index_names and self.fmt.index:
             row = [
                 x if x is not None else '' for x in self.frame.index.names
             ] + [''] * min(len(self.columns), self.max_cols)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3127,6 +3127,79 @@ def sortlevel(self, level=0, axis=0, ascending=True,
         else:
             return self._constructor(new_data).__finalize__(self)
 
+    def _nsorted(self, columns, n, method, take_last):
+        if not com.is_list_like(columns):
+            columns = [columns]
+        columns = list(columns)
+        ser = getattr(self[columns[0]], method)(n, take_last=take_last)
+        ascending = dict(nlargest=False, nsmallest=True)[method]
+        return self.loc[ser.index].sort(columns, ascending=ascending,
+                                        kind='mergesort')
+
+    def nlargest(self, n, columns, take_last=False):
+        """Get the rows of a DataFrame sorted by the `n` largest
+        values of `columns`.
+
+        .. versionadded:: 0.17.0
+
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve
+        columns : list or str
+            Column name or names to order by
+        take_last : bool, optional
+            Where there are duplicate values, take the last duplicate
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
+        ...                 'b': list('abdce'),
+        ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
+        >>> df.nlargest(3, 'a')
+            a  b   c
+        3  11  c   3
+        1  10  b   2
+        2   8  d NaN
+        """
+        return self._nsorted(columns, n, 'nlargest', take_last)
+
+    def nsmallest(self, n, columns, take_last=False):
+        """Get the rows of a DataFrame sorted by the `n` smallest
+        values of `columns`.
+
+        .. versionadded:: 0.17.0
+
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve
+        columns : list or str
+            Column name or names to order by
+        take_last : bool, optional
+            Where there are duplicate values, take the last duplicate
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
+        ...                 'b': list('abdce'),
+        ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
+        >>> df.nsmallest(3, 'a')
+           a  b   c
+        4 -1  e   4
+        0  1  a   1
+        2  8  d NaN
+        """
+        return self._nsorted(columns, n, 'nsmallest', take_last)
+
     def swaplevel(self, i, j, axis=0):
         """
         Swap levels i and j in a MultiIndex on a particular axis
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -193,7 +193,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
 
     See Also
     --------
-    match : analagous, but stricter, relying on re.match instead of re.search
+    match : analogous, but stricter, relying on re.match instead of re.search
 
     """
     if regex:
diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py
@@ -532,14 +532,30 @@ class TestMsgpack():
     http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class
     """
     def setUp(self):
-        from pandas.io.tests.generate_legacy_storage_files import create_msgpack_data
+        from pandas.io.tests.generate_legacy_storage_files import (
+            create_msgpack_data, create_data)
         self.data = create_msgpack_data()
+        self.all_data = create_data()
         self.path = u('__%s__.msgpack' % tm.rands(10))
+        self.minimum_structure = {'series': ['float', 'int', 'mixed', 'ts', 'mi', 'dup'],
+                                  'frame': ['float', 'int', 'mixed', 'mi'],
+                                  'panel': ['float'],
+                                  'index': ['int', 'date', 'period'],
+                                  'mi': ['reg2']}
+
+    def check_min_structure(self, data):
+        for typ, v in self.minimum_structure.items():
+            assert typ in data, '"{0}" not found in unpacked data'.format(typ)
+            for kind in v:
+                assert kind in data[typ], '"{0}" not found in data["{1}"]'.format(kind, typ)
 
     def compare(self, vf):
         data = read_msgpack(vf)
+        self.check_min_structure(data)
         for typ, dv in data.items():
+            assert typ in self.all_data, 'unpacked data contains extra key "{0}"'.format(typ)
             for dt, result in dv.items():
+                assert dt in self.all_data[typ], 'data["{0}"] contains extra key "{1}"'.format(typ, dt)
                 try:
                     expected = self.data[typ][dt]
                 except KeyError:
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py