Issue 31175 #33510

Closed · wants to merge 3 commits
1,695 changes: 1,695 additions & 0 deletions 0001-standardize-term-pandas-in-documentation.patch

Large diffs are not rendered by default.

165 changes: 165 additions & 0 deletions 0002-add-a-new-feature-sample-into-groupby.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
From c0be8032bffd42b194b99f1538c6040f0f2b354f Mon Sep 17 00:00:00 2001
From: ziyi zhang <[email protected]>
Date: Sun, 12 Apr 2020 15:31:43 -0700
Subject: [PATCH 2/2] add a new feature sample() into groupby

---
pandas/core/groupby/groupby.py | 142 +++++++++++++++++++++++++++++++++
1 file changed, 142 insertions(+)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 873f24b96..41b48055e 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1436,6 +1436,148 @@ class GroupBy(_GroupBy[FrameOrSeries]):
return result.T
return result.unstack()

+
+    def sample(self, size=None, frac=None, replace=False, weights=None):
+        """
+        Return a random sample of groups as a dictionary.
+
+        Parameters
+        ----------
+        size : int, optional
+            Number of groups to return. Cannot be used with `frac`.
+            Default = 1 if `frac` = None.
+        frac : float, optional
+            Fraction of groups to return. Cannot be used with `size`.
+        replace : bool, optional
+            Sample with or without replacement. Default = False.
+        weights : list of float, optional
+            Default `None` results in equal probability weighting.
+            If weights do not sum to 1, they will be normalized to sum to 1.
+            Missing values in the weights will be treated as zero.
+            `inf` and `-inf` values are not allowed.
+
+        Returns
+        -------
+        dict
+            A dictionary mapping the sampled group keys to their row indexes.
+
+        Examples
+        --------
+        Generate an example ``DataFrame``:
+
+        >>> df = pd.DataFrame([['Male', 1], ['Female', 3], ['Female', 2], ['Other', 1]],
+        ...                   columns=['gender', 'feature'])
+        >>> df
+           gender  feature
+        0    Male        1
+        1  Female        3
+        2  Female        2
+        3   Other        1
+
+        >>> grouped_df = df.groupby('gender')
+
+        Sample 2 random groups:
+
+        >>> grouped_df.sample(size=2)
+        {'Female': Int64Index([1, 2], dtype='int64'), 'Male': Int64Index([0], dtype='int64')}
+
+        Sample 2 random groups with the given weights:
+
+        >>> grouped_df.sample(size=2, weights=[0.1, 0.1, 0.2])
+        {'Male': Int64Index([0], dtype='int64'), 'Other': Int64Index([3], dtype='int64')}
+
+        Sample a random 40% of the groups, with replacement:
+
+        >>> grouped_df.sample(frac=0.4, replace=True)
+        {'Male': Int64Index([0], dtype='int64')}
+        """
+        import warnings
+
+        groups_dictionary = self.groups
+
+        # Resolve `size` and `frac` into a single sample size.
+        if size is None and frac is None:
+            # Neither given: default to a single group.
+            final_size = 1
+        elif size is None and frac is not None:
+            final_size = int(round(frac * len(groups_dictionary)))
+        elif size is not None and frac is None:
+            # Non-integer size error:
+            if size % 1 != 0:
+                raise ValueError("Only integers accepted as `size` value")
+            final_size = size
+        else:
+            # Both `size` and `frac` given: error.
+            raise ValueError("Please enter a value for `frac` OR `size`, not both")
+
+        if size is not None:
+            # Negative size error:
+            if size < 0:
+                raise ValueError("A negative sample size was requested. Please provide a positive value.")
+            # Overflow error:
+            if size > len(groups_dictionary):
+                raise ValueError("The requested sample size exceeds the number of groups. Please provide a value of `size` in range.")
+
+        if frac is not None and (frac < 0 or frac > 1):
+            raise ValueError("Only floats between 0 and 1 accepted as `frac` value")
+
+        # Edge-case warnings:
+        if size == 0 or frac == 0:
+            warnings.warn("Random sample is empty: the requested sample size is 0")
+        if size == len(groups_dictionary) or frac == 1:
+            warnings.warn("Random sample equals the given groupby: the requested size is the same as the number of groups")
+
+        if weights is not None:
+            # `weights` is a list with one entry per group.
+            if len(weights) != len(groups_dictionary.keys()):
+                raise ValueError("Weights and the groups to be sampled must be the same length")
+            for w in weights:
+                if np.isinf(w):
+                    raise ValueError("Weight vector may not include `inf` values")
+                if w < 0:
+                    raise ValueError("Weight vector may not include negative values")
+
+            # Treat missing values as zero:
+            weights = [0 if np.isnan(w) else w for w in weights]
+
+            # Renormalize if the weights don't sum to 1:
+            if sum(weights) != 1:
+                if sum(weights) == 0:
+                    raise ValueError("Invalid weights: weights sum to zero")
+                weights = [w / sum(weights) for w in weights]
+
+        # Random sampling over the group keys:
+        dictionary_keys = list(groups_dictionary.keys())
+        sample = np.random.choice(len(dictionary_keys), size=final_size, replace=replace, p=weights)
+        sample_keys = [dictionary_keys[i] for i in sample]
+        sample_dictionary = {key: value for key, value in groups_dictionary.items() if key in sample_keys}
+
+        return sample_dictionary
+
+
def resample(self, rule, *args, **kwargs):
"""
Provide resampling when using a TimeGrouper.
--
2.19.0
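The patch above can be sketched outside of pandas internals as a standalone function. The sketch below mirrors the same technique — sampling whole group keys with `numpy.random.choice` over a `df.groupby(...).groups` dictionary, after normalizing the weight vector — with `sample_groups` and its `seed` parameter being names introduced here for illustration, not part of the patch:

```python
import numpy as np
import pandas as pd


def sample_groups(grouped, size=None, frac=None, replace=False, weights=None, seed=None):
    """Return a dict mapping sampled group keys to their row indexes.

    Standalone sketch of the whole-group sampling approach in the patch above.
    """
    groups = grouped.groups  # dict: group key -> index of row labels
    if size is not None and frac is not None:
        raise ValueError("Please enter a value for `frac` OR `size`, not both")
    if size is None:
        # Default to 1 group when neither `size` nor `frac` is given.
        final_size = 1 if frac is None else int(round(frac * len(groups)))
    else:
        final_size = size
    if weights is not None:
        total = sum(weights)
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        weights = [w / total for w in weights]  # normalize to sum to 1
    rng = np.random.default_rng(seed)
    keys = list(groups.keys())
    chosen = rng.choice(len(keys), size=final_size, replace=replace, p=weights)
    return {keys[i]: groups[keys[i]] for i in chosen}


df = pd.DataFrame(
    [['Male', 1], ['Female', 3], ['Female', 2], ['Other', 1]],
    columns=['gender', 'feature'],
)
picked = sample_groups(df.groupby('gender'), size=2, seed=0)
print(picked)  # dict of 2 randomly chosen group keys and their row indexes
```

Note that the `GroupBy.sample` API eventually released in pandas 1.1 samples rows *within* each group rather than whole groups, so this patch's whole-group semantics differ from the shipped method.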

2 changes: 1 addition & 1 deletion doc/source/development/code_style.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pandas code style guide
.. contents:: Table of contents:
:local:

*pandas* follows the `PEP8 <https://www.python.org/dev/peps/pep-0008/>`_
pandas follows the `PEP8 <https://www.python.org/dev/peps/pep-0008/>`_
standard and uses `Black <https://black.readthedocs.io/en/stable/>`_
and `Flake8 <https://flake8.pycqa.org/en/latest/>`_ to ensure a
consistent code format throughout the project. For details see the
Expand Down
8 changes: 4 additions & 4 deletions doc/source/development/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ want to clone your fork to your machine::
git remote add upstream https://github.com/pandas-dev/pandas.git

This creates the directory `pandas-yourname` and connects your repository to
the upstream (main project) *pandas* repository.
the upstream (main project) pandas repository.

.. _contributing.dev_env:

Expand All @@ -150,7 +150,7 @@ Using a Docker container
~~~~~~~~~~~~~~~~~~~~~~~~

Instead of manually setting up a development environment, you can use Docker to
automatically create the environment with just several commands. Pandas provides a `DockerFile`
automatically create the environment with just several commands. pandas provides a `DockerFile`
in the root directory to build a Docker image with a full pandas development environment.

Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code,
Expand All @@ -162,7 +162,7 @@ See https://code.visualstudio.com/docs/remote/containers for details.
Installing a C compiler
~~~~~~~~~~~~~~~~~~~~~~~

Pandas uses C extensions (mostly written using Cython) to speed up certain
pandas uses C extensions (mostly written using Cython) to speed up certain
operations. To install pandas from source, you need to compile these C
extensions, which means you need a C compiler. This process depends on which
platform you're using.
Expand Down Expand Up @@ -1157,7 +1157,7 @@ This test shows off several useful features of Hypothesis, as well as
demonstrating a good use-case: checking properties that should hold over
a large or complicated domain of inputs.

To keep the Pandas test suite running quickly, parametrized tests are
To keep the pandas test suite running quickly, parametrized tests are
preferred if the inputs or logic are simple, with Hypothesis tests reserved
for cases with complex logic or where there are too many combinations of
options or subtle interactions to test (or think of!) all of them.
Expand Down
2 changes: 1 addition & 1 deletion doc/source/development/contributing_docstring.rst
Original file line number Diff line number Diff line change
Expand Up @@ -998,4 +998,4 @@ mapping function names to docstrings. Wherever possible, we prefer using

See ``pandas.core.generic.NDFrame.fillna`` for an example template, and
``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna``
for the filled versions.
for the filled versions.
2 changes: 1 addition & 1 deletion doc/source/development/developer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,4 @@ As an example of fully-formed metadata:
'creator': {
'library': 'pyarrow',
'version': '0.13.0'
}}
}}
2 changes: 1 addition & 1 deletion doc/source/development/extending.rst
Original file line number Diff line number Diff line change
Expand Up @@ -501,4 +501,4 @@ registers the default "matplotlib" backend as follows.


More information on how to implement a third-party plotting backend can be found at
https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1.
2 changes: 1 addition & 1 deletion doc/source/development/maintaining.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,4 @@ The current list of core-team members is at
https://github.com/pandas-dev/pandas-governance/blob/master/people.md

.. _governance documents: https://github.com/pandas-dev/pandas-governance
.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization
.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization
28 changes: 14 additions & 14 deletions doc/source/ecosystem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ With Altair, you can spend more time understanding your data and its
meaning. Altair's API is simple, friendly and consistent and built on
top of the powerful Vega-Lite JSON specification. This elegant
simplicity produces beautiful and effective visualizations with a
minimal amount of code. Altair works with Pandas DataFrames.
minimal amount of code. Altair works with pandas DataFrames.


`Bokeh <https://bokeh.pydata.org>`__
Expand All @@ -104,8 +104,8 @@ the latest web technologies. Its goal is to provide elegant, concise constructio
graphics in the style of Protovis/D3, while delivering high-performance interactivity over
large data to thin clients.

`Pandas-Bokeh <https://github.com/PatrikHlobil/Pandas-Bokeh>`__ provides a high level API
for Bokeh that can be loaded as a native Pandas plotting backend via
`pandas-Bokeh <https://github.com/PatrikHlobil/pandas-Bokeh>`__ provides a high level API
for Bokeh that can be loaded as a native pandas plotting backend via

.. code:: python

Expand Down Expand Up @@ -147,7 +147,7 @@ A good implementation for Python users is `has2k1/plotnine <https://github.com/h

`Plotly’s <https://plot.ly/>`__ `Python API <https://plot.ly/python/>`__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js <https://d3js.org/>`__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn <https://plot.ly/python/matplotlib-to-plotly-tutorial/>`__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks <https://plot.ly/ipython-notebooks/>`__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud <https://plot.ly/product/plans/>`__, `offline <https://plot.ly/python/offline/>`__, or `on-premise <https://plot.ly/product/enterprise/>`__ accounts for private use.

`QtPandas <https://github.com/draperjames/qtpandas>`__
`Qtpandas <https://github.com/draperjames/qtpandas>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Spun off from the main pandas library, the `qtpandas <https://github.com/draperjames/qtpandas>`__
Expand All @@ -163,7 +163,7 @@ IDE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

IPython is an interactive command shell and distributed computing
environment. IPython tab completion works with Pandas methods and also
environment. IPython tab completion works with pandas methods and also
attributes like DataFrame columns.

`Jupyter Notebook / Jupyter Lab <https://jupyter.org>`__
Expand All @@ -177,7 +177,7 @@ Jupyter notebooks can be converted to a number of open standard output formats
Python) through 'Download As' in the web interface and ``jupyter convert``
in a shell.

Pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods
pandas DataFrames implement ``_repr_html_`` and ``_repr_latex_`` methods
which are utilized by Jupyter Notebook for displaying
(abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped.
(Note: HTML tables may or may not be
Expand Down Expand Up @@ -205,7 +205,7 @@ Its `Variable Explorer <https://docs.spyder-ide.org/variableexplorer.html>`__
allows users to view, manipulate and edit pandas ``Index``, ``Series``,
and ``DataFrame`` objects like a "spreadsheet", including copying and modifying
values, sorting, displaying a "heatmap", converting data types and more.
Pandas objects can also be renamed, duplicated, new columns added,
pandas objects can also be renamed, duplicated, new columns added,
copied/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file.
Spyder can also import data from a variety of plain text and binary files
or the clipboard into a new pandas DataFrame via a sophisticated import wizard.
Expand Down Expand Up @@ -252,13 +252,13 @@ The following data feeds are available:
`quandl/Python <https://github.com/quandl/Python>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Quandl API for Python wraps the Quandl REST API to return
Pandas DataFrames with timeseries indexes.
pandas DataFrames with timeseries indexes.

`pydatastream <https://github.com/vfilimonov/pydatastream>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
PyDatastream is a Python interface to the
`Refinitiv Datastream (DWS) <https://www.refinitiv.com/en/products/datastream-macroeconomic-analysis>`__
REST API to return indexed Pandas DataFrames with financial data.
REST API to return indexed pandas DataFrames with financial data.
This package requires valid credentials for this API (non free).

`pandaSDMX <https://pandasdmx.readthedocs.io>`__
Expand Down Expand Up @@ -312,7 +312,7 @@ Out-of-core
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Blaze provides a standard API for doing computations with various
in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables,
in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables,
PySpark.

`Dask <https://dask.readthedocs.io/en/latest/>`__
Expand Down Expand Up @@ -358,7 +358,7 @@ If also displays progress bars.
`Ray <https://ray.readthedocs.io/en/latest/pandas_on_ray.html>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Pandas on Ray is an early stage DataFrame library that wraps Pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous Pandas notebooks while experiencing a considerable speedup from Pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use Pandas on Ray just like you would Pandas.
pandas on Ray is an early stage DataFrame library that wraps pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous pandas notebooks while experiencing a considerable speedup from pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use pandas on Ray just like you would pandas.

.. code:: python

Expand All @@ -369,7 +369,7 @@ Pandas on Ray is an early stage DataFrame library that wraps Pandas and transpar
`Vaex <https://docs.vaex.io/>`__
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to Pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted).
Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted).

* vaex.from_pandas
* vaex.to_pandas_df
Expand All @@ -379,7 +379,7 @@ Increasingly, packages are being built on top of pandas to address specific need
Extension data types
--------------------

Pandas provides an interface for defining
pandas provides an interface for defining
:ref:`extension types <extending.extension-types>` to extend NumPy's type
system. The following libraries implement that interface to provide types not
found in NumPy or pandas, which work well with pandas' data containers.
Expand Down Expand Up @@ -411,4 +411,4 @@ Library Accessor Classes Description
.. _pdvega: https://altair-viz.github.io/pdvega/
.. _Altair: https://altair-viz.github.io/
.. _pandas_path: https://github.com/drivendataorg/pandas-path/
.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
Original file line number Diff line number Diff line change
Expand Up @@ -752,4 +752,4 @@ to interop data between SAS and pandas is to serialize to csv.
Wall time: 14.6 s

In [9]: %time df = pd.read_csv('big.csv')
Wall time: 4.86 s
Wall time: 4.86 s