diff --git a/.gitignore b/.gitignore index 25f1efd830f5c..da77462c632bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*~ *.pyc *.pyo *.swp diff --git a/.travis.yml b/.travis.yml index 4705e6b200b42..053554b5cf93c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,40 +11,65 @@ env: - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" matrix: + fast_finish: true include: - python: 2.6 env: - NOSE_ARGS="not slow and not network and not disabled" - CLIPBOARD=xclip - LOCALE_OVERRIDE="it_IT.UTF-8" - - JOB_NAME: "26_nslow_nnet" # ScatterCI Build name, 20 chars max + - JOB_NAME: "26_nslow_nnet" - python: 2.7 env: - NOSE_ARGS="slow and not network and not disabled" - LOCALE_OVERRIDE="zh_CN.GB18030" - FULL_DEPS=true - JOB_TAG=_LOCALE - - JOB_NAME: "27_slow_nnet_LOCALE" # ScatterCI Build name, 20 chars max + - JOB_NAME: "27_slow_nnet_LOCALE" - python: 2.7 env: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD_GUI=gtk2 - - JOB_NAME: "27_nslow" # ScatterCI Build name, 20 chars max + - JOB_NAME: "27_nslow" - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests - python: 3.2 env: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD_GUI=qt4 - - JOB_NAME: "32_nslow" # ScatterCI Build name, 20 chars max + - JOB_NAME: "32_nslow" - python: 3.3 env: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel - - JOB_NAME: "33_nslow" # ScatterCI Build name, 20 chars max - + - JOB_NAME: "33_nslow" + - python: 2.7 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_master" + - JOB_TAG=_NUMPY_DEV_master + - NUMPY_BUILD=master + - python: 2.7 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_1.8.x" + - JOB_TAG=_NUMPY_DEV_1_8_x + - NUMPY_BUILD=maintenance/1.8.x + allow_failures: + - python: 2.7 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_master" + - JOB_TAG=_NUMPY_DEV_master + - NUMPY_BUILD=master + - python: 2.7 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_1.8.x" + - JOB_TAG=_NUMPY_DEV_1_8_x + - NUMPY_BUILD=maintenance/1.8.x # allow importing from site-packages, # so apt-get python-x works for system pythons diff --git a/ci/install.sh b/ci/install.sh index 77755a26393c0..175252db3c9f1 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -31,28 +31,47 @@ edit_init python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" [ "$python_major_version" == "2" ] && python_major_version="" -pip install -I -U setuptools -pip install wheel +# fix these versions +pip install -I pip==1.5.1 +pip install -I setuptools==2.2 +pip install wheel==0.22 # comment this line to disable the fetching of wheel files -base_url=http://cache27diy-cpycloud.rhcloud.com +base_url=http://pandas.pydata.org/pandas-build/dev/wheels + wheel_box=${TRAVIS_PYTHON_VERSION}${JOB_TAG} PIP_ARGS+=" -I --use-wheel --find-links=$base_url/$wheel_box/ --allow-external --allow-insecure" -# Force virtualenv to accept system_site_packages -rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt - - if [ -n "$LOCALE_OVERRIDE" ]; then # make sure the locale is available # probably useless, since you would need to relogin time sudo locale-gen "$LOCALE_OVERRIDE" fi - # we need these for numpy time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran +if [ -n "$NUMPY_BUILD" ]; 
then + # building numpy + curdir=$(pwd) + echo "building numpy: $curdir" + + # remove the system installed numpy + pip uninstall numpy -y + + # clone & install + git clone --branch $NUMPY_BUILD https://github.com/numpy/numpy.git numpy + cd numpy + time sudo python setup.py install + + cd $curdir + numpy_version=$(python -c 'import numpy; print(numpy.__version__)') + echo "numpy: $numpy_version" +else + # Force virtualenv to accept system_site_packages + rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt +fi + time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.txt @@ -98,6 +117,10 @@ export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH which gcc ccache -z time pip install $(find dist | grep gz | head -n 1) -# restore cython -time pip install $PIP_ARGS $(cat ci/requirements-${wheel_box}.txt | grep -i cython) + +# restore cython (if not numpy building) +if [ -z "$NUMPY_BUILD" ]; then + time pip install $PIP_ARGS $(cat ci/requirements-${wheel_box}.txt | grep -i cython) +fi + true diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt index 721204f5c8f6e..8fc289b5e1511 100644 --- a/ci/requirements-2.7.txt +++ b/ci/requirements-2.7.txt @@ -13,7 +13,6 @@ xlrd==0.9.2 patsy==0.1.0 html5lib==1.0b2 lxml==3.2.1 -scikits.timeseries==0.91.3 scipy==0.10.0 beautifulsoup4==4.2.1 statsmodels==0.5.0 diff --git a/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt b/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt new file mode 100644 index 0000000000000..90fa8f11c1cfd --- /dev/null +++ b/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt @@ -0,0 +1,3 @@ +python-dateutil +pytz==2013b +cython==0.19.1 diff --git a/ci/requirements-2.7_NUMPY_DEV_master.txt b/ci/requirements-2.7_NUMPY_DEV_master.txt new file mode 100644 index 0000000000000..7d1d11daf9eeb --- /dev/null +++ b/ci/requirements-2.7_NUMPY_DEV_master.txt @@ -0,0 +1,3 @@ +python-dateutil +pytz +cython==0.19.1 diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt index 8096745447648..8e85c1108b5bf 100644 --- a/ci/requirements-3.3.txt +++ b/ci/requirements-3.3.txt @@ -8,6 +8,7 @@ numpy==1.8.0 cython==0.19.1 numexpr==2.3 tables==3.1.0 +bottleneck==0.8.0 matplotlib==1.2.1 patsy==0.1.0 lxml==3.2.1 diff --git a/ci/speedpack/build.sh b/ci/speedpack/build.sh index 689f9aa5db8ea..21ebdcf78b548 100755 --- a/ci/speedpack/build.sh +++ b/ci/speedpack/build.sh @@ -103,6 +103,12 @@ function generate_wheels() { } +# generate a single wheel version +# generate_wheels "/reqf/requirements-3.3.txt" +# +# if vagrant is already up +# run as vagrant provision + for reqfile in $(ls -1 /reqf/requirements-*.*); do generate_wheels "$reqfile" done diff --git a/doc/source/api.rst b/doc/source/api.rst index 90a12d449839b..811301a6bbbca 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -424,10 +424,25 @@ Time series-related Series.shift Series.first_valid_index Series.last_valid_index - Series.weekday Series.resample Series.tz_convert Series.tz_localize + Series.year + Series.month + Series.day + Series.hour + Series.minute + Series.second + Series.microsecond + Series.nanosecond + Series.date + Series.time + Series.dayofyear + Series.weekofyear + Series.week + Series.dayofweek + Series.weekday + Series.quarter String handling ~~~~~~~~~~~~~~~~~~~ @@ -1129,7 +1144,9 @@ Time/Date Components DatetimeIndex.dayofweek DatetimeIndex.weekday DatetimeIndex.quarter - + DatetimeIndex.tz + DatetimeIndex.freq + DatetimeIndex.freqstr Selecting ~~~~~~~~~ @@ -1159,7 +1176,7 @@ Conversion DatetimeIndex.to_datetime DatetimeIndex.to_period 
DatetimeIndex.to_pydatetime - + DatetimeIndex.to_series GroupBy ------- diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 253eafb36653f..7de0b85ede51f 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -30,6 +30,43 @@ R packages. Base R ------ +Slicing with R's |c|_ +~~~~~~~~~~~~~~~~~~~~~ + +R makes it easy to access ``data.frame`` columns by name + +.. code-block:: r + + df <- data.frame(a=rnorm(5), b=rnorm(5), c=rnorm(5), d=rnorm(5), e=rnorm(5)) + df[, c("a", "c", "e")] + +or by integer location + +.. code-block:: r + + df <- data.frame(matrix(rnorm(1000), ncol=100)) + df[, c(1:10, 25:30, 40, 50:100)] + +Selecting multiple columns by name in ``pandas`` is straightforward + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 3), columns=list('abc')) + df[['a', 'c']] + df.loc[:, ['a', 'c']] + +Selecting multiple noncontiguous columns by integer location can be achieved +with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. + +.. ipython:: python + + named = list('abcdefg') + n = 30 + columns = named + np.arange(len(named), n).tolist() + df = DataFrame(np.random.randn(n, n), columns=columns) + + df.iloc[:, np.r_[:10, 24:30]] + |aggregate|_ ~~~~~~~~~~~~ @@ -407,6 +444,9 @@ The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. +.. |c| replace:: ``c`` +.. _c: http://stat.ethz.ch/R-manual/R-patched/library/base/html/c.html + .. |aggregate| replace:: ``aggregate`` .. _aggregate: http://finzi.psych.upenn.edu/R/library/stats/html/aggregate.html diff --git a/doc/source/conf.py b/doc/source/conf.py index ee007e3489e3a..dd6635b8d70df 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -40,8 +40,8 @@ 'sphinx.ext.extlinks', 'sphinx.ext.todo', 'numpydoc', # used to parse numpy-style docstrings for autodoc - 'ipython_directive', - 'ipython_console_highlighting', + 'ipython_sphinxext.ipython_directive', + 'ipython_sphinxext.ipython_console_highlighting', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.coverage', diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 4cbee3e79a2f1..1a443df8b7b77 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -250,6 +250,9 @@ Turn a matrix with hours in columns and days in rows into a continuous row seque `How to rearrange a python pandas DataFrame? `__ +`Dealing with duplicates when reindexing a timeseries to a specified frequency +`__ + .. _cookbook.resample: Resampling @@ -470,6 +473,75 @@ Storing Attributes to a group node store.close() os.remove('test.h5') + +.. _cookbook.binary: + +Binary Files +~~~~~~~~~~~~ + +Pandas readily accepts numpy record arrays, if you need to read in a binary +file consisting of an array of C structs. For example, given this C program +in a file called ``main.c`` compiled with ``gcc main.c -std=gnu99`` on a +64-bit machine, + +.. 
code-block:: c + + #include + #include + + typedef struct _Data + { + int32_t count; + double avg; + float scale; + } Data; + + int main(int argc, const char *argv[]) + { + size_t n = 10; + Data d[n]; + + for (int i = 0; i < n; ++i) + { + d[i].count = i; + d[i].avg = i + 1.0; + d[i].scale = (float) i + 2.0f; + } + + FILE *file = fopen("binary.dat", "wb"); + fwrite(&d, sizeof(Data), n, file); + fclose(file); + + return 0; + } + +the following Python code will read the binary file ``'binary.dat'`` into a +pandas ``DataFrame``, where each element of the struct corresponds to a column +in the frame: + +.. code-block:: python + + import numpy as np + from pandas import DataFrame + + names = 'count', 'avg', 'scale' + + # note that the offsets are larger than the size of the type because of + # struct padding + offsets = 0, 8, 16 + formats = 'i4', 'f8', 'f4' + dt = np.dtype({'names': names, 'offsets': offsets, 'formats': formats}, + align=True) + df = DataFrame(np.fromfile('binary.dat', dt)) + +.. note:: + + The offsets of the structure elements may be different depending on the + architecture of the machine on which the file was created. Using a raw + binary file format like this for general data storage is not recommended, as + it is not cross platform. We recommended either HDF5 or msgpack, both of + which are supported by pandas' IO facilities. + Computation ----------- diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 34166343817a4..00c76632ce17b 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -300,7 +300,7 @@ Expression Evaluation via :func:`~pandas.eval` (Experimental) .. versionadded:: 0.13 -The top-level function :func:`~pandas.eval` implements expression evaluation of +The top-level function :func:`pandas.eval` implements expression evaluation of :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects. .. note:: @@ -336,11 +336,11 @@ engine in addition to some extensions available only in pandas. Supported Syntax ~~~~~~~~~~~~~~~~ -These operations are supported by :func:`~pandas.eval`: +These operations are supported by :func:`pandas.eval`: - Arithmetic operations except for the left shift (``<<``) and right shift (``>>``) operators, e.g., ``df + 2 * pi / s ** 4 % 42 - the_golden_ratio`` -- Comparison operations, e.g., ``2 < df < df2`` +- Comparison operations, including chained comparisons, e.g., ``2 < df < df2`` - Boolean operations, e.g., ``df < df2 and df3 < df4 or not df_bool`` - ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)`` - Attribute access, e.g., ``df.a`` @@ -373,9 +373,9 @@ This Python syntax is **not** allowed: :func:`~pandas.eval` Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`~pandas.eval` works wonders for expressions containing large arrays +:func:`pandas.eval` works well with expressions containing large arrays -First let's create 4 decent-sized arrays to play with: +First let's create a few decent-sized arrays to play with: .. ipython:: python @@ -441,8 +441,10 @@ Now let's do the same thing but with comparisons: The ``DataFrame.eval`` method (Experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In addition to the top level :func:`~pandas.eval` function you can also -evaluate an expression in the "context" of a ``DataFrame``. +.. versionadded:: 0.13 + +In addition to the top level :func:`pandas.eval` function you can also +evaluate an expression in the "context" of a :class:`~pandas.DataFrame`. .. 
ipython:: python :suppress: @@ -462,10 +464,10 @@ evaluate an expression in the "context" of a ``DataFrame``. df = DataFrame(randn(5, 2), columns=['a', 'b']) df.eval('a + b') -Any expression that is a valid :func:`~pandas.eval` expression is also a valid -``DataFrame.eval`` expression, with the added benefit that *you don't have to -prefix the name of the* ``DataFrame`` *to the column(s) you're interested in -evaluating*. +Any expression that is a valid :func:`pandas.eval` expression is also a valid +:meth:`DataFrame.eval` expression, with the added benefit that you don't have to +prefix the name of the :class:`~pandas.DataFrame` to the column(s) you're +interested in evaluating. In addition, you can perform assignment of columns within an expression. This allows for *formulaic evaluation*. Only a single assignment is permitted. @@ -480,55 +482,75 @@ it must be a valid Python identifier. df.eval('a = 1') df +The equivalent in standard Python would be + +.. ipython:: python + + df = DataFrame(dict(a=range(5), b=range(5, 10))) + df['c'] = df.a + df.b + df['d'] = df.a + df.b + df.c + df['a'] = 1 + df + Local Variables ~~~~~~~~~~~~~~~ -You can refer to local variables the same way you would in vanilla Python +In pandas version 0.14 the local variable API has changed. In pandas 0.13.x, +you could refer to local variables the same way you would in standard Python. +For example, -.. ipython:: python +.. code-block:: python df = DataFrame(randn(5, 2), columns=['a', 'b']) newcol = randn(len(df)) df.eval('b + newcol') -.. note:: + UndefinedVariableError: name 'newcol' is not defined - The one exception is when you have a local (or global) with the same name as - a column in the ``DataFrame`` +As you can see from the exception generated, this syntax is no longer allowed. +You must *explicitly reference* any local variable that you want to use in an +expression by placing the ``@`` character in front of the name. For example, - .. code-block:: python +.. ipython:: python - df = DataFrame(randn(5, 2), columns=['a', 'b']) - a = randn(len(df)) - df.eval('a + b') - NameResolutionError: resolvers and locals overlap on names ['a'] + df = DataFrame(randn(5, 2), columns=list('ab')) + newcol = randn(len(df)) + df.eval('b + @newcol') + df.query('b < @newcol') +If you don't prefix the local variable with ``@``, pandas will raise an +exception telling you the variable is undefined. - To deal with these conflicts, a special syntax exists for referring - variables with the same name as a column +When using :meth:`DataFrame.eval` and :meth:`DataFrame.query`, this allows you +to have a local variable and a :class:`~pandas.DataFrame` column with the same +name in an expression. - .. ipython:: python - :suppress: - a = randn(len(df)) +.. ipython:: python - .. ipython:: python + a = randn() + df.query('@a < a') + df.loc[a < df.a] # same as the previous expression - df.eval('@a + b') +With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it +isn't defined in that context. ``pandas`` will let you know this if you try to +use ``@`` in a top-level call to :func:`pandas.eval`. For example, - The same is true for :meth:`~pandas.DataFrame.query` +.. ipython:: python + :okexcept: - .. ipython:: python + a, b = 1, 2 + pd.eval('@a + b') - df.query('@a < b') +In this case, you should simply refer to the variables like you would in +standard Python. - .. ipython:: python - :suppress: +.. 
ipython:: python - del a + pd.eval('a + b') -:func:`~pandas.eval` Parsers +:func:`pandas.eval` Parsers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are two different parsers and and two different engines you can use as @@ -568,7 +590,7 @@ The ``and`` and ``or`` operators here have the same precedence that they would in vanilla Python. -:func:`~pandas.eval` Backends +:func:`pandas.eval` Backends ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There's also the option to make :func:`~pandas.eval` operate identical to plain @@ -577,12 +599,12 @@ ol' Python. .. note:: Using the ``'python'`` engine is generally *not* useful, except for testing - other :func:`~pandas.eval` engines against it. You will acheive **no** - performance benefits using :func:`~pandas.eval` with ``engine='python'``. + other evaluation engines against it. You will acheive **no** performance + benefits using :func:`~pandas.eval` with ``engine='python'`` and in fact may + incur a performance hit. -You can see this by using :func:`~pandas.eval` with the ``'python'`` engine is -actually a bit slower (not by much) than evaluating the same expression in -Python: +You can see this by using :func:`pandas.eval` with the ``'python'`` engine. It +is a bit slower (not by much) than evaluating the same expression in Python .. ipython:: python @@ -593,15 +615,15 @@ Python: %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') -:func:`~pandas.eval` Performance +:func:`pandas.eval` Performance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`~pandas.eval` is intended to speed up certain kinds of operations. In particular, those operations involving complex expressions with large -``DataFrame``/``Series`` objects should see a significant performance benefit. -Here is a plot showing the running time of :func:`~pandas.eval` as function of -the size of the frame involved in the computation. The two lines are two -different engines. +:class:`~pandas.DataFrame`/:class:`~pandas.Series` objects should see a +significant performance benefit. Here is a plot showing the running time of +:func:`pandas.eval` as function of the size of the frame involved in the +computation. The two lines are two different engines. .. image:: _static/eval-perf.png @@ -618,19 +640,31 @@ different engines. This plot was created using a ``DataFrame`` with 3 columns each containing floating point values generated using ``numpy.random.randn()``. -Technical Minutia -~~~~~~~~~~~~~~~~~ -- Expressions that would result in an object dtype (including simple - variable evaluation) have to be evaluated in Python space. The main reason - for this behavior is to maintain backwards compatbility with versions of - numpy < 1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` - will truncate any strings that are more than 60 characters in length. Second, - we can't pass ``object`` arrays to ``numexpr`` thus string comparisons must - be evaluated in Python space. -- The upshot is that this *only* applies to object-dtype'd expressions. So, - if you have an expression--for example--that's a string comparison - ``and``-ed together with another boolean expression that's from a numeric - comparison, the numeric comparison will be evaluated by ``numexpr``. In fact, - in general, :func:`~pandas.query`/:func:`~pandas.eval` will "pick out" the - subexpressions that are ``eval``-able by ``numexpr`` and those that must be - evaluated in Python space transparently to the user. 
+Technical Minutia Regarding Expression Evaluation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Expressions that would result in an object dtype or involve datetime operations +(because of ``NaT``) must be evaluated in Python space. The main reason for +this behavior is to maintain backwards compatbility with versions of numpy < +1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` will +truncate any strings that are more than 60 characters in length. Second, we +can't pass ``object`` arrays to ``numexpr`` thus string comparisons must be +evaluated in Python space. + +The upshot is that this *only* applies to object-dtype'd expressions. So, if +you have an expression--for example + +.. ipython:: python + + df = DataFrame({'strings': np.repeat(list('cba'), 3), + 'nums': np.repeat(range(3), 3)}) + df + df.query('strings == "a" and nums == 1') + +the numeric part of the comparison (``nums == 1``) will be evaluated by +``numexpr``. + +In general, :meth:`DataFrame.query`/:func:`pandas.eval` will +evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those +that must be evaluated in Python space transparently to the user. This is done +by inferring the result type of an expression from its arguments and operators. diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 97699aa32890d..49d463d07e75e 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -533,7 +533,15 @@ parse HTML tables in the top-level pandas io function ``read_html``. Byte-Ordering Issues -------------------- Occasionally you may have to deal with data that were created on a machine with -a different byte order than the one on which you are running Python. To deal +a different byte order than the one on which you are running Python. A common symptom of this issue is an error like + +.. code-block:: python + + Traceback + ... + ValueError: Big-endian buffer not supported on little-endian compiler + +To deal with this issue you should convert the underlying NumPy array to the native system byte order *before* passing it to Series/DataFrame/Panel constructors using something similar to the following: diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index a88b7332d9b9e..b5c15f83bb9d3 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -707,6 +707,66 @@ can be used as group keys. If so, the order of the levels will be preserved: data.groupby(factor).mean() + +Taking the first rows of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Just like for a DataFrame or Series you can call head and tail on a groupby: + +.. ipython:: python + + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df + + g = df.groupby('A') + g.head(1) + + g.tail(1) + +This shows the first or last n rows from each group. + +.. warning:: + + Before 0.14.0 this was implemented with a fall-through apply, + so the result would incorrectly respect the as_index flag: + + .. code-block:: python + + >>> g.head(1): # was equivalent to g.apply(lambda x: x.head(1)) + A B + A + 1 0 1 2 + 5 2 5 6 + +Taking the nth row of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To select from a DataFrame or Series the nth item, use the nth method: + +.. ipython:: python + + DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) + + g.nth(1) + + g.nth(-1) + +If you want to select the nth not-null method, use the dropna kwarg. 
For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna, for a Series this just needs to be truthy. + +.. ipython:: python + + g.nth(0, dropna='any') + + g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + + g.B.nth(0, dropna=True) + +.. warning:: + + Before 0.14.0 this method existed but did not work correctly on DataFrames. The API has changed so that it filters by default, but the old behaviour (for Series) can be achieved by passing dropna. An alternative is to dropna before doing the groupby. + Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ @@ -734,3 +794,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]}) df df.groupby(df.sum(), axis=1).sum() + + +Group DataFrame columns, compute a set of metrics and return a named Series. +The Series name is used as the name for the column index. This is especially +useful in conjunction with reshaping operations such as stacking in which the +column index name will be used as the name of the inserted column: + +.. ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + }) + + def compute_metrics(x): + result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} + return pd.Series(result, name='metrics') + + result = df.groupby('a').apply(compute_metrics) + + result + + result.stack() diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index afeb3fcc7764c..bca009c6b8931 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -77,9 +77,9 @@ of multi-axis indexing. See more at :ref:`Selection by Label ` - ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of - the axis), will raise ``IndexError`` if a single index is requested and it - is out-of-bounds, otherwise it will conform the bounds to size of the object. - Allowed inputs are: + the axis), will raise ``IndexError`` if an indexer is requested and it + is out-of-bounds, except *slice* indexers which allow out-of-bounds indexing. + (this conforms with python/numpy *slice* semantics). Allowed inputs are: - An integer e.g. ``5`` - A list or array of integers ``[4, 3, 0]`` @@ -421,19 +421,28 @@ python/numpy allow slicing past the end of an array without an associated error. x[4:10] x[8:10] -- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers, e.g. a value that exceeds the length of the object being +- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds - values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise - ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned) + values. A single indexer / list of indexers that is out-of-bounds will still raise + ``IndexError`` (:issue:`6296`, :issue:`6299`). This could result in an empty axis (e.g. an empty DataFrame being returned) .. ipython:: python dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) dfl - dfl.iloc[[4,5,6]] - dfl.iloc[4:6] dfl.iloc[:,2:3] dfl.iloc[:,1:3] + dfl.iloc[4:6] + +These are out-of-bounds selections + +.. 
code-block:: python + + dfl.iloc[[4,5,6]] + IndexError: positional indexers are out-of-bounds + + dfl.iloc[:,4] + IndexError: single positional indexer is out-of-bounds .. _indexing.basics.partial_setting: @@ -911,9 +920,9 @@ You can combine this with other expressions for very succinct queries: **expression itself** is evaluated in vanilla Python. For example, in the expression - .. code-block:: python + .. code-block:: python - df.query('a in b + c + d') + df.query('a in b + c + d') ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in`` operation is evaluated in plain Python. In general, any operations that can @@ -1504,6 +1513,18 @@ operators: a & b a - b +Also available is the ``sym_diff (^)`` operation, which returns elements +that appear in either ``idx1`` or ``idx2`` but not both. This is +equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)``, +with duplicates dropped. + +.. ipython:: python + + idx1 = Index([1, 2, 3, 4]) + idx2 = Index([2, 3, 4, 5]) + idx1.sym_diff(idx2) + idx1 ^ idx2 + The ``isin`` method of Index objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 72344ee003547..04fb0b0695f8f 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``: df1.append(df2, ignore_index=True) +.. _merging.mixed_ndims: + +Concatenating with mixed ndims +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can concatenate a mix of Series and DataFrames. The +Series will be transformed to DataFrames with the column name as +the name of the Series. + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + s1 = Series(randn(6), name='foo') + concat([df1, s1],axis=1) + +If unnamed Series are passed they will be numbered consecutively. + +.. ipython:: python + + s2 = Series(randn(6)) + concat([df1, s2, s2, s2],axis=1) + +Passing ``ignore_index=True`` will drop all name references. + +.. ipython:: python + + concat([df1, s1],axis=1,ignore_index=True) More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 8a2f647792f47..ac5c8a4463b39 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -602,17 +602,19 @@ You can also operate on the DataFrame in place argument to ``replace`` (``to_replace``) must match the type of the value being replaced type. For example, - .. code-block:: + .. code-block:: python s = Series([True, False, True]) - s.replace({'a string': 'new value', True: False}) + s.replace({'a string': 'new value', True: False}) # raises + + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' will raise a ``TypeError`` because one of the ``dict`` keys is not of the correct type for replacement. However, when replacing a *single* object such as, - .. code-block:: + .. ipython:: python s = Series([True, False, True]) s.replace('a string', 'another string') diff --git a/doc/source/release.rst b/doc/source/release.rst index 6e1632f036f38..394e429186045 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -54,6 +54,11 @@ New features ~~~~~~~~~~~~ - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) +- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`) +- Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``. The Julian + Date is used primarily in astronomy and represents the number of days from + noon, January 1, 4713 BC. 
Because nanoseconds are used to define the time + in pandas the actual range of dates that you can use is 1678 AD to 2262 AD. (:issue:`4041`) API Changes ~~~~~~~~~~~ @@ -66,11 +71,63 @@ API Changes - ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent; previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`) - ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`) +- ``df.to_csv`` will now return a string of the CSV data if neither a target path nor a buffer is provided + (:issue:`6061`) +- The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to + ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). +- allow a Series to utilize index methods depending on its index type, e.g. ``Series.year`` is now defined + for a Series with a ``DatetimeIndex`` or a ``PeriodIndex``; trying this on a non-supported Index type will + now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) + + The following affected: + + - ``date,time,year,month,day`` + - ``hour,minute,second,weekofyear`` + - ``week,dayofweek,dayofyear,quarter`` + - ``microsecond,nanosecond,qyear`` + - ``min(),max()`` + - ``pd.infer_freq()`` +- ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` + type (:issue:`6407`, :issue:`6463`) + +- Local variable usage has changed in + :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` + (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have + changed + - Column names are now given precedence over locals + - Local variables must be referred to explicitly. This means that even if + you have a local variable that is *not* a column you must still refer to + it with the ``'@'`` prefix. + - You can have an expression like ``df.query('@a < a')`` with no complaints + from ``pandas`` about ambiguity of the name ``a``. + +- The top-level :func:`pandas.eval` function does not allow you use the + ``'@'`` prefix and provides you with an error message telling you so. +- ``NameResolutionError`` was removed because it isn't necessary anymore. +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`) +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`). +- ``set_index`` no longer converts MultiIndexes to an Index of tuples (:issue:`6459`). +- Slicing with negative start, stop & step values handles corner cases better (:issue:`6531`): + - ``df.iloc[:-len(df)]`` is now empty + - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse + +- Better propagation/preservation of Series names when performing groupby + operations: + - ``SeriesGroupBy.agg`` will ensure that the name attribute of the original + series is propagated to the result (:issue:`6265`). + - If the function provided to ``GroupBy.apply`` returns a named series, the + name of the series will be kept as the name of the column index of the + DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates + ``DataFrame.stack`` operations where the name of the column index is used as + the name of the inserted column containing the pivoted data. 
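+
+  A condensed sketch of this behaviour (it mirrors the ``compute_metrics``
+  example added to the groupby docs in this change set; the frame and the
+  ``'metrics'`` name below are purely illustrative):
+
+  .. code-block:: python
+
+     import pandas as pd
+
+     df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [1, 2, 3, 4]})
+
+     def summarize(x):
+         # returning a *named* Series: the name becomes the name of the
+         # column index of the result, and hence the name of the column
+         # inserted by a subsequent stack()
+         return pd.Series({'b_sum': x['b'].sum()}, name='metrics')
+
+     result = df.groupby('a').apply(summarize)
+     result.stack()
+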
Experimental Features ~~~~~~~~~~~~~~~~~~~~~ + Improvements to existing features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -84,16 +141,25 @@ Improvements to existing features - improve performance of slice indexing on Series with string keys (:issue:`6341`, :issue:`6372`) - implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`) - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) -- Testing statements updated to use specialized asserts (:issue: `6175`) -- ``Series.rank()`` now has a percentage rank option (:issue: `5971`) +- Testing statements updated to use specialized asserts (:issue:`6175`) +- ``Series.rank()`` now has a percentage rank option (:issue:`5971`) +- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) +- perf improvements in DataFrame construction with certain offsets, by removing faulty caching + (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) +- perf improvements in single-dtyped indexing (:issue:`6484`) +- ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`) +- offset/freq info now in Timestamp __repr__ (:issue:`4553`) .. _release.bug_fixes-0.14.0: Bug Fixes ~~~~~~~~~ +- Bug in Series ValueError when index doesn't match data (:issue:`6532`) +- Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`) +- Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) - Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`) - Inconsistent tz parsing Timestamp/to_datetime for current year (:issue:`5958`) - Indexing bugs with reordered indexes (:issue:`6252`, :issue:`6254`) @@ -121,6 +187,46 @@ Bug Fixes - Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained keys not in the values to be replaced (:issue:`6342`) - Bug in take with duplicate columns not consolidated (:issue:`6240`) +- Bug in interpolate changing dtypes (:issue:`6290`) +- Bug in Series.get, was using a buggy access method (:issue:`6383`) +- Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`) +- Bug in DataFrame.dropna with duplicate indices (:issue:`6355`) +- Regression in chained getitem indexing with embedded list-like from 0.12 (:issue:`6394`) +- ``Float64Index`` with nans not comparing correctly +- ``eval``/``query`` expressions with strings containing the ``@`` character + will now work (:issue:`6366`). +- Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`) +- Bug in :meth:`DataFrame.replace` where nested dicts were erroneously + depending on the order of dictionary keys and values (:issue:`5338`). 
+- Perf issue in concatting with empty objects (:issue:`3259`) +- Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`) +- Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`) +- Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`) +- Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`) +- Bug in ``resample`` with a timezone and certain offsets (:issue:`6397`) +- Bug in ``iat/iloc`` with duplicate indices on a Series (:issue:`6493`) +- Bug in ``read_html`` where nan's were incorrectly being used to indicate + missing values in text. Should use the empty string for consistency with the + rest of pandas (:issue:`5129`). +- Bug in ``read_html`` tests where redirected invalid URLs would make one test + fail (:issue:`6445`). +- Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) +- Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`) +- Regression from 0.13 in the treatmenet of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`) +- ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`). +- Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`) +- Bug in setitem with loc on mixed integer Indexes (:issue:`6546`) +- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`) +- Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could exported using the + wrong data types and missing values (:issue:`6335`) +- Inconsistent types in Timestamp addition/subtraction (:issue:`6543`) +- Bug in preserving frequency across Timestamp addition/subtraction (:issue:`4547`) +- Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`) +- Series.quantile raising on an ``object`` dtype (:issue:`6555`) +- Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) +- Bug in fillna with method = 'bfill/ffill' and ``datetime64[ns]`` dtype (:issue:`6587`) +- Bug in sql writing with mixed dtypes possibly leading to data loss (:issue:`6509`) +- Bug in popping from a Series (:issue:`6600`) pandas 0.13.1 ------------- @@ -3706,7 +3812,7 @@ Improvements to existing features aggregation with axis != 0 API Changes -~~~~~~~~~ +~~~~~~~~~~~ Bug Fixes ~~~~~~~~~ diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index c2ee0fe9fc729..65ff95a905c14 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -22,7 +22,9 @@ are examples with real-world data, and all the bugs and weirdness that that entails. Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub -repository `_. +repository `_. To run the examples in this tutorial, you'll need to +clone the GitHub repository and get IPython Notebook running. +See `How to use this cookbook `_. - `A quick tour of the IPython Notebook: `_ Shows off IPython's awesome tab completion and magic functions. diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 58ae5084c4827..d773f3e7df799 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -15,19 +15,155 @@ Highlights include: API changes ~~~~~~~~~~~ -- ``iloc`` will now accept out-of-bounds indexers, e.g. a value that exceeds the length of the object being +- ``iloc`` will now accept out-of-bounds indexers for slices, e.g. 
a value that exceeds the length of the object being indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds - values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise - ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned) + values. A single indexer / list of indexers that is out-of-bounds will still raise + ``IndexError`` (:issue:`6296`, :issue:`6299`). This could result in an empty axis (e.g. an empty DataFrame being returned) + +.. ipython:: python + + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + dfl + dfl.iloc[:,2:3] + dfl.iloc[:,1:3] + dfl.iloc[4:6] + +These are out-of-bounds selections + +.. code-block:: python + + dfl.iloc[[4,5,6]] + IndexError: positional indexers are out-of-bounds + + dfl.iloc[:,4] + IndexError: single positional indexer is out-of-bounds + + +- The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to + ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). +- allow a Series to utilize index methods depending on its index type, e.g. ``Series.year`` is now defined + for a Series with a ``DatetimeIndex`` or a ``PeriodIndex``; trying this on a non-supported Index type will + now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) + + The following affected: + + - ``date,time,year,month,day`` + - ``hour,minute,second,weekofyear`` + - ``week,dayofweek,dayofyear,quarter`` + - ``microsecond,nanosecond,qyear`` + - ``min(),max()`` + - ``pd.infer_freq()`` + + .. ipython:: python + + s = Series(np.random.randn(5),index=tm.makeDateIndex(5)) + s + s.year + s.index.year + +- More consistent behaviour for some groupby methods: + - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: + + .. ipython:: python + + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.head(1) # filters DataFrame + + g.apply(lambda x: x.head(1)) # used to simply fall-through + + - groupby head and tail respect column selection: + + .. ipython:: python + + g[['B']].head(1) + + - groupby ``nth`` now filters by default, with optional dropna argument to ignore + NaN (to replicate the previous behaviour.) + + .. ipython:: python + + DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) # can also use negative ints + + g.nth(0, dropna='any') # similar to old behaviour + +- Local variable usage has changed in + :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` + (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have + changed + + - Column names are now given precedence over locals + - Local variables must be referred to explicitly. This means that even if + you have a local variable that is *not* a column you must still refer to + it with the ``'@'`` prefix. + - You can have an expression like ``df.query('@a < a')`` with no complaints + from ``pandas`` about ambiguity of the name ``a``. + +- The top-level :func:`pandas.eval` function does not allow you use the + ``'@'`` prefix and provides you with an error message telling you so. +- ``NameResolutionError`` was removed because it isn't necessary anymore. +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`). 
See :ref:`the docs ` +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`) + + .. ipython:: python + + i = pd.Index([1, 2, 3, 'a' , 'b', 'c']) + i[[0,1,2]] + + Previously, the above operation would return ``Int64Index``. If you'd like + to do this manually, use :meth:`Index.astype` + + .. ipython:: python + + i[[0,1,2]].astype(np.int_) +- ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example, + the old behavior returned an Index in this case (:issue:`6459`): + + .. ipython:: python + :suppress: + + from itertools import product + tuples = list(product(('a', 'b'), ('c', 'd'))) + mi = MultiIndex.from_tuples(tuples) + df_multi = DataFrame(np.random.randn(4, 2), index=mi) + tuple_ind = pd.Index(tuples) .. ipython:: python - df = DataFrame(np.random.randn(5,2),columns=list('AB')) - df - df.iloc[[4,5,6]] - df.iloc[4:6] - df.iloc[:,2:3] - df.iloc[:,1:3] + df_multi.index + + @suppress + df_multi.index = tuple_ind + + # Old behavior, casted MultiIndex to an Index + df_multi.set_index(df_multi.index) + + @suppress + df_multi.index = mi + + # New behavior + df_multi.set_index(df_multi.index) + + This also applies when passing multiple indices to ``set_index``: + + .. ipython:: python + + @suppress + df_multi.index = tuple_ind + + # Old output, 2-level MultiIndex of tuples + df_multi.set_index([df_multi.index, df_multi.index]) + + @suppress + df_multi.index = mi + + # New output, 4-level MultiIndex + df_multi.set_index([df_multi.index, df_multi.index]) + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -181,10 +317,21 @@ Enhancements - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) +- Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex`` + to convert to the Julian Date used primarily in astronomy. (:issue:`4041`) +- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types + and will upcast when needed. When it isn't possibly to losslessly upcast, a warning + is raised (:issue:`6327`) +- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp + and data_label which allow the time stamp and dataset label to be set when creating a + file. (:issue:`6545`) Performance ~~~~~~~~~~~ +- perf improvements in DataFrame construction with certain offsets, by removing faulty caching + (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) + Experimental ~~~~~~~~~~~~ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 081dfd0292cdc..5827f2e971e42 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -414,7 +414,7 @@ setting `kind='kde'`: @savefig kde_plot.png ser.plot(kind='kde') -.. _visualization.hexbin +.. _visualization.hexbin: Hexagonal Bin plot ~~~~~~~~~~~~~~~~~~ @@ -577,6 +577,11 @@ are what constitutes the bootstrap plot. @savefig bootstrap_plot.png bootstrap_plot(data, size=50, samples=500, color='grey') +.. ipython:: python + :suppress: + + plt.close('all') + .. _visualization.radviz: RadViz diff --git a/doc/sphinxext/README.rst b/doc/sphinxext/README.rst new file mode 100644 index 0000000000000..e39cf8daac036 --- /dev/null +++ b/doc/sphinxext/README.rst @@ -0,0 +1,17 @@ +sphinxext +========= + +This directory contains copies of different sphinx extensions in use in the +pandas documentation. 
These copies originate from other projects: + +- ``numpydoc`` - Numpy's Sphinx extensions: this can be found at its own + repository: https://github.com/numpy/numpydoc +- ``ipython_directive`` and ``ipython_console_highlighting`` in the folder + `ipython_sphinxext` - Sphinx extensions from IPython: these are included + in IPython: https://github.com/ipython/ipython/tree/master/IPython/sphinxext + +.. note:: + + These copies are maintained at the respective projects, so fixes should, + to the extent possible, be pushed upstream instead of only adapting our + local copy to avoid divergence between the the local and upstream version. diff --git a/doc/sphinxext/ipython_sphinxext/__init__.py b/doc/sphinxext/ipython_sphinxext/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/doc/sphinxext/ipython_console_highlighting.py b/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py similarity index 100% rename from doc/sphinxext/ipython_console_highlighting.py rename to doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py diff --git a/doc/sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py similarity index 100% rename from doc/sphinxext/ipython_directive.py rename to doc/sphinxext/ipython_sphinxext/ipython_directive.py diff --git a/pandas/__init__.py b/pandas/__init__.py index ff5588e778284..442c6b08c9dce 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -51,3 +51,4 @@ from pandas.tools.tile import cut, qcut from pandas.core.reshape import melt from pandas.util.print_versions import show_versions +import pandas.util.testing diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 7f406611c82f7..14c9ec2f3355d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -68,12 +68,14 @@ cdef: int TIEBREAK_MAX = 2 int TIEBREAK_FIRST = 3 int TIEBREAK_FIRST_DESCENDING = 4 + int TIEBREAK_DENSE = 5 tiebreakers = { 'average' : TIEBREAK_AVERAGE, 'min' : TIEBREAK_MIN, 'max' : TIEBREAK_MAX, - 'first' : TIEBREAK_FIRST + 'first' : TIEBREAK_FIRST, + 'dense' : TIEBREAK_DENSE, } @@ -137,7 +139,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[float64_t] sorted_data, ranks, values ndarray[int64_t] argsorted float64_t val, nan_value @@ -200,6 +202,10 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: return ranks / count @@ -214,7 +220,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[int64_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted @@ -265,6 +271,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: return ranks / count @@ -279,7 +289,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', """ 
cdef: - Py_ssize_t i, j, z, k, n, dups = 0 + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 ndarray[float64_t, ndim=2] ranks, values ndarray[int64_t, ndim=2] argsorted float64_t val, nan_value @@ -324,6 +334,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 + total_tie_count = 0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -347,6 +358,10 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: @@ -362,7 +377,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', """ cdef: - Py_ssize_t i, j, z, k, n, dups = 0 + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 ndarray[float64_t, ndim=2] ranks ndarray[int64_t, ndim=2] argsorted ndarray[int64_t, ndim=2, cast=True] values @@ -395,6 +410,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 + total_tie_count = 0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -415,6 +431,10 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: @@ -430,7 +450,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[float64_t] ranks ndarray sorted_data, values ndarray[int64_t] argsorted @@ -502,6 +522,10 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: ranks / count @@ -545,6 +569,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, infs, dups = 0 + Py_ssize_t total_tie_count = 0 ndarray[float64_t, ndim=2] ranks ndarray[object, ndim=2] values ndarray[int64_t, ndim=2] argsorted @@ -600,6 +625,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = infs = 0 + total_tie_count = 0 for j in range(k): val = values[i, j] if val is nan_value and keep_na: @@ -621,6 +647,10 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for ' 'non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 8ec3adcdffd6f..bff6eb1f95abc 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -54,6 +54,8 @@ import pickle as cPickle import http.client as httplib +from pandas.compat.chainmap import DeepChainMap + if PY3: def isidentifier(s): diff --git 
a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py new file mode 100644 index 0000000000000..9edd2ef056a52 --- /dev/null +++ b/pandas/compat/chainmap.py @@ -0,0 +1,26 @@ +try: + from collections import ChainMap +except ImportError: + from pandas.compat.chainmap_impl import ChainMap + + +class DeepChainMap(ChainMap): + def __setitem__(self, key, value): + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key): + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + # override because the m parameter is introduced in Python 3.4 + def new_child(self, m=None): + if m is None: + m = {} + return self.__class__(m, *self.maps) diff --git a/pandas/compat/chainmap_impl.py b/pandas/compat/chainmap_impl.py new file mode 100644 index 0000000000000..92d2424057f83 --- /dev/null +++ b/pandas/compat/chainmap_impl.py @@ -0,0 +1,136 @@ +from collections import MutableMapping + +try: + from thread import get_ident +except ImportError: + from _thread import get_ident + + +def recursive_repr(fillvalue='...'): + 'Decorator to make a repr function return fillvalue for a recursive call' + + def decorating_function(user_function): + repr_running = set() + + def wrapper(self): + key = id(self), get_ident() + if key in repr_running: + return fillvalue + repr_running.add(key) + try: + result = user_function(self) + finally: + repr_running.discard(key) + return result + + # Can't use functools.wraps() here because of bootstrap issues + wrapper.__module__ = getattr(user_function, '__module__') + wrapper.__doc__ = getattr(user_function, '__doc__') + wrapper.__name__ = getattr(user_function, '__name__') + return wrapper + + return decorating_function + + +class ChainMap(MutableMapping): + ''' A ChainMap groups multiple dicts (or other mappings) together + to create a single, updateable view. + + The underlying mappings are stored in a list. That list is public and can + accessed or updated using the *maps* attribute. There is no other state. + + Lookups search the underlying mappings successively until a key is found. + In contrast, writes, updates, and deletions only operate on the first + mapping. + + ''' + + def __init__(self, *maps): + '''Initialize a ChainMap by setting *maps* to the given mappings. + If no mappings are provided, a single empty dictionary is used. + + ''' + self.maps = list(maps) or [{}] # always at least one map + + def __missing__(self, key): + raise KeyError(key) + + def __getitem__(self, key): + for mapping in self.maps: + try: + return mapping[key] # can't use 'key in mapping' with defaultdict + except KeyError: + pass + return self.__missing__(key) # support subclasses that define __missing__ + + def get(self, key, default=None): + return self[key] if key in self else default + + def __len__(self): + return len(set().union(*self.maps)) # reuses stored hash values if possible + + def __iter__(self): + return iter(set().union(*self.maps)) + + def __contains__(self, key): + return any(key in m for m in self.maps) + + def __bool__(self): + return any(self.maps) + + @recursive_repr() + def __repr__(self): + return '{0.__class__.__name__}({1})'.format( + self, ', '.join(repr(m) for m in self.maps)) + + @classmethod + def fromkeys(cls, iterable, *args): + 'Create a ChainMap with a single dict created from the iterable.' 
+ return cls(dict.fromkeys(iterable, *args)) + + def copy(self): + 'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]' + return self.__class__(self.maps[0].copy(), *self.maps[1:]) + + __copy__ = copy + + def new_child(self, m=None): # like Django's Context.push() + ''' + New ChainMap with a new map followed by all previous maps. If no + map is provided, an empty dict is used. + ''' + if m is None: + m = {} + return self.__class__(m, *self.maps) + + @property + def parents(self): # like Django's Context.pop() + 'New ChainMap from maps[1:].' + return self.__class__(*self.maps[1:]) + + def __setitem__(self, key, value): + self.maps[0][key] = value + + def __delitem__(self, key): + try: + del self.maps[0][key] + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def popitem(self): + 'Remove and return an item pair from maps[0]. Raise KeyError is maps[0] is empty.' + try: + return self.maps[0].popitem() + except KeyError: + raise KeyError('No keys found in the first mapping.') + + def pop(self, key, *args): + 'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].' + try: + return self.maps[0].pop(key, *args) + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def clear(self): + 'Clear maps[0], leaving maps[1:] intact.' + self.maps[0].clear() diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 1685f66c15416..2e0845bddf7e2 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -34,40 +34,6 @@ def _zip_axes_from_type(typ, new_axes): return axes -def _maybe_promote_shape(values, naxes): - # test to see if we have an array else leave since must be a number - if not isinstance(values, np.ndarray): - return values - - ndims = values.ndim - if ndims > naxes: - raise AssertionError('cannot have more dims than axes, ' - '{0} > {1}'.format(ndims, naxes)) - if ndims == naxes: - return values - - ndim, nax = range(ndims), range(naxes) - - axes_slice = [slice(None)] * naxes - - # set difference of numaxes and ndims - slices = list(set(nax) - set(ndim)) - - if ndims == naxes: - if slices: - raise AssertionError('slices should be empty if ndims == naxes ' - '{0}'.format(slices)) - else: - if not slices: - raise AssertionError('slices should NOT be empty if ndim != naxes ' - '{0}'.format(slices)) - - for sl in slices: - axes_slice[sl] = np.newaxis - - return values[tuple(axes_slice)] - - def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" return any(isinstance(term.value, pd.core.generic.PandasObject) @@ -83,12 +49,7 @@ def wrapper(terms): term_values = (term.value for term in terms) - # only scalars or indexes - if all(isinstance(term.value, pd.Index) or term.isscalar for term in - terms): - return _result_type_many(*term_values), None - - # no pandas objects + # we don't have any pandas objects if not _any_pandas_objects(terms): return _result_type_many(*term_values), None @@ -148,44 +109,13 @@ def _align_core(terms): f = partial(ti.reindex_axis, reindexer, axis=axis, copy=False) - # need to fill if we have a bool dtype/array - if (isinstance(ti, (np.ndarray, pd.Series)) - and ti.dtype == object - and pd.lib.is_bool_array(ti.values)): - r = f(fill_value=True) - else: - r = f() - - terms[i].update(r) + terms[i].update(f()) - res = _maybe_promote_shape(terms[i].value.T if transpose else - terms[i].value, naxes) - res = res.T if transpose else res - - try: - v = res.values - except 
AttributeError: - v = res - terms[i].update(v) + terms[i].update(terms[i].value.values) return typ, _zip_axes_from_type(typ, axes) -def _filter_terms(flat): - # numeric literals - literals = frozenset(filter(lambda x: isinstance(x, Constant), flat)) - - # these are strings which are variable names - names = frozenset(flat) - literals - - # literals are not names and names are not literals, so intersection should - # be empty - if literals & names: - raise ValueError('literals cannot be names and names cannot be ' - 'literals') - return names, literals - - def _align(terms): """Align a set of terms""" try: @@ -231,10 +161,7 @@ def _reconstruct_object(typ, obj, axes, dtype): except AttributeError: pass - try: - res_t = np.result_type(obj.dtype, dtype) - except AttributeError: - res_t = dtype + res_t = np.result_type(obj.dtype, dtype) if (not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject)): diff --git a/pandas/computation/common.py b/pandas/computation/common.py index 0d5e639032b94..105cc497a4207 100644 --- a/pandas/computation/common.py +++ b/pandas/computation/common.py @@ -16,10 +16,7 @@ def _result_type_many(*arrays_and_dtypes): try: return np.result_type(*arrays_and_dtypes) except ValueError: - # length 0 or length > NPY_MAXARGS both throw a ValueError, so check - # which one we're dealing with - if len(arrays_and_dtypes) == 0: - raise ValueError('at least one array or dtype is required') + # we have > NPY_MAXARGS terms in our expression return reduce(np.result_type, arrays_and_dtypes) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 9738cac58fb2d..58b822af546c8 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -4,9 +4,34 @@ import abc from pandas import compat +from pandas.compat import DeepChainMap, map from pandas.core import common as com from pandas.computation.align import _align, _reconstruct_object -from pandas.computation.ops import UndefinedVariableError +from pandas.computation.ops import UndefinedVariableError, _mathops, _reductions + + +_ne_builtins = frozenset(_mathops + _reductions) + + +class NumExprClobberingError(NameError): + pass + + +def _check_ne_builtin_clash(expr): + """Attempt to prevent foot-shooting in a helpful way. 
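The guard introduced here is easiest to see from the caller's side; a short sketch of the failure mode it catches, mirroring the new ``check_numexpr_builtin_raises`` test further below and assuming numexpr is installed:

    import pandas as pd

    sin, dotted_line = 1, 2
    # 'sin' collides with a numexpr builtin, so with engine='numexpr' this
    # now raises NumExprClobberingError instead of silently evaluating
    # numexpr's own sin()
    pd.eval('sin + dotted_line', engine='numexpr')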
+ + Parameters + ---------- + terms : Term + Terms can contain + """ + names = expr.names + overlap = names & _ne_builtins + + if overlap: + s = ', '.join(map(repr, overlap)) + raise NumExprClobberingError('Variables in expression "%s" overlap with ' + 'numexpr builtins: (%s)' % (expr, s)) class AbstractEngine(object): @@ -29,9 +54,6 @@ def convert(self): """ return com.pprint_thing(self.expr) - def pre_evaluate(self): - self.expr.check_name_clashes() - def evaluate(self): """Run the engine on the expression @@ -47,7 +69,6 @@ def evaluate(self): self.result_type, self.aligned_axes = _align(self.expr.terms) # make sure no names in resolvers and locals/globals clash - self.pre_evaluate() res = self._evaluate() return _reconstruct_object(self.result_type, res, self.aligned_axes, self.expr.terms.return_type) @@ -87,16 +108,15 @@ def convert(self): def _evaluate(self): import numexpr as ne - # add the resolvers to locals - self.expr.add_resolvers_to_locals() - # convert the expression to a valid numexpr expression s = self.convert() try: - return ne.evaluate(s, local_dict=self.expr.env.locals, - global_dict=self.expr.env.globals, - truediv=self.expr.truediv) + env = self.expr.env + scope = env.full_scope + truediv = scope['truediv'] + _check_ne_builtin_clash(self.expr) + return ne.evaluate(s, local_dict=scope, truediv=truediv) except KeyError as e: # python 3 compat kludge try: @@ -118,7 +138,6 @@ def __init__(self, expr): super(PythonEngine, self).__init__(expr) def evaluate(self): - self.pre_evaluate() return self.expr() def _evaluate(self): diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 4cc68ac4770b3..82c68fb10e7d6 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -3,9 +3,11 @@ """Top level ``eval`` module. """ - +import tokenize from pandas.core import common as com -from pandas.computation.expr import Expr, _parsers, _ensure_scope +from pandas.computation.expr import Expr, _parsers, tokenize_string +from pandas.computation.scope import _ensure_scope +from pandas.compat import DeepChainMap, builtins from pandas.computation.engines import _engines from distutils.version import LooseVersion @@ -67,8 +69,8 @@ def _check_resolvers(resolvers): for resolver in resolvers: if not hasattr(resolver, '__getitem__'): name = type(resolver).__name__ - raise AttributeError('Resolver of type {0!r} must implement ' - 'the __getitem__ method'.format(name)) + raise TypeError('Resolver of type %r does not implement ' + 'the __getitem__ method' % name) def _check_expression(expr): @@ -116,8 +118,26 @@ def _convert_expression(expr): return s +def _check_for_locals(expr, stack_level, parser): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != 'pandas' + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ("The '@' prefix is not allowed in " + "top-level eval calls, \nplease refer to " + "your variables by name without the '@' " + "prefix") + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == '@': + raise SyntaxError(msg) + + def eval(expr, parser='pandas', engine='numexpr', truediv=True, - local_dict=None, global_dict=None, resolvers=None, level=2, + local_dict=None, global_dict=None, resolvers=(), level=0, target=None): """Evaluate a Python expression as a string using various backends. 
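Since the engine changes above start handing the full scope (a ``DeepChainMap``) straight to numexpr, a small usage sketch of the semantics the new class layers on top of ``ChainMap`` may help; it assumes this branch of pandas, and the mappings are invented for illustration:

    from pandas.compat import DeepChainMap  # exported by this change

    resolvers = {'a': [1, 2, 3]}   # e.g. a frame's column resolvers
    local_ns = {'cutoff': 2}
    global_ns = {'flag': True}

    scope = DeepChainMap(resolvers, local_ns, global_ns)

    scope['cutoff']         # 2, found in the second mapping
    scope['cutoff'] = 5     # unlike ChainMap, this updates local_ns in place
    scope['brand_new'] = 0  # unknown keys still land in maps[0] (resolvers)
    del scope['flag']       # removes from the first mapping holding the key

    child = scope.new_child({'tmp': 42})  # push a fresh mapping on the front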
@@ -198,10 +218,13 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, _check_engine(engine) _check_parser(parser) _check_resolvers(resolvers) + _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers, level=level, target=target) + level += 1 + env = _ensure_scope(level, global_dict=global_dict, + local_dict=local_dict, resolvers=resolvers, + target=target) parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 71501d5079c4c..1c40dc9930856 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -7,274 +7,130 @@ import inspect import tokenize import datetime -import struct from functools import partial import pandas as pd from pandas import compat -from pandas.compat import StringIO, zip, reduce, string_types +from pandas.compat import StringIO, lmap, zip, reduce, string_types from pandas.core.base import StringMixin from pandas.core import common as com -from pandas.computation.common import NameResolutionError +from pandas.tools.util import compose from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, _arith_ops_syms, _unary_ops_syms, is_term) from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div from pandas.computation.ops import UndefinedVariableError +from pandas.computation.scope import Scope, _ensure_scope -def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, - target=None, **kwargs): - """Ensure that we are grabbing the correct scope.""" - return Scope(gbls=global_dict, lcls=local_dict, level=level, - resolvers=resolvers, target=target) +def tokenize_string(source): + """Tokenize a Python source code string. - -def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): - """Make sure that variables in resolvers don't overlap with locals or - globals. - """ - res_locals = list(com.intersection(resolver_keys, local_keys)) - if res_locals: - msg = "resolvers and locals overlap on names {0}".format(res_locals) - raise NameResolutionError(msg) - - res_globals = list(com.intersection(resolver_keys, global_keys)) - if res_globals: - msg = "resolvers and globals overlap on names {0}".format(res_globals) - raise NameResolutionError(msg) - - -def _replacer(x, pad_size): - """Replace a number with its padded hexadecimal representation. Used to tag - temporary variables with their calling scope's id. + Parameters + ---------- + source : str + A Python source code string """ - # get the hex repr of the binary char and remove 0x and pad by pad_size - # zeros - try: - hexin = ord(x) - except TypeError: - # bytes literals masquerade as ints when iterating in py3 - hexin = x + line_reader = StringIO(source).readline + for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader): + yield toknum, tokval - return hex(hexin).replace('0x', '').rjust(pad_size, '0') - -def _raw_hex_id(obj, pad_size=2): - """Return the padded hexadecimal id of ``obj``.""" - # interpret as a pointer since that's what really what id returns - packed = struct.pack('@P', id(obj)) - - return ''.join(_replacer(x, pad_size) for x in packed) - - -class Scope(StringMixin): - - """Object to hold scope, with a few bells to deal with some custom syntax - added by pandas. 
+def _rewrite_assign(tok): + """Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. Parameters ---------- - gbls : dict or None, optional, default None - lcls : dict or Scope or None, optional, default None - level : int, optional, default 1 - resolvers : list-like or None, optional, default None + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module - Attributes - ---------- - globals : dict - locals : dict - level : int - resolvers : tuple - resolver_keys : frozenset + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values """ - __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', - 'resolver_keys', '_resolver', 'level', 'ntemps', 'target') - - def __init__(self, gbls=None, lcls=None, level=1, resolvers=None, - target=None): - self.level = level - self.resolvers = tuple(resolvers or []) - self.globals = dict() - self.locals = dict() - self.target = target - self.ntemps = 1 # number of temporary variables in this scope - - if isinstance(lcls, Scope): - ld, lcls = lcls, dict() - self.locals.update(ld.locals.copy()) - self.globals.update(ld.globals.copy()) - self.resolvers += ld.resolvers - if ld.target is not None: - self.target = ld.target - self.update(ld.level) - - frame = sys._getframe(level) - try: - self.globals.update(gbls or frame.f_globals) - self.locals.update(lcls or frame.f_locals) - finally: - del frame - - # add some useful defaults - self.globals['Timestamp'] = pd.lib.Timestamp - self.globals['datetime'] = datetime - - # SUCH a hack - self.globals['True'] = True - self.globals['False'] = False - - # function defs - self.globals['list'] = list - self.globals['tuple'] = tuple + toknum, tokval = tok + return toknum, '==' if tokval == '=' else tokval - res_keys = (list(o.keys()) for o in self.resolvers) - self.resolver_keys = frozenset(reduce(operator.add, res_keys, [])) - self._global_resolvers = self.resolvers + (self.locals, self.globals) - self._resolver = None - self.resolver_dict = {} - for o in self.resolvers: - self.resolver_dict.update(dict(o)) +def _replace_booleans(tok): + """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. - def __unicode__(self): - return com.pprint_thing( - 'locals: {0}\nglobals: {0}\nresolvers: ' - '{0}\ntarget: {0}'.format(list(self.locals.keys()), - list(self.globals.keys()), - list(self.resolver_keys), - self.target)) - - def __getitem__(self, key): - return self.resolve(key, globally=False) - - def resolve(self, key, globally=False): - resolvers = self.locals, self.globals - if globally: - resolvers = self._global_resolvers - - for resolver in resolvers: - try: - return resolver[key] - except KeyError: - pass + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module - def update(self, level=None): - """Update the current scope by going back `level` levels. 
+ Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == '&': + return tokenize.NAME, 'and' + elif tokval == '|': + return tokenize.NAME, 'or' + return toknum, tokval + return toknum, tokval - Parameters - ---------- - level : int or None, optional, default None - """ - # we are always 2 levels below the caller - # plus the caller may be below the env level - # in which case we need addtl levels - sl = 2 - if level is not None: - sl += level - - # add sl frames to the scope starting with the - # most distant and overwritting with more current - # makes sure that we can capture variable scope - frame = inspect.currentframe() - try: - frames = [] - while sl >= 0: - frame = frame.f_back - sl -= 1 - if frame is None: - break - frames.append(frame) - for f in frames[::-1]: - self.locals.update(f.f_locals) - self.globals.update(f.f_globals) - finally: - del frame, frames - - def add_tmp(self, value, where='locals'): - """Add a temporary variable to the scope. - - Parameters - ---------- - value : object - An arbitrary object to be assigned to a temporary variable. - where : basestring, optional, default 'locals', {'locals', 'globals'} - What scope to add the value to. - Returns - ------- - name : basestring - The name of the temporary variable created. - """ - d = getattr(self, where, None) - - if d is None: - raise AttributeError("Cannot add value to non-existent scope " - "{0!r}".format(where)) - if not isinstance(d, dict): - raise TypeError("Cannot add value to object of type {0!r}, " - "scope must be a dictionary" - "".format(type(d).__name__)) - name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, - _raw_hex_id(self)) - d[name] = value - - # only increment if the variable gets put in the scope - self.ntemps += 1 - return name - - def remove_tmp(self, name, where='locals'): - d = getattr(self, where, None) - if d is None: - raise AttributeError("Cannot remove value from non-existent scope " - "{0!r}".format(where)) - if not isinstance(d, dict): - raise TypeError("Cannot remove value from object of type {0!r}, " - "scope must be a dictionary" - "".format(type(d).__name__)) - del d[name] - self.ntemps -= 1 - - -def _rewrite_assign(source): - """Rewrite the assignment operator for PyTables expression that want to use - ``=`` as a substitute for ``==``. - """ - res = [] - g = tokenize.generate_tokens(StringIO(source).readline) - for toknum, tokval, _, _, _ in g: - res.append((toknum, '==' if tokval == '=' else tokval)) - return tokenize.untokenize(res) +def _replace_locals(tok): + """Replace local variables with a syntactically valid name. + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module -def _replace_booleans(source): - """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise - precedence is changed to boolean precedence. + Returns + ------- + t : tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. 
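These per-token rewriters are composed by ``_preparse`` below; roughly, the end-to-end effect on a query string looks like the following sketch, which uses a standalone stand-in for the composed rewriters (the exact spacing produced by ``untokenize`` is approximate):

    import tokenize
    from io import StringIO

    def swap(tok):
        # stand-in for compose(_replace_locals, _replace_booleans, _rewrite_assign)
        toknum, tokval = tok
        if toknum == tokenize.OP:
            if tokval == '=':
                return tokenize.OP, '=='
            if tokval == '&':
                return tokenize.NAME, 'and'
            if tokval == '|':
                return tokenize.NAME, 'or'
            if tokval == '@':
                return tokenize.OP, '__pd_eval_local_'
        return toknum, tokval

    source = 'x = 1 & @y'
    tokens = ((num, val) for num, val, _, _, _ in
              tokenize.generate_tokens(StringIO(source).readline))
    print(tokenize.untokenize(swap(tok) for tok in tokens))
    # roughly: 'x ==1 and __pd_eval_local_y'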
""" - res = [] - g = tokenize.generate_tokens(StringIO(source).readline) - for toknum, tokval, _, _, _ in g: - if toknum == tokenize.OP: - if tokval == '&': - res.append((tokenize.NAME, 'and')) - elif tokval == '|': - res.append((tokenize.NAME, 'or')) - else: - res.append((toknum, tokval)) - else: - res.append((toknum, tokval)) - return tokenize.untokenize(res) + toknum, tokval = tok + if toknum == tokenize.OP and tokval == '@': + return tokenize.OP, _LOCAL_TAG + return toknum, tokval -def _replace_locals(source, local_symbol='@'): - """Replace local variables with a syntacticall valid name.""" - return source.replace(local_symbol, _LOCAL_TAG) +def _preparse(source, f=compose(_replace_locals, _replace_booleans, + _rewrite_assign)): + """Compose a collection of tokenization functions + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. -def _preparse(source): - """Compose assignment and boolean replacement.""" - return _replace_booleans(_rewrite_assign(source)) + Returns + ------- + s : str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), 'f must be callable' + return tokenize.untokenize(lmap(f, tokenize_string(source))) def _is_type(t): @@ -452,9 +308,6 @@ def visit(self, node, **kwargs): if isinstance(node, string_types): clean = self.preparser(node) node = ast.fix_missing_locations(ast.parse(clean)) - elif not isinstance(node, ast.AST): - raise TypeError("Cannot visit objects of type {0!r}" - "".format(node.__class__.__name__)) method = 'visit_' + node.__class__.__name__ visitor = getattr(self, method) @@ -549,8 +402,8 @@ def visit_BinOp(self, node, **kwargs): return self._possibly_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs, - truediv=self.env.locals['truediv']) + truediv = self.env.scope['truediv'] + return lambda lhs, rhs: Div(lhs, rhs, truediv) def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) @@ -677,7 +530,7 @@ def visit_Call(self, node, side=None, **kwargs): args = [self.visit(targ).value for targ in node.args] if node.starargs is not None: - args = args + self.visit(node.starargs).value + args += self.visit(node.starargs).value keywords = {} for key in node.keywords: @@ -741,7 +594,8 @@ def visitor(x, y): class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, - preparser=lambda x: _replace_locals(_replace_booleans(x))): + preparser=partial(_preparse, f=compose(_replace_locals, + _replace_booleans))): super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) @@ -768,21 +622,20 @@ class Expr(StringMixin): """ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, - truediv=True, level=2): + truediv=True, level=0): self.expr = expr - self.env = _ensure_scope(level=level, local_dict=env) + self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser + self.env.scope['truediv'] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() - self.truediv = truediv @property def 
assigner(self): return getattr(self._visitor, 'assigner', None) def __call__(self): - self.env.locals['truediv'] = self.truediv return self.terms(self.env) def __unicode__(self): @@ -795,10 +648,6 @@ def parse(self): """Parse an expression""" return self._visitor.visit(self.expr) - def align(self): - """align a set of Terms""" - return self.terms.align(self.env) - @property def names(self): """Get the names in an expression""" @@ -806,34 +655,5 @@ def names(self): return frozenset([self.terms.name]) return frozenset(term.name for term in com.flatten(self.terms)) - def check_name_clashes(self): - env = self.env - names = self.names - res_keys = frozenset(env.resolver_dict.keys()) & names - lcl_keys = frozenset(env.locals.keys()) & names - gbl_keys = frozenset(env.globals.keys()) & names - _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys) - - def add_resolvers_to_locals(self): - """Add the extra scope (resolvers) to local scope - - Notes - ----- - This should be done after parsing and pre-evaluation, otherwise - unnecessary name clashes will occur. - """ - self.env.locals.update(self.env.resolver_dict) - - -def isexpr(s, check_names=True): - """Strict checking for a valid expression.""" - try: - Expr(s, env=_ensure_scope() if check_names else None) - except SyntaxError: - return False - except NameError: - return not check_names - return True - _parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 270ba92d4483a..041ab77bb61f4 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -23,33 +23,18 @@ _LOCAL_TAG = '__pd_eval_local_' -_TAG_RE = re.compile('^{0}'.format(_LOCAL_TAG)) class UndefinedVariableError(NameError): """NameError subclass for local variables.""" - def __init__(self, *args): - msg = 'name {0!r} is not defined' - subbed = _TAG_RE.sub('', args[0]) - if subbed != args[0]: - subbed = '@' + subbed + def __init__(self, name, is_local): + if is_local: msg = 'local variable {0!r} is not defined' - super(UndefinedVariableError, self).__init__(msg.format(subbed)) - - -def _possibly_update_key(d, value, old_key, new_key=None): - if new_key is None: - new_key = old_key - - try: - del d[old_key] - except KeyError: - return False - else: - d[new_key] = value - return True + else: + msg = 'name {0!r} is not defined' + super(UndefinedVariableError, self).__init__(msg.format(name)) class Term(StringMixin): @@ -65,13 +50,13 @@ def __init__(self, name, env, side=None, encoding=None): self._name = name self.env = env self.side = side - self.local = _TAG_RE.search(text_type(name)) is not None + self.is_local = text_type(name).startswith(_LOCAL_TAG) self._value = self._resolve_name() self.encoding = encoding @property def local_name(self): - return _TAG_RE.sub('', self.name) + return self.name.replace(_LOCAL_TAG, '') def __unicode__(self): return com.pprint_thing(self.name) @@ -83,19 +68,13 @@ def evaluate(self, *args, **kwargs): return self def _resolve_name(self): - env = self.env key = self.name - res = env.resolve(self.local_name, globally=not self.local) + res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) - if res is None: - if not isinstance(key, string_types): - return key - raise UndefinedVariableError(key) - if hasattr(res, 'ndim') and res.ndim > 2: - raise NotImplementedError("N-dimensional objects, where N > 2, are" - " not supported with eval") + raise NotImplementedError("N-dimensional objects, where N > 2," + " are not supported with 
eval") return res def update(self, value): @@ -108,34 +87,11 @@ def update(self, value): ('locals', 'key'), ('globals', 'key')] """ - env = self.env key = self.name # if it's a variable name (otherwise a constant) if isinstance(key, string_types): - if self.local: - # get it's name WITHOUT the local tag (defined above) - local_name = self.local_name - - # search for the local in the above specified order - scope_pairs = product([env.locals, env.globals], - [local_name, key]) - - # a[::2] + a[1::2] but iterators - scope_iter = chain(islice(scope_pairs, None, None, 2), - islice(scope_pairs, 1, None, 2)) - for d, k in scope_iter: - if _possibly_update_key(d, value, k, key): - break - else: - raise UndefinedVariableError(key) - else: - # otherwise we look in resolvers -> locals -> globals - for r in (env.resolver_dict, env.locals, env.globals): - if _possibly_update_key(r, value, key): - break - else: - raise UndefinedVariableError(key) + self.env.swapkey(self.local_name, key, new_value=value) self.value = value @@ -191,10 +147,7 @@ def name(self, new_name): @property def ndim(self): - try: - return self._value.ndim - except AttributeError: - return 0 + return self._value.ndim class Constant(Term): @@ -374,7 +327,7 @@ def __call__(self, env): The result of an evaluated expression. """ # handle truediv - if self.op == '/' and env.locals['truediv']: + if self.op == '/' and env.scope['truediv']: self.func = op.truediv # recurse over the left/right nodes @@ -472,7 +425,7 @@ class Div(BinOp): regardless of the value of ``truediv``. """ - def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): + def __init__(self, lhs, rhs, truediv, *args, **kwargs): super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) if truediv or PY3: diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index bf477cd71df62..8fc842d958075 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,25 +7,24 @@ from datetime import datetime, timedelta import numpy as np import pandas as pd -from pandas.compat import u, string_types, PY3 +from pandas.compat import u, string_types, PY3, DeepChainMap from pandas.core.base import StringMixin import pandas.core.common as com from pandas.computation import expr, ops -from pandas.computation.ops import is_term +from pandas.computation.ops import is_term, UndefinedVariableError +from pandas.computation.scope import _ensure_scope from pandas.computation.expr import BaseExprVisitor from pandas.computation.common import _ensure_decoded from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type class Scope(expr.Scope): - __slots__ = 'globals', 'locals', 'queryables' - - def __init__(self, gbls=None, lcls=None, queryables=None, level=1): - super( - Scope, - self).__init__(gbls=gbls, - lcls=lcls, - level=level) + __slots__ = 'queryables', + + def __init__(self, level, global_dict=None, local_dict=None, + queryables=None): + super(Scope, self).__init__(level + 1, global_dict=global_dict, + local_dict=local_dict) self.queryables = queryables or dict() @@ -48,9 +47,11 @@ def _resolve_name(self): raise NameError('name {0!r} is not defined'.format(self.name)) return self.name - # resolve the rhs (and allow to be None) - return self.env.locals.get(self.name, - self.env.globals.get(self.name, self.name)) + # resolve the rhs (and allow it to be None) + try: + return self.env.resolve(self.name, is_local=False) + except UndefinedVariableError: + return self.name @property def value(self): @@ -66,10 +67,6 @@ def __init__(self, value, 
env, side=None, encoding=None): def _resolve_name(self): return self._name - @property - def name(self): - return self._value - class BinOp(ops.BinOp): @@ -232,9 +229,6 @@ def format(self): def evaluate(self): - if not isinstance(self.lhs, string_types): - return self - if not self.is_valid: raise ValueError("query term is not valid [%s]" % self) @@ -306,9 +300,6 @@ def format(self): def evaluate(self): - if not isinstance(self.lhs, string_types): - return self - if not self.is_valid: raise ValueError("query term is not valid [%s]" % self) @@ -389,9 +380,6 @@ def visit_UnaryOp(self, node, **kwargs): elif isinstance(node.op, ast.UAdd): raise NotImplementedError('Unary addition not supported') - def visit_USub(self, node, **kwargs): - return self.const_type(-self.visit(node.operand).value, self.env) - def visit_Index(self, node, **kwargs): return self.visit(node.value).value @@ -478,7 +466,7 @@ class Expr(expr.Expr): """ def __init__(self, where, op=None, value=None, queryables=None, - encoding=None, scope_level=None): + encoding=None, scope_level=0): # try to be back compat where = self.parse_back_compat(where, op, value) @@ -489,26 +477,24 @@ def __init__(self, where, op=None, value=None, queryables=None, self.terms = None self._visitor = None - # capture the environement if needed - lcls = dict() - if isinstance(where, Expr): + # capture the environment if needed + local_dict = DeepChainMap() - lcls.update(where.env.locals) + if isinstance(where, Expr): + local_dict = where.env.scope where = where.expr elif isinstance(where, (list, tuple)): - - for w in where: + for idx, w in enumerate(where): if isinstance(w, Expr): - lcls.update(w.env.locals) + local_dict = w.env.scope else: w = self.parse_back_compat(w) - + where[idx] = w where = ' & ' .join(["(%s)" % w for w in where]) self.expr = where - self.env = Scope(lcls=lcls) - self.env.update(scope_level) + self.env = Scope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, string_types): self.env.queryables.update(queryables) @@ -528,6 +514,15 @@ def parse_back_compat(self, w, op=None, value=None): warnings.warn("passing a dict to Expr is deprecated, " "pass the where as a single string", DeprecationWarning) + if isinstance(w, tuple): + if len(w) == 2: + w, value = w + op = '==' + elif len(w) == 3: + w, op, value = w + warnings.warn("passing a tuple into Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) if op is not None: if not isinstance(w, string_types): diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py new file mode 100644 index 0000000000000..004d8d39d5e82 --- /dev/null +++ b/pandas/computation/scope.py @@ -0,0 +1,306 @@ +"""Module for scope operations +""" + +import sys +import operator +import struct +import inspect +import datetime +import itertools +import pprint + +import pandas as pd +from pandas.compat import DeepChainMap, map, StringIO +from pandas.core import common as com +from pandas.core.base import StringMixin +from pandas.computation.ops import UndefinedVariableError, _LOCAL_TAG + + +def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(), + target=None, **kwargs): + """Ensure that we are grabbing the correct scope.""" + return Scope(level + 1, global_dict=global_dict, local_dict=local_dict, + resolvers=resolvers, target=target) + + +def _replacer(x): + """Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. 
+ """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj): + """Return the hexadecimal id of ``obj``.""" + # interpret as a pointer since that's really what id returns + packed = struct.pack('@P', id(obj)) + return ''.join(map(_replacer, packed)) + + + +_DEFAULT_GLOBALS = { + 'Timestamp': pd.lib.Timestamp, + 'datetime': datetime.datetime, + 'True': True, + 'False': False, + 'list': list, + 'tuple': tuple +} + + +def _get_pretty_string(obj): + """Return a prettier version of obj + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + s : str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope(StringMixin): + + """Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. + + Parameters + ---------- + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None + resolvers : list-like or None, optional, default None + target : object + + Attributes + ---------- + level : int + scope : DeepChainMap + target : object + temps : dict + """ + __slots__ = 'level', 'scope', 'target', 'temps' + + def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), + target=None): + self.level = level + 1 + + # shallow copy because we don't want to keep filling this up with what + # was there before if there are multiple calls to Scope/_ensure_scope + self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) + self.target = target + + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self.update(local_dict.level) + + frame = sys._getframe(self.level) + + try: + # shallow copy here because we don't want to replace what's in + # scope when we align terms (alignment accesses the underlying + # numpy array of pandas objects) + self.scope = self.scope.new_child((global_dict or + frame.f_globals).copy()) + if not isinstance(local_dict, Scope): + self.scope = self.scope.new_child((local_dict or + frame.f_locals).copy()) + finally: + del frame + + # assumes that resolvers are going from outermost scope to inner + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) + self.resolvers = DeepChainMap(*resolvers) + self.temps = {} + + def __unicode__(self): + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) + return '%s(scope=%s, resolvers=%s)' % (type(self).__name__, scope_keys, + res_keys) + + @property + def has_resolvers(self): + """Return whether we have any extra scope. + + For example, DataFrames pass their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``.
+ + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key, is_local): + """Resolve a variable name in a possibly local context + + Parameters + ---------- + key : text_type + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError: + raise UndefinedVariableError(key, is_local) + + def swapkey(self, old_key, new_key, new_value=None): + """Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes): + """Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, 'f_' + scope) + self.scope = self.scope.new_child(d) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def update(self, level): + """Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int or None, optional, default None + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=['locals']) + finally: + del stack[:], stack + + def add_tmp(self, value): + """Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + name : basestring + The name of the temporary variable created. 
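At the user level, the resolver and temporary-variable plumbing in this module is what ``DataFrame.query`` and ``DataFrame.eval`` ride on: column names resolve through the frame's resolvers, while the ``@`` prefix reaches back into the caller's scope. A usage-level sketch, with an invented frame and ``cutoff`` variable:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': np.random.randn(5), 'b': np.random.randn(5)})
    cutoff = 0.0

    # 'a' and 'b' come from the frame's resolvers, '@cutoff' from our locals
    result = df.query('a > b and a > @cutoff')

    # the same selection spelled as plain boolean indexing
    expected = df[(df.a > df.b) & (df.a > cutoff)]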
+ """ + name = '{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, + _raw_hex_id(self)) + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + def remove_tmp(self, name): + """Remove a temporary variable from this scope + + Parameters + ---------- + name : str + The name of a temporary to be removed + """ + del self.temps[name] + + @property + def ntemps(self): + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self): + """Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. + """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index bb700c0d594e8..0ce93c48d32f5 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -18,12 +18,11 @@ from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation import pytables -from pandas.computation.engines import _engines +from pandas.computation.engines import _engines, NumExprClobberingError from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor from pandas.computation.ops import (_binary_ops_dict, _special_case_arith_ops_syms, _arith_ops_syms, _bool_ops_syms) -from pandas.computation.common import NameResolutionError import pandas.computation.expr as expr import pandas.util.testing as tm @@ -74,23 +73,6 @@ def _bool_and_frame(lhs, rhs): return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame) -def skip_incompatible_operand(f): - @functools.wraps(f) - def wrapper(self, lhs, arith1, rhs, *args, **kwargs): - if _series_and_2d_ndarray(lhs, rhs): - self.assertRaises(Exception, pd.eval, 'lhs {0} rhs'.format(arith1), - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) - elif (np.isscalar(lhs) and np.isscalar(rhs) and arith1 in - _bool_ops_syms): - with tm.assertRaises(NotImplementedError): - pd.eval('lhs {0} rhs'.format(arith1), engine=self.engine, - parser=self.parser) - else: - f(self, lhs, arith1, rhs, *args, **kwargs) - return wrapper - - def _is_py3_complex_incompat(result, expected): return (PY3 and isinstance(expected, (complex, np.complexfloating)) and np.isnan(result)) @@ -200,7 +182,6 @@ def test_compound_invert_op(self): @slow def test_chained_cmp_op(self): mids = self.lhses - # tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) cmp_ops = '<', '>' for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, mids, cmp_ops, self.rhses): @@ -214,26 +195,11 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): scalar_with_in_notin = (np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in skip_these)) if scalar_with_in_notin: + with tm.assertRaises(TypeError): + pd.eval(ex, engine=self.engine, parser=self.parser) self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) - elif (_series_and_frame(lhs, rhs) and (cmp1 in - _series_frame_incompatible or - cmp2 in _series_frame_incompatible)): - self.assertRaises(TypeError, pd.eval, ex, - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) - elif _bool_and_frame(lhs, rhs): - self.assertRaises(TypeError, _eval_single_bin, lhs, '&', - rhs, 
self.engine) - self.assertRaises(TypeError, pd.eval, ex, - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) - elif (np.isscalar(lhs) and np.isnan(lhs) and - not np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in - skip_these)): - with tm.assertRaises(TypeError): - _eval_single_bin(lhs, binop, rhs, self.engine) else: lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) @@ -250,51 +216,17 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): # except AssertionError: #import ipdb; ipdb.set_trace() # raise - elif (np.isscalar(lhs_new) and np.isnan(lhs_new) and - not np.isscalar(rhs_new) and binop in skip_these): - with tm.assertRaises(TypeError): - _eval_single_bin(lhs_new, binop, rhs_new, self.engine) else: expected = _eval_single_bin( lhs_new, binop, rhs_new, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) - @skip_incompatible_operand def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): skip_these = _scalar_skip def check_operands(left, right, cmp_op): - if (np.isscalar(left) and np.isnan(left) and not np.isscalar(right) - and cmp_op in skip_these): - ex = 'left {0} right'.format(cmp_op) - with tm.assertRaises(ValueError): - pd.eval(ex, engine=self.engine, parser=self.parser) - return - if (np.isscalar(left) and np.isscalar(right) and - cmp_op in _bool_ops_syms): - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) - ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) - for ex in (ex1, ex2, ex3): - with tm.assertRaises(NotImplementedError): - pd.eval(ex, engine=self.engine, parser=self.parser) - return - if (np.isscalar(right) and not np.isscalar(left) and cmp_op in - skip_these): - self.assertRaises(Exception, _eval_single_bin, left, cmp_op, - right, self.engine) - elif _series_and_2d_ndarray(right, left): - self.assertRaises(Exception, _eval_single_bin, right, cmp_op, - left, self.engine) - elif (np.isscalar(right) and np.isscalar(left) and cmp_op in - skip_these): - self.assertRaises(Exception, _eval_single_bin, right, cmp_op, - left, self.engine) - else: - new = _eval_single_bin(left, cmp_op, right, self.engine) - return new - return + return _eval_single_bin(left, cmp_op, right, self.engine) lhs_new = check_operands(lhs, mid, cmp1) rhs_new = check_operands(mid, rhs, cmp2) @@ -310,7 +242,6 @@ def check_operands(left, right, cmp_op): parser=self.parser) assert_array_equal(result, expected) - @skip_incompatible_operand def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): @@ -322,7 +253,6 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs): result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) - @skip_incompatible_operand def check_binary_arith_op(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -345,9 +275,8 @@ def check_alignment(self, result, nlhs, ghs, op): expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) assert_array_equal(result, expected) - # the following 3 tests require special casing + # modulus, pow, and floor division require special casing - @skip_incompatible_operand def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -356,7 +285,6 @@ def check_modulus(self, lhs, 
arith1, rhs): expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) assert_allclose(result, expected) - @skip_incompatible_operand def check_floor_division(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) @@ -390,7 +318,6 @@ def get_expected_pow_result(self, lhs, rhs): raise return expected - @skip_incompatible_operand def check_pow(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) expected = self.get_expected_pow_result(lhs, rhs) @@ -409,7 +336,6 @@ def check_pow(self, lhs, arith1, rhs): self.get_expected_pow_result(lhs, rhs), rhs) assert_allclose(result, expected) - @skip_incompatible_operand def check_single_invert_op(self, lhs, cmp1, rhs): # simple for el in (lhs, rhs): @@ -426,7 +352,6 @@ def check_single_invert_op(self, lhs, cmp1, rhs): assert_array_equal(result, pd.eval('~elb', engine=engine, parser=self.parser)) - @skip_incompatible_operand def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) @@ -435,10 +360,6 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) - elif (np.isscalar(lhs) and np.isnan(lhs) and not np.isscalar(rhs) - and cmp1 in skip_these): - with tm.assertRaises(ValueError): - pd.eval(ex, engine=self.engine, parser=self.parser) else: # compound if np.isscalar(lhs) and np.isscalar(rhs): @@ -735,13 +656,14 @@ def setUpClass(cls): cls.engine = 'python' cls.parser = 'python' - @skip_incompatible_operand def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs % rhs assert_allclose(result, expected) - expected = eval('expected {0} rhs'.format(arith1)) + + expected = _eval_single_bin(expected, arith1, rhs, self.engine) assert_allclose(result, expected) def check_alignment(self, result, nlhs, ghs, op): @@ -1043,6 +965,7 @@ def tearDownClass(cls): def eval(self, *args, **kwargs): kwargs['engine'] = self.engine kwargs['parser'] = self.parser + kwargs['level'] = kwargs.pop('level', 0) + 1 return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): @@ -1114,10 +1037,10 @@ def test_truediv(self): d = {'s': s} if PY3: - res = self.eval(ex, truediv=False, local_dict=d) + res = self.eval(ex, truediv=False) assert_array_equal(res, np.array([1.0])) - res = self.eval(ex, truediv=True, local_dict=d) + res = self.eval(ex, truediv=True) assert_array_equal(res, np.array([1.0])) res = self.eval('1 / 2', truediv=True) @@ -1128,18 +1051,18 @@ def test_truediv(self): expec = 0.5 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + res = self.eval('s / 2', truediv=False) expec = 0.5 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + res = self.eval('s / 2', truediv=True) expec = 0.5 self.assertEqual(res, expec) else: - res = self.eval(ex, truediv=False, local_dict=d) + res = self.eval(ex, truediv=False) assert_array_equal(res, np.array([1])) - res = self.eval(ex, truediv=True, local_dict=d) + res = self.eval(ex, truediv=True) assert_array_equal(res, np.array([1.0])) res = self.eval('1 / 2', truediv=True) @@ -1150,18 +1073,18 @@ def test_truediv(self): expec = 0 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + res = self.eval('s / 2', truediv=False) expec = 0 self.assertEqual(res, expec) - res = self.eval('s / 2', 
truediv=True, local_dict={'s': s}) + res = self.eval('s / 2', truediv=True) expec = 0.5 self.assertEqual(res, expec) def test_failing_subscript_with_name_error(self): df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(NameError, self.eval, 'df[x > 2] > 2', - local_dict={'df': df}) + with tm.assertRaises(NameError): + self.eval('df[x > 2] > 2') def test_lhs_expression_subscript(self): df = DataFrame(np.random.randn(5, 3)) @@ -1232,8 +1155,11 @@ def f(): def f(): a = 1 - df.eval('a=a+b') - self.assertRaises(NameResolutionError, f) + old_a = df.a.copy() + df.eval('a = a + b') + assert_series_equal(old_a + df.b, df.a) + + f() # multiple assignment df = orig_df.copy() @@ -1486,34 +1412,6 @@ def test_invalid_parser(): parser='asdf') -def check_is_expr_syntax(engine): - tm.skip_if_no_ne(engine) - s = 1 - valid1 = 's + 1' - valid2 = '__y + _xx' - assert_true(expr.isexpr(valid1, check_names=False)) - assert_true(expr.isexpr(valid2, check_names=False)) - - -def check_is_expr_names(engine): - tm.skip_if_no_ne(engine) - r, s = 1, 2 - valid = 's + r' - invalid = '__y + __x' - assert_true(expr.isexpr(valid, check_names=True)) - assert_false(expr.isexpr(invalid, check_names=True)) - - -def test_is_expr_syntax(): - for engine in _engines: - yield check_is_expr_syntax, engine - - -def test_is_expr_names(): - for engine in _engines: - yield check_is_expr_names, engine - - _parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, 'pandas': PandasExprVisitor} @@ -1547,7 +1445,8 @@ def test_syntax_error_exprs(): def check_name_error_exprs(engine, parser): tm.skip_if_no_ne(engine) e = 's + t' - assert_raises(NameError, pd.eval, e, engine=engine, parser=parser) + with tm.assertRaises(NameError): + pd.eval(e, engine=engine, parser=parser) def test_name_error_exprs(): @@ -1582,6 +1481,89 @@ def test_invalid_numexpr_version(): yield check_invalid_numexpr_version, engine, parser +def check_invalid_local_variable_reference(engine, parser): + tm.skip_if_no_ne(engine) + + a, b = 1, 2 + exprs = 'a + @b', '@a + b', '@a + @b' + for expr in exprs: + if parser != 'pandas': + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is only"): + pd.eval(exprs, engine=engine, parser=parser) + else: + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is not"): + pd.eval(exprs, engine=engine, parser=parser) + + +def test_invalid_local_variable_reference(): + for engine, parser in ENGINES_PARSERS: + yield check_invalid_local_variable_reference, engine, parser + + +def check_numexpr_builtin_raises(engine, parser): + tm.skip_if_no_ne(engine) + sin, dotted_line = 1, 2 + if engine == 'numexpr': + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression .+'): + pd.eval('sin + dotted_line', engine=engine, parser=parser) + else: + res = pd.eval('sin + dotted_line', engine=engine, parser=parser) + tm.assert_equal(res, sin + dotted_line) + + +def test_numexpr_builtin_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_numexpr_builtin_raises, engine, parser + + +def check_bad_resolver_raises(engine, parser): + tm.skip_if_no_ne(engine) + cannot_resolve = 42, 3.0 + with tm.assertRaisesRegexp(TypeError, 'Resolver of type .+'): + pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, + parser=parser) + + +def test_bad_resolver_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_bad_resolver_raises, engine, parser + + +def check_more_than_one_expression_raises(engine, parser): + tm.skip_if_no_ne(engine) + with tm.assertRaisesRegexp(SyntaxError, + 'only a single 
expression is allowed'): + pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) + + +def test_more_than_one_expression_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_more_than_one_expression_raises, engine, parser + + +def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): + tm.skip_if_no_ne(engine) + mid = gen[type(lhs)]() + ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) + for ex in (ex1, ex2, ex3): + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=engine, parser=parser) + + +def test_bool_ops_fails_on_scalars(): + _bool_ops_syms = 'and', 'or' + dtypes = int, float + gen = {int: lambda : np.random.randint(10), float: np.random.randn} + for engine, parser, dtype1, cmp, dtype2 in product(_engines, expr._parsers, + dtypes, _bool_ops_syms, + dtypes): + yield (check_bool_ops_fails_on_scalars, gen, gen[dtype1](), cmp, + gen[dtype2](), engine, parser) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9c972c9795c47..f20c316393244 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -67,15 +67,6 @@ def unique(values): return _hashtable_algo(f, values.dtype) -# def count(values, uniques=None): -# f = lambda htype, caster: _count_generic(values, htype, caster) - -# if uniques is not None: -# raise NotImplementedError -# else: -# return _hashtable_algo(f, values.dtype) - - def _hashtable_algo(f, dtype): """ f(HashTable, type_caster) -> result @@ -88,16 +79,6 @@ def _hashtable_algo(f, dtype): return f(htable.PyObjectHashTable, com._ensure_object) -def _count_generic(values, table_type, type_caster): - from pandas.core.series import Series - - values = type_caster(values) - table = table_type(min(len(values), 1000000)) - uniques, labels = table.factorize(values) - - return Series(counts, index=uniques) - - def _match_generic(values, index, table_type, type_caster): values = type_caster(values) index = type_caster(index) diff --git a/pandas/core/base.py b/pandas/core/base.py index 36c5a65163fad..f9bf4ca4ce91d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -5,7 +5,6 @@ import numpy as np from pandas.core import common as com - class StringMixin(object): """implements string methods so long as object defines a `__unicode__` @@ -200,3 +199,90 @@ def __unicode__(self): prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + + +# facilitate the properties on the wrapped ops +def _field_accessor(name, docstring=None): + op_accessor = '_{0}'.format(name) + def f(self): + return self._ops_compat(name,op_accessor) + + f.__name__ = name + f.__doc__ = docstring + return property(f) + +class IndexOpsMixin(object): + """ common ops mixin to support a unified inteface / docs for Series / Index """ + + def _is_allowed_index_op(self, name): + if not self._allow_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _is_allowed_datetime_index_op(self, name): + if not self._allow_datetime_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _is_allowed_period_index_op(self, name): + if not 
self._allow_period_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _ops_compat(self, name, op_accessor): + from pandas.tseries.index import DatetimeIndex + from pandas.tseries.period import PeriodIndex + obj = self._get_access_object() + if isinstance(obj, DatetimeIndex): + self._is_allowed_datetime_index_op(name) + elif isinstance(obj, PeriodIndex): + self._is_allowed_period_index_op(name) + try: + return self._wrap_access_object(getattr(obj,op_accessor)) + except AttributeError: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(obj))) + + def _get_access_object(self): + if isinstance(self, com.ABCSeries): + return self.index + return self + + def _wrap_access_object(self, obj): + # we may need to coerce the input as we don't want non int64 if + # we have an integer result + if hasattr(obj,'dtype') and com.is_integer_dtype(obj): + obj = obj.astype(np.int64) + + if isinstance(self, com.ABCSeries): + return self._constructor(obj,index=self.index).__finalize__(self) + + return obj + + def max(self): + """ The maximum value of the object """ + self._is_allowed_index_op('max') + return self.values.max() + + def min(self): + """ The minimum value of the object """ + self._is_allowed_index_op('min') + return self.values.min() + + date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps') + time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps') + year = _field_accessor('year', "The year of the datetime") + month = _field_accessor('month', "The month as January=1, December=12") + day = _field_accessor('day', "The days of the datetime") + hour = _field_accessor('hour', "The hours of the datetime") + minute = _field_accessor('minute', "The minutes of the datetime") + second = _field_accessor('second', "The seconds of the datetime") + microsecond = _field_accessor('microsecond', "The microseconds of the datetime") + nanosecond = _field_accessor('nanosecond', "The nanoseconds of the datetime") + weekofyear = _field_accessor('weekofyear', "The week ordinal of the year") + week = weekofyear + dayofweek = _field_accessor('dayofweek', "The day of the week with Monday=0, Sunday=6") + weekday = dayofweek + dayofyear = _field_accessor('dayofyear', "The ordinal day of the year") + quarter = _field_accessor('quarter', "The quarter of the date") + qyear = _field_accessor('qyear') diff --git a/pandas/core/common.py b/pandas/core/common.py index 785c1f45db607..dadd21f8fc128 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -121,6 +121,10 @@ def isnull(obj): isnulled : array-like of bool or bool Array or bool indicating whether an object is null or if an array is given which of the element is null. + + See also + -------- + pandas.notnull: boolean inverse of pandas.isnull """ return _isnull(obj) @@ -268,6 +272,10 @@ def notnull(obj): isnulled : array-like of bool or bool Array or bool indicating whether an object is *not* null or if an array is given which of the element is *not* null. 
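For orientation, a rough sketch of what the IndexOpsMixin above is wired to do for a Series backed by a DatetimeIndex. The data is illustrative, and it assumes the wrapped index exposes the underscore accessors (`_year`, `_month`, ...) that `_ops_compat` looks up via `op_accessor`:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2014-01-01', periods=3, freq='D')
    s = pd.Series(np.arange(3), index=idx)

    s.year                 # property -> _ops_compat('year', '_year') on the DatetimeIndex
    s.month                # result is re-wrapped as a Series aligned to s.index
    idx.min(), idx.max()   # plain index ops, gated by _is_allowed_index_op
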
+ + See also + -------- + pandas.isnull : boolean inverse of pandas.notnull """ res = isnull(obj) if np.isscalar(res): @@ -303,11 +311,14 @@ def array_equivalent(left, right): >>> array_equivalent(np.array([1, nan, 2]), np.array([1, 2, nan])) False """ + left, right = np.asarray(left), np.asarray(right) if left.shape != right.shape: return False # NaNs occur only in object arrays, float or complex arrays. + if issubclass(left.dtype.type, np.object_): + return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all() if not issubclass(left.dtype.type, (np.floating, np.complexfloating)): return np.array_equal(left, right) - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() def _iterable_not_string(x): return (isinstance(x, collections.Iterable) and @@ -1236,13 +1247,14 @@ def wrapper(arr, mask, limit=None): np.int64) -def pad_1d(values, limit=None, mask=None): +def pad_1d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'pad_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_1d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1251,7 +1263,7 @@ def pad_1d(values, limit=None, mask=None): _method = algos.pad_inplace_object if _method is None: - raise ValueError('Invalid dtype for pad_1d [%s]' % dtype) + raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) if mask is None: mask = isnull(values) @@ -1260,13 +1272,14 @@ def pad_1d(values, limit=None, mask=None): return values -def backfill_1d(values, limit=None, mask=None): +def backfill_1d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'backfill_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_1d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1275,7 +1288,7 @@ def backfill_1d(values, limit=None, mask=None): _method = algos.backfill_inplace_object if _method is None: - raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype) + raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) if mask is None: mask = isnull(values) @@ -1285,13 +1298,14 @@ def backfill_1d(values, limit=None, mask=None): return values -def pad_2d(values, limit=None, mask=None): +def pad_2d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'pad_2d_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1300,7 +1314,7 @@ def pad_2d(values, limit=None, mask=None): _method = algos.pad_2d_inplace_object if _method is None: - raise ValueError('Invalid dtype for pad_2d [%s]' % dtype) + raise ValueError('Invalid dtype for pad_2d [%s]' % dtype.name) if mask is None: 
mask = isnull(values) @@ -1314,13 +1328,14 @@ def pad_2d(values, limit=None, mask=None): return values -def backfill_2d(values, limit=None, mask=None): +def backfill_2d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1329,7 +1344,7 @@ def backfill_2d(values, limit=None, mask=None): _method = algos.backfill_2d_inplace_object if _method is None: - raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype) + raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: mask = isnull(values) @@ -1495,7 +1510,7 @@ def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, return new_y -def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): +def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, dtype=None): """ perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result """ @@ -1517,9 +1532,9 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): method = _clean_fill_method(method) if method == 'pad': - values = transf(pad_2d(transf(values), limit=limit, mask=mask)) + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) else: - values = transf(backfill_2d(transf(values), limit=limit, mask=mask)) + values = transf(backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) # reshape back if ndim == 1: @@ -1719,10 +1734,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): dtype = value.dtype if dtype.kind == 'M' and dtype != _NS_DTYPE: - try: - value = tslib.array_to_datetime(value) - except: - raise + value = value.astype(_NS_DTYPE) elif dtype.kind == 'm' and dtype != _TD_DTYPE: from pandas.tseries.timedeltas import \ @@ -1991,18 +2003,6 @@ def intersection(*seqs): return type(seqs[0])(list(result)) -def _shift_indexer(N, periods): - # small reusable utility - indexer = np.zeros(N, dtype=int) - - if periods > 0: - indexer[periods:] = np.arange(N - periods) - else: - indexer[:periods] = np.arange(-periods, N) - - return indexer - - def _asarray_tuplesafe(values, dtype=None): from pandas.core.index import Index diff --git a/pandas/core/format.py b/pandas/core/format.py index 04413970440b9..537fdc6cd0a27 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -943,7 +943,7 @@ def grouper(x): class CSVFormatter(object): - def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, + def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, engine=None, @@ -953,6 +953,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.engine = engine # remove for 0.13 self.obj = obj + if path_or_buf is None: + path_or_buf = StringIO() + self.path_or_buf = path_or_buf self.sep = sep self.na_rep = na_rep @@ -1144,7 +1147,7 @@ def strftime_with_nulls(x): def save(self): # create the writer & save - if hasattr(self.path_or_buf, 'read'): + if 
hasattr(self.path_or_buf, 'write'): f = self.path_or_buf close = False else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e66e09624a04f..134ca9a87aeb8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,7 +36,7 @@ from pandas.core.series import Series import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval -from pandas.computation.expr import _ensure_scope +from pandas.computation.scope import _ensure_scope from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -1067,7 +1067,7 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, + def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, @@ -1077,8 +1077,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, Parameters ---------- - path_or_buf : string or file handle / StringIO - File path + path_or_buf : string or file handle, default None + File path or object, if None is provided the result is returned as + a string. sep : character, default "," Field delimiter for the output file. na_rep : string, default '' @@ -1144,6 +1145,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, escapechar=escapechar) formatter.save() + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, @@ -1216,7 +1220,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def to_stata( self, fname, convert_dates=None, write_index=True, encoding="latin-1", - byteorder=None): + byteorder=None, time_stamp=None, data_label=None): """ A class for writing Stata binary dta files from array-like objects @@ -1247,7 +1251,8 @@ def to_stata( """ from pandas.io.stata import StataWriter writer = StataWriter(fname, self, convert_dates=convert_dates, - encoding=encoding, byteorder=byteorder) + encoding=encoding, byteorder=byteorder, + time_stamp=time_stamp, data_label=data_label) writer.write_file() def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs): @@ -1519,7 +1524,7 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover #---------------------------------------------------------------------- # Getting and setting elements - def get_value(self, index, col): + def get_value(self, index, col, takeable=False): """ Quickly retrieve single value at passed column and index @@ -1527,16 +1532,22 @@ def get_value(self, index, col): ---------- index : row label col : column label + takeable : interpret the index/col as indexers, default False Returns ------- value : scalar value """ + + if takeable is True: + series = self._iget_item_cache(col) + return series.values[index] + series = self._get_item_cache(col) engine = self.index._engine return engine.get_value(series.values, index) - def set_value(self, index, col, value): + def set_value(self, index, col, value, takeable=False): """ Put single value at passed column and index @@ -1545,6 +1556,7 @@ def set_value(self, index, col, value): index : row label col : column label value : scalar value + takeable : interpret the 
index/col as indexers, default False Returns ------- @@ -1553,6 +1565,10 @@ def set_value(self, index, col, value): otherwise a new object """ try: + if takeable is True: + series = self._iget_item_cache(col) + return series.set_value(index, value, takeable=True) + series = self._get_item_cache(col) engine = self.index._engine engine.set_value(series.values, index, value) @@ -1738,26 +1754,30 @@ def _getitem_frame(self, key): def query(self, expr, **kwargs): """Query the columns of a frame with a boolean expression. + .. versionadded:: 0.13 + Parameters ---------- expr : string - The query string to evaluate. The result of the evaluation of this - expression is first passed to :attr:`~pandas.DataFrame.loc` and if - that fails because of a multidimensional key (e.g., a DataFrame) - then the result will be passed to - :meth:`~pandas.DataFrame.__getitem__`. + The query string to evaluate. You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. kwargs : dict - See the documentation for :func:`~pandas.eval` for complete details - on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + See the documentation for :func:`pandas.eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. Returns ------- - q : DataFrame or Series + q : DataFrame Notes ----- - This method uses the top-level :func:`~pandas.eval` function to + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`pandas.eval` function to evaluate the passed query. The :meth:`~pandas.DataFrame.query` method uses a slightly @@ -1773,12 +1793,12 @@ def query(self, expr, **kwargs): recommended as it is inefficient compared to using ``numexpr`` as the engine. - The :attr:`~pandas.DataFrame.index` and - :attr:`~pandas.DataFrame.columns` attributes of the - :class:`~pandas.DataFrame` instance is placed in the namespace by - default, which allows you to treat both the index and columns of the + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the frame as a column in the frame. - The identifier ``index`` is used for this variable, and you can also + The identifier ``index`` is used for the frame index; you can also use the name of the index to identify it in a query. 
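The reworked docstring above describes the '@' environment-variable syntax for query strings; a short usage sketch (column names and the cutoff value are illustrative):

    import pandas as pd
    from numpy.random import randn

    df = pd.DataFrame(randn(10, 2), columns=list('ab'))
    cutoff = 0.5                      # a variable from the calling environment

    df.query('a > b')                 # column vs. column comparison
    df.query('a > @cutoff')           # '@' resolves `cutoff` from the calling scope
    df.query('index < 5')             # the frame's index is queryable as `index`
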
For further details and examples see the ``query`` documentation in @@ -1797,18 +1817,7 @@ def query(self, expr, **kwargs): >>> df.query('a > b') >>> df[df.a > df.b] # same result as the previous expression """ - # need to go up at least 4 stack frames - # 4 expr.Scope - # 3 expr._ensure_scope - # 2 self.eval - # 1 self.query - # 0 self.query caller (implicit) - level = kwargs.setdefault('level', 4) - if level < 4: - raise ValueError("Going up fewer than 4 stack frames will not" - " capture the necessary variable scope for a " - "query expression") - + kwargs['level'] = kwargs.pop('level', 0) + 1 res = self.eval(expr, **kwargs) try: @@ -1852,21 +1861,17 @@ def eval(self, expr, **kwargs): >>> from pandas import DataFrame >>> df = DataFrame(randn(10, 2), columns=list('ab')) >>> df.eval('a + b') - >>> df.eval('c=a + b') + >>> df.eval('c = a + b') """ resolvers = kwargs.pop('resolvers', None) + kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: - index_resolvers = self._get_resolvers() - resolvers = [self, index_resolvers] - kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) + index_resolvers = self._get_index_resolvers() + resolvers = index_resolvers, dict(self.iteritems()) kwargs['target'] = self + kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers return _eval(expr, **kwargs) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - axis = self._get_block_manager_axis(axis) - new_data = self._data.get_slice( - slobj, axis=axis, raise_on_error=raise_on_error) - return self._constructor(new_data) def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] @@ -2033,6 +2038,8 @@ def _sanitize_column(self, key, value): value = com._asarray_tuplesafe(value) elif isinstance(value, PeriodIndex): value = value.asobject + elif isinstance(value, DatetimeIndex): + value = value._to_embed(keep_tz=True).copy() elif value.ndim == 2: value = value.copy().T else: @@ -2233,7 +2240,15 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, Series): + if isinstance(col, MultiIndex): + # append all but the last column so we don't have to modify + # the end of this loop + for n in range(col.nlevels - 1): + arrays.append(col.get_level_values(n)) + + level = col.get_level_values(col.nlevels - 1) + names.extend(col.names) + elif isinstance(col, (Series, Index)): level = col.values names.append(col.name) elif isinstance(col, (list, np.ndarray)): @@ -2414,8 +2429,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, agg_obj = self if subset is not None: - agg_axis_name = self._get_axis_name(agg_axis) - agg_obj = self.reindex(**{agg_axis_name: subset}) + ax = self._get_axis(agg_axis) + agg_obj = self.take(ax.get_indexer_for(subset),axis=agg_axis) count = agg_obj.count(axis=agg_axis) @@ -2612,11 +2627,14 @@ def trans(v): if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(by)) - indexer = k.argsort(kind=kind) if isinstance(ascending, (tuple, list)): ascending = ascending[0] + + if not ascending: + k = k[::-1] + indexer = k.argsort(kind=kind) if not ascending: - indexer = indexer[::-1] + indexer = indexer.max() - indexer[::-1] elif isinstance(labels, MultiIndex): indexer = _lexsort_indexer(labels.labels, orders=ascending) indexer = com._ensure_platform_int(indexer) @@ -3840,7 +3858,7 @@ def count(self, axis=0, level=None, numeric_only=False): ---------- axis : {0, 1} 0 for row-wise, 1 for column-wise - level : int, default None + level 
: int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame numeric_only : boolean, default False @@ -3916,7 +3934,7 @@ def any(self, axis=None, bool_only=None, skipna=True, level=None, skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame bool_only : boolean, default None @@ -3947,7 +3965,7 @@ def all(self, axis=None, bool_only=None, skipna=True, level=None, skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame bool_only : boolean, default None @@ -4172,11 +4190,12 @@ def rank(self, axis=0, numeric_only=None, method='average', Ranks over columns (0) or rows (1) numeric_only : boolean, default None Include only float, int, boolean data - method : {'average', 'min', 'max', 'first'} + method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups na_option : {'keep', 'top', 'bottom'} * keep: leave NA values where they are * top: smallest rank if ascending diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d607be6bfb733..7d00cef5c66bb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -201,6 +201,7 @@ def _setup_axes( def set_axis(a, i): setattr(cls, a, lib.AxisProperty(i)) + cls._internal_names_set.add(a) if axes_are_reversed: m = cls._AXIS_LEN - 1 @@ -349,7 +350,7 @@ def _get_axis_resolvers(self, axis): d[axis] = dindex return d - def _get_resolvers(self): + def _get_index_resolvers(self): d = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) @@ -392,6 +393,10 @@ def _expand_axes(self, key): return new_axes + def set_axis(self, axis, labels): + """ public verson of axis assignment """ + setattr(self,self._get_axis_name(axis),labels) + def _set_axis(self, axis, labels): self._data.set_axis(axis, labels) self._clear_item_cache() @@ -999,6 +1004,7 @@ def __getitem__(self, item): return self._get_item_cache(item) def _get_item_cache(self, item): + """ return the cached item, item represents a label indexer """ cache = self._item_cache res = cache.get(item) if res is None: @@ -1016,6 +1022,15 @@ def _set_as_cached(self, item, cacher): a weakref to cacher """ self._cacher = (item, weakref.ref(cacher)) + def _iget_item_cache(self, item): + """ return the cached item, item represents a positional indexer """ + ax = self._info_axis + if ax.is_unique: + lower = self._get_item_cache(ax[item]) + else: + lower = self.take(item, axis=self._info_axis_number, convert=True) + return lower + def _box_item_values(self, key, values): raise NotImplementedError @@ -1064,6 +1079,16 @@ def _clear_item_cache(self, i=None): else: self._item_cache.clear() + def _slice(self, slobj, axis=0, typ=None): + """ + Construct a slice of this container. + + typ parameter is maintained for compatibility with Series slicing. 
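`_iget_item_cache` above is the positional counterpart of `_get_item_cache`; together with the `takeable` flag added to `get_value`/`set_value` it backs `.iat`. A small illustration (data made up):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

    df.get_value(0, 'A')                     # label-based lookup -> 1
    df.get_value(1, 0, takeable=True)        # positional: column 0, row 1 -> 2
    df.iat[1, 0]                             # .iat routes through the takeable path
    df.set_value(2, 1, 99, takeable=True)    # positional set: column 'B', row 2
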
+ + """ + axis = self._get_block_manager_axis(axis) + return self._constructor(self._data.get_slice(slobj, axis=axis)) + def _set_item(self, key, value): self._data.set(key, value) self._clear_item_cache() @@ -1307,7 +1332,11 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True): new_values, copy = self._data.fast_xs(loc, copy=copy) # may need to box a datelike-scalar - if not is_list_like(new_values): + # + # if we encounter an array-like and we only have 1 dim + # that means that their are list/ndarrays inside the Series! + # so just return them (GH 6394) + if not is_list_like(new_values) or self.ndim == 1: return _maybe_box_datetimelike(new_values) result = Series(new_values, index=self.columns, @@ -1380,7 +1409,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): ---------- labels : single label or list-like axis : int or axis name - level : int or name, default None + level : int or level name, default None For MultiIndex inplace : bool, default False If True, do operation inplace and return None. @@ -1580,27 +1609,14 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy, axis = self._get_axis_number(a) ax = self._get_axis(a) - try: - new_index, indexer = ax.reindex( - labels, level=level, limit=limit, method=method, - takeable=takeable) - except (ValueError): - - # catch trying to reindex a non-monotonic index with a - # specialized indexer e.g. pad, so fallback to the regular - # indexer this will show up on reindexing a not-naturally - # ordering series, - # e.g. - # Series( - # [1,2,3,4], index=['a','b','c','d'] - # ).reindex(['c','b','g'], method='pad') - new_index, indexer = ax.reindex( - labels, level=level, limit=limit, method=None, - takeable=takeable) + new_index, indexer = ax.reindex( + labels, level=level, limit=limit, method=method, + takeable=takeable) obj = obj._reindex_with_indexers( {axis: [new_index, indexer]}, method=method, - fill_value=fill_value, limit=limit, copy=copy) + fill_value=fill_value, limit=limit, copy=copy, + allow_dups=takeable) return obj @@ -2332,8 +2348,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, value_dict = {} for k, v in items: - to_rep_dict[k] = list(v.keys()) - value_dict[k] = list(v.values()) + keys, values = zip(*v.items()) + if set(keys) & set(values): + raise ValueError("Replacement not allowed with " + "overlapping keys and values") + to_rep_dict[k] = list(keys) + value_dict[k] = list(values) to_replace, value = to_rep_dict, value_dict else: @@ -2430,7 +2450,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self._constructor(new_data).__finalize__(self) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - downcast='infer', **kwargs): + downcast=None, **kwargs): """ Interpolate values according to different methods. @@ -2463,7 +2483,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, Maximum number of consecutive NaNs to fill. inplace : bool, default False Update the NDFrame in place if possible. - downcast : optional, 'infer' or None, defaults to 'infer' + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. 
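The nested-dict branch of `replace` above now refuses mappings whose keys and values overlap, since the substitution order would be ambiguous; roughly:

    import pandas as pd

    df = pd.DataFrame({'A': [0, 1, 2]})

    df.replace({'A': {0: 10, 1: 20}})   # fine: keys {0, 1} and values {10, 20} are disjoint
    df.replace({'A': {0: 1, 1: 2}})     # ValueError: overlapping keys and values
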
Returns @@ -2487,7 +2507,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, dtype: float64 """ - if self.ndim > 2: raise NotImplementedError("Interpolate has not been implemented " "on Panel and Panel 4D objects.") @@ -2529,7 +2548,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, inplace=inplace, downcast=downcast, **kwargs) - if inplace: if axis == 1: self._update_inplace(new_data) @@ -2548,12 +2566,20 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def isnull(self): """ Return a boolean same-sized object indicating if the values are null + + See also + -------- + notnull : boolean inverse of isnull """ return isnull(self).__finalize__(self) def notnull(self): """Return a boolean same-sized object indicating if the values are not null + + See also + -------- + isnull : boolean inverse of notnull """ return notnull(self).__finalize__(self) @@ -2853,7 +2879,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, join : {'outer', 'inner', 'left', 'right'}, default 'outer' axis : allowed axis of the other object, default None Align on index (0), columns (1), or both (None) - level : int or name + level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level copy : boolean, default True @@ -3154,12 +3180,14 @@ def shift(self, periods=1, freq=None, axis=0, **kwds): periods : int Number of periods to move, can be positive or negative freq : DateOffset, timedelta, or time rule string, optional - Increment to use from datetools module or time rule (e.g. 'EOM') + Increment to use from datetools module or time rule (e.g. 'EOM'). + See Notes. Notes ----- If freq is specified then the index values are shifted but the data - if not realigned + is not realigned. That is, use freq if you would like to extend the + index when shifting and preserve the original data. Returns ------- @@ -3169,9 +3197,7 @@ def shift(self, periods=1, freq=None, axis=0, **kwds): return self if freq is None and not len(kwds): - block_axis = self._get_block_manager_axis(axis) - indexer = com._shift_indexer(len(self._get_axis(axis)), periods) - new_data = self._data.shift(indexer=indexer, periods=periods, axis=block_axis) + new_data = self._data.shift(periods=periods, axis=axis) else: return self.tshift(periods, freq, **kwds) @@ -3288,7 +3314,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): def tz_convert(self, tz, axis=0, copy=True): """ - Convert TimeSeries to target time zone. If it is time zone naive, it + Convert the axis to target time zone. If it is time zone naive, it will be localized to the passed time zone. 
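Backing up to the `shift` clarification above (a `freq` shifts the index, not the data), a quick illustration:

    import pandas as pd

    s = pd.Series([1, 2, 3], index=pd.date_range('2014-01-01', periods=3))

    s.shift(1)             # data slides down: [NaN, 1, 2], index unchanged
    s.shift(1, freq='D')   # index moves forward one day, data stays attached to its labels
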
Parameters @@ -3304,24 +3330,18 @@ def tz_convert(self, tz, axis=0, copy=True): ax = self._get_axis(axis) if not hasattr(ax, 'tz_convert'): - ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % - ax_name) - - new_data = self._data - if copy: - new_data = new_data.copy() - - new_obj = self._constructor(new_data) - new_ax = ax.tz_convert(tz) - - if axis == 0: - new_obj._set_axis(1, new_ax) - elif axis == 1: - new_obj._set_axis(0, new_ax) - self._clear_item_cache() + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + else: + ax = DatetimeIndex([],tz=tz) + else: + ax = ax.tz_convert(tz) - return new_obj.__finalize__(self) + result = self._constructor(self._data, copy=copy) + result.set_axis(axis,ax) + return result.__finalize__(self) def tz_localize(self, tz, axis=0, copy=True, infer_dst=False): """ @@ -3342,24 +3362,18 @@ def tz_localize(self, tz, axis=0, copy=True, infer_dst=False): ax = self._get_axis(axis) if not hasattr(ax, 'tz_localize'): - ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % - ax_name) - - new_data = self._data - if copy: - new_data = new_data.copy() - - new_obj = self._constructor(new_data) - new_ax = ax.tz_localize(tz, infer_dst=infer_dst) - - if axis == 0: - new_obj._set_axis(1, new_ax) - elif axis == 1: - new_obj._set_axis(0, new_ax) - self._clear_item_cache() + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + else: + ax = DatetimeIndex([],tz=tz) + else: + ax = ax.tz_localize(tz, infer_dst=infer_dst) - return new_obj.__finalize__(self) + result = self._constructor(self._data, copy=copy) + result.set_axis(axis,ax) + return result.__finalize__(self) #---------------------------------------------------------------------- # Numeric Methods @@ -3448,7 +3462,7 @@ def _add_numeric_operations(cls): skipna : boolean, default True Exclude NA/null values. 
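The rewritten `tz_convert`/`tz_localize` above now go through the public `set_axis`, and an empty non-datetime axis is replaced by an empty tz-aware DatetimeIndex instead of raising. A sketch of the intended behaviour (zone names illustrative):

    import pandas as pd

    idx = pd.date_range('2014-01-01 09:00', periods=3, freq='H')
    s = pd.Series(range(3), index=idx)

    s_utc = s.tz_localize('UTC')             # attach a zone to a naive index
    s_utc.tz_convert('US/Eastern')           # convert to another zone

    pd.Series([]).tz_localize('UTC').index   # empty axis -> DatetimeIndex([], tz='UTC')
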
If an entire row/column is NA, the result will be NA -level : int, default None +level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a """ + name + """ numeric_only : boolean, default None diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0919309afd434..86590d2319447 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -208,6 +208,8 @@ class GroupBy(PandasObject): Number of groups """ _apply_whitelist = _common_apply_whitelist + _internal_names = ['_cache'] + _internal_names_set = set(_internal_names) def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, @@ -288,10 +290,12 @@ def _local_dir(self): return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) def __getattr__(self, attr): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) if attr in self.obj: return self[attr] - if hasattr(self.obj, attr) and attr != '_cache': + if hasattr(self.obj, attr): return self._make_wrapper(attr) raise AttributeError("%r object has no attribute %r" % @@ -302,18 +306,18 @@ def __getitem__(self, key): def _make_wrapper(self, name): if name not in self._apply_whitelist: - is_callable = callable(getattr(self.obj, name, None)) + is_callable = callable(getattr(self._selected_obj, name, None)) kind = ' callable ' if is_callable else ' ' msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try " "using the 'apply' method".format(kind, name, type(self).__name__)) raise AttributeError(msg) - f = getattr(self.obj, name) + f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) - f = getattr(type(self.obj), name) + f = getattr(type(self._selected_obj), name) def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis @@ -362,7 +366,7 @@ def get_group(self, name, obj=None): group : type of obj """ if obj is None: - obj = self.obj + obj = self._selected_obj inds = self._get_index(name) return obj.take(inds, axis=self.axis, convert=False) @@ -424,7 +428,8 @@ def f(g): return self._python_apply_general(f) def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self.obj, self.axis) + keys, values, mutated = self.grouper.apply(f, self._selected_obj, + self.axis) return self._wrap_applied_output(keys, values, not_indexed_same=mutated) @@ -437,7 +442,7 @@ def agg(self, func, *args, **kwargs): return self.aggregate(func, *args, **kwargs) def _iterate_slices(self): - yield self.name, self.obj + yield self.name, self._selected_obj def transform(self, func, *args, **kwargs): raise NotImplementedError @@ -523,13 +528,73 @@ def ohlc(self): """ return self._cython_agg_general('ohlc') - def nth(self, n): - def picker(arr): - arr = arr[notnull(arr)] - if len(arr) >= n + 1: - return arr.iget(n) + def nth(self, n, dropna=None): + """ + Take the nth row from each group. + + If dropna, will not show nth non-null row, dropna is either + Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent + to calling dropna(how=dropna) before the groupby. 
+ + Examples + -------- + >>> DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + A B + 0 1 NaN + 2 5 6 + >>> g.nth(1) + A B + 1 1 4 + >>> g.nth(-1) + A B + 1 1 4 + 2 5 6 + >>> g.nth(0, dropna='any') + B + A + 1 4 + 5 6 + >>> g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + B + A + 1 NaN + 5 NaN + + """ + + if not dropna: # good choice + m = self.grouper._max_groupsize + if n >= m or n < -m: + return self._selected_obj.loc[[]] + rng = np.zeros(m, dtype=bool) + if n >= 0: + rng[n] = True + is_nth = self._cumcount_array(rng) else: + rng[- n - 1] = True + is_nth = self._cumcount_array(rng, ascending=False) + return self._selected_obj[is_nth] + + if (isinstance(self._selected_obj, DataFrame) + and dropna not in ['any', 'all']): + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed %s)." % (dropna),) + + # old behaviour, but with all and any support for DataFrames. + + max_len = n if n >= 0 else - 1 - n + + def picker(x): + x = x.dropna(how=dropna) # Note: how is ignored if Series + if len(x) <= max_len: return np.nan + else: + return x.iloc[n] + return self.agg(picker) def cumcount(self, **kwargs): @@ -578,16 +643,16 @@ def cumcount(self, **kwargs): """ ascending = kwargs.pop('ascending', True) - index = self.obj.index - rng = np.arange(self.grouper._max_groupsize, dtype='int64') - cumcounts = self._cumcount_array(rng, ascending=ascending) + index = self._selected_obj.index + cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) def head(self, n=5): """ Returns first n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.head(n))`` + Essentially equivalent to ``.apply(lambda x: x.head(n))``, + except ignores as_index flag. Example ------- @@ -599,24 +664,22 @@ def head(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(1) - A B - A - 1 0 1 2 - 5 2 5 6 + A B + 0 1 2 + 2 5 6 """ - rng = np.arange(self.grouper._max_groupsize, dtype='int64') - in_head = self._cumcount_array(rng) < n - head = self.obj[in_head] - if self.as_index: - head.index = self._index_with_as_index(in_head) + obj = self._selected_obj + in_head = self._cumcount_array() < n + head = obj[in_head] return head def tail(self, n=5): """ Returns last n rows of each group - Essentially equivalent to ``.apply(lambda x: x.tail(n))`` + Essentially equivalent to ``.apply(lambda x: x.tail(n))``, + except ignores as_index flag. 
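As the updated `head`/`tail` docstrings above note, the result now keeps the original (un-grouped) index regardless of `as_index`; `cumcount` gives each row its position within its group. Using the same toy frame as the docstring:

    import pandas as pd

    df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A', as_index=True)

    g.head(1)        # rows 0 and 2, original index preserved
    g.cumcount()     # Series([0, 1, 0]): position of each row within its group
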
Example ------- @@ -628,24 +691,29 @@ def tail(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(1) - A B - A - 1 0 1 2 - 5 2 5 6 + A B + 0 1 2 + 2 5 6 """ + obj = self._selected_obj rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') in_tail = self._cumcount_array(rng, ascending=False) > -n - tail = self.obj[in_tail] - if self.as_index: - tail.index = self._index_with_as_index(in_tail) + tail = obj[in_tail] return tail - def _cumcount_array(self, arr, **kwargs): + def _cumcount_array(self, arr=None, **kwargs): + """ + arr is where cumcount gets it's values from + """ ascending = kwargs.pop('ascending', True) - len_index = len(self.obj.index) - cumcounts = np.zeros(len_index, dtype='int64') + if arr is None: + arr = np.arange(self.grouper._max_groupsize, dtype='int64') + + len_index = len(self._selected_obj.index) + cumcounts = np.empty(len_index, dtype=arr.dtype) + if ascending: for v in self.indices.values(): cumcounts[v] = arr[:len(v)] @@ -654,6 +722,13 @@ def _cumcount_array(self, arr, **kwargs): cumcounts[v] = arr[len(v)-1::-1] return cumcounts + @cache_readonly + def _selected_obj(self): + if self._selection is None or isinstance(self.obj, Series): + return self.obj + else: + return self.obj[self._selection] + def _index_with_as_index(self, b): """ Take boolean mask of index to be returned from apply, if as_index=True @@ -661,7 +736,7 @@ def _index_with_as_index(self, b): """ # TODO perf, it feels like this should already be somewhere... from itertools import chain - original = self.obj.index + original = self._selected_obj.index gp = self.grouper levels = chain((gp.levels[i][gp.labels[i][b]] for i in range(len(gp.groupings))), @@ -743,7 +818,7 @@ def _concat_objects(self, keys, values, not_indexed_same=False): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self.obj._get_axis(self.axis) + ax = self._selected_obj._get_axis(self.axis) if isinstance(result, Series): result = result.reindex(ax) @@ -766,14 +841,14 @@ def _apply_filter(self, indices, dropna): else: indices = np.sort(np.concatenate(indices)) if dropna: - filtered = self.obj.take(indices) + filtered = self._selected_obj.take(indices) else: - mask = np.empty(len(self.obj.index), dtype=bool) + mask = np.empty(len(self._selected_obj.index), dtype=bool) mask.fill(False) mask[indices.astype(int)] = True # mask fails to broadcast when passed to where; broadcast manually. - mask = np.tile(mask, list(self.obj.shape[1:]) + [1]).T - filtered = self.obj.where(mask) # Fill with NaNs. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. 
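`_apply_filter` above either drops or NaN-masks the rows of groups that fail the predicate, which is what `filter(..., dropna=...)` exposes; for example:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, 5], 'B': [2, 4, 6]})
    g = df.groupby('A')

    g.filter(lambda x: x['B'].mean() > 3)                 # only the A == 5 rows survive
    g.filter(lambda x: x['B'].mean() > 3, dropna=False)   # failing rows kept, filled with NaN
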
return filtered @@ -1783,7 +1858,8 @@ def _wrap_aggregated_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: - return Series([]) + # GH #6265 + return Series([], name=self.name) def _get_index(): if self.grouper.nkeys > 1: @@ -1805,7 +1881,8 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - return Series(values, index=_get_index()) + # GH #6265 + return Series(values, index=_get_index(), name=self.name) def _aggregate_named(self, func, *args, **kwargs): result = {} @@ -1837,7 +1914,7 @@ def transform(self, func, *args, **kwargs): ------- transformed : Series """ - result = self.obj.copy() + result = self._selected_obj.copy() if hasattr(result, 'values'): result = result.values dtype = result.dtype @@ -1849,7 +1926,6 @@ def transform(self, func, *args, **kwargs): for name, group in self: - group = com.ensure_float(group) object.__setattr__(group, 'name', name) res = wrapper(group) if hasattr(res, 'values'): @@ -1863,8 +1939,8 @@ def transform(self, func, *args, **kwargs): # downcast if we can (and need) result = _possibly_downcast_to_dtype(result, dtype) - return self.obj.__class__(result, index=self.obj.index, - name=self.obj.name) + return self._selected_obj.__class__(result, index=self._selected_obj.index, + name=self._selected_obj.name) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -2012,7 +2088,7 @@ def aggregate(self, arg, *args, **kwargs): if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') - obj = self.obj + obj = self._selected_obj if any(isinstance(x, (list, tuple, dict)) for x in arg.values()): new_arg = OrderedDict() @@ -2025,7 +2101,7 @@ def aggregate(self, arg, *args, **kwargs): keys = [] if self._selection is not None: - subset = obj[self._selection] + subset = obj if isinstance(subset, DataFrame): raise NotImplementedError @@ -2210,10 +2286,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # make Nones an empty object if com._count_not_none(*values) != len(values): - v = None - for v in values: - if v is not None: - break + v = next(v for v in values if v is not None) if v is None: return DataFrame() elif isinstance(v, NDFrame): @@ -2227,7 +2300,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Series)): if isinstance(v, Series): - applied_index = self.obj._get_axis(self.axis) + applied_index = self._selected_obj._get_axis(self.axis) all_indexed_same = _all_indexes_same([ x.index for x in values ]) @@ -2266,17 +2339,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): try: if self.axis == 0: + # GH6124 if the list of Series have a consistent name, + # then propagate that name to the result. + index = v.index.copy() + if index.name is None: + # Only propagate the series name to the result + # if all series have a consistent name. If the + # series do not have a consistent name, do + # nothing. 
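The two name-propagation tweaks above (GH 6265 and GH 6124) mean an applied Series keeps the selected column's name, even when the result is empty; a rough sketch, assuming the python-apply path is taken:

    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, 2], 'B': [4, 5, 6]})

    df.groupby('A')['B'].apply(lambda x: x.sum()).name   # -> 'B'  (GH 6265)
    df[df.A > 9].groupby('A')['B'].apply(len)            # empty Series, still named 'B'
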
+ names = set(v.name for v in values) + if len(names) == 1: + index.name = list(names)[0] # normally use vstack as its faster than concat # and if we have mi-columns if not _np_version_under1p7 or isinstance(v.index,MultiIndex): stacked_values = np.vstack([np.asarray(x) for x in values]) - result = DataFrame(stacked_values,index=key_index,columns=v.index) + result = DataFrame(stacked_values,index=key_index,columns=index) else: # GH5788 instead of stacking; concat gets the dtypes correct from pandas.tools.merge import concat result = concat(values,keys=key_index,names=key_index.names, axis=self.axis).unstack() + result.columns = index else: stacked_values = np.vstack([np.asarray(x) for x in values]) result = DataFrame(stacked_values.T,index=v.index,columns=key_index) @@ -2288,7 +2373,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here - cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True + if (self._selected_obj.ndim == 2 + and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): + cd = 'coerce' + else: + cd = True return result.convert_objects(convert_dates=cd) else: @@ -2440,7 +2529,7 @@ def filter(self, func, dropna=True, *args, **kwargs): indices = [] - obj = self._obj_with_exclusions + obj = self._selected_obj gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) @@ -2589,8 +2678,8 @@ def _wrap_agged_blocks(self, blocks): return result.convert_objects() def _iterate_column_groupbys(self): - for i, colname in enumerate(self.obj.columns): - yield colname, SeriesGroupBy(self.obj.iloc[:, i], + for i, colname in enumerate(self._selected_obj.columns): + yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], selection=colname, grouper=self.grouper, exclusions=self.exclusions) @@ -2600,7 +2689,7 @@ def _apply_to_column_groupbys(self, func): return concat( (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), - keys=self.obj.columns, axis=1) + keys=self._selected_obj.columns, axis=1) def ohlc(self): """ @@ -2622,10 +2711,10 @@ def _iterate_slices(self): if self.axis == 0: # kludge if self._selection is None: - slice_axis = self.obj.items + slice_axis = self._selected_obj.items else: slice_axis = self._selection_list - slicer = lambda x: self.obj[x] + slicer = lambda x: self._selected_obj[x] else: raise NotImplementedError diff --git a/pandas/core/index.py b/pandas/core/index.py index a4eca1216ea84..3bc3783fffcbd 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -10,7 +10,7 @@ import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, is_datetime_array -from pandas.core.base import FrozenList, FrozenNDArray +from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin from pandas.util.decorators import cache_readonly, deprecate from pandas.core.common import isnull @@ -57,7 +57,7 @@ def _shouldbe_timestamp(obj): _Identity = object -class Index(FrozenNDArray): +class Index(IndexOpsMixin, FrozenNDArray): """ Immutable ndarray implementing an ordered, sliceable set. 
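With `Index` now mixing in `IndexOpsMixin` (declared just above), plain indexes pick up `min`/`max`, while the datetime field accessors remain unavailable on non-datetime indexes; roughly, given the guards shown earlier:

    import pandas as pd

    idx = pd.Index([3, 1, 2])

    idx.min(), idx.max()   # -> (1, 3), via IndexOpsMixin
    idx.year               # raises TypeError: no _year accessor on an integer index
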
The basic object @@ -92,6 +92,9 @@ class Index(FrozenNDArray): name = None asi8 = None _comparables = ['name'] + _allow_index_ops = True + _allow_datetime_index_ops = False + _allow_period_index_ops = False _engine_type = _index.ObjectEngine @@ -263,13 +266,28 @@ def copy(self, names=None, name=None, dtype=None, deep=False): new_index = new_index.astype(dtype) return new_index - def to_series(self): + def to_series(self, keep_tz=False): """ - return a series with both index and values equal to the index keys + Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False. + applies only to a DatetimeIndex + + Returns + ------- + Series : dtype will be based on the type of the Index values. """ + import pandas as pd - return pd.Series(self.values, index=self, name=self.name) + values = self._to_embed(keep_tz) + return pd.Series(values, index=self, name=self.name) + + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + return self.values def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, @@ -537,6 +555,29 @@ def _convert_list_indexer(self, key, typ=None): """ convert a list indexer. these should be locations """ return key + def _convert_list_indexer_for_mixed(self, keyarr, typ=None): + """ passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. return None if we can't help + """ + if com.is_integer_dtype(keyarr) and not self.is_floating(): + if self.inferred_type != 'integer': + keyarr = np.where(keyarr < 0, + len(self) + keyarr, keyarr) + + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + + from pandas.core.indexing import _maybe_convert_indices + return _maybe_convert_indices(indexer, len(self)) + + elif not self.inferred_type == 'integer': + return keyarr + + return None + def _convert_indexer_error(self, key, msg=None): if msg is None: msg = 'label' @@ -613,34 +654,35 @@ def __hash__(self): raise TypeError("unhashable type: %r" % type(self).__name__) def __getitem__(self, key): - """Override numpy.ndarray's __getitem__ method to work as desired""" - arr_idx = self.view(np.ndarray) + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. + + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + __getitem__ = super(Index, self).__getitem__ if np.isscalar(key): - return arr_idx[key] - else: - if com._is_bool_indexer(key): - key = np.asarray(key) + return __getitem__(key) - try: - result = arr_idx[key] - if result.ndim > 1: - return result - except (IndexError): - if not len(key): - result = [] - else: - raise + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. 
+ return __getitem__(key) - return Index(result, name=self.name) + if com._is_bool_indexer(key): + return __getitem__(np.asarray(key)) - def _getitem_slice(self, key): - """ getitem for a bool/sliceable, fallback to standard getitem """ - try: - arr_idx = self.view(np.ndarray) - result = arr_idx[key] - return self.__class__(result, name=self.name, fastpath=True) - except: - return self.__getitem__(key) + result = __getitem__(key) + if result.ndim > 1: + return result.view(np.ndarray) + else: + return result def append(self, other): """ @@ -866,6 +908,9 @@ def __and__(self, other): def __or__(self, other): return self.union(other) + def __xor__(self, other): + return self.sym_diff(other) + def union(self, other): """ Form the union of two Index objects and sorts if possible @@ -965,24 +1010,33 @@ def intersection(self, other): except TypeError: pass - indexer = self.get_indexer(other.values) - indexer = indexer.take((indexer != -1).nonzero()[0]) + try: + indexer = self.get_indexer(other.values) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except: + # duplicates + indexer = self.get_indexer_non_unique(other.values)[0].unique() + return self.take(indexer) def diff(self, other): """ Compute sorted set difference of two Index objects + Parameters + ---------- + other : Index or array-like + + Returns + ------- + diff : Index + Notes ----- One can do either of these and achieve the same result >>> index - index2 >>> index.diff(index2) - - Returns - ------- - diff : Index """ if not hasattr(other, '__iter__'): @@ -1000,6 +1054,51 @@ def diff(self, other): theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) + def sym_diff(self, other, result_name=None): + """ + Compute the sorted symmetric_difference of two Index objects. + + Parameters + ---------- + + other : array-like + result_name : str + + Returns + ------- + sym_diff : Index + + Notes + ----- + ``sym_diff`` contains elements that appear in either ``idx1`` or + ``idx2`` but not both. Equivalent to the Index created by + ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. + + The sorting of a result containing ``NaN``s is not guaranteed + across Python versions. See GitHub issue #6444. + + Examples + -------- + >>> idx1 = Index([1, 2, 3, 4]) + >>> idx2 = Index([2, 3, 4, 5]) + >>> idx1.sym_diff(idx2) + Int64Index([1, 5], dtype='int64') + + You can also use the ``^`` operator: + + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + if not hasattr(other, '__iter__'): + raise TypeError('Input must be iterable!') + + if not isinstance(other, Index): + other = Index(other) + result_name = result_name or self.name + + the_diff = sorted(set((self - other) + (other - self))) + return Index(the_diff, name=result_name) + def unique(self): """ Return array of unique values in the Index. 
Significantly faster than @@ -1160,6 +1259,12 @@ def get_indexer_non_unique(self, target, **kwargs): indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return Index(indexer), missing + def get_indexer_for(self, target, **kwargs): + """ guaranteed return of an indexer even when non-unique """ + if self.is_unique: + return self.get_indexer(target, **kwargs) + return self.get_indexer_non_unique(target, **kwargs)[0] + def _possibly_promote(self, other): # A hack, but it works from pandas.tseries.index import DatetimeIndex @@ -1258,7 +1363,7 @@ def join(self, other, how='left', level=None, return_indexers=False): ---------- other : Index how : {'left', 'right', 'inner', 'outer'} - level : + level : int or level name, default None return_indexers : boolean, default False Returns @@ -1891,8 +1996,14 @@ def equals(self, other): if self is other: return True + # need to compare nans locations and make sure that they are the same + # since nans don't compare equal this is a bit tricky try: - return np.array_equal(self, other) + if not isinstance(other, Float64Index): + other = self._constructor(other) + if self.dtype != other.dtype or self.shape != other.shape: return False + left, right = self.values, other.values + return ((left == right) | (isnull(left) & isnull(right))).all() except TypeError: # e.g. fails in numpy 1.6 with DatetimeIndex #1681 return False @@ -2381,7 +2492,7 @@ def get_level_values(self, level): Parameters ---------- - level : int + level : int or level name Returns ------- @@ -2718,8 +2829,6 @@ def __getitem__(self, key): return result - _getitem_slice = __getitem__ - def take(self, indexer, axis=None): """ Analogous to ndarray.take @@ -2764,7 +2873,7 @@ def drop(self, labels, level=None): ---------- labels : array-like Must be a list of tuples - level : int or name, default None + level : int or level name, default None Returns ------- @@ -2838,7 +2947,13 @@ def droplevel(self, level=0): new_names.pop(i) if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 result = new_levels[0].take(new_labels[0]) + if mask.any(): + np.putmask(result, mask, np.nan) + result.name = new_names[0] return result else: @@ -3160,6 +3275,7 @@ def get_loc_level(self, key, level=0, drop_level=True): Parameters ---------- key : label or tuple + level : int/level name or list thereof Returns ------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 830051ed41d44..39ddc9a7ee22a 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -91,32 +91,8 @@ def _get_label(self, label, axis=0): def _get_loc(self, key, axis=0): return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=0, raise_on_error=False, typ=None): - - # make out-of-bounds into bounds of the object - if typ == 'iloc': - ax = self.obj._get_axis(axis) - l = len(ax) - start = obj.start - stop = obj.stop - step = obj.step - if start is not None: - # degenerate to return nothing - if start >= l: - return self._getitem_axis(tuple(),axis=axis) - - # equiv to a null slice - elif start <= -l: - start = None - if stop is not None: - if stop > l: - stop = None - elif stop <= -l: - stop = None - obj = slice(start,stop,step) - - return self.obj._slice(obj, axis=axis, raise_on_error=raise_on_error, - typ=typ) + def _slice(self, obj, axis=0, typ=None): + return self.obj._slice(obj, axis=axis, typ=typ) def __setitem__(self, key, value): @@ -441,7 +417,9 @@ def can_do_equal_len(): # align to if item in value: v = value[item] - v = v.reindex(self.obj[item].index & v.index) + i = 
self.obj[item].index + v = v.reindex(i & v.index) + setter(item, v.values) else: setter(item, np.nan) @@ -710,6 +688,8 @@ def _multi_take_opportunity(self, tup): return False elif com._is_bool_indexer(indexer): return False + elif not ax.is_unique: + return False return True @@ -907,20 +887,10 @@ def _reindex(keys, level=None): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if is_integer_dtype(keyarr) and not labels.is_floating(): - if labels.inferred_type != 'integer': - keyarr = np.where(keyarr < 0, - len(labels) + keyarr, keyarr) - - if labels.inferred_type == 'mixed-integer': - indexer = labels.get_indexer(keyarr) - if (indexer >= 0).all(): - self.obj.take(indexer, axis=axis, convert=True) - else: - return self.obj.take(keyarr, axis=axis) - elif not labels.inferred_type == 'integer': - - return self.obj.take(keyarr, axis=axis) + # handle a mixed integer scenario + indexer = labels._convert_list_indexer_for_mixed(keyarr, typ=self.name) + if indexer is not None: + return self.obj.take(indexer, axis=axis) # this is not the most robust, but... if (isinstance(labels, MultiIndex) and @@ -1060,11 +1030,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): objarr = _asarray_tuplesafe(obj) # If have integer labels, defer to label-based indexing - if is_integer_dtype(objarr) and not is_int_index: - if labels.inferred_type != 'integer': - objarr = np.where(objarr < 0, - len(labels) + objarr, objarr) - return objarr + indexer = labels._convert_list_indexer_for_mixed(objarr, typ=self.name) + if indexer is not None: + return indexer # this is not the most robust, but... if (isinstance(labels, MultiIndex) and @@ -1351,8 +1319,7 @@ def _get_slice_axis(self, slice_obj, axis=0): return obj if isinstance(slice_obj, slice): - return self._slice(slice_obj, axis=axis, raise_on_error=True, - typ='iloc') + return self._slice(slice_obj, axis=axis, typ='iloc') else: return self.obj.take(slice_obj, axis=axis, convert=False) @@ -1376,7 +1343,7 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False): arr = np.array(key) l = len(ax) if len(arr) and (arr.max() >= l or arr.min() <= -l): - key = arr[(arr>-l) & (arr len(ax): - raise IndexError("single indexer is out-of-bounds") + raise IndexError("single positional indexer is out-of-bounds") return self._get_loc(key, axis=axis) @@ -1419,7 +1386,7 @@ def __getitem__(self, key): raise ValueError('Invalid call for scalar access (getting)!') key = self._convert_key(key) - return self.obj.get_value(*key) + return self.obj.get_value(*key, takeable=self._takeable) def __setitem__(self, key, value): if not isinstance(key, tuple): @@ -1427,33 +1394,32 @@ def __setitem__(self, key, value): if len(key) != self.obj.ndim: raise ValueError('Not enough indexers for scalar access ' '(setting)!') - key = self._convert_key(key) + key = list(self._convert_key(key)) key.append(value) - self.obj.set_value(*key) + self.obj.set_value(*key, takeable=self._takeable) class _AtIndexer(_ScalarAccessIndexer): """ label based scalar accessor """ - pass + _takeable = False class _iAtIndexer(_ScalarAccessIndexer): """ integer based scalar accessor """ + _takeable = True def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) def _convert_key(self, key): """ require integer args (and convert to label arguments) """ - ckey = [] for a, i in zip(self.obj.axes, key): if not com.is_integer(i): raise ValueError("iAt based indexing can only have integer " "indexers") - ckey.append(a[i]) - return ckey + return 
key # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps @@ -1589,6 +1555,10 @@ def _maybe_convert_indices(indices, n): """ if isinstance(indices, list): indices = np.array(indices) + if len(indices) == 0: + # If list is empty, np.array will return float and cause indexing + # errors. + return np.empty(0, dtype=np.int_) mask = indices < 0 if mask.any(): @@ -1656,18 +1626,6 @@ def _need_slice(obj): (obj.step is not None and obj.step != 1)) -def _check_slice_bounds(slobj, values): - l = len(values) - start = slobj.start - if start is not None: - if start < -l or start > l - 1: - raise IndexError("out-of-bounds on slice (start)") - stop = slobj.stop - if stop is not None: - if stop < -l - 1 or stop > l: - raise IndexError("out-of-bounds on slice (end)") - - def _maybe_droplevels(index, key): # drop levels original_index = index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c89aac0fa7923..9892dc77e9e23 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,8 +14,7 @@ _values_from_object, _is_null_datelike_scalar) from pandas.core.index import (Index, MultiIndex, _ensure_index, _handle_legacy_indexes) -from pandas.core.indexing import (_check_slice_bounds, _maybe_convert_indices, - _length_of_indexer) +from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib @@ -61,8 +60,8 @@ def __init__(self, values, items, ref_items, ndim=None, fastpath=False, raise ValueError('Wrong number of dimensions') if len(items) != len(values): - raise ValueError('Wrong number of items passed %d, indices imply ' - '%d' % (len(items), len(values))) + raise ValueError('Wrong number of items passed %d, index implies ' + '%d' % (len(values), len(items))) self.set_ref_locs(placement) self.values = values @@ -133,6 +132,12 @@ def take_ref_locs(self, indexer): tindexer[indexer] = False tindexer = tindexer.astype(int).cumsum()[indexer] ref_locs = ref_locs[indexer] + + # Make sure the result is a copy, or otherwise self._ref_locs will be + # updated. + if ref_locs.base is not None: + ref_locs = ref_locs.copy() + ref_locs -= tindexer return ref_locs @@ -265,7 +270,7 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, new_ref_items, indexer = self.items.reindex(new_ref_items, limit=limit) - needs_fill = method is not None and limit is None + needs_fill = method is not None if fill_value is None: fill_value = self.fill_value @@ -275,10 +280,13 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None, else: - # single block reindex + # single block reindex, filling is already happending if self.ndim == 1: new_values = com.take_1d(self.values, indexer, fill_value=fill_value) + block = make_block(new_values, new_items, new_ref_items, + ndim=self.ndim, fastpath=True) + return block else: masked_idx = indexer[indexer != -1] @@ -805,6 +813,15 @@ def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, fill_value=None, coerce=False, downcast=None, **kwargs): + def check_int_bool(self, inplace): + # Only FloatBlocks will contain NaNs. 
+ # timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + if inplace: + return self + else: + return self.copy() + # a fill na type method try: m = com._clean_fill_method(method) @@ -812,6 +829,9 @@ def interpolate(self, method='pad', axis=0, index=None, m = None if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r return self._interpolate_with_fill(method=m, axis=axis, inplace=inplace, @@ -826,6 +846,9 @@ def interpolate(self, method='pad', axis=0, index=None, m = None if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r return self._interpolate(method=m, index=index, values=values, @@ -855,7 +878,12 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, fill_value = self._try_fill(fill_value) values = self.values if inplace else self.values.copy() values = self._try_operate(values) - values = com.interpolate_2d(values, method, axis, limit, fill_value) + values = com.interpolate_2d(values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype) values = self._try_coerce_result(values) blocks = [make_block(values, self.items, self.ref_items, @@ -928,25 +956,22 @@ def diff(self, n): return [make_block(new_values, self.items, self.ref_items, ndim=self.ndim, fastpath=True)] - def shift(self, indexer, periods, axis=0): + def shift(self, periods, axis=0): """ shift the block by periods, possibly upcast """ - - new_values = self.values.take(indexer, axis=axis) # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(new_values) - + new_values, fill_value = com._maybe_upcast(self.values) + new_values = np.roll(new_values.T,periods,axis=axis) axis_indexer = [ slice(None) ] * self.ndim if periods > 0: axis_indexer[axis] = slice(None,periods) else: - axis_indexer = [ slice(None) ] * self.ndim axis_indexer[axis] = slice(periods,None) new_values[tuple(axis_indexer)] = fill_value - return [make_block(new_values, self.items, self.ref_items, + return [make_block(new_values.T, self.items, self.ref_items, ndim=self.ndim, fastpath=True)] - + def eval(self, func, other, raise_on_error=True, try_cast=False): """ evaluate the block; return result block from the result @@ -1169,12 +1194,12 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): + def equals(self, other): if self.dtype != other.dtype or self.shape != other.shape: return False left, right = self.values, other.values return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - class FloatBlock(FloatOrComplexBlock): is_float = True _downcast_dtype = 'int64' @@ -1868,9 +1893,20 @@ def fillna(self, value, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() return [self.make_block(values.get_values(value), fill_value=value)] - def shift(self, indexer, periods, axis=0): + @classmethod + def _shift_indexer(cls,N, periods): + # small reusable utility + indexer = np.zeros(N, dtype=int) + + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + return indexer + + def shift(self, periods, axis=0): """ shift the block by periods """ - + indexer = self._shift_indexer(len(self.values.T),periods) new_values = self.values.to_dense().take(indexer) # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also @@ -2645,12 +2681,9 @@ def combine(self, blocks): new_axes[0] = new_items return self.__class__(new_blocks, new_axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0, raise_on_error=False): + def get_slice(self, slobj, axis=0): new_axes = list(self.axes) - if raise_on_error: - _check_slice_bounds(slobj, new_axes[axis]) - new_axes[axis] = new_axes[axis][slobj] if axis == 0: @@ -2677,22 +2710,31 @@ def get_slice(self, slobj, axis=0, raise_on_error=False): return bm def _slice_blocks(self, slobj, axis): - new_blocks = [] + """ + slice the blocks using the provided slice object + this is only for slicing on axis != 0 + """ + + if axis == 0: + raise AssertionError("cannot _slice_blocks on axis=0") slicer = [slice(None, None) for _ in range(self.ndim)] slicer[axis] = slobj slicer = tuple(slicer) + is_unique = self.axes[0].is_unique - for block in self.blocks: - newb = make_block(block._slice(slicer), - block.items, - block.ref_items, - klass=block.__class__, - fastpath=True, - placement=block._ref_locs) - newb.set_ref_locs(block._ref_locs) - new_blocks.append(newb) - return new_blocks + def place(block): + if not is_unique: + return block._ref_locs + return None + + return [ make_block(block._slice(slicer), + block.items, + block.ref_items, + klass=block.__class__, + fastpath=True, + placement=place(block) + ) for block in self.blocks ] def get_series_dict(self): # For DataFrame @@ -3375,8 +3417,11 @@ def reindex_items(self, new_items, indexer=None, copy=True, # unique if self.axes[0].is_unique and new_items.is_unique: + # ok to use the global indexer if only 1 block + i = indexer if len(self.blocks) == 1 else None + for block in self.blocks: - blk = block.reindex_items_from(new_items, copy=copy) + blk = block.reindex_items_from(new_items, indexer=i, copy=copy) new_blocks.extend(_valid_blocks(blk)) # non-unique @@ -3690,26 +3735,32 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value): def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): - if method is None: - indexer = None return self.reindex(new_axis, indexer=indexer, method=method, fill_value=fill_value, limit=limit, copy=copy) def _delete_from_block(self, i, item): super(SingleBlockManager, self)._delete_from_block(i, item) - # reset our state - self._block = ( - self.blocks[0] if len(self.blocks) else - make_block(np.array([], dtype=self._block.dtype), [], []) - ) + # possibly need to merge split blocks + if len(self.blocks) > 1: + new_items = Index(list(itertools.chain(*[ b.items for b in self.blocks ]))) + block = make_block(np.concatenate([ b.values for b in self.blocks ]), + new_items, + new_items, + dtype=self._block.dtype) + + elif len(self.blocks): + block = self.blocks[0] + else: + block = make_block(np.array([], dtype=self._block.dtype), [], []) + + self.blocks = [block] + self._block = block self._values = self._block.values - def get_slice(self, slobj, raise_on_error=False): - if raise_on_error: - _check_slice_bounds(slobj, self.index) + def get_slice(self, slobj): return self.__class__(self._block._slice(slobj), - self.index._getitem_slice(slobj), fastpath=True) + self.index[slobj], fastpath=True) def set_axis(self, axis, value, maybe_rename=True, check_axis=True): cur_axis, value = self._set_axis(axis, value, check_axis) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 636532bc5fbf9..a47c7f82d9199 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ 
-231,7 +231,8 @@ def nansum(values, axis=None, skipna=True): values, mask, dtype = _get_values(values, skipna, 0) the_sum = values.sum(axis) the_sum = _maybe_null_out(the_sum, axis, mask) - return the_sum + + return _wrap_results(the_sum, dtype) @disallow('M8') diff --git a/pandas/core/panel.py b/pandas/core/panel.py index cb149abb7c9cf..2bf50bb1bf142 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -444,7 +444,7 @@ def as_matrix(self): #---------------------------------------------------------------------- # Getting and setting elements - def get_value(self, *args): + def get_value(self, *args, **kwargs): """ Quickly retrieve single value at (item, major, minor) location @@ -453,6 +453,7 @@ def get_value(self, *args): item : item label (panel item) major : major axis label (panel item row) minor : minor axis label (panel item column) + takeable : interpret the passed labels as indexers, default False Returns ------- @@ -466,12 +467,16 @@ def get_value(self, *args): raise TypeError('There must be an argument for each axis, you gave' ' {0} args, but {1} are required'.format(nargs, nreq)) + takeable = kwargs.get('takeable') - # hm, two layers to the onion - frame = self._get_item_cache(args[0]) - return frame.get_value(*args[1:]) + if takeable is True: + lower = self._iget_item_cache(args[0]) + else: + lower = self._get_item_cache(args[0]) + + return lower.get_value(*args[1:], takeable=takeable) - def set_value(self, *args): + def set_value(self, *args, **kwargs): """ Quickly set single value at (item, major, minor) location @@ -481,6 +486,7 @@ def set_value(self, *args): major : major axis label (panel item row) minor : minor axis label (panel item column) value : scalar + takeable : interpret the passed labels as indexers, default False Returns ------- @@ -496,10 +502,15 @@ def set_value(self, *args): raise TypeError('There must be an argument for each axis plus the ' 'value provided, you gave {0} args, but {1} are ' 'required'.format(nargs, nreq)) + takeable = kwargs.get('takeable') try: - frame = self._get_item_cache(args[0]) - frame.set_value(*args[1:]) + if takeable is True: + lower = self._iget_item_cache(args[0]) + else: + lower = self._get_item_cache(args[0]) + + lower.set_value(*args[1:], takeable=takeable) return self except KeyError: axes = self._expand_axes(args) @@ -528,12 +539,6 @@ def _box_item_values(self, key, values): d = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]) return self._constructor_sliced(values, **d) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - new_data = self._data.get_slice(slobj, - axis=axis, - raise_on_error=raise_on_error) - return self._constructor(new_data) - def __setitem__(self, key, value): shape = tuple(self.shape) if isinstance(value, self._constructor_sliced): diff --git a/pandas/core/series.py b/pandas/core/series.py index 35acfffe5b598..409bbf60193af 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -28,9 +28,9 @@ from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import ( - _check_bool_indexer, _check_slice_bounds, + _check_bool_indexer, _is_index_slice, _maybe_convert_indices) -from pandas.core import generic +from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical from pandas.tseries.index import DatetimeIndex @@ -91,7 +91,7 @@ def f(self, *args, **kwargs): # Series class -class Series(generic.NDFrame): +class 
Series(base.IndexOpsMixin, generic.NDFrame): """ One-dimensional ndarray with axis labels (including time series). @@ -122,6 +122,15 @@ class Series(generic.NDFrame): Copy input data """ _metadata = ['name'] + _allow_index_ops = True + + @property + def _allow_datetime_index_ops(self): + return self.index.is_all_dates and isinstance(self.index, DatetimeIndex) + + @property + def _allow_period_index_ops(self): + return self.index.is_all_dates and isinstance(self.index, PeriodIndex) def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): @@ -153,7 +162,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, # need to copy to avoid aliasing issues if name is None: name = data.name - data = data.values + + data = data._to_embed(keep_tz=True) copy = True elif isinstance(data, pa.Array): pass @@ -459,17 +469,24 @@ def _ixs(self, i, axis=0): def _is_mixed_type(self): return False - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - if raise_on_error: - _check_slice_bounds(slobj, self.values) + def _slice(self, slobj, axis=0, typ=None): slobj = self.index._convert_slice_indexer(slobj, typ=typ or 'getitem') return self._get_values(slobj) def __getitem__(self, key): try: result = self.index.get_value(self, key) - if isinstance(result, np.ndarray): - return self._constructor(result,index=[key]*len(result)).__finalize__(self) + + if not np.isscalar(result): + if is_list_like(result) and not isinstance(result, Series): + + # we need to box if we have a non-unique index here + # otherwise have inline ndarray/lists + if not self.index.is_unique: + result = self._constructor(result, + index=[key]*len(result) + ,dtype=self.dtype).__finalize__(self) + return result except InvalidIndexError: pass @@ -702,46 +719,28 @@ def reshape(self, *args, **kwargs): return self.values.reshape(shape, **kwargs) - def get(self, label, default=None): - """ - Returns value occupying requested label, default to specified - missing value if not present. Analogous to dict.get - - Parameters - ---------- - label : object - Label value looking for - default : object, optional - Value to return if label not in index - - Returns - ------- - y : scalar - """ - try: - return self.get_value(label) - except KeyError: - return default - iget_value = _ixs iget = _ixs irow = _ixs - def get_value(self, label): + def get_value(self, label, takeable=False): """ Quickly retrieve single value at passed index label Parameters ---------- index : label + takeable : interpret the index as indexers, default False Returns ------- value : scalar value """ + if takeable is True: + return self.values[label] return self.index.get_value(self.values, label) - def set_value(self, label, value): + def set_value(self, label, value, takeable=False): """ Quickly set single value at passed label. 
If label is not contained, a new object is created with the label placed at the end of the result @@ -753,6 +752,7 @@ def set_value(self, label, value): Partial indexing with MultiIndex not allowed value : object Scalar value + takeable : interpret the index as indexers, default False Returns ------- @@ -761,7 +761,10 @@ def set_value(self, label, value): otherwise a new object """ try: - self.index._engine.set_value(self.values, label, value) + if takeable: + self.values[label] = value + else: + self.index._engine.set_value(self.values, label, value) return self except KeyError: @@ -1059,7 +1062,7 @@ def count(self, level=None): Parameters ---------- - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a smaller Series @@ -1284,7 +1287,7 @@ def quantile(self, q=0.5): if len(valid_values) == 0: return pa.NA result = _quantile(valid_values, q * 100) - if result.dtype == _TD_DTYPE: + if not np.isscalar and com.is_timedelta64_dtype(result): from pandas.tseries.timedeltas import to_timedelta return to_timedelta(result) @@ -1507,7 +1510,7 @@ def _binop(self, other, func, level=None, fill_value=None): fill_value : float or object Value to substitute for NA/null values. If both Series are NA in a location, the result will be NA regardless of the passed fill value - level : int or name + level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level @@ -1715,18 +1718,19 @@ def rank(self, method='average', na_option='keep', ascending=True, Parameters ---------- - method : {'average', 'min', 'max', 'first'} + method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups na_option : {'keep'} keep: leave NA values where they are ascending : boolean, default True False for ranks by high (1) to low (N) pct : boolean, defeault False Computes percentage rank of data - + Returns ------- ranks : Series @@ -1797,7 +1801,7 @@ def sortlevel(self, level=0, ascending=True): Parameters ---------- - level : int + level : int or level name, default None ascending : bool, default True Returns @@ -2318,72 +2322,6 @@ def asof(self, where): new_values = com.take_1d(values, locs) return self._constructor(new_values, index=where).__finalize__(self) - @property - def weekday(self): - return self._constructor([d.weekday() for d in self.index], - index=self.index).__finalize__(self) - - def tz_convert(self, tz, copy=True): - """ - Convert TimeSeries to target time zone - - Parameters - ---------- - tz : string or pytz.timezone object - copy : boolean, default True - Also make a copy of the underlying data - - Returns - ------- - converted : TimeSeries - """ - new_index = self.index.tz_convert(tz) - - new_values = self.values - if copy: - new_values = new_values.copy() - - return self._constructor(new_values, - index=new_index).__finalize__(self) - - def tz_localize(self, tz, copy=True, infer_dst=False): - """ - Localize tz-naive TimeSeries to target time zone - Entries will retain their "naive" value but will be annotated as - being relative to the specified tz. - - After localizing the TimeSeries, you may use tz_convert() to - get the Datetime values recomputed to a different tz. 
- - Parameters - ---------- - tz : string or pytz.timezone object - copy : boolean, default True - Also make a copy of the underlying data - infer_dst : boolean, default False - Attempt to infer fall dst-transition hours based on order - - Returns - ------- - localized : TimeSeries - """ - from pandas.tseries.index import DatetimeIndex - - if not isinstance(self.index, DatetimeIndex): - if len(self.index) > 0: - raise Exception('Cannot tz-localize non-time series') - - new_index = DatetimeIndex([], tz=tz) - else: - new_index = self.index.tz_localize(tz, infer_dst=infer_dst) - - new_values = self.values - if copy: - new_values = new_values.copy() - - return self._constructor(new_values, - index=new_index).__finalize__(self) - @cache_readonly def str(self): from pandas.core.strings import StringMethods diff --git a/pandas/core/strings.py b/pandas/core/strings.py index bd8e1b196c59d..3e3d1e2dbd76e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -163,11 +163,11 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): na : default NaN, fill value for missing values. regex : bool, default True If True use re.search, otherwise use Python in operator - + Returns ------- Series of boolean values - + See Also -------- match : analagous, but stricter, relying on re.match instead of re.search @@ -345,7 +345,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): See Also -------- - contains : analagous, but less strict, relying on re.search instead of + contains : analagous, but less strict, relying on re.search instead of re.match extract : now preferred to the deprecated usage of match (as_indexer=False) @@ -413,7 +413,7 @@ def str_extract(arr, pat, flags=0): dtype: object A pattern with more than one group will return a DataFrame. - + >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') 0 1 0 a 1 @@ -421,7 +421,7 @@ def str_extract(arr, pat, flags=0): 2 NaN NaN A pattern may contain optional groups. - + >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)') 0 1 0 a 1 @@ -429,7 +429,7 @@ def str_extract(arr, pat, flags=0): 2 NaN 3 Named groups will become column names in the result. 
- + >>> Series(['a1', 'b2', 'c3']).str.extract('(?P[ab])(?P\d)') letter digit 0 a 1 @@ -451,11 +451,15 @@ def f(x): else: return empty_row if regex.groups == 1: - result = Series([f(val)[0] for val in arr], name=regex.groupindex.get(1)) + result = Series([f(val)[0] for val in arr], + name=regex.groupindex.get(1), + index=arr.index) else: names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] - result = DataFrame([f(val) for val in arr], columns=columns) + result = DataFrame([f(val) for val in arr], + columns=columns, + index=arr.index) return result @@ -613,7 +617,7 @@ def str_split(arr, pat=None, n=None): if pat is None: if n is None or n == 0: n = -1 - f = lambda x: x.split() + f = lambda x: x.split(pat, n) else: if len(pat) == 1: if n is None or n == 0: diff --git a/pandas/io/data.py b/pandas/io/data.py index eb182e77a5db5..dc5dd2b4b7d80 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -494,7 +494,7 @@ def get_data_famafrench(name): tmpf.write(raw) with ZipFile(tmpf, 'r') as zf: - data = zf.open(name + '.txt').readlines() + data = zf.open(zf.namelist()[0]).readlines() line_lengths = np.array(lmap(len, data)) file_edges = np.where(line_lengths == 2)[0] diff --git a/pandas/io/html.py b/pandas/io/html.py index e60630204a8b9..4375d08abc37c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -579,8 +579,9 @@ def _expand_elements(body): lens_max = lens.max() not_max = lens[lens != lens_max] + empty = [''] for ind, length in iteritems(not_max): - body[ind] += [np.nan] * (lens_max - length) + body[ind] += empty * (lens_max - length) def _data_to_frame(data, header, index_col, skiprows, infer_types, @@ -760,15 +761,15 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - + attrs = {'id': 'table'} - + is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - + attrs = {'asdf': 'table'} - + is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. 
Valid HTML 4.01 table attributes can be found `here diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 85a9cf4ea0f9f..76f630082aa15 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,7 +30,7 @@ import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat -from pandas.compat import u_safe as u, PY3, range, lrange, string_types +from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter from pandas.io.common import PerformanceWarning from pandas.core.config import get_option from pandas.computation.pytables import Expr, maybe_expression @@ -66,7 +66,7 @@ def _ensure_encoding(encoding): Term = Expr -def _ensure_term(where): +def _ensure_term(where, scope_level): """ ensure that the where is a Term or a list of Term this makes sure that we are capturing the scope of variables @@ -76,11 +76,17 @@ def _ensure_term(where): # only consider list/tuple here as an ndarray is automaticaly a coordinate # list + level = scope_level + 1 if isinstance(where, (list, tuple)): - where = [w if not maybe_expression(w) else Term(w, scope_level=2) - for w in where if w is not None] + wlist = [] + for w in filter(lambda x: x is not None, where): + if not maybe_expression(w): + wlist.append(w) + else: + wlist.append(Term(w, scope_level=level)) + where = wlist elif maybe_expression(where): - where = Term(where, scope_level=2) + where = Term(where, scope_level=level) return where @@ -311,7 +317,7 @@ def read_hdf(path_or_buf, key, **kwargs): # grab the scope if 'where' in kwargs: - kwargs['where'] = _ensure_term(kwargs['where']) + kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) f = lambda store, auto_close: store.select( key, auto_close=auto_close, **kwargs) @@ -643,7 +649,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, raise KeyError('No object named %s in the file' % key) # create the storer and axes - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) s = self._create_storer(group) s.infer_axes() @@ -675,7 +681,7 @@ def select_as_coordinates( start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) @@ -730,7 +736,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, """ # default to single select - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, string_types): @@ -776,8 +782,8 @@ def func(_start, _stop): c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs) else: c = None - - objs = [t.read(where=c, start=_start, stop=_stop, + + objs = [t.read(where=c, start=_start, stop=_stop, columns=columns, **kwargs) for t in tbls] # concat and return @@ -838,7 +844,7 @@ def remove(self, key, where=None, start=None, stop=None): raises KeyError if key is not a valid store """ - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) try: s = self.get_storer(key) except: diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 989f6983b28d3..4c0c18a0e7bd0 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -2,13 +2,15 @@ Collection of query wrappers / abstractions to both facilitate data retrieval and to reduce dependency on DB-specific API. 
""" -from __future__ import print_function -from datetime import datetime, date +from __future__ import print_function, division +from datetime import datetime, date, timedelta + import warnings -from pandas.compat import lzip, map, zip, raise_with_traceback, string_types +import itertools import numpy as np - +import pandas.core.common as com +from pandas.compat import lzip, map, zip, raise_with_traceback, string_types from pandas.core.api import DataFrame from pandas.core.base import PandasObject from pandas.tseries.tools import to_datetime @@ -360,7 +362,7 @@ def pandasSQL_builder(con, flavor=None, meta=None): class PandasSQLTable(PandasObject): - """ + """ For mapping Pandas tables to SQL tables. Uses fact that table is reflected by SQLAlchemy to do better type convertions. @@ -419,13 +421,21 @@ def maybe_asscalar(self, i): def insert(self): ins = self.insert_statement() - - for t in self.frame.iterrows(): - data = dict((k, self.maybe_asscalar(v)) - for k, v in t[1].iteritems()) - if self.index is not None: + data_list = [] + # to avoid if check for every row + keys = self.frame.columns + if self.index is not None: + for t in self.frame.itertuples(): + data = dict((k, self.maybe_asscalar(v)) + for k, v in zip(keys, t[1:])) data[self.index] = self.maybe_asscalar(t[0]) - self.pd_sql.execute(ins, **data) + data_list.append(data) + else: + for t in self.frame.itertuples(): + data = dict((k, self.maybe_asscalar(v)) + for k, v in zip(keys, t[1:])) + data_list.append(data) + self.pd_sql.execute(ins, data_list) def read(self, coerce_float=True, parse_dates=None, columns=None): @@ -480,7 +490,7 @@ def _create_table_statement(self): if self.index is not None: columns.insert(0, Column(self.index, self._sqlalchemy_type( - self.frame.index.dtype), + self.frame.index), index=True)) return Table(self.name, self.pd_sql.meta, *columns) @@ -537,22 +547,25 @@ def _harmonize_columns(self, parse_dates=None): except KeyError: pass # this column not in results - def _sqlalchemy_type(self, dtype): - from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date + def _sqlalchemy_type(self, arr_or_dtype): + from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval - pytype = dtype.type - - if pytype is date: + if arr_or_dtype is date: return Date - if issubclass(pytype, np.datetime64) or pytype is datetime: - # Caution: np.datetime64 is also a subclass of np.number. - return DateTime - if issubclass(pytype, np.floating): + if com.is_datetime64_dtype(arr_or_dtype): + try: + tz = arr_or_dtype.tzinfo + return DateTime(timezone=True) + except: + return DateTime + if com.is_timedelta64_dtype(arr_or_dtype): + return Interval + elif com.is_float_dtype(arr_or_dtype): return Float - if issubclass(pytype, np.integer): + elif com.is_integer_dtype(arr_or_dtype): # TODO: Refine integer size. 
return Integer - if issubclass(pytype, np.bool_): + elif com.is_bool(arr_or_dtype): return Boolean return Text @@ -638,14 +651,18 @@ def to_sql(self, frame, name, if_exists='fail', index=True): name, self, frame=frame, index=index, if_exists=if_exists) table.insert() + @property + def tables(self): + return self.meta.tables + def has_table(self, name): - return self.engine.has_table(name) + if self.meta.tables.get(name) is not None: + return True + else: + return False def get_table(self, table_name): - if self.engine.has_table(table_name): - return self.meta.tables[table_name] - else: - return None + return self.meta.tables.get(table_name) def read_table(self, table_name, index_col=None, coerce_float=True, parse_dates=None, columns=None): @@ -742,12 +759,10 @@ def insert_statement(self): def insert(self): ins = self.insert_statement() cur = self.pd_sql.con.cursor() - for r in self.frame.iterrows(): - data = [self.maybe_asscalar(v) for v in r[1].values] + for r in self.frame.itertuples(): + data = [self.maybe_asscalar(v) for v in r[1:]] if self.index is not None: data.insert(0, self.maybe_asscalar(r[0])) - print(type(data[2])) - print(type(r[0])) cur.execute(ins, tuple(data)) cur.close() @@ -772,7 +787,7 @@ def _create_table_statement(self): x for x in zip(safe_columns, column_types)) template = """CREATE TABLE %(name)s ( %(columns)s - );""" + )""" create_statement = template % {'name': self.name, 'columns': columns} return create_statement diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..7d9d272eea1b6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -23,7 +23,7 @@ from pandas.compat import long, lrange, lmap, lzip from pandas import isnull from pandas.io.common import get_filepath_or_buffer - +from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None): @@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, return reader.data(convert_dates, convert_categoricals, index) -_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"] +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] def _stata_elapsed_date_to_datetime(date, fmt): @@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): # numpy types and numpy datetime isn't mature enough / we can't rely on # pandas version > 0.7.1 #TODO: IIRC relative delta doesn't play well with np.datetime? + #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly if np.isnan(date): return np.datetime64('nat') @@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): from warnings import warn warn("Encountered %tC format. 
Leaving in Stata Internal Format.") return date - elif fmt in ["%td", "td"]: + elif fmt in ["%td", "td", "%d", "d"]: return stata_epoch + datetime.timedelta(int(date)) elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) @@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt): if not isinstance(date, datetime.datetime): raise ValueError("date should be datetime.datetime format") stata_epoch = datetime.datetime(1960, 1, 1) + # Handle NaTs + if date is NaT: + # Missing value for dates ('.'), assumed always double + # TODO: Should be moved so a const somewhere, and consolidated + return struct.unpack(' 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype == np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype == np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) + + return data + + class StataMissingValue(StringMixin): """ An observation's missing value. @@ -193,14 +255,23 @@ class StataMissingValue(StringMixin): ----- More information: """ - + # TODO: Needs test def __init__(self, offset, value): self._value = value - if type(value) is int or type(value) is long: - self._str = value - offset is 1 and \ - '.' or ('.' + chr(value - offset + 96)) + value_type = type(value) + if value_type in int: + loc = value - offset + elif value_type in (float, np.float32, np.float64): + if value <= np.finfo(np.float32).max: # float32 + conv_str, byte_loc, scale = ' strlen = struct.unpack('b', self.path_or_buf.read(1))[0] - self.time_stamp = self.path_or_buf.read(strlen) + self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen)) self.path_or_buf.read(26) # self.path_or_buf.read(8) # 0x0000000000000000 self.path_or_buf.read(8) # position of @@ -456,11 +555,11 @@ def _read_header(self): self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] if self.format_version > 105: - self.data_label = self.path_or_buf.read(81) + self.data_label = self._null_terminate(self.path_or_buf.read(81)) else: - self.data_label = self.path_or_buf.read(32) + self.data_label = self._null_terminate(self.path_or_buf.read(32)) if self.format_version > 104: - self.time_stamp = self.path_or_buf.read(18) + self.time_stamp = self._null_terminate(self.path_or_buf.read(18)) # descriptors if self.format_version > 108: @@ -556,8 +655,8 @@ def _col_size(self, k=None): def _unpack(self, fmt, byt): d = struct.unpack(self.byteorder + fmt, byt)[0] - if fmt[-1] in self.MISSING_VALUES: - nmin, nmax = self.MISSING_VALUES[fmt[-1]] + if fmt[-1] in self.VALID_RANGE: + nmin, nmax = self.VALID_RANGE[fmt[-1]] if d < nmin or d > nmax: if self._missing_values: return StataMissingValue(nmax, d) @@ -855,11 +954,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 
1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +978,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. " @@ -939,6 +1041,11 @@ class StataWriter(StataParser): byteorder : str Can be ">", "<", "little", or "big". The default is None which uses `sys.byteorder` + time_stamp : datetime + A date time to use when writing the file. Can be None, in which + case the current time is used. + dataset_label : str + A label for the data set. Should be 80 characters or smaller. Returns ------- @@ -957,10 +1064,13 @@ class StataWriter(StataParser): >>> writer.write_file() """ def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None): + encoding="latin-1", byteorder=None, time_stamp=None, + data_label=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -970,7 +1080,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1100,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatibility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data @@ -1017,7 +1130,8 @@ def __iter__(self): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._write_header() + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) self._write_descriptors() self._write_variable_labels() # write 5 zeros for expansion fields @@ -1054,7 +1168,7 @@ def _write_header(self, data_label=None, time_stamp=None): # format dd Mon yyyy hh:mm if time_stamp is None: time_stamp = datetime.datetime.now() - elif not isinstance(time_stamp, datetime): + elif not isinstance(time_stamp, datetime.datetime): raise ValueError("time_stamp should be datetime type") self._file.write( self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M")) @@ -1076,7 +1190,9 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, for c in name: if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_': name = name.replace(c, '_') - + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = '_' + name # Variable 
name may not start with a number if name[0] > '0' and name[0] < '9': name = '_' + name @@ -1181,7 +1297,7 @@ def _write_data_dates(self): self._write(var) else: if isnull(var): # this only matters for floats - var = MISSING_VALUES[typ] + var = MISSING_VALUES[TYPE_MAP[typ]] self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): diff --git a/pandas/io/tests/data/computer_sales_page.html b/pandas/io/tests/data/computer_sales_page.html new file mode 100644 index 0000000000000..ff2b031b58d64 --- /dev/null +++ b/pandas/io/tests/data/computer_sales_page.html @@ -0,0 +1,619 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 
[pandas/io/tests/data/computer_sales_page.html: the body of this new 619-line test fixture lost its HTML markup during extraction. The page holds a single table with a two-row header ("Three months ended April 30" / "Six months ended April 30", years 2013 and 2012, figures in millions) listing HP net revenue by segment: Personal Systems (Notebooks, Desktops, Workstations, Other), Printing (Supplies, Commercial Hardware, Consumer Hardware), the combined Printing and Personal Systems Group, Enterprise Group (Industry Standard Servers, Technology Services, Storage, Networking, Business Critical Systems), Enterprise Services (Infrastructure Technology Outsourcing, Application and Business Services), Software, HP Financial Services, Corporate Investments, total segments, eliminations of intersegment net revenue and other, and total HP consolidated net revenue of $27,582 / $30,693 / $55,941 / $60,729 million.]
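As an aside, not part of the patch: a rough sketch of how this fixture is consumed, based on the test_computer_sales_page tests added to test_html.py further down. The file path and the header=[0, 1] argument come from those tests; the flavor choice and the printed expectation are assumptions (the lxml-backed test is the one expected to parse the two-row header, while the BeautifulSoup/html5lib path is expected to raise CParserError).

    import pandas as pd

    # header=[0, 1] asks read_html to build the column labels from the first
    # two header rows of the table (a two-level MultiIndex).
    tables = pd.read_html('pandas/io/tests/data/computer_sales_page.html',
                          header=[0, 1], flavor='lxml')
    df = tables[0]
    print(df.columns.nlevels)   # expected: 2
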
diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1.dta rename to pandas/io/tests/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_v13.dta b/pandas/io/tests/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_v13.dta rename to pandas/io/tests/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/io/tests/data/stata2_113.dta new file mode 100644 index 0000000000000..09c90dca943d1 Binary files /dev/null and b/pandas/io/tests/data/stata2_113.dta differ diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2_114.dta similarity index 100% rename from pandas/io/tests/data/stata2.dta rename to pandas/io/tests/data/stata2_114.dta diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/io/tests/data/stata2_115.dta new file mode 100644 index 0000000000000..ad7dda3fdc4b3 Binary files /dev/null and b/pandas/io/tests/data/stata2_115.dta differ diff --git a/pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000..ad7dda3fdc4b3 Binary files /dev/null and b/pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats differ diff --git a/pandas/io/tests/data/stata2_v13.dta b/pandas/io/tests/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_v13.dta rename to pandas/io/tests/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3_113.dta b/pandas/io/tests/data/stata3_113.dta new file mode 100644 index 0000000000000..f78150a2e4326 Binary files /dev/null and b/pandas/io/tests/data/stata3_113.dta differ diff --git a/pandas/io/tests/data/stata3.dta b/pandas/io/tests/data/stata3_114.dta similarity index 100% rename from pandas/io/tests/data/stata3.dta rename to pandas/io/tests/data/stata3_114.dta diff --git a/pandas/io/tests/data/stata3_115.dta b/pandas/io/tests/data/stata3_115.dta new file mode 100644 index 0000000000000..1c4ad0dae8092 Binary files /dev/null and b/pandas/io/tests/data/stata3_115.dta differ diff --git a/pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000..1c4ad0dae8092 Binary files /dev/null and b/pandas/io/tests/data/stata3_115.dta~1dc157c... 
Added additional data files for testing alternative Stata file formats differ diff --git a/pandas/io/tests/data/stata3_v13.dta b/pandas/io/tests/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_v13.dta rename to pandas/io/tests/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4_113.dta b/pandas/io/tests/data/stata4_113.dta new file mode 100644 index 0000000000000..9d7d5abb1b921 Binary files /dev/null and b/pandas/io/tests/data/stata4_113.dta differ diff --git a/pandas/io/tests/data/stata4.dta b/pandas/io/tests/data/stata4_114.dta similarity index 100% rename from pandas/io/tests/data/stata4.dta rename to pandas/io/tests/data/stata4_114.dta diff --git a/pandas/io/tests/data/stata4_115.dta b/pandas/io/tests/data/stata4_115.dta new file mode 100644 index 0000000000000..2c68cfb393b9e Binary files /dev/null and b/pandas/io/tests/data/stata4_115.dta differ diff --git a/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000..2c68cfb393b9e Binary files /dev/null and b/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats differ diff --git a/pandas/io/tests/data/stata4_v13.dta b/pandas/io/tests/data/stata4_117.dta similarity index 100% rename from pandas/io/tests/data/stata4_v13.dta rename to pandas/io/tests/data/stata4_117.dta diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv new file mode 100644 index 0000000000000..8eb0c2854a740 --- /dev/null +++ b/pandas/io/tests/data/stata5.csv @@ -0,0 +1,19 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,,"a","a" +1,1,1,1,1,,"ab","b" +-1,-1,-1,-1,-1,,"abc","c" +100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d" +-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,2114-01-01,"1234567890","1" +,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,2012-02-29,"!","A" +100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d" +-127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,01jan2114,"1234567890","1" +,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,29feb2012,"!","A" +.z,.z,.z,.z,.z,,"&","Z" +,,,0,,,"1.23","!" +,,,,0,,"10jan1970","." 
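A minimal sketch, not part of the patch, of reading one of the fixtures above with the reader patched in pandas/io/stata.py; the file path and the date_td column name are taken from the files added here, the rest is illustrative. The *_113 and *_115 variants exercise the older dta format versions and the widened %td/%d date handling.

    from pandas.io.stata import read_stata

    # The fixture mixes byte/int/long/float/double columns, %td dates and
    # strings, so dtype conversion and elapsed-date decoding both get hit.
    df = read_stata('pandas/io/tests/data/stata5_115.dta')
    print(df.dtypes)
    print(df['date_td'].head())
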
diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/io/tests/data/stata5_113.dta new file mode 100644 index 0000000000000..3615928d55838 Binary files /dev/null and b/pandas/io/tests/data/stata5_113.dta differ diff --git a/pandas/io/tests/data/stata5_114.dta b/pandas/io/tests/data/stata5_114.dta new file mode 100644 index 0000000000000..bebc6a72c5e34 Binary files /dev/null and b/pandas/io/tests/data/stata5_114.dta differ diff --git a/pandas/io/tests/data/stata5_115.dta b/pandas/io/tests/data/stata5_115.dta new file mode 100644 index 0000000000000..c54bd62c24dd2 Binary files /dev/null and b/pandas/io/tests/data/stata5_115.dta differ diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv new file mode 100644 index 0000000000000..27a1dc64f530b --- /dev/null +++ b/pandas/io/tests/data/stata6.csv @@ -0,0 +1,6 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,1960-01-01,"a","a" +1,1,1,1,1,3014-12-31,"ab","b" +-1,-1,-1,-1,-1,2014-12-31,"abc","c" +100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","d" +-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta new file mode 100644 index 0000000000000..2e4795b167f26 Binary files /dev/null and b/pandas/io/tests/data/stata6_113.dta differ diff --git a/pandas/io/tests/data/stata6_114.dta b/pandas/io/tests/data/stata6_114.dta new file mode 100644 index 0000000000000..aa507e474dd43 Binary files /dev/null and b/pandas/io/tests/data/stata6_114.dta differ diff --git a/pandas/io/tests/data/stata6_115.dta b/pandas/io/tests/data/stata6_115.dta new file mode 100644 index 0000000000000..c513463868113 Binary files /dev/null and b/pandas/io/tests/data/stata6_115.dta differ diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index 9100673f99579..6cfe4bea01045 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -82,8 +82,8 @@ def test_skipinitialspace(self): header=None) result = reader.read() - self.assert_(np.array_equal(result[0], ['a', 'a', 'a', 'a'])) - self.assert_(np.array_equal(result[1], ['b', 'b', 'b', 'b'])) + self.assert_numpy_array_equal(result[0], ['a', 'a', 'a', 'a']) + self.assert_numpy_array_equal(result[1], ['b', 'b', 'b', 'b']) def test_parse_booleans(self): data = 'True\nFalse\nTrue\nTrue' @@ -100,8 +100,8 @@ def test_delimit_whitespace(self): header=None) result = reader.read() - self.assert_(np.array_equal(result[0], ['a', 'a', 'a'])) - self.assert_(np.array_equal(result[1], ['b', 'b', 'b'])) + self.assert_numpy_array_equal(result[0], ['a', 'a', 'a']) + self.assert_numpy_array_equal(result[1], ['b', 'b', 'b']) def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' @@ -110,7 +110,7 @@ def test_embedded_newline(self): result = reader.read() expected = ['a', 'hello\nthere', 'this'] - self.assert_(np.array_equal(result[0], expected)) + self.assert_numpy_array_equal(result[0], expected) def test_euro_decimal(self): data = '12345,67\n345,678' diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index a044b388c00c4..498e8ed58aee1 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ 
-387,7 +387,7 @@ def test_read_fred(self): def test_read_famafrench(self): for name in ("F-F_Research_Data_Factors", "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3", - "F-F_ST_Reversal_Factor"): + "F-F_ST_Reversal_Factor","F-F_Momentum_Factor"): ff = DataReader(name, "famafrench") assert ff assert isinstance(ff, dict) @@ -418,6 +418,8 @@ def test_fred_nan(self): @network def test_fred_parts(self): + raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') + start = datetime(2010, 1, 1) end = datetime(2013, 1, 27) df = web.get_data_fred("CPIAUCSL", start, end) @@ -444,6 +446,8 @@ def test_invalid_series(self): @network def test_fred_multi(self): + raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') + names = ['CPIAUCSL', 'CPALTT01USQ661S', 'CPILFESL'] start = datetime(2010, 1, 1) end = datetime(2013, 1, 27) diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py index 6aa1f7e1786a1..e1e6286aabcc1 100644 --- a/pandas/io/tests/test_date_converters.py +++ b/pandas/io/tests/test_date_converters.py @@ -49,7 +49,7 @@ def test_parse_date_time(self): datecols = {'date_time': [0, 1]} df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_time) - self.assert_('date_time' in df) + self.assertIn('date_time', df) self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0)) data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" @@ -73,7 +73,7 @@ def test_parse_date_fields(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_fields) - self.assert_('ymd' in df) + self.assertIn('ymd', df) self.assertEqual(df.ymd.ix[0], datetime(2001, 1, 10)) def test_datetime_six_col(self): @@ -90,7 +90,7 @@ def test_datetime_six_col(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_all_fields) - self.assert_('ymdHMS' in df) + self.assertIn('ymdHMS', df) self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0)) def test_datetime_fractional_seconds(self): @@ -103,7 +103,7 @@ def test_datetime_fractional_seconds(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_all_fields) - self.assert_('ymdHMS' in df) + self.assertIn('ymdHMS', df) self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0, microsecond=123456)) self.assertEqual(df.ymdHMS.ix[1], datetime(2001, 1, 5, 10, 0, 0, @@ -116,7 +116,7 @@ def test_generic(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=dateconverter) - self.assert_('ym' in df) + self.assertIn('ym', df) self.assertEqual(df.ym.ix[0], date(2001, 1, 1)) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 893b1768b00c3..3a7106fc6b4bb 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -22,6 +22,7 @@ from pandas.compat import map, zip, StringIO, string_types from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html +from pandas.parser import CParserError import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -143,7 +144,7 @@ def test_banklist(self): def test_spam_no_types(self): with tm.assert_produces_warning(FutureWarning): df1 = self.read_html(self.spam_data, '.*Water.*', - infer_types=False) + infer_types=False) with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) 
@@ -305,8 +306,11 @@ def test_bad_url_protocol(self): @network def test_invalid_url(self): - with tm.assertRaises(URLError): - self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') + try: + with tm.assertRaises(URLError): + self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') + except ValueError as e: + tm.assert_equal(str(e), 'No tables found') @slow def test_file_url(self): @@ -499,10 +503,10 @@ def test_gold_canyon(self): with open(self.banklist_data, 'r') as f: raw_text = f.read() - self.assert_(gc in raw_text) + self.assertIn(gc, raw_text) df = self.read_html(self.banklist_data, 'Gold Canyon', attrs={'id': 'table'})[0] - self.assert_(gc in df.to_string()) + self.assertIn(gc, df.to_string()) def test_different_number_of_rows(self): expected = """ @@ -581,6 +585,14 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_computer_sales_page(self): + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assertRaisesRegexp(CParserError, r"Passed header=\[0,1\] are " + "too many rows for this multi_index " + "of columns"): + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + class TestReadHtmlLxml(tm.TestCase): @classmethod @@ -631,6 +643,12 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_computer_sales_page(self): + import pandas as pd + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + def test_invalid_flavor(): url = 'google.com' diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 0bbf81384672e..1386439d51757 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -359,7 +359,7 @@ def test_multi(self): l = [self.frame['float'], self.frame['float'] .A, self.frame['float'].B, None] l_rec = self.encode_decode(l) - self.assert_(isinstance(l_rec, tuple)) + self.assertIsInstance(l_rec, tuple) check_arbitrary(l, l_rec) def test_iterator(self): diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 344f5a3f215b2..612840e82e3ff 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -302,11 +302,11 @@ def func(*date_cols): prefix='X', parse_dates={'nominal': [1, 2], 'actual': [1, 3]}) - self.assert_('nominal' in df) - self.assert_('actual' in df) - self.assert_('X1' not in df) - self.assert_('X2' not in df) - self.assert_('X3' not in df) + self.assertIn('nominal', df) + self.assertIn('actual', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) d = datetime(1999, 1, 27, 19, 0) self.assertEqual(df.ix[0, 'nominal'], d) @@ -316,12 +316,12 @@ def func(*date_cols): parse_dates={'nominal': [1, 2], 'actual': [1, 3]}, keep_date_col=True) - self.assert_('nominal' in df) - self.assert_('actual' in df) + self.assertIn('nominal', df) + self.assertIn('actual', df) - self.assert_(1 in df) - self.assert_(2 in df) - self.assert_(3 in df) + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -335,11 +335,11 @@ def func(*date_cols): prefix='X', parse_dates=[[1, 2], [1, 3]]) - self.assert_('X1_X2' in df) - self.assert_('X1_X3' in df) - self.assert_('X1' not in df) - self.assert_('X2' 
not in df) - self.assert_('X3' not in df) + self.assertIn('X1_X2', df) + self.assertIn('X1_X3', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) d = datetime(1999, 1, 27, 19, 0) self.assertEqual(df.ix[0, 'X1_X2'], d) @@ -347,11 +347,11 @@ def func(*date_cols): df = read_csv(StringIO(data), header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True) - self.assert_('1_2' in df) - self.assert_('1_3' in df) - self.assert_(1 in df) - self.assert_(2 in df) - self.assert_(3 in df) + self.assertIn('1_2', df) + self.assertIn('1_3', df) + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) data = '''\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -378,7 +378,7 @@ def test_multiple_date_cols_int_cast(self): # it works! df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, date_parser=conv.parse_date_time) - self.assert_('nominal' in df) + self.assertIn('nominal', df) def test_multiple_date_col_timestamp_parse(self): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -415,7 +415,7 @@ def test_multiple_date_cols_with_header(self): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) - self.assert_(not isinstance(df.nominal[0], compat.string_types)) + self.assertNotIsInstance(df.nominal[0], compat.string_types) ts_data = """\ ID,date,nominalTime,actualTime,A,B,C,D,E @@ -523,7 +523,7 @@ def test_malformed(self): StringIO(data), sep=',', header=1, comment='#') self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 4, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) # skip_footer data = """ignore @@ -540,7 +540,7 @@ def test_malformed(self): skip_footer=1) self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 4, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) # first chunk data = """ignore @@ -558,7 +558,7 @@ def test_malformed(self): df = it.read(5) self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 6, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) # middle chunk data = """ignore @@ -577,7 +577,7 @@ def test_malformed(self): it.read(2) self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 6, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) # last chunk data = """ignore @@ -596,7 +596,7 @@ def test_malformed(self): it.read() self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 6, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) def test_passing_dtype(self): @@ -810,9 +810,9 @@ def test_unnamed_columns(self): [11, 12, 13, 14, 15]] df = self.read_table(StringIO(data), sep=',') tm.assert_almost_equal(df.values, expected) - self.assert_(np.array_equal(df.columns, - ['A', 'B', 'C', 'Unnamed: 3', - 'Unnamed: 4'])) + self.assert_numpy_array_equal(df.columns, + ['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4']) def test_string_nas(self): data = """A,B,C @@ -873,8 +873,7 @@ def test_parse_dates_implicit_first_col(self): """ df = self.read_csv(StringIO(data), parse_dates=True) expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) - self.assert_( - isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) + 
self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) tm.assert_frame_equal(df, expected) def test_parse_dates_string(self): @@ -963,11 +962,11 @@ def test_no_header(self): tm.assert_almost_equal(df.values, expected) tm.assert_almost_equal(df.values, df2.values) - self.assert_(np.array_equal(df_pref.columns, - ['X0', 'X1', 'X2', 'X3', 'X4'])) - self.assert_(np.array_equal(df.columns, lrange(5))) + self.assert_numpy_array_equal(df_pref.columns, + ['X0', 'X1', 'X2', 'X3', 'X4']) + self.assert_numpy_array_equal(df.columns, lrange(5)) - self.assert_(np.array_equal(df2.columns, names)) + self.assert_numpy_array_equal(df2.columns, names) def test_no_header_prefix(self): data = """1,2,3,4,5 @@ -982,8 +981,8 @@ def test_no_header_prefix(self): [11, 12, 13, 14, 15]] tm.assert_almost_equal(df_pref.values, expected) - self.assert_(np.array_equal(df_pref.columns, - ['Field0', 'Field1', 'Field2', 'Field3', 'Field4'])) + self.assert_numpy_array_equal(df_pref.columns, + ['Field0', 'Field1', 'Field2', 'Field3', 'Field4']) def test_header_with_index_col(self): data = """foo,1,2,3 @@ -1004,10 +1003,9 @@ def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, parse_dates=True) - self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) self.assertEqual(df.index.name, 'index') - self.assert_(isinstance(df.index[0], (datetime, np.datetime64, - Timestamp))) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) self.assertEqual(df.values.dtype, np.float64) tm.assert_frame_equal(df, df2) @@ -1015,9 +1013,8 @@ def test_read_csv_no_index_name(self): df = self.read_csv(self.csv2, index_col=0, parse_dates=True) df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) - self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])) - self.assert_(isinstance(df.index[0], (datetime, np.datetime64, - Timestamp))) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) tm.assert_frame_equal(df, df2) @@ -1441,13 +1438,13 @@ def test_multi_index_parse_dates(self): 20090103,three,c,4,5 """ df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) - self.assert_(isinstance(df.index.levels[0][0], - (datetime, np.datetime64, Timestamp))) + self.assertIsInstance(df.index.levels[0][0], + (datetime, np.datetime64, Timestamp)) # specify columns out of order! 
df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) - self.assert_(isinstance(df2.index.levels[1][0], - (datetime, np.datetime64, Timestamp))) + self.assertIsInstance(df2.index.levels[1][0], + (datetime, np.datetime64, Timestamp)) def test_skip_footer(self): data = """A,B,C @@ -1480,7 +1477,7 @@ def test_no_unnamed_index(self): 2 2 2 e f """ df = self.read_table(StringIO(data), sep=' ') - self.assert_(df.index.name is None) + self.assertIsNone(df.index.name) def test_converters(self): data = """A,B,C,D @@ -1661,7 +1658,7 @@ def test_parse_tz_aware(self): stamp = result.index[0] self.assertEqual(stamp.minute, 39) try: - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) except AssertionError: # hello Yaroslav arr = result.index.to_pydatetime() result = tools.to_datetime(arr, utc=True)[0] @@ -1698,7 +1695,7 @@ def test_multiple_date_cols_chunked(self): chunks = list(reader) - self.assert_('nominalTime' not in df) + self.assertNotIn('nominalTime', df) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) @@ -2177,7 +2174,7 @@ def test_regex_separator(self): df = self.read_table(StringIO(data), sep='\s+') expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), index_col=0) - self.assert_(expected.index.name is None) + self.assertIsNone(expected.index.name) tm.assert_frame_equal(df, expected) def test_1000_fwf(self): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index b12915753127d..c579e8502eb84 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -59,6 +59,7 @@ def create_tempfile(path): """ create an unopened named temporary file """ return os.path.join(tempfile.gettempdir(),path) + @contextmanager def ensure_clean_store(path, mode='a', complevel=None, complib=None, fletcher32=False): @@ -77,6 +78,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, if mode == 'w' or mode == 'a': safe_remove(path) + @contextmanager def ensure_clean_path(path): """ @@ -95,6 +97,7 @@ def ensure_clean_path(path): for f in filenames: safe_remove(f) + # set these parameters so we don't have file sharing tables.parameters.MAX_NUMEXPR_THREADS = 1 tables.parameters.MAX_BLOSC_THREADS = 1 @@ -256,7 +259,6 @@ def test_api(self): self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,format='foo') self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,format='bar') - def test_api_default_format(self): # default_format option @@ -363,18 +365,18 @@ def test_contains(self): store['a'] = tm.makeTimeSeries() store['b'] = tm.makeDataFrame() store['foo/bar'] = tm.makeDataFrame() - self.assert_('a' in store) - self.assert_('b' in store) - self.assert_('c' not in store) - self.assert_('foo/bar' in store) - self.assert_('/foo/bar' in store) - self.assert_('/foo/b' not in store) - self.assert_('bar' not in store) + self.assertIn('a', store) + self.assertIn('b', store) + self.assertNotIn('c', store) + self.assertIn('foo/bar', store) + self.assertIn('/foo/bar', store) + self.assertNotIn('/foo/b', store) + self.assertNotIn('bar', store) # GH 2694 warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) store['node())'] = tm.makeDataFrame() - self.assert_('node())' in store) + self.assertIn('node())', store) def test_versioning(self): @@ -514,7 +516,7 @@ def test_open_args(self): if LooseVersion(tables.__version__) >= '3.0.0': # the file should not have actually been written - self.assert_(os.path.exists(path) is False) + 
self.assertFalse(os.path.exists(path)) def test_flush(self): @@ -777,12 +779,12 @@ def test_append_series(self): store.append('ss', ss) result = store['ss'] tm.assert_series_equal(result, ss) - self.assert_(result.name is None) + self.assertIsNone(result.name) store.append('ts', ts) result = store['ts'] tm.assert_series_equal(result, ts) - self.assert_(result.name is None) + self.assertIsNone(result.name) ns.name = 'foo' store.append('ns', ns) @@ -2257,7 +2259,6 @@ def test_remove_startstop(self): expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)]) assert_panel_equal(result, expected) - def test_remove_crit(self): with ensure_clean_store(self.path) as store: @@ -2370,8 +2371,11 @@ def test_terms(self): wp = tm.makePanel() p4d = tm.makePanel4D() + wpneg = Panel.fromDict({-1: tm.makeDataFrame(), 0: tm.makeDataFrame(), + 1: tm.makeDataFrame()}) store.put('wp', wp, table=True) store.put('p4d', p4d, table=True) + store.put('wpneg', wpneg, table=True) # panel result = store.select('wp', [Term( @@ -2432,6 +2436,18 @@ def test_terms(self): for t in terms: store.select('p4d', t) + with tm.assertRaisesRegexp(TypeError, 'Only named functions are supported'): + store.select('wp', Term('major_axis == (lambda x: x)("20130101")')) + + # check USub node parsing + res = store.select('wpneg', Term('items == -1')) + expected = Panel({-1: wpneg[-1]}) + tm.assert_panel_equal(res, expected) + + with tm.assertRaisesRegexp(NotImplementedError, + 'Unary addition not supported'): + store.select('wpneg', Term('items == +1')) + def test_term_compat(self): with ensure_clean_store(self.path) as store: @@ -2474,6 +2490,50 @@ def test_term_compat(self): expected = wp.loc[:,:,['A','B']] assert_panel_equal(result, expected) + def test_backwards_compat_without_term_object(self): + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis>20000102'), + ('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + assert_panel_equal(result, expected) + + store.remove('wp', ('major_axis>20000103')) + result = store.select('wp') + expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + assert_panel_equal(result, expected) + + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + # stringified datetimes + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2))]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + assert_panel_equal(result, expected) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2,0,0))]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + assert_panel_equal(result, expected) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('major_axis','=',[datetime.datetime(2000,1,2,0,0), + datetime.datetime(2000,1,3,0,0)])]) + expected = wp.loc[:,[Timestamp('20000102'),Timestamp('20000103')]] + assert_panel_equal(result, expected) + with 
tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [('minor_axis','=',['A','B'])]) + expected = wp.loc[:,:,['A','B']] + assert_panel_equal(result, expected) + def test_same_name_scoping(self): with ensure_clean_store(self.path) as store: @@ -3213,7 +3273,7 @@ def test_retain_index_attributes(self): index=date_range('2002-1-1',periods=3,freq='D')))) store.append('data',df2) - self.assert_(store.get_storer('data').info['index']['freq'] is None) + self.assertIsNone(store.get_storer('data').info['index']['freq']) # this is ok _maybe_remove(store,'df2') @@ -3248,7 +3308,7 @@ def test_retain_index_attributes2(self): df2 = DataFrame(dict(A = Series(lrange(3), index=idx2))) df2.to_hdf(path,'data',append=True) - self.assert_(read_hdf(path,'data').index.name is None) + self.assertIsNone(read_hdf(path,'data').index.name) def test_panel_select(self): @@ -3279,6 +3339,8 @@ def test_frame_select(self): date = df.index[len(df) // 2] crit1 = Term('index>=date') + self.assertEqual(crit1.env.scope['date'], date) + crit2 = ("columns=['A', 'D']") crit3 = ('columns=A') @@ -3492,7 +3554,7 @@ def f(): # valid result = store.select_column('df', 'index') tm.assert_almost_equal(result.values, Series(df.index).values) - self.assert_(isinstance(result,Series)) + self.assertIsInstance(result,Series) # not a data indexable column self.assertRaises( @@ -3560,7 +3622,7 @@ def test_coordinates(self): result = store.select('df', where=c) expected = df.ix[3:4, :] tm.assert_frame_equal(result, expected) - self.assert_(isinstance(c, Index)) + self.assertIsInstance(c, Index) # multiple tables _maybe_remove(store, 'df1') @@ -3732,7 +3794,6 @@ def test_select_as_multiple(self): self.assertRaises(ValueError, store.select_as_multiple, ['df1','df3'], where=['A>0', 'B>0'], selector='df1') - def test_nan_selection_bug_4858(self): # GH 4858; nan selection bug, only works for pytables >= 3.1 @@ -3783,6 +3844,10 @@ def test_select_filter_corner(self): result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) + crit = Term('columns=df.columns[:75:2]') + result = store.select('frame', [crit]) + tm.assert_frame_equal(result, df.ix[:, df.columns[:75:2]]) + def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} @@ -3829,10 +3894,10 @@ def test_multiple_open_close(self): # single store = HDFStore(path) - self.assert_('CLOSED' not in str(store)) + self.assertNotIn('CLOSED', str(store)) self.assert_(store.is_open) store.close() - self.assert_('CLOSED' in str(store)) + self.assertIn('CLOSED', str(store)) self.assert_(not store.is_open) with ensure_clean_path(self.path) as path: @@ -3852,20 +3917,20 @@ def f(): store1 = HDFStore(path) store2 = HDFStore(path) - self.assert_('CLOSED' not in str(store1)) - self.assert_('CLOSED' not in str(store2)) + self.assertNotIn('CLOSED', str(store1)) + self.assertNotIn('CLOSED', str(store2)) self.assert_(store1.is_open) self.assert_(store2.is_open) store1.close() - self.assert_('CLOSED' in str(store1)) + self.assertIn('CLOSED', str(store1)) self.assert_(not store1.is_open) - self.assert_('CLOSED' not in str(store2)) + self.assertNotIn('CLOSED', str(store2)) self.assert_(store2.is_open) store2.close() - self.assert_('CLOSED' in str(store1)) - self.assert_('CLOSED' in str(store2)) + self.assertIn('CLOSED', str(store1)) + self.assertIn('CLOSED', str(store2)) self.assert_(not store1.is_open) self.assert_(not store2.is_open) @@ -3876,11 +3941,11 @@ def f(): store2 = HDFStore(path) store2.append('df2',df) 
store2.close() - self.assert_('CLOSED' in str(store2)) + self.assertIn('CLOSED', str(store2)) self.assert_(not store2.is_open) store.close() - self.assert_('CLOSED' in str(store)) + self.assertIn('CLOSED', str(store)) self.assert_(not store.is_open) # double closing @@ -3889,11 +3954,11 @@ def f(): store2 = HDFStore(path) store.close() - self.assert_('CLOSED' in str(store)) + self.assertIn('CLOSED', str(store)) self.assert_(not store.is_open) store2.close() - self.assert_('CLOSED' in str(store2)) + self.assertIn('CLOSED', str(store2)) self.assert_(not store2.is_open) # ops on a closed store @@ -4183,6 +4248,7 @@ def test_query_with_nested_special_character(self): result = store.select('test', 'a = "test & test"') tm.assert_frame_equal(expected, result) + def _test_sort(obj): if isinstance(obj, DataFrame): return obj.reindex(sorted(obj.index)) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 2be086cddf7c4..0e26a66921df4 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -7,7 +7,7 @@ import nose import numpy as np -from pandas import DataFrame +from pandas import DataFrame, Series from pandas.compat import range, lrange, iteritems #from pandas.core.datetools import format as date_format @@ -554,6 +554,18 @@ def test_date_parsing(self): self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), "IntDateCol loaded with incorrect type") + def test_mixed_dtype_insert(self): + # see GH6509 + s1 = Series(2**25 + 1,dtype=np.int32) + s2 = Series(0.0,dtype=np.float32) + df = DataFrame({'s1': s1, 's2': s2}) + + # write and read again + df.to_sql("test_read_write", self.conn, index=False) + df2 = sql.read_table("test_read_write", self.conn) + + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) + class TestSQLAlchemy(_TestSQLAlchemy): """ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..a99420493d047 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,6 +1,7 @@ # pylint: disable=E1101 from datetime import datetime +import datetime as dt import os import warnings import nose @@ -27,22 +28,46 @@ def setUp(self): # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from: # http://stata-press.com/data/glmext.html self.dirpath = tm.get_data_path() - self.dta1 = os.path.join(self.dirpath, 'stata1.dta') - self.dta2 = os.path.join(self.dirpath, 'stata2.dta') - self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') + self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') + self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') + self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') + self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') + self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') + self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') + self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') - self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') + self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') + self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') + self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + self.dta7 = 
os.path.join(self.dirpath, 'cancer.dta') self.csv7 = os.path.join(self.dirpath, 'cancer.csv') + self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta') + self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') + self.dta9 = os.path.join(self.dirpath, 'lbw.dta') self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') - self.dta1_13 = os.path.join(self.dirpath, 'stata1_v13.dta') - self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta') - self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta') - self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') + + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') + self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') + self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') + self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') + self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') + self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') + self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -51,10 +76,10 @@ def read_csv(self, file): return read_csv(file, parse_dates=True) def test_read_dta1(self): - reader = StataReader(self.dta1) - parsed = reader.data() - reader_13 = StataReader(self.dta1_13) - parsed_13 = reader_13.data() + reader_114 = StataReader(self.dta1_114) + parsed_114 = reader_114.data() + reader_117 = StataReader(self.dta1_117) + parsed_117 = reader_117.data() # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], @@ -65,8 +90,8 @@ def test_read_dta1(self): # the casting doesn't fail so need to match stata here expected['float_miss'] = expected['float_miss'].astype(np.float32) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta2(self): if LooseVersion(sys.version) < '2.7': @@ -109,34 +134,48 @@ def test_read_dta2(self): 'monthly_date', 'quarterly_date', 'half_yearly_date', 'yearly_date'] ) + expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: - parsed = self.read_dta(self.dta2) - parsed_13 = self.read_dta(self.dta2_13) + parsed_114 = self.read_dta(self.dta2_114) + parsed_115 = self.read_dta(self.dta2_115) + parsed_117 = self.read_dta(self.dta2_117) + # 113 is buggy due ot limits date format support in Stata + # parsed_113 = self.read_dta(self.dta2_113) + np.testing.assert_equal( len(w), 1) # should get a warning for that format. 
# buggy test because of the NaT comparison on certain platforms - # - #tm.assert_frame_equal(parsed, expected) - #tm.assert_frame_equal(parsed_13, expected) + # Format 113 test fails since it does not support tc and tC formats + # tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta3(self): - parsed = self.read_dta(self.dta3) - parsed_13 = self.read_dta(self.dta3_13) + parsed_113 = self.read_dta(self.dta3_113) + parsed_114 = self.read_dta(self.dta3_114) + parsed_115 = self.read_dta(self.dta3_115) + parsed_117 = self.read_dta(self.dta3_117) # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta4(self): - parsed = self.read_dta(self.dta4) - parsed_13 = self.read_dta(self.dta4_13) + parsed_113 = self.read_dta(self.dta4_113) + parsed_114 = self.read_dta(self.dta4_114) + parsed_115 = self.read_dta(self.dta4_115) + parsed_117 = self.read_dta(self.dta4_117) + expected = DataFrame.from_records( [ ["one", "ten", "one", "one", "one"], @@ -153,11 +192,13 @@ def test_read_dta4(self): columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_write_dta5(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -171,10 +212,13 @@ def test_read_write_dta5(self): original) def test_write_dta6(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -201,14 +245,16 @@ def test_read_dta9(self): tm.assert_frame_equal(parsed, expected) def test_read_write_dta10(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', 'float', + columns=['string', 'object', 'integer', 'floating', 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -231,20 +277,21 @@ def test_encoding(self): if compat.PY3: expected = raw.kreis1849[0] self.assertEqual(result, expected) - 
self.assert_(isinstance(result, compat.string_types)) + self.assertIsInstance(result, compat.string_types) else: expected = raw.kreis1849.str.decode("latin-1")[0] self.assertEqual(result, expected) - self.assert_(isinstance(result, unicode)) + self.assertIsInstance(result, unicode) def test_read_write_dta11(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -256,13 +303,24 @@ def test_read_write_dta11(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) def test_read_write_dta12(self): - skip_if_not_little_endian() - - original = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) - formatted = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) + # skip_if_not_little_endian() + + original = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_1', + 'astringwithmorethan32characters_2', + '+', + '-', + 'short', + 'delete']) + formatted = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_', + '_0astringwithmorethan32character', + '_', + '_1_', + '_short', + '_delete']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -272,6 +330,75 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) + original.index.name = 'index' + + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + formatted) + + def test_read_write_reread_dta14(self): + expected = self.read_csv(self.csv14) + cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] + for col in cols: + expected[col] = expected[col].convert_objects(convert_numeric=True) + expected['float_'] = expected['float_'].astype(np.float32) + expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True) + + parsed_113 = self.read_dta(self.dta14_113) + parsed_113.index.name = 'index' + parsed_114 = self.read_dta(self.dta14_114) + parsed_114.index.name = 'index' + parsed_115 = self.read_dta(self.dta14_115) + parsed_115.index.name = 'index' + + tm.assert_frame_equal(parsed_114, parsed_113) + tm.assert_frame_equal(parsed_114, parsed_115) + + with tm.ensure_clean() as path: + parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) + + def test_read_write_reread_dta15(self): + expected = self.read_csv(self.csv15) + expected['byte_'] = expected['byte_'].astype(np.int8) + expected['int_'] = 
expected['int_'].astype(np.int16) + expected['long_'] = expected['long_'].astype(np.int32) + expected['float_'] = expected['float_'].astype(np.float32) + expected['double_'] = expected['double_'].astype(np.float64) + expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + + parsed_113 = self.read_dta(self.dta15_113) + parsed_114 = self.read_dta(self.dta15_114) + parsed_115 = self.read_dta(self.dta15_115) + + tm.assert_frame_equal(expected, parsed_114) + tm.assert_frame_equal(parsed_113, parsed_114) + tm.assert_frame_equal(parsed_114, parsed_115) + + def test_timestamp_and_label(self): + original = DataFrame([(1,)], columns=['var']) + time_stamp = datetime(2000, 2, 29, 14, 21) + data_label = 'This is a data file.' + with tm.ensure_clean() as path: + original.to_stata(path, time_stamp=time_stamp, data_label=data_label) + reader = StataReader(path) + parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M')) + assert parsed_time_stamp == time_stamp + assert reader.data_label == data_label + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index deb9065a2b5a6..a69c07494af8a 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -13,7 +13,7 @@ from pandas.core.common import (isnull, notnull, _pickle_array, _unpickle_array, _try_sort) from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices +from pandas.core.indexing import _maybe_convert_indices from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) @@ -346,10 +346,15 @@ def __getitem__(self, key): return self._get_item_cache(key) @Appender(DataFrame.get_value.__doc__, indents=0) - def get_value(self, index, col): - return self._get_item_cache(col).get_value(index) + def get_value(self, index, col, takeable=False): + if takeable is True: + series = self._iget_item_cache(col) + else: + series = self._get_item_cache(col) + + return series.get_value(index, takeable=takeable) - def set_value(self, index, col, value): + def set_value(self, index, col, value, takeable=False): """ Put single value at passed column and index @@ -358,6 +363,7 @@ def set_value(self, index, col, value): index : row label col : column label value : scalar value + takeable : interpret the index/col as indexers, default False Notes ----- @@ -369,19 +375,15 @@ def set_value(self, index, col, value): ------- frame : DataFrame """ - dense = self.to_dense().set_value(index, col, value) + dense = self.to_dense().set_value(index, col, value, takeable=takeable) return dense.to_sparse(kind=self._default_kind, fill_value=self._default_fill_value) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): + def _slice(self, slobj, axis=0, typ=None): if axis == 0: - if raise_on_error: - _check_slice_bounds(slobj, self.index) new_index = self.index[slobj] new_columns = self.columns else: - if raise_on_error: - _check_slice_bounds(slobj, self.columns) new_index = self.index new_columns = self.columns[slobj] diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 86dcf97c8bd3d..20bbc58cc908f 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -187,7 +187,7 @@ def _ixs(self, i, axis=0): return self.xs(key, axis=axis) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): + def _slice(self, slobj, axis=0, typ=None): 
""" for compat as we don't support Block Manager here """ diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index cf4060fa6d871..1c599653f9fc5 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -409,22 +409,23 @@ def get(self, label, default=None): else: return default - def get_value(self, label): + def get_value(self, label, takeable=False): """ Retrieve single value at passed index label Parameters ---------- index : label + takeable : interpret the index as indexers, default False Returns ------- value : scalar value """ - loc = self.index.get_loc(label) + loc = label if takeable is True else self.index.get_loc(label) return self._get_val_at(loc) - def set_value(self, label, value): + def set_value(self, label, value, takeable=False): """ Quickly set single value at passed label. If label is not contained, a new object is created with the label placed at the end of the result @@ -436,6 +437,7 @@ def set_value(self, label, value): Partial indexing with MultiIndex not allowed value : object Scalar value + takeable : interpret the index as indexers, default False Notes ----- @@ -450,7 +452,7 @@ def set_value(self, label, value): # if the label doesn't exist, we will create a new object here # and possibily change the index - new_values = values.set_value(label, value) + new_values = values.set_value(label, value, takeable=takeable) if new_values is not None: values = new_values new_index = values.index diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 8cbebad61c068..499114f6be9e6 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -269,7 +269,7 @@ def test_to_int_index(self): def test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) - self.assert_(index.to_block_index() is index) + self.assertIs(index.to_block_index(), index) class TestIntIndex(tm.TestCase): @@ -294,7 +294,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): def test_to_int_index(self): index = IntIndex(10, [2, 3, 4, 5, 6]) - self.assert_(index.to_int_index() is index) + self.assertIs(index.to_int_index(), index) class TestSparseOperators(tm.TestCase): diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 7523b2d912f6f..030fe5fb821c4 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -280,8 +280,8 @@ def test_constructor_nonnan(self): def test_copy_astype(self): cop = self.bseries.astype(np.float64) - self.assert_(cop is not self.bseries) - self.assert_(cop.sp_index is self.bseries.sp_index) + self.assertIsNot(cop, self.bseries) + self.assertIs(cop.sp_index, self.bseries.sp_index) self.assertEqual(cop.dtype, np.float64) cop2 = self.iseries.copy() @@ -360,7 +360,7 @@ def _check_getitem(sp, dense): def test_get_get_value(self): assert_almost_equal(self.bseries.get(10), self.bseries[10]) - self.assert_(self.bseries.get(len(self.bseries) + 1) is None) + self.assertIsNone(self.bseries.get(len(self.bseries) + 1)) dt = self.btseries.index[10] result = self.btseries.get(dt) @@ -405,7 +405,7 @@ def _compare_with_dense(sp): def _compare(idx): dense_result = dense.take(idx).values sparse_result = sp.take(idx) - self.assert_(isinstance(sparse_result, SparseSeries)) + self.assertIsInstance(sparse_result, SparseSeries) assert_almost_equal(dense_result, sparse_result.values.values) _compare([1., 2., 3., 4., 5., 0.]) @@ -524,7 +524,7 @@ def _compare_with_series(sps, new_index): # special cases same_index = 
self.bseries.reindex(self.bseries.index) assert_sp_series_equal(self.bseries, same_index) - self.assert_(same_index is not self.bseries) + self.assertIsNot(same_index, self.bseries) # corner cases sp = SparseSeries([], index=[]) @@ -547,7 +547,7 @@ def _check(values, index1, index2, fill_value): first_series = SparseSeries(values, sparse_index=index1, fill_value=fill_value) reindexed = first_series.sparse_reindex(index2) - self.assert_(reindexed.sp_index is index2) + self.assertIs(reindexed.sp_index, index2) int_indices1 = index1.to_int_index().indices int_indices2 = index2.to_int_index().indices @@ -652,7 +652,7 @@ def test_dropna(self): result = self.bseries.dropna() expected = self.bseries.to_dense().dropna() - self.assert_(not isinstance(result, SparseSeries)) + self.assertNotIsInstance(result, SparseSeries) tm.assert_series_equal(result, expected) def test_homogenize(self): @@ -699,7 +699,7 @@ def test_shift(self): index=np.arange(6)) shifted = series.shift(0) - self.assert_(shifted is not series) + self.assertIsNot(shifted, series) assert_sp_series_equal(shifted, series) f = lambda s: s.shift(1) @@ -1093,14 +1093,14 @@ def test_set_value(self): res.index = res.index.astype(object) res = self.frame.set_value('foobar', 'B', 1.5) - self.assert_(res is not self.frame) + self.assertIsNot(res, self.frame) self.assertEqual(res.index[-1], 'foobar') self.assertEqual(res.get_value('foobar', 'B'), 1.5) res2 = res.set_value('foobar', 'qux', 1.5) - self.assert_(res2 is not res) - self.assert_(np.array_equal(res2.columns, - list(self.frame.columns) + ['qux'])) + self.assertIsNot(res2, res) + self.assert_numpy_array_equal(res2.columns, + list(self.frame.columns) + ['qux']) self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) def test_fancy_index_misc(self): @@ -1126,7 +1126,7 @@ def test_getitem_overload(self): subindex = self.frame.index[indexer] subframe = self.frame[indexer] - self.assert_(np.array_equal(subindex, subframe.index)) + self.assert_numpy_array_equal(subindex, subframe.index) self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1]) def test_setitem(self): @@ -1200,15 +1200,15 @@ def test_delitem(self): C = self.frame['C'] del self.frame['B'] - self.assert_('B' not in self.frame) + self.assertNotIn('B', self.frame) assert_sp_series_equal(self.frame['A'], A) assert_sp_series_equal(self.frame['C'], C) del self.frame['D'] - self.assert_('D' not in self.frame) + self.assertNotIn('D', self.frame) del self.frame['A'] - self.assert_('A' not in self.frame) + self.assertNotIn('A', self.frame) def test_set_columns(self): self.frame.columns = self.frame.columns @@ -1247,7 +1247,7 @@ def test_apply(self): assert_frame_equal(broadcasted.to_dense(), self.frame.to_dense().apply(np.sum, broadcast=True)) - self.assert_(self.empty.apply(np.sqrt) is self.empty) + self.assertIs(self.empty.apply(np.sqrt), self.empty) from pandas.core import nanops applied = self.frame.apply(np.sum) @@ -1372,11 +1372,11 @@ def _check_frame(frame): # with copy=False reindexed = self.frame.reindex(self.frame.index, copy=False) reindexed['F'] = reindexed['A'] - self.assert_('F' in self.frame) + self.assertIn('F', self.frame) reindexed = self.frame.reindex(self.frame.index) reindexed['G'] = reindexed['A'] - self.assert_('G' not in self.frame) + self.assertNotIn('G', self.frame) def test_reindex_fill_value(self): rng = bdate_range('20110110', periods=20) @@ -1413,8 +1413,8 @@ def _check(frame): from_sparse_lp = spf.stack_sparse_frame(frame) - self.assert_(np.array_equal(from_dense_lp.values, - 
from_sparse_lp.values)) + self.assert_numpy_array_equal(from_dense_lp.values, + from_sparse_lp.values) _check(self.frame) _check(self.iframe) @@ -1624,7 +1624,7 @@ def _compare_with_dense(panel): slp = panel.to_frame() dlp = panel.to_dense().to_frame() - self.assert_(np.array_equal(slp.values, dlp.values)) + self.assert_numpy_array_equal(slp.values, dlp.values) self.assert_(slp.index.equals(dlp.index)) _compare_with_dense(self.panel) @@ -1656,7 +1656,7 @@ def test_setitem(self): def test_set_value(self): def _check_loc(item, major, minor, val=1.5): res = self.panel.set_value(item, major, minor, val) - self.assert_(res is not self.panel) + self.assertIsNot(res, self.panel) self.assertEquals(res.get_value(item, major, minor), val) _check_loc('ItemA', self.panel.major_axis[4], self.panel.minor_axis[3]) @@ -1669,7 +1669,7 @@ def test_delitem_pop(self): assert_almost_equal(self.panel.items, ['ItemA', 'ItemC', 'ItemD']) crackle = self.panel['ItemC'] pop = self.panel.pop('ItemC') - self.assert_(pop is crackle) + self.assertIs(pop, crackle) assert_almost_equal(self.panel.items, ['ItemA', 'ItemD']) self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') @@ -1702,7 +1702,7 @@ def _compare_with_dense(swp, items, major, minor): # test copying cp = self.panel.reindex(self.panel.major_axis, copy=True) cp['ItemA']['E'] = cp['ItemA']['A'] - self.assert_('E' not in self.panel['ItemA']) + self.assertNotIn('E', self.panel['ItemA']) def test_operators(self): def _check_ops(panel): diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index ca4bbc3c8868a..ec01113abc8f2 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -37,16 +37,31 @@ Parameters ---------- %s -window : Number of observations used for calculating statistic -min_periods : int +window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. +min_periods : int, default None Minimum number of observations in window required to have a value -freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic - time_rule is a legacy alias for freq - + (otherwise result is NA). +freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. +center : boolean, default False + Set the labels at the center of the window. + Returns ------- %s + +Notes +----- +By default, the result is set to the right edge of the window. This can be +changed to the center of the window by setting ``center=True``. + +The `freq` keyword is used to conform time series data to a specified +frequency by resampling the data. This is done with the default parameters +of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ @@ -97,14 +112,23 @@ Parameters ---------- %s -min_periods : int +min_periods : int, default None Minimum number of observations in window required to have a value -freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). +freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. 
Returns ------- %s + +Notes +----- +The `freq` keyword is used to conform time series data to a specified +frequency by resampling the data. This is done with the default parameters +of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ @@ -135,16 +159,25 @@ def rolling_count(arg, window, freq=None, center=False, time_rule=None): Parameters ---------- arg : DataFrame or numpy ndarray-like - window : Number of observations used for calculating statistic - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq` center : boolean, default False Whether the label should correspond with center of window - time_rule : Legacy alias for freq Returns ------- rolling_count : type of caller + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ arg = _conv_timerule(arg, freq, time_rule) window = min(window, len(arg)) @@ -161,7 +194,7 @@ def rolling_count(arg, window, freq=None, center=False, time_rule=None): return return_hook(result) -@Substitution("Unbiased moving covariance", _binary_arg_flex, _flex_retval) +@Substitution("Unbiased moving covariance.", _binary_arg_flex, _flex_retval) @Appender(_doc_template) def rolling_cov(arg1, arg2, window, min_periods=None, freq=None, center=False, time_rule=None): @@ -178,7 +211,7 @@ def _get_cov(X, Y): return rs -@Substitution("Moving sample correlation", _binary_arg_flex, _flex_retval) +@Substitution("Moving sample correlation.", _binary_arg_flex, _flex_retval) @Appender(_doc_template) def rolling_corr(arg1, arg2, window, min_periods=None, freq=None, center=False, time_rule=None): @@ -228,13 +261,17 @@ def _flex_binary_moment(arg1, arg2, f): def rolling_corr_pairwise(df, window, min_periods=None): """ Computes pairwise rolling correlation matrices as Panel whose items are - dates + dates. Parameters ---------- df : DataFrame window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). 
Returns ------- @@ -523,43 +560,57 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): return f -rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum') -rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum') -rolling_sum = _rolling_func(algos.roll_sum, 'Moving sum') -rolling_mean = _rolling_func(algos.roll_mean, 'Moving mean') -rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median') +rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum.') +rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum.') +rolling_sum = _rolling_func(algos.roll_sum, 'Moving sum.') +rolling_mean = _rolling_func(algos.roll_mean, 'Moving mean.') +rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median.') _ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw)) -rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation', +rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation.', check_minp=_require_min_periods(1)) -rolling_var = _rolling_func(algos.roll_var, 'Unbiased moving variance', +rolling_var = _rolling_func(algos.roll_var, 'Unbiased moving variance.', check_minp=_require_min_periods(1)) -rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness', +rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.', check_minp=_require_min_periods(3)) -rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis', +rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.', check_minp=_require_min_periods(4)) def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, center=False, time_rule=None): - """Moving quantile + """Moving quantile. Parameters ---------- arg : Series, DataFrame - window : Number of observations used for calculating statistic - quantile : 0 <= quantile <= 1 - min_periods : int + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. + quantile : float + 0 <= quantile <= 1 + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq` center : boolean, default False Whether the label should correspond with center of window - time_rule : Legacy alias for freq Returns ------- y : type of input argument + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ def call_cython(arg, window, minp, args=(), kwargs={}): @@ -571,21 +622,25 @@ def call_cython(arg, window, minp, args=(), kwargs={}): def rolling_apply(arg, window, func, min_periods=None, freq=None, center=False, time_rule=None, args=(), kwargs={}): - """Generic moving function application + """Generic moving function application. Parameters ---------- arg : Series, DataFrame - window : Number of observations used for calculating statistic + window : int + Size of the moving window. 
This is the number of observations used for + calculating the statistic. func : function Must produce a single value from an ndarray input - min_periods : int + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False Whether the label should correspond with center of window - time_rule : Legacy alias for freq args : tuple Passed on to func kwargs : dict @@ -594,6 +649,15 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, Returns ------- y : type of input argument + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ def call_cython(arg, window, minp, args, kwargs): minp = _use_window(minp, window) @@ -618,15 +682,17 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, treated as the window length and win_type is required win_type : str, default None Window type (see Notes) - min_periods : int - Minimum number of observations in window required to have a value. - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False Whether the label should correspond with center of window mean : boolean, default True If True computes weighted mean, else weighted sum - time_rule : Legacy alias for freq axis : {0, 1}, default 0 Returns @@ -651,6 +717,13 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). + + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
""" if isinstance(window, (list, tuple, np.ndarray)): if win_type is not None: @@ -722,23 +795,23 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): return f -expanding_max = _expanding_func(algos.roll_max2, 'Expanding maximum') -expanding_min = _expanding_func(algos.roll_min2, 'Expanding minimum') -expanding_sum = _expanding_func(algos.roll_sum, 'Expanding sum') -expanding_mean = _expanding_func(algos.roll_mean, 'Expanding mean') +expanding_max = _expanding_func(algos.roll_max2, 'Expanding maximum.') +expanding_min = _expanding_func(algos.roll_min2, 'Expanding minimum.') +expanding_sum = _expanding_func(algos.roll_sum, 'Expanding sum.') +expanding_mean = _expanding_func(algos.roll_mean, 'Expanding mean.') expanding_median = _expanding_func( - algos.roll_median_cython, 'Expanding median') + algos.roll_median_cython, 'Expanding median.') expanding_std = _expanding_func(_ts_std, - 'Unbiased expanding standard deviation', + 'Unbiased expanding standard deviation.', check_minp=_require_min_periods(2)) -expanding_var = _expanding_func(algos.roll_var, 'Unbiased expanding variance', +expanding_var = _expanding_func(algos.roll_var, 'Unbiased expanding variance.', check_minp=_require_min_periods(2)) expanding_skew = _expanding_func( - algos.roll_skew, 'Unbiased expanding skewness', + algos.roll_skew, 'Unbiased expanding skewness.', check_minp=_require_min_periods(3)) expanding_kurt = _expanding_func( - algos.roll_kurt, 'Unbiased expanding kurtosis', + algos.roll_kurt, 'Unbiased expanding kurtosis.', check_minp=_require_min_periods(4)) @@ -749,15 +822,22 @@ def expanding_count(arg, freq=None, center=False, time_rule=None): Parameters ---------- arg : DataFrame or numpy ndarray-like - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False - Whether the label should correspond with center of window - time_rule : Legacy alias for freq + Whether the label should correspond with center of window. Returns ------- expanding_count : type of caller + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ return rolling_count(arg, len(arg), freq=freq, center=center, time_rule=time_rule) @@ -765,29 +845,38 @@ def expanding_count(arg, freq=None, center=False, time_rule=None): def expanding_quantile(arg, quantile, min_periods=1, freq=None, center=False, time_rule=None): - """Expanding quantile + """Expanding quantile. Parameters ---------- arg : Series, DataFrame - quantile : 0 <= quantile <= 1 - min_periods : int + quantile : float + 0 <= quantile <= 1 + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. 
center : boolean, default False - Whether the label should correspond with center of window - time_rule : Legacy alias for freq + Whether the label should correspond with center of window. Returns ------- y : type of input argument + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ return rolling_quantile(arg, len(arg), quantile, min_periods=min_periods, freq=freq, center=center, time_rule=time_rule) -@Substitution("Unbiased expanding covariance", _binary_arg_flex, _flex_retval) +@Substitution("Unbiased expanding covariance.", _binary_arg_flex, _flex_retval) @Appender(_expanding_doc) def expanding_cov(arg1, arg2, min_periods=1, freq=None, center=False, time_rule=None): @@ -797,7 +886,7 @@ def expanding_cov(arg1, arg2, min_periods=1, freq=None, center=False, center=center, time_rule=time_rule) -@Substitution("Expanding sample correlation", _binary_arg_flex, _flex_retval) +@Substitution("Expanding sample correlation.", _binary_arg_flex, _flex_retval) @Appender(_expanding_doc) def expanding_corr(arg1, arg2, min_periods=1, freq=None, center=False, time_rule=None): @@ -810,12 +899,14 @@ def expanding_corr(arg1, arg2, min_periods=1, freq=None, center=False, def expanding_corr_pairwise(df, min_periods=1): """ Computes pairwise expanding correlation matrices as Panel whose items are - dates + dates. Parameters ---------- df : DataFrame min_periods : int, default 1 + Minimum number of observations in window required to have a value + (otherwise result is NA). Returns ------- @@ -829,20 +920,22 @@ def expanding_corr_pairwise(df, min_periods=1): def expanding_apply(arg, func, min_periods=1, freq=None, center=False, time_rule=None, args=(), kwargs={}): - """Generic expanding function application + """Generic expanding function application. Parameters ---------- arg : Series, DataFrame func : function Must produce a single value from an ndarray input - min_periods : int + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False - Whether the label should correspond with center of window - time_rule : Legacy alias for freq + Whether the label should correspond with center of window. args : tuple Passed on to func kwargs : dict @@ -851,6 +944,12 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False, Returns ------- y : type of input argument + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
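The expanding variants documented above are implemented as rolling computations whose window is the whole length of the input, so they differ only in how the window grows. A short sketch, assuming the module-level pd.expanding_* entry points (the sample series is illustrative):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0])

counts = pd.expanding_count(s)             # running count of non-NA observations
medians = pd.expanding_quantile(s, 0.5)    # running median (0 <= quantile <= 1)
totals = pd.expanding_apply(s, np.nansum)  # any reducer mapping an ndarray to a scalar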
""" window = len(arg) return rolling_apply(arg, window, func, min_periods=min_periods, freq=freq, diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 752d2f8ce16f2..82f96bd444429 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -881,7 +881,7 @@ def testFilterWithDictRHS(self): self.tsAssertEqual(exp_rhs2, rhs['x2']) def tsAssertEqual(self, ts1, ts2): - self.assert_(np.array_equal(ts1, ts2)) + self.assert_numpy_array_equal(ts1, ts2) if __name__ == '__main__': diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1bfe56c8fce45..32416dc975e64 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,9 +1,14 @@ import re import numpy as np import pandas.compat as compat +import pandas as pd from pandas.compat import u from pandas.core.base import FrozenList, FrozenNDArray from pandas.util.testing import assertRaisesRegexp, assert_isinstance +from pandas import Series, Index, DatetimeIndex, PeriodIndex +from pandas import _np_version_under1p7 +import nose + import pandas.util.testing as tm class CheckStringMixin(object): @@ -100,7 +105,7 @@ def setUp(self): def test_shallow_copying(self): original = self.container.copy() assert_isinstance(self.container.view(), FrozenNDArray) - self.assert_(not isinstance(self.container.view(np.ndarray), FrozenNDArray)) + self.assertFalse(isinstance(self.container.view(np.ndarray), FrozenNDArray)) self.assertIsNot(self.container.view(), self.container) self.assert_numpy_array_equal(self.container, original) # shallow copy should be the same too @@ -120,6 +125,101 @@ def test_values(self): self.assert_numpy_array_equal(self.container, original) self.assertEqual(vals[0], n) +class Ops(tm.TestCase): + def setUp(self): + self.int_index = tm.makeIntIndex(10) + self.float_index = tm.makeFloatIndex(10) + self.dt_index = tm.makeDateIndex(10) + self.period_index = tm.makePeriodIndex(10) + self.string_index = tm.makeStringIndex(10) + + arr = np.random.randn(10) + self.int_series = Series(arr, index=self.int_index) + self.float_series = Series(arr, index=self.int_index) + self.dt_series = Series(arr, index=self.dt_index) + self.period_series = Series(arr, index=self.period_index) + self.string_series = Series(arr, index=self.string_index) + + self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in ['int','float','dt','period','string'] for f in ['index','series'] ] + + def check_ops_properties(self, props, filter=None, ignore_failures=False): + for op in props: + for o in self.is_valid_objs: + + # if a filter, skip if it doesn't match + if filter is not None: + filt = o.index if isinstance(o, Series) else o + if not filter(filt): + continue + + try: + if isinstance(o, Series): + expected = Series(getattr(o.index,op),index=o.index) + else: + expected = getattr(o,op) + except (AttributeError): + if ignore_failures: + continue + + result = getattr(o,op) + + # these couuld be series, arrays or scalars + if isinstance(result,Series) and isinstance(expected,Series): + tm.assert_series_equal(result,expected) + elif isinstance(result,Index) and isinstance(expected,Index): + tm.assert_index_equal(result,expected) + elif isinstance(result,np.ndarray) and isinstance(expected,np.ndarray): + self.assert_numpy_array_equal(result,expected) + else: + self.assertEqual(result, expected) + + # freq raises AttributeError on an Int64Index because its not defined + # we mostly care about Series hwere anyhow + if not ignore_failures: + for o in self.not_valid_objs: + 
self.assertRaises(TypeError, lambda : getattr(o,op)) + +class TestIndexOps(Ops): + + def setUp(self): + super(TestIndexOps, self).setUp() + self.is_valid_objs = [ o for o in self.objs if o._allow_index_ops ] + self.not_valid_objs = [ o for o in self.objs if not o._allow_index_ops ] + + def test_ops(self): + if _np_version_under1p7: + raise nose.SkipTest("test only valid in numpy >= 1.7") + for op in ['max','min']: + for o in self.objs: + result = getattr(o,op)() + expected = getattr(o.values,op)() + self.assertEqual(result, expected) + +class TestDatetimeIndexOps(Ops): + _allowed = '_allow_datetime_index_ops' + + def setUp(self): + super(TestDatetimeIndexOps, self).setUp() + mask = lambda x: x._allow_datetime_index_ops or x._allow_period_index_ops + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['date','time','microsecond','nanosecond'], lambda x: isinstance(x,DatetimeIndex)) + +class TestPeriodIndexOps(Ops): + _allowed = '_allow_period_index_ops' + + def setUp(self): + super(TestPeriodIndexOps, self).setUp() + mask = lambda x: x._allow_datetime_index_ops or x._allow_period_index_ops + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['qyear'], lambda x: isinstance(x,PeriodIndex)) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 3b3b2becc82db..59bfce8d9d636 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -5,7 +5,7 @@ from nose.tools import assert_equal import numpy as np from pandas.tslib import iNaT, NaT -from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp +from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp, Float64Index from pandas import compat from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull, array_equivalent @@ -181,7 +181,11 @@ def test_array_equivalent(): assert not array_equivalent(np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) assert not array_equivalent(np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - + assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py index a8b9a88126861..0d38bb23d6aa7 100644 --- a/pandas/tests/test_compat.py +++ b/pandas/tests/test_compat.py @@ -11,10 +11,10 @@ import nose import pandas.util.testing as tm -class TestBuiltinIterators(unittest.TestCase): +class TestBuiltinIterators(tm.TestCase): def check_result(self, actual, expected, lengths): for (iter_res, list_res), exp, length in zip(actual, expected, lengths): - 
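The new test_array_equivalent cases exercise the internal array_equivalent helper, which differs from np.array_equal by treating missing values in matching positions as equal. A small sketch based on the assertions above (the example arrays are illustrative):

import numpy as np
from pandas import DatetimeIndex, Float64Index
from pandas.core.common import array_equivalent

# NaN == NaN is False element-wise, so np.array_equal says these differ ...
np.array_equal(np.array([1.0, np.nan]), np.array([1.0, np.nan]))     # False
# ... while array_equivalent only requires the NaNs to line up
array_equivalent(np.array([1.0, np.nan]), np.array([1.0, np.nan]))   # True

# the same holds for index types, including datetime-like missing values
array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan]))    # True
array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]))  # False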
self.assert_(not isinstance(iter_res, list)) + self.assertNotIsInstance(iter_res, list) tm.assert_isinstance(list_res, list) iter_res = list(iter_res) self.assertEqual(len(list_res), length) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index ac42266b3c4eb..5296be15f242e 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -127,15 +127,15 @@ def test_repr_truncation(self): for line, value in lzip(r.split('\n'), df['B']): if _strlen(value) + 1 > max_len: - self.assert_('...' in line) + self.assertIn('...', line) else: - self.assert_('...' not in line) + self.assertNotIn('...', line) with option_context("display.max_colwidth", 999999): - self.assert_('...' not in repr(df)) + self.assertNotIn('...', repr(df)) with option_context("display.max_colwidth", max_len + 2): - self.assert_('...' not in repr(df)) + self.assertNotIn('...', repr(df)) def test_repr_chop_threshold(self): df = DataFrame([[0.1, 0.5],[0.5, -0.1]]) @@ -831,7 +831,7 @@ def test_wide_repr_named(self): self.assert_(len(wider_repr) < len(wide_repr)) for line in wide_repr.splitlines()[1::13]: - self.assert_('DataFrame Index' in line) + self.assertIn('DataFrame Index', line) reset_option('display.expand_frame_repr') @@ -855,7 +855,7 @@ def test_wide_repr_multiindex(self): self.assert_(len(wider_repr) < len(wide_repr)) for line in wide_repr.splitlines()[1::13]: - self.assert_('Level 0 Level 1' in line) + self.assertIn('Level 0 Level 1', line) reset_option('display.expand_frame_repr') @@ -979,7 +979,7 @@ def test_to_string(self): buf = StringIO() retval = biggie.to_string(buf=buf) - self.assert_(retval is None) + self.assertIsNone(retval) self.assertEqual(buf.getvalue(), s) tm.assert_isinstance(s, compat.string_types) @@ -1208,7 +1208,7 @@ def test_to_html(self): buf = StringIO() retval = biggie.to_html(buf=buf) - self.assert_(retval is None) + self.assertIsNone(retval) self.assertEqual(buf.getvalue(), s) tm.assert_isinstance(s, compat.string_types) @@ -1251,7 +1251,7 @@ def test_to_html_with_no_bold(self): def test_to_html_columns_arg(self): result = self.frame.to_html(columns=['A']) - self.assert_('' not in result) + self.assertNotIn('', result) def test_to_html_multiindex(self): columns = pandas.MultiIndex.from_tuples(list(zip(np.arange(2).repeat(2), @@ -1417,13 +1417,13 @@ def test_to_html_index(self): index=index) result = df.to_html(index=False) for i in index: - self.assert_(i not in result) + self.assertNotIn(i, result) tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] df.index = pandas.MultiIndex.from_tuples(tuples) result = df.to_html(index=False) for i in ['foo', 'bar', 'car', 'bike']: - self.assert_(i not in result) + self.assertNotIn(i, result) def test_repr_html(self): self.frame._repr_html_() @@ -1574,11 +1574,11 @@ def get_ipython(): {'parent_appname': 'ipython-qtconsole'}}} repstr = self.frame._repr_html_() - self.assert_(repstr is not None) + self.assertIsNotNone(repstr) fmt.set_option('display.max_rows', 5, 'display.max_columns', 2) repstr = self.frame._repr_html_() - self.assert_('class' in repstr) # info fallback + self.assertIn('class', repstr) # info fallback fmt.reset_option('^display.') @@ -1787,6 +1787,12 @@ def test_to_csv_escapechar(self): with open(path, 'r') as f: self.assertEqual(f.read(), expected) + def test_csv_to_string(self): + df = DataFrame({'col' : [1,2]}) + expected = ',col\n0,1\n1,2\n' + self.assertEqual(df.to_csv(), expected) + + class TestSeriesFormatting(tm.TestCase): _multiprocess_can_split_ = True @@ -1807,7 +1813,7 @@ def 
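The new test_csv_to_string case pins down that DataFrame.to_csv with no path or buffer returns the CSV text rather than writing a file; a quick sketch of the same behaviour:

import pandas as pd

df = pd.DataFrame({'col': [1, 2]})

# no path/buffer: the CSV is returned as a string
text = df.to_csv()
assert text == ',col\n0,1\n1,2\n'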
test_to_string(self): s = self.ts.to_string() retval = self.ts.to_string(buf=buf) - self.assert_(retval is None) + self.assertIsNone(retval) self.assertEqual(buf.getvalue().strip(), s) # pass float_format @@ -1888,9 +1894,9 @@ def test_float_trim_zeros(self): if line.startswith('dtype:'): continue if _three_digit_exp(): - self.assert_('+010' in line) + self.assertIn('+010', line) else: - self.assert_('+10' in line) + self.assertIn('+10', line) def test_datetimeindex(self): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index f85c95e8b81db..4758670517df0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -117,9 +117,9 @@ def test_getitem(self): self.assert_(tm.equalContents(series.index, sl.index)) for key, _ in compat.iteritems(self.frame._series): - self.assert_(self.frame[key] is not None) + self.assertIsNotNone(self.frame[key]) - self.assert_('random' not in self.frame) + self.assertNotIn('random', self.frame) with assertRaisesRegexp(KeyError, 'no item named random'): self.frame['random'] @@ -144,14 +144,14 @@ def test_get(self): b = self.frame.get('B') assert_series_equal(b, self.frame['B']) - self.assert_(self.frame.get('foo') is None) + self.assertIsNone(self.frame.get('foo')) assert_series_equal(self.frame.get('foo', self.frame['B']), self.frame['B']) # None # GH 5652 for df in [DataFrame(), DataFrame(columns=list('AB')), DataFrame(columns=list('AB'),index=range(3)) ]: result = df.get(None) - self.assert_(result is None) + self.assertIsNone(result) def test_getitem_iterator(self): idx = iter(['A', 'B', 'C']) @@ -399,7 +399,7 @@ def test_setitem(self): # not sure what else to do here series = self.frame['A'][::2] self.frame['col5'] = series - self.assert_('col5' in self.frame) + self.assertIn('col5', self.frame) tm.assert_dict_equal(series, self.frame['col5'], compare_keys=False) @@ -551,7 +551,7 @@ def test_setitem_corner(self): index=np.arange(3)) del df['B'] df['B'] = [1., 2., 3.] 
- self.assert_('B' in df) + self.assertIn('B', df) self.assertEqual(len(df.columns), 2) df['A'] = 'beginning' @@ -614,15 +614,15 @@ def test_setitem_ambig(self): dm[0] = np.ones(3) self.assertEqual(len(dm.columns), 3) - # self.assert_(dm.objects is None) + # self.assertIsNone(dm.objects) dm[1] = coercable_series self.assertEqual(len(dm.columns), 3) - # self.assert_(dm.objects is None) + # self.assertIsNone(dm.objects) dm[2] = uncoercable_series self.assertEqual(len(dm.columns), 3) - # self.assert_(dm.objects is not None) + # self.assertIsNotNone(dm.objects) self.assertEqual(dm[2].dtype, np.object_) def test_setitem_clear_caches(self): @@ -637,7 +637,7 @@ def test_setitem_clear_caches(self): df.ix[2:, 'z'] = 42 expected = Series([np.nan, np.nan, 42, 42], index=df.index) - self.assert_(df['z'] is not foo) + self.assertIsNot(df['z'], foo) assert_series_equal(df['z'], expected) def test_setitem_None(self): @@ -1046,7 +1046,7 @@ def test_getitem_fancy_1d(self): ix = f.ix # return self if no slicing...for now - self.assert_(ix[:, :] is f) + self.assertIs(ix[:, :], f) # low dimensional slice xs1 = ix[2, ['C', 'B', 'A']] @@ -1532,7 +1532,7 @@ def test_set_value(self): def test_set_value_resize(self): res = self.frame.set_value('foobar', 'B', 0) - self.assert_(res is self.frame) + self.assertIs(res, self.frame) self.assertEqual(res.index[-1], 'foobar') self.assertEqual(res.get_value('foobar', 'B'), 0) @@ -1761,7 +1761,7 @@ def test_copy_index_name_checking(self): ind.name = None cp = self.frame.copy() getattr(cp, attr).name = 'foo' - self.assert_(getattr(self.frame, attr).name is None) + self.assertIsNone(getattr(self.frame, attr).name) def test_getitem_pop_assign_name(self): s = self.frame['A'] @@ -1943,8 +1943,8 @@ def test_get_axis(self): self.assertEquals(f._get_axis_name('rows'), 'index') self.assertEquals(f._get_axis_name('columns'), 'columns') - self.assert_(f._get_axis(0) is f.index) - self.assert_(f._get_axis(1) is f.columns) + self.assertIs(f._get_axis(0), f.index) + self.assertIs(f._get_axis(1), f.columns) assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, 2) assertRaisesRegexp(ValueError, 'No axis.*foo', f._get_axis_name, 'foo') @@ -1957,7 +1957,7 @@ def test_set_index(self): # cache it _ = self.mixed_frame['foo'] self.mixed_frame.index = idx - self.assert_(self.mixed_frame['foo'].index is idx) + self.assertIs(self.mixed_frame['foo'].index, idx) with assertRaisesRegexp(ValueError, 'Length mismatch'): self.mixed_frame.index = idx[::2] @@ -2060,7 +2060,7 @@ def test_set_index_nonuniq(self): 'E': np.random.randn(5)}) with assertRaisesRegexp(ValueError, 'Index has duplicate keys'): df.set_index('A', verify_integrity=True, inplace=True) - self.assert_('A' in df) + self.assertIn('A', df) def test_set_index_bug(self): # GH1590 @@ -2092,6 +2092,39 @@ def test_set_index_cast_datetimeindex(self): idf = df.set_index('A') tm.assert_isinstance(idf.index, DatetimeIndex) + # don't cast a DatetimeIndex WITH a tz, leave as object + # GH 6032 + i = pd.DatetimeIndex(pd.tseries.tools.to_datetime(['2013-1-1 13:00','2013-1-2 14:00'], errors="raise")).tz_localize('US/Pacific') + df = DataFrame(np.random.randn(2,1),columns=['A']) + + expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), + pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')], dtype="object")) + + # convert index to series + result = Series(i) + assert_series_equal(result, expected) + + # assignt to frame + df['B'] = i + result = df['B'] + assert_series_equal(result, expected) + + # keep 
the timezone + result = i.to_series(keep_tz=True) + assert_series_equal(result.reset_index(drop=True), expected) + + # convert to utc + df['C'] = i.to_series().reset_index(drop=True) + result = df['C'] + comp = DatetimeIndex(expected.values).copy() + comp.tz = None + self.assert_numpy_array_equal(result.values, comp.values) + + # list of datetimes with a tz + df['D'] = i.to_pydatetime() + result = df['D'] + assert_series_equal(result, expected) + def test_set_index_multiindexcolumns(self): columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) @@ -2122,7 +2155,7 @@ def test_set_columns(self): def test_keys(self): getkeys = self.frame.keys - self.assert_(getkeys() is self.frame.columns) + self.assertIs(getkeys(), self.frame.columns) def test_column_contains_typeerror(self): try: @@ -2174,7 +2207,7 @@ def test_constructor_dtype_nocast_view(self): def test_constructor_dtype_list_data(self): df = DataFrame([[1, '2'], [None, 'a']], dtype=object) - self.assert_(df.ix[1, 0] is None) + self.assertIsNone(df.ix[1, 0]) self.assertEqual(df.ix[0, 1], '2') def test_constructor_list_frames(self): @@ -2287,7 +2320,7 @@ def test_constructor_dict(self): columns=['col2', 'col3', 'col4']) self.assertEqual(len(frame), len(self.ts2)) - self.assert_('col1' not in frame) + self.assertNotIn('col1', frame) self.assert_(isnull(frame['col3']).all()) # Corner cases @@ -2305,13 +2338,13 @@ def test_constructor_dict(self): # empty dict plus index idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx) - self.assert_(frame.index is idx) + self.assertIs(frame.index, idx) # empty with index and columns idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx, columns=idx) - self.assert_(frame.index is idx) - self.assert_(frame.columns is idx) + self.assertIs(frame.index, idx) + self.assertIs(frame.columns, idx) self.assertEqual(len(frame._series), 3) # with dict of empty list and Series @@ -3256,6 +3289,23 @@ def test_column_dups2(self): result = df2.drop('C',axis=1) assert_frame_equal(result, expected) + # dropna + df = DataFrame({'A' : np.random.randn(5), + 'B' : np.random.randn(5), + 'C' : np.random.randn(5), + 'D' : ['a','b','c','d','e'] }) + df.iloc[2,[0,1,2]] = np.nan + df.iloc[0,0] = np.nan + df.iloc[1,1] = np.nan + df.iloc[:,3] = np.nan + expected = df.dropna(subset=['A','B','C'],how='all') + expected.columns = ['A','A','B','C'] + + df.columns = ['A','A','B','C'] + + result = df.dropna(subset=['A','C'],how='all') + assert_frame_equal(result, expected) + def test_column_dups_indexing(self): def check(result, expected=None): @@ -3606,7 +3656,7 @@ def test_new_empty_index(self): df1 = DataFrame(randn(0, 3)) df2 = DataFrame(randn(0, 3)) df1.index.name = 'foo' - self.assert_(df2.index.name is None) + self.assertIsNone(df2.index.name) def test_astype(self): casted = self.frame.astype(int) @@ -3700,8 +3750,8 @@ def test_astype_cast_nan_int(self): def test_array_interface(self): result = np.sqrt(self.frame) tm.assert_isinstance(result, type(self.frame)) - self.assert_(result.index is self.frame.index) - self.assert_(result.columns is self.frame.columns) + self.assertIs(result.index, self.frame.index) + self.assertIs(result.columns, self.frame.columns) assert_frame_equal(result, self.frame.apply(np.sqrt)) @@ -3810,7 +3860,7 @@ def test_from_records_to_records(self): records = indexed_frame.to_records(index=False) self.assertEqual(len(records.dtype.names), 2) - self.assert_('index' not in records.dtype.names) + self.assertNotIn('index', records.dtype.names) 
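The GH 6032 test above covers how a tz-aware DatetimeIndex behaves when it is turned into a column or Series: the timezone is preserved by keeping tz-aware Timestamp objects (object dtype) instead of casting to naive datetime64. A sketch of the same round trips (index values taken from the test, the frame is illustrative):

import numpy as np
import pandas as pd

i = pd.DatetimeIndex(['2013-01-01 13:00', '2013-01-02 14:00']).tz_localize('US/Pacific')
df = pd.DataFrame(np.random.randn(2, 1), columns=['A'])

# assignment keeps the timezone by storing tz-aware Timestamps (object dtype)
df['B'] = i

# to_series(keep_tz=True) preserves the timezone as well, while the default
# to_series() hands back the underlying naive datetime64 values
with_tz = i.to_series(keep_tz=True)
naive = i.to_series().reset_index(drop=True)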
def test_from_records_nones(self): tuples = [(1, 2, None, 3), @@ -4007,16 +4057,16 @@ def test_to_recods_index_name(self): df = DataFrame(np.random.randn(3, 3)) df.index.name = 'X' rs = df.to_records() - self.assert_('X' in rs.dtype.fields) + self.assertIn('X', rs.dtype.fields) df = DataFrame(np.random.randn(3, 3)) rs = df.to_records() - self.assert_('index' in rs.dtype.fields) + self.assertIn('index', rs.dtype.fields) df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')]) df.index.names = ['A', None] rs = df.to_records() - self.assert_('level_0' in rs.dtype.fields) + self.assertIn('level_0', rs.dtype.fields) def test_join_str_datetime(self): str_dates = ['20120209', '20120222'] @@ -4174,10 +4224,10 @@ def test_from_records_len0_with_columns(self): def test_get_agg_axis(self): cols = self.frame._get_agg_axis(0) - self.assert_(cols is self.frame.columns) + self.assertIs(cols, self.frame.columns) idx = self.frame._get_agg_axis(1) - self.assert_(idx is self.frame.index) + self.assertIs(idx, self.frame.index) self.assertRaises(ValueError, self.frame._get_agg_axis, 2) @@ -4340,7 +4390,7 @@ def test_repr_column_name_unicode_truncation_bug(self): ' the File through the code..')}) result = repr(df) - self.assert_('StringCol' in result) + self.assertIn('StringCol', result) def test_head_tail(self): assert_frame_equal(self.frame.head(), self.frame[:5]) @@ -4404,17 +4454,17 @@ def test_insert(self): def test_delitem(self): del self.frame['A'] - self.assert_('A' not in self.frame) + self.assertNotIn('A', self.frame) def test_pop(self): self.frame.columns.name = 'baz' A = self.frame.pop('A') - self.assert_('A' not in self.frame) + self.assertNotIn('A', self.frame) self.frame['foo'] = 'bar' foo = self.frame.pop('foo') - self.assert_('foo' not in self.frame) + self.assertNotIn('foo', self.frame) # TODO self.assertEqual(self.frame.columns.name, 'baz') def test_pop_non_unique_cols(self): @@ -4714,7 +4764,7 @@ def test_constructor_lists_to_object_dtype(self): # from #1074 d = DataFrame({'a': [np.nan, False]}) self.assertEqual(d['a'].dtype, np.object_) - self.assert_(d['a'][1] is False) + self.assertFalse(d['a'][1]) def test_constructor_with_nas(self): # GH 5016 @@ -5162,7 +5212,7 @@ def test_combineSeries(self): for key, s in compat.iteritems(self.frame): assert_series_equal(larger_added[key], s + series[key]) - self.assert_('E' in larger_added) + self.assertIn('E', larger_added) self.assert_(np.isnan(larger_added['E']).all()) # vs mix (upcast) as needed @@ -5226,7 +5276,7 @@ def test_combineFunc(self): _check_mixed_float(result, dtype = dict(C = None)) result = self.empty * 2 - self.assert_(result.index is self.empty.index) + self.assertIs(result.index, self.empty.index) self.assertEqual(len(result.columns), 0) def test_comparisons(self): @@ -6356,7 +6406,7 @@ def test_asfreq(self): # test does not blow up on length-0 DataFrame zero_length = self.tsframe.reindex([]) result = zero_length.asfreq('BM') - self.assert_(result is not zero_length) + self.assertIsNot(result, zero_length) def test_asfreq_datetimeindex(self): df = DataFrame({'A': [1, 2, 3]}, @@ -6473,11 +6523,11 @@ def test_deepcopy(self): def test_copy(self): cop = self.frame.copy() cop['E'] = cop['A'] - self.assert_('E' not in self.frame) + self.assertNotIn('E', self.frame) # copy objects copy = self.mixed_frame.copy() - self.assert_(copy._data is not self.mixed_frame._data) + self.assertIsNot(copy._data, self.mixed_frame._data) def _check_method(self, method='pearson', check_minp=False): if not check_minp: @@ -6614,10 +6664,10 @@ 
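The to_records assertions above check how the index is carried into the resulting record array. A brief sketch of the behaviour being asserted (the frame is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(3, 3))

# an unnamed index appears in the record array as 'index'
assert 'index' in df.to_records().dtype.fields

# a named index keeps its name ...
df.index.name = 'X'
assert 'X' in df.to_records().dtype.fields

# ... and to_records(index=False) drops the index entirely
assert 'index' not in df.to_records(index=False).dtype.names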
def test_corrwith(self): dropped = a.corrwith(b, axis=0, drop=True) assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) - self.assert_('B' not in dropped) + self.assertNotIn('B', dropped) dropped = a.corrwith(b, axis=1, drop=True) - self.assert_(a.index[-1] not in dropped.index) + self.assertNotIn(a.index[-1], dropped.index) # non time-series data index = ['a', 'b', 'c', 'd', 'e'] @@ -7117,7 +7167,7 @@ def test_fillna_inplace(self): df[3][-4:] = np.nan expected = df.fillna(value=0) - self.assert_(expected is not df) + self.assertIsNot(expected, df) df.fillna(value=0, inplace=True) assert_frame_equal(df, expected) @@ -7125,7 +7175,7 @@ def test_fillna_inplace(self): df[1][:4] = np.nan df[3][-4:] = np.nan expected = df.fillna(method='ffill') - self.assert_(expected is not df) + self.assertIsNot(expected, df) df.fillna(method='ffill', inplace=True) assert_frame_equal(df, expected) @@ -8039,6 +8089,19 @@ def test_replace_with_dict_with_bool_keys(self): with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): df.replace({'asdf': 'asdb', True: 'yes'}) + def test_replace_int_to_int_chain(self): + df = DataFrame({'a': lrange(1, 5)}) + with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"): + df.replace({'a': dict(zip(range(1, 5), range(2, 6)))}) + + def test_replace_str_to_str_chain(self): + a = np.arange(1, 5) + astr = a.astype(str) + bstr = np.arange(2, 6).astype(str) + df = DataFrame({'a': astr}) + with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"): + df.replace({'a': dict(zip(astr, bstr))}) + def test_combine_multiple_frames_dtypes(self): # GH 2759 @@ -8266,7 +8329,7 @@ def test_reindex(self): # Same index, copies values but not index if copy=False newFrame = self.frame.reindex(self.frame.index, copy=False) - self.assert_(newFrame.index is self.frame.index) + self.assertIs(newFrame.index, self.frame.index) # length zero newFrame = self.frame.reindex([]) @@ -8327,7 +8390,7 @@ def test_reindex_columns(self): assert_series_equal(newFrame['B'], self.frame['B']) self.assert_(np.isnan(newFrame['E']).all()) - self.assert_('C' not in newFrame) + self.assertNotIn('C', newFrame) # length zero newFrame = self.frame.reindex(columns=[]) @@ -8407,10 +8470,10 @@ def test_reindex_dups(self): def test_align(self): af, bf = self.frame.align(self.frame) - self.assert_(af._data is not self.frame._data) + self.assertIsNot(af._data, self.frame._data) af, bf = self.frame.align(self.frame, copy=False) - self.assert_(af._data is self.frame._data) + self.assertIs(af._data, self.frame._data) # axis = 0 other = self.frame.ix[:-5, :3] @@ -8885,15 +8948,15 @@ def test_rename_nocopy(self): def test_rename_inplace(self): self.frame.rename(columns={'C': 'foo'}) - self.assert_('C' in self.frame) - self.assert_('foo' not in self.frame) + self.assertIn('C', self.frame) + self.assertNotIn('foo', self.frame) c_id = id(self.frame['C']) frame = self.frame.copy() frame.rename(columns={'C': 'foo'}, inplace=True) - self.assert_('C' not in frame) - self.assert_('foo' in frame) + self.assertNotIn('C', frame) + self.assertIn('foo', frame) self.assertNotEqual(id(frame['foo']), c_id) def test_rename_bug(self): @@ -9089,7 +9152,7 @@ def test_apply(self): d = self.frame.index[0] applied = self.frame.apply(np.mean, axis=1) self.assertEqual(applied[d], np.mean(self.frame.xs(d))) - self.assert_(applied.index is self.frame.index) # want this + self.assertIs(applied.index, self.frame.index) # want this # invalid axis df = DataFrame( @@ -9215,7 +9278,7 @@ def _checkit(axis=0, raw=False): if is_reduction: 
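The two new replace tests assert that a nested replacement dict whose keys and values overlap (1 -> 2, 2 -> 3, ...) is rejected, since applying it sequentially would chain replacements. A sketch of the guarded behaviour:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4]})

# keys and values overlap, so the intent is ambiguous; this now raises
# ValueError ("Replacement not allowed ...") instead of chaining silently
try:
    df.replace({'a': {1: 2, 2: 3, 3: 4, 4: 5}})
except ValueError as err:
    print(err)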
agg_axis = df._get_agg_axis(axis) tm.assert_isinstance(res, Series) - self.assert_(res.index is agg_axis) + self.assertIs(res.index, agg_axis) else: tm.assert_isinstance(res, DataFrame) @@ -9424,11 +9487,11 @@ def test_filter(self): # items filtered = self.frame.filter(['A', 'B', 'E']) self.assertEqual(len(filtered.columns), 2) - self.assert_('E' not in filtered) + self.assertNotIn('E', filtered) filtered = self.frame.filter(['A', 'B', 'E'], axis='columns') self.assertEqual(len(filtered.columns), 2) - self.assert_('E' not in filtered) + self.assertNotIn('E', filtered) # other axis idx = self.frame.index[0:4] @@ -9442,7 +9505,7 @@ def test_filter(self): filtered = fcopy.filter(like='A') self.assertEqual(len(filtered.columns), 2) - self.assert_('AA' in filtered) + self.assertIn('AA', filtered) # like with ints in column names df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B']) @@ -9455,7 +9518,7 @@ def test_filter(self): # objects filtered = self.mixed_frame.filter(like='foo') - self.assert_('foo' in filtered) + self.assertIn('foo', filtered) # unicode columns, won't ascii-encode df = self.frame.rename(columns={'B': u('\u2202')}) @@ -9469,7 +9532,7 @@ def test_filter_regex_search(self): # regex filtered = fcopy.filter(regex='[A]+') self.assertEqual(len(filtered.columns), 2) - self.assert_('AA' in filtered) + self.assertIn('AA', filtered) # doesn't have to be at beginning df = DataFrame({'aBBa': [1, 2], @@ -9753,6 +9816,13 @@ def test_frame_column_inplace_sort_exception(self): cp = s.copy() cp.sort() # it works! + def test_stable_descending_sort(self): + df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], + columns=['sort_col', 'order']) + sorted = df.sort_index(by='sort_col', kind='mergesort', + ascending=False) + assert_frame_equal(df, sorted) + def test_combine_first(self): # disjoint head, tail = self.frame[:5], self.frame[5:] @@ -10357,11 +10427,11 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, df = DataFrame({'b': date_range('1/1/2001', periods=2)}) _f = getattr(df, name) result = _f() - self.assert_(isinstance(result, Series)) + self.assertIsInstance(result, Series) df['a'] = lrange(len(df)) result = getattr(df, name)() - self.assert_(isinstance(result, Series)) + self.assertIsInstance(result, Series) self.assert_(len(result)) if has_skipna: @@ -10821,10 +10891,10 @@ def test_reindex_boolean(self): def test_reindex_objects(self): reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B']) - self.assert_('foo' in reindexed) + self.assertIn('foo', reindexed) reindexed = self.mixed_frame.reindex(columns=['A', 'B']) - self.assert_('foo' not in reindexed) + self.assertNotIn('foo', reindexed) def test_reindex_corner(self): index = Index(['a', 'b', 'c']) @@ -10903,8 +10973,8 @@ def test_reindex_multi(self): def test_rename_objects(self): renamed = self.mixed_frame.rename(columns=str.upper) - self.assert_('FOO' in renamed) - self.assert_('foo' not in renamed) + self.assertIn('FOO', renamed) + self.assertNotIn('foo', renamed) def test_fill_corner(self): self.mixed_frame['foo'][5:20] = nan @@ -11428,7 +11498,7 @@ def test_consolidate(self): # Ensure copy, do I want this? recons = consolidated.consolidate() - self.assert_(recons is not consolidated) + self.assertIsNot(recons, consolidated) assert_frame_equal(recons, consolidated) self.frame['F'] = 8. 
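test_stable_descending_sort checks that a stable sort kind keeps the original order of tied rows even when sorting in descending order; with the input below the sorted frame is identical to the input. The same check outside the test harness:

import pandas as pd
from pandas.util.testing import assert_frame_equal

df = pd.DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']],
                  columns=['sort_col', 'order'])

# mergesort is stable, so ties keep their relative order under ascending=False
result = df.sort_index(by='sort_col', kind='mergesort', ascending=False)
assert_frame_equal(result, df)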
@@ -12065,7 +12135,6 @@ def test_isin_dupe_self(self): expected.iloc[1, 1] = True assert_frame_equal(result, expected) - def test_isin_against_series(self): df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, index=['a', 'b', 'c', 'd']) @@ -12191,13 +12260,54 @@ def test_empty_frame_dtypes_ftypes(self): ('c', 'float64:dense')]))) # same but for empty slice of df - assert_series_equal(df[:0].dtypes, pd.Series(odict([('a', np.int), + assert_series_equal(df[:0].dtypes, pd.Series(odict([('a', np.int64), ('b', np.bool), - ('c', np.float)]))) + ('c', np.float64)]))) assert_series_equal(df[:0].ftypes, pd.Series(odict([('a', 'int64:dense'), ('b', 'bool:dense'), ('c', 'float64:dense')]))) + def test_dtypes_are_correct_after_column_slice(self): + # GH6525 + df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + odict = OrderedDict + assert_series_equal(df.dtypes, + pd.Series(odict([('a', np.float_), ('b', np.float_), + ('c', np.float_),]))) + assert_series_equal(df.iloc[:,2:].dtypes, + pd.Series(odict([('c', np.float_)]))) + assert_series_equal(df.dtypes, + pd.Series(odict([('a', np.float_), ('b', np.float_), + ('c', np.float_),]))) + + def test_set_index_names(self): + df = pd.util.testing.makeDataFrame() + df.index.name = 'name' + + self.assertEquals(df.set_index(df.index).index.names, ['name']) + + mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) + mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, + names=['A', 'B', 'A', 'B']) + + df = df.set_index(['A', 'B']) + + self.assertEquals(df.set_index(df.index).index.names, ['A', 'B']) + + # Check that set_index isn't converting a MultiIndex into an Index + self.assertTrue(isinstance(df.set_index(df.index).index, MultiIndex)) + + # Check actual equality + tm.assert_index_equal(df.set_index(df.index).index, mi) + + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather + # than a pair of tuples + self.assertTrue(isinstance(df.set_index([df.index, df.index]).index, MultiIndex)) + + # Check equality + tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2) + + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': try: @@ -12274,6 +12384,7 @@ def check_query_with_unnamed_multiindex(self, parser, engine): df = DataFrame(randn(10, 2), index=index) ind = Series(df.index.get_level_values(0).values, index=index) + #import ipdb; ipdb.set_trace() res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) exp = df[ind == 'red'] @@ -12397,7 +12508,7 @@ def test_query_multiindex_get_index_resolvers(self): def check_query_multiindex_get_index_resolvers(self, parser, engine): df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) - resolvers = df._get_resolvers() + resolvers = df._get_index_resolvers() def to_series(mi, level): level_values = mi.get_level_values(level) @@ -12457,17 +12568,29 @@ def tearDownClass(cls): super(TestDataFrameQueryNumExprPandas, cls).tearDownClass() del cls.engine, cls.parser - def test_date_query_method(self): + def test_date_query_with_attribute_access(self): engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('dates1 < 20130101 < dates3', engine=engine, + res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine, parser=parser) expec = 
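test_set_index_names pins down that set_index(df.index) keeps the index names and that passing a MultiIndex (or a list of indexes) yields a MultiIndex rather than an index of tuples. A condensed sketch of those assertions (tm.makeDataFrame provides a frame with columns A-D):

import pandas as pd
import pandas.util.testing as tm

df = tm.makeDataFrame()
df.index.name = 'name'

# re-using the existing index keeps its name
assert df.set_index(df.index).index.names == ['name']

# with a MultiIndex the result stays a MultiIndex, names included
df2 = df.set_index(['A', 'B'])
assert isinstance(df2.set_index(df2.index).index, pd.MultiIndex)
assert df2.set_index(df2.index).index.names == ['A', 'B']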
df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] assert_frame_equal(res, expec) + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + tm.assert_frame_equal(res, expec) + def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 @@ -12525,7 +12648,7 @@ def test_date_query_with_non_date(self): n = 10 df = DataFrame({'dates': date_range('1/1/2012', periods=n), - 'nondate': np.arange(n)}) + 'nondate': np.arange(n)}) ops = '==', '!=', '<', '>', '<=', '>=' @@ -12533,32 +12656,61 @@ def test_date_query_with_non_date(self): with tm.assertRaises(TypeError): df.query('dates %s nondate' % op, parser=parser, engine=engine) - def test_query_scope(self): + def test_query_syntax_error(self): engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = DataFrame({"i": lrange(10), "+": lrange(3, 13), "r": lrange(4, 14)}) - i, s = 5, 6 - self.assertRaises(NameResolutionError, df.query, 'i < 5', - engine=engine, parser=parser, local_dict={'i': i}) - self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, - parser=parser) - self.assertRaises(NameResolutionError, df.query, 'i == s', - engine=engine, parser=parser, local_dict={'i': i, - 's': s}) - - def test_query_scope_index(self): + with tm.assertRaises(SyntaxError): + df.query('i - +', engine=engine, parser=parser) + + def test_query_scope(self): + from pandas.computation.ops import UndefinedVariableError engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(20, 2), columns=list('ab')) + + a, b = 1, 2 + res = df.query('a > b', engine=engine, parser=parser) + expected = df[df.a > df.b] + tm.assert_frame_equal(res, expected) + + res = df.query('@a > b', engine=engine, parser=parser) + expected = df[a > df.b] + tm.assert_frame_equal(res, expected) + + # no local variable c + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > @c', engine=engine, parser=parser) + + # no column named 'c' + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > c', engine=engine, parser=parser) + + def test_query_doesnt_pickup_local(self): + from pandas.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + from numpy import sin + + # we don't pick up the local 'sin' + with tm.assertRaises(UndefinedVariableError): + df.query('sin > 5', engine=engine, parser=parser) + + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df.index.name = 'sin' - self.assertRaises(NameResolutionError, df.query, 'sin > 5', - engine=engine, parser=parser, local_dict={'sin': - sin}) + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression.+'): + df.query('sin > 5', 
engine=engine, parser=parser) def test_query(self): engine, parser = self.engine, self.parser @@ -12570,16 +12722,6 @@ def test_query(self): parser=parser), df[df.a + df.b > df.b * df.c]) - local_dict = dict(df.iteritems()) - local_dict.update({'df': df}) - self.assertRaises(NameError, df.query, 'a < d & b < f', - local_dict=local_dict, engine=engine, parser=parser) - - # make sure that it's not just because we didn't pass the locals in - self.assertRaises(AssertionError, self.assertRaises, NameError, - df.query, 'a < b', local_dict={'df': df}, - engine=engine, parser=parser) - def test_query_index_with_name(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randint(10, size=(10, 3)), @@ -12612,36 +12754,41 @@ def test_query_index_without_name(self): def test_nested_scope(self): engine = self.engine parser = self.parser - # smoke test - x = 1 - result = pd.eval('x + 1', engine=engine, parser=parser) - self.assertEqual(result, 2) - df = DataFrame(np.random.randn(5, 3)) + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] + expected = df[(df > 0) & (df2 > 0)] - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, + result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', + result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) + def test_nested_raises_on_local_self_reference(self): + from pandas.computation.ops import UndefinedVariableError + + df = DataFrame(np.random.randn(5, 3)) + + # can't reference ourself b/c we're a local so @ is necessary + with tm.assertRaises(UndefinedVariableError): + df.query('df > 0', engine=self.engine, parser=self.parser) + def test_local_syntax(self): skip_if_no_pandas_parser(self.parser) - from pandas.computation.common import NameResolutionError - engine, parser = self.engine, self.parser df = DataFrame(randn(100, 10), columns=list('abcdefghij')) b = 1 @@ -12649,13 +12796,6 @@ def test_local_syntax(self): result = df.query('a < @b', engine=engine, parser=parser) assert_frame_equal(result, expect) - # scope issue with self.assertRaises so just catch it and let it pass - try: - df.query('a < @b', engine=engine, parser=parser) - except NameResolutionError: - pass - - del b expect = df[df.a < df.b] result = df.query('a < b', engine=engine, parser=parser) assert_frame_equal(result, expect) @@ -12671,6 +12811,40 @@ def test_chained_cmp_and_in(self): expec = df[ind] assert_frame_equal(res, expec) + def test_local_variable_with_in(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + a = Series(np.random.randint(3, size=15), name='a') + b = Series(np.random.randint(10, size=15), name='b') + df = DataFrame({'a': a, 'b': b}) + + expected = df.loc[(df.b - 1).isin(a)] + result = df.query('b - 1 in a', engine=engine, parser=parser) + 
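The rewritten scoping tests replace NameResolutionError with UndefinedVariableError and make the '@' prefix the explicit way to pull in local variables: bare names resolve to columns, '@name' resolves in the calling scope, and an unknown name on either side raises. A compact sketch:

import numpy as np
import pandas as pd
from pandas.computation.ops import UndefinedVariableError

df = pd.DataFrame(np.random.randn(20, 2), columns=list('ab'))
a = 1

by_columns = df.query('a > b')   # column a vs column b
by_local = df.query('@a > b')    # local variable a vs column b

try:
    df.query('@a > b > @c')      # there is no local named c
except UndefinedVariableError as err:
    print(err)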
tm.assert_frame_equal(expected, result) + + b = Series(np.random.randint(10, size=15), name='b') + expected = df.loc[(b - 1).isin(a)] + result = df.query('@b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + def test_at_inside_string(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + c = 1 + df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']}) + result = df.query('a == "@c"', engine=engine, parser=parser) + expected = df[df.a == "@c"] + tm.assert_frame_equal(result, expected) + + def test_query_undefined_local(self): + from pandas.computation.ops import UndefinedVariableError + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + df = DataFrame(np.random.rand(10, 2), columns=list('ab')) + with tm.assertRaisesRegexp(UndefinedVariableError, + "local variable 'c' is not defined"): + df.query('a == @c', engine=engine, parser=parser) + class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @@ -12682,17 +12856,16 @@ def setUpClass(cls): tm.skip_if_no_ne(cls.engine) cls.frame = _frame.copy() - def test_date_query_method(self): + def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)', + res = df.query('(dates1 < 20130101) & (20130101 < dates3)', engine=engine, parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] - assert_frame_equal(res, expec) - + tm.assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 @@ -12741,10 +12914,10 @@ def test_date_index_query_with_NaT_duplicates(self): df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT df.set_index('dates1', inplace=True, drop=True) with tm.assertRaises(NotImplementedError): - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) + df.query('index < 20130101 < dates3', engine=engine, parser=parser) def test_nested_scope(self): + from pandas.computation.ops import UndefinedVariableError engine = self.engine parser = self.parser # smoke test @@ -12754,23 +12927,23 @@ def test_nested_scope(self): df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) + # don't have the pandas parser + with tm.assertRaises(SyntaxError): + df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + with tm.assertRaises(UndefinedVariableError): + df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + + expected = df[(df > 0) & (df2 > 0)] result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, parser=parser) - assert_frame_equal(result, expected) + tm.assert_frame_equal(expected, result) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', engine=engine, parser=parser) - expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - assert_frame_equal(result, expected) - - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) + tm.assert_frame_equal(expected, result) class 
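test_at_inside_string and test_query_undefined_local together pin down where '@' is special: inside a string literal it is just a character, outside one it must name an existing local. A sketch of both sides:

import pandas as pd
from pandas.computation.ops import UndefinedVariableError

df = pd.DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']})

# '@' inside quotes is matched literally
assert (df.query('a == "@c"').a == '@c').all()

# '@' outside quotes refers to the calling scope, so an undefined local raises
try:
    df.query('a == @c')
except UndefinedVariableError as err:
    print(err)   # "local variable 'c' is not defined"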
TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): @@ -12782,6 +12955,18 @@ def setUpClass(cls): cls.parser = 'pandas' cls.frame = _frame.copy() + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): @@ -12791,6 +12976,18 @@ def setUpClass(cls): cls.engine = cls.parser = 'python' cls.frame = _frame.copy() + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + PARSERS = 'python', 'pandas' ENGINES = 'python', 'numexpr' @@ -12859,8 +13056,8 @@ def check_str_list_query_method(self, parser, engine): for lhs, op, rhs in zip(lhs, ops, rhs): ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) - assertRaises(NotImplementedError, df.query, ex, engine=engine, - parser=parser, local_dict={'strings': df.strings}) + with tm.assertRaises(NotImplementedError): + df.query(ex, engine=engine, parser=parser) else: res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index d694efff9b351..6c6e70b86105f 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -227,7 +227,7 @@ def check_metadata(self, x, y=None): for m in x._metadata: v = getattr(x,m,None) if y is None: - self.assert_(v is None) + self.assertIsNone(v) else: self.assertEqual(v, getattr(y,m,None)) @@ -393,10 +393,10 @@ def test_nonzero_single_element(self): # allow single item via bool method s = Series([True]) - self.assert_(s.bool() is True) + self.assertTrue(s.bool()) s = Series([False]) - self.assert_(s.bool() is False) + self.assertFalse(s.bool()) # single item nan to raise for s in [ Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False]) ]: @@ -459,7 +459,10 @@ def test_interpolate(self): self.assert_numpy_array_equal(time_interp, ord_ts) # try time interpolation on a non-TimeSeries - self.assertRaises(ValueError, self.series.interpolate, method='time') + # Only raises ValueError if there are NaNs. 
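The bool() assertions switch from identity checks to truth checks, since the unwrapped scalar need not be the Python singleton True/False. What bool() does on single-element objects, per the tests above:

import pandas as pd

# exactly one boolean element is reduced to its scalar value ...
assert pd.Series([True]).bool()
assert not pd.DataFrame([[False]]).bool()

# ... anything else (NaN, or more than one element) raises ValueError
try:
    pd.DataFrame([[False, False]]).bool()
except ValueError:
    pass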
+ non_ts = self.series.copy() + non_ts[0] = np.NaN + self.assertRaises(ValueError, non_ts.interpolate, method='time') def test_interp_regression(self): _skip_if_no_scipy() @@ -512,7 +515,7 @@ def test_interpolate_non_ts(self): def test_nan_interpolate(self): s = Series([0, 1, np.nan, 3]) result = s.interpolate() - expected = Series([0, 1, 2, 3]) + expected = Series([0., 1., 2., 3.]) assert_series_equal(result, expected) _skip_if_no_scipy() @@ -522,20 +525,20 @@ def test_nan_interpolate(self): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9]) + expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9]) assert_series_equal(result, expected) def test_nan_str_index(self): s = Series([0, 1, 2, np.nan], index=list('abcd')) result = s.interpolate() - expected = Series([0, 1, 2, 2], index=list('abcd')) + expected = Series([0., 1., 2., 2.], index=list('abcd')) assert_series_equal(result, expected) def test_interp_quad(self): _skip_if_no_scipy() sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) result = sq.interpolate(method='quadratic') - expected = Series([1, 4, 9, 16], index=[1, 2, 3, 4]) + expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) assert_series_equal(result, expected) def test_interp_scipy_basic(self): @@ -545,18 +548,30 @@ def test_interp_scipy_basic(self): expected = Series([1., 3., 7.5, 12., 18.5, 25.]) result = s.interpolate(method='slinear') assert_series_equal(result, expected) + + result = s.interpolate(method='slinear', donwcast='infer') + assert_series_equal(result, expected) # nearest expected = Series([1, 3, 3, 12, 12, 25]) result = s.interpolate(method='nearest') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='nearest', downcast='infer') assert_series_equal(result, expected) # zero expected = Series([1, 3, 3, 12, 12, 25]) result = s.interpolate(method='zero') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='zero', downcast='infer') assert_series_equal(result, expected) # quadratic expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) result = s.interpolate(method='quadratic') assert_series_equal(result, expected) + + result = s.interpolate(method='quadratic', downcast='infer') + assert_series_equal(result, expected) # cubic expected = Series([1., 3., 6.8, 12., 18.2, 25.]) result = s.interpolate(method='cubic') @@ -585,7 +600,6 @@ def test_interp_multiIndex(self): expected = s.copy() expected.loc[2] = 2 - expected = expected.astype(np.int64) result = s.interpolate() assert_series_equal(result, expected) @@ -595,7 +609,7 @@ def test_interp_multiIndex(self): def test_interp_nonmono_raise(self): _skip_if_no_scipy() - s = pd.Series([1, 2, 3], index=[0, 2, 1]) + s = Series([1, np.nan, 3], index=[0, 2, 1]) with tm.assertRaises(ValueError): s.interpolate(method='krogh') @@ -603,7 +617,7 @@ def test_interp_datetime64(self): _skip_if_no_scipy() df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3)) result = df.interpolate(method='nearest') - expected = Series([1, 1, 3], index=date_range('1/1/2000', periods=3)) + expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3)) assert_series_equal(result, expected) class TestDataFrame(tm.TestCase, Generic): @@ -619,10 +633,10 @@ def test_nonzero_single_element(self): # allow single item via bool method df = DataFrame([[True]]) - self.assert_(df.bool() is True) + self.assertTrue(df.bool()) df = 
DataFrame([[False]]) - self.assert_(df.bool() is False) + self.assertFalse(df.bool()) df = DataFrame([[False, False]]) self.assertRaises(ValueError, lambda : df.bool()) @@ -639,7 +653,7 @@ def test_get_numeric_data_preserve_dtype(self): def test_interp_basic(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], 'C': [1, 2, 3, 5], 'D': list('abcd')}) - expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9], + expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.], 'C': [1, 2, 3, 5], 'D': list('abcd')}) result = df.interpolate() assert_frame_equal(result, expected) @@ -648,8 +662,6 @@ def test_interp_basic(self): expected = df.set_index('C') expected.A.loc[3] = 3 expected.B.loc[5] = 9 - expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64) - assert_frame_equal(result, expected) def test_interp_bad_method(self): @@ -663,9 +675,14 @@ def test_interp_combo(self): 'C': [1, 2, 3, 5], 'D': list('abcd')}) result = df['A'].interpolate() + expected = Series([1., 2., 3., 4.]) + assert_series_equal(result, expected) + + result = df['A'].interpolate(downcast='infer') expected = Series([1, 2, 3, 4]) assert_series_equal(result, expected) + def test_interp_nan_idx(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) df = df.set_index('A') @@ -722,13 +739,16 @@ def test_interp_alt_scipy(self): expected = df.copy() expected['A'].iloc[2] = 3 expected['A'].iloc[5] = 6 + assert_frame_equal(result, expected) + + result = df.interpolate(method='barycentric', downcast='infer') assert_frame_equal(result, expected.astype(np.int64)) result = df.interpolate(method='krogh') expectedk = df.copy() - expectedk['A'].iloc[2] = 3 - expectedk['A'].iloc[5] = 6 - expectedk['A'] = expected['A'].astype(np.int64) + # expectedk['A'].iloc[2] = 3 + # expectedk['A'].iloc[5] = 6 + expectedk['A'] = expected['A'] assert_frame_equal(result, expectedk) _skip_if_no_pchip() @@ -786,9 +806,32 @@ def test_interp_raise_on_only_mixed(self): def test_interp_inplace(self): df = DataFrame({'a': [1., 2., np.nan, 4.]}) - expected = DataFrame({'a': [1, 2, 3, 4]}) - df['a'].interpolate(inplace=True) - assert_frame_equal(df, expected) + expected = DataFrame({'a': [1., 2., 3., 4.]}) + result = df.copy() + result['a'].interpolate(inplace=True) + assert_frame_equal(result, expected) + + result = df.copy() + result['a'].interpolate(inplace=True, downcast='infer') + assert_frame_equal(result, expected.astype('int64')) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame({'A': [1, 2, np.nan, 4], + 'B': [1, 2, 3, 4], + 'C': [1., 2., np.nan, 4.], + 'D': [1., 2., 3., 4.]}) + expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float64'), + 'B': np.array([1, 2, 3, 4], dtype='int64'), + 'C': np.array([1., 2., 3, 4.], dtype='float64'), + 'D': np.array([1., 2., 3., 4.], dtype='float64')}) + + result = df.interpolate(downcast=None) + assert_frame_equal(result, expected) + + # all good + result = df[['B', 'D']].interpolate(downcast=None) + assert_frame_equal(result, df[['B', 'D']]) def test_no_order(self): _skip_if_no_scipy() @@ -802,7 +845,7 @@ def test_spline(self): _skip_if_no_scipy() s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) result = s.interpolate(method='spline', order=1) - expected = Series([1, 2, 3, 4, 5, 6, 7]) + expected = Series([1., 2., 3., 4., 5., 6., 7.]) assert_series_equal(result, expected) def test_metadata_propagation_indiv(self): diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 041920e1de6ea..30ba5cd5a70fe 100644 --- 
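The reworked interpolation expectations all follow one rule: interpolate now returns floats by default, and downcast='infer' is the opt-in for casting back to integers when the filled values allow it. A short sketch of the Series case:

import numpy as np
import pandas as pd

s = pd.Series([0, 1, np.nan, 3])

# default: the result is float64 even though the filled values are whole numbers
filled = s.interpolate()                       # 0.0, 1.0, 2.0, 3.0

# downcast='infer' downcasts losslessly, giving back integers here
filled_int = s.interpolate(downcast='infer')   # 0, 1, 2, 3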
a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -469,13 +469,13 @@ def test_xcompat(self): df = tm.makeTimeDataFrame() ax = df.plot(x_compat=True) lines = ax.get_lines() - self.assert_(not isinstance(lines[0].get_xdata(), PeriodIndex)) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) tm.close() pd.plot_params['xaxis.compat'] = True ax = df.plot() lines = ax.get_lines() - self.assert_(not isinstance(lines[0].get_xdata(), PeriodIndex)) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) tm.close() pd.plot_params['x_compat'] = False @@ -488,7 +488,7 @@ def test_xcompat(self): with pd.plot_params.use('x_compat', True): ax = df.plot() lines = ax.get_lines() - self.assert_(not isinstance(lines[0].get_xdata(), PeriodIndex)) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) tm.close() ax = df.plot() @@ -524,7 +524,7 @@ def test_subplots(self): axes = df.plot(subplots=True, sharex=True, legend=True) for ax in axes: - self.assert_(ax.get_legend() is not None) + self.assertIsNotNone(ax.get_legend()) axes = df.plot(subplots=True, sharex=True) for ax in axes[:-2]: @@ -649,7 +649,7 @@ def test_kde(self): _check_plot_works(df.plot, kind='kde') _check_plot_works(df.plot, kind='kde', subplots=True) ax = df.plot(kind='kde') - self.assert_(ax.get_legend() is not None) + self.assertIsNotNone(ax.get_legend()) axes = df.plot(kind='kde', logy=True, subplots=True) for ax in axes: self.assertEqual(ax.get_yscale(), 'log') @@ -964,7 +964,7 @@ def test_hexbin_basic(self): ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10) # TODO: need better way to test. This just does existence. - self.assert_(len(ax.collections) == 1) + self.assertEqual(len(ax.collections), 1) @slow def test_hexbin_with_c(self): @@ -973,11 +973,11 @@ def test_hexbin_with_c(self): "C": np.arange(20) + np.random.uniform(size=20)}) ax = df.plot(kind='hexbin', x='A', y='B', C='C') - self.assert_(len(ax.collections) == 1) + self.assertEqual(len(ax.collections), 1) ax = df.plot(kind='hexbin', x='A', y='B', C='C', reduce_C_function=np.std) - self.assert_(len(ax.collections) == 1) + self.assertEqual(len(ax.collections), 1) @slow def test_hexbin_cmap(self): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 2c8b60ea25a6e..adca8389b8939 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -156,8 +156,7 @@ def test_first_last_nth(self): assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = self.df.ix[[3, 2], ['B', 'C', 'D']] - expected.index = ['bar', 'foo'] + expected = self.df.iloc[[2, 3]] assert_frame_equal(nth, expected, check_names=False) # it works! 
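Aside, illustrative only and not part of the patch: a minimal sketch of the nth semantics that the updated expectation above (and the new test_nth below) encode, assuming a pandas build with this change applied.

import numpy as np
import pandas as pd

df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.nth(0)                # rows taken by original position (df.iloc[[0, 2]]),
                        # no longer re-indexed by the group keys
g.nth(0, dropna='any')  # first non-null row per group, indexed by 'A'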
@@ -165,10 +164,10 @@ def test_first_last_nth(self): grouped['B'].last() grouped['B'].nth(0) - self.df['B'][self.df['A'] == 'foo'] = np.nan + self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan self.assert_(com.isnull(grouped['B'].first()['foo'])) self.assert_(com.isnull(grouped['B'].last()['foo'])) - self.assert_(com.isnull(grouped['B'].nth(0)['foo'])) + self.assert_(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing def test_first_last_nth_dtypes(self): @@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self): assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = ['bar', 'foo'] + expected = df.iloc[[2, 3]] assert_frame_equal(nth, expected, check_names=False) # GH 2763, first/last shifting dtypes @@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self): f = s.groupby(level=0).first() self.assertEqual(f.dtype, 'int64') + def test_nth(self): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]]) + assert_frame_equal(g.nth(1), df.iloc[[1]]) + assert_frame_equal(g.nth(2), df.loc[[]]) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]]) + assert_frame_equal(g.nth(-2), df.iloc[[0]]) + assert_frame_equal(g.nth(-3), df.loc[[]]) + assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']]) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index @@ -546,14 +567,14 @@ def test_len(self): def test_groups(self): grouped = self.df.groupby(['A']) groups = grouped.groups - self.assert_(groups is grouped.groups) # caching works + self.assertIs(groups, grouped.groups) # caching works for k, v in compat.iteritems(grouped.groups): self.assert_((self.df.ix[v]['A'] == k).all()) grouped = self.df.groupby(['A', 'B']) groups = grouped.groups - self.assert_(groups is grouped.groups) # caching works + self.assertIs(groups, grouped.groups) # caching works for k, v in compat.iteritems(grouped.groups): self.assert_((self.df.ix[v]['A'] == k[0]).all()) self.assert_((self.df.ix[v]['B'] == k[1]).all()) @@ -961,6 +982,7 @@ def test_frame_groupby(self): assert_frame_equal(stragged, aggregated, check_names=False) # transform + grouped = self.tsframe.head(30).groupby(lambda x: x.weekday()) transformed = grouped.transform(lambda x: x - x.mean()) self.assertEqual(len(transformed), 30) self.assertEqual(len(transformed.columns), 4) @@ -1314,12 +1336,10 @@ def test_groupby_as_index_apply(self): g_not_as = df.groupby('user_id', as_index=False) res_as = g_as.head(2).index - exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)]) - assert_index_equal(res_as, exp_as) - res_not_as = g_not_as.head(2).index - exp_not_as = Index([0, 1, 2, 4]) - assert_index_equal(res_not_as, exp_not_as) + exp = Index([0, 1, 2, 4]) + assert_index_equal(res_as, exp) + assert_index_equal(res_not_as, exp) res_as_apply = g_as.apply(lambda x: x.head(2)).index res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index @@ -1354,11 +1374,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df, g_not_as.head(7)) # 
contains all assert_frame_equal(df, g_not_as.tail(7)) - # as_index=True, yuck - # prepend the A column as an index, in a roundabout way - df_as = df.copy() - df_as.index = df.set_index('A', append=True, - drop=False).index.swaplevel(0, 1) + # as_index=True, (used to be different) + df_as = df assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) @@ -1372,6 +1389,18 @@ def test_groupby_head_tail(self): assert_frame_equal(df_as, g_as.head(7)) # contains all assert_frame_equal(df_as, g_as.tail(7)) + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, @@ -2047,14 +2076,49 @@ def test_groupby_series_with_name(self): result = self.df.groupby(self.df['A']).mean() result2 = self.df.groupby(self.df['A'], as_index=False).mean() self.assertEquals(result.index.name, 'A') - self.assert_('A' in result2) + self.assertIn('A', result2) result = self.df.groupby([self.df['A'], self.df['B']]).mean() result2 = self.df.groupby([self.df['A'], self.df['B']], as_index=False).mean() self.assertEquals(result.index.names, ('A', 'B')) - self.assert_('A' in result2) - self.assert_('B' in result2) + self.assertIn('A', result2) + self.assertIn('B', result2) + + def test_seriesgroupby_name_attr(self): + # GH 6265 + result = self.df.groupby('A')['C'] + self.assertEquals(result.count().name, 'C') + self.assertEquals(result.mean().name, 'C') + + testFunc = lambda x: np.sum(x)*2 + self.assertEquals(result.agg(testFunc).name, 'C') + + def test_groupby_name_propagation(self): + # GH 6124 + def summarize(df, name=None): + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. 
+ return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=df.iloc[0]['A']) + + metrics = self.df.groupby('A').apply(summarize) + self.assertEqual(metrics.columns.name, None) + metrics = self.df.groupby('A').apply(summarize, 'metrics') + self.assertEqual(metrics.columns.name, 'metrics') + metrics = self.df.groupby('A').apply(summarize_random_name) + self.assertEqual(metrics.columns.name, None) def test_groupby_nonstring_columns(self): df = DataFrame([np.arange(10) for x in range(10)]) @@ -2203,7 +2267,7 @@ def test_panel_groupby(self): grouped = self.panel.groupby(lambda x: x.month, axis='major') agged = grouped.mean() - self.assert_numpy_array_equal(agged.major_axis, [1, 2]) + self.assert_numpy_array_equal(agged.major_axis, sorted(list(set(self.panel.major_axis.month)))) grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis='minor') @@ -2397,7 +2461,7 @@ def test_no_nonsense_name(self): s.name = None result = s.groupby(self.frame['A']).agg(np.sum) - self.assert_(result.name is None) + self.assertIsNone(result.name) def test_wrap_agg_out(self): grouped = self.three_group.groupby(['A', 'B']) @@ -2575,7 +2639,7 @@ def test_no_dummy_key_names(self): # GH #1291 result = self.df.groupby(self.df['A'].values).sum() - self.assert_(result.index.name is None) + self.assertIsNone(result.index.name) result = self.df.groupby([self.df['A'].values, self.df['B'].values]).sum() @@ -2731,6 +2795,15 @@ def test_groupby_max_datetime64(self): result = df.groupby('A')['A'].max() assert_series_equal(result,expected) + def test_groupby_datetime64_32_bit(self): + # GH 6410 / numpy 4328 + # 32-bit under 1.9-dev indexing issue + + df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')]*2}) + result = df.groupby("A")["B"].transform(min) + expected = Series([pd.Timestamp('2000-01-1')]*2) + assert_series_equal(result,expected) + def test_groupby_categorical_unequal_len(self): import pandas as pd #GH3011 @@ -3365,6 +3438,13 @@ def test_filter_and_transform_with_non_unique_string_index(self): actual = grouped_df.pid.transform(len) assert_series_equal(actual, expected) + def test_filter_has_access_to_grouped_cols(self): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? + filt = g.filter(lambda x: x['A'].sum() == 2) + assert_frame_equal(filt, df.iloc[[0, 1]]) + def test_index_label_overlaps_location(self): # checking we don't have any label/location confusion in the # the wake of GH5375 @@ -3393,6 +3473,48 @@ def test_index_label_overlaps_location(self): expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) + def test_groupby_selection_with_methods(self): + # some methods which require DatetimeIndex + rng = pd.date_range('2014', periods=len(self.df)) + self.df.index = rng + + g = self.df.groupby(['A'])[['C']] + g_exp = self.df[['C']].groupby(self.df['A']) + # TODO check groupby with > 1 col ? + + # methods which are called as .foo() + methods = ['count', + 'corr', + 'cummax', 'cummin', 'cumprod', + 'describe', 'rank', + 'quantile', + 'diff', 'shift', + 'all', 'any', + 'idxmin', 'idxmax', + 'ffill', 'bfill', + 'pct_change', + 'tshift', + #'ohlc' + ] + + for m in methods: + res = getattr(g, m)() + exp = getattr(g_exp, m)() + assert_frame_equal(res, exp) # should always be frames! 
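Aside, illustrative only and not part of the patch: the loop above compares a column-selected groupby, g = df.groupby(['A'])[['C']], against g_exp = df[['C']].groupby(df['A']); a standalone sketch of that check, assuming the same frame-in, frame-out behaviour.

import pandas as pd

df = pd.DataFrame({'A': ['x', 'x', 'y'], 'C': [1, 2, 3]})
g = df.groupby(['A'])[['C']]
g_exp = df[['C']].groupby(df['A'])
assert isinstance(g.count(), pd.DataFrame)   # selection keeps results frame-shaped
assert g.count().equals(g_exp.count())       # both spellings agree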
+ + # methods which aren't just .foo() + assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + assert_frame_equal(g.dtypes, g_exp.dtypes) + assert_frame_equal(g.apply(lambda x: x.sum()), + g_exp.apply(lambda x: x.sum())) + + assert_frame_equal(g.resample('D'), g_exp.resample('D')) + assert_frame_equal(g.resample('D', how='ohlc'), + g_exp.resample('D', how='ohlc')) + + assert_frame_equal(g.filter(lambda x: len(x) == 3), + g_exp.filter(lambda x: len(x) == 3)) + def test_groupby_whitelist(self): from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f938066011e06..c6c405306afb8 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -71,7 +71,7 @@ def test_set_name_methods(self): self.assertEqual(ind.name, original_name) res = ind.rename(new_name, inplace=True) # should return None - self.assert_(res is None) + self.assertIsNone(res) self.assertEqual(ind.name, new_name) self.assertEqual(ind.names, [new_name]) with assertRaisesRegexp(TypeError, "list-like"): @@ -100,7 +100,7 @@ def test_copy_and_deepcopy(self): for func in (copy, deepcopy): idx_copy = func(self.strIndex) - self.assert_(idx_copy is not self.strIndex) + self.assertIsNot(idx_copy, self.strIndex) self.assert_(idx_copy.equals(self.strIndex)) new_copy = self.strIndex.copy(deep=True, name="banana") @@ -255,7 +255,7 @@ def test_is_(self): def test_asof(self): d = self.dateIndex[0] - self.assert_(self.dateIndex.asof(d) is d) + self.assertIs(self.dateIndex.asof(d), d) self.assert_(np.isnan(self.dateIndex.asof(d - timedelta(1)))) d = self.dateIndex[-1] @@ -295,7 +295,7 @@ def _check(op): index_result = op(index, element) tm.assert_isinstance(index_result, np.ndarray) - self.assert_(not isinstance(index_result, Index)) + self.assertNotIsInstance(index_result, Index) self.assert_numpy_array_equal(arr_result, index_result) _check(operator.eq) @@ -323,6 +323,25 @@ def test_fancy(self): for i in sl: self.assertEqual(i, sl[sl.get_loc(i)]) + def test_empty_fancy(self): + empty_farr = np.array([], dtype=np.float_) + empty_iarr = np.array([], dtype=np.int_) + empty_barr = np.array([], dtype=np.bool_) + + # pd.DatetimeIndex is excluded, because it overrides getitem and should + # be tested separately. + for idx in [self.strIndex, self.intIndex, self.floatIndex]: + empty_idx = idx.__class__([]) + values = idx.values + + self.assert_(idx[[]].identical(empty_idx)) + self.assert_(idx[empty_iarr].identical(empty_idx)) + self.assert_(idx[empty_barr].identical(empty_idx)) + + # np.ndarray only accepts ndarray of int & bool dtypes, so should + # Index. 
+ self.assertRaises(IndexError, idx.__getitem__, empty_farr) + def test_getitem(self): arr = np.array(self.dateIndex) exp = self.dateIndex[5] @@ -332,7 +351,7 @@ def test_getitem(self): def test_shift(self): shifted = self.dateIndex.shift(0, timedelta(1)) - self.assert_(shifted is self.dateIndex) + self.assertIs(shifted, self.dateIndex) shifted = self.dateIndex.shift(5, timedelta(1)) self.assert_numpy_array_equal(shifted, self.dateIndex + timedelta(5)) @@ -352,7 +371,7 @@ def test_intersection(self): # Corner cases inter = first.intersection(first) - self.assert_(inter is first) + self.assertIs(inter, first) # non-iterable input assertRaisesRegexp(TypeError, "iterable", first.intersection, 0.5) @@ -366,13 +385,13 @@ def test_union(self): # Corner cases union = first.union(first) - self.assert_(union is first) + self.assertIs(union, first) union = first.union([]) - self.assert_(union is first) + self.assertIs(union, first) union = Index([]).union(first) - self.assert_(union is first) + self.assertIs(union, first) # non-iterable input assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) @@ -385,7 +404,7 @@ def test_union(self): second.name = 'B' union = first.union(second) - self.assert_(union.name is None) + self.assertIsNone(union.name) def test_add(self): firstCat = self.strIndex + self.dateIndex @@ -424,23 +443,23 @@ def test_append_empty_preserve_name(self): right = Index([1, 2, 3], name='bar') result = left.append(right) - self.assert_(result.name is None) + self.assertIsNone(result.name) def test_add_string(self): # from bug report index = Index(['a', 'b', 'c']) index2 = index + 'foo' - self.assert_('a' not in index2) - self.assert_('afoo' in index2) + self.assertNotIn('a', index2) + self.assertIn('afoo', index2) def test_iadd_string(self): index = pd.Index(['a', 'b', 'c']) # doesn't fail test unless there is a check before `+=` - self.assert_('a' in index) + self.assertIn('a', index) index += '_x' - self.assert_('a_x' in index) + self.assertIn('a_x', index) def test_diff(self): first = self.strIndex[5:20] @@ -471,6 +490,55 @@ def test_diff(self): # non-iterable input assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5) + def test_symmetric_diff(self): + # smoke + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = Index([2, 3, 4, 5]) + result = idx1.sym_diff(idx2) + expected = Index([1, 5]) + self.assert_(tm.equalContents(result, expected)) + self.assertIsNone(result.name) + + # __xor__ syntax + expected = idx1 ^ idx2 + self.assert_(tm.equalContents(result, expected)) + self.assertIsNone(result.name) + + # multiIndex + idx1 = MultiIndex.from_tuples(self.tuples) + idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) + result = idx1.sym_diff(idx2) + expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) + self.assert_(tm.equalContents(result, expected)) + + # nans: + # GH #6444, sorting of nans. Make sure the number of nans is right + # and the correct non-nan values are there. punt on sorting. 
+ idx1 = Index([1, 2, 3, np.nan]) + idx2 = Index([0, 1, np.nan]) + result = idx1.sym_diff(idx2) + # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + nans = pd.isnull(result) + self.assertEqual(nans.sum(), 2) + self.assertEqual((~nans).sum(), 3) + [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] + + # other not an Index: + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = np.array([2, 3, 4, 5]) + expected = Index([1, 5]) + result = idx1.sym_diff(idx2) + self.assert_(tm.equalContents(result, expected)) + self.assertEquals(result.name, 'idx1') + + result = idx1.sym_diff(idx2, result_name='new_name') + self.assert_(tm.equalContents(result, expected)) + self.assertEquals(result.name, 'new_name') + + # other isn't iterable + with tm.assertRaises(TypeError): + idx1 - 1 + def test_pickle(self): def testit(index): pickled = pickle.dumps(index) @@ -505,8 +573,8 @@ def test_summary(self): ind = Index(['{other}%s', "~:{range}:0"], name='A') result = ind.summary() # shouldn't be formatted accidentally. - self.assert_('~:{range}:0' in result) - self.assert_('{other}%s' in result) + self.assertIn('~:{range}:0', result) + self.assertIn('{other}%s', result) def test_format(self): self._check_method_works(Index.format) @@ -551,7 +619,7 @@ def test_format_none(self): idx = Index(values) idx.format() - self.assert_(idx[3] is None) + self.assertIsNone(idx[3]) def test_take(self): indexer = [4, 3, 0, 2] @@ -694,7 +762,7 @@ def test_boolean_cmp(self): self.assert_(res.all()) self.assertEqual(res.dtype, 'bool') - self.assert_(not isinstance(res, Index)) + self.assertNotIsInstance(res, Index) def test_get_level_values(self): result = self.strIndex.get_level_values(0) @@ -711,7 +779,15 @@ def test_join_self(self): for kind in kinds: res = getattr(self, '{0}Index'.format(index_kind)) joined = res.join(res, how=kind) - self.assert_(res is joined) + self.assertIs(res, joined) + + def test_indexing_doesnt_change_class(self): + idx = Index([1, 2, 3, 'a', 'b', 'c']) + + self.assert_(idx[1:3].identical( + pd.Index([2, 3], dtype=np.object_))) + self.assert_(idx[[0,1]].identical( + pd.Index([1, 2], dtype=np.object_))) class TestFloat64Index(tm.TestCase): @@ -732,12 +808,13 @@ def test_repr_roundtrip(self): tm.assert_index_equal(eval(repr(ind)), ind) def check_is_index(self, i): - self.assert_(isinstance(i, Index) and not isinstance(i, Float64Index)) + self.assertIsInstance(i, Index) + self.assertNotIsInstance(i, Float64Index) def check_coerce(self, a, b, is_float_index=True): self.assert_(a.equals(b)) if is_float_index: - self.assert_(isinstance(b, Float64Index)) + self.assertIsInstance(b, Float64Index) else: self.check_is_index(b) @@ -745,22 +822,22 @@ def test_constructor(self): # explicit construction index = Float64Index([1,2,3,4,5]) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assert_((index.values == np.array([1,2,3,4,5],dtype='float64')).all()) index = Float64Index(np.array([1,2,3,4,5])) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) index = Float64Index([1.,2,3,4,5]) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) index = Float64Index(np.array([1.,2,3,4,5])) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, object) index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, 
object) index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, object) # nan handling @@ -806,6 +883,21 @@ def test_astype(self): self.assert_(i.equals(result)) self.check_is_index(result) + def test_equals(self): + + i = Float64Index([1.0,2.0]) + self.assertTrue(i.equals(i)) + self.assertTrue(i.identical(i)) + + i2 = Float64Index([1.0,2.0]) + self.assertTrue(i.equals(i2)) + + i = Float64Index([1.0,np.nan]) + self.assertTrue(i.equals(i)) + self.assertTrue(i.identical(i)) + + i2 = Float64Index([1.0,np.nan]) + self.assertTrue(i.equals(i2)) class TestInt64Index(tm.TestCase): _multiprocess_can_split_ = True @@ -1010,7 +1102,7 @@ def test_join_left(self): tm.assert_isinstance(res, Int64Index) self.assert_(res.equals(eres)) - self.assert_(lidx is None) + self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) # monotonic @@ -1020,7 +1112,7 @@ def test_join_left(self): dtype=np.int64) tm.assert_isinstance(res, Int64Index) self.assert_(res.equals(eres)) - self.assert_(lidx is None) + self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) # non-unique @@ -1050,7 +1142,7 @@ def test_join_right(self): tm.assert_isinstance(other, Int64Index) self.assert_(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) - self.assert_(ridx is None) + self.assertIsNone(ridx) # monotonic res, lidx, ridx = self.index.join(other_mono, how='right', @@ -1061,7 +1153,7 @@ def test_join_right(self): tm.assert_isinstance(other, Int64Index) self.assert_(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) - self.assert_(ridx is None) + self.assertIsNone(ridx) # non-unique """ @@ -1128,7 +1220,7 @@ def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' for kind in kinds: joined = self.index.join(self.index, how=kind) - self.assert_(self.index is joined) + self.assertIs(self.index, joined) def test_intersection(self): other = Index([1, 2, 3, 4, 5]) @@ -1257,7 +1349,7 @@ def test_set_names_and_rename(self): ind.set_names(new_names + new_names) new_names2 = [name + "SUFFIX2" for name in new_names] res = ind.set_names(new_names2, inplace=True) - self.assert_(res is None) + self.assertIsNone(res) self.assertEqual(ind.names, new_names2) def test_set_levels_and_set_labels(self): @@ -1287,7 +1379,7 @@ def assert_matching(actual, expected): # level changing [w/ mutation] ind2 = self.index.copy() inplace_return = ind2.set_levels(new_levels, inplace=True) - self.assert_(inplace_return is None) + self.assertIsNone(inplace_return) assert_matching(ind2.levels, new_levels) # label changing [w/o mutation] @@ -1298,7 +1390,7 @@ def assert_matching(actual, expected): # label changing [w/ mutation] ind2 = self.index.copy() inplace_return = ind2.set_labels(new_labels, inplace=True) - self.assert_(inplace_return is None) + self.assertIsNone(inplace_return) assert_matching(ind2.labels, new_labels) def test_set_levels_labels_names_bad_input(self): @@ -1404,10 +1496,10 @@ def test_set_value_keeps_names(self): columns=['one', 'two', 'three', 'four'], index=idx) df = df.sortlevel() - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) self.assertEqual(df.index.names, ('Name', 'Number')) df = df.set_value(('grethe', '4'), 'one', 99.34) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) self.assertEqual(df.index.names, ('Name', 'Number')) def test_names(self): @@ -1457,12 +1549,12 @@ def test_constructor_single_level(self): labels=[[0, 1, 2, 3]], 
names=['first']) tm.assert_isinstance(single_level, Index) - self.assert_(not isinstance(single_level, MultiIndex)) + self.assertNotIsInstance(single_level, MultiIndex) self.assertEqual(single_level.name, 'first') single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]]) - self.assert_(single_level.name is None) + self.assertIsNone(single_level.name) def test_constructor_no_levels(self): assertRaisesRegexp(ValueError, "non-zero number of levels/labels", @@ -1515,11 +1607,11 @@ def assert_multiindex_copied(self, copy, original): # labels doesn't matter which way copied assert_almost_equal(copy.labels, original.labels) - self.assert_(copy.labels is not original.labels) + self.assertIsNot(copy.labels, original.labels) # names doesn't matter which way copied self.assertEqual(copy.names, original.names) - self.assert_(copy.names is not original.names) + self.assertIsNot(copy.names, original.names) # sort order should be copied self.assertEqual(copy.sortorder, original.sortorder) @@ -1727,9 +1819,9 @@ def test_from_tuples_index_values(self): self.assert_((result.values == self.index.values).all()) def test_contains(self): - self.assert_(('foo', 'two') in self.index) - self.assert_(('bar', 'two') not in self.index) - self.assert_(None not in self.index) + self.assertIn(('foo', 'two'), self.index) + self.assertNotIn(('bar', 'two'), self.index) + self.assertNotIn(None, self.index) def test_is_all_dates(self): self.assert_(not self.index.is_all_dates) @@ -1804,7 +1896,7 @@ def test_get_loc_level(self): loc, new_index = index.get_loc_level((0, 1, 0)) expected = 1 self.assertEqual(loc, expected) - self.assert_(new_index is None) + self.assertIsNone(new_index) self.assertRaises(KeyError, index.get_loc_level, (2, 2)) @@ -1938,12 +2030,12 @@ def test_truncate(self): labels=[major_labels, minor_labels]) result = index.truncate(before=1) - self.assert_('foo' not in result.levels[0]) - self.assert_(1 in result.levels[0]) + self.assertNotIn('foo', result.levels[0]) + self.assertIn(1, result.levels[0]) result = index.truncate(after=1) - self.assert_(2 not in result.levels[0]) - self.assert_(1 in result.levels[0]) + self.assertNotIn(2, result.levels[0]) + self.assertIn(1, result.levels[0]) result = index.truncate(before=1, after=2) self.assertEqual(len(result.levels[0]), 2) @@ -2157,10 +2249,10 @@ def test_union(self): # corner case, pass self or empty thing: the_union = self.index.union(self.index) - self.assert_(the_union is self.index) + self.assertIs(the_union, self.index) the_union = self.index.union(self.index[:0]) - self.assert_(the_union is self.index) + self.assertIs(the_union, self.index) # won't work in python 3 # tuples = self.index._tuple_index @@ -2172,8 +2264,8 @@ def test_union(self): # other = Index(['A', 'B', 'C']) # result = other.union(self.index) - # self.assert_(('foo', 'one') in result) - # self.assert_('B' in result) + # self.assertIn(('foo', 'one'), result) + # self.assertIn('B', result) # result2 = self.index.union(other) # self.assert_(result.equals(result2)) @@ -2189,7 +2281,7 @@ def test_intersection(self): # corner case, pass self the_int = self.index.intersection(self.index) - self.assert_(the_int is self.index) + self.assertIs(the_int, self.index) # empty intersection: disjoint empty = self.index[:2] & self.index[2:] @@ -2444,7 +2536,7 @@ def test_join_self(self): for kind in kinds: res = self.index joined = res.join(res, how=kind) - self.assert_(res is joined) + self.assertIs(res, joined) def test_reindex(self): result, indexer = 
self.index.reindex(list(self.index[:4])) @@ -2453,7 +2545,7 @@ def test_reindex(self): result, indexer = self.index.reindex(list(self.index)) tm.assert_isinstance(result, MultiIndex) - self.assert_(indexer is None) + self.assertIsNone(indexer) self.check_level_names(result, self.index.names) def test_reindex_level(self): diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 8ffa5f8b1bba0..3f6ae24756d47 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -334,6 +334,33 @@ def test_at_timestamp(self): def test_iat_invalid_args(self): pass + def test_imethods_with_dups(self): + + # GH6493 + # iat/iloc with dups + + s = Series(range(5), index=[1,1,2,2,3], dtype='int64') + result = s.iloc[2] + self.assertEqual(result,2) + result = s.iat[2] + self.assertEqual(result,2) + + self.assertRaises(IndexError, lambda : s.iat[10]) + self.assertRaises(IndexError, lambda : s.iat[-10]) + + result = s.iloc[[2,3]] + expected = Series([2,3],[2,2],dtype='int64') + assert_series_equal(result,expected) + + df = s.to_frame() + result = df.iloc[2] + expected = Series(2,index=[0]) + assert_series_equal(result,expected) + + result = df.iat[2,0] + expected = 2 + self.assertEqual(result,2) + def test_repeated_getitem_dups(self): # GH 5678 # repeated getitem on a dup index returning an ndarray @@ -348,65 +375,83 @@ def test_iloc_exceeds_bounds(self): # iloc should allow indexers that exceed the bounds df = DataFrame(np.random.random_sample((20,5)), columns=list('ABCDE')) expected = df - result = df.iloc[:,[0,1,2,3,4,5]] + + # lists of positions should raise IndexError! + with tm.assertRaisesRegexp(IndexError, 'positional indexers are out-of-bounds'): + df.iloc[:,[0,1,2,3,4,5]] + self.assertRaises(IndexError, lambda : df.iloc[[1,30]]) + self.assertRaises(IndexError, lambda : df.iloc[[1,-30]]) + self.assertRaises(IndexError, lambda : df.iloc[[100]]) + + s = df['A'] + self.assertRaises(IndexError, lambda : s.iloc[[100]]) + self.assertRaises(IndexError, lambda : s.iloc[[-100]]) + + # still raise on a single indexer + with tm.assertRaisesRegexp(IndexError, 'single positional indexer is out-of-bounds'): + df.iloc[30] + self.assertRaises(IndexError, lambda : df.iloc[-30]) + + # slices are ok + result = df.iloc[:,4:10] # 0 < start < len < stop + expected = df.iloc[:,4:] assert_frame_equal(result,expected) - result = df.iloc[[1,30]] - expected = df.iloc[[1]] + result = df.iloc[:,-4:-10] # stop < 0 < start < len + expected = df.iloc[:,:0] assert_frame_equal(result,expected) - result = df.iloc[[1,-30]] - expected = df.iloc[[1]] + result = df.iloc[:,10:4:-1] # 0 < stop < len < start (down) + expected = df.iloc[:,:4:-1] assert_frame_equal(result,expected) - result = df.iloc[:,4:10] - expected = df.iloc[:,4:] + result = df.iloc[:,4:-10:-1] # stop < 0 < start < len (down) + expected = df.iloc[:,4::-1] assert_frame_equal(result,expected) - result = df.iloc[:,-4:-10] - expected = df.iloc[:,-4:] + result = df.iloc[:,-10:4] # start < 0 < stop < len + expected = df.iloc[:,:4] assert_frame_equal(result,expected) - result = df.iloc[[100]] - expected = DataFrame(columns=df.columns) + result = df.iloc[:,10:4] # 0 < stop < len < start + expected = df.iloc[:,:0] assert_frame_equal(result,expected) - # still raise on a single indexer - def f(): - df.iloc[30] - self.assertRaises(IndexError, f) + result = df.iloc[:,-10:-11:-1] # stop < start < 0 < len (down) + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) - s = df['A'] - result = s.iloc[[100]] - expected = Series() + result
= df.iloc[:,10:11] # 0 < len < start < stop + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + # slice bounds exceeding is ok + result = s.iloc[18:30] + expected = s.iloc[18:] assert_series_equal(result,expected) - result = s.iloc[[-100]] - expected = Series() + result = s.iloc[30:] + expected = s.iloc[:0] assert_series_equal(result,expected) - # slice - result = s.iloc[18:30] - expected = s.iloc[18:] + result = s.iloc[30::-1] + expected = s.iloc[::-1] assert_series_equal(result,expected) # doc example - df = DataFrame(np.random.randn(5,2),columns=list('AB')) - result = df.iloc[[4,5,6]] - expected = df.iloc[[4]] - assert_frame_equal(result,expected) + def check(result,expected): + str(result) + result.dtypes + assert_frame_equal(result,expected) - result = df.iloc[4:6] - expected = df.iloc[[4]] - assert_frame_equal(result,expected) + dfl = DataFrame(np.random.randn(5,2),columns=list('AB')) + check(dfl.iloc[:,2:3],DataFrame(index=dfl.index)) + check(dfl.iloc[:,1:3],dfl.iloc[:,[1]]) + check(dfl.iloc[4:6],dfl.iloc[[4]]) - result = df.iloc[:,2:3] - expected = DataFrame(index=df.index) - assert_frame_equal(result,expected) + self.assertRaises(IndexError, lambda : dfl.iloc[[4,5,6]]) + self.assertRaises(IndexError, lambda : dfl.iloc[:,4]) - result = df.iloc[:,1:3] - expected = df.iloc[:,[1]] - assert_frame_equal(result,expected) def test_iloc_getitem_int(self): @@ -551,6 +596,49 @@ def test_loc_setitem(self): expected = DataFrame({'a' : [0.5,-0.5,-1.5], 'b' : [0,1,2] }) assert_frame_equal(df,expected) + def test_loc_setitem_dups(self): + + # GH 6541 + df_orig = DataFrame({'me' : list('rttti'), + 'foo': list('aaade'), + 'bar': np.arange(5,dtype='float64')*1.34+2, + 'bar2': np.arange(5,dtype='float64')*-.34+2}).set_index('me') + + indexer = tuple(['r',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_series_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['r','bar']) + df = df_orig.copy() + df.loc[indexer]*=2.0 + self.assertEqual(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['t',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_frame_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + + def test_chained_getitem_with_lists(self): + + # GH6394 + # Regression in chained getitem indexing with embedded list-like from 0.12 + def check(result, expected): + self.assert_numpy_array_equal(result,expected) + tm.assert_isinstance(result, np.ndarray) + + + df = DataFrame({'A': 5*[np.zeros(3)], 'B':5*[np.ones(3)]}) + expected = df['A'].iloc[2] + result = df.loc[2,'A'] + check(result, expected) + result2 = df.iloc[2]['A'] + check(result2, expected) + result3 = df['A'].loc[2] + check(result3, expected) + result4 = df['A'].iloc[2] + check(result4, expected) + def test_loc_getitem_int(self): # int label @@ -779,6 +867,19 @@ def test_loc_setitem_frame(self): expected = DataFrame(dict(A = Series(val1,index=keys1), B = Series(val2,index=keys2))).reindex(index=index) assert_frame_equal(df, expected) + # GH 6546 + # setting with mixed labels + df = DataFrame({1:[1,2],2:[3,4],'a':['a','b']}) + + result = df.loc[0,[1,2]] + expected = Series([1,3],index=[1,2],dtype=object) + assert_series_equal(result,expected) + + expected = DataFrame({1:[5,2],2:[6,4],'a':['a','b']}) + df.loc[0,[1,2]] = [5,6] + assert_frame_equal(df, expected) + + def test_loc_setitem_frame_multiples(self): # multiple setting @@ -1566,6 +1667,21 @@ def test_dups_fancy_indexing(self): result = df.ix[:,['A','B','C']] assert_frame_equal(result, expected) + # GH 
6504, multi-axis indexing + df = DataFrame(np.random.randn(9,2), index=[1,1,1,2,2,2,3,3,3], columns=['a', 'b']) + + expected = df.iloc[0:6] + result = df.loc[[1, 2]] + assert_frame_equal(result, expected) + + expected = df + result = df.loc[:,['a', 'b']] + assert_frame_equal(result, expected) + + expected = df.iloc[0:6,:] + result = df.loc[[1, 2], ['a', 'b']] + assert_frame_equal(result, expected) + def test_indexing_mixed_frame_bug(self): # GH3492 @@ -2474,14 +2590,14 @@ def test_cache_updating(self): df = tm.makeDataFrame() df['A'] # cache series df.ix["Hello Friend"] = df.ix[0] - self.assert_("Hello Friend" in df['A'].index) - self.assert_("Hello Friend" in df['B'].index) + self.assertIn("Hello Friend", df['A'].index) + self.assertIn("Hello Friend", df['B'].index) panel = tm.makePanel() panel.ix[0] # get first item into cache panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1 - self.assert_("A+1" in panel.ix[0].columns) - self.assert_("A+1" in panel.ix[1].columns) + self.assertIn("A+1", panel.ix[0].columns) + self.assertIn("A+1", panel.ix[1].columns) # 5216 # make sure that we don't try to set a dead cache @@ -2578,7 +2694,7 @@ def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64') - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['A'][0] = -5 df['A'][1] = -6 @@ -2586,11 +2702,11 @@ def test_detect_chained_assignment(self): expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB')) df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['A'][0] = -5 df['A'][1] = np.nan assert_frame_equal(df, expected) - self.assert_(df['A'].is_copy is None) + self.assertIsNone(df['A'].is_copy) # using a copy (the chain), fails df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) @@ -2602,7 +2718,7 @@ def f(): df = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : Series(range(7),dtype='int64') }) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) expected = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : [42,42,2,3,4,42,6]}) @@ -2631,7 +2747,7 @@ def f(): # make sure that is_copy is picked up reconstruction # GH5475 df = DataFrame({"A": [1,2]}) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) with tm.ensure_clean('__tmp__pickle') as path: df.to_pickle(path) df2 = pd.read_pickle(path) @@ -2656,34 +2772,34 @@ def random_text(nobs=100): # always a copy x = df.iloc[[0,1,2]] - self.assert_(x.is_copy is not None) + self.assertIsNotNone(x.is_copy) x = df.iloc[[0,1,2,4]] - self.assert_(x.is_copy is not None) + self.assertIsNotNone(x.is_copy) # explicity copy indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer].copy() - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) # implicity take df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer] - self.assert_(df.is_copy is not None) + self.assertIsNotNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) # implicity take 2 df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer] - self.assert_(df.is_copy is not None) + self.assertIsNotNone(df.is_copy) df.loc[:,'letters'] = 
df['letters'].apply(str.lower) # should be ok even though its a copy! - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) @@ -2691,7 +2807,7 @@ def random_text(nobs=100): # an identical take, so no copy df = DataFrame({'a' : [1]}).dropna() - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['a'] += 1 # inplace ops @@ -3059,6 +3175,26 @@ def test_set_ix_out_of_bounds_axis_1(self): df = pd.DataFrame(randn(5, 2), index=["row%s" % i for i in range(5)], columns=["col%s" % i for i in range(2)]) self.assertRaises(ValueError, df.ix.__setitem__, (0 , 2), 100) + def test_iloc_empty_list_indexer_is_ok(self): + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + assert_frame_equal(df.iloc[:,[]], df.iloc[:, :0]) # vertical empty + assert_frame_equal(df.iloc[[],:], df.iloc[:0, :]) # horizontal empty + + # FIXME: fix loc & xs + def test_loc_empty_list_indexer_is_ok(self): + raise nose.SkipTest('loc discards columns names') + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + assert_frame_equal(df.loc[:,[]], df.iloc[:, :0]) # vertical empty + assert_frame_equal(df.loc[[],:], df.iloc[:0, :]) # horizontal empty + + def test_ix_empty_list_indexer_is_ok(self): + raise nose.SkipTest('ix discards columns names') + from pandas.util.testing import makeCustomDataframe as mkdf + df = mkdf(5, 2) + assert_frame_equal(df.ix[:,[]], df.iloc[:, :0]) # vertical empty + assert_frame_equal(df.ix[[],:], df.iloc[:0, :]) # horizontal empty if __name__ == '__main__': import nose diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 5e47245ff86ab..2c9c8a94a1902 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -260,15 +260,15 @@ def test_attrs(self): self.assertEquals(len(self.mgr), len(self.mgr.items)) def test_is_mixed_dtype(self): - self.assert_(self.mgr.is_mixed_type) + self.assertTrue(self.mgr.is_mixed_type) mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])]) - self.assert_(not mgr.is_mixed_type) + self.assertFalse(mgr.is_mixed_type) def test_is_indexed_like(self): - self.assert_(self.mgr._is_indexed_like(self.mgr)) + self.assertTrue(self.mgr._is_indexed_like(self.mgr)) mgr2 = self.mgr.reindex_axis(np.arange(N - 1), axis=1) - self.assert_(not self.mgr._is_indexed_like(mgr2)) + self.assertFalse(self.mgr._is_indexed_like(mgr2)) def test_block_id_vector_item_dtypes(self): expected = [0, 1, 0, 1, 0, 2, 3, 4] @@ -378,7 +378,7 @@ def test_sparse(self): def test_sparse_mixed(self): mgr = create_blockmanager([get_sparse_ex1(),get_sparse_ex2(),get_float_ex()]) self.assertEqual(len(mgr.blocks), 3) - self.assert_(isinstance(mgr,BlockManager)) + self.assertIsInstance(mgr, BlockManager) # what to test here? 
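Aside, illustrative only and not part of the patch: the empty-list-indexer tests added to test_indexing.py above assert that an empty positional list leaves the other axis intact; a minimal standalone example.

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
df.iloc[:, []]   # all rows, zero columns: same frame as df.iloc[:, :0]
df.iloc[[], :]   # zero rows, all columns: same frame as df.iloc[:0, :]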
@@ -512,7 +512,7 @@ def test_consolidate_ordering_issues(self): cons = self.mgr.consolidate() self.assertEquals(cons.nblocks, 1) - self.assert_(cons.blocks[0].items.equals(cons.items)) + self.assertTrue(cons.blocks[0].items.equals(cons.items)) def test_reindex_index(self): pass @@ -591,7 +591,7 @@ def test_equals(self): block1.ref_items = block2.ref_items = index bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) - self.assert_(bm1.equals(bm2)) + self.assertTrue(bm1.equals(bm2)) # non-unique items index = Index(list('aaabbb')) @@ -602,7 +602,7 @@ def test_equals(self): block1.ref_items = block2.ref_items = index bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) - self.assert_(bm1.equals(bm2)) + self.assertTrue(bm1.equals(bm2)) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 492f681a72541..aef4e3a72c099 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -75,7 +75,7 @@ def test_dataframe_constructor(self): index=[np.array(['a', 'a', 'b', 'b']), np.array(['x', 'y', 'x', 'y'])]) tm.assert_isinstance(multi.index, MultiIndex) - self.assert_(not isinstance(multi.columns, MultiIndex)) + self.assertNotIsInstance(multi.columns, MultiIndex) multi = DataFrame(np.random.randn(4, 4), columns=[['a', 'a', 'b', 'b'], @@ -157,17 +157,17 @@ def test_reindex(self): def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] chunk = self.ymd.reindex(new_index) - self.assert_(chunk.index is new_index) + self.assertIs(chunk.index, new_index) chunk = self.ymd.ix[new_index] - self.assert_(chunk.index is new_index) + self.assertIs(chunk.index, new_index) ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) - self.assert_(chunk.columns is new_index) + self.assertIs(chunk.columns, new_index) chunk = ymdT.ix[:, new_index] - self.assert_(chunk.columns is new_index) + self.assertIs(chunk.columns, new_index) def test_sort_index_preserve_levels(self): result = self.frame.sort_index() @@ -391,6 +391,22 @@ def test_xs(self): assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) + # GH 6574 + # missing values in returned index should be preserrved + acc = [ + ('a','abcde',1), + ('b','bbcde',2), + ('y','yzcde',25), + ('z','xbcde',24), + ('z',None,26), + ('z','zbcde',25), + ('z','ybcde',26), + ] + df = DataFrame(acc, columns=['a1','a2','cnt']).set_index(['a1','a2']) + expected = DataFrame({ 'cnt' : [24,26,25,26] }, index=Index(['xbcde',np.nan,'zbcde','ybcde'],name='a2')) + result = df.xs('z',level='a1') + assert_frame_equal(result, expected) + def test_xs_partial(self): result = self.frame.xs('foo') result2 = self.frame.ix['foo'] @@ -679,8 +695,8 @@ def test_reset_index_with_drop(self): deleveled = self.series.reset_index() tm.assert_isinstance(deleveled, DataFrame) - self.assert_( - len(deleveled.columns) == len(self.series.index.levels) + 1) + self.assertEqual(len(deleveled.columns), + len(self.series.index.levels) + 1) deleveled = self.series.reset_index(drop=True) tm.assert_isinstance(deleveled, Series) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 702307c8b7109..3f6e4c6f3288c 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -64,7 +64,7 @@ def test_copy_names(self): getattr(self.panel, attr).name = None cp = 
self.panel.copy() getattr(cp, attr).name = 'foo' - self.assert_(getattr(self.panel, attr).name is None) + self.assertIsNone(getattr(self.panel, attr).name) def test_iter(self): tm.equalContents(list(self.panel), self.panel.items) @@ -192,18 +192,18 @@ def test_set_axis(self): self.panel.items = new_items if hasattr(self.panel, '_item_cache'): - self.assert_('ItemA' not in self.panel._item_cache) - self.assert_(self.panel.items is new_items) + self.assertNotIn('ItemA', self.panel._item_cache) + self.assertIs(self.panel.items, new_items) item = self.panel[0] self.panel.major_axis = new_major - self.assert_(self.panel[0].index is new_major) - self.assert_(self.panel.major_axis is new_major) + self.assertIs(self.panel[0].index, new_major) + self.assertIs(self.panel.major_axis, new_major) item = self.panel[0] self.panel.minor_axis = new_minor - self.assert_(self.panel[0].columns is new_minor) - self.assert_(self.panel.minor_axis is new_minor) + self.assertIs(self.panel[0].columns, new_minor) + self.assertIs(self.panel.minor_axis, new_minor) def test_get_axis_number(self): self.assertEqual(self.panel._get_axis_number('items'), 0) @@ -409,10 +409,10 @@ def test_delitem_and_pop(self): expected = self.panel['ItemA'] result = self.panel.pop('ItemA') assert_frame_equal(expected, result) - self.assert_('ItemA' not in self.panel.items) + self.assertNotIn('ItemA', self.panel.items) del self.panel['ItemB'] - self.assert_('ItemB' not in self.panel.items) + self.assertNotIn('ItemB', self.panel.items) self.assertRaises(Exception, self.panel.__delitem__, 'ItemB') values = np.empty((3, 3, 3)) @@ -796,7 +796,7 @@ def test_set_value(self): # resize res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5) tm.assert_isinstance(res, Panel) - self.assert_(res is not self.panel) + self.assertIsNot(res, self.panel) self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) @@ -831,10 +831,10 @@ def setUp(self): def test_constructor(self): # with BlockManager wp = Panel(self.panel._data) - self.assert_(wp._data is self.panel._data) + self.assertIs(wp._data, self.panel._data) wp = Panel(self.panel._data, copy=True) - self.assert_(wp._data is not self.panel._data) + self.assertIsNot(wp._data, self.panel._data) assert_panel_equal(wp, self.panel) # strings handled prop @@ -846,11 +846,11 @@ def test_constructor(self): # no copy wp = Panel(vals) - self.assert_(wp.values is vals) + self.assertIs(wp.values, vals) # copy wp = Panel(vals, copy=True) - self.assert_(wp.values is not vals) + self.assertIsNot(wp.values, vals) def test_constructor_cast(self): zero_filled = self.panel.fillna(0) @@ -1211,9 +1211,9 @@ def test_reindex_multi(self): minor=self.panel.minor_axis, copy = False) - self.assert_(result.items is self.panel.items) - self.assert_(result.major_axis is self.panel.major_axis) - self.assert_(result.minor_axis is self.panel.minor_axis) + self.assertIs(result.items, self.panel.items) + self.assertIs(result.major_axis, self.panel.major_axis) + self.assertIs(result.minor_axis, self.panel.minor_axis) result = self.panel.reindex(items=self.panel.items, major=self.panel.major_axis, @@ -1337,13 +1337,13 @@ def test_truncate_fillna_bug(self): def test_swapaxes(self): result = self.panel.swapaxes('items', 'minor') - self.assert_(result.items is self.panel.minor_axis) + self.assertIs(result.items, self.panel.minor_axis) result = self.panel.swapaxes('items', 'major') - self.assert_(result.items is self.panel.major_axis) + self.assertIs(result.items, self.panel.major_axis) 
result = self.panel.swapaxes('major', 'minor') - self.assert_(result.major_axis is self.panel.minor_axis) + self.assertIs(result.major_axis, self.panel.minor_axis) panel = self.panel.copy() result = panel.swapaxes('major', 'minor') @@ -1353,7 +1353,7 @@ def test_swapaxes(self): # this should also work result = self.panel.swapaxes(0, 1) - self.assert_(result.items is self.panel.major_axis) + self.assertIs(result.items, self.panel.major_axis) # this works, but return a copy result = self.panel.swapaxes('items', 'items') diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index d24ff186e2b04..a7a87c998d839 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -171,16 +171,16 @@ def test_set_axis(self): self.panel4d.labels = new_labels if hasattr(self.panel4d, '_item_cache'): - self.assert_('l1' not in self.panel4d._item_cache) - self.assert_(self.panel4d.labels is new_labels) + self.assertNotIn('l1', self.panel4d._item_cache) + self.assertIs(self.panel4d.labels, new_labels) self.panel4d.major_axis = new_major - self.assert_(self.panel4d[0].major_axis is new_major) - self.assert_(self.panel4d.major_axis is new_major) + self.assertIs(self.panel4d[0].major_axis, new_major) + self.assertIs(self.panel4d.major_axis, new_major) self.panel4d.minor_axis = new_minor - self.assert_(self.panel4d[0].minor_axis is new_minor) - self.assert_(self.panel4d.minor_axis is new_minor) + self.assertIs(self.panel4d[0].minor_axis, new_minor) + self.assertIs(self.panel4d.minor_axis, new_minor) def test_get_axis_number(self): self.assertEqual(self.panel4d._get_axis_number('labels'), 0) @@ -294,10 +294,10 @@ def test_delitem_and_pop(self): expected = self.panel4d['l2'] result = self.panel4d.pop('l2') assert_panel_equal(expected, result) - self.assert_('l2' not in self.panel4d.labels) + self.assertNotIn('l2', self.panel4d.labels) del self.panel4d['l3'] - self.assert_('l3' not in self.panel4d.labels) + self.assertNotIn('l3', self.panel4d.labels) self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') values = np.empty((4, 4, 4, 4)) @@ -535,7 +535,7 @@ def test_set_value(self): # resize res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) tm.assert_isinstance(res, Panel4D) - self.assert_(res is not self.panel4d) + self.assertIsNot(res, self.panel4d) self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) @@ -558,10 +558,10 @@ def setUp(self): def test_constructor(self): # with BlockManager panel4d = Panel4D(self.panel4d._data) - self.assert_(panel4d._data is self.panel4d._data) + self.assertIs(panel4d._data, self.panel4d._data) panel4d = Panel4D(self.panel4d._data, copy=True) - self.assert_(panel4d._data is not self.panel4d._data) + self.assertIsNot(panel4d._data, self.panel4d._data) assert_panel4d_equal(panel4d, self.panel4d) # strings handled prop @@ -573,11 +573,11 @@ def test_constructor(self): # no copy panel4d = Panel4D(vals) - self.assert_(panel4d.values is vals) + self.assertIs(panel4d.values, vals) # copy panel4d = Panel4D(vals, copy=True) - self.assert_(panel4d.values is not vals) + self.assertIsNot(panel4d.values, vals) def test_constructor_cast(self): zero_filled = self.panel4d.fillna(0) @@ -851,23 +851,23 @@ def test_fillna(self): def test_swapaxes(self): result = self.panel4d.swapaxes('labels', 'items') - self.assert_(result.items is self.panel4d.labels) + self.assertIs(result.items, self.panel4d.labels) result = self.panel4d.swapaxes('labels', 'minor') - 
self.assert_(result.labels is self.panel4d.minor_axis) + self.assertIs(result.labels, self.panel4d.minor_axis) result = self.panel4d.swapaxes('items', 'minor') - self.assert_(result.items is self.panel4d.minor_axis) + self.assertIs(result.items, self.panel4d.minor_axis) result = self.panel4d.swapaxes('items', 'major') - self.assert_(result.items is self.panel4d.major_axis) + self.assertIs(result.items, self.panel4d.major_axis) result = self.panel4d.swapaxes('major', 'minor') - self.assert_(result.major_axis is self.panel4d.minor_axis) + self.assertIs(result.major_axis, self.panel4d.minor_axis) # this should also work result = self.panel4d.swapaxes(0, 1) - self.assert_(result.labels is self.panel4d.items) + self.assertIs(result.labels, self.panel4d.items) # this works, but return a copy result = self.panel4d.swapaxes('items', 'items') diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 2ec67de989069..93fa739f7f218 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -74,7 +74,7 @@ def test_copy_index_name_checking(self): cp = self.ts.copy() cp.index.name = 'foo' print(self.ts.index.name) - self.assert_(self.ts.index.name is None) + self.assertIsNone(self.ts.index.name) def test_append_preserve_name(self): result = self.ts[:5].append(self.ts[5:]) @@ -93,7 +93,7 @@ def test_binop_maybe_preserve_name(self): cp = self.ts.copy() cp.name = 'something else' result = self.ts + cp - self.assert_(result.name is None) + self.assertIsNone(result.name) def test_combine_first_name(self): result = self.ts.combine_first(self.ts[:5]) @@ -113,6 +113,28 @@ def test_combine_first_dt64(self): xp = Series([datetime(2010, 1, 1), '2011']) assert_series_equal(rs, xp) + def test_get(self): + + # GH 6383 + s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, + 45, 51, 39, 55, 43, 54, 52, 51, 54])) + + result = s.get(25, 0) + expected = 0 + self.assertEquals(result,expected) + + s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56, + 45, 51, 39, 55, 43, 54, 52, 51, 54]), + index=pd.Float64Index([25.0, 36.0, 49.0, 64.0, 81.0, 100.0, + 121.0, 144.0, 169.0, 196.0, 1225.0, + 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, + 1681.0, 1764.0, 1849.0, 1936.0], + dtype='object')) + + result = s.get(25, 0) + expected = 43 + self.assertEquals(result,expected) + def test_delitem(self): # GH 5542 @@ -209,18 +231,18 @@ def test_name_printing(self): # test small series s = Series([0, 1, 2]) s.name = "test" - self.assert_("Name: test" in repr(s)) + self.assertIn("Name: test", repr(s)) s.name = None - self.assert_(not "Name:" in repr(s)) + self.assertNotIn("Name:", repr(s)) # test big series (diff code path) s = Series(lrange(0, 1000)) s.name = "test" - self.assert_("Name: test" in repr(s)) + self.assertIn("Name: test", repr(s)) s.name = None - self.assert_(not "Name:" in repr(s)) + self.assertNotIn("Name:", repr(s)) s = Series(index=date_range('20010101', '20020101'), name='test') - self.assert_("Name: test" in repr(s)) + self.assertIn("Name: test", repr(s)) def test_pickle_preserve_name(self): unpickled = self._pickle_roundtrip_name(self.ts) @@ -321,7 +343,7 @@ def test_scalar_conversion(self): # Pass in scalar is disabled scalar = Series(0.5) - self.assert_(not isinstance(scalar, float)) + self.assertNotIsInstance(scalar, float) # coercion self.assertEqual(float(Series([1.])), 1.0) @@ -351,10 +373,10 @@ def test_constructor(self): # Mixed type Series mixed = Series(['hello', np.NaN], index=[0, 1]) self.assertEqual(mixed.dtype, np.object_) - self.assert_(mixed[1] is 
np.NaN) + self.assertIs(mixed[1], np.NaN) - self.assert_(not self.empty.is_time_series) - self.assert_(not Series({}).is_time_series) + self.assertFalse(self.empty.is_time_series) + self.assertFalse(Series({}).is_time_series) self.assertRaises(Exception, Series, np.random.randn(3, 3), index=np.arange(3)) @@ -578,6 +600,25 @@ def test_constructor_dtype_datetime64(self): self.assertEqual(result['a'], Timestamp('20130101')) self.assertEqual(result['b'], 1) + # GH6529 + # coerce datetime64 non-ns properly + dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') + values2 = dates.view(np.ndarray).astype('datetime64[ns]') + expected = Series(values2, dates) + + # numpy < 1.7 is very odd about astyping + if not _np_version_under1p7: + for dtype in ['s','D','ms','us','ns']: + values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + result = Series(values1, dates) + assert_series_equal(result,expected) + + # leave datetime.date alone + dates2 = np.array([ d.date() for d in dates.to_pydatetime() ],dtype=object) + series1 = Series(dates2, dates) + self.assert_numpy_array_equal(series1.values,dates2) + self.assertEqual(series1.dtype,object) + def test_constructor_dict(self): d = {'a': 0., 'b': 1., 'c': 2.} result = Series(d, index=['b', 'c', 'd', 'a']) @@ -637,7 +678,7 @@ def test_fromDict(self): data = {'a': 0, 'b': 1, 'c': 2, 'd': 3} series = Series(data) - self.assert_(tm.is_sorted(series.index)) + self.assertTrue(tm.is_sorted(series.index)) data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()} series = Series(data) @@ -669,6 +710,21 @@ def test_setindex(self): def test_array_finalize(self): pass + def test_pop(self): + # GH 6600 + df = DataFrame({ + 'A': 0, + 'B': np.arange(5,dtype='int64'), + 'C': 0, + }) + k = df.iloc[4] + + result = k.pop('B') + self.assertEqual(result, 4) + + expected = Series([0,0],index=['A','C']) + assert_series_equal(k, expected) + def test_not_hashable(self): s_empty = Series() s = Series([1]) @@ -728,7 +784,7 @@ def test_getitem_get(self): # GH 5652 for s in [Series(), Series(index=list('abc'))]: result = s.get(None) - self.assert_(result is None) + self.assertIsNone(result) def test_iget(self): s = Series(np.random.randn(10), index=lrange(0, 20, 2)) @@ -927,6 +983,12 @@ def test_getitem_dups_with_missing(self): result = s[['foo', 'bar', 'bah', 'bam']] assert_series_equal(result, expected) + def test_getitem_dups(self): + s = Series(range(5),index=['A','A','B','C','C'],dtype=np.int64) + expected = Series([3,4],index=['C','C'],dtype=np.int64) + result = s['C'] + assert_series_equal(result, expected) + def test_setitem_ambiguous_keyerror(self): s = Series(lrange(10), index=lrange(0, 20, 2)) @@ -956,8 +1018,8 @@ def test_slice(self): numSliceEnd = self.series[-10:] objSlice = self.objSeries[10:20] - self.assert_(self.series.index[9] not in numSlice.index) - self.assert_(self.objSeries.index[9] not in objSlice.index) + self.assertNotIn(self.series.index[9], numSlice.index) + self.assertNotIn(self.objSeries.index[9], objSlice.index) self.assertEqual(len(numSlice), len(numSlice.index)) self.assertEqual(self.series[numSlice.index[0]], @@ -1075,13 +1137,13 @@ def test_setitem_dtypes(self): def test_set_value(self): idx = self.ts.index[10] res = self.ts.set_value(idx, 0) - self.assert_(res is self.ts) + self.assertIs(res, self.ts) self.assertEqual(self.ts[idx], 0) # equiv s = self.series.copy() res = s.set_value('foobar', 0) - self.assert_(res is s) + self.assertIs(res, s) self.assertEqual(res.index[-1], 'foobar') self.assertEqual(res['foobar'], 0) @@ -1128,10 
+1190,10 @@ def test_reshape_non_2d(self): def test_reshape_2d_return_array(self): x = Series(np.random.random(201), name='x') result = x.reshape((-1, 1)) - self.assert_(not isinstance(result, Series)) + self.assertNotIsInstance(result, Series) result2 = np.reshape(x, (-1, 1)) - self.assert_(not isinstance(result, Series)) + self.assertNotIsInstance(result, Series) result = x[:, None] expected = x.reshape((-1, 1)) @@ -1628,12 +1690,12 @@ def test_repr(self): # 0 as name ser = Series(np.random.randn(100), name=0) rep_str = repr(ser) - self.assert_("Name: 0" in rep_str) + self.assertIn("Name: 0", rep_str) # tidy repr ser = Series(np.random.randn(1001), name=0) rep_str = repr(ser) - self.assert_("Name: 0" in rep_str) + self.assertIn("Name: 0", rep_str) ser = Series(["a\n\r\tb"], name=["a\n\r\td"], index=["a\n\r\tf"]) self.assertFalse("\t" in repr(ser)) @@ -1734,7 +1796,7 @@ def test_keys(self): # HACK: By doing this in two stages, we avoid 2to3 wrapping the call # to .keys() in a list() getkeys = self.ts.keys - self.assert_(getkeys() is self.ts.index) + self.assertIs(getkeys(), self.ts.index) def test_values(self): self.assert_numpy_array_equal(self.ts, self.ts.values) @@ -2044,7 +2106,7 @@ def test_round(self): def test_prod_numpy16_bug(self): s = Series([1., 1., 1.], index=lrange(3)) result = s.prod() - self.assert_(not isinstance(result, Series)) + self.assertNotIsInstance(result, Series) def test_quantile(self): from pandas.compat.scipy import scoreatpercentile @@ -2055,6 +2117,10 @@ def test_quantile(self): q = self.ts.quantile(0.9) self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90)) + # object dtype + q = Series(self.ts,dtype=object).quantile(0.9) + self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90)) + def test_describe(self): _ = self.series.describe() _ = self.ts.describe() @@ -2803,6 +2869,14 @@ def test_datetime64_fillna(self): Timestamp('20130103 9:01:01')]) assert_series_equal(result, expected) + # GH 6587 + # make sure that we are treating as integer when filling + s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001']) + expected = Series(['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001'], dtype='M8[ns]') + result = s.fillna(method='backfill') + assert_series_equal(result, expected) + + def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) s.fillna(method='ffill', inplace=True) @@ -2828,8 +2902,8 @@ def test_fillna(self): ts[2] = np.NaN - self.assert_( - np.array_equal(ts.fillna(method='ffill'), [0., 1., 1., 3., 4.])) + self.assert_numpy_array_equal(ts.fillna(method='ffill'), + [0., 1., 1., 3., 4.]) self.assert_numpy_array_equal(ts.fillna(method='backfill'), [0., 1., 3., 3., 4.]) @@ -2890,7 +2964,7 @@ def test_fillna_invalid_method(self): try: self.ts.fillna(method='ffil') except ValueError as inst: - self.assert_('ffil' in str(inst)) + self.assertIn('ffil', str(inst)) def test_ffill(self): ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) @@ -4008,8 +4082,8 @@ def test_from_csv(self): self.series.to_csv(path) series = Series.from_csv(path) - self.assert_(series.name is None) - self.assert_(series.index.name is None) + self.assertIsNone(series.name) + self.assertIsNone(series.index.name) assert_series_equal(self.series, series) outfile = open(path, 'w') @@ -4791,6 +4865,7 @@ def test_apply_args(self): result = s.apply(str.split, args=(',',)) self.assertEqual(result[0], ['foo', 'bar']) + tm.assert_isinstance(result[0], list) def test_align(self): def _check_align(a, b, how='left', fill=None): 
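A brief usage sketch (not part of the patch) of what the assertion added in the test_apply_args hunk above pins down: extra positional arguments are passed through by Series.apply, and an element-wise result such as str.split stays a plain Python list.

import pandas as pd

s = pd.Series(['foo,bar,baz'])
out = s.apply(str.split, args=(',',))
print(out[0])         # ['foo', 'bar', 'baz'] -- a plain list, not coerced
print(type(out[0]))   # <class 'list'>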
@@ -4894,12 +4969,12 @@ def test_align_nocopy(self): def test_align_sameindex(self): a, b = self.ts.align(self.ts, copy=False) - self.assert_(a.index is self.ts.index) - self.assert_(b.index is self.ts.index) + self.assertIs(a.index, self.ts.index) + self.assertIs(b.index, self.ts.index) # a, b = self.ts.align(self.ts, copy=True) - # self.assert_(a.index is not self.ts.index) - # self.assert_(b.index is not self.ts.index) + # self.assertIsNot(a.index, self.ts.index) + # self.assertIsNot(b.index, self.ts.index) def test_reindex(self): identity = self.series.reindex(self.series.index) @@ -4948,7 +5023,7 @@ def test_reindex_corner(self): def test_reindex_pad(self): - s = Series(np.arange(10)) + s = Series(np.arange(10),dtype='int64') s2 = s[::2] reindexed = s2.reindex(s.index, method='pad') @@ -4970,9 +5045,8 @@ def test_reindex_pad(self): result = s.reindex(new_index).ffill(downcast='infer') assert_series_equal(result, expected) - # this preserves dtype - result = s.reindex(new_index, method='ffill') - assert_series_equal(result, expected) + # invalid because we can't forward fill on this type of index + self.assertRaises(ValueError, lambda : s.reindex(new_index, method='ffill')) # inferrence of new dtype s = Series([True,False,False,True],index=list('abcd')) @@ -5093,13 +5167,13 @@ def test_rename(self): assert_series_equal(renamed, renamed2) # partial dict - s = Series(np.arange(4), index=['a', 'b', 'c', 'd']) + s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64') renamed = s.rename({'b': 'foo', 'd': 'bar'}) self.assert_numpy_array_equal(renamed.index, ['a', 'foo', 'c', 'bar']) # index with name renamer = Series( - np.arange(4), index=Index(['a', 'b', 'c', 'd'], name='name')) + np.arange(4), index=Index(['a', 'b', 'c', 'd'], name='name'), dtype='int64') renamed = renamer.rename({}) self.assertEqual(renamed.index.name, renamer.index.name) @@ -5443,7 +5517,7 @@ def test_asfreq(self): result = ts[:0].asfreq('M') self.assertEqual(len(result), 0) - self.assert_(result is not ts) + self.assertIsNot(result, ts) def test_weekday(self): # Just run the function @@ -5521,12 +5595,12 @@ def test_first_last_valid(self): self.assertEqual(index, ts.index[-6]) ts[:] = np.nan - self.assert_(ts.last_valid_index() is None) - self.assert_(ts.first_valid_index() is None) + self.assertIsNone(ts.last_valid_index()) + self.assertIsNone(ts.first_valid_index()) ser = Series([], index=[]) - self.assert_(ser.last_valid_index() is None) - self.assert_(ser.first_valid_index() is None) + self.assertIsNone(ser.last_valid_index()) + self.assertIsNone(ser.first_valid_index()) def test_mpl_compat_hack(self): result = self.ts[:, np.newaxis] @@ -5635,10 +5709,10 @@ def test_reset_index(self): ser.name = 'value' df = ser.reset_index() - self.assert_('value' in df) + self.assertIn('value', df) df = ser.reset_index(name='value2') - self.assert_('value2' in df) + self.assertIn('value2', df) # check inplace s = ser.reset_index(drop=True) @@ -5671,7 +5745,7 @@ def test_timeseries_coercion(self): idx = tm.makeDateIndex(10000) ser = Series(np.random.randn(len(idx)), idx.astype(object)) self.assertTrue(ser.is_time_series) - self.assert_(isinstance(ser.index, DatetimeIndex)) + self.assertIsInstance(ser.index, DatetimeIndex) def test_replace(self): N = 100 diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index 7e2144e801122..cb3fdcafd4056 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -12,7 +12,6 @@ assert_almost_equal) import pandas.util.testing as tm - class 
TestRank(tm.TestCase): _multiprocess_can_split_ = True s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) @@ -23,7 +22,8 @@ class TestRank(tm.TestCase): 3.5, 1.5, 8.0, nan, 5.5]), 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), } def test_rank_tie_methods(self): @@ -43,6 +43,24 @@ def _check(s, expected, method='average'): series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) + def test_rank_dense_method(self): + dtypes = ['O', 'f8', 'i8'] + in_out = [([1], [1]), + ([2], [1]), + ([0], [1]), + ([2,2], [1,1]), + ([1,2,3], [1,2,3]), + ([4,2,1], [3,2,1],), + ([1,1,5,5,3], [1,1,3,3,2]), + ([-5,-4,-3,-2,-1], [1,2,3,4,5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense') + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + def test_rank_descending(self): dtypes = ['O', 'f8', 'i8'] diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ad55ce5c3aec5..db2d61d997c43 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -32,7 +32,7 @@ class TestStringMethods(tm.TestCase): def test_api(self): # GH 6106 - self.assert_(Series.str is None) + self.assertIsNone(Series.str) def test_iter(self): # GH3638 @@ -548,6 +548,23 @@ def test_extract(self): exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) + # GH6348 + # not passing index to the extractor + def check_index(index): + data = ['A1', 'B2', 'C'] + index = index[:len(data)] + result = Series(data, index=index).str.extract('(\d)') + exp = Series(['1', '2', NA], index=index) + tm.assert_series_equal(result, exp) + + result = Series(data, index=index).str.extract('(?P\D)(?P\d)?') + exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'], index=index) + tm.assert_frame_equal(result, exp) + + for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex, + tm.makeDateIndex, tm.makePeriodIndex ]: + check_index(index()) + def test_get_dummies(self): s = Series(['a|b', 'a|c', np.nan]) result = s.str.get_dummies('|') @@ -778,6 +795,12 @@ def test_split_maxsplit(self): result = s.str.split('asdf', n=-1) tm.assert_series_equal(result, xp) + def test_split_no_pat_with_nonzero_n(self): + s = Series(['split once', 'split once too!']) + result = s.str.split(n=1) + expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']}) + tm.assert_series_equal(expected, result) + def test_pipe_failures(self): # #2119 s = Series(['A|B|C']) @@ -1074,6 +1097,7 @@ def test_encode_decode_errors(self): tm.assert_series_equal(result, exp) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 41a4cf9984c14..90e713d72bdda 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -957,7 +957,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, objs = [objs[k] for k in keys] if keys is None: - objs = [obj for obj in objs if obj is not None] + objs = [obj for obj in objs if obj is not None ] else: # #1649 clean_keys = [] @@ -973,16 +973,43 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, if len(objs) == 0: raise 
Exception('All objects passed were None') - # consolidate data + # consolidate data & figure out what our result ndim is going to be + ndims = set() for obj in objs: - if isinstance(obj, NDFrame): - obj.consolidate(inplace=True) - self.objs = objs + if not isinstance(obj, NDFrame): + raise TypeError("cannot concatenate a non-NDFrame object") + + # consolidate + obj.consolidate(inplace=True) + ndims.add(obj.ndim) + + # get the sample + # want the higest ndim that we have, and must be non-empty + # unless all objs are empty + sample = None + if len(ndims) > 1: + max_ndim = max(ndims) + for obj in objs: + if obj.ndim == max_ndim and np.sum(obj.shape): + sample = obj + break - sample = objs[0] + else: + # filter out the empties + # if we have not multi-index possibiltes + df = DataFrame([ obj.shape for obj in objs ]).sum(1) + non_empties = df[df!=0] + if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None): + objs = [ objs[i] for i in non_empties.index ] + sample = objs[0] + + if sample is None: + sample = objs[0] + self.objs = objs # Need to flip BlockManager axis in the DataFrame special case - if isinstance(sample, DataFrame): + self._is_frame = isinstance(sample, DataFrame) + if self._is_frame: axis = 1 if axis == 0 else 0 self._is_series = isinstance(sample, ABCSeries) @@ -990,11 +1017,39 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, raise AssertionError("axis must be between 0 and {0}, " "input was {1}".format(sample.ndim, axis)) + # if we have mixed ndims, then convert to highest ndim + # creating column numbers as needed + if len(ndims) > 1: + current_column = 0 + max_ndim = sample.ndim + self.objs, objs = [], self.objs + for obj in objs: + + ndim = obj.ndim + if ndim == max_ndim: + pass + + elif ndim != max_ndim-1: + raise ValueError("cannot concatenate unaligned mixed " + "dimensional NDFrame objects") + + else: + name = getattr(obj,'name',None) + if ignore_index or name is None: + name = current_column + current_column += 1 + + # doing a row-wise concatenation so need everything + # to line up + if self._is_frame and axis == 1: + name = 0 + obj = sample._constructor({ name : obj }) + + self.objs.append(obj) + # note: this is the BlockManager axis (since DataFrame is transposed) self.axis = axis - self.join_axes = join_axes - self.keys = keys self.names = names self.levels = levels diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6645391aeda64..c3fa5b49fa28b 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -90,8 +90,8 @@ def test_cython_left_outer_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_(np.array_equal(ls, exp_ls)) - self.assert_(np.array_equal(rs, exp_rs)) + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -116,8 +116,8 @@ def test_cython_right_outer_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_(np.array_equal(ls, exp_ls)) - self.assert_(np.array_equal(rs, exp_rs)) + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -140,8 +140,8 @@ def test_cython_inner_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_(np.array_equal(ls, exp_ls)) - self.assert_(np.array_equal(rs, 
exp_rs)) + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') @@ -179,15 +179,15 @@ def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) - self.assert_('key1.foo' in joined) - self.assert_('key1.bar' in joined) + self.assertIn('key1.foo', joined) + self.assertIn('key1.bar', joined) def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) - self.assert_('key1.foo' in joined) - self.assert_('key2.bar' in joined) + self.assertIn('key1.foo', joined) + self.assertIn('key2.bar', joined) def test_merge_common(self): joined = merge(self.df, self.df2) @@ -199,8 +199,8 @@ def test_join_on(self): source = self.source merged = target.join(source, on='C') - self.assert_(np.array_equal(merged['MergedA'], target['A'])) - self.assert_(np.array_equal(merged['MergedD'], target['D'])) + self.assert_numpy_array_equal(merged['MergedA'], target['A']) + self.assert_numpy_array_equal(merged['MergedD'], target['D']) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) @@ -269,7 +269,7 @@ def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on='C') for col in self.source: - self.assert_(col in merged) + self.assertIn(col, merged) self.assert_(merged[col].isnull().all()) merged2 = self.target.join(self.source.reindex([]), on='C', @@ -285,8 +285,8 @@ def test_join_on_inner(self): expected = df.join(df2, on='key') expected = expected[expected['value'].notnull()] - self.assert_(np.array_equal(joined['key'], expected['key'])) - self.assert_(np.array_equal(joined['value'], expected['value'])) + self.assert_numpy_array_equal(joined['key'], expected['key']) + self.assert_numpy_array_equal(joined['value'], expected['value']) self.assert_(joined.index.equals(expected.index)) def test_join_on_singlekey_list(self): @@ -565,8 +565,8 @@ def test_merge_overlap(self): merged = merge(self.left, self.left, on='key') exp_len = (self.left['key'].value_counts() ** 2).sum() self.assertEqual(len(merged), exp_len) - self.assert_('v1_x' in merged) - self.assert_('v1_y' in merged) + self.assertIn('v1_x', merged) + self.assertIn('v1_y', merged) def test_merge_different_column_key_names(self): left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], @@ -612,7 +612,7 @@ def test_join_sort(self): # smoke test joined = left.join(right, on='key', sort=False) - self.assert_(np.array_equal(joined.index, lrange(4))) + self.assert_numpy_array_equal(joined.index, lrange(4)) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -651,15 +651,15 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') - self.assert_(np.array_equal(merged['key_0'], - np.array([1, 1, 1, 1, 2, 2, 3, 4, 5]))) + self.assert_numpy_array_equal(merged['key_0'], + np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])) left = DataFrame({'value': lrange(3)}) right = DataFrame({'rvalue': lrange(6)}) key = np.array([0, 1, 1, 2, 2, 3]) merged = merge(left, right, left_index=True, right_on=key, how='outer') - self.assert_(np.array_equal(merged['key_0'], key)) + self.assert_numpy_array_equal(merged['key_0'], key) def test_mixed_type_join_with_suffix(self): # GH #916 @@ -1222,10 +1222,10 
@@ def test_append(self): del end_frame['A'] partial_appended = begin_frame.append(end_frame) - self.assert_('A' in partial_appended) + self.assertIn('A', partial_appended) partial_appended = end_frame.append(begin_frame) - self.assert_('A' in partial_appended) + self.assertIn('A', partial_appended) # mixed type handling appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) @@ -1245,11 +1245,11 @@ def test_append(self): appended = self.frame.append(empty) assert_frame_equal(self.frame, appended) - self.assert_(appended is not self.frame) + self.assertIsNot(appended, self.frame) appended = empty.append(self.frame) assert_frame_equal(self.frame, appended) - self.assert_(appended is not self.frame) + self.assertIsNot(appended, self.frame) # overlap self.assertRaises(ValueError, self.frame.append, self.frame, @@ -1414,7 +1414,7 @@ def test_concat_keys_specific_levels(self): levels=[level], names=['group_key']) - self.assert_(np.array_equal(result.columns.levels[0], level)) + self.assert_numpy_array_equal(result.columns.levels[0], level) self.assertEqual(result.columns.names[0], 'group_key') def test_concat_dataframe_keys_bug(self): @@ -1518,7 +1518,7 @@ def test_concat_keys_and_levels(self): ('baz', 'one'), ('baz', 'two')], names=['first', 'second']) self.assertEqual(result.index.names, ('first', 'second') + (None,)) - self.assert_(np.array_equal(result.index.levels[0], ['baz', 'foo'])) + self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) def test_concat_keys_levels_no_overlap(self): # GH #1406 @@ -1653,6 +1653,77 @@ def test_handle_empty_objects(self): tm.assert_frame_equal(concatted, expected) + # empty as first element with time series + # GH3259 + df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) + empty = DataFrame() + result = concat([df,empty],axis=1) + assert_frame_equal(result, df) + result = concat([empty,df],axis=1) + assert_frame_equal(result, df) + + result = concat([df,empty]) + assert_frame_equal(result, df) + result = concat([empty,df]) + assert_frame_equal(result, df) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # G2385 + + # axis 1 + index=date_range('01-Jan-2013', periods=10, freq='H') + arr = np.arange(10, dtype='int64') + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1,1), index=index) + + expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0]) + result = concat([df,df], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1]) + result = concat([s1,s2], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2]) + result = concat([s1,s2,s1], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3]) + result = concat([s1,df,s2,s2,s1], axis=1) + assert_frame_equal(result, expected) + + # with names + s1.name = 'foo' + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0]) + result = concat([s1,df,s2], axis=1) + assert_frame_equal(result, expected) + + s2.name = 'bar' + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar']) + result = concat([s1,df,s2], axis=1) + assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 
1, 2]) + result = concat([s1,df,s2], axis=1, ignore_index=True) + assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0]) + result = concat([s1,df,s2]) + assert_frame_equal(result, expected) + + expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0]) + result = concat([s1,df,s2], ignore_index=True) + assert_frame_equal(result, expected) + + # invalid concatente of mixed dims + panel = tm.makePanel() + self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1)) + def test_panel_join(self): panel = tm.makePanel() tm.add_nans(panel) @@ -1967,6 +2038,13 @@ def test_concat_series_axis1_same_names_ignore_index(self): result = concat([s1, s2], axis=1, ignore_index=True) self.assertTrue(np.array_equal(result.columns, [0, 1])) + def test_concat_invalid(self): + + # trying to concat a ndframe with a non-ndframe + df1 = mkdf(10, 2) + for obj in [1, dict(), [1, 2], (1, 2) ]: + self.assertRaises(TypeError, lambda x: concat([ df1, obj ])) + def test_concat_invalid_first_argument(self): df1 = mkdf(10, 2) df2 = mkdf(10, 2) @@ -1975,15 +2053,6 @@ def test_concat_invalid_first_argument(self): # generator ok though concat(DataFrame(np.random.rand(5,5)) for _ in range(3)) - def test_concat_mixed_types_fails(self): - df = DataFrame(randn(10, 1)) - - with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"): - concat([df[0], df], axis=1) - - with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"): - concat([df, df[0]], axis=1) - class TestOrderedMerge(tm.TestCase): def setUp(self): diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index efb28ccb4c9e2..e3cd561920b74 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -76,12 +76,12 @@ def test_labels(self): result, bins = cut(arr, 4, retbins=True) ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]'] - self.assert_(np.array_equal(result.levels, ex_levels)) + self.assert_numpy_array_equal(result.levels, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)'] - self.assert_(np.array_equal(result.levels, ex_levels)) + self.assert_numpy_array_equal(result.levels, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -95,7 +95,7 @@ def test_label_precision(self): result = cut(arr, 4, precision=2) ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', '(0.54, 0.72]'] - self.assert_(np.array_equal(result.levels, ex_levels)) + self.assert_numpy_array_equal(result.levels, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -137,7 +137,7 @@ def test_qcut(self): assert_almost_equal(bins, ex_bins) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_(np.array_equal(labels, ex_levels)) + self.assert_numpy_array_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -162,7 +162,7 @@ def test_cut_out_of_bounds(self): mask = result.labels == -1 ex_mask = (arr < -1) | (arr > 1) - self.assert_(np.array_equal(mask, ex_mask)) + self.assert_numpy_array_equal(mask, ex_mask) def test_cut_pass_labels(self): arr = [50, 5, 10, 15, 20, 30, 70] diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 36cfb4870a8fe..f022d97c02395 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -6,6 +6,7 @@ import numpy as np from numpy.testing 
import assert_equal +import pandas import pandas.util.testing as tm from pandas.tools.util import cartesian_product @@ -23,6 +24,13 @@ def test_simple(self): np.array([ 1, 22, 1, 22, 1, 22])] assert_equal(result, expected) + def test_datetimeindex(self): + # regression test for GitHub issue #6439 + x = pandas.date_range('2000-01-01', periods=2) + result = [pandas.Index(y).day for y in cartesian_product([x, x])] + expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])] + assert_equal(result, expected) + class TestLocaleUtils(tm.TestCase): diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 7de8c25379258..6dbefc4b70930 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,3 +1,4 @@ +from pandas.compat import reduce from pandas.core.index import Index import numpy as np @@ -6,6 +7,7 @@ def match(needles, haystack): needles = Index(needles) return haystack.get_indexer(needles) + def cartesian_product(X): ''' Numpy version of itertools.product or pandas.compat.product. @@ -27,6 +29,17 @@ def cartesian_product(X): b = cumprodX[-1] / cumprodX - return [np.tile(np.repeat(x, b[i]), + return [np.tile(np.repeat(np.asarray(x), b[i]), np.product(a[i])) - for i, x in enumerate(X)] \ No newline at end of file + for i, x in enumerate(X)] + + +def _compose2(f, g): + """Compose 2 callables""" + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def compose(*funcs): + """Compose 2 or more callables""" + assert len(funcs) > 1, 'At least 2 callables must be passed to compose' + return reduce(_compose2, funcs) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3892897e43bb0..7988b01af8c48 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -13,7 +13,6 @@ import pandas.lib as lib import pandas.tslib as tslib - class FreqGroup(object): FR_ANN = 1000 FR_QTR = 2000 @@ -283,11 +282,11 @@ def to_offset(freqstr): try: for stride, name, _ in opattern.findall(freqstr): offset = get_offset(name) + if stride_sign is None: + stride_sign = -1 if stride.startswith('-') else 1 if not stride: stride = 1 stride = int(stride) - if stride_sign is None: - stride_sign = np.sign(stride) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: delta = offset @@ -637,22 +636,31 @@ def infer_freq(index, warn=True): Parameters ---------- index : DatetimeIndex + if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True Returns ------- freq : string or None None if no discernible frequency + TypeError if the index is not datetime-like """ - from pandas.tseries.index import DatetimeIndex - - if not isinstance(index, DatetimeIndex): - from pandas.tseries.period import PeriodIndex - if isinstance(index, PeriodIndex): - raise ValueError("PeriodIndex given. Check the `freq` attribute " - "instead of using infer_freq.") - index = DatetimeIndex(index) - + import pandas as pd + + if isinstance(index, com.ABCSeries): + values = index.values + if not (com.is_datetime64_dtype(index.values) or values.dtype == object): + raise TypeError("cannot infer freq from a non-convertible dtype on a Series of {0}".format(index.dtype)) + index = values + if isinstance(index, pd.PeriodIndex): + raise TypeError("PeriodIndex given. 
Check the `freq` attribute " + "instead of using infer_freq.") + if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): + if isinstance(index, (pd.Int64Index, pd.Float64Index)): + raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index))) + index = index.values + + index = pd.DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 23181485d3bbb..c58447acec621 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -9,7 +9,7 @@ from pandas.core.common import (isnull, _NS_DTYPE, _INT64_DTYPE, is_list_like,_values_from_object, _maybe_box, notnull, ABCSeries) -from pandas.core.index import Index, Int64Index, _Identity +from pandas.core.index import Index, Int64Index, _Identity, Float64Index import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import ( @@ -144,8 +144,10 @@ class DatetimeIndex(Int64Index): _engine_type = _index.DatetimeEngine + tz = None offset = None _comparables = ['name','freqstr','tz'] + _allow_datetime_index_ops = True def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, @@ -720,6 +722,38 @@ def _get_time_micros(self): values = self._local_timestamps() return tslib.get_time_micros(values) + def to_series(self, keep_tz=False): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False. + return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set or is UTC, the resulting + Series will have a datetime64[ns] dtype. + Otherwise the Series will have an object dtype. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. 
+ + Returns + ------- + Series + """ + return super(DatetimeIndex, self).to_series(keep_tz=keep_tz) + + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + if keep_tz and self.tz is not None and str(self.tz) != 'UTC': + return self.asobject.values + return self.values + @property def asobject(self): """ @@ -1033,7 +1067,13 @@ def _can_fast_union(self, other): left_end = left[-1] # Only need to "adjoin", not overlap - return (right_start == left_end + offset) or right_start in left + try: + return (right_start == left_end + offset) or right_start in left + except (ValueError): + + # if we are comparing an offset that does not propogate timezones + # this will raise + return False def _fast_union(self, other): if len(other) == 0: @@ -1366,8 +1406,6 @@ def __getitem__(self, key): return self._simple_new(result, self.name, new_offset, self.tz) - _getitem_slice = __getitem__ - # Try to run function on index first, and then on elements of index # Especially important for group-by functionality def map(self, f): @@ -1382,6 +1420,7 @@ def map(self, f): # alias to offset @property def freq(self): + """ return the frequency object if its set, otherwise None """ return self.offset @cache_readonly @@ -1393,26 +1432,27 @@ def inferred_freq(self): @property def freqstr(self): + """ return the frequency object as a string if its set, otherwise None """ return self.offset.freqstr - year = _field_accessor('year', 'Y') - month = _field_accessor('month', 'M', "The month as January=1, December=12") - day = _field_accessor('day', 'D') - hour = _field_accessor('hour', 'h') - minute = _field_accessor('minute', 'm') - second = _field_accessor('second', 's') - microsecond = _field_accessor('microsecond', 'us') - nanosecond = _field_accessor('nanosecond', 'ns') - weekofyear = _field_accessor('weekofyear', 'woy') - week = weekofyear - dayofweek = _field_accessor('dayofweek', 'dow', - "The day of the week with Monday=0, Sunday=6") - weekday = dayofweek - dayofyear = _field_accessor('dayofyear', 'doy') - quarter = _field_accessor('quarter', 'q') + _year = _field_accessor('year', 'Y') + _month = _field_accessor('month', 'M', "The month as January=1, December=12") + _day = _field_accessor('day', 'D') + _hour = _field_accessor('hour', 'h') + _minute = _field_accessor('minute', 'm') + _second = _field_accessor('second', 's') + _microsecond = _field_accessor('microsecond', 'us') + _nanosecond = _field_accessor('nanosecond', 'ns') + _weekofyear = _field_accessor('weekofyear', 'woy') + _week = _weekofyear + _dayofweek = _field_accessor('dayofweek', 'dow', + "The day of the week with Monday=0, Sunday=6") + _weekday = _dayofweek + _dayofyear = _field_accessor('dayofyear', 'doy') + _quarter = _field_accessor('quarter', 'q') @property - def time(self): + def _time(self): """ Returns numpy array of datetime.time. The time part of the Timestamps. """ @@ -1421,7 +1461,7 @@ def time(self): return _algos.arrmap_object(self.asobject, lambda x: x.time()) @property - def date(self): + def _date(self): """ Returns numpy array of datetime.date. The date part of the Timestamps. """ @@ -1717,6 +1757,33 @@ def max(self, axis=None): max_stamp = self.asi8.max() return Timestamp(max_stamp, tz=self.tz) + def to_julian_date(self): + """ + Convert DatetimeIndex to Float64Index of Julian Dates. + 0 Julian date is noon January 1, 4713 BC. 
+ http://en.wikipedia.org/wiki/Julian_day + """ + + # http://mysite.verizon.net/aesir_research/date/jdalg2.htm + year = self.year + month = self.month + day = self.day + testarr = month < 3 + year[testarr] -= 1 + month[testarr] += 12 + return Float64Index(day + + np.fix((153*month - 457)/5) + + 365*year + + np.floor(year / 4) - + np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + (self.hour + + self.minute/60.0 + + self.second/3600.0 + + self.microsecond/3600.0/1e+6 + + self.nanosecond/3600.0/1e+9 + )/24.0) def _generate_regular_range(start, end, periods, offset): if isinstance(offset, Tick): diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ab9f49ddd321e..299d532c20b08 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -291,7 +291,7 @@ def _from_name(cls, suffix=None): return cls() -class BusinessDay(CacheableOffset, SingleConstructorOffset): +class BusinessDay(SingleConstructorOffset): """ DateOffset subclass representing possibly n business days """ @@ -399,7 +399,7 @@ def apply(self, other): n -= 5 * k if n == 0 and result.weekday() > 4: n -= 1 - + while n != 0: k = n // abs(n) result = result + timedelta(k) @@ -548,7 +548,7 @@ def name(self): return "%s-%s" % (self.rule_code, _int_to_month[self.n]) -class MonthEnd(CacheableOffset, MonthOffset): +class MonthEnd(MonthOffset): """DateOffset of one month end""" def apply(self, other): @@ -572,7 +572,7 @@ def onOffset(cls, dt): _prefix = 'M' -class MonthBegin(CacheableOffset, MonthOffset): +class MonthBegin(MonthOffset): """DateOffset of one month at beginning""" def apply(self, other): @@ -591,7 +591,7 @@ def onOffset(cls, dt): _prefix = 'MS' -class BusinessMonthEnd(CacheableOffset, MonthOffset): +class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" def isAnchored(self): @@ -619,7 +619,7 @@ def apply(self, other): _prefix = 'BM' -class BusinessMonthBegin(CacheableOffset, MonthOffset): +class BusinessMonthBegin(MonthOffset): """DateOffset of one business month at beginning""" def apply(self, other): @@ -654,7 +654,7 @@ def onOffset(cls, dt): _prefix = 'BMS' -class Week(CacheableOffset, DateOffset): +class Week(DateOffset): """ Weekly offset @@ -744,7 +744,7 @@ class WeekDay(object): _weekday_to_int = dict((v, k) for k, v in _int_to_weekday.items()) -class WeekOfMonth(CacheableOffset, DateOffset): +class WeekOfMonth(DateOffset): """ Describes monthly dates like "the Tuesday of the 2nd week of each month" @@ -830,7 +830,7 @@ def _from_name(cls, suffix=None): weekday = _weekday_to_int[suffix[1:]] return cls(week=week, weekday=weekday) -class LastWeekOfMonth(CacheableOffset, DateOffset): +class LastWeekOfMonth(DateOffset): """ Describes monthly dates in last week of month like "the last Tuesday of each month" @@ -940,7 +940,7 @@ def rule_code(self): return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth]) -class BQuarterEnd(CacheableOffset, QuarterOffset): +class BQuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... @@ -999,7 +999,7 @@ def onOffset(self, dt): # TODO: This is basically the same as BQuarterEnd -class BQuarterBegin(CacheableOffset, QuarterOffset): +class BQuarterBegin(QuarterOffset): _outputName = "BusinessQuarterBegin" # I suspect this is wrong for *all* of them. 
_default_startingMonth = 3 @@ -1036,7 +1036,7 @@ def apply(self, other): return as_timestamp(result) -class QuarterEnd(CacheableOffset, QuarterOffset): +class QuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... @@ -1077,7 +1077,7 @@ def onOffset(self, dt): return MonthEnd().onOffset(dt) and modMonth == 0 -class QuarterBegin(CacheableOffset, QuarterOffset): +class QuarterBegin(QuarterOffset): _outputName = 'QuarterBegin' _default_startingMonth = 3 _from_name_startingMonth = 1 @@ -1129,7 +1129,7 @@ def rule_code(self): return '%s-%s' % (self._prefix, _int_to_month[self.month]) -class BYearEnd(CacheableOffset, YearOffset): +class BYearEnd(YearOffset): """DateOffset increments between business EOM dates""" _outputName = 'BusinessYearEnd' _default_month = 12 @@ -1166,7 +1166,7 @@ def apply(self, other): return result -class BYearBegin(CacheableOffset, YearOffset): +class BYearBegin(YearOffset): """DateOffset increments between business year begin dates""" _outputName = 'BusinessYearBegin' _default_month = 1 @@ -1198,7 +1198,7 @@ def apply(self, other): return as_timestamp(datetime(other.year, self.month, first)) -class YearEnd(CacheableOffset, YearOffset): +class YearEnd(YearOffset): """DateOffset increments between calendar year ends""" _default_month = 12 _prefix = 'A' @@ -1254,7 +1254,7 @@ def onOffset(self, dt): return self.month == dt.month and dt.day == days_in_month -class YearBegin(CacheableOffset, YearOffset): +class YearBegin(YearOffset): """DateOffset increments between calendar year begin dates""" _default_month = 1 _prefix = 'AS' @@ -1300,7 +1300,7 @@ def onOffset(self, dt): return dt.month == self.month and dt.day == 1 -class FY5253(CacheableOffset, DateOffset): +class FY5253(DateOffset): """ Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. @@ -1501,7 +1501,7 @@ def _from_name(cls, *args): return cls(**cls._parse_suffix(*args)) -class FY5253Quarter(CacheableOffset, DateOffset): +class FY5253Quarter(DateOffset): """ DateOffset increments between business quarter dates for 52-53 week fiscal year (also known as a 4-4-5 calendar). 
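The hunks above only drop the CacheableOffset mixin from these offsets; their date arithmetic is unchanged. A minimal sketch of the (unaffected) behavior:

from datetime import datetime
from pandas.tseries.offsets import MonthEnd, BQuarterEnd

# Rolling forward still lands on the next month end / business quarter end;
# only the internal result caching is being disabled by this change.
print(datetime(2014, 1, 15) + MonthEnd())                    # 2014-01-31 00:00:00
print(datetime(2014, 1, 15) + BQuarterEnd(startingMonth=3))  # 2014-03-31 00:00:00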
@@ -1772,7 +1772,7 @@ def _delta_to_nanoseconds(delta): + delta.microseconds) * 1000 -class Day(CacheableOffset, Tick): +class Day(Tick): _inc = timedelta(1) _prefix = 'D' diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 974c0a52a35de..5fca119c14e83 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -551,6 +551,7 @@ class PeriodIndex(Int64Index): >>> idx2 = PeriodIndex(start='2000', end='2010', freq='A') """ _box_scalars = True + _allow_period_index_ops = True __eq__ = _period_index_cmp('__eq__') __ne__ = _period_index_cmp('__ne__') @@ -773,19 +774,19 @@ def asfreq(self, freq=None, how='E'): def to_datetime(self, dayfirst=False): return self.to_timestamp() - year = _field_accessor('year', 0) - month = _field_accessor('month', 3) - day = _field_accessor('day', 4) - hour = _field_accessor('hour', 5) - minute = _field_accessor('minute', 6) - second = _field_accessor('second', 7) - weekofyear = _field_accessor('week', 8) - week = weekofyear - dayofweek = _field_accessor('dayofweek', 10) - weekday = dayofweek - dayofyear = day_of_year = _field_accessor('dayofyear', 9) - quarter = _field_accessor('quarter', 2) - qyear = _field_accessor('qyear', 1) + _year = _field_accessor('year', 0) + _month = _field_accessor('month', 3) + _day = _field_accessor('day', 4) + _hour = _field_accessor('hour', 5) + _minute = _field_accessor('minute', 6) + _second = _field_accessor('second', 7) + _weekofyear = _field_accessor('week', 8) + _week = _weekofyear + _dayofweek = _field_accessor('dayofweek', 10) + _weekday = _dayofweek + _dayofyear = day_of_year = _field_accessor('dayofyear', 9) + _quarter = _field_accessor('quarter', 2) + _qyear = _field_accessor('qyear', 1) # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -1055,8 +1056,6 @@ def __getitem__(self, key): return PeriodIndex(result, name=self.name, freq=self.freq) - _getitem_slice = __getitem__ - def _format_with_header(self, header, **kwargs): return header + self._format_native_types(**kwargs) diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 479fb599f9e3a..626d47b51a30e 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -42,13 +42,13 @@ class TestGenRangeGeneration(tm.TestCase): def test_generate(self): rng1 = list(generate_range(START, END, offset=datetools.bday)) rng2 = list(generate_range(START, END, time_rule='B')) - self.assert_(np.array_equal(rng1, rng2)) + self.assert_numpy_array_equal(rng1, rng2) def test_generate_cday(self): _skip_if_no_cday() rng1 = list(generate_range(START, END, offset=datetools.cday)) rng2 = list(generate_range(START, END, time_rule='C')) - self.assert_(np.array_equal(rng1, rng2)) + self.assert_numpy_array_equal(rng1, rng2) def test_1(self): eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), @@ -139,7 +139,7 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_(np.array_equal(smaller, self.rng.view(np.ndarray)[:5])) + self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) self.assertEquals(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -148,7 +148,7 @@ def test_getitem(self): fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEquals(len(fancy_indexed), 5) tm.assert_isinstance(fancy_indexed, DatetimeIndex) - self.assert_(fancy_indexed.freq is None) + self.assertIsNone(fancy_indexed.freq) # 32-bit vs. 
64-bit platforms self.assertEquals(self.rng[4], self.rng[np.int_(4)]) @@ -156,7 +156,7 @@ def test_getitem(self): def test_getitem_matplotlib_hackaround(self): values = self.rng[:, None] expected = self.rng.values[:, None] - self.assert_(np.array_equal(values, expected)) + self.assert_numpy_array_equal(values, expected) def test_shift(self): shifted = self.rng.shift(5) @@ -179,7 +179,7 @@ def test_pickle_unpickle(self): pickled = pickle.dumps(self.rng) unpickled = pickle.loads(pickled) - self.assert_(unpickled.offset is not None) + self.assertIsNotNone(unpickled.offset) def test_union(self): # overlapping @@ -204,7 +204,7 @@ def test_union(self): tm.assert_isinstance(the_union, DatetimeIndex) # order does not matter - self.assert_(np.array_equal(right.union(left), the_union)) + self.assert_numpy_array_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -228,7 +228,7 @@ def test_outer_join(self): the_join = left.join(right, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) # non-overlapping, no gap left = self.rng[:5] @@ -242,7 +242,7 @@ def test_outer_join(self): the_join = self.rng.join(rng, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) def test_union_not_cacheable(self): rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) @@ -352,7 +352,7 @@ def test_range_bug(self): start = datetime(2011, 1, 1) exp_values = [start + i * offset for i in range(5)] - self.assert_(np.array_equal(result, DatetimeIndex(exp_values))) + self.assert_numpy_array_equal(result, DatetimeIndex(exp_values)) def test_range_tz(self): # GH 2906 @@ -459,7 +459,7 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_(np.array_equal(smaller, self.rng.view(np.ndarray)[:5])) + self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) self.assertEquals(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -468,7 +468,7 @@ def test_getitem(self): fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEquals(len(fancy_indexed), 5) tm.assert_isinstance(fancy_indexed, DatetimeIndex) - self.assert_(fancy_indexed.freq is None) + self.assertIsNone(fancy_indexed.freq) # 32-bit vs. 
64-bit platforms self.assertEquals(self.rng[4], self.rng[np.int_(4)]) @@ -476,7 +476,7 @@ def test_getitem(self): def test_getitem_matplotlib_hackaround(self): values = self.rng[:, None] expected = self.rng.values[:, None] - self.assert_(np.array_equal(values, expected)) + self.assert_numpy_array_equal(values, expected) def test_shift(self): shifted = self.rng.shift(5) @@ -499,7 +499,7 @@ def test_pickle_unpickle(self): pickled = pickle.dumps(self.rng) unpickled = pickle.loads(pickled) - self.assert_(unpickled.offset is not None) + self.assertIsNotNone(unpickled.offset) def test_union(self): # overlapping @@ -524,7 +524,7 @@ def test_union(self): tm.assert_isinstance(the_union, DatetimeIndex) # order does not matter - self.assert_(np.array_equal(right.union(left), the_union)) + self.assert_numpy_array_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -548,7 +548,7 @@ def test_outer_join(self): the_join = left.join(right, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) # non-overlapping, no gap left = self.rng[:5] @@ -562,7 +562,7 @@ def test_outer_join(self): the_join = self.rng.join(rng, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) def test_intersection_bug(self): # GH #771 diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index b17a1c11efad7..aeb6b74f773dc 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -7,7 +7,7 @@ import numpy as np -from pandas import Index, DatetimeIndex, Timestamp, date_range, period_range +from pandas import Index, DatetimeIndex, Timestamp, Series, date_range, period_range from pandas.tseries.frequencies import to_offset, infer_freq from pandas.tseries.tools import to_datetime @@ -74,6 +74,16 @@ def test_to_offset_negative(): assert(result.n == -310) +def test_to_offset_leading_zero(): + freqstr = '00H 00T 01S' + result = to_offset(freqstr) + assert(result.n == 1) + + freqstr = '-00H 03T 14S' + result = to_offset(freqstr) + assert(result.n == -194) + + def test_anchored_shortcuts(): result = to_offset('W') expected = to_offset('W-SUN') @@ -91,7 +101,7 @@ class TestFrequencyInference(tm.TestCase): def test_raise_if_period_index(self): index = PeriodIndex(start="1/1/1990", periods=20, freq="M") - self.assertRaises(ValueError, infer_freq, index) + self.assertRaises(TypeError, infer_freq, index) def test_raise_if_too_few(self): index = _dti(['12/31/1998', '1/3/1999']) @@ -145,11 +155,11 @@ def _check_tick(self, base_delta, code): index = _dti([b + base_delta * 7] + [b + base_delta * j for j in range(3)]) - self.assert_(infer_freq(index) is None) + self.assertIsNone(infer_freq(index)) index = _dti([b + base_delta * j for j in range(3)] + [b + base_delta * 7]) - self.assert_(infer_freq(index) is None) + self.assertIsNone(infer_freq(index)) def test_weekly(self): days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] @@ -249,7 +259,7 @@ def test_infer_freq(self): def test_not_monotonic(self): rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) rng = rng[::-1] - self.assert_(rng.inferred_freq is None) + self.assertIsNone(rng.inferred_freq) def test_non_datetimeindex(self): rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) @@ -259,6 +269,53 @@ def test_non_datetimeindex(self): result = infer_freq(vals) 
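# Aside (not part of the patch): a sketch of the Series support that infer_freq
# gains in this change (GH 6407) and that the new tests below exercise --
# datetime-like Series values are accepted, non-convertible dtypes raise TypeError.
import numpy as np
import pandas as pd
from pandas.tseries.frequencies import infer_freq

print(infer_freq(pd.Series(pd.date_range('20130101', periods=10, freq='MS'))))  # 'MS'
try:
    infer_freq(pd.Series(np.arange(10.)))
except TypeError as err:
    print(err)  # non-convertible dtype on a Series -> TypeError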
self.assertEqual(result, rng.inferred_freq) + def test_invalid_index_types(self): + + # test all index types + for i in [ tm.makeIntIndex(10), + tm.makeFloatIndex(10), + tm.makePeriodIndex(10) ]: + self.assertRaises(TypeError, lambda : infer_freq(i)) + + for i in [ tm.makeStringIndex(10), + tm.makeUnicodeIndex(10) ]: + self.assertRaises(ValueError, lambda : infer_freq(i)) + + def test_string_datetimelike_compat(self): + + # GH 6463 + expected = infer_freq(['2004-01', '2004-02', '2004-03', '2004-04']) + result = infer_freq(Index(['2004-01', '2004-02', '2004-03', '2004-04'])) + self.assertEqual(result,expected) + + def test_series(self): + + # GH6407 + # inferring series + + # invalid type of Series + for s in [ Series(np.arange(10)), + Series(np.arange(10.))]: + self.assertRaises(TypeError, lambda : infer_freq(s)) + + # a non-convertible string + self.assertRaises(ValueError, lambda : infer_freq(Series(['foo','bar']))) + + # cannot infer on PeriodIndex + for freq in [None, 'MS', 'Y']: + s = Series(period_range('2013',periods=10,freq=freq)) + self.assertRaises(TypeError, lambda : infer_freq(s)) + + # DateTimeIndex + for freq in ['MS', 'L', 'S']: + s = Series(date_range('20130101',periods=10,freq=freq)) + inferred = infer_freq(s) + self.assertEqual(inferred,freq) + + s = Series(date_range('20130101','20130110')) + inferred = infer_freq(s) + self.assertEqual(inferred,'D') + MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index d30a646b1b1d6..b303b7bb50526 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -107,7 +107,7 @@ def test_apply_out_of_range(self): offset = self._offset(10000) result = Timestamp('20080101') + offset - self.assert_(isinstance(result, datetime)) + self.assertIsInstance(result, datetime) except (OutOfBoundsDatetime): raise except (ValueError, KeyError): @@ -316,7 +316,7 @@ def test_apply_large_n(self): rs = st + off xp = datetime(2011, 12, 26) self.assertEqual(rs, xp) - + off = BDay() * 10 rs = datetime(2014, 1, 5) + off # see #5890 xp = datetime(2014, 1, 17) @@ -2427,25 +2427,9 @@ def get_all_subclasses(cls): return ret class TestCaching(tm.TestCase): - no_simple_ctr = [WeekOfMonth, FY5253, - FY5253Quarter, - LastWeekOfMonth] - - def test_should_cache_month_end(self): - self.assertTrue(MonthEnd()._should_cache()) - - def test_should_cache_bmonth_end(self): - self.assertTrue(BusinessMonthEnd()._should_cache()) - - def test_should_cache_week_month(self): - self.assertTrue(WeekOfMonth(weekday=1, week=2)._should_cache()) - def test_all_cacheableoffsets(self): - for subclass in get_all_subclasses(CacheableOffset): - if subclass.__name__[0] == "_" \ - or subclass in TestCaching.no_simple_ctr: - continue - self.run_X_index_creation(subclass) + # as of GH 6479 (in 0.14.0), offset caching is turned off + # as of v0.12.0 only BusinessMonth/Quarter were actually caching def setUp(self): _daterange_cache.clear() @@ -2462,20 +2446,35 @@ def run_X_index_creation(self, cls): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=inst1, normalize=True) self.assertTrue(cls() in _daterange_cache, cls) + def test_should_cache_month_end(self): + self.assertFalse(MonthEnd()._should_cache()) + + def test_should_cache_bmonth_end(self): + self.assertFalse(BusinessMonthEnd()._should_cache()) + + def test_should_cache_week_month(self): + self.assertFalse(WeekOfMonth(weekday=1, week=2)._should_cache()) + 
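# Aside (not part of the patch): with offset caching turned off (GH 6479), the
# _should_cache checks in this class are inverted and building a DatetimeIndex
# no longer populates _daterange_cache. A minimal sketch of the expected behavior:
from pandas.tseries.offsets import MonthEnd, BusinessMonthEnd, WeekOfMonth

print(MonthEnd()._should_cache())                      # False after this change
print(BusinessMonthEnd()._should_cache())              # False
print(WeekOfMonth(weekday=1, week=2)._should_cache())  # False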
+ def test_all_cacheableoffsets(self): + for subclass in get_all_subclasses(CacheableOffset): + if subclass.__name__[0] == "_" \ + or subclass in TestCaching.no_simple_ctr: + continue + self.run_X_index_creation(subclass) + def test_month_end_index_creation(self): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=MonthEnd(), normalize=True) - self.assertTrue(MonthEnd() in _daterange_cache) + self.assertFalse(MonthEnd() in _daterange_cache) def test_bmonth_end_index_creation(self): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=BusinessMonthEnd(), normalize=True) - self.assertTrue(BusinessMonthEnd() in _daterange_cache) + self.assertFalse(BusinessMonthEnd() in _daterange_cache) def test_week_of_month_index_creation(self): inst1 = WeekOfMonth(weekday=1, week=2) DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=inst1, normalize=True) inst2 = WeekOfMonth(weekday=1, week=2) - self.assertTrue(inst2 in _daterange_cache) - + self.assertFalse(inst2 in _daterange_cache) class TestReprNames(tm.TestCase): def test_str_for_named_is_name(self): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 074a8a04a96a5..4a4fbb146861d 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -53,7 +53,7 @@ def test_period_cons_quarterly(self): for month in MONTHS: freq = 'Q-%s' % month exp = Period('1989Q3', freq=freq) - self.assert_('1989Q3' in str(exp)) + self.assertIn('1989Q3', str(exp)) stamp = exp.to_timestamp('D', how='end') p = Period(stamp, freq=freq) self.assertEquals(p, exp) @@ -203,10 +203,10 @@ def test_freq_str(self): def test_repr(self): p = Period('Jan-2000') - self.assert_('2000-01' in repr(p)) + self.assertIn('2000-01', repr(p)) p = Period('2000-12-15') - self.assert_('2000-12-15' in repr(p)) + self.assertIn('2000-12-15', repr(p)) def test_millisecond_repr(self): p = Period('2000-01-01 12:15:02.123') @@ -1088,7 +1088,7 @@ def test_astype(self): idx = period_range('1990', '2009', freq='A') result = idx.astype('i8') - self.assert_(np.array_equal(result, idx.values)) + self.assert_numpy_array_equal(result, idx.values) def test_constructor_use_start_freq(self): # GH #1118 @@ -1140,8 +1140,8 @@ def test_constructor_arrays_negative_year(self): pindex = PeriodIndex(year=years, quarter=quarters) - self.assert_(np.array_equal(pindex.year, years)) - self.assert_(np.array_equal(pindex.quarter, quarters)) + self.assert_numpy_array_equal(pindex.year, years) + self.assert_numpy_array_equal(pindex.quarter, quarters) def test_constructor_invalid_quarters(self): self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), @@ -1210,7 +1210,7 @@ def test_comp_period(self): result = idx < idx[10] exp = idx.values < idx.values[10] - self.assert_(np.array_equal(result, exp)) + self.assert_numpy_array_equal(result, exp) def test_getitem_ndim2(self): idx = period_range('2007-01', periods=3, freq='M') @@ -1920,7 +1920,7 @@ def test_join_self(self): for kind in ['inner', 'outer', 'left', 'right']: res = index.join(index, how=kind) - self.assert_(index is res) + self.assertIs(index, res) def test_join_does_not_recur(self): df = tm.makeCustomDataframe(3, 2, data_gen_f=lambda *args: @@ -2215,7 +2215,7 @@ def test_nanosecondly(self): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - self.assert_(np.array_equal(rng.values, exp)) + self.assert_numpy_array_equal(rng.values, exp) def 
test_negone_ordinals(self): freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 3c6f97b73a6b4..118c09ddf826f 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -124,7 +124,7 @@ def test_high_freq(self): def test_get_datevalue(self): from pandas.tseries.converter import get_datevalue - self.assert_(get_datevalue(None, 'D') is None) + self.assertIsNone(get_datevalue(None, 'D')) self.assertEqual(get_datevalue(1987, 'A'), 1987) self.assertEqual(get_datevalue(Period(1987, 'A'), 'M'), Period('1987-12', 'M').ordinal) @@ -245,7 +245,7 @@ def test_irregular_datetime64_repr_bug(self): plt.clf() ax = fig.add_subplot(211) ret = ser.plot() - self.assert_(ret is not None) + self.assertIsNotNone(ret) for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): self.assertEqual(rs, xp) @@ -793,7 +793,7 @@ def test_secondary_legend(self): self.assertEqual(leg.get_texts()[1].get_text(), 'B (right)') self.assertEqual(leg.get_texts()[2].get_text(), 'C') self.assertEqual(leg.get_texts()[3].get_text(), 'D') - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) @@ -829,7 +829,7 @@ def test_secondary_legend(self): ax = df.plot(secondary_y=['C', 'D']) leg = ax.get_legend() self.assertEqual(len(leg.get_lines()), 4) - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) @@ -844,7 +844,7 @@ def test_secondary_legend(self): ax = df.plot(secondary_y=['A', 'B']) leg = ax.get_legend() self.assertEqual(len(leg.get_lines()), 4) - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) @@ -857,7 +857,7 @@ def test_secondary_legend(self): ax = df.plot(secondary_y=['C', 'D']) leg = ax.get_legend() self.assertEqual(len(leg.get_lines()), 4) - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 7e9433ac41ddd..23b8905b2ae9a 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -650,6 +650,28 @@ def test_resample_unequal_times(self): # it works! 
df.resample('AS', 'sum') + def test_resample_consistency(self): + + # GH 6418 + # resample with bfill / limit / reindex consistency + + i30 = index=pd.date_range('2002-02-02', periods=4, freq='30T') + s=pd.Series(np.arange(4.), index=i30) + s[2] = np.NaN + + # Upsample by factor 3 with reindex() and resample() methods: + i10 = pd.date_range(i30[0], i30[-1], freq='10T') + + s10 = s.reindex(index=i10, method='bfill') + s10_2 = s.reindex(index=i10, method='bfill', limit=2) + rl = s.reindex_like(s10, method='bfill', limit=2) + r10_2 = s.resample('10Min', fill_method='bfill', limit=2) + r10 = s.resample('10Min', fill_method='bfill') + + # s10_2, r10, r10_2, rl should all be equal + assert_series_equal(s10_2, r10) + assert_series_equal(s10_2, r10_2) + assert_series_equal(s10_2, rl) def _simple_ts(start, end, freq='D'): rng = date_range(start, end, freq=freq) @@ -951,6 +973,18 @@ def test_resample_tz_localized(self): expected = Series([1.5], index=ex_index) assert_series_equal(result, expected) + # GH 6397 + # comparing an offset that doesn't propogate tz's + rng = date_range('1/1/2011', periods=20000, freq='H') + rng = rng.tz_localize('EST') + ts = DataFrame(index=rng) + ts['first']=np.random.randn(len(rng)) + ts['second']=np.cumsum(np.random.randn(len(rng))) + expected = DataFrame({ 'first' : ts.resample('A',how=np.sum)['first'], + 'second' : ts.resample('A',how=np.mean)['second'] },columns=['first','second']) + result = ts.resample('A', how={'first':np.sum, 'second':np.mean}).reindex(columns=['first','second']) + assert_frame_equal(result,expected) + def test_closed_left_corner(self): # #1465 s = Series(np.random.randn(21), diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 8863a50e86c2e..c490aee134a1a 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -207,6 +207,12 @@ def test_timedelta_ops(self): expected = to_timedelta('00:00:08') tm.assert_almost_equal(result, expected) + # GH 6462 + # consistency in returned values for sum + result = td.sum()[0] + expected = to_timedelta('00:01:21') + tm.assert_almost_equal(result, expected) + def test_to_timedelta_on_missing_values(self): _skip_if_numpy_not_friendly() diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index a5d108df3a232..4113419ba8004 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -13,7 +13,7 @@ from pandas import (Index, Series, TimeSeries, DataFrame, isnull, date_range, Timestamp, Period, DatetimeIndex, - Int64Index, to_datetime, bdate_range) + Int64Index, to_datetime, bdate_range, Float64Index) from pandas.core.daterange import DateRange import pandas.core.datetools as datetools @@ -170,7 +170,7 @@ def test_indexing_over_size_cutoff(self): pos = n * 3 timestamp = df.index[pos] - self.assert_(timestamp in df.index) + self.assertIn(timestamp, df.index) # it works! 
df.ix[timestamp] @@ -298,7 +298,7 @@ def test_dti_slicing(self): self.assertEquals(v3, Timestamp('6/30/2005')) # don't carry freq through irregular slicing - self.assert_(dti2.freq is None) + self.assertIsNone(dti2.freq) def test_pass_datetimeindex_to_index(self): # Bugs in #1396 @@ -308,7 +308,7 @@ def test_pass_datetimeindex_to_index(self): expected = Index(rng.to_pydatetime(), dtype=object) - self.assert_(np.array_equal(idx.values, expected.values)) + self.assert_numpy_array_equal(idx.values, expected.values) def test_contiguous_boolean_preserve_freq(self): rng = date_range('1/1/2000', '3/1/2000', freq='B') @@ -318,12 +318,12 @@ def test_contiguous_boolean_preserve_freq(self): masked = rng[mask] expected = rng[10:20] - self.assert_(expected.freq is not None) + self.assertIsNotNone(expected.freq) assert_range_equal(masked, expected) mask[22] = True masked = rng[mask] - self.assert_(masked.freq is None) + self.assertIsNone(masked.freq) def test_getitem_median_slice_bug(self): index = date_range('20090415', '20090519', freq='2B') @@ -392,7 +392,7 @@ def test_series_ctor_plus_datetimeindex(self): data = dict((k, 1) for k in rng) result = Series(data, index=rng) - self.assert_(result.index is rng) + self.assertIs(result.index, rng) def test_series_pad_backfill_limit(self): index = np.arange(10) @@ -850,7 +850,7 @@ def test_nat_vector_field_access(self): result = getattr(idx, field) expected = [getattr(x, field) if x is not NaT else -1 for x in idx] - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_nat_scalar_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', @@ -866,7 +866,7 @@ def test_to_datetime_types(self): # empty string result = to_datetime('') - self.assert_(result is NaT) + self.assertIs(result, NaT) result = to_datetime(['', '']) self.assert_(isnull(result).all()) @@ -894,11 +894,9 @@ def test_to_datetime_types(self): def test_to_datetime_unprocessable_input(self): # GH 4928 - self.assert_( - np.array_equal( - to_datetime([1, '1']), - np.array([1, '1'], dtype='O') - ) + self.assert_numpy_array_equal( + to_datetime([1, '1']), + np.array([1, '1'], dtype='O') ) self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise') @@ -943,7 +941,7 @@ def test_to_datetime_dt64s(self): for dt in oob_dts: self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise') self.assertRaises(ValueError, tslib.Timestamp, dt) - self.assert_(pd.to_datetime(dt, coerce=True) is NaT) + self.assertIs(pd.to_datetime(dt, coerce=True), NaT) def test_to_datetime_array_of_dt64s(self): dts = [ @@ -953,11 +951,9 @@ def test_to_datetime_array_of_dt64s(self): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing - self.assert_( - np.array_equal( - pd.to_datetime(dts, box=False), - np.array([Timestamp(x).asm8 for x in dts]) - ) + self.assert_numpy_array_equal( + pd.to_datetime(dts, box=False), + np.array([Timestamp(x).asm8 for x in dts]) ) # A list of datetimes where the last one is out of bounds @@ -971,30 +967,26 @@ def test_to_datetime_array_of_dt64s(self): errors='raise' ) - self.assert_( - np.array_equal( - pd.to_datetime(dts_with_oob, box=False, coerce=True), - np.array( + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=True), + np.array( [ Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8, iNaT, ], dtype='M8' - ) ) ) # With coerce=False and errors='ignore', out of bounds datetime64s # are converted to their .item(), which 
depending on the version of # numpy is either a python datetime.datetime or datetime.date - self.assert_( - np.array_equal( - pd.to_datetime(dts_with_oob, box=False, coerce=False), - np.array( + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=False), + np.array( [dt.item() for dt in dts_with_oob], dtype='O' - ) ) ) @@ -1034,7 +1026,7 @@ def test_reasonable_keyerror(self): try: index.get_loc('1/1/2000') except KeyError as e: - self.assert_('2000' in str(e)) + self.assertIn('2000', str(e)) def test_reindex_with_datetimes(self): rng = date_range('1/1/2000', periods=20) @@ -1068,7 +1060,7 @@ def test_promote_datetime_date(self): result = rng.get_indexer(ts2.index) expected = rng.get_indexer(ts_slice.index) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_asfreq_normalize(self): rng = date_range('1/1/2000 09:30', periods=20) @@ -1151,7 +1143,7 @@ def test_repeat(self): rng = date_range('1/1/2000', '1/1/2001') result = rng.repeat(5) - self.assert_(result.freq is None) + self.assertIsNone(result.freq) self.assertEqual(len(result), 5 * len(rng)) def test_at_time(self): @@ -1521,7 +1513,7 @@ def test_timestamp_repr(self): iso8601 = '1850-01-01 01:23:45.012345' stamp = Timestamp(iso8601, tz='US/Eastern') result = repr(stamp) - self.assert_(iso8601 in result) + self.assertIn(iso8601, result) def test_timestamp_from_ordinal(self): @@ -1554,7 +1546,7 @@ def test_astype_object(self): casted = rng.astype('O') exp_values = list(rng) - self.assert_(np.array_equal(casted, exp_values)) + self.assert_numpy_array_equal(casted, exp_values) def test_catch_infinite_loop(self): offset = datetools.DateOffset(minute=5) @@ -1586,7 +1578,7 @@ def test_append_concat(self): rng1.name = 'foo' rng2.name = 'bar' self.assertEqual(rng1.append(rng1).name, 'foo') - self.assert_(rng1.append(rng2).name is None) + self.assertIsNone(rng1.append(rng2).name) def test_append_concat_tz(self): #GH 2938 @@ -1683,7 +1675,7 @@ def test_series_interpolate_intraday(self): new_index = index.append(index + pd.DateOffset(hours=1)).order() result = ts.reindex(new_index).interpolate(method='time') - self.assert_(np.array_equal(result.values, exp.values)) + self.assert_numpy_array_equal(result.values, exp.values) def test_frame_dict_constructor_datetime64_1680(self): dr = date_range('1/1/2012', periods=10) @@ -1742,7 +1734,7 @@ def test_to_html_timestamp(self): df = DataFrame(np.random.randn(10, 4), index=rng) result = df.to_html() - self.assert_('2000-01-01' in result) + self.assertIn('2000-01-01', result) def test_to_csv_numpy_16_bug(self): frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) @@ -1751,7 +1743,7 @@ def test_to_csv_numpy_16_bug(self): frame.to_csv(buf) result = buf.getvalue() - self.assert_('2000-01-01' in result) + self.assertIn('2000-01-01', result) def test_series_map_box_timestamps(self): # #2689, #2627 @@ -1848,7 +1840,7 @@ def test_astype(self): rng = date_range('1/1/2000', periods=10) result = rng.astype('i8') - self.assert_(np.array_equal(result, rng.asi8)) + self.assert_numpy_array_equal(result, rng.asi8) def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) @@ -1918,7 +1910,7 @@ def test_comparisons_coverage(self): result = rng == list(rng) exp = rng == rng - self.assert_(np.array_equal(result, exp)) + self.assert_numpy_array_equal(result, exp) def test_map(self): rng = date_range('1/1/2000', periods=10) @@ -1926,7 +1918,7 @@ def test_map(self): f = lambda x: x.strftime('%Y%m%d') 
result = rng.map(f) exp = [f(x) for x in rng] - self.assert_(np.array_equal(result, exp)) + self.assert_numpy_array_equal(result, exp) def test_add_union(self): rng = date_range('1/1/2000', periods=5) @@ -2024,11 +2016,11 @@ def test_order(self): ordered, dexer = idx.order(return_indexer=True) self.assert_(ordered.is_monotonic) - self.assert_(np.array_equal(dexer, [1, 2, 0])) + self.assert_numpy_array_equal(dexer, [1, 2, 0]) ordered, dexer = idx.order(return_indexer=True, ascending=False) self.assert_(ordered[::-1].is_monotonic) - self.assert_(np.array_equal(dexer, [0, 2, 1])) + self.assert_numpy_array_equal(dexer, [0, 2, 1]) def test_insert(self): idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) @@ -2042,7 +2034,7 @@ def test_insert(self): result = idx.insert(1, 'inserted') expected = Index([datetime(2000, 1, 4), 'inserted', datetime(2000, 1, 1), datetime(2000, 1, 2)]) - self.assert_(not isinstance(result, DatetimeIndex)) + self.assertNotIsInstance(result, DatetimeIndex) tm.assert_index_equal(result, expected) idx = date_range('1/1/2000', periods=3, freq='M') @@ -2055,7 +2047,7 @@ def test_map_bug_1677(self): result = index.map(f) expected = np.array([f(index[0])]) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_groupby_function_tuple_1677(self): df = DataFrame(np.random.rand(100), @@ -2090,7 +2082,7 @@ def test_union(self): i2 = Int64Index(np.arange(10, 30, 2)) result = i1.union(i2) expected = Int64Index(np.arange(0, 30, 2)) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_union_with_DatetimeIndex(self): i1 = Int64Index(np.arange(0, 20, 2)) @@ -2131,7 +2123,7 @@ def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' for kind in kinds: joined = index.join(index, how=kind) - self.assert_(index is joined) + self.assertIs(index, joined) def assert_index_parameters(self, index): assert index.freq == '40960N' @@ -2214,7 +2206,7 @@ def test_datetimeindex_accessors(self): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - self.assert_(np.array_equal(dti.nanosecond, np.arange(10))) + self.assert_numpy_array_equal(dti.nanosecond, np.arange(10)) def test_datetimeindex_diff(self): dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), @@ -2351,7 +2343,7 @@ def test_datetimeindex_union_join_empty(self): result = dti.union(empty) tm.assert_isinstance(result, DatetimeIndex) - self.assert_(result is result) + self.assertIs(result, result) result = dti.join(empty) tm.assert_isinstance(result, DatetimeIndex) @@ -2398,12 +2390,12 @@ def test_series_comparison_scalars(self): val = datetime(2000, 1, 4) result = self.series > val expected = np.array([x > val for x in self.series]) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) val = self.series[5] result = self.series > val expected = np.array([x > val for x in self.series]) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_between(self): left, right = self.series[[2, 7]] @@ -2426,16 +2418,16 @@ def test_NaT_scalar(self): def test_set_none_nan(self): self.series[3] = None - self.assert_(self.series[3] is NaT) + self.assertIs(self.series[3], NaT) self.series[3:5] = None - self.assert_(self.series[4] is NaT) + self.assertIs(self.series[4], NaT) self.series[5] = np.nan - self.assert_(self.series[5] is NaT) + self.assertIs(self.series[5], NaT) self.series[5:7] = np.nan - 
self.assert_(self.series[6] is NaT) + self.assertIs(self.series[6], NaT) def test_intercept_astype_object(self): @@ -2575,16 +2567,16 @@ def check(val,unit=None,h=1,s=1,us=0): # nan result = Timestamp(np.nan) - self.assert_(result is NaT) + self.assertIs(result, NaT) result = Timestamp(None) - self.assert_(result is NaT) + self.assertIs(result, NaT) result = Timestamp(iNaT) - self.assert_(result is NaT) + self.assertIs(result, NaT) result = Timestamp(NaT) - self.assert_(result is NaT) + self.assertIs(result, NaT) def test_comparison(self): # 5-18-2012 00:00:00.000 @@ -2893,7 +2885,7 @@ def test_date_range_normalize(self): values = np.array([snap + i * offset for i in range(n)], dtype='M8[ns]') - self.assert_(np.array_equal(rng, values)) + self.assert_numpy_array_equal(rng, values) rng = date_range( '1/1/2000 08:15', periods=n, normalize=False, freq='B') @@ -2950,7 +2942,7 @@ def test_setops_preserve_freq(self): self.assertEqual(result.freq, rng.freq) result = rng[:50].union(rng[60:100]) - self.assert_(result.freq is None) + self.assertIsNone(result.freq) result = rng[:50].intersection(rng[25:75]) self.assertEqual(result.freqstr, 'D') @@ -3143,8 +3135,8 @@ def test_to_datetime_infer_datetime_format_consistent_format(self): # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same - self.assert_(np.array_equal(with_format, no_infer)) - self.assert_(np.array_equal(no_infer, yes_infer)) + self.assert_numpy_array_equal(with_format, no_infer) + self.assert_numpy_array_equal(no_infer, yes_infer) def test_to_datetime_infer_datetime_format_inconsistent_format(self): test_series = pd.Series( @@ -3156,10 +3148,10 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self): # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) test_series = pd.Series( np.array([ @@ -3168,10 +3160,10 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self): 'Mar/01/2011', ])) - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) def test_to_datetime_infer_datetime_format_series_with_nans(self): test_series = pd.Series( @@ -3182,10 +3174,10 @@ def test_to_datetime_infer_datetime_format_series_with_nans(self): np.nan, ])) - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): test_series = pd.Series( @@ -3197,10 +3189,10 @@ def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): '01/03/2011 00:00:00', ])) - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) class TestGuessDatetimeFormat(tm.TestCase): @@ -3287,6 +3279,91 @@ def test_guess_datetime_format_for_array(self): ) self.assertTrue(format_for_string_of_nans is None) + +class TestTimestampToJulianDate(tm.TestCase): + + def test_compare_1700(self): + r = Timestamp('1700-06-23').to_julian_date() + self.assertEqual(r, 2342145.5) + + def test_compare_2000(self): + r 
= Timestamp('2000-04-12').to_julian_date() + self.assertEqual(r, 2451646.5) + + def test_compare_2100(self): + r = Timestamp('2100-08-12').to_julian_date() + self.assertEqual(r, 2488292.5) + + def test_compare_hour01(self): + r = Timestamp('2000-08-12T01:00:00').to_julian_date() + self.assertEqual(r, 2451768.5416666666666666) + + def test_compare_hour13(self): + r = Timestamp('2000-08-12T13:00:00').to_julian_date() + self.assertEqual(r, 2451769.0416666666666666) + + +class TestDateTimeIndexToJulianDate(tm.TestCase): + def test_1700(self): + r1 = Float64Index([2345897.5, + 2345898.5, + 2345899.5, + 2345900.5, + 2345901.5]) + r2 = date_range(start=Timestamp('1710-10-01'), + periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + r1 = Float64Index([2451601.5, + 2451602.5, + 2451603.5, + 2451604.5, + 2451605.5]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='D').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + r1 = Float64Index([2451601.5, + 2451601.5416666666666666, + 2451601.5833333333333333, + 2451601.625, + 2451601.6666666666666666]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='H').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + r1 = Float64Index([2451601.5, + 2451601.5006944444444444, + 2451601.5013888888888888, + 2451601.5020833333333333, + 2451601.5027777777777777]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='T').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + + def test_second(self): + r1 = Float64Index([2451601.5, + 2451601.500011574074074, + 2451601.5000231481481481, + 2451601.5000347222222222, + 2451601.5000462962962962]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='S').to_julian_date() + self.assertIsInstance(r2, Float64Index) + tm.assert_index_equal(r1, r2) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 48fd68b71cfc1..dda722366e53e 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -75,7 +75,7 @@ def test_utc_to_local_no_modify(self): rng_eastern = rng.tz_convert('US/Eastern') # Values are unmodified - self.assert_(np.array_equal(rng.asi8, rng_eastern.asi8)) + self.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) self.assertEqual(rng_eastern.tz, pytz.timezone('US/Eastern')) @@ -89,7 +89,7 @@ def test_localize_utc_conversion(self): converted = rng.tz_localize('US/Eastern') expected_naive = rng + offsets.Hour(5) - self.assert_(np.array_equal(converted.asi8, expected_naive.asi8)) + self.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail rng = date_range('3/11/2012', '3/12/2012', freq='30T') @@ -146,10 +146,10 @@ def test_tz_localize_dti(self): end='1/1/2005 5:00:30.256', freq='L', tz='utc') - self.assert_(np.array_equal(dti2.values, dti_utc.values)) + self.assert_numpy_array_equal(dti2.values, dti_utc.values) dti3 = dti2.tz_convert('US/Pacific') - self.assert_(np.array_equal(dti3.values, dti_utc.values)) + self.assert_numpy_array_equal(dti3.values, dti_utc.values) dti = DatetimeIndex(start='11/6/2011 1:59', end='11/6/2011 2:00', freq='L') @@ -189,7 +189,7 @@ def 
test_create_with_tz(self): self.assertEquals(stamp, rng[1]) utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') - self.assert_(utc_stamp.tzinfo is pytz.utc) + self.assertIs(utc_stamp.tzinfo, pytz.utc) self.assertEquals(utc_stamp.hour, 5) stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') @@ -269,7 +269,7 @@ def test_utc_box_timestamp_and_localize(self): # right tzinfo rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') rng_eastern = rng.tz_convert('US/Eastern') - self.assert_('EDT' in repr(rng_eastern[0].tzinfo)) + self.assertIn('EDT', repr(rng_eastern[0].tzinfo)) def test_timestamp_tz_convert(self): strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] @@ -289,7 +289,7 @@ def test_pass_dates_localize_to_utc(self): fromdates = DatetimeIndex(strdates, tz='US/Eastern') self.assertEqual(conv.tz, fromdates.tz) - self.assert_(np.array_equal(conv.values, fromdates.values)) + self.assert_numpy_array_equal(conv.values, fromdates.values) def test_field_access_localize(self): strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] @@ -301,7 +301,7 @@ def test_field_access_localize(self): tz='America/Atikokan') expected = np.arange(10) - self.assert_(np.array_equal(dr.hour, expected)) + self.assert_numpy_array_equal(dr.hour, expected) def test_with_tz(self): tz = pytz.timezone('US/Central') @@ -309,7 +309,7 @@ def test_with_tz(self): # just want it to work start = datetime(2011, 3, 12, tzinfo=pytz.utc) dr = bdate_range(start, periods=50, freq=datetools.Hour()) - self.assert_(dr.tz is pytz.utc) + self.assertIs(dr.tz, pytz.utc) # DateRange with naive datetimes dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) @@ -317,8 +317,8 @@ def test_with_tz(self): # normalized central = dr.tz_convert(tz) - self.assert_(central.tz is tz) - self.assert_(central[0].tz is tz) + self.assertIs(central.tz, tz) + self.assertIs(central[0].tz, tz) # datetimes with tzinfo set dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), @@ -332,7 +332,7 @@ def test_tz_localize(self): dr = bdate_range('1/1/2009', '1/1/2010') dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) localized = dr.tz_localize(pytz.utc) - self.assert_(np.array_equal(dr_utc, localized)) + self.assert_numpy_array_equal(dr_utc, localized) def test_with_tz_ambiguous_times(self): tz = pytz.timezone('US/Eastern') @@ -373,14 +373,14 @@ def test_infer_dst(self): '11/06/2011 01:00', '11/06/2011 02:00', '11/06/2011 03:00']) localized = di.tz_localize(tz, infer_dst=True) - self.assert_(np.array_equal(dr, localized)) + self.assert_numpy_array_equal(dr, localized) # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=datetools.Hour()) localized = dr.tz_localize(tz) localized_infer = dr.tz_localize(tz, infer_dst=True) - self.assert_(np.array_equal(localized, localized_infer)) + self.assert_numpy_array_equal(localized, localized_infer) # test utility methods @@ -426,7 +426,7 @@ def test_index_with_timezone_repr(self): rng_eastern = rng.tz_localize('US/Eastern') rng_repr = repr(rng_eastern) - self.assert_('2010-04-13 00:00:00' in rng_repr) + self.assertIn('2010-04-13 00:00:00', rng_repr) def test_index_astype_asobject_tzinfos(self): # #1345 @@ -484,9 +484,9 @@ def test_fixedtz_topydatetime(self): datetime(2000, 1, 2, tzinfo=fixed_off), datetime(2000, 1, 3, tzinfo=fixed_off)]) result = to_datetime(dates).to_pydatetime() - self.assert_(np.array_equal(dates, result)) + self.assert_numpy_array_equal(dates, result) result = to_datetime(dates)._mpl_repr() - self.assert_(np.array_equal(dates, result)) + 
self.assert_numpy_array_equal(dates, result) def test_convert_tz_aware_datetime_datetime(self): # #1581 @@ -502,15 +502,15 @@ def test_convert_tz_aware_datetime_datetime(self): converted = to_datetime(dates_aware, utc=True) ex_vals = [Timestamp(x).value for x in dates_aware] - self.assert_(np.array_equal(converted.asi8, ex_vals)) - self.assert_(converted.tz is pytz.utc) + self.assert_numpy_array_equal(converted.asi8, ex_vals) + self.assertIs(converted.tz, pytz.utc) def test_to_datetime_utc(self): from dateutil.parser import parse arr = np.array([parse('2012-06-13T01:39:00Z')], dtype=object) result = to_datetime(arr, utc=True) - self.assert_(result.tz is pytz.utc) + self.assertIs(result.tz, pytz.utc) def test_to_datetime_tzlocal(self): from dateutil.parser import parse @@ -521,12 +521,12 @@ def test_to_datetime_tzlocal(self): arr = np.array([dt], dtype=object) result = to_datetime(arr, utc=True) - self.assert_(result.tz is pytz.utc) + self.assertIs(result.tz, pytz.utc) rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) arr = rng.to_pydatetime() result = to_datetime(arr, utc=True) - self.assert_(result.tz is pytz.utc) + self.assertIs(result.tz, pytz.utc) def test_frame_no_datetime64_dtype(self): @@ -858,18 +858,18 @@ def test_equal_join_ensure_utc(self): ts_moscow = ts.tz_convert('Europe/Moscow') result = ts + ts_moscow - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) result = ts_moscow + ts - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) df = DataFrame({'a': ts}) df_moscow = df.tz_convert('Europe/Moscow') result = df + df_moscow - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) result = df_moscow + df - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) def test_arith_utc_convert(self): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 8c31254d26c02..19703b9e30ef6 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,11 +7,47 @@ from pandas.core.api import Timestamp from pandas.tslib import period_asfreq, period_ordinal +from pandas.tseries.index import date_range from pandas.tseries.frequencies import get_freq from pandas import _np_version_under1p7 import pandas.util.testing as tm class TestTimestamp(tm.TestCase): + def test_repr(self): + date = '2014-03-07' + tz = 'US/Eastern' + freq = 'M' + + date_only = Timestamp(date) + self.assertIn(date, repr(date_only)) + self.assertNotIn(tz, repr(date_only)) + self.assertNotIn(freq, repr(date_only)) + self.assertEqual(date_only, eval(repr(date_only))) + + date_tz = Timestamp(date, tz=tz) + self.assertIn(date, repr(date_tz)) + self.assertIn(tz, repr(date_tz)) + self.assertNotIn(freq, repr(date_tz)) + self.assertEqual(date_tz, eval(repr(date_tz))) + + date_freq = Timestamp(date, offset=freq) + self.assertIn(date, repr(date_freq)) + self.assertNotIn(tz, repr(date_freq)) + self.assertIn(freq, repr(date_freq)) + self.assertEqual(date_freq, eval(repr(date_freq))) + + date_tz_freq = Timestamp(date, tz=tz, offset=freq) + self.assertIn(date, repr(date_tz_freq)) + self.assertIn(tz, repr(date_tz_freq)) + self.assertIn(freq, repr(date_tz_freq)) + self.assertEqual(date_tz_freq, eval(repr(date_tz_freq))) + + # this can cause the tz field to be populated, but it's redundant to information in the datestring + date_with_utc_offset = Timestamp('2014-03-13 
00:00:00-0400', tz=None) + self.assertIn('2014-03-13 00:00:00-0400', repr(date_with_utc_offset)) + self.assertNotIn('tzoffset', repr(date_with_utc_offset)) + self.assertEqual(date_with_utc_offset, eval(repr(date_with_utc_offset))) + def test_bounds_with_different_units(self): out_of_bounds_dates = ( '1677-09-21', @@ -90,30 +126,26 @@ def test_does_not_convert_mixed_integer(self): class TestArrayToDatetime(tm.TestCase): def test_parsing_valid_dates(self): arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr), + np.array( [ '2013-01-01T00:00:00.000000000-0000', '2013-01-02T00:00:00.000000000-0000' ], dtype='M8[ns]' - ) ) ) arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr), + np.array( [ '2013-09-16T00:00:00.000000000-0000', '2013-09-17T00:00:00.000000000-0000' ], dtype='M8[ns]' - ) ) ) @@ -122,10 +154,10 @@ def test_number_looking_strings_not_into_datetime(self): # These strings don't look like datetimes so they shouldn't be # attempted to be converted arr = np.array(['-352.737091', '183.575577'], dtype=object) - self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) arr = np.array(['1', '2', '3', '4', '5'], dtype=object) - self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) def test_coercing_dates_outside_of_datetime64_ns_bounds(self): invalid_dates = [ @@ -154,16 +186,14 @@ def test_coercing_dates_outside_of_datetime64_ns_bounds(self): ) arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr, coerce=True), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( [ tslib.iNaT, '2000-01-01T00:00:00.000000000-0000' ], dtype='M8[ns]' - ) ) ) @@ -172,20 +202,18 @@ def test_coerce_of_invalid_datetimes(self): # Without coercing, the presence of any invalid dates prevents # any values from being converted - self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) # With coercing, the invalid dates becomes iNaT - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr, coerce=True), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( [ '2013-01-01T00:00:00.000000000-0000', tslib.iNaT, tslib.iNaT ], dtype='M8[ns]' - ) ) ) @@ -204,13 +232,11 @@ def test_parsing_timezone_offsets(self): ) for dt_string in dt_strings: - self.assert_( - np.array_equal( - tslib.array_to_datetime( - np.array([dt_string], dtype=object) - ), - expected_output - ) + self.assert_numpy_array_equal( + tslib.array_to_datetime( + np.array([dt_string], dtype=object) + ), + expected_output ) class TestTimestampNsOperations(tm.TestCase): @@ -302,10 +328,46 @@ def test_period_ordinal_business_day(self): # Tuesday self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, get_freq('B'))) -class TestTomeStampOps(tm.TestCase): +class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): - self.assertEqual((Timestamp(datetime.datetime(2013, 10,13)) - datetime.datetime(2013, 10,12)).days, 1) - 
self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10,13))).days, -1) + self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) + self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10, 13))).days, -1) + + def test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time objects + datetime_instance = datetime.datetime(2014, 3, 4) + timedelta_instance = datetime.timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, freq='D')[0] + + self.assertEqual(type(timestamp_instance + 1), Timestamp) + self.assertEqual(type(timestamp_instance - 1), Timestamp) + + # Timestamp + datetime not supported, though subtraction is supported and yields timedelta + self.assertEqual(type(timestamp_instance - datetime_instance), datetime.timedelta) + + self.assertEqual(type(timestamp_instance + timedelta_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta_instance), Timestamp) + + if not _np_version_under1p7: + # Timestamp +/- datetime64 not supported, so not tested (could possibly assert error raised?) + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual(type(timestamp_instance + timedelta64_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta64_instance), Timestamp) + + def test_addition_subtraction_preserve_frequency(self): + timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] + timedelta_instance = datetime.timedelta(days=1) + original_freq = timestamp_instance.freq + self.assertEqual((timestamp_instance + 1).freq, original_freq) + self.assertEqual((timestamp_instance - 1).freq, original_freq) + self.assertEqual((timestamp_instance + timedelta_instance).freq, original_freq) + self.assertEqual((timestamp_instance - timedelta_instance).freq, original_freq) + + if not _np_version_under1p7: + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual((timestamp_instance + timedelta64_instance).freq, original_freq) + self.assertEqual((timestamp_instance - timedelta64_instance).freq, original_freq) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 57df3c6651ad4..88559fdfee9de 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -34,6 +34,7 @@ cimport cython from datetime import timedelta, datetime from datetime import time as datetime_time +from dateutil.tz import tzoffset from pandas.compat import parse_date from sys import version_info @@ -183,6 +184,10 @@ class Timestamp(_Timestamp): if ts.value == NPY_NAT: return NaT + if util.is_string_object(offset): + from pandas.tseries.frequencies import to_offset + offset = to_offset(offset) + # make datetime happy ts_base = _Timestamp.__new__(cls, ts.dts.year, ts.dts.month, ts.dts.day, ts.dts.hour, ts.dts.min, @@ -196,26 +201,28 @@ class Timestamp(_Timestamp): return ts_base def __repr__(self): - result = self._repr_base + stamp = self._repr_base zone = None try: - result += self.strftime('%z') + stamp += self.strftime('%z') if self.tzinfo: zone = _get_zone(self.tzinfo) except ValueError: year2000 = self.replace(year=2000) - result += year2000.strftime('%z') + stamp += year2000.strftime('%z') if self.tzinfo: zone = _get_zone(self.tzinfo) try: - result += zone.strftime(' %%Z') + stamp += 
zone.strftime(' %%Z') except: pass - zone = "'%s'" % zone if zone else 'None' - return "Timestamp('%s', tz=%s)" % (result, zone) + tz = ", tz='{0}'".format(zone) if zone is not None and not isinstance(zone, tzoffset) else "" + offset = ", offset='{0}'".format(self.offset.freqstr) if self.offset is not None else "" + + return "Timestamp('{stamp}'{tz}{offset})".format(stamp=stamp, tz=tz, offset=offset) @property def _date_repr(self): @@ -384,6 +391,30 @@ class Timestamp(_Timestamp): or self.tzinfo is not None or self.nanosecond != 0) + def to_julian_date(self): + """ + Convert TimeStamp to a Julian Date. + 0 Julian date is noon January 1, 4713 BC. + """ + year = self.year + month = self.month + day = self.day + if month <= 2: + year -= 1 + month += 12 + return (day + + np.fix((153*month - 457)/5) + + 365*year + + np.floor(year / 4) - + np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + (self.hour + + self.minute/60.0 + + self.second/3600.0 + + self.microsecond/3600.0/1e+6 + + self.nanosecond/3600.0/1e+9 + )/24.0) _nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) class NaTType(_NaT): @@ -657,17 +688,17 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').astype(int) - return Timestamp(self.value + other_int, tz=self.tzinfo) + return Timestamp(self.value + other_int, tz=self.tzinfo, offset=self.offset) if is_integer_object(other): if self.offset is None: raise ValueError("Cannot add integral value to Timestamp " "without offset.") - return Timestamp((self.offset * other).apply(self)) + return Timestamp((self.offset * other).apply(self), offset=self.offset) if isinstance(other, timedelta) or hasattr(other, 'delta'): nanos = _delta_to_nanoseconds(other) - return Timestamp(self.value + nanos, tz=self.tzinfo) + return Timestamp(self.value + nanos, tz=self.tzinfo, offset=self.offset) result = datetime.__add__(self, other) if isinstance(result, datetime): @@ -676,11 +707,11 @@ cdef class _Timestamp(datetime): return result def __sub__(self, other): - if is_integer_object(other): - neg_other = -other - return self + neg_other - # This calling convention is required - return datetime.__sub__(self, other) + if isinstance(other, datetime): + return datetime.__sub__(self, other) + + neg_other = -other + return self + neg_other cpdef _get_field(self, field): out = get_date_field(np.array([self.value], dtype=np.int64), field) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 7499be7cfd3ae..2860cdf3b200d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -71,30 +71,49 @@ def assert_numpy_array_equal(self, np_array, assert_equal): return raise AssertionError('{0} is not equal to {1}.'.format(np_array, assert_equal)) - def assertIs(self, a, b, msg=''): - """Checks that 'a' is 'b'""" + def assertIs(self, first, second, msg=''): + """Checks that 'first' is 'second'""" + a, b = first, second assert a is b, "%s: %r is not %r" % (msg.format(a,b), a, b) - def assertIsNot(self, a, b, msg=''): - """Checks that 'a' is not 'b'""" + def assertIsNot(self, first, second, msg=''): + """Checks that 'first' is not 'second'""" + a, b = first, second assert a is not b, "%s: %r is %r" % (msg.format(a,b), a, b) - def assertIsNone(self, a, msg=''): - """Checks that 'a' is None""" - self.assertIs(a, None, msg) + def assertIsNone(self, expr, msg=''): + """Checks that 'expr' is None""" + self.assertIs(expr, None, msg) - def assertIsNotNone(self, a, msg=''): - """Checks that 'a' is not None""" - self.assertIsNot(a, None, 
msg) + def assertIsNotNone(self, expr, msg=''): + """Checks that 'expr' is not None""" + self.assertIsNot(expr, None, msg) - def assertIn(self, a, b, msg=''): - """Checks that 'a' is in 'b'""" + def assertIn(self, first, second, msg=''): + """Checks that 'first' is in 'second'""" + a, b = first, second assert a in b, "%s: %r is not in %r" % (msg.format(a,b), a, b) - def assertNotIn(self, a, b, msg=''): - """Checks that 'a' is not in 'b'""" + def assertNotIn(self, first, second, msg=''): + """Checks that 'first' is not in 'second'""" + a, b = first, second assert a not in b, "%s: %r is in %r" % (msg.format(a,b), a, b) + def assertIsInstance(self, obj, cls, msg=''): + """Test that obj is an instance of cls + (which can be a class or a tuple of classes, + as supported by isinstance()).""" + assert isinstance(obj, cls), ( + "%sExpected object to be of type %r, found %r instead" % ( + msg, cls, type(obj))) + + def assertNotIsInstance(self, obj, cls, msg=''): + """Test that obj is not an instance of cls + (which can be a class or a tuple of classes, + as supported by isinstance()).""" + assert not isinstance(obj, cls), ( + "%sExpected object to be of type %r, found %r instead" % ( + msg, cls, type(obj))) # NOTE: don't pass an NDFrame or index to this function - may not handle it # well. @@ -480,12 +499,18 @@ def is_sorted(seq): def assert_series_equal(left, right, check_dtype=True, check_index_type=False, check_series_type=False, - check_less_precise=False): + check_less_precise=False, + check_exact=False): if check_series_type: assert_isinstance(left, type(right)) if check_dtype: assert_attr_equal('dtype', left, right) - assert_almost_equal(left.values, right.values, check_less_precise) + if check_exact: + if not np.array_equal(left.values, right.values): + raise AssertionError('{0} is not equal to {1}.'.format(left.values, + right.values)) + else: + assert_almost_equal(left.values, right.values, check_less_precise) if check_less_precise: assert_almost_equal( left.index.values, right.index.values, check_less_precise) @@ -503,7 +528,8 @@ def assert_frame_equal(left, right, check_dtype=True, check_frame_type=False, check_less_precise=False, check_names=True, - by_blocks=False): + by_blocks=False, + check_exact=False): if check_frame_type: assert_isinstance(left, type(right)) assert_isinstance(left, DataFrame) @@ -536,7 +562,8 @@ def assert_frame_equal(left, right, check_dtype=True, assert_series_equal(lcol, rcol, check_dtype=check_dtype, check_index_type=check_index_type, - check_less_precise=check_less_precise) + check_less_precise=check_less_precise, + check_exact=check_exact) if check_index_type: assert_isinstance(left.index, type(right.index)) @@ -623,9 +650,9 @@ def makeFloatIndex(k=10): return Index(values * (10 ** np.random.randint(0, 9))) -def makeDateIndex(k=10): +def makeDateIndex(k=10, freq='B'): dt = datetime(2000, 1, 1) - dr = bdate_range(dt, periods=k) + dr = bdate_range(dt, periods=k, freq=freq) return DatetimeIndex(dr) @@ -658,10 +685,10 @@ def getSeriesData(): return dict((c, Series(randn(N), index=index)) for c in getCols(K)) -def makeTimeSeries(nper=None): +def makeTimeSeries(nper=None, freq='B'): if nper is None: nper = N - return Series(randn(nper), index=makeDateIndex(nper)) + return Series(randn(nper), index=makeDateIndex(nper, freq=freq)) def makePeriodSeries(nper=None): @@ -670,16 +697,16 @@ def makePeriodSeries(nper=None): return Series(randn(nper), index=makePeriodIndex(nper)) -def getTimeSeriesData(nper=None): - return dict((c, makeTimeSeries(nper)) for c in 
getCols(K)) +def getTimeSeriesData(nper=None, freq='B'): + return dict((c, makeTimeSeries(nper, freq)) for c in getCols(K)) def getPeriodData(nper=None): return dict((c, makePeriodSeries(nper)) for c in getCols(K)) # make frame -def makeTimeDataFrame(nper=None): - data = getTimeSeriesData(nper) +def makeTimeDataFrame(nper=None, freq='B'): + data = getTimeSeriesData(nper, freq) return DataFrame(data) @@ -892,6 +919,72 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True, return DataFrame(data, index, columns, dtype=dtype) +def _create_missing_idx(nrows, ncols, density, random_state=None): + if random_state is None: + random_state = np.random + else: + random_state = np.random.RandomState(random_state) + + # below is cribbed from scipy.sparse + size = int(np.round((1 - density) * nrows * ncols)) + # generate a few more to ensure unique values + min_rows = 5 + fac = 1.02 + extra_size = min(size + min_rows, fac * size) + + def _gen_unique_rand(rng, _extra_size): + ind = rng.rand(int(_extra_size)) + return np.unique(np.floor(ind * nrows * ncols))[:size] + + ind = _gen_unique_rand(random_state, extra_size) + while ind.size < size: + extra_size *= 1.05 + ind = _gen_unique_rand(random_state, extra_size) + + j = np.floor(ind * 1. / nrows).astype(int) + i = (ind - j * nrows).astype(int) + return i.tolist(), j.tolist() + + +def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None, + c_idx_names=True, r_idx_names=True, + c_idx_nlevels=1, r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, r_ndupe_l=None, dtype=None, + c_idx_type=None, r_idx_type=None): + """ + Parameters + ---------- + Density : float, optional + Float in (0, 1) that gives the percentage of non-missing numbers in + the DataFrame. + random_state : {np.random.RandomState, int}, optional + Random number generator or random seed. + + See makeCustomDataframe for descriptions of the rest of the parameters. 
+ """ + df = makeCustomDataframe(nrows, ncols, c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, r_ndupe_l=r_ndupe_l, + dtype=dtype, c_idx_type=c_idx_type, + r_idx_type=r_idx_type) + + i, j = _create_missing_idx(nrows, ncols, density, random_state) + df.values[i, j] = np.nan + return df + + +def makeMissingDataframe(density=.9, random_state=None): + df = makeDataFrame() + i, j = _create_missing_idx(*df.shape, density=density, + random_state=random_state) + df.values[i, j] = np.nan + return df + + def add_nans(panel): I, J, N = panel.shape for i, item in enumerate(panel.items): diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py index 1d8df95de9fe3..8180b39b116fe 100644 --- a/vb_suite/frame_ctor.py +++ b/vb_suite/frame_ctor.py @@ -1,7 +1,15 @@ from vbench.benchmark import Benchmark from datetime import datetime +try: + import pandas.tseries.offsets as offsets +except: + import pandas.core.datetools as offsets common_setup = """from pandas_vb_common import * +try: + from pandas.tseries.offsets import * +except: + from pandas.core.datetools import * """ #---------------------------------------------------------------------- @@ -36,6 +44,21 @@ """ frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup) +# dynamically generate benchmarks for every offset +dynamic_benchmarks = {} +n_steps = [1, 2] +for offset in offsets.__all__: + for n in n_steps: + setup = common_setup + """ +df = DataFrame(np.random.randn(1000,10),index=date_range('1/1/1900',periods=1000,freq={}({}))) +d = dict([ (col,df[col]) for col in df.columns ]) +""".format(offset, n) + key = 'frame_ctor_dtindex_{}({})'.format(offset, n) + dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key) + +# Have to stuff them in globals() so vbench detects them +globals().update(dynamic_benchmarks) + # from a mi-series setup = common_setup + """ mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)]) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index e658ce75247b4..35508ee3913be 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -403,3 +403,42 @@ def test_unequal(name): frame_object_unequal = Benchmark('test_unequal("object_df")', setup) frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup) +#----------------------------------------------------------------------------- +# interpolate +# this is the worst case, where every column has NaNs. 
+setup = common_setup + """ +df = DataFrame(randn(10000, 100)) +df.values[::2] = np.nan +""" + +frame_interpolate = Benchmark('df.interpolate()', setup, + start_date=datetime(2014, 2, 7)) + +setup = common_setup + """ +df = DataFrame({'A': np.arange(0, 10000), + 'B': np.random.randint(0, 100, 10000), + 'C': randn(10000), + 'D': randn(10000)}) +df.loc[1::5, 'A'] = np.nan +df.loc[1::5, 'C'] = np.nan +""" + +frame_interpolate_some_good = Benchmark('df.interpolate()', setup, + start_date=datetime(2014, 2, 7)) +frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")', + setup, + start_date=datetime(2014, 2, 7)) + +# +#------------------------------------------------------------------------- +# frame shift issue-5609 + +setup = common_setup + """ +df = pd.DataFrame(np.random.rand(10000,500)) +""" +frame_shift_axis0 = Benchmark('df.shift(1,axis=0)', setup, + name = 'frame_shift_axis_0', + start_date=datetime(2014,1,1)) +frame_shift_axis1 = Benchmark('df.shift(1,axis=1)', setup, + name = 'frame_shift_axis_1', + start_date=datetime(2014,1,1)) \ No newline at end of file diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 01b44cbd5351c..dc8103b0ceea2 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -269,6 +269,22 @@ def f(g): groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup, start_date=datetime(2011, 10, 1)) + +#---------------------------------------------------------------------- +# DataFrame nth + +setup = common_setup + """ +df = pd.DataFrame(np.random.randint(1, 100, (10000, 2))) +""" + +# Not really a fair test as behaviour has changed! +groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup, + start_date=datetime(2014, 3, 1)) + +groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup, + start_date=datetime(2014, 3, 1)) + + #---------------------------------------------------------------------- # Sum booleans #2692 diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 8b348ddc6e6cc..2cfdffdc38541 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -46,3 +46,16 @@ index_int64_intersection = Benchmark('left.intersection(right)', setup, start_date=datetime(2011, 1, 1)) + +#---------------------------------------------------------------------- +# string index slicing +setup = common_setup + """ +idx = tm.makeStringIndex(1000000) + +mask = np.arange(1000000) % 3 == 0 +series_mask = Series(mask) +""" +index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup) +index_str_slice_indexer_even = Benchmark('idx[::2]', setup) +index_str_boolean_indexer = Benchmark('idx[mask]', setup) +index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup) diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py index b60009cd272bb..45f3f510d9f08 100644 --- a/vb_suite/join_merge.py +++ b/vb_suite/join_merge.py @@ -186,6 +186,21 @@ def sample(values, k): concat_small_frames = Benchmark('concat([df] * 1000)', setup, start_date=datetime(2012, 1, 1)) + +#---------------------------------------------------------------------- +# Concat empty + +setup = common_setup + """ +df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s')) +empty = DataFrame() +""" + +concat_empty_frames1 = Benchmark('concat([df,empty])', setup, + start_date=datetime(2012, 1, 1)) +concat_empty_frames2 = Benchmark('concat([empty,df])', setup, + start_date=datetime(2012, 1, 1)) + + #---------------------------------------------------------------------- # Ordered merge
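The Julian-date arithmetic added to Timestamp.to_julian_date() in pandas/tslib.pyx above can be cross-checked against the expected values asserted in TestTimestampToJulianDate. Below is a minimal pure-Python restatement of that formula; the function name julian_date and the use of the math module are illustrative, not part of the patch.

import math

def julian_date(year, month, day, hour=0, minute=0, second=0):
    # Treat January and February as months 13 and 14 of the previous year,
    # mirroring the "month <= 2" adjustment in the patched method.
    if month <= 2:
        year -= 1
        month += 12
    day_fraction = (hour + minute / 60.0 + second / 3600.0) / 24.0
    return (day
            + math.trunc((153 * month - 457) / 5.0)  # whole days in the preceding months of the shifted year
            + 365 * year
            + math.floor(year / 4.0)                 # Gregorian leap-year corrections
            - math.floor(year / 100.0)
            + math.floor(year / 400.0)
            + 1721118.5                              # offset so Julian day 0 falls at noon, 1 January 4713 BC
            + day_fraction)

# These reproduce the values asserted in the new tests above:
assert julian_date(1700, 6, 23) == 2342145.5
assert julian_date(2000, 4, 12) == 2451646.5
assert abs(julian_date(2000, 8, 12, hour=13) - 2451769.0416666666) < 1e-6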
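Separately, the check_exact flag threaded through assert_series_equal and assert_frame_equal in pandas/util/testing.py above switches the value comparison from the default almost-equal check to an exact np.array_equal test. A small usage sketch follows; the example Series are illustrative, and the expectation that a 1e-12 difference passes the default comparison assumes the usual few-significant-digit tolerance of assert_almost_equal.

import pandas as pd
import pandas.util.testing as tm

s1 = pd.Series([0.1, 0.2, 0.3])
s2 = s1 + 1e-12  # differs only far below the almost-equal tolerance

# Default behaviour: tiny numerical noise should be ignored.
tm.assert_series_equal(s1, s2)

# With check_exact=True the values are compared with np.array_equal,
# so the same tiny difference now raises AssertionError.
try:
    tm.assert_series_equal(s1, s2, check_exact=True)
except AssertionError:
    pass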