From 2a6357911ef8aea94905320c4cd412c72f793066 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 2 Jan 2015 12:31:38 -0600 Subject: [PATCH 1/6] TST: Make test work. --- pandas/rpy/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py index a2e6d08d07b58..3416e46a291f5 100644 --- a/pandas/rpy/tests/test_common.py +++ b/pandas/rpy/tests/test_common.py @@ -205,7 +205,7 @@ def test_factor(self): level = list(r['levels'](vector)) factors = [level[index - 1] for index in factors] result = com.load_data(name) - assert np.equal(result, factors) + np.testing.assert_equal(result, factors) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 0b3327f7f3914f916a09ff6451cca371a7c181f9 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 2 Jan 2015 12:32:19 -0600 Subject: [PATCH 2/6] TST: Test DataFrame code path for factors --- pandas/rpy/tests/test_common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py index 3416e46a291f5..6a0527d91a612 100644 --- a/pandas/rpy/tests/test_common.py +++ b/pandas/rpy/tests/test_common.py @@ -207,6 +207,14 @@ def test_factor(self): result = com.load_data(name) np.testing.assert_equal(result, factors) + # test it as a data.frame + result = com.convert_robj(r("as.data.frame({0})".format(name))) + np.testing.assert_equal(result[name].values, factors) + + def test_factor_as_factor(self): + for name in ('state.division', 'state.region'): + pass + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], # '--with-coverage', '--cover-package=pandas.core'], From 0d75807432ea1baa086004f858b63538ed0bf785 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 2 Jan 2015 13:31:24 -0600 Subject: [PATCH 3/6] ENH: Convert R factors to pandas.Categoricals --- pandas/rpy/common.py | 72 +++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py index 55adad3610816..cb114a146eae6 100644 --- a/pandas/rpy/common.py +++ b/pandas/rpy/common.py @@ -23,7 +23,7 @@ 'convert_to_r_matrix'] -def load_data(name, package=None, convert=True): +def load_data(name, package=None, convert=True, factors_as_strings=True): if package: importr(package) @@ -32,7 +32,7 @@ def load_data(name, package=None, convert=True): robj = r[name] if convert: - return convert_robj(robj) + return convert_robj(robj, factors_as_strings=factors_as_strings) else: return robj @@ -48,7 +48,7 @@ def _is_null(obj): return _rclass(obj) == 'NULL' -def _convert_list(obj): +def _convert_list(obj, **kwargs): """ Convert named Vector to dict, factors to list """ @@ -64,7 +64,7 @@ def _convert_list(obj): return result -def _convert_array(obj): +def _convert_array(obj, **kwargs): """ Convert Array to DataFrame """ @@ -90,8 +90,11 @@ def _list(item): return df -def _convert_vector(obj): - if isinstance(obj, robj.IntVector): +def _convert_vector(obj, **kwargs): + # FactorVector is sub-class, so check first + if isinstance(obj, robj.FactorVector): + return _convert_factor(obj, **kwargs) + elif isinstance(obj, robj.IntVector): return _convert_int_vector(obj) elif isinstance(obj, robj.StrVector): return _convert_str_vector(obj) @@ -117,7 +120,7 @@ def _convert_vector(obj): NA_INTEGER = -2147483648 -def _convert_int_vector(obj): +def _convert_int_vector(obj, **kwargs): arr = np.asarray(obj) mask = arr == NA_INTEGER if mask.any(): @@ -126,7 +129,7 @@ def _convert_int_vector(obj): return arr -def _convert_str_vector(obj): +def _convert_str_vector(obj, **kwargs): arr = np.asarray(obj, dtype=object) mask = arr == robj.NA_Character if mask.any(): @@ -134,35 +137,47 @@ def _convert_str_vector(obj): return arr -def _convert_DataFrame(rdf): +def _convert_factor(obj, **kwargs): + if kwargs.get("factors_as_strings", True): + levels = np.asarray(obj.levels) + values = np.asarray(obj) + if com.is_float_dtype(values): + mask = np.isnan(values) + notmask = -mask + result = np.empty(len(values), dtype=object) + result[mask] = np.nan + + locs = (values[notmask] - 1).astype(np.int_) + result[notmask] = levels.take(locs) + values = result + else: + values = np.asarray(obj.levels).take(values - 1) + + else: # give a categorical object back + ordered = r["is.ordered"](obj)[0] + categories = list(obj.levels) + codes = np.asarray(obj) - 1 # zero-based indexing + values = pd.Categorical.from_codes(codes, categories=categories, + ordered=ordered) + + return values + + +def _convert_DataFrame(rdf, **kwargs): columns = list(rdf.colnames) rows = np.array(rdf.rownames) data = {} for i, col in enumerate(columns): vec = rdf.rx2(i + 1) - values = _convert_vector(vec) - - if isinstance(vec, robj.FactorVector): - levels = np.asarray(vec.levels) - if com.is_float_dtype(values): - mask = np.isnan(values) - notmask = -mask - result = np.empty(len(values), dtype=object) - result[mask] = np.nan - - locs = (values[notmask] - 1).astype(np.int_) - result[notmask] = levels.take(locs) - values = result - else: - values = np.asarray(vec.levels).take(values - 1) + values = _convert_vector(vec, **kwargs) data[col] = values return pd.DataFrame(data, index=_check_int(rows), columns=columns) -def _convert_Matrix(mat): +def _convert_Matrix(mat, **kwargs): columns = mat.colnames rows = mat.rownames @@ -181,12 +196,14 @@ def _check_int(vec): return vec + _pandas_converters = [ (robj.DataFrame, _convert_DataFrame), (robj.Matrix, _convert_Matrix), (robj.StrVector, _convert_vector), (robj.FloatVector, _convert_vector), (robj.Array, _convert_array), + (robj.FactorVector, _convert_factor), (robj.Vector, _convert_list), ] @@ -197,11 +214,12 @@ def _check_int(vec): (robj.StrVector, _convert_vector), (robj.FloatVector, _convert_vector), (robj.Array, _convert_array), + (robj.FactorVector, _convert_factor), (robj.Vector, _convert_list), ] -def convert_robj(obj, use_pandas=True): +def convert_robj(obj, use_pandas=True, factors_as_strings=True): """ Convert rpy2 object to a pandas-friendly form @@ -220,7 +238,7 @@ def convert_robj(obj, use_pandas=True): for rpy_type, converter in converters: if isinstance(obj, rpy_type): - return converter(obj) + return converter(obj, factors_as_strings=factors_as_strings) raise TypeError('Do not know what to do with %s object' % type(obj)) From 57cf1e73238b75851d5ad9ad3b397c255df02bb8 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 2 Jan 2015 13:53:50 -0600 Subject: [PATCH 4/6] TST: Test to R data.frame with categorical --- pandas/rpy/tests/test_common.py | 37 ++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py index 6a0527d91a612..9e1fbc943323b 100644 --- a/pandas/rpy/tests/test_common.py +++ b/pandas/rpy/tests/test_common.py @@ -213,7 +213,42 @@ def test_factor(self): def test_factor_as_factor(self): for name in ('state.division', 'state.region'): - pass + vector = r[name] + factors = np.asarray(r['factor'](vector)) - 1 + level = list(r['levels'](vector)) + ordered = r["is.ordered"](vector)[0] + + result = com.load_data(name, factors_as_strings=False) + factor = pd.Categorical.from_codes(factors, categories=level, + ordered=ordered) + np.testing.assert_(result.equals(factor)) + + # test it as a data.frame + result = com.convert_robj(r("as.data.frame({0})".format(name)), + factors_as_strings=False) + np.testing.assert_(isinstance(result, pd.DataFrame)) + np.testing.assert_(result[name].dtype.type == + pd.core.common.CategoricalDtypeType) + + # no easy way to go from categorical Series to Cateogical? + np.testing.assert_equal(result[name].cat.codes, factor.codes) + cat_equals = result[name].cat.categories.equals(factor.categories) + np.testing.assert_(cat_equals) + + def test_to_r_dataframe_with_categorical(self): + r("dta <- warpbreaks") + r("dta[\"tension\"] <- factor(warpbreaks$tension, ordered=TRUE)") + dta = com.load_data("dta", factors_as_strings=False) + # check this while we're here + np.testing.assert_(not dta.wool.cat.ordered) + np.testing.assert_(dta.tension.cat.ordered) + df = com.convert_to_r_dataframe(dta) + np.testing.assert_(isinstance(df[1], robj.FactorVector)) + np.testing.assert_(isinstance(df[2], robj.FactorVector)) + + np.testing.assert_(not r["is.ordered"](df[1])[0]) + np.testing.assert_(r["is.ordered"](df[2])[0]) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 5f4db399422273779f93739b6e1134425b773af2 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 2 Jan 2015 13:54:32 -0600 Subject: [PATCH 5/6] ENH: Handle to R data.frame with Categorical --- pandas/rpy/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py index cb114a146eae6..cab47a4e2c00b 100644 --- a/pandas/rpy/common.py +++ b/pandas/rpy/common.py @@ -336,6 +336,11 @@ def convert_to_r_dataframe(df, strings_as_factors=False): if value_type == np.datetime64: value = convert_to_r_posixct(value) + elif value_type == com.CategoricalDtypeType: + levels = robj.StrVector(value.cat.categories) + value = robj.FactorVector(value, + levels=levels, + ordered=value.cat.ordered) else: value = [item if pd.notnull(item) else NA_TYPES[value_type] for item in value] From 620b5edccab803b7b6fa2cd70545189360fc3ec9 Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Fri, 2 Jan 2015 15:33:40 -0600 Subject: [PATCH 6/6] ENH: Support int8 and int16. [skip-ci] --- pandas/rpy/common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py index cab47a4e2c00b..6318b50052bb5 100644 --- a/pandas/rpy/common.py +++ b/pandas/rpy/common.py @@ -281,6 +281,8 @@ def convert_to_r_posixct(obj): np.float32: robj.FloatVector, np.float: robj.FloatVector, np.int: robj.IntVector, + np.int8: robj.IntVector, + np.int16: robj.IntVector, np.int32: robj.IntVector, np.int64: robj.IntVector, np.object_: robj.StrVector, @@ -292,6 +294,8 @@ def convert_to_r_posixct(obj): np.float32: robj.NA_Real, np.float: robj.NA_Real, np.int: robj.NA_Integer, + np.int8: robj.NA_Integer, + np.int16: robj.NA_Integer, np.int32: robj.NA_Integer, np.int64: robj.NA_Integer, np.object_: robj.NA_Character,