Skip to content

ENH: Handle categorical dtype to/from R #9187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 54 additions & 27 deletions pandas/rpy/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
'convert_to_r_matrix']


def load_data(name, package=None, convert=True):
def load_data(name, package=None, convert=True, factors_as_strings=True):
if package:
importr(package)

Expand All @@ -32,7 +32,7 @@ def load_data(name, package=None, convert=True):
robj = r[name]

if convert:
return convert_robj(robj)
return convert_robj(robj, factors_as_strings=factors_as_strings)
else:
return robj

Expand All @@ -48,7 +48,7 @@ def _is_null(obj):
return _rclass(obj) == 'NULL'


def _convert_list(obj):
def _convert_list(obj, **kwargs):
"""
Convert named Vector to dict, factors to list
"""
Expand All @@ -64,7 +64,7 @@ def _convert_list(obj):
return result


def _convert_array(obj):
def _convert_array(obj, **kwargs):
"""
Convert Array to DataFrame
"""
Expand All @@ -90,8 +90,11 @@ def _list(item):
return df


def _convert_vector(obj):
if isinstance(obj, robj.IntVector):
def _convert_vector(obj, **kwargs):
# FactorVector is sub-class, so check first
if isinstance(obj, robj.FactorVector):
return _convert_factor(obj, **kwargs)
elif isinstance(obj, robj.IntVector):
return _convert_int_vector(obj)
elif isinstance(obj, robj.StrVector):
return _convert_str_vector(obj)
Expand All @@ -117,7 +120,7 @@ def _convert_vector(obj):
NA_INTEGER = -2147483648


def _convert_int_vector(obj):
def _convert_int_vector(obj, **kwargs):
arr = np.asarray(obj)
mask = arr == NA_INTEGER
if mask.any():
Expand All @@ -126,43 +129,55 @@ def _convert_int_vector(obj):
return arr


def _convert_str_vector(obj, **kwargs):
    """
    Convert an R string vector to an object-dtype ndarray

    Entries that compare equal to ``robj.NA_Character`` are replaced by
    ``np.nan``. Extra keyword arguments are accepted and ignored.
    """
    strings = np.asarray(obj, dtype=object)
    is_na = strings == robj.NA_Character
    if is_na.any():
        # only write into the array when there is something to replace
        strings[is_na] = np.nan
    return strings


def _convert_DataFrame(rdf):
def _convert_factor(obj, **kwargs):
    """
    Convert an R factor to an array of level labels or to a Categorical

    Parameters
    ----------
    obj : factor-like object
        Must expose ``levels`` and convert through ``np.asarray`` to the
        1-based level codes.
    factors_as_strings : bool, default True
        Read from ``kwargs``. If True, return an ndarray holding the level
        label of every element. If False, return a ``pd.Categorical``
        built from the zero-based codes.

    Returns
    -------
    ndarray or pd.Categorical
        Missing entries become ``np.nan`` (the ndarray is then of object
        dtype) or code ``-1`` in the Categorical.
    """
    levels = np.asarray(obj.levels)
    raw = np.asarray(obj)

    # A missing factor entry arrives as the integer NA sentinel (INT_MIN,
    # the same value as this module's NA_INTEGER). Subtracting 1 from it
    # wraps around to 2147483647, which made ``take`` raise IndexError and
    # ``Categorical.from_codes`` raise ValueError (e.g. the 'Prestige'
    # data set from the 'car' package). Mask the sentinel before shifting
    # to zero-based codes. Float input with NaN is still honoured.
    if raw.dtype.kind == 'f':
        missing = np.isnan(raw)
    else:
        missing = raw == np.iinfo(np.int32).min
    present = ~missing

    # zero-based codes, with -1 marking a missing value
    codes = np.empty(len(raw), dtype=np.int64)
    codes.fill(-1)
    codes[present] = raw[present].astype(np.int64) - 1

    if kwargs.get("factors_as_strings", True):
        if not missing.any():
            return levels.take(codes)

        labels = np.empty(len(codes), dtype=object)
        labels[missing] = np.nan
        labels[present] = levels.take(codes[present])
        return labels

    # give a categorical object back
    ordered = r["is.ordered"](obj)[0]
    return pd.Categorical.from_codes(codes, categories=list(obj.levels),
                                     ordered=ordered)


def _convert_DataFrame(rdf, **kwargs):
    """
    Convert an R data.frame to a pandas DataFrame

    Every column is pulled out with ``rdf.rx2`` and handed to
    ``_convert_vector`` together with ``kwargs``; the row names, passed
    through ``_check_int``, become the index.
    """
    col_names = list(rdf.colnames)
    row_labels = np.array(rdf.rownames)

    # rx2 is called with 1-based positions, hence the enumerate start
    converted = dict(
        (label, _convert_vector(rdf.rx2(position), **kwargs))
        for position, label in enumerate(col_names, 1)
    )

    return pd.DataFrame(converted, index=_check_int(row_labels),
                        columns=col_names)


def _convert_Matrix(mat):
def _convert_Matrix(mat, **kwargs):
columns = mat.colnames
rows = mat.rownames

Expand All @@ -181,12 +196,14 @@ def _check_int(vec):

return vec


# Ordered (rpy2 type, converter) pairs.
# NOTE(review): presumably this is the list convert_robj walks when
# use_pandas is true -- confirm. The dispatch loop visible in convert_robj
# returns on the first isinstance match, so the order of entries matters:
# a specific type has to appear before any more general type it also
# satisfies (FactorVector is presumably a Vector subclass, hence its
# position ahead of the Vector catch-all -- confirm against rpy2).
_pandas_converters = [
    (robj.DataFrame, _convert_DataFrame),
    (robj.Matrix, _convert_Matrix),
    (robj.StrVector, _convert_vector),
    (robj.FloatVector, _convert_vector),
    (robj.Array, _convert_array),
    (robj.FactorVector, _convert_factor),
    (robj.Vector, _convert_list),
]

Expand All @@ -197,11 +214,12 @@ def _check_int(vec):
(robj.StrVector, _convert_vector),
(robj.FloatVector, _convert_vector),
(robj.Array, _convert_array),
(robj.FactorVector, _convert_factor),
(robj.Vector, _convert_list),
]


def convert_robj(obj, use_pandas=True):
def convert_robj(obj, use_pandas=True, factors_as_strings=True):
"""
Convert rpy2 object to a pandas-friendly form

Expand All @@ -220,7 +238,7 @@ def convert_robj(obj, use_pandas=True):

for rpy_type, converter in converters:
if isinstance(obj, rpy_type):
return converter(obj)
return converter(obj, factors_as_strings=factors_as_strings)

raise TypeError('Do not know what to do with %s object' % type(obj))

Expand Down Expand Up @@ -263,6 +281,8 @@ def convert_to_r_posixct(obj):
np.float32: robj.FloatVector,
np.float: robj.FloatVector,
np.int: robj.IntVector,
np.int8: robj.IntVector,
np.int16: robj.IntVector,
np.int32: robj.IntVector,
np.int64: robj.IntVector,
np.object_: robj.StrVector,
Expand All @@ -274,6 +294,8 @@ def convert_to_r_posixct(obj):
np.float32: robj.NA_Real,
np.float: robj.NA_Real,
np.int: robj.NA_Integer,
np.int8: robj.NA_Integer,
np.int16: robj.NA_Integer,
np.int32: robj.NA_Integer,
np.int64: robj.NA_Integer,
np.object_: robj.NA_Character,
Expand Down Expand Up @@ -318,6 +340,11 @@ def convert_to_r_dataframe(df, strings_as_factors=False):

if value_type == np.datetime64:
value = convert_to_r_posixct(value)
elif value_type == com.CategoricalDtypeType:
levels = robj.StrVector(value.cat.categories)
value = robj.FactorVector(value,
levels=levels,
ordered=value.cat.ordered)
else:
value = [item if pd.notnull(item) else NA_TYPES[value_type]
for item in value]
Expand Down
45 changes: 44 additions & 1 deletion pandas/rpy/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,50 @@ def test_factor(self):
level = list(r['levels'](vector))
factors = [level[index - 1] for index in factors]
result = com.load_data(name)
assert np.equal(result, factors)
np.testing.assert_equal(result, factors)

# test it as a data.frame
result = com.convert_robj(r("as.data.frame({0})".format(name)))
np.testing.assert_equal(result[name].values, factors)

def test_factor_as_factor(self):
    """Factors loaded with factors_as_strings=False come back categorical."""
    for dataset in ('state.division', 'state.region'):
        r_factor = r[dataset]
        zero_based = np.asarray(r['factor'](r_factor)) - 1
        categories = list(r['levels'](r_factor))
        is_ordered = r["is.ordered"](r_factor)[0]

        # bare factor: expect a Categorical equal to the one built by hand
        loaded = com.load_data(dataset, factors_as_strings=False)
        expected = pd.Categorical.from_codes(zero_based,
                                             categories=categories,
                                             ordered=is_ordered)
        np.testing.assert_(loaded.equals(expected))

        # same factor wrapped in a data.frame: expect a categorical column
        frame = com.convert_robj(r("as.data.frame({0})".format(dataset)),
                                 factors_as_strings=False)
        np.testing.assert_(isinstance(frame, pd.DataFrame))
        column = frame[dataset]
        np.testing.assert_(column.dtype.type ==
                           pd.core.common.CategoricalDtypeType)

        # compare the categorical Series to the Categorical piecewise,
        # codes first and then categories
        np.testing.assert_equal(column.cat.codes, expected.codes)
        same_categories = column.cat.categories.equals(expected.categories)
        np.testing.assert_(same_categories)

def test_to_r_dataframe_with_categorical(self):
    """Categorical columns become R factors and keep their ordered flag."""
    # warpbreaks copy whose 'tension' column is turned into an ordered factor
    r("dta <- warpbreaks")
    r("dta[\"tension\"] <- factor(warpbreaks$tension, ordered=TRUE)")
    frame = com.load_data("dta", factors_as_strings=False)

    # the ordered flag must already be right on the pandas side
    np.testing.assert_(not frame.wool.cat.ordered)
    np.testing.assert_(frame.tension.cat.ordered)

    r_frame = com.convert_to_r_dataframe(frame)
    wool, tension = r_frame[1], r_frame[2]
    for r_column in (wool, tension):
        np.testing.assert_(isinstance(r_column, robj.FactorVector))

    np.testing.assert_(not r["is.ordered"](wool)[0])
    np.testing.assert_(r["is.ordered"](tension)[0])


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down