Skip to content

Commit 96033e0

Browse files
committed
ENH: Allow load_data to load the "Titanic" and other problematic R datasets
TST: Move tests from rpy/common.py to rpy/tests/test_common.py. TST: Add tests to demonstrate the enhancements made to rpy/common.py. DOC: Add explanation to doc/source/release.rst and doc/source/v0.13.0.txt.
1 parent 6857482 commit 96033e0

File tree

6 files changed

+277
-142
lines changed

6 files changed

+277
-142
lines changed

doc/source/r_interface.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ its release 2.3, while the current interface is
2020
designed for the 2.2.x series. We recommend to use 2.2.x over other series
2121
unless you are prepared to fix parts of the code, yet the rpy2-2.3.0
2222
introduces improvements such as a better R-Python bridge memory management
23-
layer so I might be a good idea to bite the bullet and submit patches for
23+
layer so it might be a good idea to bite the bullet and submit patches for
2424
the few minor differences that need to be fixed.
2525

2626

doc/source/release.rst

+4
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,10 @@ Improvements to existing features
160160
:issue:`4998`)
161161
- ``to_dict`` now takes ``records`` as a possible outtype. Returns an array
162162
of column-keyed dictionaries. (:issue:`4936`)
163+
- Improve support for converting R datasets to pandas objects (more
164+
informative index for timeseries and numeric, support for factors, dist, and
165+
high-dimensional arrays).
166+
163167

164168
API Changes
165169
~~~~~~~~~~~

doc/source/v0.13.0.txt

+6
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,12 @@ Enhancements
480480
dfi[mask.any(1)]
481481

482482
:ref:`See the docs<indexing.basics.indexing_isin>` for more.
483+
- All R datasets listed at http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html can now be loaded into pandas objects:
484+
485+
.. code-block:: python
486+
487+
import pandas.rpy.common as com
488+
com.load_data('Titanic')
483489

484490
.. _whatsnew_0130.experimental:
485491

pandas/rpy/common.py

+53-141
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
from rpy2.robjects import r
1616
import rpy2.robjects as robj
1717

18+
import itertools as IT
19+
20+
1821
__all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe',
1922
'convert_to_r_matrix']
2023

@@ -46,47 +49,69 @@ def _is_null(obj):
4649

4750
def _convert_list(obj):
4851
"""
49-
Convert named Vector to dict
52+
Convert named Vector to dict, factors to list
5053
"""
51-
values = [convert_robj(x) for x in obj]
52-
return dict(zip(obj.names, values))
54+
try:
55+
values = [convert_robj(x) for x in obj]
56+
keys = r['names'](obj)
57+
return dict(zip(keys, values))
58+
except TypeError:
59+
# For state.division and state.region
60+
factors = list(r['factor'](obj))
61+
level = list(r['levels'](obj))
62+
result = [level[index-1] for index in factors]
63+
return result
5364

5465

5566
def _convert_array(obj):
5667
"""
57-
Convert Array to ndarray
68+
Convert Array to DataFrame
5869
"""
59-
# this royally sucks. "Matrices" (arrays) with dimension > 3 in R aren't
60-
# really matrices-- things come out Fortran order in the first two
61-
# dimensions. Maybe I'm wrong?
62-
70+
def _list(item):
71+
try:
72+
return list(item)
73+
except TypeError:
74+
return []
75+
76+
# For iris3, HairEyeColor, UCBAdmissions, Titanic
6377
dim = list(obj.dim)
6478
values = np.array(list(obj))
65-
66-
if len(dim) == 3:
67-
arr = values.reshape(dim[-1:] + dim[:-1]).swapaxes(1, 2)
68-
69-
if obj.names is not None:
70-
name_list = [list(x) for x in obj.names]
71-
if len(dim) == 2:
72-
return pd.DataFrame(arr, index=name_list[0], columns=name_list[1])
73-
elif len(dim) == 3:
74-
return pd.Panel(arr, items=name_list[2],
75-
major_axis=name_list[0],
76-
minor_axis=name_list[1])
77-
else:
78-
print('Cannot handle dim=%d' % len(dim))
79-
else:
80-
return arr
79+
names = r['dimnames'](obj)
80+
try:
81+
columns = list(r['names'](names))[::-1]
82+
except TypeError:
83+
columns = ['X{:d}'.format(i) for i in range(len(names))][::-1]
84+
columns.append('value')
85+
name_list = [(_list(x) or range(d)) for x, d in zip(names, dim)][::-1]
86+
arr = np.array(list(IT.product(*name_list)))
87+
arr = np.column_stack([arr,values])
88+
df = pd.DataFrame(arr, columns=columns)
89+
return df
8190

8291

8392
def _convert_vector(obj):
8493
if isinstance(obj, robj.IntVector):
8594
return _convert_int_vector(obj)
8695
elif isinstance(obj, robj.StrVector):
8796
return _convert_str_vector(obj)
88-
89-
return list(obj)
97+
# Check if the vector has extra information attached to it that can be used
98+
# as an index
99+
try:
100+
attributes = set(r['attributes'](obj).names)
101+
except AttributeError:
102+
return list(obj)
103+
if 'names' in attributes:
104+
return pd.Series(list(obj), index=r['names'](obj))
105+
elif 'tsp' in attributes:
106+
return pd.Series(list(obj), index=r['time'](obj))
107+
elif 'labels' in attributes:
108+
return pd.Series(list(obj), index=r['labels'](obj))
109+
if _rclass(obj) == 'dist':
110+
# For 'eurodist'. WARNING: This results in a DataFrame, not a Series or list.
111+
matrix = r['as.matrix'](obj)
112+
return convert_robj(matrix)
113+
else:
114+
return list(obj)
90115

91116
NA_INTEGER = -2147483648
92117

@@ -141,8 +166,7 @@ def _convert_Matrix(mat):
141166
rows = mat.rownames
142167

143168
columns = None if _is_null(columns) else list(columns)
144-
index = None if _is_null(rows) else list(rows)
145-
169+
index = r['time'](mat) if _is_null(rows) else list(rows)
146170
return pd.DataFrame(np.array(mat), index=_check_int(index),
147171
columns=columns)
148172

@@ -197,7 +221,7 @@ def convert_robj(obj, use_pandas=True):
197221
if isinstance(obj, rpy_type):
198222
return converter(obj)
199223

200-
raise Exception('Do not know what to do with %s object' % type(obj))
224+
raise TypeError('Do not know what to do with %s object' % type(obj))
201225

202226

203227
def convert_to_r_posixct(obj):
@@ -329,117 +353,5 @@ def convert_to_r_matrix(df, strings_as_factors=False):
329353

330354
return r_matrix
331355

332-
333-
def test_convert_list():
334-
obj = r('list(a=1, b=2, c=3)')
335-
336-
converted = convert_robj(obj)
337-
expected = {'a': [1], 'b': [2], 'c': [3]}
338-
339-
_test.assert_dict_equal(converted, expected)
340-
341-
342-
def test_convert_nested_list():
343-
obj = r('list(a=list(foo=1, bar=2))')
344-
345-
converted = convert_robj(obj)
346-
expected = {'a': {'foo': [1], 'bar': [2]}}
347-
348-
_test.assert_dict_equal(converted, expected)
349-
350-
351-
def test_convert_frame():
352-
# built-in dataset
353-
df = r['faithful']
354-
355-
converted = convert_robj(df)
356-
357-
assert np.array_equal(converted.columns, ['eruptions', 'waiting'])
358-
assert np.array_equal(converted.index, np.arange(1, 273))
359-
360-
361-
def _test_matrix():
362-
r('mat <- matrix(rnorm(9), ncol=3)')
363-
r('colnames(mat) <- c("one", "two", "three")')
364-
r('rownames(mat) <- c("a", "b", "c")')
365-
366-
return r['mat']
367-
368-
369-
def test_convert_matrix():
370-
mat = _test_matrix()
371-
372-
converted = convert_robj(mat)
373-
374-
assert np.array_equal(converted.index, ['a', 'b', 'c'])
375-
assert np.array_equal(converted.columns, ['one', 'two', 'three'])
376-
377-
378-
def test_convert_r_dataframe():
379-
380-
is_na = robj.baseenv.get("is.na")
381-
382-
seriesd = _test.getSeriesData()
383-
frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
384-
385-
# Null data
386-
frame["E"] = [np.nan for item in frame["A"]]
387-
# Some mixed type data
388-
frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]
389-
390-
r_dataframe = convert_to_r_dataframe(frame)
391-
392-
assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
393-
assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
394-
assert all(is_na(item) for item in r_dataframe.rx2("E"))
395-
396-
for column in frame[["A", "B", "C", "D"]]:
397-
coldata = r_dataframe.rx2(column)
398-
original_data = frame[column]
399-
assert np.array_equal(convert_robj(coldata), original_data)
400-
401-
for column in frame[["D", "E"]]:
402-
for original, converted in zip(frame[column],
403-
r_dataframe.rx2(column)):
404-
405-
if pd.isnull(original):
406-
assert is_na(converted)
407-
else:
408-
assert original == converted
409-
410-
411-
def test_convert_r_matrix():
412-
413-
is_na = robj.baseenv.get("is.na")
414-
415-
seriesd = _test.getSeriesData()
416-
frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
417-
# Null data
418-
frame["E"] = [np.nan for item in frame["A"]]
419-
420-
r_dataframe = convert_to_r_matrix(frame)
421-
422-
assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
423-
assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
424-
assert all(is_na(item) for item in r_dataframe.rx(True, "E"))
425-
426-
for column in frame[["A", "B", "C", "D"]]:
427-
coldata = r_dataframe.rx(True, column)
428-
original_data = frame[column]
429-
assert np.array_equal(convert_robj(coldata),
430-
original_data)
431-
432-
# Pandas bug 1282
433-
frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]
434-
435-
# FIXME: Ugly, this whole module needs to be ported to nose/unittest
436-
try:
437-
wrong_matrix = convert_to_r_matrix(frame)
438-
except TypeError:
439-
pass
440-
except Exception:
441-
raise
442-
443-
444356
if __name__ == '__main__':
445357
pass

pandas/rpy/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)