Skip to content

Commit ec54354

Browse files
committed
Merge branch 'r-dataset' of https://github.com/unutbu/pandas into unutbu-r-dataset
Conflicts: doc/source/release.rst
2 parents 40ff9bc + 96033e0 commit ec54354

File tree

6 files changed

+276
-142
lines changed

6 files changed

+276
-142
lines changed

doc/source/r_interface.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ its release 2.3, while the current interface is
2020
designed for the 2.2.x series. We recommend using 2.2.x over other series
2121
unless you are prepared to fix parts of the code, yet the rpy2-2.3.0
2222
introduces improvements such as a better R-Python bridge memory management
23-
layer so I might be a good idea to bite the bullet and submit patches for
23+
layer so it might be a good idea to bite the bullet and submit patches for
2424
the few minor differences that need to be fixed.
2525

2626

doc/source/release.rst

+3
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,9 @@ Improvements to existing features
164164
- ``tz_localize`` can infer a fall daylight savings transition based on the
165165
structure of unlocalized data (:issue:`4230`)
166166
- DatetimeIndex is now in the API documentation
167+
- Improve support for converting R datasets to pandas objects (more
168+
informative index for timeseries and numeric, support for factors, dist, and
169+
high-dimensional arrays).
167170

168171
API Changes
169172
~~~~~~~~~~~

doc/source/v0.13.0.txt

+6
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,12 @@ Enhancements
480480
dfi[mask.any(1)]
481481

482482
:ref:`See the docs<indexing.basics.indexing_isin>` for more.
483+
- All R datasets listed at http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html can now be loaded into pandas objects:
484+
485+
.. code-block:: python
486+
487+
import pandas.rpy.common as com
488+
com.load_data('Titanic')
483489

484490
- ``tz_localize`` can infer a fall daylight savings transition based on the structure
485491
of the unlocalized data (:issue:`4230`), see :ref:`here<timeseries.timezone>`

pandas/rpy/common.py

+53-141
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
from rpy2.robjects import r
1616
import rpy2.robjects as robj
1717

18+
import itertools as IT
19+
20+
1821
__all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe',
1922
'convert_to_r_matrix']
2023

@@ -46,47 +49,69 @@ def _is_null(obj):
4649

4750
def _convert_list(obj):
4851
"""
49-
Convert named Vector to dict
52+
Convert named Vector to dict, factors to list
5053
"""
51-
values = [convert_robj(x) for x in obj]
52-
return dict(zip(obj.names, values))
54+
try:
55+
values = [convert_robj(x) for x in obj]
56+
keys = r['names'](obj)
57+
return dict(zip(keys, values))
58+
except TypeError:
59+
# For state.division and state.region
60+
factors = list(r['factor'](obj))
61+
level = list(r['levels'](obj))
62+
result = [level[index-1] for index in factors]
63+
return result
5364

5465

5566
def _convert_array(obj):
5667
"""
57-
Convert Array to ndarray
68+
Convert Array to DataFrame
5869
"""
59-
# this royally sucks. "Matrices" (arrays) with dimension > 3 in R aren't
60-
# really matrices-- things come out Fortran order in the first two
61-
# dimensions. Maybe I'm wrong?
62-
70+
def _list(item):
71+
try:
72+
return list(item)
73+
except TypeError:
74+
return []
75+
76+
# For iris3, HairEyeColor, UCBAdmissions, Titanic
6377
dim = list(obj.dim)
6478
values = np.array(list(obj))
65-
66-
if len(dim) == 3:
67-
arr = values.reshape(dim[-1:] + dim[:-1]).swapaxes(1, 2)
68-
69-
if obj.names is not None:
70-
name_list = [list(x) for x in obj.names]
71-
if len(dim) == 2:
72-
return pd.DataFrame(arr, index=name_list[0], columns=name_list[1])
73-
elif len(dim) == 3:
74-
return pd.Panel(arr, items=name_list[2],
75-
major_axis=name_list[0],
76-
minor_axis=name_list[1])
77-
else:
78-
print('Cannot handle dim=%d' % len(dim))
79-
else:
80-
return arr
79+
names = r['dimnames'](obj)
80+
try:
81+
columns = list(r['names'](names))[::-1]
82+
except TypeError:
83+
columns = ['X{:d}'.format(i) for i in range(len(names))][::-1]
84+
columns.append('value')
85+
name_list = [(_list(x) or range(d)) for x, d in zip(names, dim)][::-1]
86+
arr = np.array(list(IT.product(*name_list)))
87+
arr = np.column_stack([arr,values])
88+
df = pd.DataFrame(arr, columns=columns)
89+
return df
8190

8291

8392
def _convert_vector(obj):
8493
if isinstance(obj, robj.IntVector):
8594
return _convert_int_vector(obj)
8695
elif isinstance(obj, robj.StrVector):
8796
return _convert_str_vector(obj)
88-
89-
return list(obj)
97+
# Check if the vector has extra information attached to it that can be used
98+
# as an index
99+
try:
100+
attributes = set(r['attributes'](obj).names)
101+
except AttributeError:
102+
return list(obj)
103+
if 'names' in attributes:
104+
return pd.Series(list(obj), index=r['names'](obj))
105+
elif 'tsp' in attributes:
106+
return pd.Series(list(obj), index=r['time'](obj))
107+
elif 'labels' in attributes:
108+
return pd.Series(list(obj), index=r['labels'](obj))
109+
if _rclass(obj) == 'dist':
110+
# For 'eurodist'. WARNING: This results in a DataFrame, not a Series or list.
111+
matrix = r['as.matrix'](obj)
112+
return convert_robj(matrix)
113+
else:
114+
return list(obj)
90115

91116
NA_INTEGER = -2147483648
92117

@@ -141,8 +166,7 @@ def _convert_Matrix(mat):
141166
rows = mat.rownames
142167

143168
columns = None if _is_null(columns) else list(columns)
144-
index = None if _is_null(rows) else list(rows)
145-
169+
index = r['time'](mat) if _is_null(rows) else list(rows)
146170
return pd.DataFrame(np.array(mat), index=_check_int(index),
147171
columns=columns)
148172

@@ -197,7 +221,7 @@ def convert_robj(obj, use_pandas=True):
197221
if isinstance(obj, rpy_type):
198222
return converter(obj)
199223

200-
raise Exception('Do not know what to do with %s object' % type(obj))
224+
raise TypeError('Do not know what to do with %s object' % type(obj))
201225

202226

203227
def convert_to_r_posixct(obj):
@@ -329,117 +353,5 @@ def convert_to_r_matrix(df, strings_as_factors=False):
329353

330354
return r_matrix
331355

332-
333-
def test_convert_list():
334-
obj = r('list(a=1, b=2, c=3)')
335-
336-
converted = convert_robj(obj)
337-
expected = {'a': [1], 'b': [2], 'c': [3]}
338-
339-
_test.assert_dict_equal(converted, expected)
340-
341-
342-
def test_convert_nested_list():
343-
obj = r('list(a=list(foo=1, bar=2))')
344-
345-
converted = convert_robj(obj)
346-
expected = {'a': {'foo': [1], 'bar': [2]}}
347-
348-
_test.assert_dict_equal(converted, expected)
349-
350-
351-
def test_convert_frame():
352-
# built-in dataset
353-
df = r['faithful']
354-
355-
converted = convert_robj(df)
356-
357-
assert np.array_equal(converted.columns, ['eruptions', 'waiting'])
358-
assert np.array_equal(converted.index, np.arange(1, 273))
359-
360-
361-
def _test_matrix():
362-
r('mat <- matrix(rnorm(9), ncol=3)')
363-
r('colnames(mat) <- c("one", "two", "three")')
364-
r('rownames(mat) <- c("a", "b", "c")')
365-
366-
return r['mat']
367-
368-
369-
def test_convert_matrix():
370-
mat = _test_matrix()
371-
372-
converted = convert_robj(mat)
373-
374-
assert np.array_equal(converted.index, ['a', 'b', 'c'])
375-
assert np.array_equal(converted.columns, ['one', 'two', 'three'])
376-
377-
378-
def test_convert_r_dataframe():
379-
380-
is_na = robj.baseenv.get("is.na")
381-
382-
seriesd = _test.getSeriesData()
383-
frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
384-
385-
# Null data
386-
frame["E"] = [np.nan for item in frame["A"]]
387-
# Some mixed type data
388-
frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]
389-
390-
r_dataframe = convert_to_r_dataframe(frame)
391-
392-
assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
393-
assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
394-
assert all(is_na(item) for item in r_dataframe.rx2("E"))
395-
396-
for column in frame[["A", "B", "C", "D"]]:
397-
coldata = r_dataframe.rx2(column)
398-
original_data = frame[column]
399-
assert np.array_equal(convert_robj(coldata), original_data)
400-
401-
for column in frame[["D", "E"]]:
402-
for original, converted in zip(frame[column],
403-
r_dataframe.rx2(column)):
404-
405-
if pd.isnull(original):
406-
assert is_na(converted)
407-
else:
408-
assert original == converted
409-
410-
411-
def test_convert_r_matrix():
412-
413-
is_na = robj.baseenv.get("is.na")
414-
415-
seriesd = _test.getSeriesData()
416-
frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
417-
# Null data
418-
frame["E"] = [np.nan for item in frame["A"]]
419-
420-
r_dataframe = convert_to_r_matrix(frame)
421-
422-
assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
423-
assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
424-
assert all(is_na(item) for item in r_dataframe.rx(True, "E"))
425-
426-
for column in frame[["A", "B", "C", "D"]]:
427-
coldata = r_dataframe.rx(True, column)
428-
original_data = frame[column]
429-
assert np.array_equal(convert_robj(coldata),
430-
original_data)
431-
432-
# Pandas bug 1282
433-
frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]
434-
435-
# FIXME: Ugly, this whole module needs to be ported to nose/unittest
436-
try:
437-
wrong_matrix = convert_to_r_matrix(frame)
438-
except TypeError:
439-
pass
440-
except Exception:
441-
raise
442-
443-
444356
if __name__ == '__main__':
445357
pass

pandas/rpy/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)