ENH: Allow load_data to load the "Titanic" and other problematic R datasets

unutbu · unutbu · commit 96033e0c9cc6 · 2013-10-02T05:40:54.000-04:00
TST: Move tests from rpy/common.py to rpy/tests/test_common.py
TST: Add tests to demonstrate the enhancements made to rpy/common.py.
DOC: Add explanation to doc/source/release.rst and doc/source/v0.13.0.txt
diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst
@@ -20,7 +20,7 @@ its release 2.3, while the current interface is
 designed for the 2.2.x series. We recommend to use 2.2.x over other series 
 unless you are prepared to fix parts of the code, yet the rpy2-2.3.0
 introduces improvements such as a better R-Python bridge memory management
-layer so I might be a good idea to bite the bullet and submit patches for
+layer so it might be a good idea to bite the bullet and submit patches for
 the few minor differences that need to be fixed.
 
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -160,6 +160,10 @@ Improvements to existing features
     :issue:`4998`)
   - ``to_dict`` now takes ``records`` as a possible outtype.  Returns an array
     of column-keyed dictionaries. (:issue:`4936`)
+  - Improve support for converting R datasets to pandas objects (more
+    informative index for timeseries and numeric vectors, support for factors,
+    dist, and high-dimensional arrays).
+
 
 API Changes
 ~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -480,6 +480,12 @@ Enhancements
          dfi[mask.any(1)]
 
       :ref:`See the docs<indexing.basics.indexing_isin>` for more.
+  - All R datasets listed at http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html can now be loaded into pandas objects
+
+    .. code-block:: python
+
+       import pandas.rpy.common as com
+       com.load_data('Titanic')
 
 .. _whatsnew_0130.experimental:
 
diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py
@@ -15,6 +15,9 @@
 from rpy2.robjects import r
 import rpy2.robjects as robj
 
+import itertools as IT
+
+
 __all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe',
            'convert_to_r_matrix']
 
@@ -46,47 +49,69 @@ def _is_null(obj):
 
 def _convert_list(obj):
     """
-    Convert named Vector to dict
+    Convert named Vector to dict, factors to list
     """
-    values = [convert_robj(x) for x in obj]
-    return dict(zip(obj.names, values))
+    try:
+        values = [convert_robj(x) for x in obj]
+        keys = r['names'](obj)
+        return dict(zip(keys, values))
+    except TypeError:
+        # For state.division and state.region
+        factors = list(r['factor'](obj))
+        level = list(r['levels'](obj))
+        result = [level[index-1] for index in factors]
+        return result
 
 
 def _convert_array(obj):
     """
-    Convert Array to ndarray
+    Convert Array to DataFrame
     """
-    # this royally sucks. "Matrices" (arrays) with dimension > 3 in R aren't
-    # really matrices-- things come out Fortran order in the first two
-    # dimensions. Maybe I'm wrong?
-
+    def _list(item):
+        try:
+            return list(item)
+        except TypeError:
+            return []
+        
+    # For iris3, HairEyeColor, UCBAdmissions, Titanic
     dim = list(obj.dim)
     values = np.array(list(obj))
-
-    if len(dim) == 3:
-        arr = values.reshape(dim[-1:] + dim[:-1]).swapaxes(1, 2)
-
-    if obj.names is not None:
-        name_list = [list(x) for x in obj.names]
-        if len(dim) == 2:
-            return pd.DataFrame(arr, index=name_list[0], columns=name_list[1])
-        elif len(dim) == 3:
-            return pd.Panel(arr, items=name_list[2],
-                            major_axis=name_list[0],
-                            minor_axis=name_list[1])
-        else:
-            print('Cannot handle dim=%d' % len(dim))
-    else:
-        return arr
+    names = r['dimnames'](obj)
+    try:
+        columns = list(r['names'](names))[::-1]
+    except TypeError:
+        columns = ['X{:d}'.format(i) for i in range(len(names))][::-1]
+    columns.append('value')
+    name_list = [(_list(x) or range(d)) for x, d in zip(names, dim)][::-1]
+    arr = np.array(list(IT.product(*name_list)))
+    arr = np.column_stack([arr,values])
+    df = pd.DataFrame(arr, columns=columns)
+    return df
 
 
 def _convert_vector(obj):
     if isinstance(obj, robj.IntVector):
         return _convert_int_vector(obj)
     elif isinstance(obj, robj.StrVector):
         return _convert_str_vector(obj)
-
-    return list(obj)
+    # Check if the vector has extra information attached to it that can be used
+    # as an index
+    try:
+        attributes = set(r['attributes'](obj).names)
+    except AttributeError:
+        return list(obj)
+    if 'names' in attributes:
+        return pd.Series(list(obj), index=r['names'](obj)) 
+    elif 'tsp' in attributes:
+        return pd.Series(list(obj), index=r['time'](obj)) 
+    elif 'labels' in attributes:
+        return pd.Series(list(obj), index=r['labels'](obj))
+    if _rclass(obj) == 'dist':
+        # For 'eurodist'. WARNING: This results in a DataFrame, not a Series or list.
+        matrix = r['as.matrix'](obj)
+        return convert_robj(matrix)
+    else:
+        return list(obj)
 
 NA_INTEGER = -2147483648
 
@@ -141,8 +166,7 @@ def _convert_Matrix(mat):
     rows = mat.rownames
 
     columns = None if _is_null(columns) else list(columns)
-    index = None if _is_null(rows) else list(rows)
-
+    index = r['time'](mat) if _is_null(rows) else list(rows)
     return pd.DataFrame(np.array(mat), index=_check_int(index),
                         columns=columns)
 
@@ -197,7 +221,7 @@ def convert_robj(obj, use_pandas=True):
         if isinstance(obj, rpy_type):
             return converter(obj)
 
-    raise Exception('Do not know what to do with %s object' % type(obj))
+    raise TypeError('Do not know what to do with %s object' % type(obj))
 
 
 def convert_to_r_posixct(obj):
@@ -329,117 +353,5 @@ def convert_to_r_matrix(df, strings_as_factors=False):
 
     return r_matrix
 
-
-def test_convert_list():
-    obj = r('list(a=1, b=2, c=3)')
-
-    converted = convert_robj(obj)
-    expected = {'a': [1], 'b': [2], 'c': [3]}
-
-    _test.assert_dict_equal(converted, expected)
-
-
-def test_convert_nested_list():
-    obj = r('list(a=list(foo=1, bar=2))')
-
-    converted = convert_robj(obj)
-    expected = {'a': {'foo': [1], 'bar': [2]}}
-
-    _test.assert_dict_equal(converted, expected)
-
-
-def test_convert_frame():
-    # built-in dataset
-    df = r['faithful']
-
-    converted = convert_robj(df)
-
-    assert np.array_equal(converted.columns, ['eruptions', 'waiting'])
-    assert np.array_equal(converted.index, np.arange(1, 273))
-
-
-def _test_matrix():
-    r('mat <- matrix(rnorm(9), ncol=3)')
-    r('colnames(mat) <- c("one", "two", "three")')
-    r('rownames(mat) <- c("a", "b", "c")')
-
-    return r['mat']
-
-
-def test_convert_matrix():
-    mat = _test_matrix()
-
-    converted = convert_robj(mat)
-
-    assert np.array_equal(converted.index, ['a', 'b', 'c'])
-    assert np.array_equal(converted.columns, ['one', 'two', 'three'])
-
-
-def test_convert_r_dataframe():
-
-    is_na = robj.baseenv.get("is.na")
-
-    seriesd = _test.getSeriesData()
-    frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
-
-    # Null data
-    frame["E"] = [np.nan for item in frame["A"]]
-    # Some mixed type data
-    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]
-
-    r_dataframe = convert_to_r_dataframe(frame)
-
-    assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
-    assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
-    assert all(is_na(item) for item in r_dataframe.rx2("E"))
-
-    for column in frame[["A", "B", "C", "D"]]:
-        coldata = r_dataframe.rx2(column)
-        original_data = frame[column]
-        assert np.array_equal(convert_robj(coldata), original_data)
-
-    for column in frame[["D", "E"]]:
-        for original, converted in zip(frame[column],
-                                       r_dataframe.rx2(column)):
-
-            if pd.isnull(original):
-                assert is_na(converted)
-            else:
-                assert original == converted
-
-
-def test_convert_r_matrix():
-
-    is_na = robj.baseenv.get("is.na")
-
-    seriesd = _test.getSeriesData()
-    frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A'])
-    # Null data
-    frame["E"] = [np.nan for item in frame["A"]]
-
-    r_dataframe = convert_to_r_matrix(frame)
-
-    assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index)
-    assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns)
-    assert all(is_na(item) for item in r_dataframe.rx(True, "E"))
-
-    for column in frame[["A", "B", "C", "D"]]:
-        coldata = r_dataframe.rx(True, column)
-        original_data = frame[column]
-        assert np.array_equal(convert_robj(coldata),
-                              original_data)
-
-    # Pandas bug 1282
-    frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)]
-
-    # FIXME: Ugly, this whole module needs to be ported to nose/unittest
-    try:
-        wrong_matrix = convert_to_r_matrix(frame)
-    except TypeError:
-        pass
-    except Exception:
-        raise
-
-
 if __name__ == '__main__':
     pass
diff --git a/pandas/rpy/tests/__init__.py b/pandas/rpy/tests/__init__.py
diff --git a/pandas/rpy/tests/test_common.py b/pandas/rpy/tests/test_common.py