|
15 | 15 | from rpy2.robjects import r
|
16 | 16 | import rpy2.robjects as robj
|
17 | 17 |
|
| 18 | +import itertools as IT |
| 19 | + |
| 20 | + |
18 | 21 | __all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe',
|
19 | 22 | 'convert_to_r_matrix']
|
20 | 23 |
|
@@ -46,47 +49,69 @@ def _is_null(obj):
|
46 | 49 |
|
def _convert_list(obj):
    """
    Convert named Vector to dict, factors to list

    Each element of ``obj`` is first converted with ``convert_robj`` and
    paired with the result of R's ``names(obj)`` to build a dict.  If any
    step of that raises ``TypeError``, ``obj`` is instead treated as a
    factor and a list holding the level label of every entry is returned.
    """
    try:
        converted = [convert_robj(item) for item in obj]
        return dict(zip(r['names'](obj), converted))
    except TypeError:
        # For state.division and state.region
        codes = list(r['factor'](obj))
        labels = list(r['levels'](obj))
        # Codes are used as 1-based positions into the level labels.
        return [labels[code - 1] for code in codes]
53 | 64 |
|
54 | 65 |
|
def _convert_array(obj):
    """
    Convert Array to DataFrame

    The result is in "long" form: one row per array cell, one column per
    array dimension holding that cell's label along the dimension, and a
    final ``'value'`` column holding the cell itself.

    Column names come from R's ``names(dimnames(obj))``; when that raises
    ``TypeError`` they fall back to ``X0``, ``X1``, ...  A dimension whose
    labels are missing or empty is labelled positionally with
    ``range(size)``.

    The labels and the values are kept in separate columns from the start
    so each keeps its own dtype.  Stacking them into a single ndarray (as
    ``np.column_stack`` does) would coerce numeric values to strings
    whenever the labels are strings.
    """
    def _list(item):
        # Anything that cannot be iterated (presumably an R NULL for an
        # unlabelled dimension -- confirm) becomes an empty list so the
        # caller can substitute positional labels.
        try:
            return list(item)
        except TypeError:
            return []

    # For iris3, HairEyeColor, UCBAdmissions, Titanic
    dim = list(obj.dim)
    values = np.array(list(obj))
    names = r['dimnames'](obj)
    try:
        columns = list(r['names'](names))[::-1]
    except TypeError:
        columns = ['X{:d}'.format(i) for i in range(len(names))][::-1]

    # Axes are reversed because itertools.product varies its LAST argument
    # fastest; this assumes list(obj) yields cells with the first dimension
    # varying fastest (column-major) -- TODO confirm against rpy2.
    name_list = [(_list(x) or range(d)) for x, d in zip(names, dim)][::-1]

    df = pd.DataFrame(list(IT.product(*name_list)), columns=columns)
    # insert() rather than df['value'] = ... so that a dimension that is
    # itself named 'value' is not silently overwritten.
    df.insert(len(df.columns), 'value', values, allow_duplicates=True)
    return df
81 | 90 |
|
82 | 91 |
|
def _convert_vector(obj):
    """
    Convert a Vector to a list, Series or (for 'dist' objects) DataFrame

    Int and Str vectors are delegated to their dedicated converters.  For
    any other vector the R attributes are inspected: the first of
    ``names``, ``tsp`` or ``labels`` that is present supplies the index of
    a Series (via R's ``names``, ``time`` and ``labels`` respectively).
    Objects of R class ``dist`` are converted through ``as.matrix``;
    everything else becomes a plain list.
    """
    if isinstance(obj, robj.IntVector):
        return _convert_int_vector(obj)
    if isinstance(obj, robj.StrVector):
        return _convert_str_vector(obj)

    # Check if the vector has extra information attached to it that can be
    # used as an index
    try:
        attrs = set(r['attributes'](obj).names)
    except AttributeError:
        return list(obj)

    # (attribute to look for, R function that produces the index), in
    # priority order.
    index_sources = (('names', 'names'), ('tsp', 'time'), ('labels', 'labels'))
    for attr_name, r_func in index_sources:
        if attr_name in attrs:
            return pd.Series(list(obj), index=r[r_func](obj))

    if _rclass(obj) != 'dist':
        return list(obj)

    # For 'eurodist'. WARNING: This results in a DataFrame, not a Series or list.
    return convert_robj(r['as.matrix'](obj))
90 | 115 |
|
91 | 116 | NA_INTEGER = -2147483648
|
92 | 117 |
|
@@ -141,8 +166,7 @@ def _convert_Matrix(mat):
|
141 | 166 | rows = mat.rownames
|
142 | 167 |
|
143 | 168 | columns = None if _is_null(columns) else list(columns)
|
144 |
| - index = None if _is_null(rows) else list(rows) |
145 |
| - |
| 169 | + index = r['time'](mat) if _is_null(rows) else list(rows) |
146 | 170 | return pd.DataFrame(np.array(mat), index=_check_int(index),
|
147 | 171 | columns=columns)
|
148 | 172 |
|
@@ -197,7 +221,7 @@ def convert_robj(obj, use_pandas=True):
|
197 | 221 | if isinstance(obj, rpy_type):
|
198 | 222 | return converter(obj)
|
199 | 223 |
|
200 |
| - raise Exception('Do not know what to do with %s object' % type(obj)) |
| 224 | + raise TypeError('Do not know what to do with %s object' % type(obj)) |
201 | 225 |
|
202 | 226 |
|
203 | 227 | def convert_to_r_posixct(obj):
|
@@ -329,117 +353,5 @@ def convert_to_r_matrix(df, strings_as_factors=False):
|
329 | 353 |
|
330 | 354 | return r_matrix
|
331 | 355 |
|
332 |
| - |
333 |
| -def test_convert_list(): |
334 |
| - obj = r('list(a=1, b=2, c=3)') |
335 |
| - |
336 |
| - converted = convert_robj(obj) |
337 |
| - expected = {'a': [1], 'b': [2], 'c': [3]} |
338 |
| - |
339 |
| - _test.assert_dict_equal(converted, expected) |
340 |
| - |
341 |
| - |
342 |
| -def test_convert_nested_list(): |
343 |
| - obj = r('list(a=list(foo=1, bar=2))') |
344 |
| - |
345 |
| - converted = convert_robj(obj) |
346 |
| - expected = {'a': {'foo': [1], 'bar': [2]}} |
347 |
| - |
348 |
| - _test.assert_dict_equal(converted, expected) |
349 |
| - |
350 |
| - |
351 |
| -def test_convert_frame(): |
352 |
| - # built-in dataset |
353 |
| - df = r['faithful'] |
354 |
| - |
355 |
| - converted = convert_robj(df) |
356 |
| - |
357 |
| - assert np.array_equal(converted.columns, ['eruptions', 'waiting']) |
358 |
| - assert np.array_equal(converted.index, np.arange(1, 273)) |
359 |
| - |
360 |
| - |
361 |
| -def _test_matrix(): |
362 |
| - r('mat <- matrix(rnorm(9), ncol=3)') |
363 |
| - r('colnames(mat) <- c("one", "two", "three")') |
364 |
| - r('rownames(mat) <- c("a", "b", "c")') |
365 |
| - |
366 |
| - return r['mat'] |
367 |
| - |
368 |
| - |
369 |
| -def test_convert_matrix(): |
370 |
| - mat = _test_matrix() |
371 |
| - |
372 |
| - converted = convert_robj(mat) |
373 |
| - |
374 |
| - assert np.array_equal(converted.index, ['a', 'b', 'c']) |
375 |
| - assert np.array_equal(converted.columns, ['one', 'two', 'three']) |
376 |
| - |
377 |
| - |
378 |
| -def test_convert_r_dataframe(): |
379 |
| - |
380 |
| - is_na = robj.baseenv.get("is.na") |
381 |
| - |
382 |
| - seriesd = _test.getSeriesData() |
383 |
| - frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) |
384 |
| - |
385 |
| - # Null data |
386 |
| - frame["E"] = [np.nan for item in frame["A"]] |
387 |
| - # Some mixed type data |
388 |
| - frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)] |
389 |
| - |
390 |
| - r_dataframe = convert_to_r_dataframe(frame) |
391 |
| - |
392 |
| - assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index) |
393 |
| - assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns) |
394 |
| - assert all(is_na(item) for item in r_dataframe.rx2("E")) |
395 |
| - |
396 |
| - for column in frame[["A", "B", "C", "D"]]: |
397 |
| - coldata = r_dataframe.rx2(column) |
398 |
| - original_data = frame[column] |
399 |
| - assert np.array_equal(convert_robj(coldata), original_data) |
400 |
| - |
401 |
| - for column in frame[["D", "E"]]: |
402 |
| - for original, converted in zip(frame[column], |
403 |
| - r_dataframe.rx2(column)): |
404 |
| - |
405 |
| - if pd.isnull(original): |
406 |
| - assert is_na(converted) |
407 |
| - else: |
408 |
| - assert original == converted |
409 |
| - |
410 |
| - |
411 |
| -def test_convert_r_matrix(): |
412 |
| - |
413 |
| - is_na = robj.baseenv.get("is.na") |
414 |
| - |
415 |
| - seriesd = _test.getSeriesData() |
416 |
| - frame = pd.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) |
417 |
| - # Null data |
418 |
| - frame["E"] = [np.nan for item in frame["A"]] |
419 |
| - |
420 |
| - r_dataframe = convert_to_r_matrix(frame) |
421 |
| - |
422 |
| - assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index) |
423 |
| - assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns) |
424 |
| - assert all(is_na(item) for item in r_dataframe.rx(True, "E")) |
425 |
| - |
426 |
| - for column in frame[["A", "B", "C", "D"]]: |
427 |
| - coldata = r_dataframe.rx(True, column) |
428 |
| - original_data = frame[column] |
429 |
| - assert np.array_equal(convert_robj(coldata), |
430 |
| - original_data) |
431 |
| - |
432 |
| - # Pandas bug 1282 |
433 |
| - frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)] |
434 |
| - |
435 |
| - # FIXME: Ugly, this whole module needs to be ported to nose/unittest |
436 |
| - try: |
437 |
| - wrong_matrix = convert_to_r_matrix(frame) |
438 |
| - except TypeError: |
439 |
| - pass |
440 |
| - except Exception: |
441 |
| - raise |
442 |
| - |
443 |
| - |
444 | 356 | if __name__ == '__main__':
|
445 | 357 | pass
|
0 commit comments