Skip to content

Commit 262b379

Browse files
jreback authored and wesm committed
ERR: [Python] infer_dtypes for better error messages on write_dataframe
ERR: nicer error message on passing duplicate columns, xref #53
ERR: nice error message on serializing python objects, closes #240
should be after #239
xref pandas-dev/pandas#14383
Author: Jeff Reback <[email protected]>
Closes #244 from jreback/infer_dtype and squashes the following commits:
091ee60 [Jeff Reback] fix pandas < 0.19.0 compat
6ef3a47 [Jeff Reback] fixed up mixed with embedded nulls tests
560b9e6 [Jeff Reback] ERR: nicer error message on passing duplicate columns, xref #53
1 parent f6184d5 commit 262b379

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

python/feather/api.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import six
1616
from distutils.version import LooseVersion
1717
import pandas as pd
18+
from feather.compat import pdapi
1819

1920
import feather.ext as ext
2021

@@ -27,14 +28,33 @@ def write_dataframe(df, path):
2728
Write a pandas.DataFrame to Feather format
2829
'''
2930
writer = ext.FeatherWriter(path)
30-
31+
3132
if isinstance(df, pd.SparseDataFrame):
3233
df = df.to_dense()
3334

35+
if not df.columns.is_unique:
36+
raise ValueError("cannot serialize duplicate column names")
37+
3438
# TODO(wesm): pipeline conversion to Arrow memory layout
3539
for i, name in enumerate(df.columns):
3640
col = df.iloc[:, i]
3741

42+
if pdapi.is_object_dtype(col):
43+
inferred_type = pd.lib.infer_dtype(col)
44+
msg = ("cannot serialize column {n} "
45+
"named {name} with dtype {dtype}".format(
46+
n=i, name=name, dtype=inferred_type))
47+
48+
if inferred_type in ['mixed']:
49+
50+
# allow columns with nulls + an inferable type
51+
inferred_type = pd.lib.infer_dtype(col[col.notnull()])
52+
if inferred_type in ['mixed']:
53+
raise ValueError(msg)
54+
55+
elif inferred_type not in ['unicode', 'string']:
56+
raise ValueError(msg)
57+
3858
if not isinstance(name, six.string_types):
3959
name = str(name)
4060

python/feather/tests/test_reader.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,18 @@ def _check_pandas_roundtrip(self, df, expected=None, path=None,
8282

8383
np.testing.assert_array_equal(self._get_null_counts(path, columns), null_counts)
8484

85+
def _assert_error_on_write(self, df, exc, path=None):
86+
# check that we are raising the exception
87+
# on writing
88+
89+
if path is None:
90+
path = random_path()
91+
92+
self.test_files.append(path)
93+
def f():
94+
feather.write_dataframe(df, path)
95+
self.assertRaises(exc, f)
96+
8597
def test_num_rows_attr(self):
8698
df = pd.DataFrame({'foo': [1, 2, 3, 4, 5]})
8799
path = random_path()
@@ -237,10 +249,20 @@ def test_boolean_object_nulls(self):
237249

238250
def test_strings(self):
239251
repeats = 1000
252+
253+
# we have mixed bytes, unicode, strings
240254
values = [b'foo', None, u'bar', 'qux', np.nan]
241255
df = pd.DataFrame({'strings': values * repeats})
256+
self._assert_error_on_write(df, ValueError)
242257

258+
# embedded nulls are ok
243259
values = ['foo', None, 'bar', 'qux', None]
260+
df = pd.DataFrame({'strings': values * repeats})
261+
expected = pd.DataFrame({'strings': values * repeats})
262+
self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats])
263+
264+
values = ['foo', None, 'bar', 'qux', np.nan]
265+
df = pd.DataFrame({'strings': values * repeats})
244266
expected = pd.DataFrame({'strings': values * repeats})
245267
self._check_pandas_roundtrip(df, expected, null_counts=[2 * repeats])
246268

@@ -300,3 +322,23 @@ def test_sparse_dataframe(self):
300322
df = pd.DataFrame(data).to_sparse(fill_value=1)
301323
expected = df.to_dense()
302324
self._check_pandas_roundtrip(df, expected)
325+
326+
def test_duplicate_columns(self):
327+
328+
# https://github.com/wesm/feather/issues/53
329+
# not currently able to handle duplicate columns
330+
df = pd.DataFrame(np.arange(12).reshape(4, 3),
331+
columns=list('aaa')).copy()
332+
self._assert_error_on_write(df, ValueError)
333+
334+
def test_unsupported(self):
335+
# https://github.com/wesm/feather/issues/240
336+
# serializing actual python objects
337+
338+
# period
339+
df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
340+
self._assert_error_on_write(df, ValueError)
341+
342+
# non-strings
343+
df = pd.DataFrame({'a': ['a', 1, 2.0]})
344+
self._assert_error_on_write(df, ValueError)

0 commit comments

Comments
 (0)