Merge remote-tracking branch 'upstream/master' into Dr-Irv-issue9943

TomAugspurger · TomAugspurger · commit 0701fc8fd084 · 2018-11-01T15:20:23.000-07:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -1273,6 +1273,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
 - Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. (:issue:`21552`)
 - Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`)
 - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
+- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -14,10 +14,12 @@
 hypothesis.settings.register_profile(
     "ci",
     # Hypothesis timing checks are tuned for scalars by default, so we bump
-    # them from 200ms to 5 secs per test case as the global default.  If this
+    # them from 200ms to 500ms per test case as the global default.  If this
     # is too short for a specific test, (a) try to make it faster, and (b)
-    # if it really is slow add `@settings(timeout=...)` with a working value.
-    timeout=5000,
+    # if it really is slow add `@settings(deadline=...)` with a working value,
+    # or `deadline=None` to entirely disable timeouts for that test.
+    deadline=500,
+    timeout=hypothesis.unlimited,
     suppress_health_check=(hypothesis.HealthCheck.too_slow,)
 )
 hypothesis.settings.load_profile("ci")
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -4637,7 +4637,7 @@ def _convert_string_array(data, encoding, errors, itemsize=None):
     # create the sized dtype
     if itemsize is None:
         ensured = ensure_object(data.ravel())
-        itemsize = libwriters.max_len_string_array(ensured)
+        itemsize = max(1, libwriters.max_len_string_array(ensured))
 
     data = np.asarray(data, dtype="S%d" % itemsize)
     return data
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -823,6 +823,20 @@ def zip_frames(frames, axis=1):
         return pd.DataFrame(zipped)
 
 
+@composite
+def indices(draw, max_length=5):
+    date = draw(
+        dates(
+            min_value=Timestamp.min.ceil("D").to_pydatetime().date(),
+            max_value=Timestamp.max.floor("D").to_pydatetime().date(),
+        ).map(Timestamp)
+    )
+    periods = draw(integers(0, max_length))
+    freq = draw(sampled_from(list("BDHTS")))
+    dr = date_range(date, periods=periods, freq=freq)
+    return pd.DatetimeIndex(list(dr))
+
+
 class TestDataFrameAggregate():
 
     def test_agg_transform(self, axis, float_frame):
@@ -1142,20 +1156,7 @@ def test_agg_cython_table_raises(self, df, func, expected, axis):
         with pytest.raises(expected):
             df.agg(func, axis=axis)
 
-    @composite
-    def indices(draw, max_length=5):
-        date = draw(
-            dates(
-                min_value=Timestamp.min.ceil("D").to_pydatetime().date(),
-                max_value=Timestamp.max.floor("D").to_pydatetime().date(),
-            ).map(Timestamp)
-        )
-        periods = draw(integers(0, max_length))
-        freq = draw(sampled_from(list("BDHTS")))
-        dr = date_range(date, periods=periods, freq=freq)
-        return pd.DatetimeIndex(list(dr))
-
-    @given(index=indices(5), num_columns=integers(0, 5))
+    @given(index=indices(max_length=5), num_columns=integers(0, 5))
     def test_frequency_is_original(self, index, num_columns):
         # GH 22150
         original = index.copy()
diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py
@@ -8,14 +8,28 @@
 from pandas._libs.interval import IntervalTree
 
 
+def skipif_32bit(param):
+    """
+    Wrap a parametrize value so that it is skipped on 32-bit systems. Used
+    here to skip the leaf_size values affected by GH 23440.
+    """
+    marks = pytest.mark.skipif(compat.is_platform_32bit(),
+                               reason='GH 23440: int type mismatch on 32bit')
+    return pytest.param(param, marks=marks)
+
+
 @pytest.fixture(
     scope='class', params=['int32', 'int64', 'float32', 'float64', 'uint64'])
 def dtype(request):
     return request.param
 
 
-@pytest.fixture(params=[1, 2, 10])
+@pytest.fixture(params=[skipif_32bit(1), skipif_32bit(2), 10])
 def leaf_size(request):
+    """
+    Fixture to specify IntervalTree leaf_size parameter; to be used with the
+    tree fixture.
+    """
     return request.param
 
 
@@ -85,9 +99,8 @@ def test_get_loc_closed(self, closed):
                 tm.assert_numpy_array_equal(tree.get_loc(p),
                                             np.array([0], dtype='int64'))
 
-    @pytest.mark.skipif(compat.is_platform_32bit(),
-                        reason="int type mismatch on 32bit")
-    @pytest.mark.parametrize('leaf_size', [1, 10, 100, 10000])
+    @pytest.mark.parametrize('leaf_size', [
+        skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000])
     def test_get_indexer_closed(self, closed, leaf_size):
         x = np.arange(1000, dtype='float64')
         found = x.astype('intp')
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
@@ -1482,6 +1482,16 @@ def check_col(key, name, size):
             pytest.raises(ValueError, store.append, 'df',
                           df, min_itemsize={'foo': 20, 'foobar': 20})
 
+    def test_append_with_empty_string(self):
+
+        with ensure_clean_store(self.path) as store:
+
+            # second append holds only an empty string (GH 12242)
+            df = DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', '']})
+            store.append('df', df[:-1], min_itemsize={'x': 1})
+            store.append('df', df[-1:], min_itemsize={'x': 1})
+            tm.assert_frame_equal(store.select('df'), df)
+
     def test_to_hdf_with_min_itemsize(self):
 
         with ensure_clean_path(self.path) as path: