[qob] Fix IBD and enable tests (#14062)

jigold · web-flow · commit 965f10257513 · 2024-01-09T10:13:33.000-05:00
CHANGELOG: Fixed bugs in the identity by descent implementation for Query on Batch. This PR fixes #14052. There were two bugs in how we compute IBD: the second term of `_e10` divided its `Y` factors by `X` instead of `Y`, and `_e22` was set to `T / 2` instead of `1`. In addition, the tests weren't running in QoB, and the test dataset we were using doesn't have enough variability to catch errors. I used Balding-Nichols generated data instead. Do we need to set the seed in the tests here?
diff --git a/hail/python/hail/methods/relatedness/identity_by_descent.py b/hail/python/hail/methods/relatedness/identity_by_descent.py
@@ -147,7 +147,7 @@ def identity_by_descent(dataset, maf=None, bounded=True, min=None, max=None) ->
         _e00=(2 * (p**2) * (q**2) * ((X - 1) / X) * ((Y - 1) / Y) * (T / (T - 1)) * (T / (T - 2)) * (T / (T - 3))),
         _e10=(
             4 * (p**3) * q * ((X - 1) / X) * ((X - 2) / X) * (T / (T - 1)) * (T / (T - 2)) * (T / (T - 3))
-            + 4 * p * (q**3) * ((Y - 1) / X) * ((Y - 2) / X) * (T / (T - 1)) * (T / (T - 2)) * (T / (T - 3))
+            + 4 * p * (q**3) * ((Y - 1) / Y) * ((Y - 2) / Y) * (T / (T - 1)) * (T / (T - 2)) * (T / (T - 3))
         ),
         _e20=(
             (p**4) * ((X - 1) / X) * ((X - 2) / X) * ((X - 3) / X) * (T / (T - 1)) * (T / (T - 2)) * (T / (T - 3))
@@ -164,7 +164,7 @@ def identity_by_descent(dataset, maf=None, bounded=True, min=None, max=None) ->
             + (p**2) * q * ((X - 1) / X) * (T / (T - 1)) * (T / (T - 2))
             + p * (q**2) * ((Y - 1) / Y) * (T / (T - 1)) * (T / (T - 2))
         ),
-        _e22=(T / 2),
+        _e22=1,
     )
 
     dataset = dataset.checkpoint(hl.utils.new_temp_file())
diff --git a/hail/python/test/hail/methods/relatedness/test_identity_by_descent.py b/hail/python/test/hail/methods/relatedness/test_identity_by_descent.py
@@ -5,12 +5,19 @@
 
 import hail as hl
 import hail.utils as utils
-from ...helpers import get_dataset, test_timeout, qobtest
+from ...helpers import test_timeout, qobtest
 
 
-def plinkify(ds, min=None, max=None):
+@pytest.fixture(scope='module')
+def ds():
+    dataset = hl.balding_nichols_model(1, 100, 100)
+    dataset = dataset.key_cols_by(s=hl.str(dataset.sample_idx + 1))
+    return dataset
+
+
+def plinkify(dataset, min=None, max=None):
     vcf = utils.new_temp_file(prefix="plink", extension="vcf")
-    hl.export_vcf(ds, vcf)
+    hl.export_vcf(dataset, vcf)
 
     local_tmpdir = utils.new_local_temp_dir()
     plinkpath = f'{local_tmpdir}/plink-ibd'
@@ -45,9 +52,7 @@ def plinkify(ds, min=None, max=None):
 @qobtest
 @unittest.skipIf('HAIL_TEST_SKIP_PLINK' in os.environ, 'Skipping tests requiring plink')
 @test_timeout(local=10 * 60, batch=10 * 60)
-def test_ibd_default_arguments():
-    ds = get_dataset()
-
+def test_ibd_default_arguments(ds):
     plink_results = plinkify(ds)
     hail_results = hl.identity_by_descent(ds).collect()
 
@@ -62,11 +67,10 @@ def test_ibd_default_arguments():
         assert plink_results[key][1][2] == row.ibs2
 
 
+@qobtest
 @unittest.skipIf('HAIL_TEST_SKIP_PLINK' in os.environ, 'Skipping tests requiring plink')
 @test_timeout(local=10 * 60, batch=10 * 60)
-def test_ibd_0_and_1():
-    ds = get_dataset()
-
+def test_ibd_0_and_1(ds):
     plink_results = plinkify(ds, min=0.0, max=1.0)
     hail_results = hl.identity_by_descent(ds).collect()
 
@@ -81,15 +85,15 @@ def test_ibd_0_and_1():
         assert plink_results[key][1][2] == row.ibs2
 
 
+@qobtest
 @test_timeout(local=10 * 60, batch=10 * 60)
-def test_ibd_does_not_error_with_dummy_maf_float64():
-    dataset = get_dataset()
-    dataset = dataset.annotate_rows(dummy_maf=0.01)
-    hl.identity_by_descent(dataset, dataset['dummy_maf'], min=0.0, max=1.0)
+def test_ibd_does_not_error_with_dummy_maf_float64(ds):
+    ds = ds.annotate_rows(dummy_maf=0.01)
+    hl.identity_by_descent(ds, ds['dummy_maf'], min=0.0, max=1.0)
 
 
+@qobtest
 @test_timeout(local=10 * 60, batch=10 * 60)
-def test_ibd_does_not_error_with_dummy_maf_float32():
-    dataset = get_dataset()
-    dataset = dataset.annotate_rows(dummy_maf=0.01)
-    hl.identity_by_descent(dataset, hl.float32(dataset['dummy_maf']), min=0.0, max=1.0)
+def test_ibd_does_not_error_with_dummy_maf_float32(ds):
+    ds = ds.annotate_rows(dummy_maf=0.01)
+    hl.identity_by_descent(ds, hl.float32(ds['dummy_maf']), min=0.0, max=1.0)