Fix covariance file writer

HippocampusGirl · HippocampusGirl · commit 32d5e6f4af3b · 2024-08-23T10:16:32.000+02:00
diff --git a/src/gwas/src/gwas/pheno.py b/src/gwas/src/gwas/pheno.py
@@ -10,6 +10,7 @@
 from .compression.arr.base import (
     CompressionMethod,
     FileArray,
+    FileArrayWriter,
 )
 from .log import logger
 from .mem.arr import SharedArray
@@ -362,20 +363,24 @@ def covariance_to_txt(
         data_frame = pd.DataFrame(array, index=self.samples, columns=names)
 
         logger.debug("Calculating covariance matrix")
-        covariance = data_frame.cov().to_numpy(dtype=np.float64)
+        covariance: npt.NDArray[np.float64] = np.asfortranarray(
+            data_frame.cov().to_numpy(dtype=np.float64)
+        )
 
-        file_array = FileArray.create(
+        writer: FileArrayWriter[np.float64] = FileArray.create(
             path,
             covariance.shape,
-            covariance.dtype,
+            covariance.dtype.type,
             compression_method,
             num_threads=num_threads,
         )
+
         data_frame = pd.DataFrame(dict(variable=names))
-        with file_array:
-            file_array.set_axis_metadata(0, data_frame)
-            file_array.set_axis_metadata(1, names)
-            file_array[:, :] = covariance
+        writer.set_axis_metadata(0, data_frame)
+        writer.set_axis_metadata(1, names)
+
+        with writer:
+            writer[:, :] = covariance
 
 
 @dataclass
diff --git a/src/gwas/tests/score/test_pheno.py b/src/gwas/tests/score/test_pheno.py
@@ -1,33 +1,44 @@
+import sys
+
 import numpy as np
 import pandas as pd
+import pytest
+from numpy import typing as npt
 from numpy.testing import assert_array_equal
 from pytest import FixtureRequest
 from upath import UPath
 
+from gwas.compression.arr.base import Blosc2CompressionMethod, compression_methods
 from gwas.mem.wkspace import SharedWorkspace
 from gwas.pheno import VariableCollection
+from gwas.utils import cpu_count
 
 from .simulation import missing_value_rate
 
+try:
+    import blosc2 as blosc2
+except ImportError:
+    pass
+
 sample_count = 100
 phenotype_count = 16
 covariate_count = 4
 
-samples = [str(i) for i in range(sample_count)]
-permutation = np.random.permutation(sample_count)
+samples = [f"{i + 1:03d}" for i in range(sample_count)]
 
 phenotype_names = [f"phenotype_{i + 1:02d}" for i in range(phenotype_count)]
 covariate_names = [f"covariate_{i + 1:02d}" for i in range(covariate_count)]
 
 
-def test_pheno(
-    tmp_path: UPath,
-    sw: SharedWorkspace,
-    request: FixtureRequest,
-) -> None:
-    np.random.seed(47)
-    allocation_names = set(sw.allocations.keys())
+@pytest.fixture(scope="session")
+def permutation() -> npt.NDArray[np.int_]:
+    np.random.seed(46)
+    return np.random.permutation(sample_count)
 
+
+@pytest.fixture(scope="session")
+def phenotypes() -> npt.NDArray[np.float64]:
+    np.random.seed(47)
     phenotypes = np.random.rand(sample_count, phenotype_count)
     phenotypes[
         np.random.choice(
@@ -36,8 +47,22 @@ def test_pheno(
             p=[1 - missing_value_rate, missing_value_rate],
         )
     ] = np.nan
-    covariates = np.random.rand(sample_count, covariate_count)
+    return phenotypes
 
+
+@pytest.fixture(scope="session")
+def covariates() -> npt.NDArray[np.float64]:
+    np.random.seed(48)
+    return np.random.rand(sample_count, covariate_count)
+
+
+@pytest.fixture(scope="session")
+def phenotype_path(
+    phenotypes: npt.NDArray[np.float64],
+    permutation: npt.NDArray[np.int_],
+    tmp_path_factory: pytest.TempPathFactory,
+) -> UPath:
+    tmp_path = UPath(tmp_path_factory.mktemp("phenotypes"))
     phenotype_frame = pd.DataFrame(
         phenotypes[permutation, :],
         columns=phenotype_names,
@@ -47,7 +72,16 @@ def test_pheno(
     phenotype_frame.to_csv(
         phenotype_path, sep="\t", index=True, header=True, na_rep="n/a"
     )
+    return phenotype_path
 
+
+@pytest.fixture(scope="session")
+def covariate_path(
+    covariates: npt.NDArray[np.float64],
+    permutation: npt.NDArray[np.int_],
+    tmp_path_factory: pytest.TempPathFactory,
+) -> UPath:
+    tmp_path = UPath(tmp_path_factory.mktemp("covariates"))
     covariate_frame = pd.DataFrame(
         covariates[permutation, :],
         columns=covariate_names,
@@ -57,6 +91,18 @@ def test_pheno(
     covariate_frame.to_csv(
         covariate_path, sep="\t", index=True, header=True, na_rep="n/a"
     )
+    return covariate_path
+
+
+def test_pheno(
+    phenotypes: npt.NDArray[np.float64],
+    covariates: npt.NDArray[np.float64],
+    phenotype_path: UPath,
+    covariate_path: UPath,
+    sw: SharedWorkspace,
+    request: FixtureRequest,
+) -> None:
+    allocation_names = set(sw.allocations.keys())
 
     variable_collection0 = VariableCollection.from_txt(
         [phenotype_path],
@@ -135,3 +181,40 @@ def test_pheno_zero_variance(
         variable_collection.covariates.name,
     }
     assert set(sw.allocations.keys()) <= (allocation_names | new_allocation_names)
+
+
+@pytest.mark.parametrize("compression_method_name", compression_methods.keys())
+def test_covariance(
+    compression_method_name: str,
+    phenotype_path: UPath,
+    covariate_path: UPath,
+    sw: SharedWorkspace,
+    tmp_path: UPath,
+    request: FixtureRequest,
+) -> None:
+    compression_method = compression_methods[compression_method_name]
+    if isinstance(compression_method, Blosc2CompressionMethod):
+        if "blosc2" not in sys.modules:
+            pytest.skip("blosc2 not installed")
+
+    allocation_names = set(sw.allocations.keys())
+
+    variable_collection = VariableCollection.from_txt(
+        [phenotype_path],
+        [covariate_path],
+        sw,
+        samples=samples,
+        missing_value_strategy="listwise_deletion",
+    )
+    request.addfinalizer(variable_collection.free)
+
+    covariance_path = tmp_path / "covariance.tsv"
+    variable_collection.covariance_to_txt(
+        covariance_path, compression_method, num_threads=cpu_count()
+    )
+
+    new_allocation_names = {
+        variable_collection.phenotypes.name,
+        variable_collection.covariates.name,
+    }
+    assert set(sw.allocations.keys()) <= (allocation_names | new_allocation_names)