Skip to content

Commit 54ec595

Browse files
Switch covariance estimation to numpy.ma.cov
This may give slightly different results from `pd.DataFrame.cov` in the presence of missing data, because the calculation of the mean is done differently, see pandas-dev/pandas#16837 As both implementations are correct, we switch to the more performant one here
1 parent 32d5e6f commit 54ec595

File tree

2 files changed

+14
-9
lines changed

2 files changed

+14
-9
lines changed

src/gwas/src/gwas/pheno.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -348,24 +348,24 @@ def from_txt(
348348

349349
def covariance_to_txt(
350350
self, path: UPath, compression_method: CompressionMethod, num_threads: int
351-
) -> None:
351+
) -> UPath:
352352
path = path.with_suffix(compression_method.suffix)
353353
if path.is_file():
354354
logger.debug("Skip writing covariance matrix because it already exists")
355-
return
355+
return path
356356

357357
phenotype_array = self.phenotypes.to_numpy()
358358
covariate_array = self.covariates.to_numpy()
359359

360360
array = np.hstack((phenotype_array, covariate_array))
361361
names = [*self.phenotype_names, *self.covariate_names]
362362

363-
data_frame = pd.DataFrame(array, index=self.samples, columns=names)
364-
365363
logger.debug("Calculating covariance matrix")
366-
covariance: npt.NDArray[np.float64] = np.asfortranarray(
367-
data_frame.cov().to_numpy(dtype=np.float64)
368-
)
364+
masked_array = np.ma.array(array, mask=np.isnan(array))
365+
numpy_covariance = np.ma.cov(
366+
masked_array, rowvar=False, allow_masked=True
367+
).filled(np.nan)
368+
covariance: npt.NDArray[np.float64] = np.asfortranarray(numpy_covariance)
369369

370370
writer: FileArrayWriter[np.float64] = FileArray.create(
371371
path,
@@ -382,6 +382,8 @@ def covariance_to_txt(
382382
with writer:
383383
writer[:, :] = covariance
384384

385+
return writer.file_path
386+
385387

386388
@dataclass
387389
class VariableSummary:

src/gwas/tests/score/test_pheno.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
from pytest import FixtureRequest
99
from upath import UPath
1010

11-
from gwas.compression.arr.base import Blosc2CompressionMethod, compression_methods
11+
from gwas.compression.arr.base import (
12+
Blosc2CompressionMethod,
13+
compression_methods,
14+
)
1215
from gwas.mem.wkspace import SharedWorkspace
1316
from gwas.pheno import VariableCollection
1417
from gwas.utils import cpu_count
@@ -209,7 +212,7 @@ def test_covariance(
209212
request.addfinalizer(variable_collection.free)
210213

211214
covariance_path = tmp_path / "covariance.tsv"
212-
variable_collection.covariance_to_txt(
215+
covariance_path = variable_collection.covariance_to_txt(
213216
covariance_path, compression_method, num_threads=cpu_count()
214217
)
215218

0 commit comments

Comments
 (0)