Skip to content

Commit 53ce4fc

Browse files
authored
cools_cis_eig work with regions as DF, numutils eig update (#215)
1 parent 78c18f8 commit 53ce4fc

File tree

2 files changed

+148
-54
lines changed

2 files changed

+148
-54
lines changed

cooltools/eigdecomp.py

+122-46
Original file line numberDiff line numberDiff line change
@@ -294,46 +294,133 @@ def cooler_cis_eig(
294294
phasing_track_col="GC",
295295
balance="weight",
296296
ignore_diags=None,
297+
bad_bins=None,
297298
clip_percentile=99.9,
298299
sort_metric=None,
300+
map=map,
299301
):
300-
# Perform consitency checks.
302+
"""
303+
Compute compartment eigenvector for a given cooler `clr` in a number of
304+
symmetric intra chromosomal regions (cis-regions), or for each chromosome.
305+
Note that the amplitude of compartment eigenvectors is weighted by their
306+
corresponding eigenvalue
307+
Parameters
308+
----------
309+
clr : cooler
310+
cooler object to fetch data from
311+
bins : DataFrame
312+
table of bins derived from clr with phasing track added
313+
regions : iterable or DataFrame, optional
314+
if provided, eigenvectors are calculated for the regions only,
315+
otherwise chromosome-wide eigenvectors are computed, for chromosomes
316+
specified in bins.
317+
n_eigs : int
318+
number of eigenvectors to compute
319+
phasing_track_col : str, optional
320+
name of the columns in `bins` table, if provided, eigenvectors are
321+
flipped to achieve a positive correlation with `bins[phasing_track_col]`.
322+
balance : str
323+
name of the column with balancing weights to be used.
324+
ignore_diags : int, optional
325+
the number of diagonals to ignore. Derived from cooler metadata
326+
if not specified.
327+
bad_bins : array-like
328+
a list of bins to ignore. Indexes of bins must be absolute,
329+
as in clr.bins()[:], as opposed to being offset by chromosome start.
330+
`bad_bins` will be combined with the bad bins masked by balancing.
331+
clip_percentile : float
332+
if >0 and <100, clip pixels with diagonal-normalized values
333+
higher than the specified percentile of matrix-wide values.
334+
sort_metric : str
335+
If provided, re-sort `eigenvecs` and `eigvals` in the order of
336+
decreasing correlation between phasing_track and eigenvector, using the
337+
specified measure of correlation. Possible values:
338+
'pearsonr' - sort by decreasing Pearson correlation.
339+
'var_explained' - sort by decreasing absolute amount of variation in
340+
`eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
341+
'MAD_explained' - sort by decreasing absolute amount of Median Absolute
342+
Deviation from the median of `eigvecs` explained by `phasing_track`
343+
(i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
344+
'spearmanr' - sort by decreasing Spearman correlation.
345+
This option is designed to report the most "biologically" informative
346+
eigenvectors first, and prevent eigenvector swapping caused by
347+
translocations. In reality, however, sometimes it shows poor
348+
performance and may lead to reporting of non-informative eigenvectors.
349+
Off by default.
350+
map : callable, optional
351+
Map functor implementation.
352+
Returns
353+
-------
354+
eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
355+
a table of eigenvectors filled in the `bins` table.
356+
.. note:: ALWAYS check your EVs by eye. The first one occasionally does
357+
not reflect the compartment structure, but instead describes
358+
chromosomal arms or translocation blowouts. Possible mitigations:
359+
employ `regions` (e.g. arms) to avoid issues with chromosomal arms,
360+
use `bad_bins` to ignore small transolcations.
361+
"""
362+
363+
# get chromosomes from bins, if regions not specified:
301364
if regions is None:
302-
chroms_not_in_clr = [
303-
chrom for chrom in bins["chrom"].unique() if chrom not in clr.chromsizes
304-
]
365+
regions = list(bins["chrom"].unique()) # parse_regions fill in the rest
305366

306-
if len(chroms_not_in_clr) > 0:
307-
raise ValueError(
308-
"The following chromosomes are found in the bin table, but not "
309-
"in the cooler: " + str(chroms_not_in_clr)
310-
)
367+
# make sure phasing_track_col is in bins, if phasing is requested
368+
if phasing_track_col and (phasing_track_col not in bins):
369+
raise ValueError(f'No column "{phasing_track_col}" in the bin table')
311370

312-
if regions is None:
313-
regions = (
314-
[(chrom, 0, clr.chromsizes[chrom]) for chrom in bins["chrom"].unique()]
315-
if regions is None
316-
else [bioframe.parse_region(r) for r in regions]
317-
)
371+
# regions to dataframe
372+
regions = bioframe.parse_regions(regions, clr.chromsizes)
318373

374+
# ignore diags as in cooler inless specified
319375
ignore_diags = (
320376
clr._load_attrs("bins/weight").get("ignore_diags", 2)
321377
if ignore_diags is None
322378
else ignore_diags
323379
)
324380

381+
# prepare output table for eigen vectors
325382
eigvec_table = bins.copy()
326-
for i in range(n_eigs):
327-
eigvec_table["E" + str(i + 1)] = np.nan
383+
eigvec_columns = [f"E{i + 1}" for i in range(n_eigs)]
384+
for ev_col in eigvec_columns:
385+
eigvec_table[ev_col] = np.nan
386+
387+
# prepare output table for eigenvalues
388+
eigvals_table = regions.copy()
389+
eigval_columns = [f"eigval{i + 1}" for i in range(n_eigs)]
390+
for eval_col in eigval_columns:
391+
eigvals_table[eval_col] = np.nan
328392

329393
def _each(region):
330-
A = clr.matrix(balance=balance).fetch(region)
331-
if phasing_track_col and (phasing_track_col not in bins):
332-
raise ValueError(
333-
'No column "{}" in the bin table'.format(phasing_track_col)
334-
)
394+
"""
395+
perform eigen decomposition for a given region
396+
assuming safety checks are done outside of this
397+
function.
398+
Parameters
399+
----------
400+
region: tuple-like
401+
tuple of the form (chroms,start,end,*)
402+
Returns
403+
-------
404+
_region, eigvals, eigvecs -> ndarrays
405+
array of eigenvalues and an array eigenvectors
406+
"""
407+
_region = region[:3] # take only (chrom, start, end)
408+
A = clr.matrix(balance=balance).fetch(_region)
409+
410+
# filter bad_bins relevant for the _region from A
411+
if bad_bins is not None:
412+
# filter bad_bins for the _region and turn relative:
413+
lo, hi = clr.extent(_region)
414+
bad_bins_region = bad_bins[(bad_bins>=lo)&(bad_bins<hi)]
415+
bad_bins_region -= lo
416+
if len(bad_bins_region) > 0:
417+
# apply bad bins to symmetric matrix A:
418+
A[:,bad_bins_region] = np.nan
419+
A[bad_bins_region,:] = np.nan
420+
421+
# extract phasing track relevant for the _region
335422
phasing_track = (
336-
bioframe.select(bins, region)[phasing_track_col].values
423+
bioframe.select(bins, _region)[phasing_track_col].values
337424
if phasing_track_col
338425
else None
339426
)
@@ -347,33 +434,22 @@ def _each(region):
347434
sort_metric=sort_metric,
348435
)
349436

350-
return eigvals, eigvecs
351-
352-
eigvals_per_reg, eigvecs_per_reg = zip(*map(_each, regions))
437+
return _region, eigvals, eigvecs
353438

354-
for region, eigvecs in zip(regions, eigvecs_per_reg):
355-
idx = bioframe.select(bins, region).index
356-
for i, eigvec in enumerate(eigvecs):
357-
eigvec_table.loc[idx, "E" + str(i + 1)] = eigvec
439+
# eigendecompose matrix per region (can be multiprocessed)
440+
# output assumes that the order of results matches regions
441+
results = map(_each, regions.values)
358442

359-
region_strs = [
360-
(
361-
chrom
362-
if (start == 0 and end == clr.chromsizes[chrom])
363-
else "{}:{}-{}".format(chrom, start, end)
364-
)
365-
for chrom, start, end in regions
366-
]
367-
368-
eigvals = pd.DataFrame(
369-
index=region_strs,
370-
data=np.vstack(eigvals_per_reg),
371-
columns=["eigval" + str(i + 1) for i in range(n_eigs)],
372-
)
443+
# go through eigendecomposition results and fill in
444+
# output table eigvec_table and eigvals_table
445+
for _region, _eigvals, _eigvecs in results:
446+
idx = bioframe.select(eigvec_table, _region).index
447+
eigvec_table.at[idx, eigvec_columns] = _eigvecs.T
448+
idx = bioframe.select(eigvals_table, _region).index
449+
eigvals_table.at[idx, eigval_columns] = _eigvals
373450

374-
eigvals.index.name = "region"
375451

376-
return eigvals, eigvec_table
452+
return eigvals_table, eigvec_table
377453

378454

379455
def cooler_trans_eig(

cooltools/lib/numutils.py

+26-8
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,8 @@ def get_eig(mat, n=3, mask_zero_rows=False, subtract_mean=False, divide_by_mean=
436436
A square matrix, must not contain nans, infs or zero rows.
437437
438438
n : int
439-
The number of eigenvectors to return.
439+
The number of eigenvectors to return. Output is backfilled with NaNs
440+
when n exceeds the size of the input matrix.
440441
441442
mask_zero_rows : bool
442443
If True, mask empty rows/columns before eigenvector decomposition.
@@ -454,7 +455,7 @@ def get_eig(mat, n=3, mask_zero_rows=False, subtract_mean=False, divide_by_mean=
454455
An array of eigenvectors (in rows), sorted by a decreasing absolute
455456
eigenvalue.
456457
457-
eigvecs : np.ndarray
458+
eigvals : np.ndarray
458459
An array of sorted eigenvalues.
459460
460461
"""
@@ -468,6 +469,12 @@ def get_eig(mat, n=3, mask_zero_rows=False, subtract_mean=False, divide_by_mean=
468469
"The matrix contains empty rows/columns and is symmetric. "
469470
"Mask the empty rows with remove_zeros=True"
470471
)
472+
# size of the input matrix
473+
n_rows = mat.shape[0]
474+
if n > n_rows:
475+
warnings.warn(
476+
"Number n of requested eigenvalues is larger than the matrix size."
477+
)
471478

472479
if mask_zero_rows:
473480
if not is_symmetric(mat):
@@ -482,14 +489,19 @@ def get_eig(mat, n=3, mask_zero_rows=False, subtract_mean=False, divide_by_mean=
482489
subtract_mean=subtract_mean,
483490
divide_by_mean=divide_by_mean,
484491
)
485-
n_rows = mat.shape[0]
486492
eigvecs = np.full((n, n_rows), np.nan)
487493
for i in range(n):
488494
eigvecs[i][mask] = eigvecs_collapsed[i]
489495

490496
return eigvecs, eigvals
491497
else:
492498
mat = mat.astype(np.float, copy=True) # make a copy, ensure float
499+
500+
# prepare NaN-filled arrays for output in case we requested
501+
# more eigenvalues that can be computed (eigs/eigsh k limits)
502+
eigvals = np.full(n, np.nan)
503+
eigvecs = np.full((n, n_rows), np.nan)
504+
493505
mean = np.mean(mat)
494506

495507
if subtract_mean:
@@ -498,12 +510,18 @@ def get_eig(mat, n=3, mask_zero_rows=False, subtract_mean=False, divide_by_mean=
498510
mat /= mean
499511

500512
if symmetric:
501-
eigvals, eigvecs = scipy.sparse.linalg.eigsh(mat, n)
513+
# adjust requested number of eigvals for "eigsh"
514+
_n = n if n < n_rows else (n_rows - 1)
515+
_eigvals, _eigvecs = scipy.sparse.linalg.eigsh(mat, _n)
502516
else:
503-
eigvals, eigvecs = scipy.sparse.linalg.eigs(mat, n)
504-
order = np.argsort(-np.abs(eigvals))
505-
eigvals = eigvals[order]
506-
eigvecs = eigvecs.T[order]
517+
# adjust requested number of eigvals for "eigs"
518+
_n = n if n < (n_rows - 1) else (n_rows - 2)
519+
_eigvals, _eigvecs = scipy.sparse.linalg.eigs(mat, _n)
520+
521+
# reorder according to eigvals and copy into output arrays
522+
order = np.argsort(-np.abs(_eigvals))
523+
eigvals[:_n] = _eigvals[order]
524+
eigvecs[:_n,:] = _eigvecs.T[order]
507525

508526
return eigvecs, eigvals
509527

0 commit comments

Comments
 (0)