@@ -294,46 +294,133 @@ def cooler_cis_eig(
294
294
phasing_track_col = "GC" ,
295
295
balance = "weight" ,
296
296
ignore_diags = None ,
297
+ bad_bins = None ,
297
298
clip_percentile = 99.9 ,
298
299
sort_metric = None ,
300
+ map = map ,
299
301
):
300
- # Perform consitency checks.
302
+ """
303
+ Compute compartment eigenvector for a given cooler `clr` in a number of
304
+ symmetric intra chromosomal regions (cis-regions), or for each chromosome.
305
+ Note that the amplitude of compartment eigenvectors is weighted by their
306
+ corresponding eigenvalue
307
+ Parameters
308
+ ----------
309
+ clr : cooler
310
+ cooler object to fetch data from
311
+ bins : DataFrame
312
+ table of bins derived from clr with phasing track added
313
+ regions : iterable or DataFrame, optional
314
+ if provided, eigenvectors are calculated for the regions only,
315
+ otherwise chromosome-wide eigenvectors are computed, for chromosomes
316
+ specified in bins.
317
+ n_eigs : int
318
+ number of eigenvectors to compute
319
+ phasing_track_col : str, optional
320
+ name of the columns in `bins` table, if provided, eigenvectors are
321
+ flipped to achieve a positive correlation with `bins[phasing_track_col]`.
322
+ balance : str
323
+ name of the column with balancing weights to be used.
324
+ ignore_diags : int, optional
325
+ the number of diagonals to ignore. Derived from cooler metadata
326
+ if not specified.
327
+ bad_bins : array-like
328
+ a list of bins to ignore. Indexes of bins must be absolute,
329
+ as in clr.bins()[:], as opposed to being offset by chromosome start.
330
+ `bad_bins` will be combined with the bad bins masked by balancing.
331
+ clip_percentile : float
332
+ if >0 and <100, clip pixels with diagonal-normalized values
333
+ higher than the specified percentile of matrix-wide values.
334
+ sort_metric : str
335
+ If provided, re-sort `eigenvecs` and `eigvals` in the order of
336
+ decreasing correlation between phasing_track and eigenvector, using the
337
+ specified measure of correlation. Possible values:
338
+ 'pearsonr' - sort by decreasing Pearson correlation.
339
+ 'var_explained' - sort by decreasing absolute amount of variation in
340
+ `eigvecs` explained by `phasing_track` (i.e. R^2 * var(eigvec))
341
+ 'MAD_explained' - sort by decreasing absolute amount of Median Absolute
342
+ Deviation from the median of `eigvecs` explained by `phasing_track`
343
+ (i.e. COMED(eigvec, phasing_track) * MAD(eigvec)).
344
+ 'spearmanr' - sort by decreasing Spearman correlation.
345
+ This option is designed to report the most "biologically" informative
346
+ eigenvectors first, and prevent eigenvector swapping caused by
347
+ translocations. In reality, however, sometimes it shows poor
348
+ performance and may lead to reporting of non-informative eigenvectors.
349
+ Off by default.
350
+ map : callable, optional
351
+ Map functor implementation.
352
+ Returns
353
+ -------
354
+ eigvals, eigvec_table -> DataFrames with eigenvalues for each region and
355
+ a table of eigenvectors filled in the `bins` table.
356
+ .. note:: ALWAYS check your EVs by eye. The first one occasionally does
357
+ not reflect the compartment structure, but instead describes
358
+ chromosomal arms or translocation blowouts. Possible mitigations:
359
+ employ `regions` (e.g. arms) to avoid issues with chromosomal arms,
360
+ use `bad_bins` to ignore small transolcations.
361
+ """
362
+
363
+ # get chromosomes from bins, if regions not specified:
301
364
if regions is None :
302
- chroms_not_in_clr = [
303
- chrom for chrom in bins ["chrom" ].unique () if chrom not in clr .chromsizes
304
- ]
365
+ regions = list (bins ["chrom" ].unique ()) # parse_regions fill in the rest
305
366
306
- if len (chroms_not_in_clr ) > 0 :
307
- raise ValueError (
308
- "The following chromosomes are found in the bin table, but not "
309
- "in the cooler: " + str (chroms_not_in_clr )
310
- )
367
+ # make sure phasing_track_col is in bins, if phasing is requested
368
+ if phasing_track_col and (phasing_track_col not in bins ):
369
+ raise ValueError (f'No column "{ phasing_track_col } " in the bin table' )
311
370
312
- if regions is None :
313
- regions = (
314
- [(chrom , 0 , clr .chromsizes [chrom ]) for chrom in bins ["chrom" ].unique ()]
315
- if regions is None
316
- else [bioframe .parse_region (r ) for r in regions ]
317
- )
371
+ # regions to dataframe
372
+ regions = bioframe .parse_regions (regions , clr .chromsizes )
318
373
374
+ # ignore diags as in cooler inless specified
319
375
ignore_diags = (
320
376
clr ._load_attrs ("bins/weight" ).get ("ignore_diags" , 2 )
321
377
if ignore_diags is None
322
378
else ignore_diags
323
379
)
324
380
381
+ # prepare output table for eigen vectors
325
382
eigvec_table = bins .copy ()
326
- for i in range (n_eigs ):
327
- eigvec_table ["E" + str (i + 1 )] = np .nan
383
+ eigvec_columns = [f"E{ i + 1 } " for i in range (n_eigs )]
384
+ for ev_col in eigvec_columns :
385
+ eigvec_table [ev_col ] = np .nan
386
+
387
+ # prepare output table for eigenvalues
388
+ eigvals_table = regions .copy ()
389
+ eigval_columns = [f"eigval{ i + 1 } " for i in range (n_eigs )]
390
+ for eval_col in eigval_columns :
391
+ eigvals_table [eval_col ] = np .nan
328
392
329
393
def _each (region ):
330
- A = clr .matrix (balance = balance ).fetch (region )
331
- if phasing_track_col and (phasing_track_col not in bins ):
332
- raise ValueError (
333
- 'No column "{}" in the bin table' .format (phasing_track_col )
334
- )
394
+ """
395
+ perform eigen decomposition for a given region
396
+ assuming safety checks are done outside of this
397
+ function.
398
+ Parameters
399
+ ----------
400
+ region: tuple-like
401
+ tuple of the form (chroms,start,end,*)
402
+ Returns
403
+ -------
404
+ _region, eigvals, eigvecs -> ndarrays
405
+ array of eigenvalues and an array eigenvectors
406
+ """
407
+ _region = region [:3 ] # take only (chrom, start, end)
408
+ A = clr .matrix (balance = balance ).fetch (_region )
409
+
410
+ # filter bad_bins relevant for the _region from A
411
+ if bad_bins is not None :
412
+ # filter bad_bins for the _region and turn relative:
413
+ lo , hi = clr .extent (_region )
414
+ bad_bins_region = bad_bins [(bad_bins >= lo )& (bad_bins < hi )]
415
+ bad_bins_region -= lo
416
+ if len (bad_bins_region ) > 0 :
417
+ # apply bad bins to symmetric matrix A:
418
+ A [:,bad_bins_region ] = np .nan
419
+ A [bad_bins_region ,:] = np .nan
420
+
421
+ # extract phasing track relevant for the _region
335
422
phasing_track = (
336
- bioframe .select (bins , region )[phasing_track_col ].values
423
+ bioframe .select (bins , _region )[phasing_track_col ].values
337
424
if phasing_track_col
338
425
else None
339
426
)
@@ -347,33 +434,22 @@ def _each(region):
347
434
sort_metric = sort_metric ,
348
435
)
349
436
350
- return eigvals , eigvecs
351
-
352
- eigvals_per_reg , eigvecs_per_reg = zip (* map (_each , regions ))
437
+ return _region , eigvals , eigvecs
353
438
354
- for region , eigvecs in zip (regions , eigvecs_per_reg ):
355
- idx = bioframe .select (bins , region ).index
356
- for i , eigvec in enumerate (eigvecs ):
357
- eigvec_table .loc [idx , "E" + str (i + 1 )] = eigvec
439
+ # eigendecompose matrix per region (can be multiprocessed)
440
+ # output assumes that the order of results matches regions
441
+ results = map (_each , regions .values )
358
442
359
- region_strs = [
360
- (
361
- chrom
362
- if (start == 0 and end == clr .chromsizes [chrom ])
363
- else "{}:{}-{}" .format (chrom , start , end )
364
- )
365
- for chrom , start , end in regions
366
- ]
367
-
368
- eigvals = pd .DataFrame (
369
- index = region_strs ,
370
- data = np .vstack (eigvals_per_reg ),
371
- columns = ["eigval" + str (i + 1 ) for i in range (n_eigs )],
372
- )
443
+ # go through eigendecomposition results and fill in
444
+ # output table eigvec_table and eigvals_table
445
+ for _region , _eigvals , _eigvecs in results :
446
+ idx = bioframe .select (eigvec_table , _region ).index
447
+ eigvec_table .at [idx , eigvec_columns ] = _eigvecs .T
448
+ idx = bioframe .select (eigvals_table , _region ).index
449
+ eigvals_table .at [idx , eigval_columns ] = _eigvals
373
450
374
- eigvals .index .name = "region"
375
451
376
- return eigvals , eigvec_table
452
+ return eigvals_table , eigvec_table
377
453
378
454
379
455
def cooler_trans_eig (
0 commit comments