Skip to content

Add Nussinov Algorithm for RNA Secondary Structure Prediction #6071

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions src/main/java/com/thealgorithms/dynamicprogramming/Nussinov.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import es.uma.eda.problem.combinatorial.sequence.folding.RNASecondaryStructurePredictor;
/** Download the required library here: https://drive.google.com/file/d/1L9ZU3_D3FODd2B22cr0eGBKeyDmsKog3/view?usp=sharing **/
/**
* Implementation of the Nussinov algorithm for RNA secondary structure prediction by
* base-pair maximization using dynamic programming.
*/
public class Nussinov extends RNASecondaryStructurePredictor {
/**
* Strings describing compatible base pairs: basepairs1(i) is compatible with basepairs2(i)
*/
private final static String basepairs1 = "AUUCGG";
private final static String basepairs2 = "UAGGCU";

/**
* Default constructor
*/
public Nussinov() {
}

/**
* Returns true iff nucleotides b1 and b2 are compatible for pairing
* @param b1 a nucleotide (A, C, G, U)
* @param b2 a nucleotide (A, C, G, U)
* @return true iff nucleotides b1 and b2 are compatible for pairing
*/
protected boolean isCompatible (char b1, char b2) {
b1 = Character.toUpperCase(b1);
b2 = Character.toUpperCase(b2);
for (int i=0; i<basepairs1.length(); i++)
if ((basepairs1.charAt(i) == b1) && (basepairs2.charAt(i) == b2))
return true;

return false;
}

@Override
public String getName() {
return "Nussinov algorithm";
}

@Override
protected String _run(String rnaSeq, int minLoopSize) {
int n = rnaSeq.length(); // number of nucleotides in the sequence
int[][] M = new int[n][n+1]; // to store costs
int[][] D = new int[n][n]; // to store decisions
boolean[][] B = new boolean[n][n]; // to precompute which base pairs match

for (int i=0; i<n; i++)
for (int j=i+1; j<n; j++)
B[i][j] = isCompatible(rnaSeq.charAt(i), rnaSeq.charAt(j));

// Base case: M(i,j) = 0 if j <= i+minLoopSize
for (int i=0; i<n; i++) {
M[i][i] = 0; // M(i,i-1) = 0
for (int j=i, k=0; (k<=minLoopSize) && (j<n); j++, k++) {
M[i][j+1] = 0; // M(i,j) = 0;
D[i][j] = -1; // -1 => unpaired
}
}

// General case: M(i,j) = max(M(i,j-1), max (M(i, k-1) + M(k+1,j-1) + 1 | i <= k < j-minLoopSize, s_k and s_j are complementary)) if j > i+minLoopSize
for (int l = minLoopSize + 1; l < n; l++) { // iterate possible lengths (j - i)
for (int i = 0; i + l < n; i++) { // iterate through sequence
int j = i + l;
M[i][j + 1] = M[i][j]; // option 1: do not pair s_j
D[i][j] = -1; // initially, unpaired

// Option 2: find max possible pairing with previous nucleotides
for (int k = i; k < j - minLoopSize; k++) {
if (B[k][j]) { // if nucleotides are compatible
int pairs = M[i][k] + M[k + 1][j] + 1;
if (pairs > M[i][j + 1]) {
M[i][j + 1] = pairs;
D[i][j] = k; // pair s_k with s_j
}
}
}
}
}

if (verbosity) {
System.out.println("\nScore matrix:\n");

System.out.print(" \t|");
for (int j=0; j<=n; j++)
System.out.print("\t"+(j-1));
System.out.print("\n \t|\t ");
for (int j=1; j<=n; j++)
System.out.print("\t"+rnaSeq.charAt(j-1));
System.out.print("\n---- \t+");
for (int j=0; j<=n; j++)
System.out.print("\t----");
for (int i=0; i<n; i++) {
System.out.print("\n"+i+ " " + rnaSeq.charAt(i)+"\t|\t");
for (int j=0; j<i; j++) {
System.out.print(" \t");
}
for (int j=i; j<=n; j++) {
System.out.format("%d \t", M[i][j]);
}
}

System.out.println("\n\nDecision matrix:\n");

System.out.print(" \t|");
for (int j=0; j<n; j++)
System.out.print("\t"+j);
System.out.print("\n \t|");
for (int j=0; j<n; j++)
System.out.print("\t"+rnaSeq.charAt(j));
System.out.print("\n---- \t+");
for (int j=0; j<n; j++)
System.out.print("\t----");
for (int i=0; i<n; i++) {
System.out.print("\n"+i+ " " + rnaSeq.charAt(i)+"\t|\t");
for (int j=0; j<n; j++) {
System.out.format("%d \t", D[i][j]);
}
}
System.out.println("\n");
}

// Reconstruction of the optimal solution
return reconstructSolution(D, 0, n-1);
}


private String reconstructSolution (int[][] D, int i, int j) {
if (i >= j) {
return ".".repeat(Math.max(0, j - i + 1)); // no possible pairs
}

if (D[i][j] == -1) {
return reconstructSolution(D, i, j - 1) + "."; // no pairing at j
} else {
int k = D[i][j];
return reconstructSolution(D, i, k - 1) + "(" + reconstructSolution(D, k + 1, j - 1) + ")"; // pair s_k with s_j
}
}

@Override
public double evaluate(String rnaSeq, String folding) {
if (rnaSeq.length() != folding.length()) // error: the folding string does not match the RNA sequence
return -1.0;
double contacts = 0.0;
int n = folding.length();
int open = 0;

for (int i=0; i<n; i++)
switch (folding.charAt(i)) {
case '.': break;
case '(': contacts += 1.0;
open++;
break;
case ')': open --;
if (open < 0) // error: parentheses are not balanced
return -1.0;
break;
default: return -1.0; // error: unknown symbol found
}
if (open > 0) // error: parentheses are not balanced
contacts = -1.0;
return contacts;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package com.thealgorithms.dynamicprogramming;

import java.io.IOException;
import java.util.List;

import es.uma.eda.AlgorithmUtil;
import es.uma.eda.problem.combinatorial.sequence.SequenceUtil;
import es.uma.eda.problem.combinatorial.sequence.folding.RNASecondaryStructurePredictor;

/**
* Main class for testing RNA folding algorithms
* @author ccottap
*
*/
public class TestRNAFolding {

/**
* Filename for storing statistics
*/
private final static String outputFilename = "folding.txt";
/**
* how much the size of sequences is scaled-up on each step
*/
private final static double scaleFactor = 1.5;

/**
* Finds the best folding of an RNA sequence
* @param args command-line arguments (to read data from file or generate at random)
* @throws IOException if data cannot be read from file or statistics cannot be written to file
*/
public static void main(String[] args) throws IOException {
if (args.length<1) {
System.out.println("java TestRNAFolding (-s|-f|-r) <arguments>");
System.out.println("\t-s: folds a string given as parameter");
System.out.println("\t-f: performs a batch test from a file");
System.out.println("\t-r: measures time");
}
else {
List<String> sequences = null;
RNASecondaryStructurePredictor predictor = new Nussinov();

switch (args[0]) {
case "-s":
if (args.length < 3) {
System.out.println("java TestRNAFolding -s <string> <min-loop-size>");
System.exit(-1);
}
else {
String f = predictor.run(args[1], Integer.parseInt(args[2]));
System.out.println("Folding " + args[1] + "...");
System.out.println("Result:\n\t" + (int)predictor.evaluate(args[1],f) + " base pairs\n\t" + args[1] + "\n\t" + f);
}
break;

case "-f":
if (args.length < 3) {
System.out.println("java TestRNAFolding -f <filename> <min-loop-size>");
System.exit(-1);
}
else {
sequences = SequenceUtil.readSequencesFromFile(args[1]);
predictor.setVerbosity(false);
for (String s: sequences) {
String f = predictor.run(s, Integer.parseInt(args[2]));
System.out.println((int)predictor.evaluate(s, f));
}
}
break;

case "-r":
if (args.length < 5) {
System.out.println("java TestRNAFolding -r <initial-lenght> <doublings> <tests-per-length> <min-loop-size>");
System.exit(-1);
}
else {
int initialLength = Integer.parseUnsignedInt(args[1]);
int doublings = Integer.parseUnsignedInt(args[2]);
int testsPerLength = Integer.parseUnsignedInt(args[3]);
int minLoopSize = Integer.parseUnsignedInt(args[4]);
double[][] statistics = runTimer(predictor, initialLength, doublings, testsPerLength, minLoopSize);
AlgorithmUtil.writeStats(outputFilename, statistics);
}
break;

default:
System.out.println("Wrong argument: " + args[0]);
System.exit(-1);
}
}

}

/**
* Performs a series of timed experiments with sequences of different sizes
* @param predictor the structure prediction algorithm
* @param initialLength initial length of sequences
* @param doublings number of times the sequence length is doubled
* @param testsPerLength number of tests per sequence length
* @param minLoopSize minimum number of nucleotides enclosed in loops
* @return a matrix with computational times (one row per size, one column per test).
*/
private static double[][] runTimer(RNASecondaryStructurePredictor predictor , int initialLength, int doublings, int testsPerLength, int minLoopSize) {
final String alphabet = "ACGU";
double[][] statistics = new double[doublings][testsPerLength+1];
double interval;

predictor.setVerbosity(false);
for (int l = initialLength, dup = 0; dup < doublings; l *= scaleFactor, dup++) {
statistics[dup][0] = l;
System.out.println("Trying sequences of length " + l);
for (int j=0; j<testsPerLength; j++) {
String s = SequenceUtil.randomString(l, alphabet);
predictor.run(s, minLoopSize);
interval = predictor.getTime();
statistics[dup][j+1] = interval;
System.out.println(predictor.getName() + " took " + interval + "s");
}
}

return statistics;

}

}
Loading