|
| 1 | +""" |
| 2 | +https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm |
| 3 | +The Smith-Waterman algorithm is a dynamic programming algorithm used for sequence |
| 4 | +alignment. It is particularly useful for finding similarities between two sequences, |
| 5 | +such as DNA or protein sequences. In this implementation, gaps are penalized |
| 6 | +linearly, meaning that the score is reduced by a fixed amount for each gap introduced |
| 7 | +in the alignment. However, it's important to note that the Smith-Waterman algorithm |
| 8 | +supports other gap penalty methods as well. |
| 9 | +""" |
| 10 | + |
| 11 | + |
| 12 | +def score_function( |
| 13 | + source_char: str, |
| 14 | + target_char: str, |
| 15 | + match: int = 1, |
| 16 | + mismatch: int = -1, |
| 17 | + gap: int = -2, |
| 18 | +) -> int: |
| 19 | + """ |
| 20 | + Calculate the score for a character pair based on whether they match or mismatch. |
| 21 | + Returns 1 if the characters match, -1 if they mismatch, and -2 if either of the |
| 22 | + characters is a gap. |
| 23 | + >>> score_function('A', 'A') |
| 24 | + 1 |
| 25 | + >>> score_function('A', 'C') |
| 26 | + -1 |
| 27 | + >>> score_function('-', 'A') |
| 28 | + -2 |
| 29 | + >>> score_function('A', '-') |
| 30 | + -2 |
| 31 | + >>> score_function('-', '-') |
| 32 | + -2 |
| 33 | + """ |
| 34 | + if "-" in (source_char, target_char): |
| 35 | + return gap |
| 36 | + return match if source_char == target_char else mismatch |
| 37 | + |
| 38 | + |
| 39 | +def smith_waterman( |
| 40 | + query: str, |
| 41 | + subject: str, |
| 42 | + match: int = 1, |
| 43 | + mismatch: int = -1, |
| 44 | + gap: int = -2, |
| 45 | +) -> list[list[int]]: |
| 46 | + """ |
| 47 | + Perform the Smith-Waterman local sequence alignment algorithm. |
| 48 | + Returns a 2D list representing the score matrix. Each value in the matrix |
| 49 | + corresponds to the score of the best local alignment ending at that point. |
| 50 | + >>> smith_waterman('ACAC', 'CA') |
| 51 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 52 | + >>> smith_waterman('acac', 'ca') |
| 53 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 54 | + >>> smith_waterman('ACAC', 'ca') |
| 55 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 56 | + >>> smith_waterman('acac', 'CA') |
| 57 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 58 | + >>> smith_waterman('ACAC', '') |
| 59 | + [[0], [0], [0], [0], [0]] |
| 60 | + >>> smith_waterman('', 'CA') |
| 61 | + [[0, 0, 0]] |
| 62 | + >>> smith_waterman('ACAC', 'CA') |
| 63 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 64 | +
|
| 65 | + >>> smith_waterman('acac', 'ca') |
| 66 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 67 | +
|
| 68 | + >>> smith_waterman('ACAC', 'ca') |
| 69 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 70 | +
|
| 71 | + >>> smith_waterman('acac', 'CA') |
| 72 | + [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]] |
| 73 | +
|
| 74 | + >>> smith_waterman('ACAC', '') |
| 75 | + [[0], [0], [0], [0], [0]] |
| 76 | +
|
| 77 | + >>> smith_waterman('', 'CA') |
| 78 | + [[0, 0, 0]] |
| 79 | +
|
| 80 | + >>> smith_waterman('AGT', 'AGT') |
| 81 | + [[0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3]] |
| 82 | +
|
| 83 | + >>> smith_waterman('AGT', 'GTA') |
| 84 | + [[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 2, 0]] |
| 85 | +
|
| 86 | + >>> smith_waterman('AGT', 'GTC') |
| 87 | + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0]] |
| 88 | +
|
| 89 | + >>> smith_waterman('AGT', 'G') |
| 90 | + [[0, 0], [0, 0], [0, 1], [0, 0]] |
| 91 | +
|
| 92 | + >>> smith_waterman('G', 'AGT') |
| 93 | + [[0, 0, 0, 0], [0, 0, 1, 0]] |
| 94 | +
|
| 95 | + >>> smith_waterman('AGT', 'AGTCT') |
| 96 | + [[0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0], [0, 0, 0, 3, 1, 1]] |
| 97 | +
|
| 98 | + >>> smith_waterman('AGTCT', 'AGT') |
| 99 | + [[0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3], [0, 0, 0, 1], [0, 0, 0, 1]] |
| 100 | +
|
| 101 | + >>> smith_waterman('AGTCT', 'GTC') |
| 102 | + [[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3], [0, 0, 1, 1]] |
| 103 | + """ |
| 104 | + # make both query and subject uppercase |
| 105 | + query = query.upper() |
| 106 | + subject = subject.upper() |
| 107 | + |
| 108 | + # Initialize score matrix |
| 109 | + m = len(query) |
| 110 | + n = len(subject) |
| 111 | + score = [[0] * (n + 1) for _ in range(m + 1)] |
| 112 | + kwargs = {"match": match, "mismatch": mismatch, "gap": gap} |
| 113 | + |
| 114 | + for i in range(1, m + 1): |
| 115 | + for j in range(1, n + 1): |
| 116 | + # Calculate scores for each cell |
| 117 | + match = score[i - 1][j - 1] + score_function( |
| 118 | + query[i - 1], subject[j - 1], **kwargs |
| 119 | + ) |
| 120 | + delete = score[i - 1][j] + gap |
| 121 | + insert = score[i][j - 1] + gap |
| 122 | + |
| 123 | + # Take maximum score |
| 124 | + score[i][j] = max(0, match, delete, insert) |
| 125 | + |
| 126 | + return score |
| 127 | + |
| 128 | + |
| 129 | +def traceback(score: list[list[int]], query: str, subject: str) -> str: |
| 130 | + r""" |
| 131 | + Perform traceback to find the optimal local alignment. |
| 132 | + Starts from the highest scoring cell in the matrix and traces back recursively |
| 133 | + until a 0 score is found. Returns the alignment strings. |
| 134 | + >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA') |
| 135 | + 'CA\nCA' |
| 136 | + >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'acac', 'ca') |
| 137 | + 'CA\nCA' |
| 138 | + >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'ca') |
| 139 | + 'CA\nCA' |
| 140 | + >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'acac', 'CA') |
| 141 | + 'CA\nCA' |
| 142 | + >>> traceback([[0, 0, 0]], 'ACAC', '') |
| 143 | + '' |
| 144 | + """ |
| 145 | + # make both query and subject uppercase |
| 146 | + query = query.upper() |
| 147 | + subject = subject.upper() |
| 148 | + # find the indices of the maximum value in the score matrix |
| 149 | + max_value = float("-inf") |
| 150 | + i_max = j_max = 0 |
| 151 | + for i, row in enumerate(score): |
| 152 | + for j, value in enumerate(row): |
| 153 | + if value > max_value: |
| 154 | + max_value = value |
| 155 | + i_max, j_max = i, j |
| 156 | + # Traceback logic to find optimal alignment |
| 157 | + i = i_max |
| 158 | + j = j_max |
| 159 | + align1 = "" |
| 160 | + align2 = "" |
| 161 | + gap = score_function("-", "-") |
| 162 | + # guard against empty query or subject |
| 163 | + if i == 0 or j == 0: |
| 164 | + return "" |
| 165 | + while i > 0 and j > 0: |
| 166 | + if score[i][j] == score[i - 1][j - 1] + score_function( |
| 167 | + query[i - 1], subject[j - 1] |
| 168 | + ): |
| 169 | + # optimal path is a diagonal take both letters |
| 170 | + align1 = query[i - 1] + align1 |
| 171 | + align2 = subject[j - 1] + align2 |
| 172 | + i -= 1 |
| 173 | + j -= 1 |
| 174 | + elif score[i][j] == score[i - 1][j] + gap: |
| 175 | + # optimal path is a vertical |
| 176 | + align1 = query[i - 1] + align1 |
| 177 | + align2 = f"-{align2}" |
| 178 | + i -= 1 |
| 179 | + else: |
| 180 | + # optimal path is a horizontal |
| 181 | + align1 = f"-{align1}" |
| 182 | + align2 = subject[j - 1] + align2 |
| 183 | + j -= 1 |
| 184 | + |
| 185 | + return f"{align1}\n{align2}" |
| 186 | + |
| 187 | + |
| 188 | +if __name__ == "__main__": |
| 189 | + query = "HEAGAWGHEE" |
| 190 | + subject = "PAWHEAE" |
| 191 | + |
| 192 | + score = smith_waterman(query, subject, match=1, mismatch=-1, gap=-2) |
| 193 | + print(traceback(score, query, subject)) |
0 commit comments