Skip to content

Commit 467903a

Browse files
BAW2501pre-commit-ci[bot]tianyizheng02
authored
added smith waterman algorithm (TheAlgorithms#9001)
* added smith waterman algorithm * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * descriptive names for the parameters a and b * doctesting lowercase upcase empty string cases * updated block quot,fixed traceback and doctests * shorter block quote Co-authored-by: Tianyi Zheng <[email protected]> * global vars to func params,more doctests * updated doctests * user access to SW params * formating --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Tianyi Zheng <[email protected]>
1 parent 35dd529 commit 467903a

File tree

1 file changed

+193
-0
lines changed

1 file changed

+193
-0
lines changed

Diff for: dynamic_programming/smith_waterman.py

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""
2+
https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
3+
The Smith-Waterman algorithm is a dynamic programming algorithm used for sequence
4+
alignment. It is particularly useful for finding similarities between two sequences,
5+
such as DNA or protein sequences. In this implementation, gaps are penalized
6+
linearly, meaning that the score is reduced by a fixed amount for each gap introduced
7+
in the alignment. However, it's important to note that the Smith-Waterman algorithm
8+
supports other gap penalty methods as well.
9+
"""
10+
11+
12+
def score_function(
13+
source_char: str,
14+
target_char: str,
15+
match: int = 1,
16+
mismatch: int = -1,
17+
gap: int = -2,
18+
) -> int:
19+
"""
20+
Calculate the score for a character pair based on whether they match or mismatch.
21+
Returns 1 if the characters match, -1 if they mismatch, and -2 if either of the
22+
characters is a gap.
23+
>>> score_function('A', 'A')
24+
1
25+
>>> score_function('A', 'C')
26+
-1
27+
>>> score_function('-', 'A')
28+
-2
29+
>>> score_function('A', '-')
30+
-2
31+
>>> score_function('-', '-')
32+
-2
33+
"""
34+
if "-" in (source_char, target_char):
35+
return gap
36+
return match if source_char == target_char else mismatch
37+
38+
39+
def smith_waterman(
40+
query: str,
41+
subject: str,
42+
match: int = 1,
43+
mismatch: int = -1,
44+
gap: int = -2,
45+
) -> list[list[int]]:
46+
"""
47+
Perform the Smith-Waterman local sequence alignment algorithm.
48+
Returns a 2D list representing the score matrix. Each value in the matrix
49+
corresponds to the score of the best local alignment ending at that point.
50+
>>> smith_waterman('ACAC', 'CA')
51+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
52+
>>> smith_waterman('acac', 'ca')
53+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
54+
>>> smith_waterman('ACAC', 'ca')
55+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
56+
>>> smith_waterman('acac', 'CA')
57+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
58+
>>> smith_waterman('ACAC', '')
59+
[[0], [0], [0], [0], [0]]
60+
>>> smith_waterman('', 'CA')
61+
[[0, 0, 0]]
62+
>>> smith_waterman('ACAC', 'CA')
63+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
64+
65+
>>> smith_waterman('acac', 'ca')
66+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
67+
68+
>>> smith_waterman('ACAC', 'ca')
69+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
70+
71+
>>> smith_waterman('acac', 'CA')
72+
[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
73+
74+
>>> smith_waterman('ACAC', '')
75+
[[0], [0], [0], [0], [0]]
76+
77+
>>> smith_waterman('', 'CA')
78+
[[0, 0, 0]]
79+
80+
>>> smith_waterman('AGT', 'AGT')
81+
[[0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3]]
82+
83+
>>> smith_waterman('AGT', 'GTA')
84+
[[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 0, 0], [0, 0, 2, 0]]
85+
86+
>>> smith_waterman('AGT', 'GTC')
87+
[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0]]
88+
89+
>>> smith_waterman('AGT', 'G')
90+
[[0, 0], [0, 0], [0, 1], [0, 0]]
91+
92+
>>> smith_waterman('G', 'AGT')
93+
[[0, 0, 0, 0], [0, 0, 1, 0]]
94+
95+
>>> smith_waterman('AGT', 'AGTCT')
96+
[[0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0], [0, 0, 0, 3, 1, 1]]
97+
98+
>>> smith_waterman('AGTCT', 'AGT')
99+
[[0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3], [0, 0, 0, 1], [0, 0, 0, 1]]
100+
101+
>>> smith_waterman('AGTCT', 'GTC')
102+
[[0, 0, 0, 0], [0, 0, 0, 0], [0, 1, 0, 0], [0, 0, 2, 0], [0, 0, 0, 3], [0, 0, 1, 1]]
103+
"""
104+
# make both query and subject uppercase
105+
query = query.upper()
106+
subject = subject.upper()
107+
108+
# Initialize score matrix
109+
m = len(query)
110+
n = len(subject)
111+
score = [[0] * (n + 1) for _ in range(m + 1)]
112+
kwargs = {"match": match, "mismatch": mismatch, "gap": gap}
113+
114+
for i in range(1, m + 1):
115+
for j in range(1, n + 1):
116+
# Calculate scores for each cell
117+
match = score[i - 1][j - 1] + score_function(
118+
query[i - 1], subject[j - 1], **kwargs
119+
)
120+
delete = score[i - 1][j] + gap
121+
insert = score[i][j - 1] + gap
122+
123+
# Take maximum score
124+
score[i][j] = max(0, match, delete, insert)
125+
126+
return score
127+
128+
129+
def traceback(score: list[list[int]], query: str, subject: str) -> str:
130+
r"""
131+
Perform traceback to find the optimal local alignment.
132+
Starts from the highest scoring cell in the matrix and traces back recursively
133+
until a 0 score is found. Returns the alignment strings.
134+
>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
135+
'CA\nCA'
136+
>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'acac', 'ca')
137+
'CA\nCA'
138+
>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'ca')
139+
'CA\nCA'
140+
>>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'acac', 'CA')
141+
'CA\nCA'
142+
>>> traceback([[0, 0, 0]], 'ACAC', '')
143+
''
144+
"""
145+
# make both query and subject uppercase
146+
query = query.upper()
147+
subject = subject.upper()
148+
# find the indices of the maximum value in the score matrix
149+
max_value = float("-inf")
150+
i_max = j_max = 0
151+
for i, row in enumerate(score):
152+
for j, value in enumerate(row):
153+
if value > max_value:
154+
max_value = value
155+
i_max, j_max = i, j
156+
# Traceback logic to find optimal alignment
157+
i = i_max
158+
j = j_max
159+
align1 = ""
160+
align2 = ""
161+
gap = score_function("-", "-")
162+
# guard against empty query or subject
163+
if i == 0 or j == 0:
164+
return ""
165+
while i > 0 and j > 0:
166+
if score[i][j] == score[i - 1][j - 1] + score_function(
167+
query[i - 1], subject[j - 1]
168+
):
169+
# optimal path is a diagonal take both letters
170+
align1 = query[i - 1] + align1
171+
align2 = subject[j - 1] + align2
172+
i -= 1
173+
j -= 1
174+
elif score[i][j] == score[i - 1][j] + gap:
175+
# optimal path is a vertical
176+
align1 = query[i - 1] + align1
177+
align2 = f"-{align2}"
178+
i -= 1
179+
else:
180+
# optimal path is a horizontal
181+
align1 = f"-{align1}"
182+
align2 = subject[j - 1] + align2
183+
j -= 1
184+
185+
return f"{align1}\n{align2}"
186+
187+
188+
if __name__ == "__main__":
189+
query = "HEAGAWGHEE"
190+
subject = "PAWHEAE"
191+
192+
score = smith_waterman(query, subject, match=1, mismatch=-1, gap=-2)
193+
print(traceback(score, query, subject))

0 commit comments

Comments
 (0)