TheAlgorithms · tianyizheng02 · Sep 30, 2023 · Aug 20, 2023 · Aug 20, 2023 · Aug 22, 2023
diff --git a/dynamic_programming/smith_waterman.py b/dynamic_programming/smith_waterman.py
@@ -0,0 +1,98 @@
+
# Reference: https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
"""
Scoring scheme for the Smith-Waterman algorithm: identical characters earn a
positive score, differing characters earn a negative score, and every gap is
penalized as well.
"""
MATCH: int = 1  # added when the two compared characters are equal
MISMATCH: int = -1  # added when the two compared characters differ
GAP: int = -2  # added when a character is aligned against a gap
+
+
def score_function(a: str, b: str) -> int:
    """
    Score a single pair of characters.
    Equal characters score MATCH; any other pair scores MISMATCH.
    >>> score_function('A', 'A')
    1
    >>> score_function('A', 'C')
    -1
    """
    return MATCH if a == b else MISMATCH
+
+
def smith_waterman(query: str, subject: str) -> list[list[int]]:
    """
    Build the Smith-Waterman score matrix for two sequences.
    The result has len(query) + 1 rows and len(subject) + 1 columns. Row 0 and
    column 0 stay at 0; every other cell holds the best score of a local
    alignment that ends at that pair of characters, never dropping below 0.
    >>> smith_waterman('ACAC', 'CA')
    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
    """
    row_count = len(query) + 1
    col_count = len(subject) + 1
    matrix = [[0 for _ in range(col_count)] for _ in range(row_count)]

    for row, query_char in enumerate(query, start=1):
        above = matrix[row - 1]
        current = matrix[row]
        for col, subject_char in enumerate(subject, start=1):
            # Three ways to reach this cell: extend diagonally with a
            # match/mismatch, or open a gap from the cell above or to the left.
            diagonal = above[col - 1] + score_function(query_char, subject_char)
            from_above = above[col] + GAP
            from_left = current[col - 1] + GAP

            # Flooring at 0 is what makes the alignment local.
            current[col] = max(0, diagonal, from_above, from_left)

    return matrix
+
+
def traceback(score: list[list[int]], query: str, subject: str) -> str:
    r"""
    Recover one optimal local alignment from a Smith-Waterman score matrix.
    The walk starts at the highest scoring cell of the matrix (the first one in
    row-major order when several cells tie) and steps backwards until it reaches
    a cell scoring 0 or the border of the matrix. At each cell the move that
    produced its score is looked up in this order: diagonal, vertical, horizontal.

    score must be the matrix computed for the same query and subject, i.e. it
    has len(query) + 1 rows and len(subject) + 1 columns.

    Returns the aligned part of query and the aligned part of subject joined by
    a newline, with '-' marking gaps. If no cell has a positive score both parts
    are empty and the result is just the newline.
    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
    'CA\nCA'
    """

    # Locate the end of the best local alignment: the maximum of the matrix.
    # Starting from 0 with a strict comparison keeps (0, 0) when nothing is
    # positive, which makes the loop below a no-op.
    best = 0
    i = j = 0
    for row_index, row in enumerate(score):
        for col_index, value in enumerate(row):
            if value > best:
                best, i, j = value, row_index, col_index

    # Characters are collected back-to-front and reversed once at the end,
    # which avoids repeatedly prepending to a string.
    aligned_query: list[str] = []
    aligned_subject: list[str] = []

    # A cell with score 0 marks the start of the local alignment.
    while i > 0 and j > 0 and score[i][j] > 0:
        current = score[i][j]
        if current == score[i - 1][j - 1] + score_function(
            query[i - 1], subject[j - 1]
        ):
            # optimal path is a diagonal: take both letters
            aligned_query.append(query[i - 1])
            aligned_subject.append(subject[j - 1])
            i -= 1
            j -= 1
        elif current == score[i - 1][j] + GAP:
            # optimal path is a vertical: gap in the subject
            aligned_query.append(query[i - 1])
            aligned_subject.append("-")
            i -= 1
        else:
            # optimal path is a horizontal: gap in the query
            aligned_query.append("-")
            aligned_subject.append(subject[j - 1])
            j -= 1

    align1 = "".join(reversed(aligned_query))
    align2 = "".join(reversed(aligned_subject))
    return f"{align1}\n{align2}"
+
+
if __name__ == "__main__":
    # Small demo: score two sample sequences and print the traced alignment.
    query, subject = "HEAGAWGHEE", "PAWHEAE"

    score = smith_waterman(query, subject)
    print(traceback(score, query, subject))