Posts Tagged ‘Dynamic programming’
Dynamic programming example with C# using Needleman Wunsch algorithm
Dynamic programming example with C#
Needleman-Wunsch algorithm, global sequence alignment
This example uses fictional species and matches their DNA by using a scoring matrix (the file BLOSUM62.txt)
Reference:
http://www.avatar.se/molbioinfo2001/dynprog/adv_dynamic.html
http://www.ludwig.edu.au/course/lectures2005/Likic.pdf
SequenceAlignment.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace DynamicProgramming
{
/// <summary>
/// Dynamic programming example
/// Needleman-Wunsch algorithm
/// Global sequence alignment using a scoring matrix (BLOSUM62)
///
/// Author: Kunuk Nykjaer
/// </summary>
public class SequenceAlignment
{
public static void Main(string[] args)
{
new SequenceAlignment().Run();
PL("Press a key to exit.");
Console.ReadKey();
}
const int Size = 24; // BLOSUM62 matrix
readonly private List<string> linesMatrix;
readonly List<LifeForm> lifeForms = new List<LifeForm>();
readonly private int[,] matrix = new int[Size, Size];
readonly Dictionary<string, int> lookup = new Dictionary<string, int>();
const int DefaultMismatchPenalty = -4; // if not defined, use this
static readonly string NL = Environment.NewLine;
// trace back
const string DONE = @"ยค";
const string DIAG = @"\";
const string UP = @"|";
const string LEFT = @"-";
// print alignment
const string GAP = @"-";
public SequenceAlignment()
{
linesMatrix = ReadFileToList("BLOSUM62.txt");
}
// BLOSUM62.txt
void ParseMatrixFile()
{
const int startLine = 7; // # lines before line 7
for (int i = 0; i < linesMatrix.Count; i++)
{
if (i >= startLine)
{
var line = linesMatrix.ElementAt(i);
String[] row = Regex.Split(line, @"\s+"); // white space
lookup.Add(row[0], i - startLine);
for (int j = 1; j < row.Length; j++)
{
var s = row[j];
if (string.IsNullOrEmpty(s))
continue;
matrix[i - startLine, j - 1] = Int32.Parse(s);
}
}
}
}
static void PrintMatrix<T>(T[,] A, int I, int J)
{
for (int i = 0; i < I; i++)
{
for (int j = 0; j < J; j++)
{
var v = A[i, j];
P(v + " ");
}
PL();
}
}
static string ReverseString(string s)
{
char[] arr = s.ToCharArray();
Array.Reverse(arr);
return new string(arr);
}
public void Run()
{
// put your data here
lifeForms.Add(new LifeForm() { Name = "Sphinx", DNA = "KQRK" });
lifeForms.Add(new LifeForm() { Name = "Bandersnatch", DNA = "KAK" });
lifeForms.Add(new LifeForm() { Name = "Snark", DNA = "KQRIKAAKABK" });
ParseMatrixFile();
for (int i = 0; i < lifeForms.Count; i++)
{
for (int j = 0; j < i; j++)
{
if (i == j)
continue;
var l1 = lifeForms.ElementAt(i);
var l2 = lifeForms.ElementAt(j);
var sequenceAlign = SequenceAlign(l1.DNA, l2.DNA).ToString();
Console.WriteLine(string.Format("Matching species: {0} and {1}, DNA: {2} and {3}\n{4}"
, l1.Name, l2.Name, l1.DNA, l2.DNA, sequenceAlign));
}
}
}
/*
p = gap penalty
a = diff char penalty
Sequence-Alignment(m, n, xs, ys, p, a) {
for i = 0 to m
M[i, 0] = i*p
for j = 0 to n
M[0, j] = j*p
for i = 1 to m
for j = 1 to n
M[i, j] = max( a(xi, yj) + M[i-1, j-1],
p + M[i-1, j],
p + M[i, j-1])
return M[m, n]
*/
Sequence SequenceAlign(string xs, string ys)
{
const int p = -4; //gap penalty, knowledge by looking at matrix file
int m = xs.Length;
int n = ys.Length;
// init the matrix
var M = new int[m + 1, n + 1]; // dynamic programming buttom up memory table
var T = new string[m + 1, n + 1]; // trace back
for (int i = 0; i < m + 1; i++)
M[i, 0] = i * p;
for (int j = 0; j < n + 1; j++)
M[0, j] = j * p;
T[0, 0] = DONE;
for (int i = 1; i < m + 1; i++)
T[i, 0] = UP;
for (int j = 1; j < n + 1; j++)
T[0, j] = LEFT;
// calc
for (int i = 1; i < m + 1; i++)
{
for (int j = 1; j < n + 1; j++)
{
var alpha = Alpha(xs.ElementAt(i - 1).ToString(), ys.ElementAt(j - 1).ToString());
var diag = alpha + M[i - 1, j - 1];
var up = p + M[i - 1, j];
var left = p + M[i, j - 1];
var max = Max(diag, up, left);
M[i, j] = max;
if (max == diag)
T[i, j] = DIAG;
else if (max == up)
T[i, j] = UP;
else
T[i, j] = LEFT;
}
}
var traceBack = ParseTraceBack(T, m + 1, n + 1);
var sb = new StringBuilder();
string first, second;
if (xs.Length != ys.Length)
{
string s;
if (xs.Length > ys.Length)
{
s = ys;
first = xs;
}
else
{
s = xs;
first = ys;
}
int i = 0;
foreach (var trace in traceBack)
{
if (trace.ToString() == DIAG)
sb.Append(s.ElementAt(i++).ToString());
else
sb.Append(GAP);
}
second = sb.ToString();
}
else
{
first = xs;
second = ys;
}
PL("\nScore table");
PrintMatrix(M, m + 1, n + 1);
PL("\nTraceBack");
PrintMatrix(T, m + 1, n + 1);
PL();
var sequence = new Sequence() { Score = M[m, n], Path = traceBack, One = first, Two = second };
return sequence;
}
static string ParseTraceBack(string[,] T, int I, int J)
{
var sb = new StringBuilder();
int i = I - 1;
int j = J - 1;
string path = T[i, j];
while (path != DONE)
{
sb.Append(path);
if (path == DIAG)
{
i--;
j--;
}
else if (path == UP)
i--;
else if (path == LEFT)
j--;
path = T[i, j];
}
return ReverseString(sb.ToString());
}
static int Max(int a, int b, int c)
{
if (a >= b && a >= c)
return a;
if (b >= a && b >= c)
return b;
return c;
}
int Alpha(string x, string y)
{
if (lookup.ContainsKey(x) && lookup.ContainsKey(y))
{
var i = lookup[x];
var j = lookup[y];
return matrix[i, j];
}
else if (x == y)
return 1; // matrix file match * with *
return DefaultMismatchPenalty; // default mismatch penalty
}
class LifeForm
{
public string Name { get; set; }
public string DNA { get; set; }
}
class Sequence
{
public int Score { get; set; }
public string Path { get; set; }
public string One { get; set; }
public string Two { get; set; }
public new string ToString()
{
var s = string.Format("score = {0}{1}one = {2}{3}two = {4}\n\n", Score, NL, One, NL, Two);
return s;
}
}
public static List<string> ReadFileToList(string path)
{
var lines = new List<string>();
StreamReader streamReader = null;
try
{
streamReader = File.OpenText(path);
string line = streamReader.ReadLine();
while (line != null)
{
lines.Add(line);
line = streamReader.ReadLine();
}
}
catch (FileNotFoundException ex)
{
PL(string.Format("Error cannot find file {0}", path));
PL(ex.Message + Environment.NewLine + ex.StackTrace);
}
catch (Exception ex)
{
PL(string.Format("Error reading file {0}", path));
PL(ex.Message + Environment.NewLine + ex.StackTrace);
}
finally
{
if (streamReader != null)
streamReader.Close();
}
return lines;
}
public static void PL(object o) { Console.WriteLine(o); } //alias
public static void PL() { Console.WriteLine(); } //alias
public static void P(object o) { Console.Write(o); } //alias
}
}
BLOSUM62.txt
# Matrix made by matblas from blosum62.iij # * column uses minimum score # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units # Blocks Database = /data/blocks_5.0/blocks.dat # Cluster Percentage: >= 62 # Entropy = 0.6979, Expected = -0.5209 A R N D C Q E G H I L K M F P S T W Y V B Z X * A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
Result: Matching species: Bandersnatch and Sphinx, DNA: KAK and KQRK score = 5 one = KQRK two = K-AK Matching species: Snark and Sphinx, DNA: KQRIKAAKABK and KQRK score = -8 one = KQRIKAAKABK two = KQR-------K Matching species: Snark and Bandersnatch, DNA: KQRIKAAKABK and KAK score = -18 one = KQRIKAAKABK two = -------KA-K