Software Programming

Kunuk Nykjaer

Dynamic programming example with C# using Needleman Wunsch algorithm

with 8 comments


Dynamic programming example with C#
Needleman-Wunsch algorithm, global sequence alignment

This example uses fictional species and matches their DNA by using a scoring matrix (the file BLOSUM62.txt)

Reference:
http://www.avatar.se/molbioinfo2001/dynprog/adv_dynamic.html
http://www.ludwig.edu.au/course/lectures2005/Likic.pdf

SequenceAlignment.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace DynamicProgramming
{
    /// <summary>
    /// Dynamic programming example
    /// Needleman-Wunsch algorithm
    /// Global sequence alignment using a scoring matrix (BLOSUM62)
    /// 
    /// Author:  Kunuk Nykjaer
    /// </summary>
    public class SequenceAlignment
    {
        public static void Main(string[] args) 
        {
            new SequenceAlignment().Run();
            PL("Press a key to exit.");
            Console.ReadKey();
        }

        const int Size = 24; // BLOSUM62 matrix
        readonly private List<string> linesMatrix;
        readonly List<LifeForm> lifeForms = new List<LifeForm>();
        readonly private int[,] matrix = new int[Size, Size];
        readonly Dictionary<string, int> lookup = new Dictionary<string, int>();
        const int DefaultMismatchPenalty = -4; // if not defined, use this
        static readonly string NL = Environment.NewLine;

        // trace back
        const string DONE = @"¤";
        const string DIAG = @"\";
        const string UP = @"|";
        const string LEFT = @"-";

        // print alignment
        const string GAP = @"-";

        public SequenceAlignment()
        {
            linesMatrix = ReadFileToList("BLOSUM62.txt");
        }


        // BLOSUM62.txt
        void ParseMatrixFile()
        {
            const int startLine = 7; // # lines before line 7
            for (int i = 0; i < linesMatrix.Count; i++)
            {
                if (i >= startLine)
                {
                    var line = linesMatrix.ElementAt(i);
                    String[] row = Regex.Split(line, @"\s+"); // white space
                    lookup.Add(row[0], i - startLine);

                    for (int j = 1; j < row.Length; j++)
                    {
                        var s = row[j];
                        if (string.IsNullOrEmpty(s))
                            continue;
                        matrix[i - startLine, j - 1] = Int32.Parse(s);
                    }
                }
            }
        }

        static void PrintMatrix<T>(T[,] A, int I, int J)
        {
            for (int i = 0; i < I; i++)
            {
                for (int j = 0; j < J; j++)
                {
                    var v = A[i, j];
                    P(v + " ");
                }
                PL();
            }
        }


        static string ReverseString(string s)
        {
            char[] arr = s.ToCharArray();
            Array.Reverse(arr);
            return new string(arr);
        }

        public void Run()
        {
            // put your data here
            lifeForms.Add(new LifeForm() { Name = "Sphinx", DNA = "KQRK" });
            lifeForms.Add(new LifeForm() { Name = "Bandersnatch", DNA = "KAK" });
            lifeForms.Add(new LifeForm() { Name = "Snark", DNA = "KQRIKAAKABK" });

            ParseMatrixFile();

            for (int i = 0; i < lifeForms.Count; i++)
            {
                for (int j = 0; j < i; j++)
                {
                    if (i == j)
                        continue;

                    var l1 = lifeForms.ElementAt(i);
                    var l2 = lifeForms.ElementAt(j);
                    var sequenceAlign = SequenceAlign(l1.DNA, l2.DNA).ToString();
                    Console.WriteLine(string.Format("Matching species: {0} and {1}, DNA: {2} and {3}\n{4}"
                        , l1.Name, l2.Name, l1.DNA, l2.DNA, sequenceAlign));
                }
            }
        }


        /*  
             p = gap penalty      
             a = diff char penalty
         
             Sequence-Alignment(m, n, xs, ys, p, a) {
             for i = 0 to m
                 M[i, 0] = i*p
             for j = 0 to n
                 M[0, j] = j*p
             for i = 1 to m
                 for j = 1 to n
                     M[i, j] = max(  a(xi, yj) + M[i-1, j-1],
                         p + M[i-1, j],
                         p + M[i, j-1])
             return M[m, n]            
         */
        Sequence SequenceAlign(string xs, string ys)
        {
            const int p = -4; //gap penalty, knowledge by looking at matrix file
            int m = xs.Length;
            int n = ys.Length;

            // init the matrix
            var M = new int[m + 1, n + 1]; // dynamic programming buttom up memory table
            var T = new string[m + 1, n + 1]; // trace back

            for (int i = 0; i < m + 1; i++)
                M[i, 0] = i * p;
            for (int j = 0; j < n + 1; j++)
                M[0, j] = j * p;

            T[0, 0] = DONE;
            for (int i = 1; i < m + 1; i++)
                T[i, 0] = UP;
            for (int j = 1; j < n + 1; j++)
                T[0, j] = LEFT;

            // calc
            for (int i = 1; i < m + 1; i++)
            {
                for (int j = 1; j < n + 1; j++)
                {
                    var alpha = Alpha(xs.ElementAt(i - 1).ToString(), ys.ElementAt(j - 1).ToString());
                    var diag = alpha + M[i - 1, j - 1];
                    var up = p + M[i - 1, j];
                    var left = p + M[i, j - 1];
                    var max = Max(diag, up, left);
                    M[i, j] = max;

                    if (max == diag)
                        T[i, j] = DIAG;
                    else if (max == up)
                        T[i, j] = UP;
                    else
                        T[i, j] = LEFT;
                }
            }

            var traceBack = ParseTraceBack(T, m + 1, n + 1);

            var sb = new StringBuilder();
            string first, second;

            if (xs.Length != ys.Length)
            {
                string s;
                if (xs.Length > ys.Length)
                {
                    s = ys;
                    first = xs;
                }
                else
                {
                    s = xs;
                    first = ys;
                }


                int i = 0;
                foreach (var trace in traceBack)
                {
                    if (trace.ToString() == DIAG)
                        sb.Append(s.ElementAt(i++).ToString());
                    else
                        sb.Append(GAP);
                }

                second = sb.ToString();
            }
            else
            {
                first = xs;
                second = ys;
            }

            PL("\nScore table");
            PrintMatrix(M, m + 1, n + 1);
            PL("\nTraceBack");
            PrintMatrix(T, m + 1, n + 1);
            PL();

            var sequence = new Sequence() { Score = M[m, n], Path = traceBack, One = first, Two = second };
            return sequence;
        }


        static string ParseTraceBack(string[,] T, int I, int J)
        {
            var sb = new StringBuilder();
            int i = I - 1;
            int j = J - 1;
            string path = T[i, j];
            while (path != DONE)
            {
                sb.Append(path);
                if (path == DIAG)
                {
                    i--;
                    j--;
                }
                else if (path == UP)
                    i--;
                else if (path == LEFT)
                    j--;

                path = T[i, j];
            }
            return ReverseString(sb.ToString());
        }


        static int Max(int a, int b, int c)
        {
            if (a >= b && a >= c)
                return a;
            if (b >= a && b >= c)
                return b;
            return c;
        }

        int Alpha(string x, string y)
        {
            if (lookup.ContainsKey(x) && lookup.ContainsKey(y))
            {
                var i = lookup[x];
                var j = lookup[y];
                return matrix[i, j];
            }
            else if (x == y)
                return 1; // matrix file match * with *

            return DefaultMismatchPenalty; // default mismatch penalty
        }

        class LifeForm
        {
            public string Name { get; set; }
            public string DNA { get; set; }
        }

        class Sequence
        {
            public int Score { get; set; }
            public string Path { get; set; }
            public string One { get; set; }
            public string Two { get; set; }
            public new string ToString()
            {
                var s = string.Format("score = {0}{1}one = {2}{3}two = {4}\n\n", Score, NL, One, NL, Two);
                return s;
            }
        }

        public static List<string> ReadFileToList(string path)
        {
            var lines = new List<string>();
            StreamReader streamReader = null;
            try
            {
                streamReader = File.OpenText(path);
                string line = streamReader.ReadLine();
                while (line != null)
                {
                    lines.Add(line);
                    line = streamReader.ReadLine();
                }
            }
            catch (FileNotFoundException ex)
            {
                PL(string.Format("Error cannot find file {0}", path));
                PL(ex.Message + Environment.NewLine + ex.StackTrace);
            }
            catch (Exception ex)
            {
                PL(string.Format("Error reading file {0}", path));
                PL(ex.Message + Environment.NewLine + ex.StackTrace);
            }
            finally
            {
                if (streamReader != null)
                    streamReader.Close();
            }
            return lines;
        }

        public static void PL(object o) { Console.WriteLine(o); } //alias
        public static void PL() { Console.WriteLine(); } //alias
        public static void P(object o) { Console.Write(o); } //alias
    }
}

BLOSUM62.txt

#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
Result:

Matching species: Bandersnatch and Sphinx, DNA: KAK and KQRK
score = 5
one = KQRK
two = K-AK

Matching species: Snark and Sphinx, DNA: KQRIKAAKABK and KQRK
score = -8
one = KQRIKAAKABK
two = KQR-------K

Matching species: Snark and Bandersnatch, DNA: KQRIKAAKABK and KAK
score = -18
one = KQRIKAAKABK
two = -------KA-K

Advertisements

Written by kunuk Nykjaer

October 17, 2010 at 12:28 pm

8 Responses

Subscribe to comments with RSS.

  1. very good
    but ican not excuteit how pleas replay me

    apeer

    January 5, 2011 at 7:20 pm

    • hey

      you should know how to execute c# code.
      Maybe you couldn’t open the file BLOSUM62.txt
      the file must be in your bin debug or release folder.
      You should know how to load files in c#

      kunuk Nykjaer

      January 15, 2011 at 3:29 pm

      • thank you..
        IKNOW load file with out
        (util.pl)
        pleas call me util???
        or any path explaine this method…

        apeer

        January 16, 2011 at 4:36 pm

  2. I try to load util.pl, but not work. Please advide me.
    Thank you.

    patsaraporn

    February 20, 2011 at 6:03 am

    • I updated the code to not use Util.P and Util.PL
      Should work properly now.

      regards
      Kunuk

      kunuk Nykjaer

      February 21, 2011 at 3:52 pm

  3. This code excellent.
    Thank you.

    patsaraporn

    February 22, 2011 at 3:21 pm

  4. you can write to parallel (master and slave)?? you can help me ??

    bhxh247

    May 28, 2013 at 6:28 am

  5. Hi Kunuk,

    Great post and nice stand alone applicaiton. I was curious if you have a similar program that can recieve the two sequences to be aligned from another class?

    All the best,
    Chris

    Chris

    January 29, 2015 at 6:45 pm


Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: