#note Genome-Scale Algorithm Design (1)

This is a note for the book Genome-Scale Algorithm Design: Biological Sequence Analysis in the Era of High-Throughput Sequencing.

Exercise 1.1 Write a program that lists all the DNA sequences that encode a given protein sequence.

From the wikipedia page DNA and RNA codon tables I got the inverse table for the standard genetic code and built the Java class for protein – genetic code model (actually this was not necessary, one could also simply use a Map to store the symbol of the amino acid (for example “A”) as the key, and to store the genetics code (for example “GCU, GCC, GCA, GCG”) as the value, I wrote this class just to make my code more readable).

package de.yanzhou.genome;

import java.util.List;
public class GeneticCode {
  private String aminoAcid;
  private Character abbrAminoAcid;
  private List<String> rnaCodons;
  public GeneticCode(String aminoAcid, Character abbrAminoAcid, List<String> rnaCodons) {
    this.aminoAcid = aminoAcid;
    this.abbrAminoAcid = abbrAminoAcid;
    this.rnaCodons = rnaCodons;
  }

  public String getAminoAcid() {
    return aminoAcid;
  }

  public void setAminoAcid(String aminoAcid) {
    this.aminoAcid = aminoAcid;
  }
  public Character getAbbrAminoAcid() {
    return abbrAminoAcid;
  }

  public void setAbbrAminoAcid(Character abbrAminoAcid) {
    this.abbrAminoAcid = abbrAminoAcid;
  }
  public List<String> getRnaCodons() {
    return rnaCodons;
  }

  public void setRnaCodons(List<String> rnaCodons) {
    this.rnaCodons = rnaCodons;
  }
}

To count the combinations I took a random protein sequence “YNLARYLCMH” as an example, which was generated by Random Protein Sequence tool.

Defined function f(i) = ni , where n is the length of the protein sequence, ni is the symbol at the ith position, for convenience, i starts at 0, so for example, f(0) = Y, f(3) = A.

Defined function g(s) = Ns, where S is the symbol of the amino acid, Ns is the number of codons it has. For example, g(Y) = 2, g(A) = 4.

So the total number of the combinations Z can be calculated by using the formula:

    \[Z(n) = \prod_{i=0}^{n}C_{g(f(i))}^{1}=\prod_{i=0}^{n}g(f(i))\]

So for the very small length protein sequence YNLARYLCMH has altogether 2*2*6*4*6*2*6*2*1*2 = 27648 combinations!

To calculate and print the DNA sequences that encode a given protein sequence (limit the first 20 DNA sequences):

package de.yanzhou.genome;

import com.google.common.collect.Lists;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class PrintDNASequences {
  static final GeneticCode Ala = new GeneticCode("Ala", 'A', Arrays.asList("GCU", "GCC", "GCA", "GCG"));
  static final GeneticCode Arg = new GeneticCode("Arg", 'R', Arrays.asList("CGU", "CGC", "CGA", "CGG", "AGA", "AGG"));
  static final GeneticCode Asn = new GeneticCode("Asn", 'N', Arrays.asList("AAU", "AAC"));
  static final GeneticCode Asp = new GeneticCode("Asp", 'D', Arrays.asList("GAU", "GAC"));
  static final GeneticCode AsnOrAsp = new GeneticCode("Asn or Asp", 'B', Arrays.asList("AAU", "AAC", "GAU", "GAC"));
  static final GeneticCode Cys = new GeneticCode("Cys", 'C', Arrays.asList("UGU", "UGC"));
  static final GeneticCode Gln = new GeneticCode("Gln", 'Q', Arrays.asList("CAA", "CAG"));
  static final GeneticCode Glu = new GeneticCode("Glu", 'E', Arrays.asList("GAA", "GAG"));
  static final GeneticCode GlnOrGlu = new GeneticCode("Gln or Glu", 'Z', Arrays.asList("CAA", "CAG", "GAA", "GAG"));
  static final GeneticCode Gly = new GeneticCode("Gly", 'G', Arrays.asList("GGU", "GGC", "GGA", "GGG"));
  static final GeneticCode His = new GeneticCode("His", 'H', Arrays.asList("CAU", "CAC"));
  static final GeneticCode Ile = new GeneticCode("Ile", 'I', Arrays.asList("AUU", "AUC", "AUA"));
  static final GeneticCode Leu = new GeneticCode("Leu", 'L', Arrays.asList("CUU", "CUC", "CUA", "CUG", "UUA", "UUG"));
  static final GeneticCode Lys = new GeneticCode("Lys", 'K', Arrays.asList("AAA", "AAG"));
  static final GeneticCode Met = new GeneticCode("Met", 'M', List.of("AUG"));
  static final GeneticCode Phe = new GeneticCode("Phe", 'F', Arrays.asList("UUU", "UUC"));
  static final GeneticCode Pro = new GeneticCode("Pro", 'P', Arrays.asList("CCU", "CCC", "CCA", "CCG"));
  static final GeneticCode Ser = new GeneticCode("Ser", 'S', Arrays.asList("UCU", "UCC", "UCA", "UCG", "AGU", "AGC"));
  static final GeneticCode Thr = new GeneticCode("Thr", 'T', Arrays.asList("ACU", "ACC", "ACA", "ACG"));
  static final GeneticCode Trp = new GeneticCode("Trp", 'W', List.of("UGG"));
  static final GeneticCode Tyr = new GeneticCode("Tyr", 'Y', Arrays.asList("UAU", "UAC"));
  static final GeneticCode Val = new GeneticCode("Val", 'V', Arrays.asList("GUU", "GUC", "GUA", "GUG"));
  static final List<GeneticCode> listGeneticCode = Arrays.asList(Ala, Arg, Asn, Asp, AsnOrAsp, Cys, Gln, Glu, GlnOrGlu, Gly,
          His, Ile, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val);

  public static void main(String[] args) {

    // Generate the Map, key is the symbol of the amino acid, value is the list of codons.
    Map<Character, List<String>> mapGeneticCode = new HashMap<>();
    for (GeneticCode geneticCode : listGeneticCode) {
      mapGeneticCode.put(geneticCode.getAbbrAminoAcid(), geneticCode.getRnaCodons());
    }

    // Random protein sequence, generated by https://www.bioinformatics.org/sms2/random_protein.html
    String proteinSequence = "YNLARYLCMH";

    String abbrAminoAcids = mapGeneticCode.keySet().stream()
            .map(String::valueOf)
            .collect(Collectors.joining());
    if (isProteinSequenceValid(proteinSequence, abbrAminoAcids)){
      List<Character> listProteinSequence = proteinSequence.chars()
              .mapToObj(c -> (char) c)
              .toList();
      // Get all the lists of amino acid according to the protein sequence.
      List<List<String>> listAminoAcid = listProteinSequence.stream()
              .map(mapGeneticCode::get).toList();
      // Calculate the result.
      List<List<String>> result = Lists.cartesianProduct(listAminoAcid);
      // Format and display the protein sequence.
      String s = listProteinSequence.stream()
              .map(String::valueOf)
              .collect(Collectors.joining("  "));
      System.out.println(s);
      // Display the first 20 DNA sequences.
      result.stream()
              .limit(20)
              .map(t -> t.stream()
                      .map(String::valueOf)
                      .collect(Collectors.joining())
                      .replace("U", "T"))
              .forEach(System.out::println);
    } else {
      System.out.println("The protein sequence is invalid.");
    }
  }

  // Check if the protein sequence is valid
  private static boolean isProteinSequenceValid(String proteinSequence, String abbrAminoAcids){
    return proteinSequence.replaceAll("[" +abbrAminoAcids + "]", "").length() == 0 && !proteinSequence.isEmpty();
  }
}

Output:

Y  N  L  A  R  Y  L  C  M  H
TATAATCTTGCTCGTTATCTTTGTATGCAT
TATAATCTTGCTCGTTATCTTTGTATGCAC
TATAATCTTGCTCGTTATCTTTGCATGCAT
TATAATCTTGCTCGTTATCTTTGCATGCAC
TATAATCTTGCTCGTTATCTCTGTATGCAT
TATAATCTTGCTCGTTATCTCTGTATGCAC
TATAATCTTGCTCGTTATCTCTGCATGCAT
TATAATCTTGCTCGTTATCTCTGCATGCAC
TATAATCTTGCTCGTTATCTATGTATGCAT
TATAATCTTGCTCGTTATCTATGTATGCAC
TATAATCTTGCTCGTTATCTATGCATGCAT
TATAATCTTGCTCGTTATCTATGCATGCAC
TATAATCTTGCTCGTTATCTGTGTATGCAT
TATAATCTTGCTCGTTATCTGTGTATGCAC
TATAATCTTGCTCGTTATCTGTGCATGCAT
TATAATCTTGCTCGTTATCTGTGCATGCAC
TATAATCTTGCTCGTTATTTATGTATGCAT
TATAATCTTGCTCGTTATTTATGTATGCAC
TATAATCTTGCTCGTTATTTATGCATGCAT
TATAATCTTGCTCGTTATTTATGCATGCAC

1 thought on “#note Genome-Scale Algorithm Design (1)”

  1. Pingback: #note Genome-Scale Algorithm Design (2) - Yan’s Notebook

Leave a Comment

Your email address will not be published. Required fields are marked *

Scroll to Top