#note Genome-Scale Algorithm Design (2)

This is a note for the book Genome-Scale Algorithm Design: Biological Sequence Analysis in the Era of High-Throughput Sequencing (2nd edition).

Exercise 1.2  In a given organism some codons are used more frequently than others to encode the same amino acid. Given the observed frequency of every codon in a species, normalize it into probabilities and write a program that, given a protein sequence, samples a random DNA sequence that encodes that protein under such codon usage probabilities.

Steps:

1. Gather Codon Usage Data: Collect the observed frequency of each codon in the organism’s genome. This data can be found in databases like the Codon Usage Database (https://www.kazusa.or.jp/codon/).

2. Normalize Frequencies: Calculate the probabilities for each codon by normalizing the observed frequencies. Ensure that the sum of probabilities for each amino acid is equal to 1.

3. Create a Codon-to-Amino Acid Mapping: A mapping of codons to the corresponding amino acids is needed. This is essential for translating the protein sequence into codons.

4. Sample Sequence: Given a protein sequence, utilize the normalized codon probabilities and the codon-to-amino acid mapping to generate a random sequence that encodes the protein.

Similar to the previous exercise, I established a GeneticCode model and employed a HashMap to store the codon alongside its corresponding normalized probability.

package de.yanzhou.genome;

import lombok.Getter;
import lombok.Setter;

import java.util.HashMap;

@Getter
@Setter
public class GeneticCode {
    private String aminoAcid;
    private Character abbrAminoAcid;
    private HashMap<String, Double> rnaCodons;
    public GeneticCode(String aminoAcid, Character abbrAminoAcid, HashMap<String, Double> rnaCodons) {
        this.aminoAcid = aminoAcid;
        this.abbrAminoAcid = abbrAminoAcid;
        this.rnaCodons = rnaCodons;
    }
}

And a Utilities Class is utilized for the tasks of loading the codon usage table, normalizing codon usage probabilities, and generating the final sequence.

package de.yanzhou.genome;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Utilities { 
  public static HashMap<String, Double> calculateProbabilities(List<GeneticCode> geneticCodes, Character abbrAminoAcid, HashMap<String, Double> codonUsageTable){
    GeneticCode code = geneticCodes.stream()
            .filter(c -> c.getAbbrAminoAcid().equals(abbrAminoAcid))
            .findFirst()
            .orElse(null);
    if(code == null){
        throw new RuntimeException("The amino acid with this abbreviation " + abbrAminoAcid + " was not found.");
    }
    double sum = codonUsageTable.entrySet().stream()
            .filter(c -> code.getRnaCodons().containsKey(c.getKey()))
            .map(Map.Entry::getValue)
            .mapToDouble(Double::doubleValue)
            .sum();
    HashMap<String, Double> result = new HashMap<>();
    for(Map.Entry<String, Double> codon : code.getRnaCodons().entrySet()) {
      String sequence = codon.getKey();
      Double frequency = codonUsageTable.get(sequence)/sum;
      result.put(sequence, frequency);
    }
    return result;
  }

  public static HashMap<String, Double> loadCodonUsage(String codonUsageTable){
    String pattern = "([AUCG]{3})\\s*(\\d+\\.\\d+)";
    Pattern p = Pattern.compile(pattern);
    Matcher m = p.matcher(codonUsageTable);
    HashMap<String, Double> codonUsage = new HashMap<>();
    while (m.find()) {
      String codon = m.group(1);
      String frequency = m.group(2);
      codonUsage.put(codon, Double.valueOf(frequency));
    }
    return codonUsage;
  }

  public static Map.Entry<String, Double> randomPickWithWeight(HashMap<String, Double> codonUsage, double probability){
    double cumulativeProbability = 0.0;
    ArrayList<Double> list = new ArrayList<>();
    for (Map.Entry<String, Double> entry : codonUsage.entrySet()) {
      list.add(entry.getValue());
    }
    Collections.sort(list);
    for (double num : list) {
      for (Map.Entry<String, Double> entry : codonUsage.entrySet()) {
        if (entry.getValue().equals(num)) {
          cumulativeProbability += num;
          if(cumulativeProbability > probability)
            return entry;
        }
      }
    }
  return null;
  }
}

In the Main Class, I utilized the codon usage table of the mitochondrion in Saccharomyces cerevisiae, which I obtained from the Codon Usage Database.

import de.yanzhou.genome.GeneticCode;
import de.yanzhou.genome.Utilities;

import java.util.*;

public class CodonSampler {

  public static void main(String[] args) {

    GeneticCode Ala = new GeneticCode("Ala", 'A',
                new HashMap<>() {{
                    put("GCU", 0.0);
                    put("GCC", 0.0);
                    put("GCA", 0.0);
                    put("GCG", 0.0);
                }});
    GeneticCode Arg = new GeneticCode("Arg", 'R',
                new HashMap<>() {{
                    put("CGU", 0.0);
                    put("CGC", 0.0);
                    put("CGA", 0.0);
                    put("CGG", 0.0);
                    put("AGA", 0.0);
                    put("AGG", 0.0);
                }});
    GeneticCode Asn = new GeneticCode("Asn", 'N',
                new HashMap<>() {{
                    put("AAU", 0.0);
                    put("AAC", 0.0);
                }});
    GeneticCode Asp = new GeneticCode("Asp", 'D',
                new HashMap<>() {{
                    put("GAU", 0.0);
                    put("GAC", 0.0);
                }});
    GeneticCode AsnOrAsp = new GeneticCode("Asn or Asp", 'B',
                new HashMap<>() {{
                    put("AAU", 0.0);
                    put("AAC", 0.0);
                    put("GAU", 0.0);
                    put("GAC", 0.0);
                }});
    GeneticCode Cys = new GeneticCode("Cys", 'C',
                new HashMap<>() {{
                    put("UGU", 0.0);
                    put("UGC", 0.0);
                }});
    GeneticCode Gln = new GeneticCode("Gln", 'Q',
                new HashMap<>() {{
                    put("CAA", 0.0);
                    put("CAG", 0.0);
                }});
    GeneticCode Glu = new GeneticCode("Glu", 'E',
                new HashMap<>() {{
                    put("GAA", 0.0);
                    put("GAG", 0.0);
                }});
    GeneticCode GlnOrGlu = new GeneticCode("Gln or Glu", 'Z',
                new HashMap<>() {{
                    put("CAA", 0.0);
                    put("CAG", 0.0);
                    put("GAA", 0.0);
                    put("GAG", 0.0);
                }});
    GeneticCode Gly = new GeneticCode("Gly", 'G',
                new HashMap<>() {{
                    put("GGU", 0.0);
                    put("GGC", 0.0);
                    put("GGA", 0.0);
                    put("GGG", 0.0);
                }});
    GeneticCode His = new GeneticCode("His", 'H',
                new HashMap<>() {{
                    put("CAU", 0.0);
                    put("CAC", 0.0);
                }});
    GeneticCode Ile = new GeneticCode("Ile", 'I',
                new HashMap<>() {{
                    put("AUU", 0.0);
                    put("AUC", 0.0);
                    put("AUA", 0.0);
                }});
    GeneticCode Leu = new GeneticCode("Leu", 'L',
                new HashMap<>() {{
                    put("CUU", 0.0);
                    put("CUC", 0.0);
                    put("CUA", 0.0);
                    put("CUG", 0.0);
                    put("UUA", 0.0);
                    put("UUG", 0.0);
                }});
    GeneticCode Lys = new GeneticCode("Lys", 'K',
                new HashMap<>() {{
                    put("AAA", 0.0);
                    put("AAG", 0.0);
                }});
    GeneticCode Met = new GeneticCode("Met", 'M',
                new HashMap<>() {{
                    put("AUG", 0.0);
                }});
    GeneticCode Phe = new GeneticCode("Phe", 'F',
                new HashMap<>() {{
                    put("UUU", 0.0);
                    put("UUC", 0.0);
                }});
    GeneticCode Pro = new GeneticCode("Pro", 'P',
                new HashMap<>() {{
                    put("CCU", 0.0);
                    put("CCC", 0.0);
                    put("CCA", 0.0);
                    put("CCG", 0.0);
                }});
    GeneticCode Ser = new GeneticCode("Ser", 'S',
                new HashMap<>() {{
                    put("UCU", 0.0);
                    put("UCC", 0.0);
                    put("UCA", 0.0);
                    put("UCG", 0.0);
                    put("AGU", 0.0);
                    put("AGC", 0.0);
                }});
    GeneticCode Thr = new GeneticCode("Thr", 'T',
                new HashMap<>() {{
                    put("ACU", 0.0);
                    put("ACC", 0.0);
                    put("ACA", 0.0);
                    put("ACG", 0.0);
                }});
    GeneticCode Trp = new GeneticCode("Trp", 'W',
                new HashMap<>() {{
                    put("UGG", 0.0);
                }});
    GeneticCode Tyr = new GeneticCode("Tyr", 'Y',
                new HashMap<>() {{
                    put("UAU", 0.0);
                    put("UAC", 0.0);
                }});
    GeneticCode Val = new GeneticCode("Val", 'V',
                new HashMap<>() {{
                    put("GUU", 0.0);
                    put("GUC", 0.0);
                    put("GUA", 0.0);
                    put("GUG", 0.0);
                }});
    List<GeneticCode> geneticCodes = Arrays.asList(Ala, Arg, Asn, Asp, AsnOrAsp, Cys, Gln, Glu, GlnOrGlu, Gly,
                His, Ile, Leu, Lys, Met, Phe, Pro, Ser, Thr, Trp, Tyr, Val);

    String codonUsage =
            """
                    UUU 39.7(   827)  UCU 16.5(   343)  UAU 55.7(  1159)  UGU  8.5(   176)
                    UUC 16.8(   349)  UCC  3.2(    67)  UAC  5.6(   117)  UGC  0.8(    16)
                    UUA103.8(  2161)  UCA 25.6(   532)  UAA  4.9(   101)  UGA 13.2(   275)
                    UUG  4.6(    96)  UCG  1.5(    31)  UAG  0.3(     7)  UGG  1.8(    38)

                    CUU  5.7(   119)  CCU 18.6(   387)  CAU 17.6(   367)  CGU  2.8(    58)
                    CUC  1.0(    21)  CCC  3.2(    66)  CAC  2.0(    41)  CGC  0.6(    13)
                    CUA  7.9(   164)  CCA 11.0(   229)  CAA 19.1(   398)  CGA  0.4(     8)
                    CUG  1.5(    32)  CCG  2.0(    41)  CAG  2.5(    52)  CGG  1.0(    21)

                    AUU 80.9(  1684)  ACU 16.4(   342)  AAU 87.7(  1824)  AGU 12.6(   263)
                    AUC  9.9(   207)  ACC  3.6(    75)  AAC  7.7(   160)  AGC  1.5(    31)
                    AUA 32.8(   682)  ACA 20.3(   423)  AAA 61.1(  1272)  AGA 22.9(   476)
                    AUG 21.8(   453)  ACG  1.7(    36)  AAG  8.7(   182)  AGG  2.2(    46)

                    GUU 18.0(   375)  GCU 22.1(   459)  GAU 28.4(   591)  GGU 36.1(   752)
                    GUC  2.7(    57)  GCC  3.5(    73)  GAC  3.7(    77)  GGC  2.0(    41)
                    GUA 27.2(   567)  GCA 15.5(   323)  GAA 24.7(   513)  GGA 11.8(   245)
                    GUG  2.5(    53)  GCG  1.9(    40)  GAG  3.7(    78)  GGG  4.7(    98)
            """;

    HashMap<String, Double> codonUsageTable = Utilities.loadCodonUsage(codonUsage);

    geneticCodes.forEach(geneticCode -> {
            HashMap<String, Double> r = Utilities.calculateProbabilities(geneticCodes, geneticCode.getAbbrAminoAcid(), codonUsageTable);
            geneticCode.setRnaCodons(r);
    });

    String protein = "MSEMAKKTTK";

    char[] aminoAcid = protein.toCharArray();
    StringBuilder sequence = new StringBuilder();
    double probability = 1.0;
    for (char abbr : aminoAcid) {
      Optional<GeneticCode> geneticCode = geneticCodes.stream().filter(code -> code.getAbbrAminoAcid().equals(abbr)).findFirst();
      if (geneticCode.isPresent()) {
        Map.Entry<String, Double> sequenceWithProbability = Utilities.randomPickWithWeight(geneticCode.get().getRnaCodons(), new Random().nextDouble());
        if (sequenceWithProbability == null) {
          throw new RuntimeException("Calculation error.");
        }
        sequence.append(sequenceWithProbability.getKey());
        probability *= sequenceWithProbability.getValue();
      } else {
        throw new RuntimeException("The amino acid with this abbreviation " + abbr + " was not found.");
      }
    }
    System.out.println("The sequence:\n " + sequence + " \nwas generated based on this probability: \n" + probability);
  }
}

Result:

The sequence:
 AUGUCAGAAAUGGCUAAAAAGACAACUAAA 
was generated based on this probability: 
0.003386895635385543

Leave a Comment

Your email address will not be published. Required fields are marked *

Scroll to Top