6. Recipe 6: Set functions

Note:

Works only on kDataFrameMQF

6.1. Description

  1. Create two kDataFrames with the same kmer size and fill them with random kmers
  2. Apply the set functions (diff, intersect, union) and dump the resulting kDataFrames

6.2. Implementation

6.2.1. Importing

[1]:
import kProcessor as kp
import random

6.2.2. Function to generate random kmers with random counts

[2]:
def generate_kmers(kSize, kmers_no):
    """Build a list of random (kmer, count) tuples.

    Each kmer is a string of ``kSize`` characters picked from 'ACGT' and
    each count is a random integer between 1 and 1000 (both inclusive).
    The returned list holds ``kmers_no`` such tuples.
    """
    def random_pair():
        # Draw the sequence first, then its count.
        sequence = "".join(random.choice('ACGT') for _ in range(kSize))
        return sequence, random.randint(1, 1000)

    return [random_pair() for _ in range(kmers_no)]

6.2.3. Function to dump the kmers of a kDataFrame

[3]:
def dumpKmers(kFrame):
    """Print each kmer stored in ``kFrame`` alongside its count.

    Starts from ``kFrame.begin()`` and keeps advancing the iterator for as
    long as it differs from ``kFrame.end()``, printing one line per entry.
    Lines are numbered starting at 1.
    """
    cursor = kFrame.begin()
    position = 1
    while cursor != kFrame.end():
        # Read the kmer string first and its count second, then report both.
        print("%d- kmer: %s with count: %d"
              % (position, cursor.getKmer(), cursor.getCount()))
        cursor.next()
        position += 1

6.2.4. Determining kmer size

[4]:
# Length of every kmer; the same value is passed to both kDataFrames
# and to generate_kmers.
kSize = 31

6.2.5. Create list of kDataFrames

[5]:
# Two separate kDataFrameMQF objects built with the same kmer size.
kFrames_vec = [kp.kDataFrameMQF(kSize) for _ in range(2)]

6.2.6. Create random kmers list

[6]:
# 20 random (kmer, count) pairs destined for the first kDataFrame.
kmers_list1 = generate_kmers(kSize, kmers_no=20)
# 10 random (kmer, count) pairs destined for the second kDataFrame.
kmers_list2 = generate_kmers(kSize, kmers_no=10)

6.2.7. Replicate some kmers from kmers_list1 in kmers_list2 to make sure len(intersection) > 0

[7]:
# Append the first 10 pairs of kmers_list1 so both lists share some kmers.
kmers_list2.extend(kmers_list1[:10])

6.2.8. Inserting the kmers

[8]:
# Fill the first kDataFrame from kmers_list1.
for sequence, count in kmers_list1:
    kFrames_vec[0].insert(sequence, count)

# Fill the second kDataFrame from kmers_list2.
for sequence, count in kmers_list2:
    kFrames_vec[1].insert(sequence, count)

6.2.9. Apply set functions

[9]:
# Each call receives the whole list of kDataFrames and its result is dumped
# as a kDataFrame later in this recipe.

# Apply kFrameIntersect
# In this recipe's sample output the result holds the 10 kmers that are
# present in both kDataFrames.
intersect_kFrame = kp.kFrameIntersect(kFrames_vec)

# Apply kFrameDiff
# In this recipe's sample output the result holds the 10 kmers of
# kFrames_vec[0] that are missing from kFrames_vec[1].
diff_kFrame = kp.kFrameDiff(kFrames_vec)

# Apply kFrameUnion
# In this recipe's sample output the result holds all 30 distinct kmers;
# kmers found in both inputs appear with their counts added together
# (e.g. 984 + 984 = 1968).
union_kFrame = kp.kFrameUnion(kFrames_vec)

6.2.10. Dumping kmers of kFrames_vec[0]

[10]:
# List the kmers that were inserted into the first kDataFrame (from kmers_list1).
dumpKmers(kFrames_vec[0])
1- kmer: ATACGAGCATGTCTCATTTCACTTAAAGGGA with count: 477
2- kmer: AACATGACTTGCTTAATCATAGCCAAGTAAC with count: 969
3- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 984
4- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 508
5- kmer: TCTGGCGTTGTATATACTTGGTTCCGCCTAA with count: 563
6- kmer: GCCGGTATACGGTACGCAGATGGCTACTACC with count: 60
7- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 509
8- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 87
9- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 250
10- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 477
11- kmer: ACCGTCTCAAATGTTGGAGCTCTCCTCGGTG with count: 38
12- kmer: CAAACCGAAGAAGCATTACATCTCGCTATAA with count: 168
13- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 618
14- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 293
15- kmer: TAATTTCAAGAATGGCTAAGAGCGTACAGGC with count: 235
16- kmer: TCATGAAGTACCAGGTTGTCCCGGGTGTACA with count: 840
17- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 21
18- kmer: AAAAGACACCGGCGAGTGCTTACTAGTGGCC with count: 725
19- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 257
20- kmer: CCCAGGCATGGGCAGGTCTCGTGCAACTGAG with count: 398

6.2.11. Dumping kmers of kFrames_vec[1]

[11]:
# List the kmers that were inserted into the second kDataFrame (from kmers_list2,
# which includes the pairs copied over from kmers_list1).
dumpKmers(kFrames_vec[1])
1- kmer: AAATACAGAAGAGTAGAAGTCCCTGGCGCAA with count: 649
2- kmer: ACTAGAGATTTATCCAAAGAGGATCCCCTGA with count: 685
3- kmer: ACGAGCTGCGCCTACGCACTGATATGCATGA with count: 965
4- kmer: CCACTCAGGGTCTGACCTAAGTGTTGTCTCG with count: 833
5- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 984
6- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 508
7- kmer: GCTGGCGGTTTATTTGTTTTCTGAACTCCGC with count: 40
8- kmer: AGGAAAACTTAACATCAGTTCACCTTGATGG with count: 415
9- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 509
10- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 87
11- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 250
12- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 477
13- kmer: CTACGCGAAATCGCTACTTATGCGCTGAGAA with count: 470
14- kmer: AAGACTCCTATCCGTACATGAGTTGAAGGTC with count: 850
15- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 618
16- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 293
17- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 21
18- kmer: ACCCCCGTTGACCGTGCAGAGGGGCAAATGG with count: 269
19- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 257
20- kmer: AAGGGTTGTACGCTGTGTCATACTATGTCAG with count: 83

6.2.12. Dump diff_kFrame

[12]:
# List the result of kFrameDiff; in the output below these are the kmers of
# kFrames_vec[0] that do not occur in kFrames_vec[1].
dumpKmers(diff_kFrame)
1- kmer: ATACGAGCATGTCTCATTTCACTTAAAGGGA with count: 477
2- kmer: AACATGACTTGCTTAATCATAGCCAAGTAAC with count: 969
3- kmer: TCTGGCGTTGTATATACTTGGTTCCGCCTAA with count: 563
4- kmer: GCCGGTATACGGTACGCAGATGGCTACTACC with count: 60
5- kmer: ACCGTCTCAAATGTTGGAGCTCTCCTCGGTG with count: 38
6- kmer: CAAACCGAAGAAGCATTACATCTCGCTATAA with count: 168
7- kmer: TAATTTCAAGAATGGCTAAGAGCGTACAGGC with count: 235
8- kmer: TCATGAAGTACCAGGTTGTCCCGGGTGTACA with count: 840
9- kmer: AAAAGACACCGGCGAGTGCTTACTAGTGGCC with count: 725
10- kmer: CCCAGGCATGGGCAGGTCTCGTGCAACTGAG with count: 398

6.2.13. Dump intersect_kFrame

[13]:
# List the result of kFrameIntersect; in the output below these are the kmers
# that occur in both kFrames_vec[0] and kFrames_vec[1].
dumpKmers(intersect_kFrame)
1- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 984
2- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 508
3- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 509
4- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 87
5- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 250
6- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 477
7- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 618
8- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 293
9- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 21
10- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 257

6.2.14. Dump union_kFrame

[14]:
# List the result of kFrameUnion; in the output below every kmer of either
# input appears once, and kmers shared by both show a doubled count.
dumpKmers(union_kFrame)
1- kmer: AAATACAGAAGAGTAGAAGTCCCTGGCGCAA with count: 649
2- kmer: ACTAGAGATTTATCCAAAGAGGATCCCCTGA with count: 685
3- kmer: ACGAGCTGCGCCTACGCACTGATATGCATGA with count: 965
4- kmer: CCACTCAGGGTCTGACCTAAGTGTTGTCTCG with count: 833
5- kmer: ATACGAGCATGTCTCATTTCACTTAAAGGGA with count: 477
6- kmer: AACATGACTTGCTTAATCATAGCCAAGTAAC with count: 969
7- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 1968
8- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 1016
9- kmer: TCTGGCGTTGTATATACTTGGTTCCGCCTAA with count: 563
10- kmer: GCTGGCGGTTTATTTGTTTTCTGAACTCCGC with count: 40
11- kmer: AGGAAAACTTAACATCAGTTCACCTTGATGG with count: 415
12- kmer: GCCGGTATACGGTACGCAGATGGCTACTACC with count: 60
13- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 1018
14- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 174
15- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 500
16- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 954
17- kmer: CTACGCGAAATCGCTACTTATGCGCTGAGAA with count: 470
18- kmer: ACCGTCTCAAATGTTGGAGCTCTCCTCGGTG with count: 38
19- kmer: AAGACTCCTATCCGTACATGAGTTGAAGGTC with count: 850
20- kmer: CAAACCGAAGAAGCATTACATCTCGCTATAA with count: 168
21- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 1236
22- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 586
23- kmer: TAATTTCAAGAATGGCTAAGAGCGTACAGGC with count: 235
24- kmer: TCATGAAGTACCAGGTTGTCCCGGGTGTACA with count: 840
25- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 42
26- kmer: ACCCCCGTTGACCGTGCAGAGGGGCAAATGG with count: 269
27- kmer: AAAAGACACCGGCGAGTGCTTACTAGTGGCC with count: 725
28- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 514
29- kmer: AAGGGTTGTACGCTGTGTCATACTATGTCAG with count: 83
30- kmer: CCCAGGCATGGGCAGGTCTCGTGCAACTGAG with count: 398

6.2.15. Complete Script

import kProcessor as kp
import random

def generate_kmers(kSize, kmers_no):
    """Build a list of random (kmer, count) tuples.

    Each kmer is a string of ``kSize`` characters picked from 'ACGT' and
    each count is a random integer between 1 and 1000 (both inclusive).
    The returned list holds ``kmers_no`` such tuples.
    """
    def random_pair():
        # Draw the sequence first, then its count.
        sequence = "".join(random.choice('ACGT') for _ in range(kSize))
        return sequence, random.randint(1, 1000)

    return [random_pair() for _ in range(kmers_no)]

def dumpKmers(kFrame):
    """Print each kmer stored in ``kFrame`` alongside its count.

    Starts from ``kFrame.begin()`` and keeps advancing the iterator for as
    long as it differs from ``kFrame.end()``, printing one line per entry.
    Lines are numbered starting at 1.
    """
    cursor = kFrame.begin()
    position = 1
    while cursor != kFrame.end():
        # Read the kmer string first and its count second, then report both.
        print("%d- kmer: %s with count: %d"
              % (position, cursor.getKmer(), cursor.getCount()))
        cursor.next()
        position += 1

# Length of every kmer; shared by both kDataFrames and the random kmers.
kSize = 31

# Two separate kDataFrameMQF objects built with the same kmer size.
kFrames_vec = [kp.kDataFrameMQF(kSize) for _ in range(2)]

kmers_list1 = generate_kmers(kSize, kmers_no=20)
kmers_list2 = generate_kmers(kSize, kmers_no=10)

# Append the first 10 pairs of kmers_list1 so both lists share some kmers.
kmers_list2.extend(kmers_list1[:10])

# Fill the first kDataFrame from kmers_list1.
for sequence, count in kmers_list1:
    kFrames_vec[0].insert(sequence, count)

# Fill the second kDataFrame from kmers_list2.
for sequence, count in kmers_list2:
    kFrames_vec[1].insert(sequence, count)

# Apply kFrameIntersect
intersect_kFrame = kp.kFrameIntersect(kFrames_vec)

# Apply kFrameDiff
diff_kFrame = kp.kFrameDiff(kFrames_vec)

# Apply kFrameUnion
union_kFrame = kp.kFrameUnion(kFrames_vec)

# Dump the two inputs first, then the diff, intersect and union results.
frames_to_dump = (
    kFrames_vec[0],
    kFrames_vec[1],
    diff_kFrame,
    intersect_kFrame,
    union_kFrame,
)
for frame in frames_to_dump:
    dumpKmers(frame)