6. Recipe 6: Set functions¶
Note:
Works only on kDataFrameMQF
6.1. Description¶
- Create two kDataFrames with the same kmer size and fill them with random kmers
- Apply the set functions (diff, intersect, union) and dump the resulting kDataFrames
6.2. Implementation¶
6.2.1. Importing¶
[1]:
import kProcessor as kp
import random
6.2.2. Function to generate random kmers with random counts¶
[2]:
def generate_kmers(kSize, kmers_no):
    """Build a list of random kmers, each paired with a random count.

    Args:
        kSize: Number of bases in every generated kmer.
        kmers_no: How many (kmer, count) pairs to generate.

    Returns:
        A list of ``kmers_no`` tuples ``(kmer, count)``, where ``kmer`` is a
        string of ``kSize`` characters drawn from ``ACGT`` and ``count`` is
        an integer between 1 and 1000 inclusive. Kmers are drawn
        independently, so duplicates are not ruled out.
    """
    # The tuple is evaluated left to right, so the bases of each kmer are
    # drawn before its count.
    return [
        (
            "".join(random.choice('ACGT') for _ in range(kSize)),
            random.randint(1, 1000),
        )
        for _ in range(kmers_no)
    ]
6.2.3. Function to dump the kmers of a kDataFrame¶
[3]:
def dumpKmers(kFrame):
    """Print every kmer held by ``kFrame`` together with its count.

    Starts at ``kFrame.begin()`` and keeps advancing for as long as the
    iterator differs from ``kFrame.end()``, printing one numbered line per
    kmer.

    Args:
        kFrame: A kDataFrame exposing ``begin()`` and ``end()``; the iterator
            they return must provide ``getKmer()``, ``getCount()`` and
            ``next()``.
    """
    it = kFrame.begin()
    counter = 1  # 1-based position printed in front of each kmer
    while it != kFrame.end():
        # Get the kmer string
        kmer = it.getKmer()
        # Get the kmer count
        count = it.getCount()
        # Print the data
        print("%d- kmer: %s with count: %d" % (counter, kmer, count))
        counter += 1
        # Move the iterator on to the next kmer
        it.next()
6.2.4. Determining kmer size¶
[4]:
# kmer length used for the random kmers and for both kDataFrames
kSize = 31
6.2.5. Create list of kDataFrames¶
[5]:
# Two kDataFrameMQF instances, both constructed with the same kmer size
kFrames_vec = [kp.kDataFrameMQF(kSize), kp.kDataFrameMQF(kSize)]
6.2.6. Create random kmers list¶
[6]:
# 20 random (kmer, count) pairs for the first kDataFrame
kmers_list1 = generate_kmers(kSize, kmers_no=20)
# 10 random (kmer, count) pairs for the second kDataFrame
kmers_list2 = generate_kmers(kSize, kmers_no=10)
6.2.7. Replicate some kmers from kmers_list1 in kmers_list2 to make sure len(intersection) > 0¶
[7]:
# Append the first 10 pairs of kmers_list1 so that both lists share kmers
kmers_list2 += kmers_list1[0:10]
6.2.8. Inserting the kmers¶
[8]:
# Fill the first kDataFrame from kmers_list1 and the second from kmers_list2;
# every list item is a (kmer, count) tuple.
for kmer in kmers_list1:
    kFrames_vec[0].insert(kmer[0], kmer[1])
for kmer in kmers_list2:
    kFrames_vec[1].insert(kmer[0], kmer[1])
6.2.9. Apply set functions¶
[9]:
# Apply kFrameIntersect: in the recorded sample run the result holds the
# kmers found in both kDataFrames
intersect_kFrame = kp.kFrameIntersect(kFrames_vec)
# Apply kFrameDiff: in the recorded sample run the result holds the kmers of
# kFrames_vec[0] that are absent from kFrames_vec[1]
diff_kFrame = kp.kFrameDiff(kFrames_vec)
# Apply kFrameUnion: in the recorded sample run the result holds every kmer
# of both kDataFrames, and shared kmers show the sum of their two counts
# NOTE(review): semantics inferred from the sample output only - confirm
# against the kProcessor documentation
union_kFrame = kp.kFrameUnion(kFrames_vec)
6.2.10. Dumping kmers of kFrames_vec[0]¶
[10]:
dumpKmers(kFrames_vec[0])
1- kmer: ATACGAGCATGTCTCATTTCACTTAAAGGGA with count: 477
2- kmer: AACATGACTTGCTTAATCATAGCCAAGTAAC with count: 969
3- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 984
4- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 508
5- kmer: TCTGGCGTTGTATATACTTGGTTCCGCCTAA with count: 563
6- kmer: GCCGGTATACGGTACGCAGATGGCTACTACC with count: 60
7- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 509
8- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 87
9- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 250
10- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 477
11- kmer: ACCGTCTCAAATGTTGGAGCTCTCCTCGGTG with count: 38
12- kmer: CAAACCGAAGAAGCATTACATCTCGCTATAA with count: 168
13- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 618
14- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 293
15- kmer: TAATTTCAAGAATGGCTAAGAGCGTACAGGC with count: 235
16- kmer: TCATGAAGTACCAGGTTGTCCCGGGTGTACA with count: 840
17- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 21
18- kmer: AAAAGACACCGGCGAGTGCTTACTAGTGGCC with count: 725
19- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 257
20- kmer: CCCAGGCATGGGCAGGTCTCGTGCAACTGAG with count: 398
6.2.11. Dumping kmers of kFrames_vec[1]¶
[11]:
dumpKmers(kFrames_vec[1])
1- kmer: AAATACAGAAGAGTAGAAGTCCCTGGCGCAA with count: 649
2- kmer: ACTAGAGATTTATCCAAAGAGGATCCCCTGA with count: 685
3- kmer: ACGAGCTGCGCCTACGCACTGATATGCATGA with count: 965
4- kmer: CCACTCAGGGTCTGACCTAAGTGTTGTCTCG with count: 833
5- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 984
6- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 508
7- kmer: GCTGGCGGTTTATTTGTTTTCTGAACTCCGC with count: 40
8- kmer: AGGAAAACTTAACATCAGTTCACCTTGATGG with count: 415
9- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 509
10- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 87
11- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 250
12- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 477
13- kmer: CTACGCGAAATCGCTACTTATGCGCTGAGAA with count: 470
14- kmer: AAGACTCCTATCCGTACATGAGTTGAAGGTC with count: 850
15- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 618
16- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 293
17- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 21
18- kmer: ACCCCCGTTGACCGTGCAGAGGGGCAAATGG with count: 269
19- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 257
20- kmer: AAGGGTTGTACGCTGTGTCATACTATGTCAG with count: 83
6.2.12. Dump diff_kFrame¶
[12]:
dumpKmers(diff_kFrame)
1- kmer: ATACGAGCATGTCTCATTTCACTTAAAGGGA with count: 477
2- kmer: AACATGACTTGCTTAATCATAGCCAAGTAAC with count: 969
3- kmer: TCTGGCGTTGTATATACTTGGTTCCGCCTAA with count: 563
4- kmer: GCCGGTATACGGTACGCAGATGGCTACTACC with count: 60
5- kmer: ACCGTCTCAAATGTTGGAGCTCTCCTCGGTG with count: 38
6- kmer: CAAACCGAAGAAGCATTACATCTCGCTATAA with count: 168
7- kmer: TAATTTCAAGAATGGCTAAGAGCGTACAGGC with count: 235
8- kmer: TCATGAAGTACCAGGTTGTCCCGGGTGTACA with count: 840
9- kmer: AAAAGACACCGGCGAGTGCTTACTAGTGGCC with count: 725
10- kmer: CCCAGGCATGGGCAGGTCTCGTGCAACTGAG with count: 398
6.2.13. Dump intersect_kFrame¶
[13]:
dumpKmers(intersect_kFrame)
1- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 984
2- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 508
3- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 509
4- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 87
5- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 250
6- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 477
7- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 618
8- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 293
9- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 21
10- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 257
6.2.14. Dump union_kFrame¶
[14]:
dumpKmers(union_kFrame)
1- kmer: AAATACAGAAGAGTAGAAGTCCCTGGCGCAA with count: 649
2- kmer: ACTAGAGATTTATCCAAAGAGGATCCCCTGA with count: 685
3- kmer: ACGAGCTGCGCCTACGCACTGATATGCATGA with count: 965
4- kmer: CCACTCAGGGTCTGACCTAAGTGTTGTCTCG with count: 833
5- kmer: ATACGAGCATGTCTCATTTCACTTAAAGGGA with count: 477
6- kmer: AACATGACTTGCTTAATCATAGCCAAGTAAC with count: 969
7- kmer: GGCATGCTTCCTACGAAACGGAGAGGTTACC with count: 1968
8- kmer: ACGATAAACAGCTCTGTACGTTAACTGTCAC with count: 1016
9- kmer: TCTGGCGTTGTATATACTTGGTTCCGCCTAA with count: 563
10- kmer: GCTGGCGGTTTATTTGTTTTCTGAACTCCGC with count: 40
11- kmer: AGGAAAACTTAACATCAGTTCACCTTGATGG with count: 415
12- kmer: GCCGGTATACGGTACGCAGATGGCTACTACC with count: 60
13- kmer: CTCAGACGCGCTAAAATTTTGCTGTCATACC with count: 1018
14- kmer: CCAGGGATAGAAATGCAGCATCTGTCTTCAA with count: 174
15- kmer: ACTGAAATGTAAACCGGGGACTGGTGAGACC with count: 500
16- kmer: AGCTCGCCTTTACGCGAATTCCACAGCGGAC with count: 954
17- kmer: CTACGCGAAATCGCTACTTATGCGCTGAGAA with count: 470
18- kmer: ACCGTCTCAAATGTTGGAGCTCTCCTCGGTG with count: 38
19- kmer: AAGACTCCTATCCGTACATGAGTTGAAGGTC with count: 850
20- kmer: CAAACCGAAGAAGCATTACATCTCGCTATAA with count: 168
21- kmer: ACCGTCTCTTCAGCGCGCAAAAAGAAACGCT with count: 1236
22- kmer: AGTGATGGTGCTCCACGATTACGAGCACGGG with count: 586
23- kmer: TAATTTCAAGAATGGCTAAGAGCGTACAGGC with count: 235
24- kmer: TCATGAAGTACCAGGTTGTCCCGGGTGTACA with count: 840
25- kmer: CCTCAGACGACATCAGTATGGCGGATGTACC with count: 42
26- kmer: ACCCCCGTTGACCGTGCAGAGGGGCAAATGG with count: 269
27- kmer: AAAAGACACCGGCGAGTGCTTACTAGTGGCC with count: 725
28- kmer: TACGGGGGATCTCAATGTAAGTAGACGCGCA with count: 514
29- kmer: AAGGGTTGTACGCTGTGTCATACTATGTCAG with count: 83
30- kmer: CCCAGGCATGGGCAGGTCTCGTGCAACTGAG with count: 398
6.2.15. Complete Script¶
import kProcessor as kp
import random
def generate_kmers(kSize, kmers_no):
    """Build a list of random kmers, each paired with a random count.

    Args:
        kSize: Number of bases in every generated kmer.
        kmers_no: How many (kmer, count) pairs to generate.

    Returns:
        A list of ``kmers_no`` tuples ``(kmer, count)``, where ``kmer`` is a
        string of ``kSize`` characters drawn from ``ACGT`` and ``count`` is
        an integer between 1 and 1000 inclusive. Kmers are drawn
        independently, so duplicates are not ruled out.
    """
    # The tuple is evaluated left to right, so the bases of each kmer are
    # drawn before its count.
    return [
        (
            "".join(random.choice('ACGT') for _ in range(kSize)),
            random.randint(1, 1000),
        )
        for _ in range(kmers_no)
    ]
def dumpKmers(kFrame):
    """Print every kmer held by ``kFrame`` together with its count.

    Starts at ``kFrame.begin()`` and keeps advancing for as long as the
    iterator differs from ``kFrame.end()``, printing one numbered line per
    kmer.

    Args:
        kFrame: A kDataFrame exposing ``begin()`` and ``end()``; the iterator
            they return must provide ``getKmer()``, ``getCount()`` and
            ``next()``.
    """
    it = kFrame.begin()
    counter = 1  # 1-based position printed in front of each kmer
    while it != kFrame.end():
        # Get the kmer string
        kmer = it.getKmer()
        # Get the kmer count
        count = it.getCount()
        # Print the data
        print("%d- kmer: %s with count: %d" % (counter, kmer, count))
        counter += 1
        # Move the iterator on to the next kmer
        it.next()
# kmer length used for the random kmers and for both kDataFrames
kSize = 31

# Two kDataFrameMQF instances, both constructed with the same kmer size
kFrames_vec = [kp.kDataFrameMQF(kSize), kp.kDataFrameMQF(kSize)]

# Random (kmer, count) pairs: 20 for the first kDataFrame, 10 for the second
kmers_list1 = generate_kmers(kSize, kmers_no=20)
kmers_list2 = generate_kmers(kSize, kmers_no=10)

# Append the first 10 pairs of kmers_list1 so that both lists share kmers
kmers_list2 += kmers_list1[0:10]

# Fill each kDataFrame from its list of (kmer, count) tuples
for kmer in kmers_list1:
    kFrames_vec[0].insert(kmer[0], kmer[1])
for kmer in kmers_list2:
    kFrames_vec[1].insert(kmer[0], kmer[1])

# Apply kFrameIntersect
intersect_kFrame = kp.kFrameIntersect(kFrames_vec)
# Apply kFrameDiff
diff_kFrame = kp.kFrameDiff(kFrames_vec)
# Apply kFrameUnion
union_kFrame = kp.kFrameUnion(kFrames_vec)

# Dump the two input kDataFrames, then the result of each set function
dumpKmers(kFrames_vec[0])
dumpKmers(kFrames_vec[1])
dumpKmers(diff_kFrame)
dumpKmers(intersect_kFrame)
dumpKmers(union_kFrame)