DNA二代测序机器会生成很多碎片序列,如何将其快速拼装是值得研究的问题。
本软件借鉴欧拉回路算法。欧拉算法最初是为了解决七桥问题而设计的,即不重复地走遍全部七座桥,每座桥恰好经过一次。
拼接DNA时,将图中的A、B、C、D等节点替换成长度为k-1的短序列,把每个kmer碎片看作连接其前缀节点和后缀节点的一条边;沿着所有的边各走一遍,即获得拼接完成的序列。
上代码:
# Startup banner, printed before any input is requested.
print('程序开始运行,使用欧拉回路算法组装测序后的碎片序列')
# NOTE: PEP 263 only honors a source-encoding declaration on the first or
# second line of a file, so the two declarations below have no effect at this
# position (Python 3 reads source as UTF-8 by default anyway).
#encoding: utf-8
# coding: utf8
import time # Counting running time
import re
import random
def startPosition(graphs):
    """Return the vertices from which a path search may start.

    ``graphs`` maps every vertex to the list of its successors.  A vertex is
    selected when its out-degree plus its in-degree is odd.

    * No odd vertex: every vertex of the graph is returned.
    * Exactly one odd vertex: every vertex made of a single repeated
      character (e.g. ``'AA'``) is appended as well, even if it is already
      in the result.
    * Otherwise: the odd vertices, in the iteration order of ``graphs``.
    """
    # Flatten all successor lists; counting a vertex in here gives its in-degree.
    incoming = [target for successors in graphs.values() for target in successors]

    odd = [
        vertex
        for vertex in graphs
        if (len(graphs[vertex]) + incoming.count(vertex)) % 2 == 1
    ]

    if not odd:
        return list(graphs.keys())
    if len(odd) == 1:
        odd.extend(vertex for vertex in graphs if len(set(vertex)) == 1)
    return odd
def edges(length,string):
    """Split ``string`` into its overlapping k-mers of size ``length``.

    The k-mers are returned in left-to-right order; the list is empty when
    ``string`` is shorter than ``length``.
    """
    last_start = len(string) - length
    return [string[pos:pos + length] for pos in range(last_start + 1)]
def randomSequenceGenerator(length):
    """Return a random DNA string of ``length`` bases.

    Each base is chosen independently from A, C, T and G with one
    ``random.randint(0, 3)`` draw.
    """
    alphabet = 'ACTG'
    return ''.join(alphabet[random.randint(0, 3)] for _ in range(length))
# Default demo fragments (3-mers); they stay in effect whenever the user's
# own input is rejected.
a = ['ATG','GGG','GGT','GTA','GTG','TAT','TGG']
# Alternative inputs for manual experiments:
#a=['ATG','TGG','TGC','GTG','GGC','GCA','GCG','CGT']
#a=edges(3,'ATTGCGGAGTGACGATG')
#a=edges(3,'AAAAGGGCAAGCGTACGATGGGCCATGCCCGGAGCGGGCCCAAGGGCCCGTGCAATTGCGGAGTGACGATG')
#a=edges(3,randomSequenceGenerator(100))

# Interactive menu.  The menu is shown again until a valid option (1-3) is
# entered.  Options 2 and 3 ask one follow-up question; a bad answer there
# does not re-prompt, it reports the problem and keeps the current fragments.
while True:
    print('程序菜单')
    print('1.是否使用默认DNA序列碎片:',a)
    print('2.输入你想打碎的DNA序列')
    print('3.随机生成DNA序列并打碎')
    # Only a non-numeric answer is treated as a bad choice.  EOFError is left
    # to propagate: swallowing it would spin forever once stdin is closed.
    try:
        answer = int(input('请输入数字选择菜单选项,并按回车键执行'))
    except ValueError:
        answer = None
    if answer not in (1, 2, 3):
        print()
        print('对不起,程序无法执行,请输入有效菜单选项数字')
        print()
        continue

    if answer == 1:
        a = ['ATG','GGG','GGT','GTA','GTG','TAT','TGG']
    elif answer == 2:
        seq = input('请输入你想打碎的序列').strip().upper()
        print(seq)
        # Accept only A/C/G/T and require at least one full 3-mer; shorter
        # input would give an empty fragment list that the search cannot use.
        if len(seq) >= 3 and re.fullmatch(r'[ACGT]+', seq):
            a = edges(3, seq)
        else:
            print('请输入有效的DNA序列')
            print('输入有误,并不是有效的DNA序列,将使用默认序列碎片',a)
    else:
        try:
            length = int(input('请输入你希望的DNA序列长度,请输入3-69之间的有效数字'))
        except ValueError:
            length = None
        if length is not None and 3 <= length <= 69:
            seq = randomSequenceGenerator(length)
            print()
            print('随机生成的DNA序列为',seq)
            a = edges(3, seq)
        else:
            print('请输入3-69之间的有效数字,输入有误,使用默认序列碎片',a)
    break
def vertices(length,lis):
    """Return the distinct graph vertices implied by a list of k-mers.

    Every k-mer in ``lis`` contributes two vertices: its first ``length``
    characters and its last ``length`` characters.  With ``length == k - 1``
    these are the two nodes joined by that k-mer.

    The suffix is taken from the end of the k-mer, so the function also works
    for k-mers longer than three bases.  For 3-mers with ``length == 2`` the
    result is the same set as slicing from index ``length - 1``.

    Returns a list without duplicates; its order is unspecified.
    """
    nodes = set()
    for kmer in lis:
        nodes.add(kmer[:length])
        # max(..., 0) keeps the slice start non-negative when the k-mer is
        # shorter than ``length`` (the whole k-mer is used in that case).
        nodes.add(kmer[max(len(kmer) - length, 0):])
    return list(nodes)
# Graph vertices: every distinct 2-base prefix and suffix of the fragments in `a`.
b = vertices(2,a)
def merge(str1,str2):
    """Join two overlapping vertices into the k-mer that links them.

    With ``n = len(str1)``, the last ``n - 1`` characters of ``str1`` are
    compared with the first ``n - 1`` characters of ``str2`` (intended for
    vertices of at least two characters).  On a match the result is ``str1``
    followed by the final character of ``str2``; otherwise it is ``''``.
    """
    size = len(str1)
    overlaps = str1[1 - size:] == str2[:size - 1]
    return str1 + str2[-1] if overlaps else ''
def graph(vertice,edge):
    """Build adjacency lists linking vertices through the given k-mers.

    For every ordered pair ``(k, j)`` of vertices whose ``merge(k, j)`` is
    present in ``edge``, ``j`` is appended to the successor list of ``k``
    once per occurrence of that k-mer.

    Parameters:
        vertice: iterable of vertex strings; each becomes a key of the result.
        edge: list of k-mers.  It is consumed in place: every k-mer that is
            placed in the graph is removed from it, so pass a copy if the
            caller still needs the list.

    Returns:
        dict mapping each vertex to the list of its successors.
    """
    dic = {node: [] for node in vertice}
    # merge() depends only on its two arguments, so compute each candidate
    # k-mer once rather than twice per pair on every pass.
    candidates = [(k, j, merge(k, j)) for k in vertice for j in vertice]

    # One pass places each k-mer at most once, so repeated k-mers (e.g. from
    # runs such as AAAAAA) need extra passes.  Keep going until a pass places
    # nothing, so no multiplicity is dropped.  Each continuing pass shrinks
    # ``edge``, which guarantees termination.
    placed = True
    while placed:
        placed = False
        for k, j, label in candidates:
            if label in edge:
                dic[k].append(j)
                edge.remove(label)
                placed = True
    return dic
# Adjacency lists for the fragment graph.  A copy of `a` is passed because
# graph() removes matched entries from the list it is given.
s=graph(b,a[:])
# Candidate vertices from which the path searches may begin.
start=startPosition(s)
def findPath(vertice,edge,graphs,start):
    """Take one random walk through the graph and return the spelled sequence.

    Starting at ``start``, the walk makes at most ``len(edge)`` steps.  At each
    step it takes a self-loop if the current vertex has one, stops if the
    vertex has no successors left, and otherwise follows a uniformly random
    successor.  Every traversed link is removed from ``graphs``, so the
    adjacency lists are consumed in place.

    Parameters:
        vertice: not used by the walk; kept so existing calls keep working.
        edge: list of k-mers; only its length is used, as the step limit.
        graphs: dict mapping each vertex to a mutable list of successors.
        start: vertex at which the walk begins (must be a key of ``graphs``).

    Returns:
        The first traversed k-mer followed by the last character of every
        later one, or ``''`` when no step could be taken.  The walk may stop
        early, so callers must check the length of the result.
    """
    import random

    path = []
    for _ in range(len(edge)):
        successors = graphs[start]
        if start in successors:
            # Self-loop: the k-mer is the vertex plus its own last character.
            path.append(start + start[-1])
            successors.remove(start)
        elif not successors:
            break
        else:
            nextVertice = successors[random.randint(0, len(successors) - 1)]
            path.append(merge(start, nextVertice))
            successors.remove(nextVertice)
            start = nextVertice

    if not path:
        return ''
    return path[0] + ''.join(word[-1] for word in path[1:])
# Randomized search: repeat random walks and keep every distinct walk that
# uses all fragments.
lis = []
start_time = time.time()
loops = 3000  # walks tried when exactly two start candidates exist; raise for very large inputs
fallback_loops = 1500  # walks tried otherwise; too few and some assemblies are missed

if len(start) == 2:
    print()
    print('正在进行运算')
    print('当前序列碎片为','\n',a)
    candidates, attempts = start, loops
else:
    # No usable pair of endpoints: let walks begin at any vertex.
    candidates, attempts = b, fallback_loops

# With no vertices at all there is nothing to walk (and randint(0, -1)
# would raise), so the search is skipped and zero results are reported.
if candidates:
    # The graph is the same for every attempt; build it once and give each
    # walk a fresh copy, because findPath() consumes the adjacency lists.
    base_graph = graph(b, a[:])
    # A complete assembly of n overlapping 3-mers is n + 2 bases long.
    target_len = len(a) + 2
    seen = set()
    for i in range(attempts):
        k = random.randint(0, len(candidates) - 1)
        s = {node: list(successors) for node, successors in base_graph.items()}
        q = findPath(b, a, s, candidates[k])
        if len(q) == target_len and q not in seen:
            seen.add(q)
            lis.append(q)

print()
print('由序列碎片组装后所有可能的序列:',len(lis),'个')
print()
for i in lis:
    print(i)
print('运行结束')
print('运行时间: ',round(time.time()-start_time,2),'秒')
也可以使用 python3 -m pip install eulerdg 安装本软件