一款借助欧拉回路算法,对二代测序仪打碎后产生的序列片段进行拼装的软件

DNA二代测序机器会生成很多碎片序列,如何将其快速拼装是值得研究的问题。

本软件借鉴欧拉回路算法。该算法最初是为了解决七桥问题而提出的,即不重复地走遍全部7座桥,每座桥恰好经过一次。

(原文此处应为七桥问题的示意图,图片未随文本保留。)

拼接DNA时,将图中的A、B、C、D等节点替换为DNA短序列(每个k-mer的前缀和后缀),每个k-mer则对应连接其前缀与后缀的一条边;把所有的边各走一遍,即可得到拼接完成的序列。

上代码:

# Startup banner printed before any menu is shown.
print('程序开始运行,使用欧拉回路算法组装测序后的碎片序列')

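# coding: utf-8
# NOTE: an encoding declaration is only honoured on line 1 or 2 of a file (PEP 263); Python 3 already defaults to UTF-8.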
import time # Counting running time
import re
import random

def startPosition(graphs):
    """Choose the vertices from which a path search should be started.

    ``graphs`` maps every vertex to the list of its successors.  A vertex is
    selected when the number of edges leaving it plus the number of edges
    entering it is odd.

    * No odd vertex: every vertex of ``graphs`` is returned.
    * Exactly one odd vertex: vertices whose name consists of a single
      repeated character (e.g. ``'GG'``) are appended after it.
    * Otherwise: the odd vertices, in the iteration order of ``graphs``.
    """
    # Tally how often each vertex occurs as a successor (its in-degree).
    incoming = {}
    for successors in graphs.values():
        for target in successors:
            incoming[target] = incoming.get(target, 0) + 1

    start = [
        node
        for node in graphs
        if (len(graphs[node]) + incoming.get(node, 0)) % 2 == 1
    ]

    if not start:
        return list(graphs.keys())
    if len(start) == 1:
        start.extend(node for node in graphs if len(set(node)) == 1)
    return start


def edges(length, string):
    """Split ``string`` into its overlapping substrings (k-mers) of size ``length``.

    One window is taken at every offset, left to right.  The result is an
    empty list when ``string`` is shorter than ``length``.
    """
    window_count = len(string) - length + 1
    return [string[offset:offset + length] for offset in range(window_count)]




def randomSequenceGenerator(length):
    """Return a random string of ``length`` characters drawn from A, C, T and G.

    One ``random.randint(0, 3)`` draw is made per character.
    """
    alphabet = ('A', 'C', 'T', 'G')
    return ''.join(alphabet[random.randint(0, 3)] for _ in range(length))


	
# Default fragment list: used unless the menu below replaces it.
a = ['ATG','GGG','GGT','GTA','GTG','TAT','TGG']    # sample fragments


# Alternative inputs kept for manual experiments (uncomment one of them):
#a=['ATG','TGG','TGC','GTG','GGC','GCA','GCG','CGT'] 
#a=edges(3,'ATTGCGGAGTGACGATG')
#a=edges(3,'AAAAGGGCAAGCGTACGATGGGCCATGCCCGGAGCGGGCCCAAGGGCCCGTGCAATTGCGGAGTGACGATG')
# NOTE(review): the line below would bind `a` to a plain string rather than a
# list of fragments; it presumably needs to be wrapped in edges(3, ...).
#a=randomSequenceGenerator(100)  

# Interactive menu: decides where the fragment list `a` comes from.
#   1 -> the built-in sample fragments
#   2 -> a DNA sequence typed by the user, cut into 3-mers
#   3 -> a random DNA sequence of user-chosen length, cut into 3-mers
# An invalid menu choice re-prompts; an invalid sequence or length keeps the
# fragments already held in `a` (the message tells the user so).
#
# Only ValueError (a non-numeric entry) is handled, so unrelated failures such
# as EOF on stdin surface instead of re-prompting forever.
choosing = True  # formerly named `next`, which shadowed the builtin
while choosing:
    print('程序菜单')
    print('1.是否使用默认DNA序列碎片:', a)
    print('2.输入你想打碎的DNA序列')
    print('3.随机生成DNA序列并打碎')
    try:
        answer = int(input('请输入数字选择菜单选项,并按回车键执行'))
    except ValueError:
        answer = None

    if answer not in (1, 2, 3):
        print()
        print('对不起,程序无法执行,请输入有效菜单选项数字')
        print()
        continue

    choosing = False
    if answer == 1:
        a = ['ATG', 'GGG', 'GGT', 'GTA', 'GTG', 'TAT', 'TGG']
    elif answer == 2:
        seq = input('请输入你想打碎的序列').upper()
        print(seq)
        # The old pattern [^A^T^C^G] also accepted '^' as a base, and inputs
        # shorter than one 3-mer produced an empty fragment list that crashed
        # the search further down.
        if len(seq) >= 3 and re.fullmatch(r'[ATCG]+', seq):
            a = edges(3, seq)
        else:
            print('请输入有效的DNA序列')
            print('输入有误,并不是有效的DNA序列,将使用默认序列碎片', a)
    else:
        try:
            length = int(input('请输入你希望的DNA序列长度,请输入3-69之间的有效数字'))
        except ValueError:
            length = None
        if length is not None and 3 <= length <= 69:
            seq = randomSequenceGenerator(length)
            print()
            print('随机生成的DNA序列为', seq)
            a = edges(3, seq)
        else:
            print('请输入3-69之间的有效数字,输入有误,使用默认序列碎片', a)
	
def vertices(length, lis):
    """Collect the distinct graph vertices implied by a list of k-mers.

    Every k-mer in ``lis`` contributes its first ``length`` characters (the
    vertex its edge leaves) and its last ``length`` characters (the vertex
    its edge enters).  ``length`` is expected to be at least 1.

    Returns the unique vertices as a list; the order is unspecified because
    the values pass through a set.
    """
    nodes = set()
    for kmer in lis:
        nodes.add(kmer[:length])
        # Slice from the end: the former kmer[length-1:] only produced a
        # `length`-character suffix when the k-mer had 2*length-1 characters
        # (true for 3-mers with length=2, wrong for any other k).
        nodes.add(kmer[-length:])
    return list(nodes)

# Graph vertices: the distinct 2-character pieces produced by vertices() from the fragments in `a`.
b = vertices(2,a)

def merge(str1, str2):
    """Join two overlapping vertices into the edge label that connects them.

    The vertices overlap when ``str1`` without its first character equals the
    leading ``len(str1) - 1`` characters of ``str2``.

    Returns ``str1`` extended by the last character of ``str2`` when they
    overlap, otherwise the empty string.  An empty ``str2`` never merges.
    """
    if not str2:
        # Nothing to append; previously merge('', '') raised IndexError.
        return ''
    # str1[1:] rather than str1[-len(str1)+1:]: for a one-character str1 the
    # negative form degenerates to str1[0:] (the whole string) and wrongly
    # rejected the zero-length overlap.
    if str1[1:] == str2[:len(str1) - 1]:
        return str1 + str2[-1]
    return ''

def graph(vertice, edge):
    """Build the adjacency map of the fragment graph.

    Parameters:
        vertice: iterable of vertex labels.
        edge: list of edge labels (k-mers).  It is consumed IN PLACE: every
            edge that gets attached to the graph is removed from this list,
            so pass a copy if the caller still needs it.

    Returns:
        dict mapping each vertex to the list of its successors.  An edge
        that occurs several times in ``edge`` yields the same successor
        several times.
    """
    dic = {v: [] for v in vertice}

    # Each pass attaches at most one copy of every distinct edge, so repeated
    # edges (e.g. from 'AAAAAA') need one pass per copy.  Keep going until a
    # pass attaches nothing; the former fixed limit of 4 passes silently
    # dropped edges that occurred more than 4 times.
    attached = True
    while attached and edge:
        attached = False
        for k in vertice:
            for j in vertice:
                merged = merge(k, j)
                if merged in edge:
                    dic[k].append(j)
                    edge.remove(merged)
                    attached = True

    return dic
# Build the adjacency map (graph() removes edges from the list it is given,
# hence the copy of `a`) and derive the candidate start vertices from it.
s=graph(b,a[:])
start=startPosition(s)

def findPath(vertice, edge, graphs, start):
    """Walk the graph at random from ``start`` and spell the sequence visited.

    Parameters:
        vertice: unused; kept so existing callers keep working.
        edge: edge list; only its length is used, as the maximum number of
            steps.
        graphs: adjacency map (vertex -> list of successors).  It is
            consumed IN PLACE: every edge walked is removed.
        start: vertex at which the walk begins.

    At each step a self-loop on the current vertex is taken first if one
    exists; otherwise a successor is picked uniformly at random.  The walk
    stops early at a vertex with no remaining successors.

    Returns:
        The first edge label followed by the last character of every later
        edge label, or ``''`` when no edge could be walked.
    """
    path = []

    for _ in range(len(edge)):
        successors = graphs[start]
        if start in successors:
            # Use up self-loops before leaving the vertex.
            path.append(start + start[-1])
            successors.remove(start)
        elif not successors:
            break  # dead end
        else:
            nextVertice = successors[random.randint(0, len(successors) - 1)]
            path.append(merge(start, nextVertice))
            successors.remove(nextVertice)
            start = nextVertice

    if not path:
        return ''
    return path[0] + ''.join(word[-1] for word in path[1:])

lis = []  # distinct complete assemblies, in the order they were first found


start_time = time.time()

loops = 3000  # number of random walks; raise it when the input gets very long
if len(start) == 2:
    # Exactly two candidate start vertices: walk only from those.
    print()
    print('正在进行运算')
    print('当前序列碎片为', '\n', a)
    candidates = start
    attempts = loops
else:
    # Otherwise try walks from any vertex.  The number of attempts matters:
    # too few and some assemblies are never discovered.
    candidates = b
    attempts = 1500

# The graph is identical for every attempt, so build it once and hand each
# walk its own copy (findPath consumes the adjacency lists it is given).
template = graph(b, a[:])
# A complete walk uses every fragment once: n fragments of 3 characters that
# overlap by 2 spell n + 2 characters.
target_length = len(a) + 2
seen = set()  # O(1) duplicate check; `lis` keeps the discovery order

if candidates:  # with no vertices, random.randint(0, -1) would raise ValueError
    for i in range(attempts):
        k = random.randint(0, len(candidates) - 1)
        s = {vertex: successors[:] for vertex, successors in template.items()}
        q = findPath(b, a, s, candidates[k])
        if len(q) == target_length and q not in seen:
            seen.add(q)
            lis.append(q)

print()
print('由序列碎片组装后所有可能的序列:', len(lis), '个')
print()
for i in lis:
    print(i)
print('运行结束')
print('运行时间: ', round(time.time() - start_time, 2), '秒')


也可以使用 `python3 -m pip install eulerdg` 安装本软件

发布了8 篇原创文章 · 获赞 2 · 访问量 2036

猜你喜欢

转载自blog.csdn.net/a_giant_pig/article/details/101204073