生物信息006-Illumina质量体系Phred33 与 phred 64转换及fastq文件格式

使用python后跟着-c这个命令来运行字符串

/usr/bin/python -c 'print "Hello World"'
Hello World

字符在ASCII中对应的数值

/usr/bin/python -c 'print ord("A")'
65

整数的在ASCII中对应的字符

/usr/bin/python -c "print chr(65)"
A

找到字符I对应的Illumina 1.5 (+64)编码质量值

/usr/bin/python -c 'print ord("I") - 64'
9

找到40这个值对应的Illumina 1.5 (+64)编码质量字符

/usr/bin/python -c 'print chr(40 + 64)'
h

phred质量体系

不同版本质量得分与质量字符ASCII值的关系

这里写图片描述
ASCII值小于等于58(相应的质量得分小于等于25)对应的字符只有在Phred+33的编码中被使用,所有Phred+64所使用的字符的ASCII值都大于等于59。在通常情况下,ASCII值大于等于74的字符只出现在Phred+64中。利用这些信息即可在程序中进行判断

python脚本判断编码及相互转换


import time
import argparse

def sign(func):
    """作者签名"""
    def warpper(*args, **kwargs):
        name = "The author\'s Sunchengquan"
        print(name.center(50, "#"))
        courrent_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        print(courrent_time.center(50, "#")+'\n')
        func(*args, **kwargs)
    return warpper
def timer(func):
    def wrapper(*args, **kwargs):
        start= time.time()
        func(*args, **kwargs)
        print(str("program consume:%.2fs" %(time.time()-start)).center(50, "#"))
    return wrapper

def fq_Phred_check(inputfile):
    '判断fastq序列编码是Phred33(Illumina1.8+) or Phred64(Illumina1.3+)'
    fastq_file = open(inputfile,'r',encoding='utf-8')
    i = 1
    ASCII_list = []
    for line in fastq_file:
        if (i % 4 == 0) & (i <= 10):
            line_strip = line.strip()
            for each in line_strip:
                ASCII_each = ord(each)
                if ASCII_each >= 75:
                    ASCII_list.append(ASCII_each)
        i = i + 1
    fastq_file.close()
    if len(ASCII_list) == 0:
        phred_value = 33
        print('The fastq Phred value should be Phred 33'.center(50, "#"))
    else:
        phred_value = 64
        print('The fastq Phred value should be Phred 64'.center(50, "#"))

    return phred_value


def fq_Phred_change(inputfile):
    'fastq Phred33 与 phred 64 之间转换'
    fastq_file = open(inputfile,'r',encoding='utf-8')
    fq_phred = fq_Phred_check(inputfile)
    if int(fq_phred) == 64:
        out_file_name = inputfile.split('.')[0] + '_phred' + str(fq_phred-31)+'.fastq'
    else:
        out_file_name = inputfile.split('.')[0] + '_phred' + str(fq_phred +31) + '.fastq'
    outfile = open(out_file_name,'w',encoding='utf-8')
    i = 1
    if int(fq_phred) == 64:
        for line in fastq_file:
            line_strip = line.strip()
            if i % 4 == 0:
                temp = []
                for each in line_strip:
                    phred_value = ord(each) - 64 + 33
                    print(phred_value)
                    each_phred = str(chr(phred_value))
                    temp.append(each_phred)
                print(temp)
                Phred33_value = ''.join(temp)
                outfile.write(Phred33_value + '\n')
            else:
                outfile.write(line_strip + '\n')
            i = i + 1
        print("phred64 transform Phred33 success".center(50, "#"))
    elif int(fq_phred) == 33:
        for line in fastq_file:
            line_strip = line.strip()
            if i % 4 == 0:
                temp = []
                for each in line_strip:
                    phred_value = ord(each) - 33 + 64
                    each_phred = chr(phred_value)
                    temp.append(each_phred)

                Phred33_value = ''.join(temp)
                outfile.write(Phred33_value + '\n')
            else:
                outfile.write(line_strip + '\n')
            i = i + 1
        print("phred33 transform Phred64 success".center(50, "#"))
    fastq_file.close()
    outfile.close()

@sign
@timer
def main():
    # parser=argparse.ArgumentParser(description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter,
    #                                epilog='author:\t{0}\nmail:\t{1}'.format(__author__,__mail__))
    # parser.add_argument("--input",required=True,help="inputfile")
    # argv=vars(parser.parse_args())
    # inputfile = argv["input"]
    inputfile = 'data/test1.fq'
    fq_Phred_change(inputfile)
if __name__ == '__main__':
    main()

python脚本fastq转换fasta格式

def fastq_fasta(inputfile):
    '将fastq转换成fasta序列格式'
    fastq_file = open(inputfile,'r',encoding='utf-8')
    out_file_name = inputfile.split('.')[0] + '.fasta'
    output_fasta = open(out_file_name,'w',encoding='utf-8')
    i = 0
    for line in fastq_file:
        if i % 4 == 0:
            line_new = line.strip().split('\n')[0].replace('@','>')
            output_fasta.write(line_new + '\n')
        elif (i - 1) % 4 == 0:
            output_fasta.write(line)
        i = i + 1
    print("fastq transform fasta success".center(50, "#"))
    fastq_file.close()
    output_fasta.close()

 def main():
    inputfile = 'data/test1.fq'
    fastq_fasta(inputfile)

if __name__ == '__main__':
    main()

从这个课程的网址中获取一个测试数据集

curl -O http://personal.psu.edu/iua1/courses/files/2014/illumina-data.fq.gz
你可以解压这个gzip文件 illumina-data.fq.gz ,也可以不解压,并用zcat (Linux)来打开。当你有很多的数据集时,牺牲读取速度来节省存储也是值得的

time zcat illumina-data.fq.gz > tmp

解压数据,同时保留原压缩文件
gunzip -c illumina-data.fq.gz > illumina-data.fq
time cat illumina-data.fq > tmp

查看FASTQ记录

zcat illumina-data.fq.gz | head -4
@HWI-ST1342:96:H0NP9ADXX:2:1115:13393:59201
CCCATCTTAATCGATCATCAGCTCGAAGTGCACCGATCCGAGTCGCTCGTGTCTGTCGTTATCGTGCCGGACGTTGTAGGCCCAAGCTTTGCACTCAACGTTGATGATGATTCCACGGACTGGGCGCTCGAAGTGTACCGCAACCAGCGG
+
CCCFFFFFHHHGHJIJIIJIJJJJJJIIFGIJJJIJJIJGIJIGEGGIIAEHHHFFFFFDACCDDDDDDDB>B@BD?CDDCDDDDDDDDCDDDDDDACDBBABDDDEEDEEDEEEDDDDDDDDDDDDDDDDDDBCCDEDDDDDDDDDDDD

简单粗暴地检查一下质量,在前10,000行里是否有连续的###字符

zcat illumina-data.fq.gz | head -10000 | grep '###' | wc -l

在数据里搜索pattern并着色

zcat illumina-data.fq.gz | head -10000 | grep 'ATG' --color=always | head

这里写图片描述

python提取fastq文件中的序列

"""
写程序 grepFastq.py,
提取fastq.name中名字对应的test1.fq的序列。
fastq.name
HWI-ST1223:80:D1FMTACXX:2:1101:1243:2213 1:N:0:AGTCAA
"""


seqFile = "data/test1.fq"
nameFile = "data/fastq.name"

aDict = {}
count = 0
for line in open(seqFile):
    if count % 4 == 0:
        name = line.strip()[1:] 
        aDict[name] = [line]
    else:
        aDict[name].append(line)
   count += 1


for line in open(nameFile):
    name = line.strip()
    print(''.join(aDict[name]))

@HWI-ST1223:80:D1FMTACXX:2:1101:1243:2213 1:N:0:AGTCAA
TCTGTGTAGCCNTGGCTGTCCTGGAACTCACTTTGTAGACCAGGCTGGCATGCACCACCACNNNCGGCTCATTTGTCTTTNNTTTTTGTTTTGTTCTGTA
+
BCCFFFFFFHH#4AFHIJJJJJJJJJJJJJJJJJIJIJJJJJGHIJJJJJJJJJJJJJIIJ###--5ABECFFDDEEEEE##,5=@B8?CDD<AD>C:@>

猜你喜欢

转载自blog.csdn.net/sunchengquan/article/details/79792959
今日推荐