使用Diamond比对NR数据库获取物种注释

之前用Kraken2注释宏基因组的contig,发现只有30%左右可以被Kraken2注释

Kraken2+Bracken:宏基因组物种注释-CSDN博客

不信邪,再用NR库试试

参考:

将NR数据库diamond比对结果做物种注释_diamond 物种注释-CSDN博客

NR下载

nohup wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz &wget -c https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5#检查下载的完整性(一样的话就是完整了)md5sum nr.gz### 899ac219cac213c60fede9c3d9ef8f7bnr.gzcat nr.gz.md5 ### 899ac219cac213c60fede9c3d9ef8f7bnr.gznohup gunzipnr.gz&mv nrnr.fa###下载NR物种相关信息和taxid信息wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gzwget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gzwget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5#检查完整性md5sum prot.accession2taxid.gz#32b8ca7ea712e161c72af69135fc938ecat prot.accession2taxid.gz.md5 #32b8ca7ea712e161c72af69135fc938emd5sum taxdump.tar.gz #26558b800bc1b3795c25d1f0ead65412cat taxdump.tar.gz.md5#26558b800bc1b3795c25d1f0ead65412#解压tar -zxvf taxdump.tar.gzgunzip prot.accession2taxid.gz#移动需要的文件cd ~mkdir .taxonkitcp *.dmp ~/.taxonkit

必要软件下载

conda create -n NR_database_searchconda activate NR_database_searchconda install -c bioconda taxonkit=0.15.0conda install -c bioconda diamond=2.1.8conda install -c bioconda csvtk=0.28.0

diamond建库

nohup diamond makedb --threads 180 --in nr.fa --db NR_2023_07_23 &

diamond比对

diamond blastx --db NR_2023_07_23.dmnd --query nucleic_reads.fna\-o nucleic_matches_fmt6.txt --threads 180 --evalue 0.00001 \--max-target-seqs 5 --outfmt 6diamond blastp --db NR_2023_07_23.dmnd --query protein_reads.fna\-o protein_matches_fmt6.txt --threads 180 --evalue 0.00001 \--max-target-seqs 5 --outfmt 6## --outfmt 6 最好别改变

这些参数可以调整diamond比对的速度or准确性

图片[1] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

这几个参数可以调整比对的coverage,identity,score

图片[2] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

结果如下 !!!(这个表头后面python会加)

图片[3] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

taxonkit获得物种分类信息表

感谢大佬:一文完成nt库序列快速下载及blast结果注释物种 (qq.com)

得到seqid注释之后,可以搜索注释

## 一些主要的物种编号# 2 bacteria# 2157archaea# 4751fungi# 10239 virus# 2759 Eukaryota#看taxnokit安好了么taxonkit -h#创建目录cd /home/zhongpei/database/NR_2023_07_23mkdir /home/zhongpei/database/NR_2023_07_23/NCBI_Main_taxcd NCBI_Main_tax#开始taxonkit list -j 4 --ids 2,2157,4751,10239,2759 --indent "" > NCBI_Main.taxid.txt# -j 是线程,软件说4个够了;--ids 是需要的物种编号,用逗号分隔wc -l NCBI_Main.taxid.txt # 2708739 NCBI_Main.taxid.txthead -n 5 NCBI_Main.taxid.txt #查看内容# 提取taxid和taxonomy(界门纲目科属种)的对应信息到NCBI_Main.taxid.txtless NCBI_Main.taxid.txt | taxonkit reformat -I 1 -r Unassigned -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}"| sed '1i\Taxid\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\tStrain' > NCBI_Main.taxid_new.txt# -I 1 一个制表符分隔;-r 没有找到的用什么字符去填充,这里用的“Unassigned” # -f 输出的格式;1i表示在第行之前插入文本(sed用法,不太会)

完成!

图片[4] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

vim NCBI_Main.taxid_new.txt

把第一个Taxid改成小写

seqid和taxid的对应

还记不记得第一步下载过一个 “prot.accession2taxid” ,现在要派上用场了

其实python也能做,csvtk太不熟悉了,先来学习一下吧,感觉还挺方便(这一步比较慢)

cat prot.accession2taxid | csvtk -t grep -f taxid -P NCBI_Main.taxid.txt | csvtk -t cut -f accession.version,taxid > NCBI_seqid_taxid.txt# "cat prot.accession2taxid |" 把 prot.accession2taxid 的内容到下面的 csvtk# -t 输入内容是制表符分隔;grep 这是csvtk的1个子命令,用于在文件中搜索匹配的行# -P 搜索那些"taxid"字段的值出现在"NCBI_Main.taxid.txt"文件中的行# cut 这是csvtk的1个子命令,用于从输入中选择特定的字段

NCBI_seqid_taxid.txt 就是目标文件

图片[5] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

diamond seqid和taxid对应,再和界门纲目科属种对应

把diamond结果文件与NCBI_seqid_taxid.txt对应

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9# ########################################################### match diamond blast NR result.txt and NCBI_seqid_taxid.txt# written by PeiZhong in IFR of CAASimport osimport argparseparser = argparse.ArgumentParser(description='match diamond blast NR result.txt and NCBI_seqid_taxid.txt. ''!!! all the file should end will .txt !!!')parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your diamond result txt')parser.add_argument('result_files_mark',help='mark=The name of the mark specific to ''your two-column diamond results file in this folder, e.g., clean, ''for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')parser.add_argument('NCBI_acc_taxid',help='full Path that contain NCBI_seqid_taxid.txt')args = parser.parse_args()result_file_folder_path = args.diamond_result_folder_pathNCBI_file = args.NCBI_acc_taxidfile_mark = args.result_files_markos.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)print(files)db={}print("start db read")f_table = open("%s" % (NCBI_file), 'r')print("start db build")for line in f_table.readlines():line=line.split('\t')acc_num = line[0].strip()tax_num = line[1].strip()db[acc_num] = tax_numprint("finish db build")file_ls = []for result_file in files:if file_mark in result_file:file_ls.append(result_file)file_ls.sort()print(file_ls)header = "qseqid\taccession.version\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"for result_file in file_ls:print(result_file)f_result = open("%s" % (result_file), 'r+')content = f_result.read()f_result.seek(0, 0)f_result.write(header.rstrip('\r\n') + '\n' + content)f_result.close()out_name = str(result_file).strip('txt')out_name = out_name+'_taxid.txt'f_result = open("%s" % (result_file), 'r')f_out = open("%s" % (out_name), 'a')for line in f_result.readlines():line = line.split('\t')query_num = line[0].strip()acc_q_num = line[1].strip()if acc_q_num in db:print(query_num,end="\t",file=f_out)print(acc_q_num, end="\t", file=f_out)print(db[acc_q_num], file=f_out)f_out.close()f_result.close()f_table.close()

图片[6] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

chmod +x diamond_NR_tax.py(完整地址)diamond_NR_tax.py(完整地址) -hdiamond_NR_tax.py(完整地址) diamond_result_folder_path result_files_mark NCBI_acc_taxiddiamond_result_folder_path:你存放上面处理完的diamond比对文件,txt结尾的文件,的目录(完整地址)result_files_mark:这个地址中,你的这些文件独有的标识字符串#for example,the mark of result file 'SY10_clean.txt' and 'SY11_clean.txt' is 'clean'NCBI_acc_taxid:你的NCBI_seqid_taxid.txt文件的完整地址

图片[7] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

做完结果是这样的

图片[8] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

图片[9] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

接下来再写个python代码根据taxid把这个文件和界门纲目科属种联系起来就行(不好意思只会python,我检讨。。。。但是python简单呀)

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9# ########################################################### match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt# written by PeiZhong in IFR of CAASimport osimport argparseparser = argparse.ArgumentParser(description='match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt. ''!!! all the file should end will .txt !!!')parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your result txt')parser.add_argument('result_files_mark',help='mark=The name of the mark specific to ''your two-column diamond results file in this folder, e.g., clean, ''for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')parser.add_argument('NCBI_taxid_tax',help='full Path that contain NCBI_Main.taxid_new.txt')args = parser.parse_args()result_file_folder_path = args.diamond_result_folder_pathNCBI_file = args.NCBI_taxid_taxfile_mark = args.result_files_markos.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)print(files)db={}print("start db read")f_table = open("%s" % (NCBI_file), 'r')print("start db build")for line in f_table.readlines():line=line.split('\t')taxid = line[0].strip()tax_anno = line[1:8]db[taxid] = tax_annoprint("finish db build")file_ls = []for result_file in files:if file_mark in result_file:file_ls.append(result_file)file_ls.sort()print(file_ls)for result_file in file_ls:print(result_file)out_name = str(result_file).strip('txt')out_name = out_name+'_tax.txt'f_result = open("%s" % (result_file), 'r')f_out = open("%s" % (out_name), 'a')for line in f_result.readlines():line = line.split('\t')query_num = line[0].strip()acc_q_num = line[1].strip()taxid_1 = line[2].strip()if taxid_1 in db:print(query_num,end="\t",file=f_out)print(acc_q_num, end="\t", file=f_out)print(taxid_1, end="\t", file=f_out)tax_in_db = db[taxid_1]str_ls = map(str, tax_in_db)tax = '\t'.join(str_ls)print(tax, file=f_out)f_out.close()f_result.close()f_table.close()

图片[6] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

和上面一样也需要给权限

图片[11] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

好了,可以交差了 !我宣布python是我们初学者的yyds

图片[12] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

再把多个样本的结果结合到一起成为表格

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9# ########################################################### match diamond blast NR result.txt and NCBI_seqid_taxid.txt# written by PeiZhong in IFR of CAASimport argparseimport osimport pandas as pdimport csvparser = argparse.ArgumentParser(description='tax files combine')parser.add_argument('tax_result_folder_path',help='full Path of the folder that contain your tax result txt')parser.add_argument('result_files_mark',help='mark=The name of the mark specific to ''your two-column diamond results file in this folder, e.g., clean, ''for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')args = parser.parse_args()result_file_folder_path = args.tax_result_folder_pathfile_mark = args.result_files_markpath = os.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)file_ls = []for file in files:if file_mark in file:file_ls.append(file)file_ls.sort()print(file_ls)for file in file_ls:df = pd.read_csv('%s' % (file),sep='\t')df.drop_duplicates(subset='qseqid', keep='first', inplace=True)outname = str(file).rsplit(".", 1)[0]df.to_csv(outname+'_only1.txt', index=False, sep='\t')path = os.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)file2_ls = []for file2 in files:if "only1" in file2:file2_ls.append(file2)file2_ls.sort()print(file2_ls)def tax_finder(file):tax_ls = []with open(file, 'r') as f:for line in f.readlines():if "qseqid" not in line:tax = ';'.join(line.split('\t')[3:9])tax_ls.append(tax)db = {}for i in tax_ls:if i in db:db[i] += 1if i not in db:db[i] = 1output = str(file).rsplit(".",1)[0]+"_count1.txt"with open(output,"a") as f3:for key, value in db.items():key = key.strip()print(key, end="\t", file=f3)print(value, file=f3)for file in file2_ls:tax_finder(file)path = os.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)file3_ls = []for file3 in files:if "count1" in file3:file3_ls.append(file3)file3_ls.sort()print(file3_ls)all_tax = {}for file in file3_ls:with open("%s" % (file),"r") as f4:for line in f4.readlines():tax = line.split('\t')[0]all_tax[tax] = 0print(all_tax)ls_db_result = []for file in file3_ls:with open("%s" % (file),"r") as f5:db_name = str(file)db_name = all_tax.copy()for line in f5.readlines():tax = line.split('\t')[0]count = line.strip("\n").split('\t')[1]if tax in db_name:db_name[tax] = countls_db_result.append(db_name)header=[]for key in all_tax.keys():header.append(key)with open('merge_overview_2.csv', 'a', encoding='utf-8', newline='') as f7:dictWriter = csv.DictWriter(f7,header)dictWriter.writeheader()dictWriter.writerows(ls_db_result)f7_rows=[]with open('merge_overview_2.csv',"r") as f7:for line in f7.readlines():f7_rows.append(line)with open('merge_overview_3.csv',"a") as f8:print("",end=",",file=f8)print(f7_rows[0].strip("\n"),file=f8)row_count=0for i in file3_ls:row_count += 1print(i,end=",",file=f8)print(f7_rows[row_count].strip("\n"),file=f8)

图片[13] - 使用Diamond比对NR数据库获取物种注释 - MaxSSL

© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享