使用Diamond比对NR数据库获取物种注释

之前用Kraken2注释宏基因组的contig，发现只有30%左右可以被Kraken2注释

Kraken2+Bracken：宏基因组物种注释-CSDN博客

不信邪，再用NR库试试

参考：

将NR数据库diamond比对结果做物种注释_diamond 物种注释-CSDN博客

NR下载

nohup wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz &wget -c https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5#检查下载的完整性（一样的话就是完整了）md5sum nr.gz### 899ac219cac213c60fede9c3d9ef8f7bnr.gzcat nr.gz.md5 ### 899ac219cac213c60fede9c3d9ef8f7bnr.gznohup gunzipnr.gz&mv nrnr.fa###下载NR物种相关信息和taxid信息wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gzwget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5wget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gzwget -t 0 -c -b https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5#检查完整性md5sum prot.accession2taxid.gz#32b8ca7ea712e161c72af69135fc938ecat prot.accession2taxid.gz.md5 #32b8ca7ea712e161c72af69135fc938emd5sum taxdump.tar.gz #26558b800bc1b3795c25d1f0ead65412cat taxdump.tar.gz.md5#26558b800bc1b3795c25d1f0ead65412#解压tar -zxvf taxdump.tar.gzgunzip prot.accession2taxid.gz#移动需要的文件cd ~mkdir .taxonkitcp *.dmp ~/.taxonkit

必要软件下载

conda create -n NR_database_searchconda activate NR_database_searchconda install -c bioconda taxonkit=0.15.0conda install -c bioconda diamond=2.1.8conda install -c bioconda csvtk=0.28.0

diamond建库

nohup diamond makedb --threads 180 --in nr.fa --db NR_2023_07_23 &

diamond比对

diamond blastx --db NR_2023_07_23.dmnd --query nucleic_reads.fna\-o nucleic_matches_fmt6.txt --threads 180 --evalue 0.00001 \--max-target-seqs 5 --outfmt 6diamond blastp --db NR_2023_07_23.dmnd --query protein_reads.fna\-o protein_matches_fmt6.txt --threads 180 --evalue 0.00001 \--max-target-seqs 5 --outfmt 6## --outfmt 6 最好别改变

这些参数可以调整diamond比对的速度or准确性

这几个参数可以调整比对的coverage，identity，score

结果如下！！！（这个表头后面python会加）

taxonkit获得物种分类信息表

感谢大佬：一文完成nt库序列快速下载及blast结果注释物种 (qq.com)

得到seqid注释之后，可以搜索注释

## 一些主要的物种编号# 2 bacteria# 2157archaea# 4751fungi# 10239 virus# 2759 Eukaryota#看taxnokit安好了么taxonkit -h#创建目录cd /home/zhongpei/database/NR_2023_07_23mkdir /home/zhongpei/database/NR_2023_07_23/NCBI_Main_taxcd NCBI_Main_tax#开始taxonkit list -j 4 --ids 2,2157,4751,10239,2759 --indent "" > NCBI_Main.taxid.txt# -j 是线程，软件说4个够了；--ids 是需要的物种编号，用逗号分隔wc -l NCBI_Main.taxid.txt # 2708739 NCBI_Main.taxid.txthead -n 5 NCBI_Main.taxid.txt #查看内容# 提取taxid和taxonomy（界门纲目科属种）的对应信息到NCBI_Main.taxid.txtless NCBI_Main.taxid.txt | taxonkit reformat -I 1 -r Unassigned -f "{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}\t{t}"| sed '1i\Taxid\tKingdom\tPhylum\tClass\tOrder\tFamily\tGenus\tSpecies\tStrain' > NCBI_Main.taxid_new.txt# -I 1 一个制表符分隔；-r 没有找到的用什么字符去填充，这里用的“Unassigned” # -f 输出的格式；1i表示在第行之前插入文本(sed用法，不太会)

完成！

vim NCBI_Main.taxid_new.txt

把第一个Taxid改成小写

seqid和taxid的对应

还记不记得第一步下载过一个 “prot.accession2taxid” ，现在要派上用场了

其实python也能做，csvtk太不熟悉了，先来学习一下吧，感觉还挺方便（这一步比较慢）

cat prot.accession2taxid | csvtk -t grep -f taxid -P NCBI_Main.taxid.txt | csvtk -t cut -f accession.version,taxid > NCBI_seqid_taxid.txt# "cat prot.accession2taxid |" 把 prot.accession2taxid 的内容到下面的 csvtk# -t 输入内容是制表符分隔；grep 这是csvtk的1个子命令，用于在文件中搜索匹配的行# -P 搜索那些"taxid"字段的值出现在"NCBI_Main.taxid.txt"文件中的行# cut 这是csvtk的1个子命令，用于从输入中选择特定的字段

NCBI_seqid_taxid.txt 就是目标文件

diamond seqid和taxid对应，再和界门纲目科属种对应

把diamond结果文件与NCBI_seqid_taxid.txt对应

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9# ########################################################### match diamond blast NR result.txt and NCBI_seqid_taxid.txt# written by PeiZhong in IFR of CAASimport osimport argparseparser = argparse.ArgumentParser(description='match diamond blast NR result.txt and NCBI_seqid_taxid.txt. ''!!! all the file should end will .txt !!!')parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your diamond result txt')parser.add_argument('result_files_mark',help='mark=The name of the mark specific to ''your two-column diamond results file in this folder, e.g., clean, ''for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')parser.add_argument('NCBI_acc_taxid',help='full Path that contain NCBI_seqid_taxid.txt')args = parser.parse_args()result_file_folder_path = args.diamond_result_folder_pathNCBI_file = args.NCBI_acc_taxidfile_mark = args.result_files_markos.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)print(files)db={}print("start db read")f_table = open("%s" % (NCBI_file), 'r')print("start db build")for line in f_table.readlines():line=line.split('\t')acc_num = line[0].strip()tax_num = line[1].strip()db[acc_num] = tax_numprint("finish db build")file_ls = []for result_file in files:if file_mark in result_file:file_ls.append(result_file)file_ls.sort()print(file_ls)header = "qseqid\taccession.version\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore"for result_file in file_ls:print(result_file)f_result = open("%s" % (result_file), 'r+')content = f_result.read()f_result.seek(0, 0)f_result.write(header.rstrip('\r\n') + '\n' + content)f_result.close()out_name = str(result_file).strip('txt')out_name = out_name+'_taxid.txt'f_result = open("%s" % (result_file), 'r')f_out = open("%s" % (out_name), 'a')for line in f_result.readlines():line = line.split('\t')query_num = line[0].strip()acc_q_num = line[1].strip()if acc_q_num in db:print(query_num,end="\t",file=f_out)print(acc_q_num, end="\t", file=f_out)print(db[acc_q_num], file=f_out)f_out.close()f_result.close()f_table.close()

chmod +x diamond_NR_tax.py(完整地址)diamond_NR_tax.py(完整地址) -hdiamond_NR_tax.py(完整地址) diamond_result_folder_path result_files_mark NCBI_acc_taxiddiamond_result_folder_path：你存放上面处理完的diamond比对文件，txt结尾的文件，的目录（完整地址）result_files_mark：这个地址中，你的这些文件独有的标识字符串#for example,the mark of result file 'SY10_clean.txt' and 'SY11_clean.txt' is 'clean'NCBI_acc_taxid：你的NCBI_seqid_taxid.txt文件的完整地址

做完结果是这样的

接下来再写个python代码根据taxid把这个文件和界门纲目科属种联系起来就行(不好意思只会python，我检讨。。。。但是python简单呀)

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9# ########################################################### match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt# written by PeiZhong in IFR of CAASimport osimport argparseparser = argparse.ArgumentParser(description='match diamond blast NR taxid result.txt and NCBI_Main.taxid_new.txt. ''!!! all the file should end will .txt !!!')parser.add_argument('diamond_result_folder_path',help='full Path of the folder that contain your result txt')parser.add_argument('result_files_mark',help='mark=The name of the mark specific to ''your two-column diamond results file in this folder, e.g., clean, ''for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')parser.add_argument('NCBI_taxid_tax',help='full Path that contain NCBI_Main.taxid_new.txt')args = parser.parse_args()result_file_folder_path = args.diamond_result_folder_pathNCBI_file = args.NCBI_taxid_taxfile_mark = args.result_files_markos.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)print(files)db={}print("start db read")f_table = open("%s" % (NCBI_file), 'r')print("start db build")for line in f_table.readlines():line=line.split('\t')taxid = line[0].strip()tax_anno = line[1:8]db[taxid] = tax_annoprint("finish db build")file_ls = []for result_file in files:if file_mark in result_file:file_ls.append(result_file)file_ls.sort()print(file_ls)for result_file in file_ls:print(result_file)out_name = str(result_file).strip('txt')out_name = out_name+'_tax.txt'f_result = open("%s" % (result_file), 'r')f_out = open("%s" % (out_name), 'a')for line in f_result.readlines():line = line.split('\t')query_num = line[0].strip()acc_q_num = line[1].strip()taxid_1 = line[2].strip()if taxid_1 in db:print(query_num,end="\t",file=f_out)print(acc_q_num, end="\t", file=f_out)print(taxid_1, end="\t", file=f_out)tax_in_db = db[taxid_1]str_ls = map(str, tax_in_db)tax = '\t'.join(str_ls)print(tax, file=f_out)f_out.close()f_result.close()f_table.close()

和上面一样也需要给权限

好了，可以交差了！我宣布python是我们初学者的yyds

再把多个样本的结果结合到一起成为表格

#!/home/zhongpei/miniconda3/envs/py39/bin/python3.9# ########################################################### match diamond blast NR result.txt and NCBI_seqid_taxid.txt# written by PeiZhong in IFR of CAASimport argparseimport osimport pandas as pdimport csvparser = argparse.ArgumentParser(description='tax files combine')parser.add_argument('tax_result_folder_path',help='full Path of the folder that contain your tax result txt')parser.add_argument('result_files_mark',help='mark=The name of the mark specific to ''your two-column diamond results file in this folder, e.g., clean, ''for example,the mark of result file SY10_NR_diamond_clean_fmt6_taxid.txt is clean')args = parser.parse_args()result_file_folder_path = args.tax_result_folder_pathfile_mark = args.result_files_markpath = os.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)file_ls = []for file in files:if file_mark in file:file_ls.append(file)file_ls.sort()print(file_ls)for file in file_ls:df = pd.read_csv('%s' % (file),sep='\t')df.drop_duplicates(subset='qseqid', keep='first', inplace=True)outname = str(file).rsplit(".", 1)[0]df.to_csv(outname+'_only1.txt', index=False, sep='\t')path = os.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)file2_ls = []for file2 in files:if "only1" in file2:file2_ls.append(file2)file2_ls.sort()print(file2_ls)def tax_finder(file):tax_ls = []with open(file, 'r') as f:for line in f.readlines():if "qseqid" not in line:tax = ';'.join(line.split('\t')[3:9])tax_ls.append(tax)db = {}for i in tax_ls:if i in db:db[i] += 1if i not in db:db[i] = 1output = str(file).rsplit(".",1)[0]+"_count1.txt"with open(output,"a") as f3:for key, value in db.items():key = key.strip()print(key, end="\t", file=f3)print(value, file=f3)for file in file2_ls:tax_finder(file)path = os.chdir(result_file_folder_path)files = os.listdir(result_file_folder_path)file3_ls = []for file3 in files:if "count1" in file3:file3_ls.append(file3)file3_ls.sort()print(file3_ls)all_tax = {}for file in file3_ls:with open("%s" % (file),"r") as f4:for line in f4.readlines():tax = line.split('\t')[0]all_tax[tax] = 0print(all_tax)ls_db_result = []for file in file3_ls:with open("%s" % (file),"r") as f5:db_name = str(file)db_name = all_tax.copy()for line in f5.readlines():tax = line.split('\t')[0]count = line.strip("\n").split('\t')[1]if tax in db_name:db_name[tax] = countls_db_result.append(db_name)header=[]for key in all_tax.keys():header.append(key)with open('merge_overview_2.csv', 'a', encoding='utf-8', newline='') as f7:dictWriter = csv.DictWriter(f7,header)dictWriter.writeheader()dictWriter.writerows(ls_db_result)f7_rows=[]with open('merge_overview_2.csv',"r") as f7:for line in f7.readlines():f7_rows.append(line)with open('merge_overview_3.csv',"a") as f8:print("",end=",",file=f8)print(f7_rows[0].strip("\n"),file=f8)row_count=0for i in file3_ls:row_count += 1print(i,end=",",file=f8)print(f7_rows[row_count].strip("\n"),file=f8)