糖尿病康复 > 基于Astral利用单拷贝同源基因构建物种树

基于Astral利用单拷贝同源基因构建物种树

时间：2023-02-22 10:33:19

想介绍的都在之前的文章里了构建单拷贝同源蛋白系统发育树，一条命令提序列!
不同的是，之前是将得到的单拷贝同源基因比对后进行了串联，每个物种都得到一个很大的序列，然后进行建树；现在是使用并联的方法，是将每个单拷贝同源基因集比对后建树，然后再利用Astral构建了物种树，在之前的脚本上进行了功能扩充，输入的命令不变，只是最后直接多了一个*.gene.species.tree的树文件，树都建好了。

##构建单拷贝同源蛋白, python3##在linux下运行，需要安装proteinortho与muscle，proteinortho安装地址：http://www.bioinf.uni-leipzig.de/Software/proteinortho/，muscle地址：/muscle/。##需要安装Astral，地址：/smirarab/ASTRAL##作者：刘宁华山东大学青岛校区 1039438318@#1. 处理序列文本def chuli_seq(in_file_0):import ospath = os.getcwd()path = path + "/" + in_file_0 + "/"files = os.listdir(path)file_path = []for file in files:file_path_1 = path + filefile_path.append(file_path_1)for file in file_path:cmd = "sed -i 's/ /_/g' " + fileos.system(cmd)return print("1. 序列预处理")#2. 运行同源比对命令def protein_ortho(file):import oscmd = "proteinortho6.pl -project=" + file + "_work -e=1e-5 -cov=50 -identity=50 -clean " + file + "/*.faa"os.system(cmd)return print("2. 完成同源比对")#3. 比对结果处理，提取单拷贝蛋白结果def guoLv_result(in_file_1, num):in_file_1 = open(in_file_1, 'r')out_file_1 = open("work_ortho", 'w')for line in in_file_1:if line.split('\t')[0] == line.split('\t')[1] == num:out_file_1.write(line)out_file_1.close()return print("3. 完成文件过滤")#4. 整合全基因组蛋白序列为一个文件whole_proteindef get_whole_protein(file_name):import oscmd = "cd " + file_name +" ; cat ./*.faa >> whole_protein ; mv whole_protein .. ; cd .."os.system(cmd)return print("4. 全基因组整合完成")#5. fasta序列存为字典def read_to_dict(in_file_2):seq_dict = {}ac = ''seq = ''for line in open(in_file_2):if line.startswith('>') and seq != '':seq_dict[ac] = seqseq = ''if line.startswith('>'):ac = line.strip()[1:]else:seq = seq + line.strip()seq_dict[ac] = seqreturn seq_dict#6. 获取名称列表def get_bacteria_name(in_file3):file = open(in_file3)line = file.readline()names = line.replace('.faa', '').split()[4:]return names#7. 逐行读取记录，并将符合的序列写入新文件中def get_single_seq(seq_dict):i = 1for line in open("work_ortho", 'r'):lines = line.strip().split('\t')[3:]out_file = open('%s.faa' % (i), 'w')for key in seq_dict:if key in lines:out_file.write('>' + key + '\n' + seq_dict[key] + '\n')out_file.close()i = i + 1return print("5. 完成蛋白序列提取")#8. 获取单拷贝蛋白数量def get_protein_num():import globpath_file = glob.glob(r'*.faa')protein_num = len(path_file)return protein_num#9. muscle比对，并整合为一个文件 protein_align.fasdef get_muscle():import oscmd1 = "for file in ./*.faa; do muscle -in $file -out $file.fas1; done"cmd2 = "cat ./*.fas1 >> protein_align.fas"os.system(cmd1)os.system(cmd2)return print("6. 完成序列比对")#10. 提取同源蛋白序列def get_result(names, seq_dict, protein_num):out_file_name = str(protein_num) + ".result.fasta"out_file = open(out_file_name, 'w')i = 3for name in names:seq_list = []seq_align = ''# 将蛋白序列读取为列表for lines in open("work_ortho", 'r'):lines = lines.strip().split('\t')[i] # 索引第几列就是哪个蛋白序列seq_list.append(lines)for key in seq_dict:if key in seq_list:seq_align = seq_align + seq_dict[key]out_file.write('>' + name + '\n' + seq_align + '\n')i = i + 1out_file.close()return print("7. 生成目标文件 --> *.result.fasta")#11，对比对好的单个单拷贝基因序列进行改名处理，先移入single_a_genedef get_tree(in_file1, num2):import oscmd1 = "mkdir single_a_gene"cmd2 = "mv *.fas1 single_a_gene/"cmd3 = "mkdir changed_ID"os.system(cmd1)os.system(cmd2)os.system(cmd3)path = os.getcwd()#获取物种名与RASTid的对应path1 = path + "/" + in_file1name_id_dit = {}files1 = os.listdir(path1)for file1 in files1:path11 = path1 + "/" + file1file2 = file1.split(".")[0]with open(path11, "r") as infile:for line in infile:line1 = line.strip().split(".")[0:2]line2 = '.'.join(line1)name_id_dit[line2] = file2break#将比对好的单拷贝序列改名为物种名path2 = path + "/single_a_gene"files2 = os.listdir(path2)path3 = path + "/changed_ID"for file2 in files2:path22 = path2 + "/" + file2outfile = path3 + "/" + file2 + ".fasta"with open(outfile, "a") as outfile2:with open(path22, "r") as infile2:for line in infile2:if line.startswith(">"):line1 = line.strip().split(".")[0:2]line2 = '.'.join(line1)outfile2.write(">" + name_id_dit[line2] + "\n")else:outfile2.write(line)#fasttree建树cmd4 = "for file in changed_ID/*.fasta; do fasttree -boot 1000 $file > $file.tree; done"os.system(cmd4)cmd5 = "cat changed_ID/*.tree >> ./single_gene.tree"os.system(cmd5)cmd6 = "java -jar /home/LiuNingHua/lnh-software/Astral/astral.5.7.5.jar -i single_gene.tree -o " + str(num2) + ".gene.species.tree"os.system(cmd6)return print("**. 生成物种树 --> *.gene.species.tree")#12. 删除无用文件def del_file():import oscmd1 = "rm -rf *.faa *.fas1 *.fas work_ortho whole_protein single_a_gene changed_ID"os.system(cmd1)return print("8. 文件清理，结束!")###########################################################if __name__=='__main__':import sysimport osimport timetry:file_name = sys.argv[1]num = sys.argv[2]test = int(sys.argv[2]) #确保输入顺序正确except:print("请输入正确参数顺序！")print("usage: python3 protein_ortho.py <file_name> <bacteria_num>")else:chuli_seq(file_name) #处理序列名中的空格in_file = file_name + "_work.proteinortho.tsv"protein_ortho(file_name) #同源比对while True:if os.path.isfile(in_file):time.sleep(20) #确保文件.proteinortho.tsv已完成写入guoLv_result(in_file, num) # 获得work_orthoget_whole_protein(file_name) # 集合全基因组,生成whole_proteinwhole_protein_dict = read_to_dict(in_file_2='whole_protein') # 全基因组存入字典get_single_seq(whole_protein_dict) # 提取出单蛋白序列protein_num = get_protein_num() #获取单拷贝蛋白的数量get_muscle() # 完成比对align_protein_dict = read_to_dict(in_file_2="protein_align.fas") #提取后的蛋白序列存入字典bac_names = get_bacteria_name(in_file) # 菌株名称get_result(bac_names, align_protein_dict, protein_num) #完成序列提取get_tree(file_name, protein_num) #Astral建树del_file() # 删除中间文件break

如果觉得《基于Astral利用单拷贝同源基因构建物种树》对你有帮助，请点赞、收藏，并留下你的观点哦！

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。