Newer
Older
# created by Nina Marthe 2023 - nina.marthe@ird.fr
# licensed under MIT
import os
import subprocess
from .Graph_gff import *
from .Functions_output import *
from .argparser import *
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
def main():
args=arg()
read_args(args)
# intersect is in the current directory
intersect="intersect"
gfa=args.graph.name
load_intersect(intersect)
segments=args.segment_coordinates_path+"/segments.txt"
# outputs the gff and gaf of the graph
if args.graph_gff:
out_graph_gff=gfa.split("/")[-1].split(".")[0:-1][0]+".gff"
graph_gff(out_graph_gff)
if args.graph_gaf:
out_graph_gaf=gfa.split("/")[-1].split(".")[0:-1][0]+".gaf"
graph_gaf(out_graph_gaf,segments)
# get list of files in seg_coord
segment_coord_files=os.listdir(args.segment_coordinates_path)
# if a transfer on a target genome is asked
if args.annotation or args.variation or args.alignment:
# get the target genomes if there is none specified
if len(args.target)==0:
with open(args.segment_coordinates_path+"/walks.txt",'r') as walks:
line=walks.readline()
while line:
genome_name=line.split()[1]
if (args.source_genome not in genome_name) and (genome_name not in args.target) and ("MINIGRAPH" not in genome_name):
args.target.append(genome_name)
line=walks.readline()
pav_dict={}
# if a pav matrix is asked, create dictionnary to store the features, and for each feature the value for all the target genomes.
if args.pav_matrix:
pav_line=len(args.target)*[1]
pav_dict["gene_id"]=list(args.target)
for feat in Features.keys():
if Features[feat].type=='gene':
pav_dict[feat]=pav_line.copy()
seg_size={}
# if the annotation transfer on target genome is asked, build a dictionnary with the segment sizes
if args.annotation or args.variation or args.alignment or args.pav_matrix:
with open(segments,'r') as segments_file:
line=segments_file.readline()
while line:
line=line.split()
seg_id='s'+line[1]
segment_size=len(line[2])
seg_size[seg_id]=segment_size
line=segments_file.readline()
genome_index=0
for target_genome in args.target:
print(f'\n{target_genome} transfer :')
# create directory to store output files
command="mkdir "+target_genome
subprocess.run(command,shell=True,timeout=None)
# create dictionnaries with paths and segments positions.
for file in segment_coord_files:
genome_name=get_genome_name(args.target,file)
if genome_name==target_genome :
print(f' loading the information from the file {file}')
file_path=args.segment_coordinates_path+file
get_segments_positions_on_genome(file_path)
print(f' loading the walks for the genome {target_genome}')
walks_path=args.segment_coordinates_path+"/walks.txt"
target_genome_paths=get_paths(walks_path,target_genome)
# create output files names
out_target_gff=target_genome+"/"+target_genome+".gff"
out_target_var=target_genome+"/"+target_genome+"_var.txt"
out_aln=target_genome+"/"+target_genome+"_aln.txt"
list_feat_absent=[]
# do the annotation transfer (or var/aln)
transfer_on_target(segments,out_target_gff,out_target_var,out_aln,target_genome,target_genome_paths,list_feat_absent,seg_size,args)
# if pav matrix is asked, add the information of this transfer on the matrix
if args.pav_matrix:
for feat in list_feat_absent:
pav_dict[feat][genome_index]=0
genome_index+=1
print('\ngeneration of the presence-absence matrix for the transfered genes')
pav_output=''
for line in pav_dict:
pav_output+=line
for field in pav_dict[line]:
pav_output+="\t"+str(field)
pav_output+="\n"
out_pav="PAV_matrix.txt"
with open(out_pav,'r') as file_out_pav:
file_out_pav.write(pav_output)