Newer
Older

nina.marthe_ird.fr
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import argparse
import os
import subprocess
from getSegmentsCoordinates import seg_coord
def dir_path(path):
if os.path.isdir(path):
return path
else:
raise argparse.ArgumentTypeError(f"readable_dir:{path} is not a valid path")
def arg():
parser = argparse.ArgumentParser(prog='GrAnnoT',description='Annotation transfer between genome and pangenome graph.')
# input options
# input files
parser.add_argument('graph', metavar='graph.gfa', type=argparse.FileType('r'), help='pangenome graph file in GFA format containing the genome that the annotation refers to and the target genome.s')
parser.add_argument('gff', metavar='annotation.gff', type=argparse.FileType('r') ,help='annotation file in GFF format containing the annotations to be transfered')
# optionnal input file
parser.add_argument('-coord', '--segment_coordinates_path', metavar="path/to/files", type=dir_path, default=".", help="path to the files given by the preprocessing of the graph; if not given the program will do the preprocessing; recommended if the program will run several times")
# genome names
parser.add_argument('source_genome', metavar="source_genome", type=str, help="name of the annotated genome's path in the GFA (field ? in the W line), that the annotation file refers to")
parser.add_argument('-t', '--target', metavar="target_genome", nargs='*', type=str, default=[], help="name of the target genomes' paths in the GFA (field ? in the W line) for the annotation; if the option is not used the programm will transfer the annotation on all the genomes")
# graph annotation options
parser.add_argument('-gff','--graph_gff',help="output the annotation of the graph in GFF format",action='store_true')
parser.add_argument('-gaf','--graph_gaf',help="output the annotation of the graph in GAF format",action='store_true')
# genome annotation options
parser.add_argument('-ann','--annotation',help="output the annotation transfer in GFF format",action='store_true')
parser.add_argument('-var','--variation',help="output the detail of the variations for the transfered features",action='store_true')
parser.add_argument('-aln','--alignment',help="output the alignment of the transfered features",action='store_true')
# annotation transfer options
parser.add_argument('-id', '--identity', metavar="N", type=int, default=80, help="minimum identity percentage required for the feature to be transfered")
parser.add_argument('-cov', '--coverage', metavar="N", type=int, default=80, help="minimum coverage percentage required for the feature to be transfered")
parser.add_argument('-diff', '--max_difference', metavar="N", type=int, default=0, help="maximum difference accepted for the feature to be transfered (transfer rejected if the feature size on the target genome is N times bigger or smaller than on the source genome)")

nina.marthe_ird.fr
committed
# version
parser.add_argument('--version', action='version', version='%(prog)s 0.1')
return parser.parse_args()
def read_args(args):
path_list=args.target[:]

nina.marthe_ird.fr
committed
path_list.append(args.source_genome)
if args.segment_coordinates_path==".":
print("run getSegCoord.py")
seg_coord("RiceGraph_cactus.gfa",path_list)

nina.marthe_ird.fr
committed
#else:
# path_segcoord=args.segment_coordinates_path
print("run intersect") # intersect between gff and several bed
list_files=os.scandir(args.segment_coordinates_path)
command="echo -n "" > intersect"
subprocess.run(command,shell=True,timeout=None)
for file in list_files:
if args.source_genome in file.name:
command="bedtools intersect -wo -a "+file.path+" -b "+args.gff.name+">>intersect"
subprocess.run(command,shell=True,timeout=None)
#command="for f in "+args.segment_coordinates_path+"*"+args.source_genome+"*; do bedtools intersect -wo -a $f -b "+args.gff.name+" >> "+args.segment_coordinates_path+"intersect;done"
def get_genome_name(target_genomes,file_name):
for genome in target_genomes:
if genome in file_name:
return genome
return ""