adapted the code to select the genomes to handle, and treat assemblies with...

adapted the code to select the genomes to handle, and treat assemblies with contigs instead of chromosomes

adapted the code to select the genomes to handle, and treat assemblies with...
6f09e872 · nina.marthe_ird.fr · 397e4149 · 6f09e872
Commit 6f09e872 authored 1 year ago by nina.marthe_ird.fr
--- a/getSegmentsCoordinates.py
+++ b/getSegmentsCoordinates.py
 import subprocess
-import sys

 def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

-def getChrName(chromosome_field):
-    chromosome_id=""
-    if has_numbers(chromosome_field):
-        for char in reversed(chromosome_field): # take the last digits of the field
+def getSeqNumber(name_field):
+    seq_number=""
+    if has_numbers(name_field[-3:]):
+        for char in reversed(name_field): # take the last digits of the field
            if not char.isdigit():
                break
            else:
-                chromosome_id+=char
-        chromosome_id="Chr"+chromosome_id[::-1]
+                seq_number+=char
    else:
-        for char in reversed(chromosome_field): # take the last uppercase chars of the fied
+        for char in reversed(name_field): # take the last uppercase chars of the fied
            if not char.isupper():
                break
            else:
-                chromosome_id+=char
-        chromosome_id="Chr"+chromosome_id[::-1]
-    return chromosome_id
-
-if not(len(sys.argv)==2) :
-        print("expected input : gfa file with walks.")
-        print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
-        sys.exit(1)
-elif (sys.argv[1]=="-h") :
-    print("expected input : gfa file with walks.")
-    print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).")
-    sys.exit(1)
-
-gfa=sys.argv[1]
-
-# get the lines that start with "S"
-command="grep ^S "+gfa+" > segments.txt"
-subprocess.run(command,shell=True)
-segments = open('segments.txt','r')
-lines=segments.readlines()
-segments.close()
-
-# build a dictionnary with the segment sizes
-segments_size={}
-for line in lines:
-    line=line.split()
-    seg_id='s'+line[1]
-    seg_size=len(line[2])
-    segments_size[seg_id]=seg_size
-
-# get the lines that start with "W"
-command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > walks.txt"
-subprocess.run(command,shell=True)
-walks=open('walks.txt','r')
-lines=walks.readlines()
-walks.close()
-
-# on these lines, get the name of the genome to name the output bed file
-file_names=list()
-for line in lines :
-    line=line.split()
-    name=line[3]
-    path_start=int(line[4])
-    chromosome_field=line[3]
-    chromosome_id=getChrName(chromosome_field)
-
-    file_name=name+'.bed'
-
-    # if we are writing in the file for the first time, overwrite it. else, append it
-    # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
-    if file_name not in file_names:
-        file_names.append(file_name)
-        out_bed = open(file_name, 'w')
-    else :
-        out_bed = open(file_name, 'a')
-        
-    path=line[6].split(',')
-    position=path_start
+                seq_number+=char
+    return seq_number[::-1]
    
-    for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
-        # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
-        
-        chr='Chr'+chromosome_id[len(chromosome_id)-2:]
-        
-        seg_start=position
-        seg_name='s'+path[i][1:]
-        seg_stop=position+segments_size[seg_name]
-
-        if path[i][0:1]=='>':
-            orientation='+'
-        elif path[i][0:1]=='<':
-            orientation='-'
-        else:
-            orientation='.'
+def get_seg_len(gfa):
+
+    # get the lines that start with "S"
+    command="grep ^S "+gfa+" > seg_coord/segments.txt"
+    subprocess.run(command,shell=True,timeout=None)
+    segments = open('seg_coord/segments.txt','r')
+    lines=segments.readlines()
+    segments.close()
+
+    # build a dictionnary with the segment sizes
+    segments_size={}
+    for line in lines:
+        line=line.split()
+        seg_id='s'+line[1]
+        seg_size=len(line[2])
+        segments_size[seg_id]=seg_size
+
+    return segments_size
+
+def check_walk_name(walk_names,name):
+    name_found=False
+    for walk in walk_names:
+        if walk in name:
+            name_found=True
+            break
+    return name_found # rename name_found var
+
+def seg_coord(gfa,walk_names):
+
+    # create directory to store output files
+    command="mkdir seg_coord"
+    subprocess.run(command,shell=True)
+
+    segments_size=get_seg_len(gfa)
+
+    # get the lines that start with "W"
+    command="grep ^W "+gfa+" | sed 's/>/,>/g' | sed 's/</,</g' > seg_coord/walks.txt"
+    subprocess.run(command,shell=True,timeout=None)
+    walks=open('seg_coord/walks.txt','r')
+    lines=walks.readlines()
+    walks.close()
+
+    # on these lines, get the name of the genome to name the output bed file
+    file_names=list()
+    for line in lines :
+        line=line.split()
+        name=line[3]
+
+        if check_walk_name(walk_names,name) | (len(walk_names)==1): # len=1 if there is only the source genome.
+
+            path_start=int(line[4])
+            seq_name=name.split('_')[-1]
+            if (seq_name[0:10]=="chromosome") | (seq_name[0:10]=="Chromosome"):
+                seq_name="Chr"+seq_name[10:]
+
+            file_name="seg_coord/"+name+'.bed'
+
+            # if we are writing in the file for the first time, overwrite it. else, append it
+            # this is because chromosomes can be fragmented. the coordinates of all the fragments from the same chromosome will be written in the same bed file.
+            if file_name not in file_names:
+                file_names.append(file_name)
+                out_bed = open(file_name, 'w')
+            else :
+                out_bed = open(file_name, 'a')
+                
+            path=line[6].split(',')
+            position=path_start
+            
+            for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file
+                # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size
+                
+                seg_start=position
+                seg_name='s'+path[i][1:]
+                seg_stop=position+segments_size[seg_name]
+            
+                out_line=seq_name+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
+                out_bed.write(out_line)
+                
+                position+=segments_size[seg_name]
+            out_bed.close()
    
-        out_line=chr+'\t'+str(seg_start)+'\t'+str(seg_stop)+'\t'+path[i][0:1]+seg_name+'\n'
-        out_bed.write(out_line)
-        
-        position+=segments_size[seg_name]
-    out_bed.close()
- 
-command="rm segments.txt && rm walks.txt"
-subprocess.run(command,shell=True)
-
-
-command="if [ 0 -lt $(ls _MINIGRAPH_* 2>/dev/null | wc -w) ]; then mkdir minigraph_segments && mv _MINIGRAPH_* minigraph_segments; fi"
-subprocess.run(command,shell=True)
\ No newline at end of file
+    command="rm seg_coord/segments.txt && rm seg_coord/walks.txt"
+    subprocess.run(command,shell=True,timeout=None)
+
+    command="if ls test/*_MINIGRAPH_* 1> /dev/null 2>&1; then mkdir seg_coord/minigraph_segments && mv *_MINIGRAPH_* seg_coord/minigraph_segments/; fi"
+    subprocess.run(command,shell=True,timeout=None)
\ No newline at end of file