From f2324623607dc59a093fabe819ff66588c0e06ef Mon Sep 17 00:00:00 2001 From: NMarthe <nina.marthe@ird.fr> Date: Tue, 31 Oct 2023 12:52:31 +0100 Subject: [PATCH] =?UTF-8?q?corrig=C3=A9=20la=20detection=20des=20chromosom?= =?UTF-8?q?es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- seg_coord/getSegmentsCoordinates.py | 31 ++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/seg_coord/getSegmentsCoordinates.py b/seg_coord/getSegmentsCoordinates.py index 27c5704..3eccffa 100644 --- a/seg_coord/getSegmentsCoordinates.py +++ b/seg_coord/getSegmentsCoordinates.py @@ -1,6 +1,27 @@ import subprocess import sys +def has_numbers(inputString): + return any(char.isdigit() for char in inputString) + +def getChrName(chromosome_field): + chromosome_id="" + if has_numbers(chromosome_field): + for char in reversed(chromosome_field): # take the last digits of the field + if not char.isdigit(): + break + else: + chromosome_id+=char + chromosome_id="Chr"+chromosome_id[::-1] + else: + for char in reversed(chromosome_field): # take the last uppercase chars of the field + if not char.isupper(): + break + else: + chromosome_id+=char + chromosome_id="Chr"+chromosome_id[::-1] + return chromosome_id + if not(len(sys.argv)==2) : print("expected input : gfa file with walks.") print("output : bed files giving the coordinates of the segments on the genomes (or on minigraph segments).") @@ -38,12 +59,12 @@ walks.close() file_names=list() for line in lines : line=line.split() - name=line[1] - chr_field=line[3].split('_') - chromosome=chr_field[len(chr_field)-1] + name=line[3] path_start=int(line[4]) + chromosome_field=line[3] + chromosome_id=getChrName(chromosome_field) - file_name=name+'_'+chromosome+'.bed' + file_name=name+'.bed' # if we are writing in the file for the first time, overwrite it. else, append it # this is because chromosomes can be fragmented. 
the coordinates of all the fragments from the same chromosome will be written in the same bed file. @@ -59,7 +80,7 @@ for line in lines : for i in range(1, len(path)): # for each segment in the path, write the position of the segment in the output bed file # coordinates calculation : start=position, stop=position+segment_size-1, then position+=segment_size - chr='Chr'+chromosome[len(chromosome)-2:] + chr='Chr'+chromosome_id[len(chromosome_id)-2:] seg_start=position seg_name='s'+path[i][1:] -- GitLab