Skip to content
Snippets Groups Projects
Functions.py 45.3 KiB
Newer Older
from Graph_gff import Segments, Features, get_start, get_stop
# functions to generate a genome's gff from the graph's gff

# get the start position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
NMarthe's avatar
NMarthe committed
def get_start_2(seg_start, seg_genome, feat_id):
    s_start=seg_genome[seg_start][1]
NMarthe's avatar
NMarthe committed

# get the stop position of the features on the linear genome, using their coordinates on the graph and the coordinantes of the segments on the genome
NMarthe's avatar
NMarthe committed
def get_stop_2(seg_stop, seg_genome, feat_id):
    s_start=seg_genome[seg_stop][1]
NMarthe's avatar
NMarthe committed

# get the position of a part of a feature on the complete feature (on the original genome)
def get_position(seg_start,seg_stop,feat_start,feature,seg_genome):
NMarthe's avatar
NMarthe committed
    # start position of the entire feature on the reference
    start_gene_start=Segments[feat_start[1:]].start+get_start(feat_start[1:],feature.id)-1
    # start position of the part of the feature on the reference
    start_first_seg=Segments[seg_start].start+get_start(seg_start,feature.id)-1
    
    # start and stop position of the feature on the genome we transfer on, to get the length of the part of the feature
    start_new_genome=get_start_2(seg_start,seg_genome,feature.id) 
    stop_new_genome=get_stop_2(seg_stop,seg_genome,feature.id)
    # start position of the part of the feature on the complete feature
    start_gene=int(start_first_seg)-int(start_gene_start)+1
    # stop position : start+length
    stop_gene=start_gene+(stop_new_genome-start_new_genome)
    
    position=";position="+str(start_gene)+"-"+str(stop_gene)
    #position=";start_position_on_feature="+str(start_gene)+":stop_position_on_feature="+str(stop_gene)
    return position

NMarthe's avatar
NMarthe committed

# get the proportion of a part of the feature on the total length
def get_proportion(seg_start,seg_stop,seg_genome,feature):
    start_new_genome=get_start_2(seg_start,seg_genome,feature.id) # start position of the feature on the genome we transfer on
    stop_new_genome=get_stop_2(seg_stop,seg_genome,feature.id) # stop position of the feature on the genome we transfer on
    proportion=";proportion="+str(stop_new_genome-start_new_genome+1)+"/"+str(feature.size)
    #proportion=";number_bases="+str(stop_new_genome-start_new_genome+1)+";total_bases="+str(feature.size)
NMarthe's avatar
NMarthe committed

# returns the gff line to write in the output file (gff detail)
NMarthe's avatar
NMarthe committed
def write_line(list_seg,i,feat,seg_start,feat_start,seg_genome):
    seg_stop=list_seg[i-1]
    chr=seg_genome[seg_start[1:]][0]
    # start and stop position of the feature on the genome we transfer on
    start_new_genome=get_start_2(seg_start[1:],seg_genome,feature.id) 
    stop_new_genome=get_stop_2(seg_stop[1:],seg_genome,feature.id)
    proportion=get_proportion(seg_start[1:],seg_stop[1:],seg_genome,feature)
    position=get_position(seg_start[1:],seg_stop[1:],feat_start,feature,seg_genome)
    out_line=chr+"\tGrAnnot\t"+feature.type+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t.\t"+strand+"\t.\t"+annotation+"\n"
# takes a list of stranded segments, returns a list of unstranded segments
def remove_strand(list_seg):
    list_unstrand=list()
    for seg_stranded in list_seg:
        list_unstrand.append(seg_stranded[1:])
    return list_unstrand

# takes two lists of segments for two genes, check if the first list is an inversion of the second one (if most of the segments in common are on the opposite strand)
def detect_inversion(list_1,list_2):
    list_1_unstrand=remove_strand(list_1)
    list_2_unstrand=remove_strand(list_2)

    # get the list of segments in common
    seg_common=list()
    for seg in list_1_unstrand:
        if seg in list_2_unstrand:
            seg_common.append(seg)
    
    # for each segment in common, check if the strand is the same. check index in list unstranded to get the segment in list stranded
    same_strand_count=0
    for seg in seg_common:
        index_1=list_1_unstrand.index(seg)
        index_2=list_2_unstrand.index(seg)
        if list_1[index_1]==list_2[index_2]:
            same_strand_count+=1

    # if there is less than 10% of strand difference among the common segments (ie more than 90% of same strand), return False, no inversion 
    if len(seg_common)-same_strand_count<len(seg_common)/10:
        return False
    else:
        return True


# takes a feature and a feature type, returns a list of child features that have the wanted type
def get_list_childs(feature,child_type):
    list_childs=list()
    for child in feature.childs:
        if Features[child].type==child_type:
            list_childs.append(child)
    return list_childs


# print stats on the transfer : number of feature that have segments in different positions missing. 
def stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok):

    # creates files with the features by category (missing first, middle, end segment, etc)
    feat_miss_first=open('features_missing_first.txt','w')
    feat_miss_first.write("features missing the first segment on azucena :\n")
    for first in feature_missing_first :
        feat_miss_first.write(Features[first].annot)
        if Features[first].type!='gene':
            feat_miss_first.write(';')
            feat_miss_first.write(Features[first].note)
        feat_miss_first.write("\n")
    feat_miss_first.close()
        
    feat_miss_middle=open('features_missing_middle.txt','w')
    feat_miss_middle.write("\nfeatures missing a middle segment on azucena :\n")
    for middle in feature_missing_middle :
        feat_miss_middle.write(Features[middle].annot)
        if Features[middle].type!='gene':
            feat_miss_middle.write(';')
            feat_miss_middle.write(Features[middle].note)
        feat_miss_middle.write("\n")
    feat_miss_middle.close()
       
    feat_miss_last=open('features_missing_last.txt','w')
    feat_miss_last.write("\nfeatures missing the last segment on azucena :\n")
    for last in feature_missing_last :
        feat_miss_last.write(Features[last].annot)
        if Features[last].type!='gene':
            feat_miss_last.write(';')
            feat_miss_last.write(Features[last].note)
        feat_miss_last.write("\n")
    feat_miss_last.close()
    
    feat_miss=open('features_missing.txt','w')
    feat_miss.write("\nfeatures missing entirely on azucena :\n")
    for entier in feature_missing_all :
        feat_miss.write(Features[entier].annot)
        if Features[entier].type!='gene':
            feat_miss.write(';')
            feat_miss.write(Features[entier].note)
        feat_miss.write("\n")
    feat_miss.close()

    feat_miss_total=open('features_missing_total.txt','w')
    feat_miss_total.write("\nfeatures missing at least one segment on azucena :\n")
    for total in feature_missing_total :
        feat_miss_total.write(Features[total].annot)
        if Features[total].type!='gene':
            feat_miss_total.write(';')
            feat_miss_total.write(Features[total].note)
        feat_miss_total.write("\n")
    feat_miss_total.close()
    
    feat_all=open('all_features.txt','w')
    for feat in list_feature:
        feat_all.write(Features[feat].annot)
        if Features[feat].type!='gene':
            feat_all.write(';')
            feat_all.write(Features[feat].note)
        feat_all.write("\n")
    feat_all.close()

    feat_ok=open('features_ok.txt','w')
    feat_ok.write("\nfeatures entirely present in azucena :\n")
    for ok in feature_ok :
        feat_ok.write(Features[ok].annot)
        if Features[ok].type!='gene':
            feat_ok.write(';')
            feat_ok.write(Features[ok].note)
        feat_ok.write("\n")
    feat_ok.close()

    # prints the stats for the features (hypothetical/putative rate, by category)
    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_first.txt | wc -l",))
    total=len(feature_missing_first)
    print("\nthe first segment is missing for", len(feature_missing_first) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_middle.txt | wc -l",))
    total=len(feature_missing_middle)
    print("a middle segment is missing for", len(feature_missing_middle) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_last.txt | wc -l",))
    total=len(feature_missing_last)
    print("the last segment is missing for", len(feature_missing_last) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing.txt | wc -l",))
    total=len(feature_missing_all)
    print(len(feature_missing_all) ,"features are entirely missing, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_missing_total.txt | wc -l",))
    total=len(feature_missing_total)
    print("there is at least one segment missing for", len(feature_missing_total) ,"features, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' features_ok.txt | wc -l",))
    total=len(feature_ok)
    print(len(feature_ok) ,"features are entirely present in the new genome, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)

    hyp_put=int(subprocess.getoutput("grep 'hypothetical\|putative' all_features.txt | wc -l",))
    total=len(list_feature)
    print("there is", len(list_feature) ,"features in total, including",round(100*(hyp_put)/total,2),"% hypothetical or putative.",hyp_put,total)
    command="rm features_missing*.txt && rm all_features.txt && rm features_ok.txt"
    subprocess.run(command,shell=True)


def gff_detail(list_seg,seg_genome,feat,file_out_detail):
    # loop that goes through all the segments that have the current feature
    # keeps the first segment present in the genome found, and when it finds a segment absent in the genome, prints the current part of the fragment, and resets the first segment present.
    # continues to go through the segments, keeps the next segment present in the genome, and when it finds a segment absent, prints the second part of the feature.
    # etc.
    # at the end of the loop, prints the last part of the fragment.

    feat_start="empty" # stocks the first segment of the feature
    seg_start="empty" # stocks the first segment of the current part of the feature
    #seg_stop="empty" # stocks the last segment of the current part of the feature

        if list_seg[i][1:] in seg_genome: # look for a segment present in the genome
            if feat_start=="empty":
                feat_start=list_seg[i]
    
            if seg_start=="empty": # if we dont have a start, take the current segment for the start of the part of the feature
                seg_start=list_seg[i]
                
            #else: if we already have a start, keep going though the segments until we find a stop (segment not in azucena)
                
        else:
            if seg_start!="empty": # found a stop. so print the line, reset seg_start, and keep going through the segments to find the next seg_start
                    
                out_line=write_line(list_seg,i,feat,seg_start,feat_start,seg_genome)
                file_out_detail.write(out_line)
                seg_start="empty"
                #seg_stop="empty"
            
            #else: if the current segment is not in azucena but there is no start, keep looking for a start
                
                
    # print the last part of the feature in the end
    if list_seg[i][1:] in seg_genome:
        out_line=write_line(list_seg,i+1,feat,seg_start,feat_start,seg_genome)
        file_out_detail.write(out_line)


# outputs the feature once in a gff, from the first to the last segment present on the new genome (if the size is ok) :
def gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n):
    
    # count the number of features not transfered because they are too big or too small on the new genome
    # count the number of features absent on the new genome
    if (first_seg!='0') & (last_seg!='0'):

        start2=get_start_2(first_seg,seg_genome,feat)
        stop2=get_stop_2(last_seg,seg_genome,feat)
        size_on_new_genome=int(stop2)-int(start2)+1
        size_diff=abs(size_on_new_genome-Features[feat].size)
        # if the gene on the new genome is not 2 times bigger or smaller than the original, write its line in the gff
        if not ((size_on_new_genome>Features[feat].size*n) | (size_on_new_genome<Features[feat].size/n)) : 
            chr=seg_genome[first_seg][0]
            strand=list_seg[0][0:1]
            nb_variants=0
            for segment in list_seg:
                if segment[1:] not in seg_genome: # if a segment is absent, it is either a deletion of a substitution. insertions are not considered here.
                    nb_variants+=1

            annotation=Features[feat].annot+";Size_diff="+str(size_diff)+";Nb_variants="+str(nb_variants)

            line=chr+"  GrAnnot   "+Features[feat].type+" "+str(get_start_2(first_seg,seg_genome,feat))+" "+str(get_stop_2(last_seg,seg_genome,feat))+"   .   "+strand+"   .   "+annotation+"\n"
            file_out.write(line)
        else:
            bad_size+=1
    else :
        absent_features+=1
    
    return list([bad_size,absent_features])


306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
# outputs the detail of the variations on the new genome for the feature
def print_variations(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq,list_var,list_var_rate):
    string_print=""
    cpt_print=0
    
    # list of size difference, to get the average size difference
    diff_list=list()
    list_var_temp=[0,0,0,0,0]

    # create type_n the index of the list of the lists in list_var. for the stats of the number of variation by type of feature.
    type=Features[feat].type
    match type:
        case "gene":
            type_n=0
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "mRNA":
            type_n=1
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "intron":
            type_n=2
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "exon":
            type_n=3
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "three_prime_UTR":
            type_n=4
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "five_prime_UTR":
            type_n=5
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size
        case "CDS":
            type_n=6
            list_var[3][type_n]+=1
            list_var_temp[3]+=1
            list_var[4][type_n]+=Features[feat].size
            list_var_temp[4]+=Features[feat].size



    # get the list of the segments in a feature that are missing in the genome
    segments_missing=list()
    for segment in list_seg:
            if segment[1:] not in seg_genome:
                segments_missing.append(Segments[segment[1:]])

    # if there are missing segments, print them
    if (first_seg!='0') & (last_seg!='0'):

        var=0 # count variations, to see if there is any

        feature=Features[feat]
        feat_start=feature.start

        # get the lengths of the feature, on the original genome and on the new one
        start_new_genome=get_start_2(first_seg,seg_genome,feat)
        stop_new_genome=get_stop_2(last_seg,seg_genome,feat)
        size_new_genome=int(stop_new_genome)-int(start_new_genome)+1
        size_diff=str(size_new_genome-feature.size)

      

        diff_list.append(abs(int(size_diff))) # to have stats on the size of the segments missing
        #line=feat+" : \nlength on the original genome = "+str(feature.size)+"\nlength difference (length in the new genome-length in the original genome) = "+size_diff+"\n"
        #file_out2.write(line)

        
        # find the path in azucena. 
        first_strand=seg_genome[first_seg][3]
        first_seg_stranded=first_strand+first_seg
        last_strand=seg_genome[last_seg][3]
        last_seg_stranded=last_strand+last_seg
        id_first_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(first_seg_stranded))
        id_last_seg=int(paths["CM020642.1_Azucena_chromosome10"].index(last_seg_stranded))
        list_segfeat_azu=paths["CM020642.1_Azucena_chromosome10"][id_first_seg:id_last_seg+1]
        list_segfeat_nb=Features[feat].segments_list

        # loop to go through both paths
        i=0
        j=0

        last=''
        last_taille=0
        last_seq=''
        last_start=''
       #pos=''
        last_in_azu=''

        # check if there is an inversion
        inversion=detect_inversion(list_segfeat_nb,list_segfeat_azu)
        if inversion:
            inverted="1"
        else:
            inverted="0"
        #if detect_inversion(list_segfeat_nb,list_segfeat_azu):
        #    line="inversion detected\n"
        #    file_out2.write(line)

        # remove the strands from the lists of segments :
        list_segfeat_nb=remove_strand(list_segfeat_nb)
        list_segfeat_azu=remove_strand(list_segfeat_azu)

        # detect and print variations ignoring the strands
        while (i<len(list_segfeat_nb)) & (j<len(list_segfeat_azu)):
            if list_segfeat_nb[i] != list_segfeat_azu[j]: # if there is a difference between the two paths
                if list_segfeat_azu[j] not in list_segfeat_nb: # if the segment in azu is absent in nb
                    if list_segfeat_nb[i] not in list_segfeat_azu: # if the segment in nb is absent is azu
                        # is both segments are absent in the other genome, its a substitution
                        last_in_azu=list_segfeat_azu[j]

                        # print if we had an insertion or deletion running
                        if last=='insertion':
                            pos_old=str(int(Segments[last_start].start)-int(feat_start)+1)

                            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                            start_var=int(seg_genome[last_start][1])-1
                            pos_new=str(start_var-start_feat+1)

                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            string_print+=line
                            cpt_print+=1
                            #file_out2.write(line)
                            list_var[1][type_n]+=1
                            list_var_temp[1]+=1
                            var+=1

                        elif last=='deletion':
                            pos_old=int(Segments[last_start].start)-int(feat_start)+1

                            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                            start_var=int(seg_genome[last_in_azu][2])+1 
                            pos_new=str(start_var-start_feat+1)

                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            string_print+=line
                            cpt_print+=1
                            #file_out2.write(line)
                            list_var[0][type_n]+=1
                            list_var_temp[0]+=1                            
                            var+=1

                        last='';last_taille=0;last_seq='';last_start=''

                        # print the substitution
                        # substitution of segment list_segfeat_nb[i][1:] by segment list_segfeat_azu[j][1:]
                        pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1
                        start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat)
                        start_var=int(seg_genome[list_segfeat_azu[j-1]][2])+1
                        pos_new=str(start_var-start_feat+1)
                        # start du fragment (var) - start du gène +1

                        # start de la var (donc fin du segment d'avant +1) = seg_genome[id_seg][2]+1
                        # -> trouver le segment d'avant -> j-1.
                        

                        # start du gène sur azu -> get_start_2 premier_seg, seg_genome, feature_id
                        # feat_id = feat
                        # seg_genome = seg_gneome
                        # premier_seg sur azu -> list_segfeat_azu[0]
                        

                        # seg_genome donne les positions sur azu
                        # seg_genome[seg]=list([ch,start,stop,strand])
                        if len(seg_seq[list_segfeat_nb[i]]) == len(seg_seq[list_segfeat_azu[j]]): # if the substituion is between two segment of the same size, print it
                            size_subs=len(seg_seq[list_segfeat_nb[i]])
                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tsubstitution\t"+seg_seq[list_segfeat_nb[i]]+"\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(size_subs)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                        else :
                            # if the segments of the substitution have a different size, print deletion then insertion at the same position.
                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+seg_seq[list_segfeat_nb[i]]+"\t-\t"+str(len(seg_seq[list_segfeat_nb[i]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            line+=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+seg_seq[list_segfeat_azu[j]]+"\t"+str(len(seg_seq[list_segfeat_azu[j]]))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            var+=1
                        string_print+=line
                        cpt_print+=1
                        #file_out2.write(line)
                        list_var[2][type_n]+=1
                        list_var_temp[2]+=1
                        var+=1
                        # go to the next segments on the paths
                        i+=1;j+=1

                    else: # azu segment not in nb, but nb segment in azu : insertion
                        if last=='deletion':
                            if i==0:
                                pos_old=int(Segments[last_start].start)-int(feat_start)+1+feature.pos_start
                            else:
                                pos_old=int(Segments[last_start].start)-int(feat_start)+1

                            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                            start_var=int(seg_genome[last_in_azu][2])+1 
                            pos_new=str(start_var-start_feat+1)

                            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                            string_print+=line
                            cpt_print+=1
                            #file_out2.write(line)
                            list_var[0][type_n]+=1
                            list_var_temp[0]+=1
                            var+=1;last='';last_taille=0;last_start='';last_seq=''

                        last_in_azu=list_segfeat_azu[j]

                        if last=='insertion':
                            #last_taille+=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_seq=last_seq+seg_seq[list_segfeat_azu[j]]
                        else:
                            last='insertion'
                            last_seq=seg_seq[list_segfeat_azu[j]]
                            #last_taille=int(seg_genome[list_segfeat_azu[j]][2])-int(seg_genome[list_segfeat_azu[j]][1])+1
                            last_start=list_segfeat_nb[i]
                        j+=1
                        
                elif list_segfeat_nb[i] not in list_segfeat_azu: # nb segment not in azu, but azu segment in nb : deletion
                    if last=='insertion':
                        pos_old=int(Segments[last_start].start)-int(feat_start)+1
                        
                        start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                        start_var=int(seg_genome[last_start][1])-1
                        pos_new=str(start_var-start_feat+1)
                        
                        line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                        string_print+=line
                        cpt_print+=1
                        #file_out2.write(line)
                        list_var[1][type_n]+=1
                        list_var_temp[1]+=1
                        var+=1;last='';last_start='';last_taille=0;last_seq=''

                    if last=='deletion':
                        #last_taille+=Segments[list_segfeat_nb[i]].size
                        last_seq=last_seq+seg_seq[list_segfeat_nb[i]]
                    else:
                        last='deletion'
                        last_start=list_segfeat_nb[i]
                        #last_taille=Segments[list_segfeat_nb[i]].size
                        if i==0: # if the deletion is at the start of the feature, the deletion doesnt start at the start at the first segment : 
                                    #use pos_start, position of the feature on its first segment
                            last_seq=seg_seq[list_segfeat_nb[i]][feature.pos_start-1:]
                        else: # else, the deletion will always start at the start of the first segment.
                            last_seq=seg_seq[list_segfeat_nb[i]]
                        #if i==0:
                        #    start=1
                    i+=1
                    
                else : # idk yet. if both segments are present in the other genome but not at the same position. probably substitution then
                    line="weird order change"
                    string_print+=line
                    cpt_print+=1
                    #file_out2.write(line)
                    var+=1;i+=1;j+=1

            else: # segment present in both. print the running indel if there is one
                if last=='insertion':
                    pos_old=int(Segments[last_start].start)-int(feat_start)+1

                    start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                    start_var=int(seg_genome[last_start][1])-1
                    pos_new=str(start_var-start_feat+1)
                    
                    line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                    string_print+=line
                    cpt_print+=1
                    #file_out2.write(line)
                    list_var[1][type_n]+=1
                    list_var_temp[1]+=1
                    var+=1

                elif last=='deletion':
                    if i==0:
                        pos_old=int(Segments[last_start].start)-int(feat_start)+1+feature.pos_start
                    else:
                        pos_old=int(Segments[last_start].start)-int(feat_start)+1

                    if last_in_azu=="": # deletion of the beggining of the feature, so no segment placed in the new genome yet. 
                        pos_new="1"
                    else :
                        start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
                        start_var=int(seg_genome[last_in_azu][2])+1 
                        pos_new=str(start_var-start_feat+1)
                    
                    line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                    string_print+=line
                    cpt_print+=1
                    #file_out2.write(line)
                    list_var[0][type_n]+=1
                    list_var_temp[0]+=1
                    var+=1

                last_in_azu=list_segfeat_azu[j]

                last='';last_taille=0;last_start='';last_seq='';i+=1;j+=1

                
            
        # finish printing the current indel
        if last=='insertion':
            pos_old=int(Segments[last_start].start)-int(feat_start)+1

            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
            start_var=int(seg_genome[last_start][1])-1
            pos_new=str(start_var-start_feat+1)

            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tinsertion\t-\t"+last_seq+"\t"+str(last_taille)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
            string_print+=line
            cpt_print+=1
            #file_out2.write(line)
            list_var[1][type_n]+=1
            list_var_temp[1]+=1
            var+=1

        elif last=='deletion':
            pos_old=int(Segments[last_start].start)-int(feat_start)+1

            start_feat=get_start_2(list_segfeat_azu[0],seg_genome,feat) 
            start_var=int(seg_genome[last_in_azu][2])+1 
            pos_new=str(start_var-start_feat+1)

            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+last_seq+"\t-\t"+str(len(last_seq))+"\t"+str(pos_old)+"\t"+pos_new+"\n"
            string_print+=line
            cpt_print+=1
            #file_out2.write(line)
            list_var[0][type_n]+=1
            list_var_temp[0]+=1
            var+=1


        # see if the end is missing for one of the two genomes
        if not((i>=len(list_segfeat_nb)-1) & (j>=len(list_segfeat_azu)-1)):
            if (i<len(list_segfeat_nb)-1):
                if not (j>=len(list_segfeat_azu)-1):
                    print('pb')
                # length of the deletion = position of the end of the feature - position of the start of the deletion
                #length=feature.stop-Segments[list_segfeat_nb[i]].start # pas bon, on prend le dernier segment en entier. len(del_sequence) plus sur. 
                pos_old=int(Segments[list_segfeat_nb[i]].start)-int(feat_start)+1

                del_sequence=""
                for k in range(i,len(list_segfeat_nb)):
                    if k==len(list_segfeat_nb):
                        del_sequence+=seg_seq[list_segfeat_nb[k]][0:feature.pos_stop]
                    else:
                        del_sequence+=seg_seq[list_segfeat_nb[k]]
                length=len(del_sequence)

                pos_new=str(length) # the deletion is at the end of the feature on the new genome

                line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(stop_new_genome)+"\t"+str(size_new_genome)+"\t"+inverted+"\t"+size_diff+"\tdeletion\t"+del_sequence+"\t-\t"+str(length)+"\t"+str(pos_old)+"\t"+pos_new+"\n"
                string_print+=line
                cpt_print+=1
                #file_out2.write(line)
                list_var[0][type_n]+=1
                var+=1
            elif (j<len(list_segfeat_azu)-1):
                print("ins at the end ?")
            #   string_print+=line   
            #   #file_out2.write("insertion at the end\n")
            #   list_var[1][type_n]+=1
            #   var+=1

        if var==0:
            line=feat+"\t"+feature.type+"\t"+feature.chr+"\t"+str(start_new_genome)+"\t"+str(size_new_genome)+"\t"+size_diff+"\tno_var\t-\t-\t-\t-\t-\n"
            string_print+=line
            cpt_print+=1
            #file_out2.write(line)
    #else:
        #line=feat+" : feature entirely absent.\n"
        #file_out2.write(line)


    for i in [0,1,2]:
        if list_var_temp[4]!=0:
            list_var_rate[i][type_n].append(list_var_temp[i]/list_var_temp[4])
            # no need to return this list, it modifies the original list directly

    if string_print!="":
        file_out2.write(string_print)

    return diff_list


# writes the gff of azucena using the gff of the graph
def genome_gff(pos_seg, gff, out, gfa):
    print("generation of the genome's gff ")
NMarthe's avatar
NMarthe committed
    # create a dictionnary with the positions of the segments on the genome to transfer on
    seg_genome={}
    bed=open(pos_seg,'r')
    lines=bed.readlines()
    for line in lines:
        line=line.split()
        if line[3][0:1]=='>':
            strand='+'
        elif line[3][0:1]=='<':
            strand='-'
        else:
            strand=''
        seg=line[3][1:]
        seg_genome[seg]=list([ch,start,stop,strand])
    # create a dictionnary with the sequences of the segments of the graph and a dictionary with the paths from the gfa. each genome name gives the list of its segments in order
    file_gfa=open(gfa,'r')
    lines_gfa=file_gfa.readlines()
    paths={}
    for line in lines_gfa:
        line=line.split()
        if (line[0]=="S"): # get the sequence of the segment
            seg_id='s'+line[1]
            seg_seq[seg_id]=line[2]

        if (line[0]=="W") & (line[1]!="_MINIGRAPH_"): # get the walk of the genome
            
            path=line[6].replace(">",";>")
            path=path.replace("<",";<").split(';')
            list_path=list()
            for segment in path:
                if segment[0:1]=='>':
                    list_path.append('+s'+segment[1:])
                elif segment[0:1]=='<':
                    list_path.append('-s'+segment[1:])
                else:
                    list_path.append('s'+segment[1:])
 
            paths[line[3]]=list_path


    file_out_detail = open("azucena_chr10_detail.gff", 'w')
    file_out=open(out,'w')
    # for stats by type of variation
    del_gene=0;del_mrna=0;del_intron=0;del_exon=0;del_3utr=0;del_5utr=0;del_cds=0
    deletions=[del_gene,del_mrna,del_intron,del_exon,del_3utr,del_5utr,del_cds]

    ins_gene=0;ins_mrna=0;ins_intron=0;ins_exon=0;ins_3utr=0;ins_5utr=0;ins_cds=0
    insertions=[ins_gene,ins_mrna,ins_intron,ins_exon,ins_3utr,ins_5utr,ins_cds]

    sub_gene=0;sub_mrna=0;sub_intron=0;sub_exon=0;sub_3utr=0;sub_5utr=0;sub_cds=0
    substitutions=[sub_gene,sub_mrna,sub_intron,sub_exon,sub_3utr,sub_5utr,sub_cds]

    nb_type=[0,0,0,0,0,0,0]
    length_type=[0,0,0,0,0,0,0]

    list_var=[deletions,insertions,substitutions,nb_type,length_type]

    deletions_rate=[list(),list(),list(),list(),list(),list(),list()]
    insertions_rate=[list(),list(),list(),list(),list(),list(),list()]
    substitutions_rate=[list(),list(),list(),list(),list(),list(),list()]
    list_var_rate=[deletions_rate,insertions_rate,substitutions_rate]

    stats=False
    plot=False
NMarthe's avatar
NMarthe committed
    # get the list of all the features to transfer from the gff
    lines=gff.readlines()
    for line in lines:
        id=(line.split()[8].split(";")[0].split("=")[1].replace(".","_").replace(":","_"))
        if id not in list_feature:
            list_feature.append(id)
NMarthe's avatar
NMarthe committed
    # create objects for stats on how many segments are absent in azucena, their average length, etc
    if stats==True:
        seg_first=list()
        seg_middle=list()
        seg_last=list()
        seg_ok=list()
        seg_entier=list()
        feature_missing_first=list()
        feature_missing_middle=list()
        feature_missing_last=list()
        feature_missing_all=list()
        feature_missing_total=list()
        feature_total=list()
        feature_ok=list()



        # for each feature, get list of the segments where it is and the first and last segment of the feature on the new genome

        first_seg="0"
        for segment in list_seg:
NMarthe's avatar
NMarthe committed
            if segment[1:] in seg_genome:
                first_seg=segment[1:]
                break
        last_seg="0"
        for segment in reversed(list_seg):
NMarthe's avatar
NMarthe committed
            if segment[1:] in seg_genome:
                last_seg=segment[1:]
        # stats on how many segments are absent in azucena, their average length, etc
NMarthe's avatar
NMarthe committed

        # segment missing that is at the start of a feature - seg_first
        # segment missing that is at the end of a feature - seg_last
        # segment missing that is in the middle of a feature - seg_middle
        # segment missing that contains the entire feature - seg_entier
        # total number of genes missing - seg_total_abs
        # segments not missing - seg_ok
        # segments missing of not - seg_total
NMarthe's avatar
NMarthe committed
            for segment in list_seg: # counts the segments in each category
NMarthe's avatar
NMarthe committed
                if segment[1:] not in seg_genome: # the segment is missing
NMarthe's avatar
NMarthe committed
                    if segment==list_seg[0]: # the segment is the first of the feature
                        if segment==list_seg[-1]: # the segment is the last of the feature
                            seg_entier.append(seg_len) # the segment is the first and the last, so it contains the entire feature
NMarthe's avatar
NMarthe committed
                            seg_first.append(seg_len) 
                    elif segment==list_seg[-1]: # the segment is the last of the feature
                        seg_last.append(seg_len)
                    else:
                        seg_middle.append(seg_len)
NMarthe's avatar
NMarthe committed
                else: # the segment is present on the genome
NMarthe's avatar
NMarthe committed
        # the fist segment of the feature is missing - feature_missing_first
        # the last segment of the feature is missing - feature_missing_last
        # at least one middle segment of the feature is missing - feature_missing_middle
        # the entire feature is missing ->feature_missing_all
        # at least one segment is missing first, last, or middle) - feature_missing_total
        # no segment is missing, the feature is complete - feature_ok  
        # total number of features, with missing segments or not - feature_total
            feature_total.append(feat)
NMarthe's avatar
NMarthe committed

            if (first_seg=='0') & (last_seg=='0') : # no segment of the feature is in the genome, the feature is missing entirely
                feature_missing_all.append(feat)
            elif first_seg != list_seg[0][1:]: # the first segment is missing 
                feature_missing_first.append(feat)
            elif last_seg!=list_seg[-1][1:]: # the last segment is missing
                feature_missing_last.append(feat)

NMarthe's avatar
NMarthe committed
            # go through all the segments, check if some are missing in the middle of the feature
            elif (len(list_seg)!=1) & (feat not in feature_missing_all): # to access the second to last element
                for segment in list_seg[1-(len(list_seg)-2)]:
                    if segment[1:] not in seg_genome:
                        feature_missing_middle.append(feat)
                        break


NMarthe's avatar
NMarthe committed
            # go through the segments, to see if one is missing anywhere on the feature
            for segment in list_seg:
NMarthe's avatar
NMarthe committed
               if segment[1:] not in seg_genome:
                   if feat not in feature_missing_total:
                        feature_missing_total.append(feat)
                        break

NMarthe's avatar
NMarthe committed
            # if the feature doesnt have a missing segment, it is complete.         ADD THE PATH CHECK !!
            if feat not in feature_missing_total:
                feature_ok.append(feat)



        # outputs each fragment of the feature in a gff  
        gff_detail(list_seg,seg_genome,feat,file_out_detail) # insertions !
        # outputs the feature once in a gff, from the first to the last segment present on the new genome
        n=2 # maximum difference size (n times bigger of smaller)
        result=gff_one(first_seg,last_seg,seg_genome,feat,list_seg,file_out,n)
        bad_size+=result[0]
        absent_features+=result[1]
        # outputs the detail of variations for the feature :
        variations_output=print_variations(list_seg,seg_genome,first_seg,last_seg,feat,file_out2,paths,seg_seq,list_var,list_var_rate)
        diff_list+=variations_output
    
    file_out_detail.close()
    file_out.close()
    file_out2.close()

    # changer la forme de l'output des variations
    #print(list_var)
    #print(list_var_rate)
    #print(sum(list_var_rate[1][0])/len(list_var_rate[1][0]))


    if plot==True:
        import seaborn as sns
        import matplotlib.pyplot as plt
        # set a grey background (use sns.set_theme() if seaborn version 0.11.0 or above) 
        sns.set(style="darkgrid")

        import pandas as pd

        df_deletions = pd.DataFrame(list_var_rate[0])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
        df_deletions=df_deletions.T
        df_deletions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']

        sns.violinplot(data=df_deletions)
        plt.ylim(-0.2, 0.2)
        plt.show()

        df_insertions = pd.DataFrame(list_var_rate[1])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
        df_insertions=df_insertions.T
        df_insertions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']

        sns.violinplot(data=df_insertions)
        plt.ylim(-0.2, 0.2)
        plt.show()


        df_substitutions = pd.DataFrame(list_var_rate[2])#, columns=['genes','mrna','introns','exons','3utr','5utr','cds'])
        df_substitutions=df_substitutions.T
        df_substitutions.columns = ['genes', 'mrna', 'introns', 'exons', '3utr', '5utr', 'cds']

        sns.violinplot(data=df_substitutions)
        plt.ylim(-0.2, 0.2)
        plt.show()


    # print stats
    from statistics import median, mean
    if stats==True:
        print(len(Features)-(bad_size+absent_features),"out of",len(Features),"features are transfered.")
        print(bad_size,"out of",len(Features), "features are not transfered because they are too big or too small compared to the original genome.")
        print(absent_features,"out of",len(Features),"features are not transfered because they are absent in the new genome.")
        if len(diff_list)!=0:
            print("average length difference of the transfered genes : ",sum(diff_list)/len(diff_list))
        stats_features(feature_missing_first,feature_missing_middle,feature_missing_last,feature_missing_all,feature_missing_total,list_feature,feature_ok)
        # prints the stats for the segments
        #print(len(seg_first),"segments missing at the beginning of a feature, of mean length", round(mean(seg_first),2), "and median length",median(seg_first))
        #print(len(seg_middle),"segments missing in the middle of a feature, of mean length", round(mean(seg_middle),2), "and median length",median(seg_middle))
        #print(len(seg_last), "segments missing at the end of a feature, of mean length", round(mean(seg_last),2), "and median length",median(seg_last))
        #print(len(seg_entier),"segments that have an entiere feature (not in beggining/middle/end) missing, of mean length", round(mean(seg_entier),2), "and median length",median(seg_entier))
        #print(len(seg_total),"segments that have a feature piece missing, of mean length", round(mean(seg_total),2), "and median length",median(seg_total))
        #print(len(seg_ok),"segments that have a feature found, of mean length", round(mean(seg_ok),2), "and median length", median(seg_ok))