Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
from collections import Counter
import click
def _read_effector_ids(fasta_file):
    """Return a Counter mapping each FASTA record id to its number of records.

    A Counter (rather than a set) is used so that an id appearing several
    times in the FASTA file keeps contributing once per occurrence.
    """
    return Counter(record.id for record in SeqIO.parse(fasta_file, "fasta"))


def _gene_id_from_attributes(attributes):
    """Return the gene identifier held in a GFF attributes column (column 9).

    The value of the ``ID`` key is returned whatever other ``key=value``
    pairs surround it. When no ``ID`` key is present, the text is cleaned
    with the historical rule (drop ``ID=`` and ``;Name=<word>``) so inputs
    that matched before still match.
    """
    for field in attributes.split(";"):
        key, sep, value = field.strip().partition("=")
        if sep and key == "ID":
            return value
    return re.sub(r";Name=\w+", "", attributes.replace("ID=", ""))


def _genes_by_contig(gff):
    """Map each contig (GFF column 1) to the ids of its ``gene`` features.

    Contigs are kept in order of first appearance in the file.
    """
    genes = defaultdict(list)
    with open(gff, "r") as handle:
        for line in handle:
            # Header directives ("##..."), comments and blank lines carry no
            # feature and would otherwise break the column indexing below.
            if line.startswith("#") or not line.strip():
                continue
            col = line.rstrip("\r\n").split("\t")
            # A feature line needs all nine columns; anything shorter (for
            # example sequence lines after a "##FASTA" directive) is ignored.
            if len(col) < 9 or col[2] != "gene":
                continue
            genes[col[0]].append(_gene_id_from_attributes(col[8]))
    return genes


@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--gff', '-g', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to input gff file')
@click.option('--output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output file')
@click.option('--fasta_file', '-fasta', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta file of effector')
def main(gff, output, fasta_file):
    """Count the number of effector genes on each contig.

    Gene features are read from the GFF file and matched by ID against the
    record ids of the effector FASTA file. The per-contig totals are written
    to the output file as tab-separated "contig, count" lines without header,
    sorted by decreasing count. Contigs without any effector are not listed.
    """
    effector_ids = _read_effector_ids(fasta_file)

    # One entry per (gene, matching FASTA record) pair, grouped by contig.
    final_effectors = []
    for contig, gene_ids in _genes_by_contig(gff).items():
        for gene_id in gene_ids:
            # Counter returns 0 for unknown ids, so non-effector genes add nothing.
            final_effectors.extend([contig] * effector_ids[gene_id])

    counts = pd.Series(final_effectors).value_counts()
    counts.to_csv(output, header=False, sep="\t")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()