"tsg_io/git@forge.ird.fr:us191/TSG-QC.git" did not exist on "35905a69c5dad57fb6c9a8a6d740f664f35b850e"
Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--dbcan', '-i', default=None,
type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
required=True, show_default=True, help='Path to input dbcan result file')
@click.option('--gff', '-g', default=None,
type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
required=True, show_default=True, help='Path to input gff file')
@click.option('--output', '-o', default=None,
type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
required=True, show_default=True, help='Path to output count csv file')
def main(dbcan, gff, output):
dbcan = pd.read_table(dbcan)
dbcan.columns = ["gene", "HMMER", "Hotpep", "Diamond", "#ofTools"]
dbcan = dbcan[dbcan["#ofTools"] == 3]
gff = pd.read_table(gff)
gff.columns = ["contig", "source", "feature", "start", "stop", "score_1", "strand", "score_2", "id"]
gff = gff[gff["feature"] == "gene"]
gff["id"] = gff["id"].str.replace(r'Name=', "")
gff["id"] = gff["id"].str.replace(r'Parent=', "")
gff["id"] = gff["id"].str.replace(r'\;[0-9a-zA-Z_]*', "")
gff["id"] = gff["id"].str.replace(r'ID=', "")
print(gff.head())
gff = gff[gff["id"].isin(dbcan["gene"])]
#gff.reset_index(inplace=True)
dbcan = pd.concat([dbcan, gff], axis=1)
count = dbcan["contig"].value_counts()
count.to_csv(str(output))
if __name__ == '__main__':
main()