Add files via upload

f7f9d527 · thdurand4 · GitHub · 0fbfd71f · f7f9d527 · f7f9d527
Unverified Commit f7f9d527 authored 2 years ago by thdurand4 Committed by GitHub 2 years ago
--- a/config/config.yaml
+++ b/config/config.yaml
+DATA:
+    PROTEIN: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/fasta_prot/"
+    OUTPUT: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/results/"
+    SCRIPTS: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/scripts/"
+    BDD_PFAM: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/BDD_PFAM/Pfam-A.hmm"
+    GFF: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/GFF3_protein/"
+
+TOOLS_PARAMS:
+    PARSE_TARGETP_TRESHOLD: "0.8"
+    PARSE_SIGNALP_TRESHOLD: "0.8"
+    PARSE_TMHMM_TRESHOLD: "1"
+    PARSE_WOLFPOSORT_TRESHOLD: "14"
+    HMMER: "--cpu 8 -E 0.003"
\ No newline at end of file
--- a/run_worflow.sh
+++ b/run_worflow.sh
+#!/bin/sh
+### Job name
+#SBATCH --job-name=annotation
+
+### Requirements
+#SBATCH --partition=long
+
+
+
+### Output
+#SBATCH --output=/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/log_effector.out
+#SBATCH --error=/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/log_effector.err
+module purge
+module load r
+module load python/3.7
+
+
+snakemake --profile effector --use-envmodules
--- a/scripts/count_effectors.py
+++ b/scripts/count_effectors.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+from collections import Counter
+import click
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--gff', '-g', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to input gff file')
+@click.option('--output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output file')
+@click.option('--fasta_file', '-fasta', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta file of effector')
+
+
+def main(gff, output, fasta_file):
+    """This count the number of effector per contig"""
+
+
+    id_effector = []
+    gff_parse = []
+    final_effectors = []
+    dico_gff = defaultdict(list)
+
+    for record in SeqIO.parse(fasta_file, "fasta"):
+        id_effector.append(record.id)
+
+
+
+    with open(gff,"r") as f1 :
+        for lignes in f1:
+                ligne = lignes.rstrip("\n")
+                col = ligne.split("\t")
+                id_1 = re.sub("ID=", "", col[8])
+                id_2 = re.sub(";Name=\w+", "", id_1)
+                if col[2] =="gene":
+                    gff_parse.append(col[0]+" "+col[1]+" "+col[2]+" "+id_2)
+                    dico_gff[col[0]].append(id_2)
+
+
+
+
+    #print(len(id_effector))
+
+
+    for cle in dico_gff:
+        for elem in dico_gff[cle]:
+            for id in id_effector:
+                if elem == id :
+                    final_effectors.append(cle)
+
+
+    counts = pd.Series(final_effectors).value_counts()
+    counts.to_csv(output, header = False, sep="\t")
+    #print(counts)
+
+    #print(df)
+
+
+if __name__ == '__main__':
+    main()
+
--- a/scripts/fasta_intersect.py
+++ b/scripts/fasta_intersect.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--protein_file', '-p', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein')
+@click.option('--secreted_id', '-s', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to ID with signal peptide')
+@click.option('--fasta_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein output')
+
+def main(protein_file, secreted_id, fasta_output):
+    """This programme use ID of protein with signal peptide and protein fasta file to generate fasta protein with this ID"""
+    signalpeptide = []
+    fasta_prot = defaultdict(str)
+
+    with open(secreted_id, "r") as f1:
+        for lignes in f1:
+            without_backspace = lignes.rstrip("\n")
+            signalpeptide.append(without_backspace)
+
+    for record in SeqIO.parse(protein_file, "fasta"):
+        fasta_prot[record.id] = record
+
+    fasta_secretion = []
+
+    for cle in fasta_prot:
+        for elem in signalpeptide:
+            if elem == cle:
+                fasta_secretion.append(fasta_prot[elem])
+
+    SeqIO.write(fasta_secretion,fasta_output,"fasta")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/scripts/gff_sort.py
+++ b/scripts/gff_sort.py
+import pandas as pd
+import click
+import re
+import os
+import sys
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--gff', '-g', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to input gff file')
+@click.option('--output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output file')
+@click.option('--strain_name', '-name', default=None,
+              type=click.STRING,
+              required=True, show_default=True, help='Name of the strain')
+
+
+def main(gff, output, strain_name):
+    """This programme rename ID of the gff3 file"""
+    gene_gff = []
+    with open(gff, "r") as f1:
+        for lignes in f1:
+            col = lignes.split("\t")
+            if re.search("gene",col[2]):
+                id_strain = re.sub("ID=","ID="+strain_name+"_",col[8])
+                prot_gff = re.sub(";","T0;",id_strain)
+                gene_gff.append(col[0]+"\t"+col[1]+"\t"+col[2]+"\t"+col[3]+"\t"+col[4]+"\t"+col[5]+"\t"+col[6]+"\t"+col[7]+"\t"+prot_gff)
+
+    output_file = open(output,"w")
+    for elem in gene_gff:
+        output_file.write(elem)
+    output_file.close()
+if __name__ == '__main__':
+    main()
--- a/scripts/id_secreted_to_fasta.py
+++ b/scripts/id_secreted_to_fasta.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--protein_file', '-fasta', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein sorted with tmhmm')
+@click.option('--id_secreted_protein', '-id', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to id of secreted protein (output of wolfpsort parsed)')
+@click.option('--fasta_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein secreted output')
+
+def main(protein_file, id_secreted_protein, fasta_output):
+    """This programme use ID of secreted protein with signal peptide and protein fasta file to generate fasta protein with this ID"""
+    id_secreted = []
+    fasta_prot = defaultdict(str)
+
+    with open(id_secreted_protein, "r") as f1:
+        for lignes in f1:
+            without_backspace = lignes.rstrip("\n")
+            id_secreted.append(without_backspace)
+
+
+    for record in SeqIO.parse(protein_file, "fasta"):
+        fasta_prot[record.id] = record
+
+    fasta_secretion = []
+
+    for cle in fasta_prot:
+        for elem in id_secreted:
+            if elem == cle:
+                fasta_secretion.append(fasta_prot[elem])
+
+    SeqIO.write(fasta_secretion,fasta_output,"fasta")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/scripts/intersect.sh
+++ b/scripts/intersect.sh
+#!/bin/sh
+
+# intersect FILE...
+# Print the lines that occur in every FILE given as argument.
+# Each file is de-duplicated first, so after merging, a line present in all
+# files is counted exactly $# times by "uniq -c".
+intersect() {
+    for file in "$@"; do
+        sort -u "$file"
+    done | sort | uniq -c | grep "^ *$# " | sed -e 's/^ *[0-9][0-9]* //'
+    # The sed strips the "<padding><count><space>" prefix whatever its width,
+    # so counts of 10 or more and other uniq padding widths are handled too.
+}
+
+
+# Forward every command-line argument, quoted, so that paths containing
+# spaces survive and any number of input files is accepted.
+intersect "$@"
--- a/scripts/parse_phobius.py
+++ b/scripts/parse_phobius.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--phobius_file', '-t', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output of phobius file')
+@click.option('--spphobius_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to create output of secreted ID of sorted protein')
+
+def main(phobius_file, spphobius_output):
+    """ This program retrieve ID of secreted protein of phobius with TM == 0 or TM == 1 """
+    sptargetp_id = []
+    with open(phobius_file) as f1:
+        for lignes in f1:
+            if re.search("^[^SEQUENCE]",lignes):
+                col = lignes.split()
+                if col[1] == "0" and col[2] == "Y":
+                    #print(col[0],col[1],col[2])
+                    sptargetp_id.append(col[0])
+                if col[1] == "1" and col[2] == "Y":
+                    #print(col[0], col[1], col[2])
+                    sptargetp_id.append(col[0])
+
+    output_id = open(spphobius_output, "w")
+    for elem in sptargetp_id:
+        output_id.write(elem+"\n")
+    output_id.close()
+
+if __name__ == '__main__':
+    main()
+
+
+
--- a/scripts/parse_predgpi.py
+++ b/scripts/parse_predgpi.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--predgpi_file', '-p', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output of predgpi tool')
+@click.option('--predgpi_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to create ID of sorted protein with no membrane anchor')
+
+def main(predgpi_file, predgpi_output):
+    """ This program retrieve ID of secreted protein of predGPI with no anchor membrane"""
+    predgpi_id = []
+    with open(predgpi_file) as f1:
+        for lignes in f1:
+            col = lignes.split("\t")
+            if col[2] != "GPI-anchor":
+                #print(col[0],col[2])
+                predgpi_id.append(col[0])
+
+    output_id = open(predgpi_output, "w")
+    for elem in predgpi_id:
+        output_id.write(elem+"\n")
+    output_id.close()
+
+if __name__ == '__main__':
+    main()
+
+
+
--- a/scripts/parse_signalp.py
+++ b/scripts/parse_signalp.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--signalp_file_gff', '-s', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output of gff3 of signalp')
+@click.option('--spsignalp_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to create secreted ID of sorted protein')
+@click.option('--treshold', '-th', default=None,
+              type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
+              required=True, show_default=True, help='treshold for protein with signal peptide (between 0 & 1)')
+
+def main(signalp_file_gff, spsignalp_output, treshold):
+    """ This program retrieve ID of secreted protein of signalp with cutoff between 0 & 1 """
+    spsignalp_id = []
+    with open(signalp_file_gff) as f1:
+        for lignes in f1:
+            if re.search("^[^#]",lignes):
+                col = lignes.split("\t")
+                if float(col[5]) >= treshold:
+                    #print(col[0],col[5])
+                    spsignalp_id.append(col[0])
+
+    output_id = open(spsignalp_output, "w")
+    for elem in spsignalp_id:
+        output_id.write(elem+"\n")
+    output_id.close()
+
+if __name__ == '__main__':
+    main()
+
+
+
--- a/scripts/parse_targetp.py
+++ b/scripts/parse_targetp.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--targetp_file', '-t', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output of targetp2 file')
+@click.option('--sptargetp_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to create secreted ID of sorted protein')
+@click.option('--treshold', '-th', default=None,
+              type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
+              required=True, show_default=True, help='treshold for protein with signal peptide (between 0 & 1)')
+
+def main(targetp_file, sptargetp_output, treshold):
+    """ This program retrieve ID of secreted protein of targetp with cutoff between 0 & 1 """
+    sptargetp_id = []
+    with open(targetp_file) as f1:
+        for lignes in f1:
+            if re.search("^[^#]",lignes):
+                col = lignes.split("\t")
+                if col[1] == "SP" and float(col[3])>= treshold:
+                    #print(col[0],col[1],col[3])
+                    sptargetp_id.append(col[0])
+
+    output_id = open(sptargetp_output, "w")
+    for elem in sptargetp_id:
+        output_id.write(elem+"\n")
+    output_id.close()
+
+if __name__ == '__main__':
+    main()
+
+
+
--- a/scripts/parse_tmhmm.py
+++ b/scripts/parse_tmhmm.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+import click
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--tmhmm_file', '-in', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to input of tmhmm results')
+@click.option('--parsetmhmm_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to create secreted ID of sorted protein')
+@click.option('--transmembranaire', '-tm', default=None,
+              type=click.INT,
+              required=True, show_default=True, help='Number of maxiumum of transmembranaire domain you want to keep')
+
+def main(tmhmm_file, parsetmhmm_output, transmembranaire):
+    """ This program parse the output of tool tmhmm with a cutoff and transmembranaire domain (max) """
+    tm_parsing = []
+    with open(tmhmm_file) as f1:
+        for lignes in f1:
+                ligne = lignes.rstrip("\n")
+                col = ligne.split("\t")
+                nb_tm = col[4].split("=")
+                if  int(nb_tm[1]) <= transmembranaire:
+                    tm_parsing.append(ligne)
+    output_tmhmm = open(parsetmhmm_output, "w")
+    for elem in tm_parsing:
+        output_tmhmm.write(elem+"\n")
+    output_tmhmm.close()
+
+if __name__ == '__main__':
+    main()
+
--- a/scripts/parse_wolfpsort.py
+++ b/scripts/parse_wolfpsort.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import pandas as pd
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--wolfpsort_file', '-in', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to the wolfpsort file')
+@click.option('--spwolfpsort_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output of wolfpsort parsed')
+@click.option('--treshold', '-th', default=None,
+              type=click.INT,
+              required=True, show_default=True, help='treshold for score wolfpsort')
+
+def main(wolfpsort_file, spwolfpsort_output, treshold):
+    """ This program retrieve ID of secreted protein of wolfpsort with score cutoff wolfpsort """
+    wolfpsort_id = []
+    with open(wolfpsort_file) as f1:
+        for lignes in f1:
+            if re.search("^[^#]",lignes):
+                good_lignes = re.sub(",","",lignes)
+                col = good_lignes.split(" ")
+                if col[1] == "extr" and int(col[2])>= treshold:
+                    print(col[0],col[1],col[2])
+                    wolfpsort_id.append(col[0])
+
+    output_id = open(spwolfpsort_output, "w")
+    for elem in wolfpsort_id:
+        output_id.write(elem+"\n")
+    output_id.close()
+
+if __name__ == '__main__':
+    main()
--- a/scripts/rename_prot.py
+++ b/scripts/rename_prot.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+from Bio.Seq import Seq
+import click
+import os
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--protein_file', '-p', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein')
+@click.option('--fasta_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to create fasta sorted protein')
+@click.option('--strain_name', '-name', default=None,
+              type=click.STRING,
+              required=True, show_default=True, help='Name of the strain')
+
+
+
+def main(protein_file, fasta_output, strain_name):
+    """This programme remove * character on protein seq and rename it"""
+    # read fasta and save to dict
+    sorted_prot = []
+    for record in SeqIO.parse(protein_file, "fasta"):
+        record.id = str(strain_name)+"_"+str(record.id)
+        no_stop = re.sub("\*","",str(record.seq))
+        record.seq = Seq(no_stop)
+        sorted_prot.append(record)
+        record.description =""
+        record.name = ""
+
+    SeqIO.write(sorted_prot,fasta_output,"fasta")
+
+if __name__ == '__main__':
+    main()
--- a/scripts/tmhmm_to_fasta.py
+++ b/scripts/tmhmm_to_fasta.py
+import re
+import sys
+from collections import defaultdict
+from Bio import SeqIO
+import click
+
+
+@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
+@click.option('--protein_file', '-p', default=None,
+              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein sorted with the 4 predicted tools (PredGPI , Phobius, SignalP, TargetP')
+@click.option('--tmhmm_parsed', '-tmhmm', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to output file of tmhmm parsed')
+@click.option('--fasta_output', '-o', default=None,
+              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
+              required=True, show_default=True, help='Path to fasta protein output')
+
+def main(protein_file, tmhmm_parsed, fasta_output):
+    """This programme use ID of protein with signal peptide and protein fasta file to generate fasta protein with this ID"""
+    id_tmhmm = []
+    fasta_prot = defaultdict(str)
+
+    with open(tmhmm_parsed, "r") as f1:
+        for lignes in f1:
+            without_backspace = lignes.rstrip("\n")
+            col = without_backspace.split("\t")
+            id_tmhmm.append(col[0])
+
+
+    for record in SeqIO.parse(protein_file, "fasta"):
+        fasta_prot[record.id] = record
+
+    fasta_secretion = []
+
+    for cle in fasta_prot:
+        for elem in id_tmhmm:
+            if elem == cle:
+                fasta_secretion.append(fasta_prot[elem])
+
+    SeqIO.write(fasta_secretion,fasta_output,"fasta")
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
--- a/snakefile
+++ b/snakefile