Skip to content
Snippets Groups Projects
Unverified Commit f7f9d527 authored by thdurand4's avatar thdurand4 Committed by GitHub
Browse files

Add files via upload

parent 0fbfd71f
No related branches found
No related tags found
No related merge requests found
# Workflow configuration.
# DATA holds the input/output locations, TOOLS_PARAMS the per-tool settings.
# NOTE(review): 4-space nesting is assumed; confirm against how the Snakefile
# reads these keys.
DATA:
    # Directory of protein FASTA files.
    PROTEIN: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/fasta_prot/"
    # Directory receiving the results.
    OUTPUT: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/results/"
    # Directory containing the helper scripts.
    SCRIPTS: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/scripts/"
    # Pfam-A HMM database file.
    BDD_PFAM: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/BDD_PFAM/Pfam-A.hmm"
    # Directory of GFF3 annotation files.
    GFF: "/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/GFF3_protein/"
TOOLS_PARAMS:
    # Cut-offs, presumably forwarded to the matching parser scripts -- TODO confirm.
    PARSE_TARGETP_TRESHOLD: "0.8"
    PARSE_SIGNALP_TRESHOLD: "0.8"
    PARSE_TMHMM_TRESHOLD: "1"
    PARSE_WOLFPOSORT_TRESHOLD: "14"
    # Extra command-line options for HMMER.
    HMMER: "--cpu 8 -E 0.003"
\ No newline at end of file
#!/bin/sh
# Slurm batch script that launches the Snakemake workflow.
### Job name
#SBATCH --job-name=annotation
### Requirements
#SBATCH --partition=long
### Output
#SBATCH --output=/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/log_effector.out
#SBATCH --error=/shared/home/tdurand/annotation_fijiensism2/workflow_effectors/log_effector.err
# Start from an empty module environment, then load the R and Python 3.7 modules.
module purge
module load r
module load python/3.7
# Run Snakemake with the "effector" profile; --use-envmodules is passed so
# rules can use environment modules.
snakemake --profile effector --use-envmodules
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
from collections import Counter
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--gff', '-g', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to input gff file')
@click.option('--output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output file')
@click.option('--fasta_file', '-fasta', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta file of effector')
def main(gff, output, fasta_file):
    """Count the number of effectors per contig.

    Gene features of the GFF whose ID matches a record of the effector FASTA
    are tallied per contig (first GFF column). The tallies are written to the
    output file as a headerless, tab-separated table.
    """
    # Number of FASTA records per effector ID (normally one each). A Counter
    # gives O(1) lookups while keeping the tally identical to comparing every
    # gene against every FASTA record.
    effector_hits = Counter(record.id for record in SeqIO.parse(fasta_file, "fasta"))

    genes_by_contig = defaultdict(list)
    with open(gff, "r") as gff_handle:
        for raw_line in gff_handle:
            col = raw_line.rstrip("\n").split("\t")
            # Header, comment and blank lines have fewer than nine columns;
            # indexing them used to raise IndexError.
            if len(col) < 9 or col[2] != "gene":
                continue
            # Drop the "ID=" tag and the ";Name=..." attribute from column 9.
            gene_id = re.sub(r";Name=\w+", "", col[8].replace("ID=", ""))
            genes_by_contig[col[0]].append(gene_id)

    # One entry per (gene, matching FASTA record); value_counts() below turns
    # this list into the per-contig totals.
    final_effectors = []
    for contig, gene_ids in genes_by_contig.items():
        for gene_id in gene_ids:
            final_effectors.extend([contig] * effector_hits[gene_id])

    # An explicit dtype keeps the empty case free of pandas dtype warnings.
    counts = pd.Series(final_effectors, dtype="object").value_counts()
    counts.to_csv(output, header=False, sep="\t")


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--protein_file', '-p', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein')
@click.option('--secreted_id', '-s', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to ID with signal peptide')
@click.option('--fasta_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein output')
def main(protein_file, secreted_id, fasta_output):
    """Write the proteins whose ID is listed in the signal-peptide ID file.

    The ID file holds one protein ID per line. Matching records of the protein
    FASTA are written to the output FASTA in the order of the protein file.
    """
    # A set gives O(1) membership tests and makes an ID that is listed several
    # times select its record only once.
    with open(secreted_id, "r") as id_handle:
        wanted_ids = {line.rstrip("\n") for line in id_handle}

    # Index the proteins by ID; if an ID is repeated in the FASTA, its first
    # position and its last record are kept.
    records_by_id = {}
    for record in SeqIO.parse(protein_file, "fasta"):
        records_by_id[record.id] = record

    selected = [record for prot_id, record in records_by_id.items() if prot_id in wanted_ids]
    SeqIO.write(selected, fasta_output, "fasta")


if __name__ == '__main__':
    main()
\ No newline at end of file
import pandas as pd
import click
import re
import os
import sys
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--gff', '-g', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to input gff file')
@click.option('--output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output file')
@click.option('--strain_name', '-name', default=None,
              type=click.STRING,
              required=True, show_default=True, help='Name of the strain')
def main(gff, output, strain_name):
    """Rename the IDs of the gene features of a GFF3 file.

    Features whose type contains "gene" are kept. In their attribute column
    every "ID=" gets the strain name and "_" appended, and "T0" is inserted
    before every ";". All other lines are dropped.
    """
    renamed = []
    with open(gff, "r") as gff_handle:
        for raw_line in gff_handle:
            col = raw_line.rstrip("\n").split("\t")
            # Header, comment and blank lines have fewer than nine columns;
            # indexing them used to raise IndexError.
            if len(col) < 9 or "gene" not in col[2]:
                continue
            # Plain string replacement: the strain name is inserted literally,
            # even if it contains backslashes.
            attributes = col[8].replace("ID=", "ID=" + strain_name + "_").replace(";", "T0;")
            # Only the first nine columns are written; the newline is re-added
            # explicitly so records can never run together.
            renamed.append("\t".join(col[:8] + [attributes]) + "\n")

    with open(output, "w") as output_file:
        output_file.writelines(renamed)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--protein_file', '-fasta', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein sorted with tmhmm')
@click.option('--id_secreted_protein', '-id', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to id of secreted protein (output of wolfpsort parsed)')
@click.option('--fasta_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein secreted output')
def main(protein_file, id_secreted_protein, fasta_output):
    """Write the proteins whose ID is listed in the secreted-protein ID file.

    The ID file holds one protein ID per line. Matching records of the protein
    FASTA are written to the output FASTA in the order of the protein file.
    """
    # A set gives O(1) membership tests and makes an ID that is listed several
    # times select its record only once.
    with open(id_secreted_protein, "r") as id_handle:
        wanted_ids = {line.rstrip("\n") for line in id_handle}

    # Index the proteins by ID; if an ID is repeated in the FASTA, its first
    # position and its last record are kept.
    records_by_id = {}
    for record in SeqIO.parse(protein_file, "fasta"):
        records_by_id[record.id] = record

    selected = [record for prot_id, record in records_by_id.items() if prot_id in wanted_ids]
    SeqIO.write(selected, fasta_output, "fasta")


if __name__ == '__main__':
    main()
\ No newline at end of file
#!/bin/sh
# Print the lines that occur in every file given on the command line.

# intersect FILE...
#   De-duplicate each file, pool the results and keep the lines whose
#   occurrence count equals the number of files, i.e. lines present in all.
intersect() {
    for file in "$@"; do
        sort -u "$file"
    done | sort | uniq -c |
        # uniq -c prefixes each line with a padded count. Compare that count
        # numerically with the number of files, then remove the whole prefix
        # so the line is printed without leading blanks.
        awk -v n="$#" '$1 == n { sub(/^[ \t]*[0-9]+[ \t]/, ""); print }'
}

# Quoted so file names containing blanks stay intact.
intersect "$@"
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--phobius_file', '-t', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output of phobius file')
@click.option('--spphobius_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to create output of secreted ID of sorted protein')
def main(phobius_file, spphobius_output):
    """Retrieve the IDs of secreted proteins from a Phobius result file.

    A protein is kept when its second field (TM) is 0 or 1 and its third
    field (SP) is "Y". The IDs are written one per line.
    """
    secreted_ids = []
    with open(phobius_file) as phobius_handle:
        for raw_line in phobius_handle:
            col = raw_line.split()
            # Blank or truncated lines carry no prediction.
            if len(col) < 3:
                continue
            # Skip the header row by its first word. The former test,
            # "^[^SEQUENCE]", was a character class and therefore discarded
            # every protein whose ID starts with S, E, Q, U, N or C.
            if col[0] in ("SEQENCE", "SEQUENCE"):
                continue
            if col[1] in ("0", "1") and col[2] == "Y":
                secreted_ids.append(col[0])

    with open(spphobius_output, "w") as output_id:
        output_id.writelines(prot_id + "\n" for prot_id in secreted_ids)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--predgpi_file', '-p', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output of predgpi tool')
@click.option('--predgpi_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to create ID of sorted protein with no membrane anchor')
def main(predgpi_file, predgpi_output):
    """Retrieve the IDs of proteins without a GPI anchor from a PredGPI result file.

    The first column of every line whose third column is not "GPI-anchor" is
    written to the output file, one ID per line.
    """
    predgpi_id = []
    with open(predgpi_file) as predgpi_handle:
        for raw_line in predgpi_handle:
            # Strip the newline first: when the third column is the last one,
            # "GPI-anchor\n" would otherwise never compare equal.
            col = raw_line.rstrip("\n").split("\t")
            # Blank, comment or truncated lines have no third column.
            if len(col) < 3:
                continue
            if col[2] != "GPI-anchor":
                predgpi_id.append(col[0])

    with open(predgpi_output, "w") as output_id:
        output_id.writelines(prot_id + "\n" for prot_id in predgpi_id)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--signalp_file_gff', '-s', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output of gff3 of signalp')
@click.option('--spsignalp_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to create secreted ID of sorted protein')
@click.option('--treshold', '-th', default=None,
              type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
              required=True, show_default=True, help='treshold for protein with signal peptide (between 0 & 1)')
def main(signalp_file_gff, spsignalp_output, treshold):
    """Retrieve the IDs of secreted proteins from a SignalP GFF3 file.

    A protein is kept when the score in the sixth column is greater than or
    equal to the threshold. The IDs are written one per line.
    """
    spsignalp_id = []
    with open(signalp_file_gff) as signalp_handle:
        for raw_line in signalp_handle:
            if raw_line.startswith("#"):
                continue
            col = raw_line.rstrip("\n").split("\t")
            # Blank or truncated lines have no score column; indexing them
            # used to raise IndexError.
            if len(col) < 6:
                continue
            if float(col[5]) >= treshold:
                spsignalp_id.append(col[0])

    with open(spsignalp_output, "w") as output_id:
        output_id.writelines(prot_id + "\n" for prot_id in spsignalp_id)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--targetp_file', '-t', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output of targetp2 file')
@click.option('--sptargetp_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to create secreted ID of sorted protein')
@click.option('--treshold', '-th', default=None,
              type=click.FloatRange(min=0, max=1, min_open=True, max_open=True),
              required=True, show_default=True, help='treshold for protein with signal peptide (between 0 & 1)')
def main(targetp_file, sptargetp_output, treshold):
    """Retrieve the IDs of secreted proteins from a TargetP result file.

    A protein is kept when its second column is "SP" and the value in its
    fourth column is greater than or equal to the threshold. The IDs are
    written one per line.
    """
    sptargetp_id = []
    with open(targetp_file) as targetp_handle:
        for raw_line in targetp_handle:
            if raw_line.startswith("#"):
                continue
            col = raw_line.rstrip("\n").split("\t")
            # Blank or truncated lines have no prediction/score columns;
            # indexing them used to raise IndexError.
            if len(col) < 4:
                continue
            if col[1] == "SP" and float(col[3]) >= treshold:
                sptargetp_id.append(col[0])

    with open(sptargetp_output, "w") as output_id:
        output_id.writelines(prot_id + "\n" for prot_id in sptargetp_id)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--tmhmm_file', '-in', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to input of tmhmm results')
@click.option('--parsetmhmm_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to create secreted ID of sorted protein')
@click.option('--transmembranaire', '-tm', default=None,
              type=click.INT,
              required=True, show_default=True, help='Number of maxiumum of transmembranaire domain you want to keep')
def main(tmhmm_file, parsetmhmm_output, transmembranaire):
    """Filter a TMHMM result file on the number of transmembrane domains.

    Lines whose fifth tab-separated field ("<key>=<count>") has a count lower
    than or equal to the given maximum are copied unchanged to the output file.
    """
    tm_parsing = []
    with open(tmhmm_file) as tmhmm_handle:
        for raw_line in tmhmm_handle:
            ligne = raw_line.rstrip("\n")
            col = ligne.split("\t")
            # Blank or truncated lines have no fifth field; indexing them used
            # to raise IndexError.
            if len(col) < 5:
                continue
            # The count is the part after "=" in the fifth field.
            nb_tm = col[4].split("=")
            if int(nb_tm[1]) <= transmembranaire:
                tm_parsing.append(ligne)

    with open(parsetmhmm_output, "w") as output_tmhmm:
        output_tmhmm.writelines(kept + "\n" for kept in tm_parsing)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import pandas as pd
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--wolfpsort_file', '-in', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to the wolfpsort file')
@click.option('--spwolfpsort_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output of wolfpsort parsed')
@click.option('--treshold', '-th', default=None,
              type=click.INT,
              required=True, show_default=True, help='treshold for score wolfpsort')
def main(wolfpsort_file, spwolfpsort_output, treshold):
    """Retrieve the IDs of secreted proteins from a WoLF PSORT result file.

    A protein is kept when its second field is "extr" and the score in its
    third field is greater than or equal to the threshold. The IDs are written
    one per line.
    """
    wolfpsort_id = []
    with open(wolfpsort_file) as wolfpsort_handle:
        for raw_line in wolfpsort_handle:
            if raw_line.startswith("#"):
                continue
            # Commas are removed before splitting on single spaces.
            col = raw_line.replace(",", "").split(" ")
            # Blank or truncated lines have no localisation/score fields;
            # indexing them used to raise IndexError.
            if len(col) < 3:
                continue
            # float() accepts integer and fractional scores alike, whereas
            # int() raised ValueError on a value such as "12.5".
            if col[1] == "extr" and float(col[2]) >= treshold:
                # Echo each retained hit on stdout, as before.
                print(col[0], col[1], col[2])
                wolfpsort_id.append(col[0])

    with open(spwolfpsort_output, "w") as output_id:
        output_id.writelines(prot_id + "\n" for prot_id in wolfpsort_id)


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
from Bio.Seq import Seq
import click
import os
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--protein_file', '-p', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein')
@click.option('--fasta_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to create fasta sorted protein')
@click.option('--strain_name', '-name', default=None,
              type=click.STRING,
              required=True, show_default=True, help='Name of the strain')
def main(protein_file, fasta_output, strain_name):
    """Remove the "*" characters from protein sequences and rename the records.

    Every record ID is prefixed with the strain name and "_"; the description
    and name are emptied so the FASTA header contains the new ID only.
    """
    sorted_prot = []
    for record in SeqIO.parse(protein_file, "fasta"):
        record.id = str(strain_name) + "_" + str(record.id)
        # Plain string replacement; the former regex literal "\*" is an
        # invalid escape sequence that recent Python versions warn about.
        record.seq = Seq(str(record.seq).replace("*", ""))
        record.description = ""
        record.name = ""
        sorted_prot.append(record)
    SeqIO.write(sorted_prot, fasta_output, "fasta")


if __name__ == '__main__':
    main()
import re
import sys
from collections import defaultdict
from Bio import SeqIO
import click
@click.command(context_settings={'help_option_names': ('-h', '--help'), "max_content_width": 800})
@click.option('--protein_file', '-p', default=None,
              type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein sorted with the 4 predicted tools (PredGPI , Phobius, SignalP, TargetP')
@click.option('--tmhmm_parsed', '-tmhmm', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to output file of tmhmm parsed')
@click.option('--fasta_output', '-o', default=None,
              type=click.Path(exists=False, file_okay=True, dir_okay=False, readable=True, resolve_path=True),
              required=True, show_default=True, help='Path to fasta protein output')
def main(protein_file, tmhmm_parsed, fasta_output):
    """Write the proteins whose ID appears in the parsed TMHMM file.

    The ID is the first tab-separated field of each line of the parsed TMHMM
    file. Matching records of the protein FASTA are written to the output
    FASTA in the order of the protein file.
    """
    # A set gives O(1) membership tests and makes an ID that is listed several
    # times select its record only once.
    with open(tmhmm_parsed, "r") as tmhmm_handle:
        wanted_ids = {line.rstrip("\n").split("\t")[0] for line in tmhmm_handle}

    # Index the proteins by ID; if an ID is repeated in the FASTA, its first
    # position and its last record are kept.
    records_by_id = {}
    for record in SeqIO.parse(protein_file, "fasta"):
        records_by_id[record.id] = record

    selected = [record for prot_id, record in records_by_id.items() if prot_id in wanted_ids]
    SeqIO.write(selected, fasta_output, "fasta")


if __name__ == '__main__':
    main()
\ No newline at end of file
snakefile 0 → 100644
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment