Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
frangiPANe_snake
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
DIADE
frangiPANe_snake
Commits
65ae93b3
Commit
65ae93b3
authored
1 year ago
by
christine.tranchant_ird.fr
Browse files
Options
Downloads
Patches
Plain Diff
Integrating module load and singu
parent
267daf0a
No related branches found
No related tags found
1 merge request
!1
Draft: Resolve "Vecscreen"
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
frangiPANe/snakefiles/vecscreen.smk
+6
-78
6 additions, 78 deletions
frangiPANe/snakefiles/vecscreen.smk
frangiPANe/snakemake_scripts/vecscreen.py
+73
-0
73 additions, 0 deletions
frangiPANe/snakemake_scripts/vecscreen.py
with
79 additions
and
78 deletions
frangiPANe/snakefiles/vecscreen.smk
+
6
−
78
View file @
65ae93b3
...
...
@@ -9,12 +9,15 @@ rule vecscreen:
univec_file = f"{frangipane_obj.path_univec}"
output:
fasta_file = f"{frangipane_obj.path_vecscreen}/{{sample_name}}-{{ref_name}}-vecscreen.fa"
singularity:
tools_config['SINGULARITY']['TOOLS']
envmodules:
tools_config["ENVMODULE"]["NCBI-TOOLS"]
log:
error = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.e",
output = f"{frangipane_obj.path_logs}/vecscreen/{{sample_name}}-{{ref_name}}.log.o"
params:
length = frangipane_obj.get_config_value('CONTIGS','minLength'),
module = tools_config["ENVMODULE"]["NCBI-TOOLS"]
message:
f"""--------------------
-Running {{rule}}
...
...
@@ -26,80 +29,5 @@ rule vecscreen:
| - Filtering on sequences longer than : {{params.length}}
--------------------
"""
run:
from Bio import SeqIO
# Load NCBI-TOOLS module
cmd_load=f"module load {params.module}"
nb_seq, nb_seq_no_hits, nb_seq_suspicious = 0, 0, 0
with open(os.path.join(output.fasta_file), "w") as o:
for seq in SeqIO.parse(input.fasta_file, "fasta"):
# Vecscreen only works with fasta file, not sequence
tmp_file = output.fasta_file + ".tmp"
with open(tmp_file, "w") as tmp:
tmp.write('>' + str(seq.description) + '\n')
tmp.write(gs.format_60(str(seq.seq)) + '\n')
cmd = f'module load {params.module} && vecscreen -db {input.univec_file} -query {tmp_file} -outfmt 1 -text_output'
print(f"\t\t\ {seq.description} : vecscreen cmd : {cmd}")
try:
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
print(f"########## ERROR running Vecscreen command: {e}")
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if 'No hits found' in process.stdout:
o.write('>' + str(seq.description) + '\n')
o.write(gs.format_60(str(seq.seq)) + '\n')
nb_seq += 1
nb_seq_no_hits += 1
else:
# print(process.stdout)
# print("Hits : >" + str(seq.description))
nb_seq_suspicious += 1
bounds = []
for line in process.stdout.split("\n"):
if len(line) > 0:
if line[0].isnumeric():
for value in line.split('\t'):
bounds.append(int(value))
min_seq = 0
max_seq = len(seq)
min_bounds = min(bounds)
max_bounds = max(bounds)
if min_bounds == min_seq + 1:
min_seq = max_bounds
new_length = max_seq - max_bounds
if new_length >= params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
elif max_bounds == max_seq:
max_seq = min_bounds
new_length = max_seq
if new_length >= params.length:
header_fields = str(seq.description).split()
o.write('>' + header_fields[0] + " " + str(new_length) + " " + " ".join(
header_fields[2:]) + '\n')
o.write(gs.format_60(str(seq.seq[min_seq:max_seq])) + '\n')
nb_seq += 1
else:
print("Warning (seq : " + str(
seq.description) + ") : the detected contamination appears to be in the middle of the contig !")
cmd = f'rm {tmp_file}'
process = subprocess.run(cmd, shell=True, capture_output=True, text=True)
# text = f"""
# ### VecScreen
# <hr>
# * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
# * SUSPICIOUS : {nb_seq_suspicious}
# * KEPT : {nb_seq}
# """
\ No newline at end of file
shell:
f"{frangipane_obj.snakemake_scripts}/vecscreen.py"
\ No newline at end of file
This diff is collapsed.
Click to expand it.
frangiPANe/snakemake_scripts/vecscreen.py
0 → 100644
+
73
−
0
View file @
65ae93b3
from
Bio
import
SeqIO
nb_seq
,
nb_seq_no_hits
,
nb_seq_suspicious
=
0
,
0
,
0
with
open
(
os
.
path
.
join
(
output
.
fasta_file
),
"
w
"
)
as
o
:
for
seq
in
SeqIO
.
parse
(
snakemake
.
input
.
fasta_file
,
"
fasta
"
):
# Vecscreen only works with fasta file, not sequence
tmp_file
=
snakemake
.
output
.
fasta_file
+
"
.tmp
"
with
open
(
tmp_file
,
"
w
"
)
as
tmp
:
tmp
.
write
(
'
>
'
+
str
(
seq
.
description
)
+
'
\n
'
)
tmp
.
write
(
gs
.
format_60
(
str
(
seq
.
seq
))
+
'
\n
'
)
print
(
f
"
\t\t
\
{
seq
.
description
}
: vecscreen cmd :
{
cmd
}
"
)
try
:
subprocess
.
run
(
cmd
,
shell
=
True
,
check
=
True
)
except
subprocess
.
CalledProcessError
as
e
:
print
(
f
"
########## ERROR running Vecscreen command:
{
e
}
"
)
process
=
subprocess
.
run
(
cmd
,
shell
=
True
,
capture_output
=
True
,
text
=
True
)
if
'
No hits found
'
in
process
.
stdout
:
o
.
write
(
'
>
'
+
str
(
seq
.
description
)
+
'
\n
'
)
o
.
write
(
gs
.
format_60
(
str
(
seq
.
seq
))
+
'
\n
'
)
nb_seq
+=
1
nb_seq_no_hits
+=
1
else
:
# print(process.stdout)
# print("Hits : >" + str(seq.description))
nb_seq_suspicious
+=
1
bounds
=
[]
for
line
in
process
.
stdout
.
split
(
"
\n
"
):
if
len
(
line
)
>
0
:
if
line
[
0
].
isnumeric
():
for
value
in
line
.
split
(
'
\t
'
):
bounds
.
append
(
int
(
value
))
min_seq
=
0
max_seq
=
len
(
seq
)
min_bounds
=
min
(
bounds
)
max_bounds
=
max
(
bounds
)
if
min_bounds
==
min_seq
+
1
:
min_seq
=
max_bounds
new_length
=
max_seq
-
max_bounds
if
new_length
>=
snakemake
.
params
.
length
:
header_fields
=
str
(
seq
.
description
).
split
()
o
.
write
(
'
>
'
+
header_fields
[
0
]
+
"
"
+
str
(
new_length
)
+
"
"
+
"
"
.
join
(
header_fields
[
2
:])
+
'
\n
'
)
o
.
write
(
gs
.
format_60
(
str
(
seq
.
seq
[
min_seq
:
max_seq
]))
+
'
\n
'
)
nb_seq
+=
1
elif
max_bounds
==
max_seq
:
max_seq
=
min_bounds
new_length
=
max_seq
if
new_length
>=
snakemake
.
params
.
length
:
header_fields
=
str
(
seq
.
description
).
split
()
o
.
write
(
'
>
'
+
header_fields
[
0
]
+
"
"
+
str
(
new_length
)
+
"
"
+
"
"
.
join
(
header_fields
[
2
:])
+
'
\n
'
)
o
.
write
(
gs
.
format_60
(
str
(
seq
.
seq
[
min_seq
:
max_seq
]))
+
'
\n
'
)
nb_seq
+=
1
else
:
print
(
"
Warning (seq :
"
+
str
(
seq
.
description
)
+
"
) : the detected contamination appears to be in the middle of the contig !
"
)
cmd
=
f
'
rm
{
tmp_file
}
'
process
=
subprocess
.
run
(
cmd
,
shell
=
True
,
capture_output
=
True
,
text
=
True
)
# text = f"""
# ### VecScreen
# <hr>
# * NO HITS : {nb_seq_no_hits}index_blast(vec_file.value)
# * SUSPICIOUS : {nb_seq_suspicious}
# * KEPT : {nb_seq}
# """
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment