Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ dependencies = [
"fastapi",
"starlette",
"uvicorn",
"polars<1.38.0"
"polars<1.38.0",
"pandas==2.2.1"
]
dynamic = ["version"]

Expand Down
68 changes: 62 additions & 6 deletions src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import requests
from Bio.SearchIO import HSP
from Bio.SearchIO import parse as parse_blat
from Bio.SearchIO import read as read_blat
from Bio.SearchIO._model import Hit, QueryResult
from cool_seq_tool.schemas import Strand

Expand Down Expand Up @@ -92,7 +93,11 @@ def get_ref_genome_file(


def _run_blat(
target_args: str, query_file: Path, out_file: str, silent: bool
target_args: str,
query_file: Path,
reference_file: Path,
out_file: str,
silent: bool,
) -> subprocess.CompletedProcess:
"""Execute BLAT binary with relevant params.

Expand All @@ -109,9 +114,8 @@ def _run_blat(
:param silent: if True, suppress all console output
:return: process result
"""
reference_genome_file = get_ref_genome_file(silent=silent)
bin_name = os.environ["BLAT_BIN_PATH"] if "BLAT_BIN_PATH" in os.environ else "blat" # noqa: SIM401
command = f"{bin_name} {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}"
command = f"{bin_name} {reference_file} {target_args} -minScore=20 {query_file} {out_file}"
_logger.debug("Running BLAT command: %s", command)
result = subprocess.run( # noqa: UP022
command,
Expand Down Expand Up @@ -186,15 +190,20 @@ def _get_blat_output(
# TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments.
msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported."
raise NotImplementedError(msg)
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
reference_genome_file = get_ref_genome_file(silent=silent)
process_result = _run_blat(
target_args, query_file, reference_genome_file, "/dev/stdout", silent
)
out_file = _write_blat_output_tempfile(process_result)

try:
output = parse_blat(out_file, "blat-psl")

except ValueError:
target_args = "-q=dnax -t=dnax"
process_result = _run_blat(target_args, query_file, "/dev/stdout", silent)
process_result = _run_blat(
target_args, query_file, reference_genome_file, "/dev/stdout", silent
)
out_file = _write_blat_output_tempfile(process_result)
try:
output = parse_blat(out_file, "blat-psl")
Expand Down Expand Up @@ -254,7 +263,7 @@ def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit:
return best_score_hit


def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP:
def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None = None) -> HSP:
"""Retrieve preferred HSP from BLAT Hit object.

If gene location data is available, prefer the HSP with the least distance
Expand Down Expand Up @@ -491,3 +500,50 @@ def build_alignment_result(
alignment_result = fetch_alignment(metadata, silent)

return alignment_result


def align_target_to_protein(
target_sequence: str, reference_sequence: str, silent: bool
) -> AlignmentResult:
with tempfile.NamedTemporaryFile() as query_tmp_file, tempfile.NamedTemporaryFile() as reference_tmp_file:
_write_query_file(Path(query_tmp_file.name), [">query", target_sequence])
_write_query_file(
Path(reference_tmp_file.name), [">reference", reference_sequence]
)
target_args = "-q=prot -t=prot"

process_result = _run_blat(
target_args,
query_tmp_file.name,
reference_tmp_file.name,
"/dev/stdout",
silent,
)
out_file = _write_blat_output_tempfile(process_result)

try:
blat_output = read_blat(out_file, "blat-psl")

except ValueError as e:
msg = "Unable to run successful BLAT on target sequence against selected reference protein sequence."
raise AlignmentError(msg) from e

best_hit = _get_best_hit(blat_output, None)
best_hsp = _get_best_hsp(best_hit)

query_subranges = []
hit_subranges = []
for fragment in best_hsp:
query_subranges.append(
SequenceRange(start=fragment.query_start, end=fragment.query_end)
)
hit_subranges.append(
SequenceRange(start=fragment.hit_start, end=fragment.hit_end)
)

return AlignmentResult(
query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end),
query_subranges=query_subranges,
hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end),
hit_subranges=hit_subranges,
)
4 changes: 2 additions & 2 deletions src/dcd_mapping/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@ class MappedReferenceSequence(ReferenceSequence):
class AlignmentResult(BaseModel):
"""Define BLAT alignment output."""

chrom: str
strand: Strand
chrom: str | None = None
strand: Strand | None = None
coverage: float | None = None
ident_pct: float | None = None
query_range: SequenceRange
Expand Down
62 changes: 55 additions & 7 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from mavehgvs.util import parse_variant_strings
from mavehgvs.variant import Variant

from dcd_mapping.align import align_target_to_protein
from dcd_mapping.exceptions import (
MissingSequenceIdError,
UnsupportedReferenceSequenceNameSpaceError,
Expand Down Expand Up @@ -181,6 +182,7 @@ def _create_post_mapped_hgvs_strings(
tx: TxSelectResult | None = None,
alignment: AlignmentResult | None = None,
accession_id: str | None = None,
protein_alignment: AlignmentResult | None = None,
) -> list[str]:
"""Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings.

Expand Down Expand Up @@ -220,7 +222,7 @@ def _create_post_mapped_hgvs_strings(
if layer is AnnotationLayer.PROTEIN:
assert tx # noqa: S101. mypy help

variant = _adjust_protein_variant_to_ref(variant, tx)
variant = _adjust_protein_variant_to_ref(variant, protein_alignment)
hgvs_strings.append(tx.np + ":" + str(variant))
elif layer is AnnotationLayer.GENOMIC:
if accession_id:
Expand Down Expand Up @@ -250,14 +252,50 @@ def _create_post_mapped_hgvs_strings(

def _adjust_protein_variant_to_ref(
variant: Variant,
tx: TxSelectResult,
alignment: AlignmentResult,
) -> Variant:
# adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based
starts = []
if isinstance(variant.positions, Iterable):
is_multi_position = True
for position in variant.positions:
position.position = position.position + tx.start
return variant
starts.append(position.position - 1)
else:
is_multi_position = False
starts.append(variant.positions.position - 1)

# get hit
query_subrange_containing_hit = None
target_subrange_containing_hit = None
for query_subrange, target_subrange in zip(
alignment.query_subranges, alignment.hit_subranges, strict=True
):
if all(
start >= query_subrange.start and start < query_subrange.end
for start in starts
):
query_subrange_containing_hit = query_subrange
target_subrange_containing_hit = target_subrange
break

if query_subrange_containing_hit is None or target_subrange_containing_hit is None:
msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence compared to the selected reference sequence"
raise ValueError(msg)

for idx, start in enumerate(starts):
# get variant start relative to the reference (the "hit")
# distance from beginning of query to variant start position:
query_to_start = start - query_subrange_containing_hit.start

# distance from beginning of ref to the variant start position:
ref_to_start = target_subrange_containing_hit.start + query_to_start

# add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based
if is_multi_position:
variant.positions[idx].position = ref_to_start + 1
else:
variant.positions.position = ref_to_start + 1

variant.positions.position = variant.positions.position + tx.start
return variant


Expand Down Expand Up @@ -368,14 +406,16 @@ def _map_protein_coding_pro(
row: ScoreRow,
sequence_id: str,
transcript: TxSelectResult,
protein_align_result: AlignmentResult,
) -> MappedScore:
"""Construct VRS object mapping for ``hgvs_pro`` variant column entry

These arguments are a little lazy and could be pruned down later

:param row: A row of output from a MaveDB score set
:param sequence_id: The GA4GH accession for the provided sequence
:param transcript: The transcript selection information for a score set
:param transcript: The transcript selection information for a target
:param protein_align_result: The protein-protein alignment result for a target against its selected protein reference
:return: VRS mapping object if mapping succeeds
"""
if (
Expand Down Expand Up @@ -444,6 +484,7 @@ def _map_protein_coding_pro(
row.hgvs_pro,
AnnotationLayer.PROTEIN,
tx=transcript,
protein_alignment=protein_align_result,
)
post_mapped_protein = _construct_vrs_allele(
post_mapped_hgvs_strings,
Expand Down Expand Up @@ -729,6 +770,7 @@ def _map_protein_coding(
records: list[ScoreRow],
transcript: TxSelectResult | TxSelectError,
align_result: AlignmentResult,
silent: bool,
) -> list[MappedScore]:
"""Perform mapping on protein coding experiment results

Expand All @@ -748,6 +790,11 @@ def _map_protein_coding(
sequence = metadata.target_sequence
psequence_id = gsequence_id = store_sequence(sequence)

# align protein sequence to selected reference protein sequence to get offsets and gaps
protein_align_result = align_target_to_protein(
sequence, transcript.sequence, silent
)

variations: list[MappedScore] = []
for row in records:
hgvs_nt_mappings = None
Expand All @@ -767,7 +814,7 @@ def _map_protein_coding(
else:
if _hgvs_pro_is_valid(row.hgvs_pro):
hgvs_pro_mappings = _map_protein_coding_pro(
row, psequence_id, transcript
row, psequence_id, transcript, protein_align_result
)
elif (
not hgvs_nt_mappings
Expand Down Expand Up @@ -954,6 +1001,7 @@ def vrs_map(
records,
transcript,
align_result,
silent,
)
return _map_regulatory_noncoding(
metadata,
Expand Down
Loading