diff --git a/pyproject.toml b/pyproject.toml index 58c986a..96d8ee2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,8 @@ dependencies = [ "fastapi", "starlette", "uvicorn", - "polars<1.38.0" + "polars<1.38.0", + "pandas==2.2.1" ] dynamic = ["version"] diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index 1d25546..e072098 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -10,6 +10,7 @@ import requests from Bio.SearchIO import HSP from Bio.SearchIO import parse as parse_blat +from Bio.SearchIO import read as read_blat from Bio.SearchIO._model import Hit, QueryResult from cool_seq_tool.schemas import Strand @@ -92,7 +93,11 @@ def get_ref_genome_file( def _run_blat( - target_args: str, query_file: Path, out_file: str, silent: bool + target_args: str, + query_file: Path, + reference_file: Path, + out_file: str, + silent: bool, ) -> subprocess.CompletedProcess: """Execute BLAT binary with relevant params. @@ -109,9 +114,8 @@ def _run_blat( :param silent: if True, suppress all console output :return: process result """ - reference_genome_file = get_ref_genome_file(silent=silent) bin_name = os.environ["BLAT_BIN_PATH"] if "BLAT_BIN_PATH" in os.environ else "blat" # noqa: SIM401 - command = f"{bin_name} {reference_genome_file} {target_args} -minScore=20 {query_file} {out_file}" + command = f"{bin_name} {reference_file} {target_args} -minScore=20 {query_file} {out_file}" _logger.debug("Running BLAT command: %s", command) result = subprocess.run( # noqa: UP022 command, @@ -186,7 +190,10 @@ def _get_blat_output( # TODO consider implementing support for mixed types, not hard to do - just split blat into two files and run command with each set of arguments. msg = "Mapping for score sets with a mix of nucleotide and protein target sequences is not currently supported." raise NotImplementedError(msg) - process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + reference_genome_file = get_ref_genome_file(silent=silent) + process_result = _run_blat( + target_args, query_file, reference_genome_file, "/dev/stdout", silent + ) out_file = _write_blat_output_tempfile(process_result) try: @@ -194,7 +201,9 @@ def _get_blat_output( except ValueError: target_args = "-q=dnax -t=dnax" - process_result = _run_blat(target_args, query_file, "/dev/stdout", silent) + process_result = _run_blat( + target_args, query_file, reference_genome_file, "/dev/stdout", silent + ) out_file = _write_blat_output_tempfile(process_result) try: output = parse_blat(out_file, "blat-psl") @@ -254,7 +263,7 @@ def _get_best_hit(output: QueryResult, chromosome: str | None) -> Hit: return best_score_hit -def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None) -> HSP: +def _get_best_hsp(hit: Hit, gene_location: GeneLocation | None = None) -> HSP: """Retrieve preferred HSP from BLAT Hit object. If gene location data is available, prefer the HSP with the least distance @@ -491,3 +500,50 @@ def build_alignment_result( alignment_result = fetch_alignment(metadata, silent) return alignment_result + + +def align_target_to_protein( + target_sequence: str, reference_sequence: str, silent: bool +) -> AlignmentResult: + with tempfile.NamedTemporaryFile() as query_tmp_file, tempfile.NamedTemporaryFile() as reference_tmp_file: + _write_query_file(Path(query_tmp_file.name), [">query", target_sequence]) + _write_query_file( + Path(reference_tmp_file.name), [">reference", reference_sequence] + ) + target_args = "-q=prot -t=prot" + + process_result = _run_blat( + target_args, + query_tmp_file.name, + reference_tmp_file.name, + "/dev/stdout", + silent, + ) + out_file = _write_blat_output_tempfile(process_result) + + try: + blat_output = read_blat(out_file, "blat-psl") + + except ValueError as e: + msg = "Unable to run successful BLAT on target sequence against selected reference protein sequence." + raise AlignmentError(msg) from e + + best_hit = _get_best_hit(blat_output, None) + best_hsp = _get_best_hsp(best_hit) + + query_subranges = [] + hit_subranges = [] + for fragment in best_hsp: + query_subranges.append( + SequenceRange(start=fragment.query_start, end=fragment.query_end) + ) + hit_subranges.append( + SequenceRange(start=fragment.hit_start, end=fragment.hit_end) + ) + + return AlignmentResult( + query_range=SequenceRange(start=best_hsp.query_start, end=best_hsp.query_end), + query_subranges=query_subranges, + hit_range=SequenceRange(start=best_hsp.hit_start, end=best_hsp.hit_end), + hit_subranges=hit_subranges, + ) diff --git a/src/dcd_mapping/schemas.py b/src/dcd_mapping/schemas.py index 7d213df..75f0be6 100644 --- a/src/dcd_mapping/schemas.py +++ b/src/dcd_mapping/schemas.py @@ -106,8 +106,8 @@ class MappedReferenceSequence(ReferenceSequence): class AlignmentResult(BaseModel): """Define BLAT alignment output.""" - chrom: str - strand: Strand + chrom: str | None = None + strand: Strand | None = None coverage: float | None = None ident_pct: float | None = None query_range: SequenceRange diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index b36e043..4d4be1c 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -21,6 +21,7 @@ from mavehgvs.util import parse_variant_strings from mavehgvs.variant import Variant +from dcd_mapping.align import align_target_to_protein from dcd_mapping.exceptions import ( MissingSequenceIdError, UnsupportedReferenceSequenceNameSpaceError, @@ -181,12 +182,12 @@ def _create_post_mapped_hgvs_strings( tx: TxSelectResult | None = None, alignment: AlignmentResult | None = None, accession_id: str | None = None, + protein_alignment: AlignmentResult | None = None, ) -> list[str]: """Generate a list of (post-mapped) HGVS strings from one long string containing many valid HGVS substrings. - For protein annotations, these strings must be adjusted to match the offset defined by the start of the - transcript sequence. For genomic annotations, these strings must be adjusted to match the coordinates of - the reference alignment. + For protein annotations, these strings must be adjusted to match the alignment of the target to the selected reference protein sequence. + For genomic annotations, these strings must be adjusted to match the coordinates of the reference alignment. :param raw_description: A string containing valid HGVS sub-strings :param layer: An enum denoting the targeted annotation layer of these HGVS strings @@ -220,7 +221,7 @@ def _create_post_mapped_hgvs_strings( if layer is AnnotationLayer.PROTEIN: assert tx # noqa: S101. mypy help - variant = _adjust_protein_variant_to_ref(variant, tx) + variant = _adjust_protein_variant_to_ref(variant, protein_alignment) hgvs_strings.append(tx.np + ":" + str(variant)) elif layer is AnnotationLayer.GENOMIC: if accession_id: @@ -250,14 +251,50 @@ def _create_post_mapped_hgvs_strings( def _adjust_protein_variant_to_ref( variant: Variant, - tx: TxSelectResult, + alignment: AlignmentResult, ) -> Variant: + # adjust starts - hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based + starts = [] if isinstance(variant.positions, Iterable): + is_multi_position = True for position in variant.positions: - position.position = position.position + tx.start - return variant + starts.append(position.position - 1) + else: + is_multi_position = False + starts.append(variant.positions.position - 1) + + # get hit + query_subrange_containing_hit = None + target_subrange_containing_hit = None + for query_subrange, target_subrange in zip( + alignment.query_subranges, alignment.hit_subranges, strict=True + ): + if all( + start >= query_subrange.start and start < query_subrange.end + for start in starts + ): + query_subrange_containing_hit = query_subrange + target_subrange_containing_hit = target_subrange + break + + if query_subrange_containing_hit is None or target_subrange_containing_hit is None: + msg = f"Cannot process variant {variant} because it is not fully contained within the aligned portion of the query sequence compared to the selected reference sequence" + raise ValueError(msg) + + for idx, start in enumerate(starts): + # get variant start relative to the reference (the "hit") + # distance from beginning of query to variant start position: + query_to_start = start - query_subrange_containing_hit.start + + # distance from beginning of ref to the variant start position: + ref_to_start = target_subrange_containing_hit.start + query_to_start + + # add distance from ref to variant start; hgvs is 1-based, so convert back to 1-based + if is_multi_position: + variant.positions[idx].position = ref_to_start + 1 + else: + variant.positions.position = ref_to_start + 1 - variant.positions.position = variant.positions.position + tx.start return variant @@ -368,6 +405,7 @@ def _map_protein_coding_pro( row: ScoreRow, sequence_id: str, transcript: TxSelectResult, + protein_align_result: AlignmentResult, ) -> MappedScore: """Construct VRS object mapping for ``hgvs_pro`` variant column entry @@ -375,7 +413,8 @@ def _map_protein_coding_pro( :param row: A row of output from a MaveDB score set :param sequence_id: The GA4GH accession for the provided sequence - :param transcript: The transcript selection information for a score set + :param transcript: The transcript selection information for a target + :param protein_align_result: The protein-protein alignment result for a target against its selected protein reference :return: VRS mapping object if mapping succeeds """ if ( @@ -444,6 +483,7 @@ def _map_protein_coding_pro( row.hgvs_pro, AnnotationLayer.PROTEIN, tx=transcript, + protein_alignment=protein_align_result, ) post_mapped_protein = _construct_vrs_allele( post_mapped_hgvs_strings, @@ -729,6 +769,7 @@ def _map_protein_coding( records: list[ScoreRow], transcript: TxSelectResult | TxSelectError, align_result: AlignmentResult, + silent: bool, ) -> list[MappedScore]: """Perform mapping on protein coding experiment results @@ -748,6 +789,13 @@ def _map_protein_coding( sequence = metadata.target_sequence psequence_id = gsequence_id = store_sequence(sequence) + # align protein sequence to selected reference protein sequence to get offsets and gaps + protein_align_result = None + if isinstance(transcript, TxSelectResult): + protein_align_result = align_target_to_protein( + sequence, transcript.sequence, silent + ) + variations: list[MappedScore] = [] for row in records: hgvs_nt_mappings = None @@ -765,9 +813,16 @@ def _map_protein_coding( error_message=str(transcript).strip("'"), ) else: - if _hgvs_pro_is_valid(row.hgvs_pro): + if _hgvs_pro_is_valid(row.hgvs_pro) and protein_align_result is not None: hgvs_pro_mappings = _map_protein_coding_pro( - row, psequence_id, transcript + row, psequence_id, transcript, protein_align_result + ) + # This should not occur because protein align result is only None if transcript selection failed, which should be caught by the TxSelectError check above. + elif protein_align_result is None: + hgvs_pro_mappings = MappedScore( + accession_id=row.accession, + score=row.score, + error_message="Could not perform mapping for protein variant because transcript sequence is missing or could not be aligned to reference sequence", ) elif ( not hgvs_nt_mappings @@ -954,6 +1009,7 @@ def vrs_map( records, transcript, align_result, + silent, ) return _map_regulatory_noncoding( metadata,