diff --git a/rust/bioscript-cli/src/report_options.rs b/rust/bioscript-cli/src/report_options.rs index b7e51d6..f6a95b9 100644 --- a/rust/bioscript-cli/src/report_options.rs +++ b/rust/bioscript-cli/src/report_options.rs @@ -247,7 +247,7 @@ fn generate_app_report(options: &AppReportOptions) -> Result<(), String> { input_index: options.loader.input_index.clone(), reference_file: options.loader.reference_file.clone(), reference_index: options.loader.reference_index.clone(), - detect_sex: options.detect_sex, + detect_sex: should_detect_sex(options), }; let mut input_inspection = inspect_file(input_file, &inspect_options).map_err(|err| err.to_string())?; @@ -369,6 +369,10 @@ fn loader_with_inspection( loader } +fn should_detect_sex(options: &AppReportOptions) -> bool { + options.detect_sex && options.sample_sex.is_none() +} + fn open_app_html_report_if_requested(options: &AppReportOptions) { if options.open_report && let Err(err) = open_html_report(&options.output_dir.join("index.html")) @@ -538,6 +542,7 @@ mod app_report_option_tests { assert_eq!(options.analysis_max_duration_ms, 2500); assert!(options.detect_sex); assert_eq!(options.sample_sex, Some(InferredSex::Female)); + assert!(!should_detect_sex(&options)); assert_eq!(options.loader.format, Some(GenotypeSourceFormat::Vcf)); assert_eq!( options.loader.input_index, diff --git a/rust/bioscript-formats/src/genotype.rs b/rust/bioscript-formats/src/genotype.rs index bf8f53a..b3af89f 100644 --- a/rust/bioscript-formats/src/genotype.rs +++ b/rust/bioscript-formats/src/genotype.rs @@ -936,11 +936,19 @@ mod tests { kind: Some(VariantKind::Insertion), ..VariantSpec::default() }; - assert!(vcf_row_matches_variant( + assert!(!vcf_row_matches_variant( &deletion_row, &insertion, Some(Assembly::Grch38) )); + let insertion_row = parse_vcf_record("1\t10\trsins\tT\tTC\t.\tPASS\t.\tGT\t0/1") + .unwrap() + .unwrap(); + assert!(vcf_row_matches_variant( + &insertion_row, + &insertion, + Some(Assembly::Grch38) + )); assert!(!vcf_row_matches_variant( &row, &VariantSpec::default(), diff --git a/rust/bioscript-formats/src/genotype/cram_backend/store.rs b/rust/bioscript-formats/src/genotype/cram_backend/store.rs index a957f4a..9ddadda 100644 --- a/rust/bioscript-formats/src/genotype/cram_backend/store.rs +++ b/rust/bioscript-formats/src/genotype/cram_backend/store.rs @@ -224,7 +224,7 @@ impl CramBackend { } } -const DEFAULT_MAX_CRAM_WORKERS: usize = 2; +const DEFAULT_MAX_CRAM_WORKERS: usize = 1; fn cram_lookup_worker_count(job_count: usize) -> usize { if job_count <= 1 { @@ -328,8 +328,12 @@ mod tests { #[test] fn cram_lookup_worker_count_bounds_requested_and_available_workers() { + unsafe { + env::remove_var("BIOSCRIPT_CRAM_THREADS"); + } assert_eq!(cram_lookup_worker_count(0), 1); assert_eq!(cram_lookup_worker_count(1), 1); + assert_eq!(cram_lookup_worker_count(8), 1); unsafe { env::set_var("BIOSCRIPT_CRAM_THREADS", "2"); diff --git a/rust/bioscript-formats/src/genotype/vcf/matching.rs b/rust/bioscript-formats/src/genotype/vcf/matching.rs index 9496a4c..5e71aa8 100644 --- a/rust/bioscript-formats/src/genotype/vcf/matching.rs +++ b/rust/bioscript-formats/src/genotype/vcf/matching.rs @@ -135,9 +135,8 @@ pub(crate) fn vcf_row_matches_variant( && alternate.len() < row.reference.len() }) } - VariantKind::Insertion | VariantKind::Indel => { - row.position == locus.start.saturating_sub(1) - } + VariantKind::Insertion => insertion_row_matches_variant(row, variant, &locus), + VariantKind::Indel => indel_row_matches_variant(row, variant, &locus), VariantKind::Other => row.position == locus.start, } } @@ -204,3 +203,62 @@ fn snp_row_has_catalog_allele(row: &ParsedVcfRow, variant: &VariantSpec) -> bool .any(|candidate| candidate.eq_ignore_ascii_case(reference)) }) } + +fn insertion_row_matches_variant( + row: &ParsedVcfRow, + variant: &VariantSpec, + locus: &GenomicLocus, +) -> bool { + indel_position_matches(row.position, locus) + && row.alternates.iter().any(|alternate| { + alternate.len() > row.reference.len() + && row_matches_catalog_alleles(row, alternate, variant) + }) +} + +fn indel_row_matches_variant( + row: &ParsedVcfRow, + variant: &VariantSpec, + locus: &GenomicLocus, +) -> bool { + indel_position_matches(row.position, locus) + && row.alternates.iter().any(|alternate| { + alternate.len() != row.reference.len() + && row_matches_catalog_alleles(row, alternate, variant) + }) +} + +fn indel_position_matches(row_position: i64, locus: &GenomicLocus) -> bool { + row_position == locus.start || row_position == locus.start.saturating_sub(1) +} + +fn row_matches_catalog_alleles(row: &ParsedVcfRow, alternate: &str, variant: &VariantSpec) -> bool { + let reference_matches = variant + .reference + .as_ref() + .is_none_or(|reference| reference.eq_ignore_ascii_case(&row.reference)); + let alternate_matches = variant + .alternate + .as_ref() + .is_none_or(|expected| expected.eq_ignore_ascii_case(alternate)); + + if reference_matches && alternate_matches { + return true; + } + + let Some(expected_reference) = variant.reference.as_ref() else { + return false; + }; + let Some(expected_alternate) = variant.alternate.as_ref() else { + return reference_matches; + }; + + indel_delta(expected_reference, expected_alternate) == indel_delta(&row.reference, alternate) +} + +fn indel_delta(reference: &str, alternate: &str) -> Option { + let alternate_len = isize::try_from(alternate.len()).ok()?; + let reference_len = isize::try_from(reference.len()).ok()?; + let delta = alternate_len - reference_len; + (delta != 0).then_some(delta) +} diff --git a/rust/bioscript-formats/tests/file_formats/vcf.rs b/rust/bioscript-formats/tests/file_formats/vcf.rs index e892615..54c8d98 100644 --- a/rust/bioscript-formats/tests/file_formats/vcf.rs +++ b/rust/bioscript-formats/tests/file_formats/vcf.rs @@ -264,7 +264,8 @@ fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { ##FORMAT=\n\ #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n\ 1\t99\t.\tAT\tA\t.\tPASS\t.\tGT\t0/1\n\ - chr1\t199\t.\tA\tATG\t.\tPASS\t.\tGT\t0/1\n", + chr1\t199\t.\tA\tATG\t.\tPASS\t.\tGT\t0/1\n\ + chr1\t250\trs8176719\tT\tTC\t.\tPASS\t.\tGT\t0/1\n", ) .unwrap(); @@ -313,6 +314,27 @@ fn vcf_locus_lookup_handles_deletion_insertion_and_unresolved_evidence() { insertion.evidence ); + let anchored_insertion = store + .lookup_variant(&VariantSpec { + grch37: Some(bioscript_core::GenomicLocus { + chrom: "1".to_owned(), + start: 250, + end: 250, + }), + reference: Some("T".to_owned()), + alternate: Some("TC".to_owned()), + kind: Some(VariantKind::Insertion), + ..VariantSpec::default() + }) + .unwrap(); + assert_eq!(anchored_insertion.genotype.as_deref(), Some("DI")); + assert_eq!(anchored_insertion.evidence[0], "resolved by locus chr1:250"); + assert!( + anchored_insertion.evidence[1].contains("source line: chr1 250"), + "{:?}", + anchored_insertion.evidence + ); + let unresolved = store .lookup_variant(&VariantSpec { grch37: Some(bioscript_core::GenomicLocus { diff --git a/rust/bioscript-reporting/src/observation.rs b/rust/bioscript-reporting/src/observation.rs index 2d57846..ca9a9c3 100644 --- a/rust/bioscript-reporting/src/observation.rs +++ b/rust/bioscript-reporting/src/observation.rs @@ -453,6 +453,30 @@ mod tests { ); } + #[test] + fn normalizes_anchored_insertion_deletion_tokens() { + assert_eq!( + normalize_app_genotype("DD", "T", "TC", Some(VariantKind::Insertion), "9", None,), + ("0/0".to_owned(), "hom_ref".to_owned()) + ); + assert_eq!( + normalize_app_genotype("DI", "T", "TC", Some(VariantKind::Insertion), "9", None,), + ("0/1".to_owned(), "het".to_owned()) + ); + assert_eq!( + normalize_app_genotype("II", "T", "TC", Some(VariantKind::Insertion), "9", None,), + ("1/1".to_owned(), "hom_alt".to_owned()) + ); + } + + #[test] + fn symbolic_insertion_deletion_tokens_require_an_inserted_alt() { + assert_eq!( + normalize_app_genotype("DI", "A", "G", Some(VariantKind::Insertion), "9", None,), + ("DI".to_owned(), "unknown".to_owned()) + ); + } + #[test] fn repeat_indel_insertion_deletion_tokens_remain_ambiguous_without_sequence_alleles() { assert_eq!( diff --git a/rust/bioscript-reporting/src/observation/genotype_display.rs b/rust/bioscript-reporting/src/observation/genotype_display.rs index 723fa0f..762e9db 100644 --- a/rust/bioscript-reporting/src/observation/genotype_display.rs +++ b/rust/bioscript-reporting/src/observation/genotype_display.rs @@ -115,12 +115,16 @@ pub(super) fn normalize_app_genotype( fn indel_display_tokens( display: &str, ref_allele: &str, - _alt_allele: &str, + alt_allele: &str, kind: Option, ) -> Option<(String, String)> { - if ref_allele.len() <= 1 - || !matches!(kind, Some(VariantKind::Deletion | VariantKind::Insertion)) - { + if !matches!(kind, Some(VariantKind::Deletion | VariantKind::Insertion)) { + return None; + } + if matches!(kind, Some(VariantKind::Deletion)) && ref_allele.len() <= 1 { + return None; + } + if matches!(kind, Some(VariantKind::Insertion)) && alt_allele.len() <= ref_allele.len() { return None; } if !display diff --git a/rust/bioscript-runtime/src/runtime/objects.rs b/rust/bioscript-runtime/src/runtime/objects.rs index b7c426a..b3348e7 100644 --- a/rust/bioscript-runtime/src/runtime/objects.rs +++ b/rust/bioscript-runtime/src/runtime/objects.rs @@ -215,6 +215,18 @@ pub(crate) fn variant_object(spec: &VariantSpec) -> MontyObject { MontyObject::String(alternate.clone()), )); } + if !spec.observed_alternates.is_empty() { + attrs.push(( + MontyObject::String("observed_alternates".to_owned()), + MontyObject::List( + spec.observed_alternates + .iter() + .cloned() + .map(MontyObject::String) + .collect(), + ), + )); + } if let Some(kind) = spec.kind { attrs.push(( MontyObject::String("kind".to_owned()), @@ -249,6 +261,7 @@ pub(crate) fn variant_object(spec: &VariantSpec) -> MontyObject { "grch38".to_owned(), "reference".to_owned(), "alternate".to_owned(), + "observed_alternates".to_owned(), "kind".to_owned(), "deletion_length".to_owned(), "motifs".to_owned(), diff --git a/rust/bioscript-runtime/src/runtime/variants.rs b/rust/bioscript-runtime/src/runtime/variants.rs index 5eee041..2f26e41 100644 --- a/rust/bioscript-runtime/src/runtime/variants.rs +++ b/rust/bioscript-runtime/src/runtime/variants.rs @@ -55,6 +55,9 @@ pub(crate) fn dataclass_to_variant_spec(obj: &MontyObject) -> Result spec.reference = string_from_optional(value)?, "alternate" => spec.alternate = string_from_optional(value)?, + "observed_alts" | "observed_alternates" => { + spec.observed_alternates = string_list_from_object(value)? + } "kind" => { spec.kind = string_from_optional(value)? .as_deref() @@ -113,7 +116,14 @@ pub(crate) fn variant_spec_from_kwargs( .transpose()? } "ref" | "reference" => spec.reference = string_from_optional(value)?, - "alt" | "alternate" => spec.alternate = string_from_optional(value)?, + "alt" | "alternate" => { + let alternates = string_or_list(value)?; + spec.alternate = alternates.first().cloned(); + spec.observed_alternates = alternates; + } + "observed_alts" | "observed_alternates" => { + spec.observed_alternates = string_or_list(value)?; + } "kind" => { spec.kind = string_from_optional(value)? .as_deref() @@ -223,3 +233,44 @@ pub(crate) fn int_from_optional(value: &MontyObject) -> Result, Runt ))), } } + +#[cfg(test)] +mod tests { + use super::{dataclass_to_variant_spec, variant_spec_from_kwargs}; + use crate::runtime::objects::variant_object; + use monty::MontyObject; + + #[test] + fn variant_kwargs_accept_alt_lists_as_observed_alternates() { + let spec = variant_spec_from_kwargs(&[( + MontyObject::String("alt".to_owned()), + MontyObject::List(vec![ + MontyObject::String("C".to_owned()), + MontyObject::String("T".to_owned()), + ]), + )]) + .unwrap(); + + assert_eq!(spec.alternate.as_deref(), Some("C")); + assert_eq!(spec.observed_alternates, vec!["C", "T"]); + } + + #[test] + fn variant_dataclass_round_trips_observed_alternates() { + let spec = variant_spec_from_kwargs(&[( + MontyObject::String("alt".to_owned()), + MontyObject::List(vec![ + MontyObject::String("A".to_owned()), + MontyObject::String("C".to_owned()), + MontyObject::String("T".to_owned()), + ]), + )]) + .unwrap(); + + let object = variant_object(&spec); + let round_tripped = dataclass_to_variant_spec(&object).unwrap(); + + assert_eq!(round_tripped.alternate.as_deref(), Some("A")); + assert_eq!(round_tripped.observed_alternates, vec!["A", "C", "T"]); + } +}