From 27d669e6fdd45dbdcfa4e3175b4de8c765a47fc0 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Wed, 30 Aug 2023 15:15:25 +0200 Subject: [PATCH] feat: log basic seed and align stats at debug level --- packages_rs/nextclade/src/align/align.rs | 20 +++++++--- .../nextclade/src/align/score_matrix.rs | 3 +- .../nextclade/src/align/seed_match2.rs | 37 +++++++++++++++++-- 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index 705f295a3..dcdbb3a49 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -10,7 +10,7 @@ use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; use crate::make_error; use eyre::{Report, WrapErr}; -use log::{info, trace}; +use log::{debug, info, trace}; fn align_pairwise>( qry_seq: &[T], @@ -48,7 +48,7 @@ pub fn align_nuc( if ref_len + qry_len < (10 * params.seed_length) { // for very short sequences, use full square let stripes = full_matrix(ref_len, qry_len); - trace!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix"); + debug!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band construction: short sequences, using full matrix"); return Ok(align_pairwise(qry_seq, ref_seq, gap_open_close, params, &stripes)); } @@ -77,24 +77,34 @@ pub fn align_nuc( params.max_band_area, )?; + if log::max_level() == log::Level::Debug { + let band_area: usize = stripes.iter().map(Stripe::len).sum(); + let mean_band_width = band_area / ref_len; + debug!("Nucleotide alignment band area={band_area}, mean band width={mean_band_width}"); + } + let mut alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); alignment.is_reverse_complement = is_reverse_complement; + let alignment_score = alignment.alignment_score; + + debug!("Attempt: {attempt}, Alignment score: {alignment_score}"); + if alignment.hit_boundary { terminal_bandwidth *= 2; excess_bandwidth *= 2; allowed_mismatches *= 2; attempt += 1; - info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {}", attempt, alignment.alignment_score); + info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {alignment_score}", attempt); } else { if attempt > 0 { - info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {}", attempt+1, alignment.alignment_score); + info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {alignment_score}", attempt+1); } return Ok(alignment); } if attempt > params.max_alignment_attempts { - info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {}", alignment.alignment_score); + info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {alignment_score}"); return Ok(alignment); } } diff --git a/packages_rs/nextclade/src/align/score_matrix.rs b/packages_rs/nextclade/src/align/score_matrix.rs index 2823b6aec..8f61661e5 100644 --- a/packages_rs/nextclade/src/align/score_matrix.rs +++ b/packages_rs/nextclade/src/align/score_matrix.rs @@ -38,7 +38,8 @@ pub fn score_matrix>( let mut scores = Band2d::::new(stripes); let band_size = paths.data_len(); - trace!("Score matrix: allocated alignment band of size={band_size}"); + let mean_band_width = band_size / ref_len; + trace!("Score matrix: allocated alignment band of size={band_size}, mean band width={mean_band_width}"); let left_align = match params.gap_alignment_side { GapAlignmentSide::Left => 1, diff --git a/packages_rs/nextclade/src/align/seed_match2.rs b/packages_rs/nextclade/src/align/seed_match2.rs index abfd3737b..dd3ab50db 100644 --- a/packages_rs/nextclade/src/align/seed_match2.rs +++ b/packages_rs/nextclade/src/align/seed_match2.rs @@ -12,6 +12,7 @@ use eyre::Report; use gcollections::ops::{Bounded, Intersection, IsEmpty, Union}; use interval::interval_set::{IntervalSet, ToIntervalSet}; use itertools::Itertools; +use log::{debug, trace}; use std::borrow::Cow; use std::cmp::{max, min}; use std::collections::{BTreeMap, VecDeque}; @@ -481,14 +482,44 @@ pub fn get_seed_matches2( let seed_matches = chain_seeds(&matches); // write_matches_to_file(&seed_matches, "chained_matches.csv"); + // Trace basic seed alignment stats + if log::max_level() >= log::Level::Debug { + let max_shift = seed_matches.iter().map(|sm| sm.offset.abs()).max().unwrap_or(0); + // Need to iterate in pairs + let max_unmatched_ref_stretch = seed_matches + .iter() + .tuple_windows() + .map(|(sm1, sm2)| sm2.ref_pos - (sm1.ref_pos + sm1.length)) + .max() + .unwrap_or(0); + let max_unmatched_qry_stretch = seed_matches + .iter() + .tuple_windows() + .map(|(sm1, sm2)| sm2.qry_pos - (sm1.qry_pos + sm1.length)) + .max() + .unwrap_or(0); + let first_match = seed_matches.first().unwrap(); + let last_match = seed_matches.last().unwrap(); + let first_match_ref_pos = first_match.ref_pos; + let first_match_qry_pos = first_match.qry_pos; + let last_match_ref_pos = ref_seq.len() - last_match.ref_pos - last_match.length; + let last_match_qry_pos = qry_seq.len() - last_match.qry_pos - last_match.length; + + let n_matches = seed_matches.len(); + debug!("Chained seed stats. max indel: {max_shift}, # matches: {n_matches}, first/last match distance from start/end [start: (ref: {first_match_ref_pos}, qry: {first_match_qry_pos}), end: (ref: {last_match_ref_pos}, qry: {last_match_qry_pos})], max unmatched stretch (ref: {max_unmatched_ref_stretch}, qry stretch: {max_unmatched_qry_stretch})"); + } + let sum_of_seed_length: usize = seed_matches.iter().map(|sm| sm.length).sum(); - if (sum_of_seed_length as f64 / qry_seq.len() as f64) < params.min_seed_cover { + let qry_coverage = sum_of_seed_length as f64 / qry_seq.len() as f64; + debug!("Seed alignment covers {:.2}% of query length", 100.0 * qry_coverage); + if qry_coverage < params.min_seed_cover { let query_knowns = qry_seq.iter().filter(|n| n.is_acgt()).count(); + let qry_coverage_knowns = sum_of_seed_length as f64 / query_knowns as f64; if (sum_of_seed_length as f64 / query_knowns as f64) < params.min_seed_cover { return make_error!( - "Unable to align: seed alignment covers {:.2}% of the query sequence, which is less than expected {:.2}% \ + "Unable to align: seed alignment covers {:.2}% of query sequence ACGTs, which is less than required {:.2}% \ (configurable using 'min seed cover' CLI flag or dataset property). This is likely due to low quality of the \ - provided sequence, or due to using incorrect reference sequence.", + provided sequence, or due to using an incorrect reference sequence.", 100.0 * (sum_of_seed_length as f64) / (query_knowns as f64), 100.0 * params.min_seed_cover );