1
0
mirror of https://github.com/pcvolkmer/fastq-tools.git synced 2025-09-13 13:12:52 +00:00

refactor: add FASTQ header subtypes

This commit is contained in:
2025-08-11 13:16:02 +02:00
parent 00b3dc5e09
commit 31c275b092

View File

@@ -2,7 +2,11 @@ use crate::scramble_sequence;
use std::fmt::Display; use std::fmt::Display;
use std::str::FromStr; use std::str::FromStr;
pub struct Header { pub enum Header {
Casava18(Casava18Header),
}
pub struct Casava18Header {
instrument_name: String, instrument_name: String,
run_id: u32, run_id: u32,
flowcell_id: String, flowcell_id: String,
@@ -33,45 +37,54 @@ impl Header {
((value.len() as u8) + value.chars().map(|c| c as u8 & 2).sum::<u8>()) % 97 ((value.len() as u8) + value.chars().map(|c| c as u8 & 2).sum::<u8>()) % 97
} }
Header { match self {
instrument_name: format!("TEST{:0<2}", (string_sum(&self.instrument_name) * 17) % 97), Header::Casava18(header) => Header::Casava18(Casava18Header {
run_id: number(self.run_id), instrument_name: format!(
flowcell_id: string(&self.flowcell_id), "TEST{:0<2}",
flowcell_lane: number(self.flowcell_lane), (string_sum(&header.instrument_name) * 17) % 97
tile_number: number(self.tile_number), ),
x: self.x + string_sum(&self.instrument_name) as u32, run_id: number(header.run_id),
y: self.y + string_sum(&self.instrument_name) as u32, flowcell_id: string(&header.flowcell_id),
pair_member: self.pair_member, flowcell_lane: number(header.flowcell_lane),
filtered: self.filtered, tile_number: number(header.tile_number),
control_bits: self.control_bits, x: header.x + string_sum(&header.instrument_name) as u32,
index_sequence: scramble_sequence(&self.index_sequence, 1), y: header.y + string_sum(&header.instrument_name) as u32,
pair_member: header.pair_member,
filtered: header.filtered,
control_bits: header.control_bits,
index_sequence: scramble_sequence(&header.index_sequence, 1),
}),
} }
} }
} }
impl Display for Header { impl Display for Header {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!( match self {
f, Header::Casava18(header) => {
"@{}:{}:{}:{}:{}:{}:{} {}:{}:{}:{}", write!(
self.instrument_name, f,
self.run_id, "@{}:{}:{}:{}:{}:{}:{} {}:{}:{}:{}",
self.flowcell_id, header.instrument_name,
self.flowcell_lane, header.run_id,
self.tile_number, header.flowcell_id,
self.x, header.flowcell_lane,
self.y, header.tile_number,
match self.pair_member { header.x,
Pair::PairedEnd => "1", header.y,
Pair::MatePair => "2", match header.pair_member {
}, Pair::PairedEnd => "1",
match self.filtered { Pair::MatePair => "2",
Filtered::Y => "Y", },
Filtered::N => "N", match header.filtered {
}, Filtered::Y => "Y",
self.control_bits, Filtered::N => "N",
self.index_sequence },
) header.control_bits,
header.index_sequence
)
}
}
} }
} }
@@ -80,57 +93,58 @@ impl FromStr for Header {
fn from_str(s: &str) -> Result<Self, Self::Err> { fn from_str(s: &str) -> Result<Self, Self::Err> {
if !s.starts_with("@") { if !s.starts_with("@") {
return Err("Invalid Casava 1.8+ header".to_string()); return Err("Cannot parse FASTQ header".to_string());
} }
let parts = s let parts = s
.split(" ") .split(" ")
.flat_map(|main_part| main_part.split(":").collect::<Vec<_>>()) .flat_map(|main_part| main_part.split(":").collect::<Vec<_>>())
.collect::<Vec<_>>(); .collect::<Vec<_>>();
if parts.len() != 11 {
return Err("Invalid Casava 1.8+ header".to_string()); if parts.len() == 11 {
return Ok(Header::Casava18(Casava18Header {
instrument_name: parts[0][1..].to_string(),
run_id: parts[1]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
flowcell_id: parts[2].into(),
flowcell_lane: parts[3]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
tile_number: parts[4]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
x: parts[5]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
y: parts[6]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
pair_member: match parts[7] {
"1" => Pair::PairedEnd,
"2" => Pair::MatePair,
_ => return Err("Invalid Casava 1.8+ header".to_string()),
},
filtered: match parts[8] {
"Y" => Filtered::Y,
"N" => Filtered::N,
_ => return Err("Invalid Casava 1.8+ header".to_string()),
},
control_bits: if parts[9]
.parse::<u32>()
.expect("Valid Casava 1.8+ header: Even value for control bits required")
% 2
== 0
{
parts[9].parse().expect("Number")
} else {
return Err("Invalid Casava 1.8+ header".to_string());
},
index_sequence: parts[10].into(),
}));
} }
Ok(Header { Err("Cannot parse FASTQ header".to_string())
instrument_name: parts[0][1..].to_string(),
run_id: parts[1]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
flowcell_id: parts[2].into(),
flowcell_lane: parts[3]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
tile_number: parts[4]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
x: parts[5]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
y: parts[6]
.parse()
.expect("Valid Casava 1.8+ header: Number value required"),
pair_member: match parts[7] {
"1" => Pair::PairedEnd,
"2" => Pair::MatePair,
_ => return Err("Invalid Casava 1.8+ header".to_string()),
},
filtered: match parts[8] {
"Y" => Filtered::Y,
"N" => Filtered::N,
_ => return Err("Invalid Casava 1.8+ header".to_string()),
},
control_bits: if parts[9]
.parse::<u32>()
.expect("Valid Casava 1.8+ header: Even value for control bits required")
% 2
== 0
{
parts[9].parse().expect("Number")
} else {
return Err("Invalid Casava 1.8+ header".to_string());
},
index_sequence: parts[10].into(),
})
} }
} }
@@ -154,22 +168,22 @@ mod tests {
#[test] #[test]
fn should_return_parsed_header() { fn should_return_parsed_header() {
let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG";
let actual = given.parse::<Header>();
assert!(actual.is_ok()); if let Ok(Header::Casava18(actual)) = given.parse::<Header>() {
assert_eq!(actual.instrument_name, "EAS139");
let actual = actual.unwrap(); assert_eq!(actual.run_id, 136);
assert_eq!(actual.instrument_name, "EAS139"); assert_eq!(actual.flowcell_id, "FC706VJ");
assert_eq!(actual.run_id, 136); assert_eq!(actual.flowcell_lane, 2);
assert_eq!(actual.flowcell_id, "FC706VJ"); assert_eq!(actual.tile_number, 2104);
assert_eq!(actual.flowcell_lane, 2); assert_eq!(actual.x, 15343);
assert_eq!(actual.tile_number, 2104); assert_eq!(actual.y, 197393);
assert_eq!(actual.x, 15343); assert_eq!(actual.pair_member, Pair::PairedEnd);
assert_eq!(actual.y, 197393); assert_eq!(actual.filtered, Filtered::Y);
assert_eq!(actual.pair_member, Pair::PairedEnd); assert_eq!(actual.control_bits, 18);
assert_eq!(actual.filtered, Filtered::Y); assert_eq!(actual.index_sequence, "ATCACG");
assert_eq!(actual.control_bits, 18); } else {
assert_eq!(actual.index_sequence, "ATCACG"); panic!("Failed to parse FASTQ header");
}
} }
#[test] #[test]