From 00b3dc5e09eb0d90f3334ea4712cb19e610013d7 Mon Sep 17 00:00:00 2001 From: Paul-Christian Volkmer Date: Mon, 11 Aug 2025 12:58:24 +0200 Subject: [PATCH] chore: use subcommand 'scramble' --- Cargo.toml | 7 +- README.md | 10 ++- src/cli.rs | 15 ++++ src/fastq.rs | 215 +++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 232 ++++----------------------------------------------- 5 files changed, 259 insertions(+), 220 deletions(-) create mode 100644 src/cli.rs create mode 100644 src/fastq.rs diff --git a/Cargo.toml b/Cargo.toml index 9d741ac..7445e32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,12 @@ [package] -name = "fastq-scrambler" -version = "0.1.0" +name = "fastq-tools" +version = "0.2.0" edition = "2024" authors = ["Paul-Christian Volkmer"] -description = "Application to scramble FASTQ files to provide non-sensitive data for development purposes" +description = "Application to show information about and scramble FASTQ files to provide non-sensitive data for development purposes" license = "GPL-3" readme = "README.md" [dependencies] regex = "1.11" +clap = { version = "4.5", features = ["color", "derive"]} diff --git a/README.md b/README.md index 3025027..0803fdf 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,15 @@ -# FASTQ scrambler +# FASTQ tools -Application to scramble FASTQ files to provide non-sensitive data for development purposes +Application to show information about and scramble FASTQ files to provide non-sensitive data for development purposes ## Usage +This application provides the following subcommands + +### Scramble + To scramble compressed FASTQ files use: ```shell -cat file_fastq.gz | gz -d | fastq-scrambler | gz > scrambled_fastq.gz +cat file_fastq.gz | gzip -d | fastq-tools scramble | gzip > scrambled_fastq.gz ``` \ No newline at end of file diff --git a/src/cli.rs b/src/cli.rs new file mode 100644 index 0000000..add105d --- /dev/null +++ b/src/cli.rs @@ -0,0 +1,15 @@ +use clap::{Parser, Subcommand}; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +#[command(propagate_version = true)] +pub struct Args { + #[command(subcommand)] + pub(crate) command: Command, +} + +#[derive(Subcommand)] +pub enum Command { + Info, + Scramble, +} diff --git a/src/fastq.rs b/src/fastq.rs new file mode 100644 index 0000000..2ec90e0 --- /dev/null +++ b/src/fastq.rs @@ -0,0 +1,215 @@ +use crate::scramble_sequence; +use std::fmt::Display; +use std::str::FromStr; + +pub struct Header { + instrument_name: String, + run_id: u32, + flowcell_id: String, + flowcell_lane: u32, + tile_number: u32, + x: u32, + y: u32, + pair_member: Pair, + filtered: Filtered, + control_bits: u32, + index_sequence: String, +} + +impl Header { + pub fn scramble(self) -> Self { + fn number(value: u32) -> u32 { + value % 3 + value % 17 + value % 271 + value % 911 + } + + fn string(value: &str) -> String { + value + .chars() + .map(|c| (((c as u8 % 3 * c as u8 % 17) % 26) + 0x41) as char) + .collect::() + } + + fn string_sum(value: &str) -> u8 { + ((value.len() as u8) + value.chars().map(|c| c as u8 & 2).sum::()) % 97 + } + + Header { + instrument_name: format!("TEST{:0<2}", (string_sum(&self.instrument_name) * 17) % 97), + run_id: number(self.run_id), + flowcell_id: string(&self.flowcell_id), + flowcell_lane: number(self.flowcell_lane), + tile_number: number(self.tile_number), + x: self.x + string_sum(&self.instrument_name) as u32, + y: self.y + string_sum(&self.instrument_name) as u32, + pair_member: self.pair_member, + filtered: self.filtered, + control_bits: self.control_bits, + index_sequence: scramble_sequence(&self.index_sequence, 1), + } + } +} + +impl Display for Header { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "@{}:{}:{}:{}:{}:{}:{} {}:{}:{}:{}", + self.instrument_name, + self.run_id, + self.flowcell_id, + self.flowcell_lane, + self.tile_number, + self.x, + self.y, + match self.pair_member { + Pair::PairedEnd => "1", + Pair::MatePair => "2", + }, + match self.filtered { + Filtered::Y => "Y", + Filtered::N => "N", + }, + self.control_bits, + self.index_sequence + ) + } +} + +impl FromStr for Header { + type Err = String; + + fn from_str(s: &str) -> Result { + if !s.starts_with("@") { + return Err("Invalid Casava 1.8+ header".to_string()); + } + + let parts = s + .split(" ") + .flat_map(|main_part| main_part.split(":").collect::>()) + .collect::>(); + if parts.len() != 11 { + return Err("Invalid Casava 1.8+ header".to_string()); + } + + Ok(Header { + instrument_name: parts[0][1..].to_string(), + run_id: parts[1] + .parse() + .expect("Valid Casava 1.8+ header: Number value required"), + flowcell_id: parts[2].into(), + flowcell_lane: parts[3] + .parse() + .expect("Valid Casava 1.8+ header: Number value required"), + tile_number: parts[4] + .parse() + .expect("Valid Casava 1.8+ header: Number value required"), + x: parts[5] + .parse() + .expect("Valid Casava 1.8+ header: Number value required"), + y: parts[6] + .parse() + .expect("Valid Casava 1.8+ header: Number value required"), + pair_member: match parts[7] { + "1" => Pair::PairedEnd, + "2" => Pair::MatePair, + _ => return Err("Invalid Casava 1.8+ header".to_string()), + }, + filtered: match parts[8] { + "Y" => Filtered::Y, + "N" => Filtered::N, + _ => return Err("Invalid Casava 1.8+ header".to_string()), + }, + control_bits: if parts[9] + .parse::() + .expect("Valid Casava 1.8+ header: Even value for control bits required") + % 2 + == 0 + { + parts[9].parse().expect("Number") + } else { + return Err("Invalid Casava 1.8+ header".to_string()); + }, + index_sequence: parts[10].into(), + }) + } +} + +#[derive(Debug, PartialEq)] +pub enum Pair { + PairedEnd = 1, + MatePair = 2, +} + +#[derive(Debug, PartialEq)] +pub enum Filtered { + Y, + N, +} + +#[cfg(test)] +mod tests { + use crate::fastq::{Filtered, Pair}; + use crate::{scramble_sequence, Header}; + + #[test] + fn should_return_parsed_header() { + let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; + let actual = given.parse::
(); + + assert!(actual.is_ok()); + + let actual = actual.unwrap(); + assert_eq!(actual.instrument_name, "EAS139"); + assert_eq!(actual.run_id, 136); + assert_eq!(actual.flowcell_id, "FC706VJ"); + assert_eq!(actual.flowcell_lane, 2); + assert_eq!(actual.tile_number, 2104); + assert_eq!(actual.x, 15343); + assert_eq!(actual.y, 197393); + assert_eq!(actual.pair_member, Pair::PairedEnd); + assert_eq!(actual.filtered, Filtered::Y); + assert_eq!(actual.control_bits, 18); + assert_eq!(actual.index_sequence, "ATCACG"); + } + + #[test] + fn should_return_header_string() { + let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; + let actual = given.parse::
(); + + assert!(actual.is_ok()); + + let actual = actual.unwrap(); + assert_eq!(given, actual.to_string()); + } + + #[test] + fn should_return_scrambled_header_string() { + let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; + let actual = given.parse::
(); + let expected = "@TEST73:273:CQEAACM:8:503:15353:197403 1:Y:18:GAGCGC"; + + assert!(actual.is_ok()); + + let actual = actual.unwrap().scramble(); + assert_eq!(expected, actual.to_string().as_str()); + } + + #[test] + fn should_return_scrambled_sequence_string_seed1() { + let given = "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT"; + let actual = scramble_sequence(given, 1); + let expected = "CGATCTGGCGCGCAGCGCCGGAGCGAGCAGAGCGTAGATGCATCCGCGCGGCGCGCCGTT"; + + assert_eq!(expected, actual); + } + + #[test] + fn should_return_scrambled_sequence_string_seed42() { + let given = "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT"; + let actual = scramble_sequence(given, 42); + let expected = "GTTTCTGGTTCGCAGCGCTCTCGCTCGCATCTTCTATCTGCTTCTTCGCCGCGCGCTTTA"; + + assert_eq!(expected, actual); + } +} diff --git a/src/main.rs b/src/main.rs index 39fe54a..769560a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,21 +1,11 @@ -use std::{fmt::Display, str::FromStr}; +mod cli; +mod fastq; +use crate::cli::{Args, Command}; +use crate::fastq::Header; +use clap::Parser; use regex::Regex; -struct Header { - instrument_name: String, - run_id: u32, - flowcell_id: String, - flowcell_lane: u32, - tile_number: u32, - x: u32, - y: u32, - pair_member: Pair, - filtered: Filtered, - control_bits: u32, - index_sequence: String, -} - fn scramble_sequence(value: &str, seed: u32) -> String { let ahead_1 = Regex::new(r"T([ACG])").unwrap(); let ahead_2 = Regex::new(r"A([CGT])").unwrap(); @@ -59,137 +49,20 @@ fn scramble_sequence(value: &str, seed: u32) -> String { result.to_string() } -impl Header { - fn scramble(self) -> Self { - fn number(value: u32) -> u32 { - value % 3 + value % 17 + value % 271 + value % 911 - } - - fn string(value: &str) -> String { - value - .chars() - .map(|c| (((c as u8 % 3 * c as u8 % 17) % 26) + 0x41) as char) - .collect::() - } - - fn string_sum(value: &str) -> u8 { - ((value.len() as u8) + value.chars().map(|c| c as u8 & 2).sum::()) % 97 - } - - Header { - instrument_name: format!("TEST{:0<2}", (string_sum(&self.instrument_name) * 17) % 97), - run_id: number(self.run_id), - flowcell_id: string(&self.flowcell_id), - flowcell_lane: number(self.flowcell_lane), - tile_number: number(self.tile_number), - x: self.x + string_sum(&self.instrument_name) as u32, - y: self.y + string_sum(&self.instrument_name) as u32, - pair_member: self.pair_member, - filtered: self.filtered, - control_bits: self.control_bits, - index_sequence: scramble_sequence(&self.index_sequence, 1), - } - } -} - -impl Display for Header { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "@{}:{}:{}:{}:{}:{}:{} {}:{}:{}:{}", - self.instrument_name, - self.run_id, - self.flowcell_id, - self.flowcell_lane, - self.tile_number, - self.x, - self.y, - match self.pair_member { - Pair::PairedEnd => "1", - Pair::MatePair => "2", - }, - match self.filtered { - Filtered::Y => "Y", - Filtered::N => "N", - }, - self.control_bits, - self.index_sequence - ) - } -} - -impl FromStr for Header { - type Err = String; - - fn from_str(s: &str) -> Result { - if !s.starts_with("@") { - return Err("Invalid Casava 1.8+ header".to_string()); - } - - let parts = s - .split(" ") - .flat_map(|main_part| main_part.split(":").collect::>()) - .collect::>(); - if parts.len() != 11 { - return Err("Invalid Casava 1.8+ header".to_string()); - } - - Ok(Header { - instrument_name: parts[0][1..].to_string(), - run_id: parts[1] - .parse() - .expect("Valid Casava 1.8+ header: Number value required"), - flowcell_id: parts[2].into(), - flowcell_lane: parts[3] - .parse() - .expect("Valid Casava 1.8+ header: Number value required"), - tile_number: parts[4] - .parse() - .expect("Valid Casava 1.8+ header: Number value required"), - x: parts[5] - .parse() - .expect("Valid Casava 1.8+ header: Number value required"), - y: parts[6] - .parse() - .expect("Valid Casava 1.8+ header: Number value required"), - pair_member: match parts[7] { - "1" => Pair::PairedEnd, - "2" => Pair::MatePair, - _ => return Err("Invalid Casava 1.8+ header".to_string()), - }, - filtered: match parts[8] { - "Y" => Filtered::Y, - "N" => Filtered::N, - _ => return Err("Invalid Casava 1.8+ header".to_string()), - }, - control_bits: if parts[9] - .parse::() - .expect("Valid Casava 1.8+ header: Even value for control bits required") - % 2 - == 0 - { - parts[9].parse().expect("Number") - } else { - return Err("Invalid Casava 1.8+ header".to_string()); - }, - index_sequence: parts[10].into(), - }) - } -} - -#[derive(Debug, PartialEq)] -enum Pair { - PairedEnd = 1, - MatePair = 2, -} - -#[derive(Debug, PartialEq)] -enum Filtered { - Y, - N, -} - fn main() { + let args = Args::parse(); + + match &args.command { + Command::Info => { + println!("Not implemented yet"); + } + Command::Scramble => scramble(), + } + + println!() +} + +fn scramble() { let stdin = std::io::stdin(); let mut buf = String::new(); @@ -213,73 +86,4 @@ fn main() { line += 1; buf.clear(); } - - println!() -} - -#[cfg(test)] -mod tests { - use crate::{scramble_sequence, Filtered, Header, Pair}; - - #[test] - fn should_return_parsed_header() { - let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; - let actual = given.parse::
(); - - assert!(actual.is_ok()); - - let actual = actual.unwrap(); - assert_eq!(actual.instrument_name, "EAS139"); - assert_eq!(actual.run_id, 136); - assert_eq!(actual.flowcell_id, "FC706VJ"); - assert_eq!(actual.flowcell_lane, 2); - assert_eq!(actual.tile_number, 2104); - assert_eq!(actual.x, 15343); - assert_eq!(actual.y, 197393); - assert_eq!(actual.pair_member, Pair::PairedEnd); - assert_eq!(actual.filtered, Filtered::Y); - assert_eq!(actual.control_bits, 18); - assert_eq!(actual.index_sequence, "ATCACG"); - } - - #[test] - fn should_return_header_string() { - let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; - let actual = given.parse::
(); - - assert!(actual.is_ok()); - - let actual = actual.unwrap(); - assert_eq!(given, actual.to_string()); - } - - #[test] - fn should_return_scrambled_header_string() { - let given = "@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG"; - let actual = given.parse::
(); - let expected = "@TEST73:273:CQEAACM:8:503:15353:197403 1:Y:18:GAGCGC"; - - assert!(actual.is_ok()); - - let actual = actual.unwrap().scramble(); - assert_eq!(expected, actual.to_string().as_str()); - } - - #[test] - fn should_return_scrambled_sequence_string_seed1() { - let given = "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT"; - let actual = scramble_sequence(given, 1); - let expected = "CGATCTGGCGCGCAGCGCCGGAGCGAGCAGAGCGTAGATGCATCCGCGCGGCGCGCCGTT"; - - assert_eq!(expected, actual); - } - - #[test] - fn should_return_scrambled_sequence_string_seed42() { - let given = "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT"; - let actual = scramble_sequence(given, 42); - let expected = "GTTTCTGGTTCGCAGCGCTCTCGCTCGCATCTTCTATCTGCTTCTTCGCCGCGCGCTTTA"; - - assert_eq!(expected, actual); - } }