diff --git a/Cargo.toml b/Cargo.toml
index 7445e32..cf65ad9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,3 +10,5 @@ readme = "README.md"
 [dependencies]
 regex = "1.11"
 clap = { version = "4.5", features = ["color", "derive"]}
+console = "0.16"
+itertools = "0.14"
diff --git a/README.md b/README.md
index 0803fdf..5070713 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,24 @@ Application to show information about and scramble FASTQ files to provide non-se
 
 This application provides the following subcommands
 
+### Info
+
+To show information about compressed FASTQ files use:
+
+```shell
+cat file_fastq.gz | gzip -d | fastq-tools info
+```
+
+This will result in output like
+
+![Info subcommand](docs/info_subcommand.jpg)
+
 ### Scramble
 
 To scramble compressed FASTQ files use:
 
 ```shell
 cat file_fastq.gz | gzip -d | fastq-tools scramble | gzip > scrambled_fastq.gz
-```
\ No newline at end of file
+```
+
+This will scramble headers and sequences and write the output into `scrambled_fastq.gz`.
\ No newline at end of file
diff --git a/docs/info_subcommand.jpg b/docs/info_subcommand.jpg
new file mode 100644
index 0000000..cee2ce6
Binary files /dev/null and b/docs/info_subcommand.jpg differ
diff --git a/src/fastq.rs b/src/fastq.rs
index deb2ab9..4d25d62 100644
--- a/src/fastq.rs
+++ b/src/fastq.rs
@@ -32,6 +32,38 @@ pub struct IlluminaHeader {
 }
 
 impl Header {
+    /// Returns a copy of the instrument name stored in either header variant.
+    pub fn instrument_name(&self) -> String {
+        match self {
+            Header::Casava18(h) => h.instrument_name.clone(),
+            Header::Illumina(h) => h.instrument_name.clone(),
+        }
+    }
+
+    /// Returns the flowcell ID of a `Casava18` header, or `None` for an `Illumina` header.
+    pub fn flowcell_id(&self) -> Option<String> {
+        match self {
+            Header::Casava18(h) => Some(h.flowcell_id.clone()),
+            Header::Illumina(_) => None,
+        }
+    }
+
+    /// Returns the flowcell lane stored in either header variant.
+    pub fn flowcell_lane(&self) -> u32 {
+        match self {
+            Header::Casava18(h) => h.flowcell_lane,
+            Header::Illumina(h) => h.flowcell_lane,
+        }
+    }
+
+    /// Returns a copy of the pair member stored in either header variant.
+    pub fn pair_member(&self) -> Pair {
+        match self {
+            Header::Casava18(h) => h.pair_member.clone(),
+            Header::Illumina(h) => h.pair_member.clone(),
+        }
+    }
+
     pub fn scramble(self) -> Self {
         fn number(value: u32) -> u32 {
             value % 3 + value % 17 + value % 271 + value % 911
@@ -213,7 +245,7 @@ impl FromStr for Header {
     }
 }
 
-#[derive(Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq)]
 pub enum Pair {
     PairedEnd = 1,
     MatePair = 2,
diff --git a/src/main.rs b/src/main.rs
index 769560a..4c98b95 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,8 +2,10 @@ mod cli;
 mod fastq;
 
 use crate::cli::{Args, Command};
-use crate::fastq::Header;
+use crate::fastq::{Header, Pair};
 use clap::Parser;
+use console::Style;
+use itertools::Itertools;
 use regex::Regex;
 
 fn scramble_sequence(value: &str, seed: u32) -> String {
@@ -53,9 +55,7 @@ fn main() {
     let args = Args::parse();
 
     match &args.command {
-        Command::Info => {
-            println!("Not implemented yet");
-        }
+        Command::Info => info(),
         Command::Scramble => scramble(),
     }
 
@@ -87,3 +87,189 @@ fn scramble() {
         buf.clear();
     }
 }
+
+/// Reads FASTQ records from stdin and prints a summary of the parsed headers and read lengths.
+///
+/// Stops with an error message if a line cannot be read, a quality string does not match the
+/// length of its sequence, or the input ends in the middle of a four-line record.
+fn info() {
+    let stdin = std::io::stdin();
+    let mut buf = String::new();
+
+    let mut headers = vec![];
+    let mut read_lens = vec![];
+
+    let headline_style = Style::new().bold();
+    let info_style = Style::new().bold().blue();
+    let error_style = Style::new().bold().red();
+
+    let mut line: u64 = 1;
+    loop {
+        match stdin.read_line(&mut buf) {
+            Ok(0) => break,
+            Ok(_) => {}
+            Err(err) => {
+                // e.g. invalid UTF-8; do not report statistics for a partially read input
+                println!(
+                    "{}",
+                    error_style.apply_to(format!("🔥 Failed to read line {}: {}", line, err))
+                );
+                return;
+            }
+        }
+
+        // Dispatch on the position within the four-line record instead of the first
+        // character: quality strings may themselves start with `@` or `+`.
+        match line % 4 {
+            1 => {
+                if let Ok(header) = buf.parse::<Header>() {
+                    headers.push(header)
+                } else {
+                    println!(
+                        "{}",
+                        error_style.apply_to(format!("🔥 Invalid header at line {}", line))
+                    );
+                }
+            }
+            2 => read_lens.push(buf.trim().len()),
+            3 => {
+                // ignore separator line with optional description
+            }
+            _ => {
+                // check if quality string length differs from sequence length
+                if Some(&buf.trim().len()) != read_lens.last() {
+                    println!(
+                        "{}",
+                        error_style.apply_to(format!(
+                            "🔥 Invalid quality string length at line {}",
+                            line
+                        ))
+                    );
+                    return;
+                }
+            }
+        }
+
+        line += 1;
+        buf.clear();
+    }
+
+    if line % 4 != 1 {
+        println!(
+            "{}",
+            error_style.apply_to("🔥 File contains invalid or incomplete sequences")
+        );
+        return;
+    }
+
+    println!(
+        "{} {}",
+        info_style.apply_to("🛈 "),
+        headline_style.apply_to(format!("Found {} complete sequence sets", headers.len()))
+    );
+
+    // Instruments
+
+    println!(
+        "{} {}",
+        info_style.apply_to("🛈 "),
+        headline_style.apply_to("Unique instrument name(s):")
+    );
+    println!(
+        "{}",
+        headers
+            .iter()
+            .map(|header| header.instrument_name())
+            .sorted()
+            .chunk_by(|value| value.clone())
+            .into_iter()
+            .map(|g| format!(" {} ({})", g.0, g.1.count()))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
+
+    // Flowcell IDs
+
+    println!(
+        "{} {}",
+        info_style.apply_to("🛈 "),
+        headline_style.apply_to("Flowcell ID(s):")
+    );
+    println!(
+        "{}",
+        headers
+            .iter()
+            .filter_map(|header| header.flowcell_id())
+            .sorted()
+            .chunk_by(|value| value.clone())
+            .into_iter()
+            .map(|g| format!(" {} ({})", g.0, g.1.count()))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
+
+    // Flowcell Lanes
+
+    println!(
+        "{} {}",
+        info_style.apply_to("🛈 "),
+        headline_style.apply_to("Flowcell lane(s):")
+    );
+
+    println!(
+        "{}",
+        headers
+            .iter()
+            .map(|header| header.flowcell_lane())
+            .sorted()
+            .chunk_by(|value| value.to_string())
+            .into_iter()
+            .map(|g| format!(" {} ({})", g.0, g.1.count()))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
+
+    // Read Orders
+
+    println!(
+        "{} {}",
+        info_style.apply_to("🛈 "),
+        headline_style.apply_to("Read order(s):")
+    );
+
+    println!(
+        "{}",
+        headers
+            .iter()
+            .map(|header| match header.pair_member() {
+                Pair::PairedEnd => "R1",
+                Pair::MatePair => "R2",
+            })
+            .sorted()
+            .chunk_by(|value| value.to_string())
+            .into_iter()
+            .map(|g| format!(" {} ({})", g.0, g.1.count()))
+            .collect::<Vec<_>>()
+            .join("\n")
+    );
+
+    // Read Lengths
+
+    println!(
+        "{} {}",
+        info_style.apply_to("🛈 "),
+        headline_style.apply_to("Read length(s):")
+    );
+
+    println!(
+        "{}",
+        read_lens
+            .iter()
+            .sorted()
+            .chunk_by(|value| value.to_string())
+            .into_iter()
+            .map(|g| format!(" {} ({})", g.0, g.1.count()))
+            .collect::<Vec<_>>()
+            .join("\n")
+    )
+}