1
0
mirror of https://github.com/pcvolkmer/fastq-tools.git synced 2025-09-13 05:02:53 +00:00

feat: implement info subcommand

This commit is contained in:
2025-08-11 15:36:20 +02:00
parent 6ca8fa3cdb
commit 09c573f2f2
5 changed files with 218 additions and 6 deletions

View File

@@ -10,3 +10,5 @@ readme = "README.md"
[dependencies] [dependencies]
regex = "1.11" regex = "1.11"
clap = { version = "4.5", features = ["color", "derive"]} clap = { version = "4.5", features = ["color", "derive"]}
console = "0.16"
itertools = "0.14"

View File

@@ -6,10 +6,24 @@ Application to show information about and scramble FASTQ files to provide non-se
This application provides the following subcommands This application provides the following subcommands
### Info
To show information about compressed FASTQ files use:
```shell
cat file_fastq.gz | gzip -d | fastq-tools info
```
This produces output like the following:
![Info subcommand](docs/info_subcommand.jpg)
### Scramble ### Scramble
To scramble compressed FASTQ files use: To scramble compressed FASTQ files use:
```shell ```shell
cat file_fastq.gz | gzip -d | fastq-tools scramble | gzip > scrambled_fastq.gz cat file_fastq.gz | gzip -d | fastq-tools scramble | gzip > scrambled_fastq.gz
``` ```
This will scramble headers and sequences and write the output into `scrambled_fastq.gz`.

BIN
docs/info_subcommand.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -32,6 +32,34 @@ pub struct IlluminaHeader {
} }
impl Header { impl Header {
pub fn instrument_name(&self) -> String {
match self {
Header::Casava18(h) => h.instrument_name.clone(),
Header::Illumina(h) => h.instrument_name.clone(),
}
}
pub fn flowcell_id(&self) -> Option<String> {
match self {
Header::Casava18(h) => Some(h.flowcell_id.clone()),
Header::Illumina(_) => None,
}
}
pub fn flowcell_lane(&self) -> u32 {
match self {
Header::Casava18(h) => h.flowcell_lane,
Header::Illumina(h) => h.flowcell_lane,
}
}
pub fn pair_member(&self) -> Pair {
match self {
Header::Casava18(h) => h.pair_member.clone(),
Header::Illumina(h) => h.pair_member.clone(),
}
}
pub fn scramble(self) -> Self { pub fn scramble(self) -> Self {
fn number(value: u32) -> u32 { fn number(value: u32) -> u32 {
value % 3 + value % 17 + value % 271 + value % 911 value % 3 + value % 17 + value % 271 + value % 911
@@ -213,7 +241,7 @@ impl FromStr for Header {
} }
} }
#[derive(Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
pub enum Pair { pub enum Pair {
PairedEnd = 1, PairedEnd = 1,
MatePair = 2, MatePair = 2,

View File

@@ -2,8 +2,10 @@ mod cli;
mod fastq; mod fastq;
use crate::cli::{Args, Command}; use crate::cli::{Args, Command};
use crate::fastq::Header; use crate::fastq::{Header, Pair};
use clap::Parser; use clap::Parser;
use console::Style;
use itertools::Itertools;
use regex::Regex; use regex::Regex;
fn scramble_sequence(value: &str, seed: u32) -> String { fn scramble_sequence(value: &str, seed: u32) -> String {
@@ -53,9 +55,7 @@ fn main() {
let args = Args::parse(); let args = Args::parse();
match &args.command { match &args.command {
Command::Info => { Command::Info => info(),
println!("Not implemented yet");
}
Command::Scramble => scramble(), Command::Scramble => scramble(),
} }
@@ -87,3 +87,171 @@ fn scramble() {
buf.clear(); buf.clear();
} }
} }
fn info() {
let stdin = std::io::stdin();
let mut buf = String::new();
let mut headers = vec![];
let mut read_lens = vec![];
let mut quality_lens = vec![];
let headline_style = Style::new().bold();
let info_style = Style::new().bold().blue();
let error_style = Style::new().bold().red();
let mut line = 1;
while let Ok(n) = stdin.read_line(&mut buf) {
if n == 0 {
break;
}
if buf.starts_with("@") {
if let Ok(header) = buf.parse::<Header>() {
headers.push(header)
} else {
println!(
"{}",
error_style.apply_to(format!("🔥 Invalid header at line {}", line))
);
}
} else if buf.starts_with("+") {
// ignore optional description
} else if line % 4 == 0 {
// check if quality values differs from sequence values
if Some(&buf.trim().len()) != read_lens.last() {
println!(
"{}",
error_style
.apply_to(format!("🔥 Invalid quality string length at line {}", line))
);
return;
}
quality_lens.push(buf.trim().len());
} else if line % 4 == 2 {
read_lens.push(buf.trim().len());
}
line += 1;
buf.clear();
}
if line % 4 != 1 {
println!(
"{}",
error_style.apply_to("🔥 File contains invalid or incomplete sequences")
);
return;
}
println!(
"{} {}",
info_style.apply_to("🛈 "),
headline_style.apply_to(format!("Found {} complete sequence sets", headers.len()))
);
// Instruments
println!(
"{} {}",
info_style.apply_to("🛈 "),
headline_style.apply_to("Unique instrument name(s):")
);
println!(
"{}",
headers
.iter()
.map(|header| header.instrument_name())
.sorted()
.chunk_by(|value| value.clone())
.into_iter()
.map(|g| format!(" {} ({})", g.0, g.1.count()))
.collect::<Vec<String>>()
.join("\n")
);
// Flowcell IDs
println!(
"{} {}",
info_style.apply_to("🛈 "),
headline_style.apply_to("Flowcell ID(s):")
);
println!(
"{}",
headers
.iter()
.filter_map(|header| header.flowcell_id())
.sorted()
.chunk_by(|value| value.clone())
.into_iter()
.map(|g| format!(" {} ({})", g.0, g.1.count()))
.collect::<Vec<String>>()
.join("\n")
);
// Flowcell Lanes
println!(
"{} {}",
info_style.apply_to("🛈 "),
headline_style.apply_to("Flowcell lane(s):")
);
println!(
"{}",
headers
.iter()
.map(|header| header.flowcell_lane())
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| format!(" {} ({})", g.0, g.1.count()))
.collect::<Vec<String>>()
.join("\n")
);
// Read Orders
println!(
"{} {}",
info_style.apply_to("🛈 "),
headline_style.apply_to("Read order(s):")
);
println!(
"{}",
headers
.iter()
.map(|header| match header.pair_member() {
Pair::PairedEnd => "R1",
Pair::MatePair => "R2",
})
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| format!(" {} ({})", g.0, g.1.count()))
.collect::<Vec<String>>()
.join("\n")
);
// Read Lengths
println!(
"{} {}",
info_style.apply_to("🛈 "),
headline_style.apply_to("Read length(s):")
);
println!(
"{}",
read_lens
.iter()
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| format!(" {} ({})", g.0, g.1.count()))
.collect::<Vec<String>>()
.join("\n")
)
}