mirror of
https://github.com/pcvolkmer/fastq-tools.git
synced 2025-09-13 05:02:53 +00:00
feat: implement info subcommand
This commit is contained in:
@@ -10,3 +10,5 @@ readme = "README.md"
|
||||
[dependencies]
|
||||
regex = "1.11"
|
||||
clap = { version = "4.5", features = ["color", "derive"]}
|
||||
console = "0.16"
|
||||
itertools = "0.14"
|
||||
|
16
README.md
16
README.md
@@ -6,10 +6,24 @@ Application to show information about and scramble FASTQ files to provide non-se
|
||||
|
||||
This application provides the following subcommands
|
||||
|
||||
### Info
|
||||
|
||||
To show information about compressed FASTQ files use:
|
||||
|
||||
```shell
|
||||
cat file_fastq.gz | gzip -d | fastq-tools info
|
||||
```
|
||||
|
||||
This will produce output like the following:
|
||||
|
||||

|
||||
|
||||
### Scramble
|
||||
|
||||
To scramble compressed FASTQ files use:
|
||||
|
||||
```shell
|
||||
cat file_fastq.gz | gzip -d | fastq-tools scramble | gzip > scrambled_fastq.gz
|
||||
```
|
||||
```
|
||||
|
||||
This will scramble headers and sequences and write the output into `scrambled_fastq.gz`.
|
BIN
docs/info_subcommand.jpg
Normal file
BIN
docs/info_subcommand.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 16 KiB |
30
src/fastq.rs
30
src/fastq.rs
@@ -32,6 +32,34 @@ pub struct IlluminaHeader {
|
||||
}
|
||||
|
||||
impl Header {
|
||||
pub fn instrument_name(&self) -> String {
|
||||
match self {
|
||||
Header::Casava18(h) => h.instrument_name.clone(),
|
||||
Header::Illumina(h) => h.instrument_name.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn flowcell_id(&self) -> Option<String> {
|
||||
match self {
|
||||
Header::Casava18(h) => Some(h.flowcell_id.clone()),
|
||||
Header::Illumina(_) => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn flowcell_lane(&self) -> u32 {
|
||||
match self {
|
||||
Header::Casava18(h) => h.flowcell_lane,
|
||||
Header::Illumina(h) => h.flowcell_lane,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pair_member(&self) -> Pair {
|
||||
match self {
|
||||
Header::Casava18(h) => h.pair_member.clone(),
|
||||
Header::Illumina(h) => h.pair_member.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn scramble(self) -> Self {
|
||||
fn number(value: u32) -> u32 {
|
||||
value % 3 + value % 17 + value % 271 + value % 911
|
||||
@@ -213,7 +241,7 @@ impl FromStr for Header {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum Pair {
|
||||
PairedEnd = 1,
|
||||
MatePair = 2,
|
||||
|
176
src/main.rs
176
src/main.rs
@@ -2,8 +2,10 @@ mod cli;
|
||||
mod fastq;
|
||||
|
||||
use crate::cli::{Args, Command};
|
||||
use crate::fastq::Header;
|
||||
use crate::fastq::{Header, Pair};
|
||||
use clap::Parser;
|
||||
use console::Style;
|
||||
use itertools::Itertools;
|
||||
use regex::Regex;
|
||||
|
||||
fn scramble_sequence(value: &str, seed: u32) -> String {
|
||||
@@ -53,9 +55,7 @@ fn main() {
|
||||
let args = Args::parse();
|
||||
|
||||
match &args.command {
|
||||
Command::Info => {
|
||||
println!("Not implemented yet");
|
||||
}
|
||||
Command::Info => info(),
|
||||
Command::Scramble => scramble(),
|
||||
}
|
||||
|
||||
@@ -87,3 +87,171 @@ fn scramble() {
|
||||
buf.clear();
|
||||
}
|
||||
}
|
||||
|
||||
fn info() {
|
||||
let stdin = std::io::stdin();
|
||||
let mut buf = String::new();
|
||||
|
||||
let mut headers = vec![];
|
||||
let mut read_lens = vec![];
|
||||
let mut quality_lens = vec![];
|
||||
|
||||
let headline_style = Style::new().bold();
|
||||
let info_style = Style::new().bold().blue();
|
||||
let error_style = Style::new().bold().red();
|
||||
|
||||
let mut line = 1;
|
||||
while let Ok(n) = stdin.read_line(&mut buf) {
|
||||
if n == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
if buf.starts_with("@") {
|
||||
if let Ok(header) = buf.parse::<Header>() {
|
||||
headers.push(header)
|
||||
} else {
|
||||
println!(
|
||||
"{}",
|
||||
error_style.apply_to(format!("🔥 Invalid header at line {}", line))
|
||||
);
|
||||
}
|
||||
} else if buf.starts_with("+") {
|
||||
// ignore optional description
|
||||
} else if line % 4 == 0 {
|
||||
// check if quality values differs from sequence values
|
||||
if Some(&buf.trim().len()) != read_lens.last() {
|
||||
println!(
|
||||
"{}",
|
||||
error_style
|
||||
.apply_to(format!("🔥 Invalid quality string length at line {}", line))
|
||||
);
|
||||
return;
|
||||
}
|
||||
quality_lens.push(buf.trim().len());
|
||||
} else if line % 4 == 2 {
|
||||
read_lens.push(buf.trim().len());
|
||||
}
|
||||
|
||||
line += 1;
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
if line % 4 != 1 {
|
||||
println!(
|
||||
"{}",
|
||||
error_style.apply_to("🔥 File contains invalid or incomplete sequences")
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
println!(
|
||||
"{} {}",
|
||||
info_style.apply_to("🛈 "),
|
||||
headline_style.apply_to(format!("Found {} complete sequence sets", headers.len()))
|
||||
);
|
||||
|
||||
// Instruments
|
||||
|
||||
println!(
|
||||
"{} {}",
|
||||
info_style.apply_to("🛈 "),
|
||||
headline_style.apply_to("Unique instrument name(s):")
|
||||
);
|
||||
println!(
|
||||
"{}",
|
||||
headers
|
||||
.iter()
|
||||
.map(|header| header.instrument_name())
|
||||
.sorted()
|
||||
.chunk_by(|value| value.clone())
|
||||
.into_iter()
|
||||
.map(|g| format!(" {} ({})", g.0, g.1.count()))
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
);
|
||||
|
||||
// Flowcell IDs
|
||||
|
||||
println!(
|
||||
"{} {}",
|
||||
info_style.apply_to("🛈 "),
|
||||
headline_style.apply_to("Flowcell ID(s):")
|
||||
);
|
||||
println!(
|
||||
"{}",
|
||||
headers
|
||||
.iter()
|
||||
.filter_map(|header| header.flowcell_id())
|
||||
.sorted()
|
||||
.chunk_by(|value| value.clone())
|
||||
.into_iter()
|
||||
.map(|g| format!(" {} ({})", g.0, g.1.count()))
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
);
|
||||
|
||||
// Flowcell Lanes
|
||||
|
||||
println!(
|
||||
"{} {}",
|
||||
info_style.apply_to("🛈 "),
|
||||
headline_style.apply_to("Flowcell lane(s):")
|
||||
);
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
headers
|
||||
.iter()
|
||||
.map(|header| header.flowcell_lane())
|
||||
.sorted()
|
||||
.chunk_by(|value| value.to_string())
|
||||
.into_iter()
|
||||
.map(|g| format!(" {} ({})", g.0, g.1.count()))
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
);
|
||||
|
||||
// Read Orders
|
||||
|
||||
println!(
|
||||
"{} {}",
|
||||
info_style.apply_to("🛈 "),
|
||||
headline_style.apply_to("Read order(s):")
|
||||
);
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
headers
|
||||
.iter()
|
||||
.map(|header| match header.pair_member() {
|
||||
Pair::PairedEnd => "R1",
|
||||
Pair::MatePair => "R2",
|
||||
})
|
||||
.sorted()
|
||||
.chunk_by(|value| value.to_string())
|
||||
.into_iter()
|
||||
.map(|g| format!(" {} ({})", g.0, g.1.count()))
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
);
|
||||
|
||||
// Read Lengths
|
||||
|
||||
println!(
|
||||
"{} {}",
|
||||
info_style.apply_to("🛈 "),
|
||||
headline_style.apply_to("Read length(s):")
|
||||
);
|
||||
|
||||
println!(
|
||||
"{}",
|
||||
read_lens
|
||||
.iter()
|
||||
.sorted()
|
||||
.chunk_by(|value| value.to_string())
|
||||
.into_iter()
|
||||
.map(|g| format!(" {} ({})", g.0, g.1.count()))
|
||||
.collect::<Vec<String>>()
|
||||
.join("\n")
|
||||
)
|
||||
}
|
||||
|
Reference in New Issue
Block a user