1
0
mirror of https://github.com/pcvolkmer/fastq-tools.git synced 2025-09-13 05:02:53 +00:00

feat: generate GRZ metadata for file

This commit is contained in:
2025-08-14 23:25:53 +02:00
parent 9cc7762af2
commit d719fed150
7 changed files with 413 additions and 23 deletions

View File

@@ -13,3 +13,14 @@ clap = { version = "4.5", features = ["color", "derive"]}
console = "0.16"
itertools = "0.14"
flate2 = "1.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
base16ct = { version = "0.2", features = ["alloc"] }
sha2 = { version = "0.10", default-features = false }
[profile.release]
opt-level = "z"
codegen-units = 1
lto = true
strip = true
panic = "abort"

View File

@@ -10,12 +10,13 @@ This application provides the following subcommands
Usage: fastq-tools [OPTIONS] <COMMAND>
Commands:
info Show information about input
scramble Scramble input data
help Print this message or the help of the given subcommand(s)
info Show information about input
grz-metadata Show GRZ metadata
scramble Scramble input data
help Print this message or the help of the given subcommand(s)
Options:
-i, --input <INPUT_FILE> Input file (optional)
-i, --input <INPUT_FILE> Input file
-d, --decompress Decompress input as gzip compressed data
-h, --help Print help
-V, --version Print version
@@ -44,6 +45,24 @@ This will result in output like
![Info subcommand](docs/info_subcommand.jpg)
### GRZ Metadata
To generate GRZ metadata for a file use:
```shell
fastq-tools --decompress --input file_fastq.gz grz-metadata
```
The `--input` argument is required for this subcommand.
If the file is an uncompressed FASTQ file, you can omit the `--decompress` option.
![GRZ Metadata subcommand](docs/grz-metadata_subcommand.jpg)
Supported file types are:
* fastq (full support)
* bam, bed, vcf (limited support)
### Scramble
To scramble compressed FASTQ files use:

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

View File

@@ -7,12 +7,19 @@ use std::path::PathBuf;
pub struct Args {
#[command(subcommand)]
pub(crate) command: Command,
#[arg(short = 'i', long = "input", help = "Input file (optional)")]
#[arg(
short = 'i',
long = "input",
help = "Input file",
group = "metadata",
global = true
)]
pub(crate) input_file: Option<PathBuf>,
#[arg(
short = 'd',
long = "decompress",
help = "Decompress input as gzip compressed data"
help = "Decompress input as gzip compressed data",
global = true
)]
pub(crate) decompress: bool,
}
@@ -21,6 +28,8 @@ pub struct Args {
// Subcommands selectable on the command line.
// Plain `//` comments are used on purpose: clap's derive turns `///` doc comments into help
// text, while the help shown to users is already set through the `about` attributes.
pub enum Command {
// Passes the input reader to `info`.
#[command(about = "Show information about input")]
Info,
// Prints the metadata of the input file as pretty-printed JSON; `main` reports
// "No input file!" when no `--input` was given.
#[command(about = "Show GRZ metadata")]
GrzMetadata,
// Passes the input reader to `scramble`.
#[command(about = "Scramble input data")]
Scramble,
}

View File

@@ -1,6 +1,8 @@
use crate::scramble_sequence;
use serde::Serialize;
use std::fmt::Display;
use std::str::FromStr;
use std::string::ToString;
pub enum Header {
Casava18(Casava18Header),
@@ -241,9 +243,11 @@ impl FromStr for Header {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug, PartialEq, Serialize)]
pub enum Pair {
#[serde(rename = "R1")]
PairedEnd = 1,
#[serde(rename = "R2")]
MatePair = 2,
}

View File

@@ -1,8 +1,10 @@
mod cli;
mod fastq;
mod metadata_file;
use crate::cli::{Args, Command};
use crate::fastq::{Header, Pair};
use crate::metadata_file::MetadataFile;
use clap::Parser;
use console::Style;
use flate2::read::GzDecoder;
@@ -10,6 +12,7 @@ use itertools::Itertools;
use regex::Regex;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
fn scramble_sequence(value: &str, seed: u32) -> String {
let ahead_1 = Regex::new(r"T([ACG])").unwrap();
@@ -56,19 +59,61 @@ fn scramble_sequence(value: &str, seed: u32) -> String {
fn main() {
let args = Args::parse();
let input: Box<dyn BufRead> = match args.input_file {
let input_file = args.input_file;
match &args.command {
Command::Info => match input_reader(input_file, args.decompress) {
Ok(input) => info(input),
Err(err) => {
eprintln!(
"{}\n",
Style::new().bold().red().apply_to(format!("🔥 {err}"))
);
}
},
Command::GrzMetadata => match input_file {
Some(input_file) => {
let file_metadata = match MetadataFile::read_file(input_file, args.decompress) {
Ok(file_metadata) => file_metadata,
Err(err) => {
eprintln!(
"{}\n",
Style::new().bold().red().apply_to(format!("🔥 {err}"))
);
return;
}
};
println!(
"{}\n",
serde_json::to_string_pretty(&file_metadata).unwrap()
);
}
None => eprintln!(
"{}\n",
Style::new().bold().red().apply_to("🔥 No input file!")
),
},
Command::Scramble => match input_reader(input_file, args.decompress) {
Ok(input) => scramble(input),
Err(err) => {
eprintln!(
"{}\n",
Style::new().bold().red().apply_to(format!("🔥 {err}"))
);
}
},
}
}
fn input_reader(input_file: Option<PathBuf>, decompress: bool) -> Result<Box<dyn BufRead>, String> {
let input: Box<dyn BufRead> = match input_file {
Some(input_file) => {
let file = match File::open(input_file) {
Ok(file) => file,
_ => {
println!(
"{}\n",
Style::new()
.bold()
.red()
.apply_to("🔥 Cannot open input file")
);
return;
return Err("Cannot open input file".to_string());
}
};
Box::new(BufReader::new(file))
@@ -76,19 +121,14 @@ fn main() {
_ => Box::new(BufReader::new(std::io::stdin())),
};
let input: Box<dyn BufRead> = if args.decompress {
let input: Box<dyn BufRead> = if decompress {
let gz_decoder = GzDecoder::new(input);
Box::new(BufReader::new(gz_decoder))
} else {
Box::new(input)
};
match &args.command {
Command::Info => info(input),
Command::Scramble => scramble(input),
}
println!()
Ok(input)
}
fn scramble(mut reader: impl BufRead) {

307
src/metadata_file.rs Normal file
View File

@@ -0,0 +1,307 @@
use crate::fastq::{Header, Pair};
use crate::input_reader;
use crate::metadata_file::MetadataError::{CannotReadFile, ReadError};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::fs;
use std::fs::File;
use std::io::BufRead;
use std::path::PathBuf;
/// GRZ metadata record describing a single file.
///
/// Field names are serialized in camelCase. Optional fields that are `None` are left out of
/// the serialized output instead of being written as `null`.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct MetadataFile {
/// Type of checksum algorithm used
#[serde(skip_serializing_if = "Option::is_none")]
pub checksum_type: Option<ChecksumType>,
/// Checksum of the file
pub file_checksum: String,
/// Path relative to the submission files directory, e.g.:
/// 'patient_001/patient_001_dna.fastq.gz' if the file is located in <submission
/// root>/files/patient_001/patient_001_dna.fastq.gz
///
/// NOTE(review): `read_file` stores the path exactly as it was passed in and does not make it
/// relative to a submission directory.
pub file_path: String,
/// Size of the file in bytes
pub file_size_in_bytes: u64,
/// Type of the file; if BED file is submitted, only 1 file is allowed.
pub file_type: FileType,
/// Indicates the flow cell.
#[serde(skip_serializing_if = "Option::is_none")]
pub flowcell_id: Option<String>,
/// Indicates the lane
#[serde(skip_serializing_if = "Option::is_none")]
pub lane_id: Option<String>,
/// The read length; in the case of long-read sequencing it is the rounded average read
/// length.
#[serde(skip_serializing_if = "Option::is_none")]
pub read_length: Option<i64>,
/// Indicates the read order for paired-end reads.
#[serde(skip_serializing_if = "Option::is_none")]
pub read_order: Option<ReadOrder>,
}
/// Type of checksum algorithm used
///
/// Variant names are serialized in snake_case.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChecksumType {
/// SHA-256; the only algorithm `MetadataFile::read_file` computes.
Sha256,
}
/// Type of the file; if BED file is submitted, only 1 file is allowed.
///
/// Variant names are serialized in snake_case (`bam`, `bed`, `fastq`, `vcf`).
/// `MetadataFile::read_file` picks the variant from the file name suffix.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileType {
/// File name ends with `.bam`
Bam,
/// File name ends with `.bed`
Bed,
/// File name ends with `.fastq` or `.fastq.gz`; the only type whose content is inspected
Fastq,
/// File name ends with `.vcf`
Vcf,
}
/// Indicates the read order for paired-end reads.
///
/// No serde renaming is applied, so the variants are serialized as `R1` and `R2`.
#[derive(Debug, Serialize, Deserialize)]
pub enum ReadOrder {
/// Chosen when the FASTQ headers report `Pair::PairedEnd`
R1,
/// Chosen when the FASTQ headers report `Pair::MatePair`
R2,
}
/// Errors reported while collecting metadata for a file.
pub enum MetadataError {
/// The path is not valid UTF-8, or the file could not be opened or read.
CannotReadFile,
/// The file name does not end with one of the supported suffixes.
UnsupportedFile,
/// The file content could not be processed; the payload is a human-readable reason.
ReadError(String),
}
impl Debug for MetadataError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self)
}
}
impl Display for MetadataError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}",
match self {
MetadataError::CannotReadFile => "Cannot read file".into(),
MetadataError::UnsupportedFile => "Unsupported file type".into(),
MetadataError::ReadError(err) => format!("Error reading file: {}", err),
}
)
}
}
// Empty impl: only the default `Error` methods are used, no underlying source is exposed.
impl Error for MetadataError {}
impl MetadataFile {
/// Collects the GRZ metadata for the file at `path`.
///
/// The file type is derived from the (case-insensitive) file name suffix: `.bam`, `.vcf`,
/// `.bed`, `.fastq` or `.fastq.gz`. A SHA-256 checksum and the file size are recorded for
/// every supported file. For FASTQ files the content is additionally parsed to fill in
/// flowcell id, lane id, read length and read order; `decompress` tells the reader whether
/// the content has to be gunzipped first and is ignored for all other file types.
///
/// # Errors
///
/// * `MetadataError::CannotReadFile` if the path is not valid UTF-8 or the file cannot be
///   opened or read.
/// * `MetadataError::UnsupportedFile` if the suffix is not one of the supported ones.
/// * `MetadataError::ReadError` if the FASTQ content cannot be opened or parsed.
pub fn read_file(path: PathBuf, decompress: bool) -> Result<MetadataFile, MetadataError> {
    let path = match path.to_str() {
        Some(path) => path,
        None => return Err(CannotReadFile),
    };
    // Opened before looking at the suffix so an unreadable file is reported as such.
    let file = File::open(path).map_err(|_| CannotReadFile)?;

    // Lower-case once instead of once per suffix comparison.
    let lowercase_path = path.to_lowercase();
    let file_type = if lowercase_path.ends_with(".bam") {
        FileType::Bam
    } else if lowercase_path.ends_with(".vcf") {
        FileType::Vcf
    } else if lowercase_path.ends_with(".bed") {
        FileType::Bed
    } else if lowercase_path.ends_with(".fastq") || lowercase_path.ends_with(".fastq.gz") {
        FileType::Fastq
    } else {
        return Err(MetadataError::UnsupportedFile);
    };

    // Hash the raw file in fixed-size chunks: sequencing files can be far larger than the
    // available memory, so the content must not be loaded as a whole.
    let mut hasher = Sha256::new();
    let mut source = &file;
    let mut chunk = vec![0u8; 64 * 1024];
    loop {
        match std::io::Read::read(&mut source, &mut chunk[..]) {
            Ok(0) => break,
            Ok(n) => hasher.update(&chunk[..n]),
            // An interrupted read is not a failure; simply try again.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(_) => return Err(CannotReadFile),
        }
    }
    let hash = hasher.finalize();
    let file_checksum = base16ct::lower::encode_string(&hash);

    // Only FASTQ content is inspected; every other type keeps the content fields empty.
    let (flowcell_id, lane_id, read_length, read_order) = if let FileType::Fastq = file_type {
        let reader = input_reader(Some(PathBuf::from(path)), decompress)
            .map_err(|err| ReadError(err.to_string()))?;
        let parsed = MetadataFile::read(reader)?;
        (
            parsed.flowcell_id,
            parsed.lane_id,
            parsed.read_length,
            parsed.read_order,
        )
    } else {
        (None, None, None, None)
    };

    let file_size_in_bytes = fs::metadata(path).map_err(|_| CannotReadFile)?.len();

    Ok(MetadataFile {
        file_type,
        file_checksum,
        checksum_type: Some(ChecksumType::Sha256),
        file_size_in_bytes,
        flowcell_id,
        read_order,
        file_path: path.to_string(),
        read_length,
        lane_id,
    })
}
fn read(mut reader: impl BufRead) -> Result<MetadataFile, MetadataError> {
let mut buf = String::new();
let mut headers = vec![];
let mut read_lens = vec![];
let mut quality_lens = vec![];
let mut line = 1;
while let Ok(n) = reader.read_line(&mut buf) {
if n == 0 {
break;
}
if buf.starts_with("@") {
if let Ok(header) = buf.parse::<Header>() {
headers.push(header)
} else {
return Err(ReadError(format!("Invalid header at line {}", line)));
}
} else if buf.starts_with("+") {
// ignore optional description
} else if line % 4 == 0 {
// check if quality values differs from sequence values
if Some(&buf.trim().len()) != read_lens.last() {
return Err(ReadError(format!(
"Invalid quality string length at line {}",
line
)));
}
quality_lens.push(buf.trim().len());
} else if line % 4 == 2 {
read_lens.push(buf.trim().len());
}
line += 1;
buf.clear();
}
if line == 1 {
return Err(ReadError("No valid input".to_string()));
}
if line % 4 != 1 {
return Err(ReadError(
"File contains invalid or incomplete sequences".to_string(),
));
}
// Flowcell IDs
let flowcell_ids = headers
.iter()
.filter_map(|header| header.flowcell_id())
.sorted()
.chunk_by(|value| value.clone())
.into_iter()
.map(|g| g.0)
.collect::<Vec<String>>();
// Flowcell Lanes
let flowcell_lanes = headers
.iter()
.map(|header| header.flowcell_lane())
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| g.0)
.collect::<Vec<String>>();
// Read Orders
let read_orders = headers
.iter()
.map(|header| match header.pair_member() {
Pair::PairedEnd => "R1",
Pair::MatePair => "R2",
})
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| g.0)
.collect::<Vec<String>>();
// Read Lengths
let read_leans = read_lens
.iter()
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| g.0.parse::<i64>().unwrap())
.collect::<Vec<i64>>();
Ok(MetadataFile {
checksum_type: Some(ChecksumType::Sha256),
file_checksum: String::new(),
file_path: String::new(),
file_size_in_bytes: 0,
file_type: FileType::Fastq,
flowcell_id: if flowcell_ids.len() == 1 {
Some(flowcell_ids.into_iter().nth(0).unwrap())
} else {
return Err(ReadError("Cannot find single flowcell id".to_string()));
},
lane_id: if flowcell_lanes.len() == 1 {
Some(flowcell_lanes.into_iter().nth(0).unwrap())
} else {
return Err(ReadError("Cannot find single lane id".to_string()));
},
read_length: if read_leans.len() == 1 {
Some(read_leans.into_iter().nth(0).unwrap())
} else {
return Err(ReadError("Cannot find single lane id".to_string()));
},
read_order: if read_orders.len() == 1 {
match read_orders.into_iter().nth(0) {
None => None,
Some(value) => match value.as_str() {
"R1" => Some(ReadOrder::R1),
"R2" => Some(ReadOrder::R2),
_ => None,
},
}
} else {
return Err(ReadError("Cannot find single lane id".to_string()));
},
})
}
}