mirror of https://github.com/pcvolkmer/fastq-tools.git synced 2025-09-13 05:02:53 +00:00

feat: generate GRZ metadata for file

2025-08-14 23:25:53 +02:00
parent 9cc7762af2
commit d719fed150
7 changed files with 413 additions and 23 deletions


@@ -13,3 +13,14 @@ clap = { version = "4.5", features = ["color", "derive"]}
console = "0.16" console = "0.16"
itertools = "0.14" itertools = "0.14"
flate2 = "1.1" flate2 = "1.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
base16ct = { version = "0.2", features = ["alloc"] }
sha2 = { version = "0.10", default-features = false }
[profile.release]
opt-level = "z"
codegen-units = 1
lto = true
strip = true
panic = "abort"
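The new serde/serde_json, sha2 and base16ct dependencies back the JSON output and the file checksum of the new subcommand. A minimal sketch of the checksum pattern (the helper name and file path are placeholders; the calls mirror those in src/metadata_file.rs further down):

```rust
use sha2::{Digest, Sha256};

// Sketch only: hash a whole file with SHA-256 and hex-encode the digest,
// the same pattern used in MetadataFile::read_file below.
fn sha256_hex(path: &str) -> std::io::Result<String> {
    let content = std::fs::read(path)?;
    let mut hasher = Sha256::new();
    hasher.update(content.as_slice());
    let hash = hasher.finalize();
    Ok(base16ct::lower::encode_string(&hash))
}
```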


@@ -10,12 +10,13 @@ This application provides the following subcommands
 Usage: fastq-tools [OPTIONS] <COMMAND>

 Commands:
   info          Show information about input
+  grz-metadata  Show GRZ metadata
   scramble      Scramble input data
   help          Print this message or the help of the given subcommand(s)

 Options:
-  -i, --input <INPUT_FILE>  Input file (optional)
+  -i, --input <INPUT_FILE>  Input file
   -d, --decompress          Decompress input as gzip compressed data
   -h, --help                Print help
   -V, --version             Print version
@@ -44,6 +45,24 @@ This will result in output like
 ![Info subcommand](docs/info_subcommand.jpg)

+### GRZ Metadata
+
+To generate GRZ metadata for a file, use:
+
+```shell
+fastq-tools --decompress --input file_fastq.gz grz-metadata
+```
+
+The `--input` argument is required for this subcommand.
+If the file is an uncompressed FASTQ file, you can omit the `--decompress` option.
+
+![GRZ Metadata subcommand](docs/grz-metadata_subcommand.jpg)
+
+Supported file types are:
+
+* fastq (full support)
+* bam, bed, vcf (limited support)
+
 ### Scramble

 To scramble compressed FASTQ files use:
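For orientation, the JSON printed by `grz-metadata` follows the `MetadataFile` struct introduced below in src/metadata_file.rs. A purely illustrative example for a compressed paired-end FASTQ file (checksum, flowcell id, lane and sizes are made up):

```json
{
  "checksumType": "sha256",
  "fileChecksum": "<64-character sha256 hex digest>",
  "filePath": "file_fastq.gz",
  "fileSizeInBytes": 1234567,
  "fileType": "fastq",
  "flowcellId": "000000000-ABCDE",
  "laneId": "1",
  "readLength": 151,
  "readOrder": "R1"
}
```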

Binary file not shown (new image, 25 KiB).


@@ -7,12 +7,19 @@ use std::path::PathBuf;
 pub struct Args {
     #[command(subcommand)]
     pub(crate) command: Command,
-    #[arg(short = 'i', long = "input", help = "Input file (optional)")]
+    #[arg(
+        short = 'i',
+        long = "input",
+        help = "Input file",
+        group = "metadata",
+        global = true
+    )]
     pub(crate) input_file: Option<PathBuf>,
     #[arg(
         short = 'd',
         long = "decompress",
-        help = "Decompress input as gzip compressed data"
+        help = "Decompress input as gzip compressed data",
+        global = true
     )]
     pub(crate) decompress: bool,
 }
@@ -21,6 +28,8 @@ pub struct Args {
 pub enum Command {
     #[command(about = "Show information about input")]
     Info,
+    #[command(about = "Show GRZ metadata")]
+    GrzMetadata,
     #[command(about = "Scramble input data")]
     Scramble,
 }
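Because `--input` and `--decompress` are now declared with `global = true`, clap propagates them to the subcommands, so they can be given before or after the subcommand name. Hypothetical invocations (the file name is a placeholder):

```shell
# both spellings are equivalent once the arguments are global
fastq-tools --decompress --input sample_R1.fastq.gz grz-metadata
fastq-tools grz-metadata --input sample_R1.fastq.gz --decompress
```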


@@ -1,6 +1,8 @@
 use crate::scramble_sequence;
+use serde::Serialize;
 use std::fmt::Display;
 use std::str::FromStr;
+use std::string::ToString;

 pub enum Header {
     Casava18(Casava18Header),
@@ -241,9 +243,11 @@ impl FromStr for Header {
     }
 }

-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Serialize)]
 pub enum Pair {
+    #[serde(rename = "R1")]
     PairedEnd = 1,
+    #[serde(rename = "R2")]
     MatePair = 2,
 }
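With the added `Serialize` derive and the renames, `Pair` serializes to the strings "R1"/"R2" rather than to its numeric discriminants. A self-contained sketch (a stand-in copy of the enum, not the crate's own module):

```rust
use serde::Serialize;

// Stand-in copy of Pair from src/fastq.rs, only to show the serde behaviour:
// unit variants serialize as their (renamed) names; the discriminants are ignored.
#[derive(Serialize)]
enum Pair {
    #[serde(rename = "R1")]
    PairedEnd = 1,
    #[serde(rename = "R2")]
    MatePair = 2,
}

fn main() {
    assert_eq!(serde_json::to_string(&Pair::PairedEnd).unwrap(), "\"R1\"");
    assert_eq!(serde_json::to_string(&Pair::MatePair).unwrap(), "\"R2\"");
}
```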


@@ -1,8 +1,10 @@
 mod cli;
 mod fastq;
+mod metadata_file;

 use crate::cli::{Args, Command};
 use crate::fastq::{Header, Pair};
+use crate::metadata_file::MetadataFile;
 use clap::Parser;
 use console::Style;
 use flate2::read::GzDecoder;
@@ -10,6 +12,7 @@ use itertools::Itertools;
 use regex::Regex;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
+use std::path::PathBuf;

 fn scramble_sequence(value: &str, seed: u32) -> String {
     let ahead_1 = Regex::new(r"T([ACG])").unwrap();
@@ -56,19 +59,61 @@ fn scramble_sequence(value: &str, seed: u32) -> String {
 fn main() {
     let args = Args::parse();

-    let input: Box<dyn BufRead> = match args.input_file {
+    let input_file = args.input_file;
+
+    match &args.command {
+        Command::Info => match input_reader(input_file, args.decompress) {
+            Ok(input) => info(input),
+            Err(err) => {
+                eprintln!(
+                    "{}\n",
+                    Style::new().bold().red().apply_to(format!("🔥 {err}"))
+                );
+            }
+        },
+        Command::GrzMetadata => match input_file {
+            Some(input_file) => {
+                let file_metadata = match MetadataFile::read_file(input_file, args.decompress) {
+                    Ok(file_metadata) => file_metadata,
+                    Err(err) => {
+                        eprintln!(
+                            "{}\n",
+                            Style::new().bold().red().apply_to(format!("🔥 {err}"))
+                        );
+                        return;
+                    }
+                };
+                println!(
+                    "{}\n",
+                    serde_json::to_string_pretty(&file_metadata).unwrap()
+                );
+            }
+            None => eprintln!(
+                "{}\n",
+                Style::new().bold().red().apply_to("🔥 No input file!")
+            ),
+        },
+        Command::Scramble => match input_reader(input_file, args.decompress) {
+            Ok(input) => scramble(input),
+            Err(err) => {
+                eprintln!(
+                    "{}\n",
+                    Style::new().bold().red().apply_to(format!("🔥 {err}"))
+                );
+            }
+        },
+    }
+}
+
+fn input_reader(input_file: Option<PathBuf>, decompress: bool) -> Result<Box<dyn BufRead>, String> {
+    let input: Box<dyn BufRead> = match input_file {
         Some(input_file) => {
             let file = match File::open(input_file) {
                 Ok(file) => file,
                 _ => {
-                    println!(
-                        "{}\n",
-                        Style::new()
-                            .bold()
-                            .red()
-                            .apply_to("🔥 Cannot open input file")
-                    );
-                    return;
+                    return Err("Cannot open input file".to_string());
                 }
             };
             Box::new(BufReader::new(file))
@@ -76,19 +121,14 @@ fn main() {
         _ => Box::new(BufReader::new(std::io::stdin())),
     };

-    let input: Box<dyn BufRead> = if args.decompress {
+    let input: Box<dyn BufRead> = if decompress {
         let gz_decoder = GzDecoder::new(input);
         Box::new(BufReader::new(gz_decoder))
     } else {
         Box::new(input)
     };

-    match &args.command {
-        Command::Info => info(input),
-        Command::Scramble => scramble(input),
-    }
-
-    println!()
+    Ok(input)
 }

 fn scramble(mut reader: impl BufRead) {

src/metadata_file.rs Normal file

@@ -0,0 +1,307 @@
use crate::fastq::{Header, Pair};
use crate::input_reader;
use crate::metadata_file::MetadataError::{CannotReadFile, ReadError};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::error::Error;
use std::fmt::{Debug, Display, Formatter};
use std::fs;
use std::fs::File;
use std::io::BufRead;
use std::path::PathBuf;
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct MetadataFile {
/// Type of checksum algorithm used
#[serde(skip_serializing_if = "Option::is_none")]
pub checksum_type: Option<ChecksumType>,
/// checksum of the file
pub file_checksum: String,
/// Path relative to the submission files directory, e.g.:
/// 'patient_001/patient_001_dna.fastq.gz' if the file is located in <submission
/// root>/files/patient_001/patient_001_dna.fastq.gz
pub file_path: String,
/// Size of the file in bytes
pub file_size_in_bytes: u64,
/// Type of the file; if BED file is submitted, only 1 file is allowed.
pub file_type: FileType,
/// Indicates the flow cell.
#[serde(skip_serializing_if = "Option::is_none")]
pub flowcell_id: Option<String>,
/// Indicates the lane
#[serde(skip_serializing_if = "Option::is_none")]
pub lane_id: Option<String>,
/// The read length; in the case of long-read sequencing it is the rounded average read
/// length.
#[serde(skip_serializing_if = "Option::is_none")]
pub read_length: Option<i64>,
/// Indicates the read order for paired-end reads.
#[serde(skip_serializing_if = "Option::is_none")]
pub read_order: Option<ReadOrder>,
}
/// Type of checksum algorithm used
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ChecksumType {
Sha256,
}
/// Type of the file; if BED file is submitted, only 1 file is allowed.
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FileType {
Bam,
Bed,
Fastq,
Vcf,
}
/// Indicates the read order for paired-end reads.
#[derive(Debug, Serialize, Deserialize)]
pub enum ReadOrder {
R1,
R2,
}
pub enum MetadataError {
CannotReadFile,
UnsupportedFile,
ReadError(String),
}
impl Debug for MetadataError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self)
}
}
impl Display for MetadataError {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}",
match self {
MetadataError::CannotReadFile => "Cannot read file".into(),
MetadataError::UnsupportedFile => "Unsupported file type".into(),
MetadataError::ReadError(err) => format!("Error reading file: {}", err),
}
)
}
}
impl Error for MetadataError {}
impl MetadataFile {
pub fn read_file(path: PathBuf, decompress: bool) -> Result<MetadataFile, MetadataError> {
let path = match path.to_str() {
Some(path) => path,
None => return Err(MetadataError::CannotReadFile),
};
let file = File::open(path).map_err(|_| CannotReadFile)?;
let file_type = if path.to_lowercase().ends_with(".bam") {
FileType::Bam
} else if path.to_lowercase().ends_with(".vcf") {
FileType::Vcf
} else if path.to_lowercase().ends_with(".bed") {
FileType::Bed
} else if path.to_lowercase().ends_with(".fastq")
|| path.to_lowercase().ends_with(".fastq.gz")
{
FileType::Fastq
} else {
return Err(MetadataError::UnsupportedFile);
};
let file_checksum = match fs::read(path) {
Ok(content) => {
let mut hasher = Sha256::new();
hasher.update(content.as_slice());
let hash = hasher.finalize();
base16ct::lower::encode_string(&hash)
}
Err(_) => {
return Err(CannotReadFile);
}
};
if let FileType::Fastq = file_type {
match input_reader(Some(PathBuf::from(path)), decompress) {
Ok(input_reader) => {
let input_metadata = MetadataFile::read(input_reader)?;
Ok(MetadataFile {
file_type,
file_checksum,
checksum_type: Some(ChecksumType::Sha256),
file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(),
flowcell_id: input_metadata.flowcell_id,
read_order: input_metadata.read_order,
file_path: path.to_string(),
read_length: input_metadata.read_length,
lane_id: input_metadata.lane_id,
})
}
Err(err) => Err(ReadError(err.to_string())),
}
} else {
Ok(MetadataFile {
file_type,
file_checksum,
checksum_type: Some(ChecksumType::Sha256),
file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(),
flowcell_id: None,
read_order: None,
file_path: path.to_string(),
read_length: None,
lane_id: None,
})
}
}
fn read(mut reader: impl BufRead) -> Result<MetadataFile, MetadataError> {
let mut buf = String::new();
let mut headers = vec![];
let mut read_lens = vec![];
let mut quality_lens = vec![];
let mut line = 1;
while let Ok(n) = reader.read_line(&mut buf) {
if n == 0 {
break;
}
if buf.starts_with("@") {
if let Ok(header) = buf.parse::<Header>() {
headers.push(header)
} else {
return Err(ReadError(format!("Invalid header at line {}", line)));
}
} else if buf.starts_with("+") {
// ignore optional description
} else if line % 4 == 0 {
// check whether the quality string length differs from the sequence length
if Some(&buf.trim().len()) != read_lens.last() {
return Err(ReadError(format!(
"Invalid quality string length at line {}",
line
)));
}
quality_lens.push(buf.trim().len());
} else if line % 4 == 2 {
read_lens.push(buf.trim().len());
}
line += 1;
buf.clear();
}
if line == 1 {
return Err(ReadError("No valid input".to_string()));
}
if line % 4 != 1 {
return Err(ReadError(
"File contains invalid or incomplete sequences".to_string(),
));
}
// Flowcell IDs
let flowcell_ids = headers
.iter()
.filter_map(|header| header.flowcell_id())
.sorted()
.chunk_by(|value| value.clone())
.into_iter()
.map(|g| g.0)
.collect::<Vec<String>>();
// Flowcell Lanes
let flowcell_lanes = headers
.iter()
.map(|header| header.flowcell_lane())
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| g.0)
.collect::<Vec<String>>();
// Read Orders
let read_orders = headers
.iter()
.map(|header| match header.pair_member() {
Pair::PairedEnd => "R1",
Pair::MatePair => "R2",
})
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| g.0)
.collect::<Vec<String>>();
// Read Lengths
let read_lengths = read_lens
.iter()
.sorted()
.chunk_by(|value| value.to_string())
.into_iter()
.map(|g| g.0.parse::<i64>().unwrap())
.collect::<Vec<i64>>();
Ok(MetadataFile {
checksum_type: Some(ChecksumType::Sha256),
file_checksum: String::new(),
file_path: String::new(),
file_size_in_bytes: 0,
file_type: FileType::Fastq,
flowcell_id: if flowcell_ids.len() == 1 {
Some(flowcell_ids.into_iter().nth(0).unwrap())
} else {
return Err(ReadError("Cannot find single flowcell id".to_string()));
},
lane_id: if flowcell_lanes.len() == 1 {
Some(flowcell_lanes.into_iter().nth(0).unwrap())
} else {
return Err(ReadError("Cannot find single lane id".to_string()));
},
read_length: if read_lengths.len() == 1 {
Some(read_lengths.into_iter().nth(0).unwrap())
} else {
return Err(ReadError("Cannot find single read length".to_string()));
},
read_order: if read_orders.len() == 1 {
match read_orders.into_iter().nth(0) {
None => None,
Some(value) => match value.as_str() {
"R1" => Some(ReadOrder::R1),
"R2" => Some(ReadOrder::R2),
_ => None,
},
}
} else {
return Err(ReadError("Cannot find single lane id".to_string()));
},
})
}
}
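`MetadataFile::read` above assumes plain four-line FASTQ records with Casava 1.8-style headers; that is where flowcell id, lane and read order come from, and the `line % 4` bookkeeping enforces that each quality string matches its sequence length. A hypothetical record for illustration (all values invented):

```rust
fn main() {
    // Casava 1.8 header layout:
    // @instrument:run:flowcell:lane:tile:x:y read:is_filtered:control:index
    // Here: flowcell "000000000-ABCDE", lane "1", read order "1" (R1).
    // Sequence and quality lines have the same length (20), as read() requires.
    let record = "\
@M00123:42:000000000-ABCDE:1:1101:15589:1332 1:N:0:ATCACG
ACGTACGTACGTACGTACGT
+
IIIIIIIIIIIIIIIIIIII
";
    println!("{record}");
}
```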