diff --git a/Cargo.toml b/Cargo.toml index 8b7754d..828e752 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,14 @@ clap = { version = "4.5", features = ["color", "derive"]} console = "0.16" itertools = "0.14" flate2 = "1.1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +base16ct = { version = "0.2", features = ["alloc"] } +sha2 = { version = "0.10", default-features = false } + +[profile.release] +opt-level = "z" +codegen-units = 1 +lto = true +strip = true +panic = "abort" diff --git a/README.md b/README.md index 952cc81..cc2c5e6 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,13 @@ This application provides the following subcommands Usage: fastq-tools [OPTIONS] Commands: - info Show information about input - scramble Scramble input data - help Print this message or the help of the given subcommand(s) + info Show information about input + grz-metadata Show GRZ metadata + scramble Scramble input data + help Print this message or the help of the given subcommand(s) Options: - -i, --input Input file (optional) + -i, --input Input file -d, --decompress Decompress input as gzip compressed data -h, --help Print help -V, --version Print version @@ -44,6 +45,24 @@ This will result in output like ![Info subcommand](docs/info_subcommand.jpg) +### GRZ Metadata + +To generate GRZ metadata for a file use: + +```shell +fastq-tools --decompress --input file_fastq.gz grz-metadata +``` + +The use of the `--input` argument is required for this sub command. +If the file is an uncompressed FASTQ file, you can omit the `--decompress` option. 
+ +![GRZ Metadata subcommand](docs/grz-metadata_subcommand.jpg) + +Supported file types are: + +* fastq (full support) +* bam, bed, vcf (limited support) + ### Scramble To scramble compressed FASTQ files use: diff --git a/docs/grz-metadata_subcommand.jpg b/docs/grz-metadata_subcommand.jpg new file mode 100644 index 0000000..fbc27e5 Binary files /dev/null and b/docs/grz-metadata_subcommand.jpg differ diff --git a/src/cli.rs b/src/cli.rs index 780eb9b..26fa52f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -7,12 +7,19 @@ use std::path::PathBuf; pub struct Args { #[command(subcommand)] pub(crate) command: Command, - #[arg(short = 'i', long = "input", help = "Input file (optional)")] + #[arg( + short = 'i', + long = "input", + help = "Input file", + group = "metadata", + global = true + )] pub(crate) input_file: Option, #[arg( short = 'd', long = "decompress", - help = "Decompress input as gzip compressed data" + help = "Decompress input as gzip compressed data", + global = true )] pub(crate) decompress: bool, } @@ -21,6 +28,8 @@ pub struct Args { pub enum Command { #[command(about = "Show information about input")] Info, + #[command(about = "Show GRZ metadata")] + GrzMetadata, #[command(about = "Scramble input data")] Scramble, } diff --git a/src/fastq.rs b/src/fastq.rs index 4d25d62..b3807da 100644 --- a/src/fastq.rs +++ b/src/fastq.rs @@ -1,6 +1,8 @@ use crate::scramble_sequence; +use serde::Serialize; use std::fmt::Display; use std::str::FromStr; +use std::string::ToString; pub enum Header { Casava18(Casava18Header), @@ -241,9 +243,11 @@ impl FromStr for Header { } } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize)] pub enum Pair { + #[serde(rename = "R1")] PairedEnd = 1, + #[serde(rename = "R2")] MatePair = 2, } diff --git a/src/main.rs b/src/main.rs index d05e3a6..57380d2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,10 @@ mod cli; mod fastq; +mod metadata_file; use crate::cli::{Args, Command}; use crate::fastq::{Header, 
Pair}; +use crate::metadata_file::MetadataFile; use clap::Parser; use console::Style; use flate2::read::GzDecoder; @@ -10,6 +12,7 @@ use itertools::Itertools; use regex::Regex; use std::fs::File; use std::io::{BufRead, BufReader}; +use std::path::PathBuf; fn scramble_sequence(value: &str, seed: u32) -> String { let ahead_1 = Regex::new(r"T([ACG])").unwrap(); @@ -56,19 +59,61 @@ fn scramble_sequence(value: &str, seed: u32) -> String { fn main() { let args = Args::parse(); - let input: Box = match args.input_file { + + let input_file = args.input_file; + + match &args.command { + Command::Info => match input_reader(input_file, args.decompress) { + Ok(input) => info(input), + Err(err) => { + eprintln!( + "{}\n", + Style::new().bold().red().apply_to(format!("🔥 {err}")) + ); + } + }, + Command::GrzMetadata => match input_file { + Some(input_file) => { + let file_metadata = match MetadataFile::read_file(input_file, args.decompress) { + Ok(file_metadata) => file_metadata, + Err(err) => { + eprintln!( + "{}\n", + Style::new().bold().red().apply_to(format!("🔥 {err}")) + ); + return; + } + }; + + println!( + "{}\n", + serde_json::to_string_pretty(&file_metadata).unwrap() + ); + } + None => eprintln!( + "{}\n", + Style::new().bold().red().apply_to("🔥 No input file!") + ), + }, + Command::Scramble => match input_reader(input_file, args.decompress) { + Ok(input) => scramble(input), + Err(err) => { + eprintln!( + "{}\n", + Style::new().bold().red().apply_to(format!("🔥 {err}")) + ); + } + }, + } +} + +fn input_reader(input_file: Option, decompress: bool) -> Result, String> { + let input: Box = match input_file { Some(input_file) => { let file = match File::open(input_file) { Ok(file) => file, _ => { - println!( - "{}\n", - Style::new() - .bold() - .red() - .apply_to("🔥 Cannot open input file") - ); - return; + return Err("Cannot open input file".to_string()); } }; Box::new(BufReader::new(file)) @@ -76,19 +121,14 @@ fn main() { _ => Box::new(BufReader::new(std::io::stdin())), }; - 
let input: Box = if args.decompress { + let input: Box = if decompress { let gz_decoder = GzDecoder::new(input); Box::new(BufReader::new(gz_decoder)) } else { Box::new(input) }; - match &args.command { - Command::Info => info(input), - Command::Scramble => scramble(input), - } - - println!() + Ok(input) } fn scramble(mut reader: impl BufRead) { diff --git a/src/metadata_file.rs b/src/metadata_file.rs new file mode 100644 index 0000000..ce112c5 --- /dev/null +++ b/src/metadata_file.rs @@ -0,0 +1,307 @@ +use crate::fastq::{Header, Pair}; +use crate::input_reader; +use crate::metadata_file::MetadataError::{CannotReadFile, ReadError}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use std::error::Error; +use std::fmt::{Debug, Display, Formatter}; +use std::fs; +use std::fs::File; +use std::io::BufRead; +use std::path::PathBuf; + +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct MetadataFile { + /// Type of checksum algorithm used + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_type: Option, + + /// checksum of the file + pub file_checksum: String, + + /// Path relative to the submission files directory, e.g.: + /// 'patient_001/patient_001_dna.fastq.gz' if the file is located in /files/patient_001/patient_001_dna.fastq.gz + pub file_path: String, + + /// Size of the file in bytes + pub file_size_in_bytes: u64, + + /// Type of the file; if BED file is submitted, only 1 file is allowed. + pub file_type: FileType, + + /// Indicates the flow cell. + #[serde(skip_serializing_if = "Option::is_none")] + pub flowcell_id: Option, + + /// Indicates the lane + #[serde(skip_serializing_if = "Option::is_none")] + pub lane_id: Option, + + /// The read length; in the case of long-read sequencing it is the rounded average read + /// length. + #[serde(skip_serializing_if = "Option::is_none")] + pub read_length: Option, + + /// Indicates the read order for paired-end reads. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub read_order: Option, +} + +/// Type of checksum algorithm used +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ChecksumType { + Sha256, +} + +/// Type of the file; if BED file is submitted, only 1 file is allowed. +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FileType { + Bam, + + Bed, + + Fastq, + + Vcf, +} + +/// Indicates the read order for paired-end reads. +#[derive(Debug, Serialize, Deserialize)] +pub enum ReadOrder { + R1, + + R2, +} + +pub enum MetadataError { + CannotReadFile, + UnsupportedFile, + ReadError(String), +} + +impl Debug for MetadataError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + +impl Display for MetadataError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + MetadataError::CannotReadFile => "Cannot read file".into(), + MetadataError::UnsupportedFile => "Unsupported file type".into(), + MetadataError::ReadError(err) => format!("Error reading file: {}", err), + } + ) + } +} + +impl Error for MetadataError {} + +impl MetadataFile { + pub fn read_file(path: PathBuf, decompress: bool) -> Result { + let path = match path.to_str() { + Some(path) => path, + None => return Err(MetadataError::CannotReadFile), + }; + + let file = File::open(path).map_err(|_| CannotReadFile)?; + + let file_type = if path.to_lowercase().ends_with(".bam") { + FileType::Bam + } else if path.to_lowercase().ends_with(".vcf") { + FileType::Vcf + } else if path.to_lowercase().ends_with(".bed") { + FileType::Bed + } else if path.to_lowercase().ends_with(".fastq") + || path.to_lowercase().ends_with(".fastq.gz") + { + FileType::Fastq + } else { + return Err(MetadataError::UnsupportedFile); + }; + + let file_checksum = match fs::read(path) { + Ok(content) => { + let mut hasher = Sha256::new(); + hasher.update(content.as_slice()); + let 
hash = hasher.finalize();
+                base16ct::lower::encode_string(&hash)
+            }
+            Err(_) => {
+                return Err(CannotReadFile);
+            }
+        };
+
+        if let FileType::Fastq = file_type {
+            match input_reader(Some(PathBuf::from(path)), decompress) {
+                Ok(input_reader) => {
+                    let input_metadata = MetadataFile::read(input_reader)?;
+
+                    Ok(MetadataFile {
+                        file_type,
+                        file_checksum,
+                        checksum_type: Some(ChecksumType::Sha256),
+                        file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(),
+                        flowcell_id: input_metadata.flowcell_id,
+                        read_order: input_metadata.read_order,
+                        file_path: path.to_string(),
+                        read_length: input_metadata.read_length,
+                        lane_id: input_metadata.lane_id,
+                    })
+                }
+                Err(err) => Err(ReadError(err.to_string())),
+            }
+        } else {
+            Ok(MetadataFile {
+                file_type,
+                file_checksum,
+                checksum_type: Some(ChecksumType::Sha256),
+                file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(),
+                flowcell_id: None,
+                read_order: None,
+                file_path: path.to_string(),
+                read_length: None,
+                lane_id: None,
+            })
+        }
+    }
+
+    /// Scans a FASTQ stream and derives flowcell id, lane, read length and
+    /// read order; checksum/path/size fields are placeholders for the caller.
+    fn read(mut reader: impl BufRead) -> Result<MetadataFile, MetadataError> {
+        let mut buf = String::new();
+
+        let mut headers = vec![];
+        let mut read_lens = vec![];
+        let mut quality_lens = vec![];
+
+        let mut line = 1;
+        while let Ok(n) = reader.read_line(&mut buf) {
+            if n == 0 {
+                break;
+            }
+
+            // Dispatch on record position, not on the first character: FASTQ
+            // quality strings may legitimately start with '@' or '+'.
+            if line % 4 == 1 {
+                if let Ok(header) = buf.parse::<Header>() {
+                    headers.push(header)
+                } else {
+                    return Err(ReadError(format!("Invalid header at line {}", line)));
+                }
+            } else if line % 4 == 3 {
+                // ignore optional description on the '+' separator line
+            } else if line % 4 == 0 {
+                // check if quality string length differs from sequence length
+                if Some(&buf.trim().len()) != read_lens.last() {
+                    return Err(ReadError(format!(
+                        "Invalid quality string length at line {}",
+                        line
+                    )));
+                }
+                quality_lens.push(buf.trim().len());
+            } else if line % 4 == 2 {
+                read_lens.push(buf.trim().len());
+            }
+
+            line += 1;
+            buf.clear();
+        }
+
+        if line == 1 {
+            return Err(ReadError("No valid input".to_string()));
+        }
+
+        if line % 4 != 1 {
+            return Err(ReadError(
+                "File contains invalid or incomplete sequences".to_string(),
+            ));
+        }
+
+        // Flowcell IDs
+
+        let flowcell_ids = headers
+            .iter()
+            .filter_map(|header| header.flowcell_id())
+            .sorted()
+            .chunk_by(|value| value.clone())
+            .into_iter()
+            .map(|g| g.0)
+            .collect::<Vec<_>>();
+
+        // Flowcell Lanes
+
+        let flowcell_lanes = headers
+            .iter()
+            .map(|header| header.flowcell_lane())
+            .sorted()
+            .chunk_by(|value| value.to_string())
+            .into_iter()
+            .map(|g| g.0)
+            .collect::<Vec<_>>();
+
+        // Read Orders
+
+        let read_orders = headers
+            .iter()
+            .map(|header| match header.pair_member() {
+                Pair::PairedEnd => "R1",
+                Pair::MatePair => "R2",
+            })
+            .sorted()
+            .chunk_by(|value| value.to_string())
+            .into_iter()
+            .map(|g| g.0)
+            .collect::<Vec<_>>();
+
+        // Read Lengths
+
+        let read_lengths = read_lens
+            .iter()
+            .sorted()
+            .chunk_by(|value| value.to_string())
+            .into_iter()
+            .map(|g| g.0.parse().unwrap()) // element type inferred from the read_length field
+            .collect::<Vec<_>>();
+
+        Ok(MetadataFile {
+            checksum_type: Some(ChecksumType::Sha256),
+            file_checksum: String::new(),
+            file_path: String::new(),
+            file_size_in_bytes: 0,
+            file_type: FileType::Fastq,
+            flowcell_id: if flowcell_ids.len() == 1 {
+                Some(flowcell_ids.into_iter().next().unwrap())
+            } else {
+                return Err(ReadError("Cannot find single flowcell id".to_string()));
+            },
+            lane_id: if flowcell_lanes.len() == 1 {
+                Some(flowcell_lanes.into_iter().next().unwrap())
+            } else {
+                return Err(ReadError("Cannot find single lane id".to_string()));
+            },
+            read_length: if read_lengths.len() == 1 {
+                Some(read_lengths.into_iter().next().unwrap())
+            } else {
+                return Err(ReadError("Cannot find single read length".to_string()));
+            },
+            read_order: if read_orders.len() == 1 {
+                match read_orders.into_iter().next() {
+                    None => None,
+                    Some(value) => match value.as_str() {
+                        "R1" => Some(ReadOrder::R1),
+                        "R2" => Some(ReadOrder::R2),
+                        _ => None,
+                    },
+                }
+            } else {
+                return Err(ReadError("Cannot find single read order".to_string()));
+            },
+        })
+    }
+}