mirror of
https://github.com/pcvolkmer/fastq-tools.git
synced 2025-09-13 05:02:53 +00:00
feat: generate GRZ metadata for file
This commit is contained in:
11
Cargo.toml
11
Cargo.toml
@@ -13,3 +13,14 @@ clap = { version = "4.5", features = ["color", "derive"]}
|
||||
console = "0.16"
|
||||
itertools = "0.14"
|
||||
flate2 = "1.1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
base16ct = { version = "0.2", features = ["alloc"] }
|
||||
sha2 = { version = "0.10", default-features = false }
|
||||
|
||||
[profile.release]
|
||||
opt-level = "z"
|
||||
codegen-units = 1
|
||||
lto = true
|
||||
strip = true
|
||||
panic = "abort"
|
||||
|
27
README.md
27
README.md
@@ -10,12 +10,13 @@ This application provides the following subcommands
|
||||
Usage: fastq-tools [OPTIONS] <COMMAND>
|
||||
|
||||
Commands:
|
||||
info Show information about input
|
||||
scramble Scramble input data
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
info Show information about input
|
||||
grz-metadata Show GRZ metadata
|
||||
scramble Scramble input data
|
||||
help Print this message or the help of the given subcommand(s)
|
||||
|
||||
Options:
|
||||
-i, --input <INPUT_FILE> Input file (optional)
|
||||
-i, --input <INPUT_FILE> Input file
|
||||
-d, --decompress Decompress input as gzip compressed data
|
||||
-h, --help Print help
|
||||
-V, --version Print version
|
||||
@@ -44,6 +45,24 @@ This will result in output like
|
||||
|
||||

|
||||
|
||||
### GRZ Metadata
|
||||
|
||||
To generate GRZ metadata for a file use:
|
||||
|
||||
```shell
|
||||
fastq-tools --decompress --input file_fastq.gz grz-metadata
|
||||
```
|
||||
|
||||
The `--input` argument is required for this subcommand.
|
||||
If the file is an uncompressed FASTQ file, you can omit the `--decompress` option.
|
||||
|
||||

|
||||
|
||||
Supported file types are:
|
||||
|
||||
* fastq (full support)
|
||||
* bam, bed, vcf (limited support)
|
||||
|
||||
### Scramble
|
||||
|
||||
To scramble compressed FASTQ files use:
|
||||
|
BIN
docs/grz-metadata_subcommand.jpg
Normal file
BIN
docs/grz-metadata_subcommand.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
13
src/cli.rs
13
src/cli.rs
@@ -7,12 +7,19 @@ use std::path::PathBuf;
|
||||
pub struct Args {
|
||||
#[command(subcommand)]
|
||||
pub(crate) command: Command,
|
||||
#[arg(short = 'i', long = "input", help = "Input file (optional)")]
|
||||
#[arg(
|
||||
short = 'i',
|
||||
long = "input",
|
||||
help = "Input file",
|
||||
group = "metadata",
|
||||
global = true
|
||||
)]
|
||||
pub(crate) input_file: Option<PathBuf>,
|
||||
#[arg(
|
||||
short = 'd',
|
||||
long = "decompress",
|
||||
help = "Decompress input as gzip compressed data"
|
||||
help = "Decompress input as gzip compressed data",
|
||||
global = true
|
||||
)]
|
||||
pub(crate) decompress: bool,
|
||||
}
|
||||
@@ -21,6 +28,8 @@ pub struct Args {
|
||||
pub enum Command {
|
||||
#[command(about = "Show information about input")]
|
||||
Info,
|
||||
#[command(about = "Show GRZ metadata")]
|
||||
GrzMetadata,
|
||||
#[command(about = "Scramble input data")]
|
||||
Scramble,
|
||||
}
|
||||
|
@@ -1,6 +1,8 @@
|
||||
use crate::scramble_sequence;
|
||||
use serde::Serialize;
|
||||
use std::fmt::Display;
|
||||
use std::str::FromStr;
|
||||
use std::string::ToString;
|
||||
|
||||
pub enum Header {
|
||||
Casava18(Casava18Header),
|
||||
@@ -241,9 +243,11 @@ impl FromStr for Header {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug, PartialEq, Serialize)]
|
||||
pub enum Pair {
|
||||
#[serde(rename = "R1")]
|
||||
PairedEnd = 1,
|
||||
#[serde(rename = "R2")]
|
||||
MatePair = 2,
|
||||
}
|
||||
|
||||
|
72
src/main.rs
72
src/main.rs
@@ -1,8 +1,10 @@
|
||||
mod cli;
|
||||
mod fastq;
|
||||
mod metadata_file;
|
||||
|
||||
use crate::cli::{Args, Command};
|
||||
use crate::fastq::{Header, Pair};
|
||||
use crate::metadata_file::MetadataFile;
|
||||
use clap::Parser;
|
||||
use console::Style;
|
||||
use flate2::read::GzDecoder;
|
||||
@@ -10,6 +12,7 @@ use itertools::Itertools;
|
||||
use regex::Regex;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn scramble_sequence(value: &str, seed: u32) -> String {
|
||||
let ahead_1 = Regex::new(r"T([ACG])").unwrap();
|
||||
@@ -56,19 +59,61 @@ fn scramble_sequence(value: &str, seed: u32) -> String {
|
||||
|
||||
fn main() {
|
||||
let args = Args::parse();
|
||||
let input: Box<dyn BufRead> = match args.input_file {
|
||||
|
||||
let input_file = args.input_file;
|
||||
|
||||
match &args.command {
|
||||
Command::Info => match input_reader(input_file, args.decompress) {
|
||||
Ok(input) => info(input),
|
||||
Err(err) => {
|
||||
eprintln!(
|
||||
"{}\n",
|
||||
Style::new().bold().red().apply_to(format!("🔥 {err}"))
|
||||
);
|
||||
}
|
||||
},
|
||||
Command::GrzMetadata => match input_file {
|
||||
Some(input_file) => {
|
||||
let file_metadata = match MetadataFile::read_file(input_file, args.decompress) {
|
||||
Ok(file_metadata) => file_metadata,
|
||||
Err(err) => {
|
||||
eprintln!(
|
||||
"{}\n",
|
||||
Style::new().bold().red().apply_to(format!("🔥 {err}"))
|
||||
);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
println!(
|
||||
"{}\n",
|
||||
serde_json::to_string_pretty(&file_metadata).unwrap()
|
||||
);
|
||||
}
|
||||
None => eprintln!(
|
||||
"{}\n",
|
||||
Style::new().bold().red().apply_to("🔥 No input file!")
|
||||
),
|
||||
},
|
||||
Command::Scramble => match input_reader(input_file, args.decompress) {
|
||||
Ok(input) => scramble(input),
|
||||
Err(err) => {
|
||||
eprintln!(
|
||||
"{}\n",
|
||||
Style::new().bold().red().apply_to(format!("🔥 {err}"))
|
||||
);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn input_reader(input_file: Option<PathBuf>, decompress: bool) -> Result<Box<dyn BufRead>, String> {
|
||||
let input: Box<dyn BufRead> = match input_file {
|
||||
Some(input_file) => {
|
||||
let file = match File::open(input_file) {
|
||||
Ok(file) => file,
|
||||
_ => {
|
||||
println!(
|
||||
"{}\n",
|
||||
Style::new()
|
||||
.bold()
|
||||
.red()
|
||||
.apply_to("🔥 Cannot open input file")
|
||||
);
|
||||
return;
|
||||
return Err("Cannot open input file".to_string());
|
||||
}
|
||||
};
|
||||
Box::new(BufReader::new(file))
|
||||
@@ -76,19 +121,14 @@ fn main() {
|
||||
_ => Box::new(BufReader::new(std::io::stdin())),
|
||||
};
|
||||
|
||||
let input: Box<dyn BufRead> = if args.decompress {
|
||||
let input: Box<dyn BufRead> = if decompress {
|
||||
let gz_decoder = GzDecoder::new(input);
|
||||
Box::new(BufReader::new(gz_decoder))
|
||||
} else {
|
||||
Box::new(input)
|
||||
};
|
||||
|
||||
match &args.command {
|
||||
Command::Info => info(input),
|
||||
Command::Scramble => scramble(input),
|
||||
}
|
||||
|
||||
println!()
|
||||
Ok(input)
|
||||
}
|
||||
|
||||
fn scramble(mut reader: impl BufRead) {
|
||||
|
307
src/metadata_file.rs
Normal file
307
src/metadata_file.rs
Normal file
@@ -0,0 +1,307 @@
|
||||
use crate::fastq::{Header, Pair};
|
||||
use crate::input_reader;
|
||||
use crate::metadata_file::MetadataError::{CannotReadFile, ReadError};
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::error::Error;
|
||||
use std::fmt::{Debug, Display, Formatter};
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::BufRead;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct MetadataFile {
|
||||
/// Type of checksum algorithm used
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub checksum_type: Option<ChecksumType>,
|
||||
|
||||
/// checksum of the file
|
||||
pub file_checksum: String,
|
||||
|
||||
/// Path relative to the submission files directory, e.g.:
|
||||
/// 'patient_001/patient_001_dna.fastq.gz' if the file is located in <submission
|
||||
/// root>/files/patient_001/patient_001_dna.fastq.gz
|
||||
pub file_path: String,
|
||||
|
||||
/// Size of the file in bytes
|
||||
pub file_size_in_bytes: u64,
|
||||
|
||||
/// Type of the file; if BED file is submitted, only 1 file is allowed.
|
||||
pub file_type: FileType,
|
||||
|
||||
/// Indicates the flow cell.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub flowcell_id: Option<String>,
|
||||
|
||||
/// Indicates the lane
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub lane_id: Option<String>,
|
||||
|
||||
/// The read length; in the case of long-read sequencing it is the rounded average read
|
||||
/// length.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub read_length: Option<i64>,
|
||||
|
||||
/// Indicates the read order for paired-end reads.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub read_order: Option<ReadOrder>,
|
||||
}
|
||||
|
||||
/// Type of checksum algorithm used
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ChecksumType {
|
||||
Sha256,
|
||||
}
|
||||
|
||||
/// Type of the file; if BED file is submitted, only 1 file is allowed.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum FileType {
|
||||
Bam,
|
||||
|
||||
Bed,
|
||||
|
||||
Fastq,
|
||||
|
||||
Vcf,
|
||||
}
|
||||
|
||||
/// Indicates the read order for paired-end reads.
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum ReadOrder {
|
||||
R1,
|
||||
|
||||
R2,
|
||||
}
|
||||
|
||||
/// Errors that can occur while collecting metadata for a file.
#[derive(Clone, PartialEq, Eq)]
pub enum MetadataError {
    /// The file could not be opened or read.
    CannotReadFile,
    /// The file name does not match any supported file type.
    UnsupportedFile,
    /// The file content could not be processed; the payload describes the problem.
    ReadError(String),
}

impl Debug for MetadataError {
    /// Debug output intentionally equals the user-facing [`Display`] text.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        Display::fmt(self, f)
    }
}

impl Display for MetadataError {
    /// Writes the user-facing message directly into the formatter without building an
    /// intermediate `String`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            MetadataError::CannotReadFile => f.write_str("Cannot read file"),
            MetadataError::UnsupportedFile => f.write_str("Unsupported file type"),
            MetadataError::ReadError(err) => write!(f, "Error reading file: {}", err),
        }
    }
}

impl Error for MetadataError {}
|
||||
|
||||
impl MetadataFile {
|
||||
pub fn read_file(path: PathBuf, decompress: bool) -> Result<MetadataFile, MetadataError> {
|
||||
let path = match path.to_str() {
|
||||
Some(path) => path,
|
||||
None => return Err(MetadataError::CannotReadFile),
|
||||
};
|
||||
|
||||
let file = File::open(path).map_err(|_| CannotReadFile)?;
|
||||
|
||||
let file_type = if path.to_lowercase().ends_with(".bam") {
|
||||
FileType::Bam
|
||||
} else if path.to_lowercase().ends_with(".vcf") {
|
||||
FileType::Vcf
|
||||
} else if path.to_lowercase().ends_with(".bed") {
|
||||
FileType::Bed
|
||||
} else if path.to_lowercase().ends_with(".fastq")
|
||||
|| path.to_lowercase().ends_with(".fastq.gz")
|
||||
{
|
||||
FileType::Fastq
|
||||
} else {
|
||||
return Err(MetadataError::UnsupportedFile);
|
||||
};
|
||||
|
||||
let file_checksum = match fs::read(path) {
|
||||
Ok(content) => {
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(content.as_slice());
|
||||
let hash = hasher.finalize();
|
||||
base16ct::lower::encode_string(&hash)
|
||||
}
|
||||
Err(_) => {
|
||||
return Err(CannotReadFile);
|
||||
}
|
||||
};
|
||||
|
||||
if let FileType::Fastq = file_type {
|
||||
match input_reader(Some(PathBuf::from(path)), decompress) {
|
||||
Ok(input_reader) => {
|
||||
let input_metadata = MetadataFile::read(input_reader)?;
|
||||
|
||||
Ok(MetadataFile {
|
||||
file_type,
|
||||
file_checksum,
|
||||
checksum_type: Some(ChecksumType::Sha256),
|
||||
file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(),
|
||||
flowcell_id: input_metadata.flowcell_id,
|
||||
read_order: input_metadata.read_order,
|
||||
file_path: path.to_string(),
|
||||
read_length: input_metadata.read_length,
|
||||
lane_id: input_metadata.lane_id,
|
||||
})
|
||||
}
|
||||
Err(err) => Err(ReadError(err.to_string())),
|
||||
}
|
||||
} else {
|
||||
Ok(MetadataFile {
|
||||
file_type,
|
||||
file_checksum,
|
||||
checksum_type: Some(ChecksumType::Sha256),
|
||||
file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(),
|
||||
flowcell_id: None,
|
||||
read_order: None,
|
||||
file_path: path.to_string(),
|
||||
read_length: None,
|
||||
lane_id: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn read(mut reader: impl BufRead) -> Result<MetadataFile, MetadataError> {
|
||||
let mut buf = String::new();
|
||||
|
||||
let mut headers = vec![];
|
||||
let mut read_lens = vec![];
|
||||
let mut quality_lens = vec![];
|
||||
|
||||
let mut line = 1;
|
||||
while let Ok(n) = reader.read_line(&mut buf) {
|
||||
if n == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
if buf.starts_with("@") {
|
||||
if let Ok(header) = buf.parse::<Header>() {
|
||||
headers.push(header)
|
||||
} else {
|
||||
return Err(ReadError(format!("Invalid header at line {}", line)));
|
||||
}
|
||||
} else if buf.starts_with("+") {
|
||||
// ignore optional description
|
||||
} else if line % 4 == 0 {
|
||||
// check if quality values differs from sequence values
|
||||
if Some(&buf.trim().len()) != read_lens.last() {
|
||||
return Err(ReadError(format!(
|
||||
"Invalid quality string length at line {}",
|
||||
line
|
||||
)));
|
||||
}
|
||||
quality_lens.push(buf.trim().len());
|
||||
} else if line % 4 == 2 {
|
||||
read_lens.push(buf.trim().len());
|
||||
}
|
||||
|
||||
line += 1;
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
if line == 1 {
|
||||
return Err(ReadError("No valid input".to_string()));
|
||||
}
|
||||
|
||||
if line % 4 != 1 {
|
||||
return Err(ReadError(
|
||||
"File contains invalid or incomplete sequences".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
// Flowcell IDs
|
||||
|
||||
let flowcell_ids = headers
|
||||
.iter()
|
||||
.filter_map(|header| header.flowcell_id())
|
||||
.sorted()
|
||||
.chunk_by(|value| value.clone())
|
||||
.into_iter()
|
||||
.map(|g| g.0)
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
// Flowcell Lanes
|
||||
|
||||
let flowcell_lanes = headers
|
||||
.iter()
|
||||
.map(|header| header.flowcell_lane())
|
||||
.sorted()
|
||||
.chunk_by(|value| value.to_string())
|
||||
.into_iter()
|
||||
.map(|g| g.0)
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
// Read Orders
|
||||
|
||||
let read_orders = headers
|
||||
.iter()
|
||||
.map(|header| match header.pair_member() {
|
||||
Pair::PairedEnd => "R1",
|
||||
Pair::MatePair => "R2",
|
||||
})
|
||||
.sorted()
|
||||
.chunk_by(|value| value.to_string())
|
||||
.into_iter()
|
||||
.map(|g| g.0)
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
// Read Lengths
|
||||
|
||||
let read_leans = read_lens
|
||||
.iter()
|
||||
.sorted()
|
||||
.chunk_by(|value| value.to_string())
|
||||
.into_iter()
|
||||
.map(|g| g.0.parse::<i64>().unwrap())
|
||||
.collect::<Vec<i64>>();
|
||||
|
||||
Ok(MetadataFile {
|
||||
checksum_type: Some(ChecksumType::Sha256),
|
||||
file_checksum: String::new(),
|
||||
file_path: String::new(),
|
||||
file_size_in_bytes: 0,
|
||||
file_type: FileType::Fastq,
|
||||
flowcell_id: if flowcell_ids.len() == 1 {
|
||||
Some(flowcell_ids.into_iter().nth(0).unwrap())
|
||||
} else {
|
||||
return Err(ReadError("Cannot find single flowcell id".to_string()));
|
||||
},
|
||||
lane_id: if flowcell_lanes.len() == 1 {
|
||||
Some(flowcell_lanes.into_iter().nth(0).unwrap())
|
||||
} else {
|
||||
return Err(ReadError("Cannot find single lane id".to_string()));
|
||||
},
|
||||
read_length: if read_leans.len() == 1 {
|
||||
Some(read_leans.into_iter().nth(0).unwrap())
|
||||
} else {
|
||||
return Err(ReadError("Cannot find single lane id".to_string()));
|
||||
},
|
||||
read_order: if read_orders.len() == 1 {
|
||||
match read_orders.into_iter().nth(0) {
|
||||
None => None,
|
||||
Some(value) => match value.as_str() {
|
||||
"R1" => Some(ReadOrder::R1),
|
||||
"R2" => Some(ReadOrder::R2),
|
||||
_ => None,
|
||||
},
|
||||
}
|
||||
} else {
|
||||
return Err(ReadError("Cannot find single lane id".to_string()));
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user