From d719fed1507eb2198311a3a15d3a5a13eb6e2b14 Mon Sep 17 00:00:00 2001 From: Paul-Christian Volkmer Date: Thu, 14 Aug 2025 23:25:53 +0200 Subject: [PATCH] feat: generate GRZ metadata for file --- Cargo.toml | 11 ++ README.md | 27 ++- docs/grz-metadata_subcommand.jpg | Bin 0 -> 25777 bytes src/cli.rs | 13 +- src/fastq.rs | 6 +- src/main.rs | 72 ++++++-- src/metadata_file.rs | 307 +++++++++++++++++++++++++++++++ 7 files changed, 413 insertions(+), 23 deletions(-) create mode 100644 docs/grz-metadata_subcommand.jpg create mode 100644 src/metadata_file.rs diff --git a/Cargo.toml b/Cargo.toml index 8b7754d..828e752 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,14 @@ clap = { version = "4.5", features = ["color", "derive"]} console = "0.16" itertools = "0.14" flate2 = "1.1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +base16ct = { version = "0.2", features = ["alloc"] } +sha2 = { version = "0.10", default-features = false } + +[profile.release] +opt-level = "z" +codegen-units = 1 +lto = true +strip = true +panic = "abort" diff --git a/README.md b/README.md index 952cc81..cc2c5e6 100644 --- a/README.md +++ b/README.md @@ -10,12 +10,13 @@ This application provides the following subcommands Usage: fastq-tools [OPTIONS] Commands: - info Show information about input - scramble Scramble input data - help Print this message or the help of the given subcommand(s) + info Show information about input + grz-metadata Show GRZ metadata + scramble Scramble input data + help Print this message or the help of the given subcommand(s) Options: - -i, --input Input file (optional) + -i, --input Input file -d, --decompress Decompress input as gzip compressed data -h, --help Print help -V, --version Print version @@ -44,6 +45,24 @@ This will result in output like ![Info subcommand](docs/info_subcommand.jpg) +### GRZ Metadata + +To generate GRZ metadata for a file use: + +```shell +fastq-tools --decompress --input file_fastq.gz grz-metadata +``` + +The use of the `--input` argument is required for this sub command. +If the file is an uncompressed FASTQ file, you can omit the `--decompress` option. + +![GRZ Metadata subcommand](docs/grz-metadata_subcommand.jpg) + +Supported file types are: + +* fastq (full support) +* bam, bed, vcf (limited support) + ### Scramble To scramble compressed FASTQ files use: diff --git a/docs/grz-metadata_subcommand.jpg b/docs/grz-metadata_subcommand.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fbc27e55c9899001218f2255b27b4c5c08a70418 GIT binary patch literal 25777 zcmd42cT`i~w=Wv1(xoF+>4<72VQq|�X;+8Qeg`meGx3~N zy~50E_27i058uUSNgqy1U9D{6xBh{bR(t3h$-*iiD0Es_MpjN<;hegLrk1vj?zQVi z#wIsR&1`Ppv9-H<&))gbV;5I9cMrd({sDnO!6DCIL`BEEd=;CV@-{Us{ar?8UVcGg zQSrx;PgT`5$XZlgeM5UkXIFR6*WPc#BR@yS#wRAH<`)*1mRDAPt*sMwcK7y)2c*N_ zf9#?IF#Hd*{@t?wi(T9_yXYAi85mFev5SsAkTw{&8JW(iGV@%qI`P1VSMuVslYCc` zK2)}`NU2%l`5*fJU=@&7pO+#0G3{TL{heWv|Bo#DcfyyUxV#9S zSo!x{wsCU~zLbmN%2aHJ7~TF*WmqD);Z5ZZ>M&;)9e^eJ%zn8~l34^HnBna<;&F{8 zWHqN~@@>iaveDX_g=uv5Lq^fByl!nPbwjnlBpFc*=CqyjkwmnFy=nCa^%N435aOTF9u3h|<_ItGB^P3)Z?yqzY&s;q1q%O^H zt{m&|@QV~lXW{Ot86rz{<*Q;X1p2@Y&v_NwSuO9>&%yqV# zeyJ18GMPG|UEDzsX^VS*Wxgd*5cb9*(JI%IGu##A`Y?YoFGgxFeA zGNs-EexqzS$`ZJB0dYZgrt6yM14~Uwk{$k#%C8j<>d9RHic%~_M&LUiv31a;-%r7z z*GW?0(+87N&9Ynx;q2yLr6I1YwO_h{lP>e=9c3;i)5icrB=0`CX*^c$)m|H363Ur{ zAZ2U4;-P?oA=LNw4T3WX=%8ous~En*bBP&BN_`MLWY~7q$&XZAO9^%<2hG>As|`~m zT=PhBUo;=+Z!jI_J(5(4{(99#4LXAopG1oobHH{^Tdj4VaMHOKXD7?29uJ0@N>QOHAguMfIXJ%lA*yg)SzMp6B z9RpnME5bW7QPD6l=X}rKF)#1vH2_@`sQq54DW>cGwo{MQ!Fpce2+1W!?NS>e&US!djn zumViVv5$^Qxsqih&VxeS&a~ow7J+tSxhuwLy8Ha2%vHNVLodX) zou)(m=0;7ZHYR7Hrzp6m%snLeZQ2+-x^$$vLB{uEH_)!|7{E*+=ext0Dzu(SbYBC? zb(n^$SBI?>G4%p3m{bRBH`06=Y>ertVu71nE3gE?{e^JmCnMYZF`&{w7}M8Vx4T=t z4Qo`*JyOgqOBt=$czi&M2Tx6|9|Ov;`=rewyFWe__P_P9c5{jG*menga)PW>;nACQ zy6_n&G2ere{TT3gSASd4u&3HvXHYSY{AfW%aVVt^Q|+>wP(P@;mg}2q1?@m~}AK*l}2GH$TscTr@C@0smZN4+33W*G0?S1ME&IQIz^w4Yq}aiw zb=OS-MJMHo%Z|*#-ej*(j0n7?)tlV3s=d+x-KwlcO!E;sf2$`V+`5%u z?<9MK@PaAeOU(&kv-t}pV5{NzpOQiP@*p$26uk$Y+cgoK#w{sDHZsAlyr z*wzA5$L&^-0V5`mAMJ+iN)IJo_yiF_7T4Df1dFB^1Q1e~L(h&rnbt$+!>>lxEDt4$ zW&U>Cx_DNJZ&RT8HsXVFRIqb1sq{Jay?g70}WG`gLi^M1H5K^P&(b0)CMnknh#pL zySrjWK|86GO|xaIc*GDMUYhuA>9!s47;sYT8%6yX@M2)=gSqi2y6PeF%we*2%s`-6 z?MHboSY7!;l!fLvbO*g|R0BjGVs@_>@!6cRGU5aLe>tq5-%!B#2EAt})MCt4ATgB5 z>ktc{h@iyWgEkd>wr?0K0Nz>|oe?qOyYpo{#Rmk`e0~!P$2RiHQM(Aw3*0n43MDQbT&|ds) z*30S$_O}STpL;|X{PHV*quOe^(fE7!EEhAbaNe-KixX=1E#bpK_V!xR4?!#LOoGw~ z#9YN8cGqup=L3oEy%JMfTZw9G-Sf@7y4BgNfk zZir--Vmz2gbZO=wel+R;l|tORT8;tQohNnP$j)+U?1g_Q=`E(PRtnoiAt9i)mu_{G8!vpEpv%70Y1%YV>z=9}R+%ZqxV+|#Um73E6=r_wN}``qEy6qvma z|8dr|=;CuWpjb+uXs7eh!zyKoj|_mt*!qtizV62W`p?Fx^AM0N{!uRH?fORDqb6ZK zyZ#q9a9$8|kqPIVy{2*O7FDX4$-ptt>=^J=f;vNz5Vb8r z&xZj!PFM=(peE5Z=NEq}p@S+`uE++^o=oj|Tbo!{m||HX&4IUn8aA94B1=%tp-I=$ z$tJi^!|HGhvgruBceonXPdUUSrXe-#zBG{h`O{{lNqA>bs;eZ{p9xi(&S^S1pkv zE6sIj?V~w3oRh8U)uDuCzH9|ZY4Qm@;}ENl9^+Hhm!TEPDjZD;P?oyXf}I7cs081j zTe@_u6=e`9f&iMVf1XRAc^=L=MY=QLcufhp8eK8Tc7Zd>&@WgDR?j|sqWglb^>gDU z-N(hRJFii>g{VTkg$8BKuxFt}I}};dkBaI)<3*m(>DI~ff2*_AB5({)0R3nIfx^Of zPOUEQqEB*)Soe!a`8TT~zrXF4Q)%CvP{a*Nx$v}~mZ1@Mn zYO6>I+Yvq8o$Qk9$$E}#RgIO|hpnRfJ!0pY3ku~0H36s#+i7OJ5@XtJ|aj7W&6*bt^VA`$O8J66uyEY6}h8Y z2zRNag#krGHS&9-V&mzdY`P%_pW)^WkhtD4AnHy5yj_)ONV)^Z$EXk!r-*QuBa~+_ z8NIeUNX|IiMM)?<*7=s{8-U!~8!e?IAEuuy=}WIT22`OnsW%9UL(iy6BTZ6Vue|4n zAvN`zV^ovYxi%e$dQq4Pw*uT@Wl^8y;oWqDS5fMV0=$ZP-urR zBDP$4ZWz}xs#I7ozN_-Cr0V3Gro>(DY}GbINP6 z=@XwUSK8x~-$#90O6V|ja+AQP!rM8ZNAIL*6YP2nU>(~Pw;ifH2Dlw1gKM0rx%vYb zbfPyrx-t(xV%U*jFt%YdtlYwbAzcNH5_mz^RsAB>u(xQBKzP-`1 zE^%z@4LB05*!9p=#xd!_)>Y7lCOf|4y=qIbTQ+ks35Yh@_DCszX_-Q_ff)`zph3G6 z%svqGpelUCBFksc2W;XYoSg}8E#)o@0@;o>otAr^s_Z?8hB8O-@1Et+1BqC76J+%w zc3;F_U1aO>wLg(qlSsNFT-*r(U(OuwvTZOUshBofTssn`8Y6eafz?7g?6$=#&iIqVhswh%=j zYh%{aqiPYI=OQonvR-0(nt7rf!9z-K4{VXCMph?YuL+h4EybCQjr2dgyp?o*Sjw#J zp5;V`#5PiK$2v(i0J#GfR+ zba)U{acSH@(Ibi4H_I2{Zg5(`prT}Lb6~!YXFU$$!cZzyRh83oF;etRLFLqiWm~(q z_mP&^I1qG%z$b~2O8IdN*wOFFC~o5tF?>!#7jllLa^yCPNbrIVVu}|h`vL}@{gCz! zXK5~GFVuPb_Pk6WCefBY|28S1^cSx^>C?C=m}yO`5)rL)+^>9FCkEhtG#~Nc>gJgY?u3CUOoylgiHlf@9Kj`hjtwe2>$YQ0 z@>qF_ir#UTGEU0Yxmqc(%POIK(_p6H!|AW3?s)U=svvbz;ZuiS3lp=exLZg0n{Blj z)FJfXPW3TBNfUBVaqD{vKkVu#4cQz6_`tpM3E?|aVL-EHWL>DKqtiDL&(>inZfXg` znXhy7=ZMen9X(Wbl9AtH?^CW*L5ViwHK)Bcy?JIqagK5~RNp4MKW+oOdw$;M`HQP} z7nk|gZI@R~ON){ZyNPgvW}Xl?`D+;+76;_bslzZ_`D(;HlDRD!KbXHD1X0U(-!+2z zJl&C`JD*i68GWYl)aNJf-_Xf1WR9Upz=Z*oR)JZKrXNd4{Am9glw)bwI6+F=H|n9% z3ZH?)yRb&nt7=J3+n)17%%uGJ8zWD#Jin&Wa%dUh$>|M_yzcQzaL$7yO;BwI#rOY#oF6;Nu7?|sr}xuP2Uzcjh+a?GMJFva{DX}x*fA^glXAx=kU1SfEP*J=qBCG! z7H_UcEy3T#&tK6N&21g8+xUCDx!24<<*6WDDLV#KGa~SxY5UiH*7@5c(GVw5WloHT zkMv|T1`=>Xcp5M8bWvyWo)cCcRWXBF;F~L&7q5U*SWJLud~<`uS<>_N--xK$HVr*! zwZ?w79dWVW4$Jslo)(0)zivJ;d7@YVq>*MRZN*)Rd8;M@-3NvqF*peqj+!B27&R~z zap+zRZ4T22rNv}B%9%7Yln35UbB$3h2)A9$utXe<%YatXsM#VaG_{&Uh=Rs-W_cs?G6Cs065buo6tePZ!h?te%vAV^H+N z-MHW=u5?)m@8MhVOSTtD&em}D-UW&3s9~O8_rg9b8_lR%lle`jC#;fhpQ?9As&k!D zSBV0jLc%BEU4lD7&fun6gw1dT#9MsW&aNtY3a|dM6aMM)*NfVK*o}pMT@dh;&E zSe!o8nV^P~f?10MKsS%-cUlg~%Iz)ToYxj(mus2@JMB^Jy)J~;8S^vFtxtlCmPTM! z%m{j*t8Wsb+CJ_}oZ`*uhni7wj|8-(nDhLX#X~_e@R;V}^4LCLZ$mMYviVWPcshO* z8=mV%xI>a!w8KLwGEzpzfLDf-3HU$Tg?U1($OlS`TCP-kFU8ozdWH zrG%~UQLZV2ei3)XCYeGbS$0NTupz&#F_v=kO@i==%G!8+Mqy$C6Kywp2N%mU~ z+a)?i!6>3JJ^Y=&B-r)gb3-re4D{146HdI`zZ7~UZLoV@X}JzL{Pg*0Q`t0!Q!Fnp z_6TWTmw9bqW$8cdFqV1A;7}U1 z9|P`IAALUWbVFOdH1&*ko`1a#G2j>wIkg*L3kn^aq5p&Ej{y9L+Od-#Ly8x{8dAjm0X zu69AJ4JSj}-*Cuv$jFWEU%Hghr1Q|x3i7$t-&w%crf%idwPe0DJ>zqCWP-ajA?oP^ zauu`VSFS4TXjL~;?J!%^^59GgH=&fIMdeA#I|g(jHoHj@$ZN#sUYsM^C|b?q-@@;w ze*}m&Alis%yyG$8VfWMpJTFG#6~bL);SHu5?ur{qJs6q{;NQjjN{HHJ{{yxn;|@Z7 zQN(SJJT<0#+;UH-u~;f!!P6?O!MPyV-`ofa3#-!aLE@oOTLpE$AIKr zct@_-0vFqQxL_#bPh3m?_?QnElc)-Y@v!Gp3Kpd*+r(PSQ^QY^Mkyk+cy!|jd;v&w zhH*`OAYZ9H;hqiJ-i@b`>uFh<S&J=z$w$`50E`M$C73c^YG49p_W;gk z=wBl~X}Idhrc$H-v9ahJ_+g1&7VAZ`NI}J9E%v=vcwl#OkRFvM>PK}!*gGt|=ck05 zCJn@O)zbKue+vVl6enRSo97Cs$QumPWuCjd8@jnJgW2fuUoG24PI3CfJ364-| z@(j-J^_(xw7@igBEl3bHezMHk3F?=J)n6b7B)@Q#ESLMM#Gu*m518co8!%a>#LzQk zLOGXUv!Gn@-okUC)KLZ|STI}kftmo9vOnaKn~V=;Ru4{~cB3;|OV2TCdx-g98QqDKy@Bt6y_VZ;I z2VPDID80Em7cYE{R&zo&j~xRzpr(-Jir%y@Sg`qQsoH{T>r>=<{Xg4 zol8F|Q}6Mea=<$&Ac3uYPXe@gxgpY-I6RBjm@ z33m6WB^T7kUU_~ttN)v-lv@|SqFHiwP9|9SHRv~}7RCC+=1HO*)x+?&4P0swiSI6; zo=)jBbY&}$`Cdi7_ARdAVIlEcO8u97>5pf9>!qFy1wSUMQ!f8hf2kp)j#K)8u%wQ? zW+pJ_rdxO}pN3(pGdk;b+Pu65PnZ#dn$(Bzi$G+L8C#ftX3d~Lq7?hMV;?v&je!Z7pd*`0=H`lz+4}~s^(y6iTC3ppG zQ~7Oa8SlxS30kDSL!-z<(Nl(=lSf&^nX*+Hpf4WM+j3nUoQBn`THo^Z9 z^hV59kRS&Ly&qub^?$;%2uByoBJk%r-JJwRkx*vSxn=XL>IzKT2P4?m>#xVUe^xe` zzMj*cmpWC~PB8PI4z8KR_#bUILOY)n5A~JPX74u;O9*E_RO=KM{7F$JvXo%LGAceV zpi$xvTS(HUO%6@FO`Ie3o?ZQmgl1#o$3pd-IT*!aIDeP6kk!DN;4Fh4A}xE=nmmAy zz_q?jQi8e-?7QMRn5d%eL{x}!QXoDJ+s-pv@iKEtLRZWY*B>vidh~FJ_@y%V%v@gi z?Pm=|L9Z?SC(&(vK#+;iXiYoc;ZKRDXnwJ#3YB`b$PIhas9z~9_8^7SQSmG?v}3)4iD za};9)i?-k@AGRr=io7zqSiBS|C31GV2Hqr|KOxvb+bo9)#N)9^7-ct_|T1bR9hln zQ0?z^_kD{eaxeB`a<2}ERekCWu(`zVe5xHV$MWv>;O7eM9{=5$x-#|JQn1q1Qx;dP zCuztig%%S)hq<1v^GbHN{wKS#_J7B&ta(#U231?);|-)6V@#35tJNS<8YG}i`O&kM zjj~F_pw1@!qu+Mbm>|El7CF+LqpTuQy7AANZ7!1W>RpI{X7g*x24`hPxy;eG&UOPy ztCb5=&!49lXXFRP*pfj1B6=_V_eAdx>*_>}hCtjlWCkUsp9y-M%{L7l2O=6tGD;z{ zM_hcs$Xqq}3EGCWY`CBcoJ=^Ng#7{3BL4}o{2&xgY3XymY3Ma)RKU=MY3`lHaQI|J7ViRz#35vL*&xq0!>aBq&KR!vnqdFtYgW}w5JPYzUqc22@@%&i*`Ia{ zXgYlO^^D-*;AXs>wUx!re%0o`%S)fm5#hK{B|D<+&-b;%yk=QIf;fV2#2)Kdg;m4M@JfGa8aU;%{E>+}sF-qv zP=sEDPIk1}iJuvRB1>@kMkV%FWcF~1MyyklkOXtysMFeFIz{Um3dR`qmc5)~z^Qm> z-wI+LzN2p&`}^8$l0Uw$1HpA3;tx8K6$?3N@BQ5Mr0i46C#dwxKE4M$ye?H=RCX9^`JKexvuYoyw-9R zbyE**mJ>E{8w3b?2eWef?BM833N7Ovh99c5lEz|xgRhfJ=0y-Ofx6+P&UtGsia=J2 z1)YH7LKEN-NMsJL4U|_zHEPCdAwp`5Iok zF7iI(K{bbOK2BS?>)E;1_wpCs^Kh6U3l>tRjscoAheh%o8FFE?TWNHTY+|lkMG3{c zFvz)EHz4IbmJ|%KLyU|J#jGLbeO! z7qg+L$iQ1PQSao7g?gLCTI{Ugk*Pycc@2D{d)#wcBU~|eNpIKb{l}?OWLr#M8)`=t zwGG4U=FZSMWhzRI!{}r8zlQ6X2)6&2m)NXlOXM>Ba8x`f)aUW=pda%(%eddQLI>w$ zzSO%M+$i#G1{JP7R21f618JAdJ@QZ!{6FjhP41YAzs zf70EUGeHE{hJ~PG;0v!v3=5YYQ8HuVpf*=)E{SZu@ycC`sr2(e9DFZFcZqA^w<&58 zk+e)DDmX8=+QT0C+3yH-Ylg9pgHb;ey8inGN53 zaZkr>cs;xgxCGn1e7d4Q0~iBlZ_>fXMDbkO_57o*@P+v%#aohGWN@)3_RB=H4kjI zG-H2f3ndSwosXo0XK#J2 z$b(7Y6NWk(a}4iP{&znD+Pa%H zRnnXW14<2N27g+2`3BM}{1cNSXFytfv{FoNOD5xkfp6RBLohAJj7ATs+YRdmMl^A7 z=5WQV;`3(K6Xp~sH~q)Zb@viR?3=!}*w-cY0pHT0q{~guq*i3>XXsaGIR}Gkw|h?S z4VAeSWoPTbv%r~;cF3Q?Xz|dn_e0IU{u0IkE$Ddf5M+bPtPbLY^X3FsYzcsoBCZ*N z-|V;#-FG51m%j8$^B{-T*5+_(b4p_s3o3xMTOJ1lF~g37?^tlz^3uAG(&C4@3orA@ zO(xO4d|auOJfHKXsz(T;Wd`olP=}8l8AdcG3*2$TuPct9CE-NDY$h+@RZGhHmtvI&zHeBXvG*NSY)jHr@!pJ5qeK+bpGEB zKqf%Gm~p=tz@!+_`hsi|`OLCl2&otm{y$}6Xf=y?ZD=L_Z2I)03q z&PYVjX#avX^{4sS-%niAdJSDB!69?UfO9kIbs7p9U{(I6N!g%bebqtDPsIR0xXaRN zWc`$abPBZYsX2A_W+ghqiMM4RqFV^LYrb6^WHXw$E zR~H2pq;o4M6^D9ZtGTqcjvBcHb@U%dp#;@@*>Kv(7zO88t5U<+8U-Yk^;Ji?&|mF} zUL4X$m`{wQ%HR{Dx1VwttPkvk_P9T)L61P7R*xP{3`rnF-erGe@P+7e2Jw$t%)>e0 z9XbvhnUJsOv4wFZrEU8SbGOIH-7|F$_|)Rk?nxbuS^j#vDoU^*%phVkfRVHO1m)Cu zkOaqi^7=u;CWHpTJ;wdvF2kWa@6U+iIa)?|}5;s9e!+*$V7a?cLC{FcBwP z^i1^yDbnbob`y=^I$5=${iCb0ied9h)-!LAPlSWl?u+ZmurK7`HHX!gv}GWD*p#rT zC!61$?0Vo0Lt4BszVK@2SvKdY^nq-18kTr-Y?b8IB7^S zNAD?F)dY+tq?Zk?e>+{A8DB~fdu}zXFGB-br-ZepDkvt!9$njen4?u+pevfi8+~p| zbt#%%8fSJ#p$i4E86S6qGgs`abH^8?)_MXvC?k7zzn%mf10-Pnkom6fi(}3L48Sny zYIowmlQFMCb`G7y0k>9Zrt?<`C`R_jZU{b#>HDCo2kk_ZZL(V+L^{zkv>4k9O^icgP$Hk4njmDDL}%-S(B$4`atnFPx;}J%ID)+ZPpv+S&`3`$4)5VZDtJo2;=(Cr zO-K)Z#Q&_g3(}*PvNTS8iyfR7s+TWM^qw*@Jd#mKP1FI4Z}iQX;N8$t9l&r6NL@7LuG@XZ!^Cc~ zMf@U~D#!mTB%RiPz}ADWRqVkBWH3s#C3%`SRgNvE-_h!iNFW-7F7PZdLuL4Hr!+Zd z1ZiK-l(DRl`QvOWER=m93pbZ-*6}4D_a+nV~iO+$MAegyT9~Sfx&<*+PD zCgl>7Q0<5#oYuFKLJ0%v5Q<;Zcn!rLFol9P=cAlr65Nt3>JvQcnx%MSXFx~Kn5}wm zmG1VVMfC=`E<+wS@l(zC=CaX(Ofih9)YR&M7{AM;n1}V>24yY#GEVyzU3!mq@c)=O%u(rU9`xvJW|nd%|kbUxLC+&rc`9k|s(GlXc9yTl9lyohiO zcrvelt&Q3SAy#^Xv?@2DwM}|Ov~WE_J^bIVo#;>@-T5#hMeB=fq;ju`82ER?Yv47l zPHNSLk9xZ`7ZJ@4lD{%fSpVOsK zXsO-q{^s_y+4>lKG=w~t9r_ZLME_;HB1zqYU0?lR@wO+I(gpA9;oyu$k(O6LcJC<*24^EFmA@`)y% z*)jwJPi)bx#V`Ew*-(w6D{e`?S3`EG^VGDCX3p+eskzTXpLR9~6UltZ1v_^T4Q{V= zk7*~=R0ZAMIMHP|^;&t1yvY$RDKP6YB=_(zcxM+I9#l?ez*m;o{-_?JLcZ4ZyR3R6 zDc>O&JejG_SZaB@D%w4p^68pb(h}D@;X%Km!w)j3;!N~(*7)Z7Z7D@#s|^$_Gz*L$ zH5aWK&=%QK;sttM#L@n2QJh-w2{IFEs;guJ(6B?9myq_tqC}lV?{M%gAYQ3+opd5O zjjaz`aN-J*aL}h4o)tF_+Q9=NF>~XNuzoct5+1ypg$3+JcMvDt2{=b5td7yg+agRkc@fLLe=s{L`!Zo#*W>M27_c?sDz~U5V_`D*NW5&T< zFvJo0a^ybQBxdhWSrT=+U8&>pV+U{P3$RGjLC+z#lBe%6DRyXz#BRv;$Di5O=EW4Q z9q<#oN7VDfyz`6(r=#1@s7GU1t9o>hjfCr60qs1K!R@BuYIeGV5aOa+ned7&d^D%m za_|^1_6hru)*#)`Ic3u zuUDV$r^0KV=IiA1wE~CV`sY8p0UnMJ z6VlpPjC}O+oh_18xos8z_+2fFzsXxs62njX>U6Ak%*L z`&^mV8d_FH=iN30Ctm+N+U1FQH3u6_pf&HjoMH!afZN5T@T0SpV$FNMU*`lr_!OrV z>HhGwCXeDZ`6NUC)2$Oh`nzqx_%w*x+&9wf`c{ld!TYA7`iCcQIIZ^G9R5Mtm1FhpYQm6m}yRqYPEd zENpy}Lu*s8p^70-aT7i(2TGI3wX&m-bdAI~;S(>L!H9oK@M{41X|3pTvJ zbZhpUtz}8MSo}?-AJZzsQW|2|9ma6w`@KyV2S;%n#wh(7#A%jNnBOIB!PY+(X_)CO zathQx1zI_0;ipOwCBRy^4P-17EYpe3!HLK=Vv6@4mJ7NYfB89?wrCrU4q#TQgkgIT zAuedw@t(l)Q{oqKq4;M(_b+bpgrSbWj8IH2Si_z$?*HqPx zFkN}q9en>Cc%fTiP`!LQcZwJgLk&m6s3B{f5(4U6+(DLu*gJUp zS<(I&)F$d4D zlH|KDyG6c|m0`_g2m1Z1q*8jr=ea8B?Q)$Zg$l#eeyA$3&R`hQ0a1&CP<>vWf(KVa z#XR%>TV>`evYv)qKRQ4Y=W~hj7o0 zhRF{!ozhVm=2B&2vlpu8h*eBr*b~e=T^v_N(V`*x;q{RXNpmZfXYYfVK0dSD75Q>H zH9}5XyFv|2=Q%KELU{A#KT4z2`X(C#XI8a)T0H4L-JaJg-aVY5>JKfE?^?b6;L-wC zR5?Pz#!EJLV8)Jm#M?-G#nk5NPb5bGq(t;tIb@UjVN(7R&O6cw&yQywb$8e2=q+fz zQfcp`0p$6W|G;`zbA&4?~h;4+DwYNdSDx1yarua0tsArn^`H}pTAx}jR7n?;u1>ci` zyoKPKNiXk1=2!qvuGUk92QA;;o<=x^*=B~#=ucW%d$a0oK!?C@8xExqV+QBl#3k`L zIW47g$Y#Qw+!XNy<}XtzWepud7}4kn*_+=G@=!G>;H_+pz8;~+6pwngT~nsjKAzUl zMNCpo@_UlGgqR#TAA6@bRO)m;26$p%N{E@Q%a09j*A1N8CYg4>K~EaGXrT#*C#EdE zK#Jhi(A_3lZ(W~7R@hn}-lQ8k&qXpdPT?M=e)R{Sy-{Q<fRH*lA^Ik#FAAUGKo)IPE$EpXNLCP9{Q znp6`Sc+#XcH`oP#3Lh|#90?b257(f6{~;ZVE4Mc{#0}`H-|7;z{9KIFNt(EECCV-& z_4MicimZKehOhl4*yLW`|8b|LwldwCX&W*Ot?CwPqMF~3-l5t=W1{uxP3g;iKDXS# zgoVS4SHGKG3~+~{mVZ-$m8<(32UtNE|3{eoFhz9s>qmISwnSo7$vf`n%# zuS@^ZBS5YTzcF8YvZv91x86=R*Op6zS`rC%V8kJjDTuuB`-fp~Ma#_4{kX-^7a@GU z2v%>x+YmbZlqY;jZgb&h=lxj-487lCs?gYv!MeDs4^26P6FV}C5=z}eG>iGiBA%Xv zIuGL@+aF#^N3CmeW1o?~Qy=^jU6rdGbIvSwl^Vh?QX-2QCxDl>Ld2gxm_w47N`D1P@T!|T`dR(- z+V}qwW@zma*Bd!+&+Ly>XsY=}$8Pt0*H$?$c)o%U#)ZPKd{`(2a(D_GR|c2#a2yr> zED04I!%G%G7_FI}eCOW9NnD_bv_O9?om%qH;hWA(!v*{(Rrvw+DO&hYG=4aHK$-0s z4k_-2C?Cvs#W%p?YMRA=W7~V*59B4XfQCL<$iHlD1=lcs=8Ta(Z2QUcLGc@J=kNO@ zZmk%LA&R9)=c{m~rrKk`EhR^`t!`}7AkLqA$VXX@~F4(ZQw*3?P8Qs2I)H+dms`Cd~swZGNw%7O)`!+^8$2lD7zr&&V{ zmLqStY>%lrqbSon&hFJCo~WG{9-@`gZB=!wewl}{|E+?@&T(-&hN_C+iute+E|n%S zph+<3DW19Z8RJ1`D{Z_3_g8#Zd;Zmq=Ct?Py`Ke{?u~wigGw5^jd0C7I@-v`p(@`@ zi+8Ndml8pv25RnsXwLJGM<|YkzYfNz0v5~=z96&NFl+bg#?16b?`7^k)sY+Ie4ok! z);_9&D}LKu+97UEHZ6Ml#fP|Nhm6}SY)3gs;SHh+J$1{PX#$Ur+hSz{uS+~bKCgf5 zPgpWr7GgP-3X{kx1OuPVDyOub=q_bWrNzWKhpd$H2^POdjDDiw!0rW>9in%S2Xb?6 zS#OJiY78J_rp+_DFPIl0q|<*rNRYZ>Ko7Y|N{D&I{QeA>>*y7JY}U!52a$saU7n0T zSaaC-Ai@8WG$ea`geDEC{ZG=631F0g0-TLT6<%++t7}LM^+33jZ-NP%D9MRlRQUFO zr|h(Ai|T;eRNO)CF<|@`7CZ4G%U)&Uw|m&kAQ+c&=wyeGOGmj3tcv($JCEoujx{sb z5t@-Si8x14fRdTZXqiUY)s+Rz!E-O3YaW$36H`}Izg}RG5`C70(9O$=O`Ag~Hc-ny zhh3ZBew9o2y@^-fJ{FmR9i&sf4u`fWNJdCNJFv!7fXx)nVq!+ z^m7e~7aQ&i7I0$%Q&&p_F+h{oP3>CQG> zkIZn}4_Z$E<~A8P_3#*w`1n7IsytNj9i83ldOk(rhkJe%=MD*e52Y@i_G=`+&q0S` zGzN5^q^tQ-YEWv28iD0&Dzfe@X8I*cXo;wB&BW4hw~hN~JoXip(LGaU{1r}J$?wWf zB->JDGqN6zDj*wH?jibh3L24eXuKjX@W8J zaM35aZBe_8xmJCfC8*IHe`)BoI;q@?qJ5x}H}DZN{rU0CVqg1LAsypukv=u6j7tlj z3bnuSBx+Rn|C;il%(jHyy}h(3uLmKOmQd53K-p-F+)h)40Z_$zAZt*Q2EhL4+Mj~) z&!#InfTWwV>wGm35Ls5ynk!4{C2ZUW;}ID_t}81=M1*T2@yVp5no@W5j)x5ycQy1( zy=_P9c{QrcTyD4ZK8)Zb6|1x(UfQ8K&yG^NeB`1Yytc3wu|>!EcHb~nPX6Az;&sTU z{y}q|*^u2ai~zfef|vQRVAx4QZO*(9#aua+VGZ~wGl^_a z=&8|e-=f)1TvN#4P1?N(P#aX#rLka!aq4IQC}?cgBze5UPOY#>E5%e;J%0Wh|4Y=n zTZg_6h&Y*(gGP4x{^PDt1k_3%<&w54V-(2p7mdiJ0%&^J1NdBD1;0IJ%}!UN5P{he z77J-xjb?vVkL#p*?6g7@vfrp?irrA0DPzu?`Ha%1#{S=`y^vQ3T|C|fLm^P5yH$Cdg|7qx9ERZ3>C_p5w(AlJ?zT& zVg&HDKu(?R-}3PP@HAo}|8Gl<57%SA-CopiX_&6-?KU$nJO(Jj)2Bn7(&x%Td(hRE z?{YSL(OuV3QhQ_8w~;1hVm>Q|MoJAGX-$70z#m1@0UJc)-5yu-F7kC$J3k zNCvg*j?N$&{1Y{mUD8@O>v&TrS$*x*{D9=PLy8GOAm9IE&jaoZ+}Ti&W7Un)WDCcaK~WGuFo;VEwrKU^gq;%xu8ES6|w%(m+>xWdh0%qqgqqQf+La zDFZA)m$zaAQ62!Z*J+)Cba=Y~@zU1C3@C?@tpH^|>P8S|K74q<=&1ZWMGnU1WL6mG zqCV`=T?JMKtT#3Nuy3ww6RBys%b@~-T31YLZDJ(wzidxkLk~Emj+IBe={-DCKI3V0 zh1bxJXtiPw=vH|2bR9F2AX82mZ^kBhPf(|4O(UM~=pSEhwAE-2tDoy>G{}8gYI*Wj zLT?il+q5{n%zjmM(_QLAHT1_NkZ#FWk?)9qX z-cW<`Sh`ckouT^$0T17&&IS&A_06PLA|pq4Nr?A!sJpnZF+<^n!H8)(Y81WnXRX|4kCA*zfDe=u@n<_AqTQ3FVKHo zi9fXA>a5b9E6gKI(nVL~c|xS7^Yj zp3&N11Gi*RQ2`LQj-5pALOM+PaDIwSWJ@woG~)fwa@(15g%xnk{=Sd9>Ivetwn^ty z6}vlcf;D)YM|n3zJt4kQZ4_YWRu+W)#*|H{!%lXM=&z~PoBC4gB(t``%G_NB3~ty+#-8k7N+zk_*ilflO|BZ2Uou6L7mZsOgI{hG4xVmIygs`(Eq%oYH?=n|Q*v}}^r~no=(0`WjV_MI zSvnV~RSoMwb@PMio1s>#-mLcF@g+I(|He8gC5^~)ee82#MX2+e`xcQ#woo0))n+?t zTwV|mw>O>F2==Muw(?X<>Wp*Bps>T49|*A$W{IccoCOW%?~@7v-0<|%l>OQh#yWYR zB(W#_?H`3T5}HKW#|ifa1tFqDtSmR|<=wjqlDFI9YgXed=nZeKW_uqb7c1}4cU$~G zM^MPuC#&O1{NsXy(|VeQ`Pc1N*G|u@6h5zrag$*MpYtvmp4>m(+bC{xaZj|*sRf5% zMGNP=O-Y#=-~(o9A9{MPQ~UbCi_SL91htyxeY(M|J1vcz!|UgZlSdQp<}ttw`>uRs z>j5X8zR!+4XGY4ufs9|yg}oLrAgw+by3?JKLhhN>3CE|dA__1)SjF6tRpp44_TlWN zo^0~SpfZp*R;QakWlUvVocq-9<%v;aZ?dGDnC ztZOW1;8%FvHO!C)f44v_uGU@X+~v9OG?SK*xdAQZ^f{S*Gpb`C6P@M{ht81wvj>fq z*Y8(FOZvB}Uan#*thCb(*EZfX;95D$n3fmGGO?`%nqXk?>aPqRcAO6Yb+Ks#E86#x zu<{N0mRxqCpdssn6!Fz@>7;Dud+VJ~;fK0<7Dk zstSL@!`UEA;fCaf(FV; zNU8))t;iX!XhJ-Z7u}w(-&TZkfzU{=zky)&HCEkD6j+lldgse=s-hwT(Z24_D zQ6Trj;l<782-*cH_FxTbbFmFe^xe*>YFU2|VUzv{rlYzqYlEnMHfocW$811cD?qd> zigqi=-~o)8zF4NY6zI2fBW43T`zgSfF2_E=k?rnu&a|y6+_o$fN4|8amgFa3M zo07yH5WQ5usz=Q%E#REJJ8peSXYYRdifH$%NEJ_ZZ0DDe3AvapwX_nf{0AU0aWuuA#pe zSTdeYw^i%azOQtdO!j&z&KPV(nC|Sj-Z=C^HAO`k%qQ9$)+7FedCr6>5j8Wr0OQ8d zQDyTP^KhM*_gOwY+uHlabXznpo?JEy&b-d`NT*3+RuBjYE|d}EeoONe0cK|Zy7-~n zA6`-R%6sltEcP9jEtimP^vBJ)t6qxp3iXDhzo8&QhLLrR`OlL-PJC@zw??kjwgE6S zTZqw6o~l1RK!LGhki0|JQL~d_&o*}XmBM)z7ufB-y1vzCh;>{^O^WdSH8P%(ioK`Ku9_emS7eX~#?>>d2L?KN z(ceH-=_ge;6X0l-u~p*~Q&bGa5(#+PFx;&o!-c05SXQ3S#IwW+W(@K5wWqZ-cm zo-+>)w12$1R8sQ)&+&Apx2@$fZAulL)d{ga&;+vgP>a47YNbTx1sU3+*G3B6 z3qFT>XwG$Yfr!_PLG7Y@s#~z`t(lhGU5bz2yc>35%P~4xj%xc^Zn|j@zASfE;i_HR zz;O7DiWi3|ulvVlVCqJ0p@1c!4IFKg=a?McydclpITG&$AXzS9pYS~Zxe?OeR(!jqeqb3n9d_R#=m7y} zkyUek@0A<95aPW?38G}Wgv~7K6i^?^R982#M3AYgi)4TOtnXLE4FBVbjQ;x-ja{3D zLVgUiO6ZS)b}Z~-D#9^dF$eASoFm$=V46G}f|Ws+w{|w(`SjUH*sN5C+Ci<$@|nF) z+9#G#*3EtK-sqH`<#s;ettXM6y-$9;pi8d$(1-xK#v-EBPCB_5x6)zhR6o3(`DK$o zlPBbWJ^xvN+1MAD&u`0TgaV%>((3p`q!^)4J8B}1Y)JwgN}5R4u2IU^jJ89c;nPBa zOz;h=diEzcz5qbaIqgS?$dsD=+(?jyx!B2%K*;}tk(*Q{Xlb)u;GMbTMhdWjF`eKo z{m0^&^BsmB(E96T7#vXK0!fA(D)Z9e_r+Vn`FS2}pEKw&AE4YeBS;=7SN>aYS(=F4 zVmHdrZj||OA%PBIlzd*Q5W!!N%^tB9cNVxMQ=@oZcrc_#i-xLqr6B$B=(VliK=z}r z2J@dfu|0qkG~bxDiI_M-yF1Rf!`zLq3TUd7ejQ#egwMP}IsRP_hf99w5`_r8_g_^f zyB2+Z*8~Zf_=|Aw(f?1lx5%pDf5f=k^R1pZ__??Z7(E=$N=1J;qaf*j)o>Q69{aU! z_{oHk@LCqzhH5NyW_syKd2(KPd~lhmR);{cjm(xh&avQ|r{eqh?pV!sj2iBqfq`bQ zjMjrp$6<+%Pm4@VII@4nUH7a&hMqnv{gwS{SVIG0qE6PBIK-Q28~Z|(h4azTVTcV$ z9ujpvquu%$TCXzX@hGtqnR2Ucd-WzxwG(Dcp%(Al7HSMrT&E~ZcwHQ@b-puN!LwI< z2Kua&He_;LB7(k*N{4or9=g++UNC@67$V?(r^!H7d^xxlXdqeMwLabTWm(Q0!x@H9 zj>+pyw~EFuYtNOm=bfltJ(3pr7;#@ni*()YNKT$&2QjzG8FN~HA~`yLnFmPN<%uCofKk@} zh3msnCqNneAAOMzi>Q1+q4m?UOnipynp#t8s!j-t@|J zFv*1roK{-8`(D&XWPKWXlH^GXwHVYHXf4Pc=iXP3V=rH>#|6X4&&K`@)y2_iBq&~y z@j7H`@|DvZ?n6>RtII05ce6RYOW{*Z^Vrn^PzM~Rp%r~dZwu=)M*AZFt~mbey`SG&w2 zyw*QY)va)ygxb0Sx!6}%W6AKL{Gvu~qWbe4RQfDi1&u>O+I zH;||7^lgBs(e(#Wqq=d!3ZG+mG-P5IId5QE=GdtUIJ8lI@2g$5p+T7dtHp}q3@+p2M~ zQulZu(1U@*VP0Bd7uEqFS53$Z(oZw5e~0s^WJZ6?-ivu|%mBI!FWThJR6291X&qgB zq%YiZ>M?Ye{S{5EGwpedOxsG9L6|2mYPL$&YEP%NH%_j*|A9{$^B zBt3y@f8HWk#9K#nS0s(WO}wlAh*7`JS6PM9Sv85FoFZ^Q)ZJyuqO>oW?6xiyjea>E zW_)3!F9Pg6phJ8SFFx%0o?dqCB-sV>-kLaO=8xk%P_02%x&Vq%ZRzA2ODWkGR37hQ zJ^6+_$!@GTLN6{8gT@9{EsR6vBA|l-nhJm=XWtdMI% z9pG5br`$3aFCTsF$TRj^EZ|eBkprq%jPMybTzjl7N$T1l`vg4Lc5>tAj&W*-19L8t zF6JrLJ(L1!Q01pSsCkQs%)f)plS}c)0FJq3!o4kjjCuVn?EB}p3ZW5n{bq`7v^q@@%posEuZv)Qyd(MBnTNKFt ztqxx`-+GKI5BLgaz9axjsu*~KRMWr{MP?}FRPk` literal 0 HcmV?d00001 diff --git a/src/cli.rs b/src/cli.rs index 780eb9b..26fa52f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -7,12 +7,19 @@ use std::path::PathBuf; pub struct Args { #[command(subcommand)] pub(crate) command: Command, - #[arg(short = 'i', long = "input", help = "Input file (optional)")] + #[arg( + short = 'i', + long = "input", + help = "Input file", + group = "metadata", + global = true + )] pub(crate) input_file: Option, #[arg( short = 'd', long = "decompress", - help = "Decompress input as gzip compressed data" + help = "Decompress input as gzip compressed data", + global = true )] pub(crate) decompress: bool, } @@ -21,6 +28,8 @@ pub struct Args { pub enum Command { #[command(about = "Show information about input")] Info, + #[command(about = "Show GRZ metadata")] + GrzMetadata, #[command(about = "Scramble input data")] Scramble, } diff --git a/src/fastq.rs b/src/fastq.rs index 4d25d62..b3807da 100644 --- a/src/fastq.rs +++ b/src/fastq.rs @@ -1,6 +1,8 @@ use crate::scramble_sequence; +use serde::Serialize; use std::fmt::Display; use std::str::FromStr; +use std::string::ToString; pub enum Header { Casava18(Casava18Header), @@ -241,9 +243,11 @@ impl FromStr for Header { } } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize)] pub enum Pair { + #[serde(rename = "R1")] PairedEnd = 1, + #[serde(rename = "R2")] MatePair = 2, } diff --git a/src/main.rs b/src/main.rs index d05e3a6..57380d2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,10 @@ mod cli; mod fastq; +mod metadata_file; use crate::cli::{Args, Command}; use crate::fastq::{Header, Pair}; +use crate::metadata_file::MetadataFile; use clap::Parser; use console::Style; use flate2::read::GzDecoder; @@ -10,6 +12,7 @@ use itertools::Itertools; use regex::Regex; use std::fs::File; use std::io::{BufRead, BufReader}; +use std::path::PathBuf; fn scramble_sequence(value: &str, seed: u32) -> String { let ahead_1 = Regex::new(r"T([ACG])").unwrap(); @@ -56,19 +59,61 @@ fn scramble_sequence(value: &str, seed: u32) -> String { fn main() { let args = Args::parse(); - let input: Box = match args.input_file { + + let input_file = args.input_file; + + match &args.command { + Command::Info => match input_reader(input_file, args.decompress) { + Ok(input) => info(input), + Err(err) => { + eprintln!( + "{}\n", + Style::new().bold().red().apply_to(format!("🔥 {err}")) + ); + } + }, + Command::GrzMetadata => match input_file { + Some(input_file) => { + let file_metadata = match MetadataFile::read_file(input_file, args.decompress) { + Ok(file_metadata) => file_metadata, + Err(err) => { + eprintln!( + "{}\n", + Style::new().bold().red().apply_to(format!("🔥 {err}")) + ); + return; + } + }; + + println!( + "{}\n", + serde_json::to_string_pretty(&file_metadata).unwrap() + ); + } + None => eprintln!( + "{}\n", + Style::new().bold().red().apply_to("🔥 No input file!") + ), + }, + Command::Scramble => match input_reader(input_file, args.decompress) { + Ok(input) => scramble(input), + Err(err) => { + eprintln!( + "{}\n", + Style::new().bold().red().apply_to(format!("🔥 {err}")) + ); + } + }, + } +} + +fn input_reader(input_file: Option, decompress: bool) -> Result, String> { + let input: Box = match input_file { Some(input_file) => { let file = match File::open(input_file) { Ok(file) => file, _ => { - println!( - "{}\n", - Style::new() - .bold() - .red() - .apply_to("🔥 Cannot open input file") - ); - return; + return Err("Cannot open input file".to_string()); } }; Box::new(BufReader::new(file)) @@ -76,19 +121,14 @@ fn main() { _ => Box::new(BufReader::new(std::io::stdin())), }; - let input: Box = if args.decompress { + let input: Box = if decompress { let gz_decoder = GzDecoder::new(input); Box::new(BufReader::new(gz_decoder)) } else { Box::new(input) }; - match &args.command { - Command::Info => info(input), - Command::Scramble => scramble(input), - } - - println!() + Ok(input) } fn scramble(mut reader: impl BufRead) { diff --git a/src/metadata_file.rs b/src/metadata_file.rs new file mode 100644 index 0000000..ce112c5 --- /dev/null +++ b/src/metadata_file.rs @@ -0,0 +1,307 @@ +use crate::fastq::{Header, Pair}; +use crate::input_reader; +use crate::metadata_file::MetadataError::{CannotReadFile, ReadError}; +use itertools::Itertools; +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use std::error::Error; +use std::fmt::{Debug, Display, Formatter}; +use std::fs; +use std::fs::File; +use std::io::BufRead; +use std::path::PathBuf; + +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct MetadataFile { + /// Type of checksum algorithm used + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum_type: Option, + + /// checksum of the file + pub file_checksum: String, + + /// Path relative to the submission files directory, e.g.: + /// 'patient_001/patient_001_dna.fastq.gz' if the file is located in /files/patient_001/patient_001_dna.fastq.gz + pub file_path: String, + + /// Size of the file in bytes + pub file_size_in_bytes: u64, + + /// Type of the file; if BED file is submitted, only 1 file is allowed. + pub file_type: FileType, + + /// Indicates the flow cell. + #[serde(skip_serializing_if = "Option::is_none")] + pub flowcell_id: Option, + + /// Indicates the lane + #[serde(skip_serializing_if = "Option::is_none")] + pub lane_id: Option, + + /// The read length; in the case of long-read sequencing it is the rounded average read + /// length. + #[serde(skip_serializing_if = "Option::is_none")] + pub read_length: Option, + + /// Indicates the read order for paired-end reads. + #[serde(skip_serializing_if = "Option::is_none")] + pub read_order: Option, +} + +/// Type of checksum algorithm used +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ChecksumType { + Sha256, +} + +/// Type of the file; if BED file is submitted, only 1 file is allowed. +#[derive(Debug, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FileType { + Bam, + + Bed, + + Fastq, + + Vcf, +} + +/// Indicates the read order for paired-end reads. +#[derive(Debug, Serialize, Deserialize)] +pub enum ReadOrder { + R1, + + R2, +} + +pub enum MetadataError { + CannotReadFile, + UnsupportedFile, + ReadError(String), +} + +impl Debug for MetadataError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + +impl Display for MetadataError { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + match self { + MetadataError::CannotReadFile => "Cannot read file".into(), + MetadataError::UnsupportedFile => "Unsupported file type".into(), + MetadataError::ReadError(err) => format!("Error reading file: {}", err), + } + ) + } +} + +impl Error for MetadataError {} + +impl MetadataFile { + pub fn read_file(path: PathBuf, decompress: bool) -> Result { + let path = match path.to_str() { + Some(path) => path, + None => return Err(MetadataError::CannotReadFile), + }; + + let file = File::open(path).map_err(|_| CannotReadFile)?; + + let file_type = if path.to_lowercase().ends_with(".bam") { + FileType::Bam + } else if path.to_lowercase().ends_with(".vcf") { + FileType::Vcf + } else if path.to_lowercase().ends_with(".bed") { + FileType::Bed + } else if path.to_lowercase().ends_with(".fastq") + || path.to_lowercase().ends_with(".fastq.gz") + { + FileType::Fastq + } else { + return Err(MetadataError::UnsupportedFile); + }; + + let file_checksum = match fs::read(path) { + Ok(content) => { + let mut hasher = Sha256::new(); + hasher.update(content.as_slice()); + let hash = hasher.finalize(); + base16ct::lower::encode_string(&hash) + } + Err(_) => { + return Err(CannotReadFile); + } + }; + + if let FileType::Fastq = file_type { + match input_reader(Some(PathBuf::from(path)), decompress) { + Ok(input_reader) => { + let input_metadata = MetadataFile::read(input_reader)?; + + Ok(MetadataFile { + file_type, + file_checksum, + checksum_type: Some(ChecksumType::Sha256), + file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(), + flowcell_id: input_metadata.flowcell_id, + read_order: input_metadata.read_order, + file_path: path.to_string(), + read_length: input_metadata.read_length, + lane_id: input_metadata.lane_id, + }) + } + Err(err) => Err(ReadError(err.to_string())), + } + } else { + Ok(MetadataFile { + file_type, + file_checksum, + checksum_type: Some(ChecksumType::Sha256), + file_size_in_bytes: file.metadata().map_err(|_| CannotReadFile)?.len(), + flowcell_id: None, + read_order: None, + file_path: path.to_string(), + read_length: None, + lane_id: None, + }) + } + } + + fn read(mut reader: impl BufRead) -> Result { + let mut buf = String::new(); + + let mut headers = vec![]; + let mut read_lens = vec![]; + let mut quality_lens = vec![]; + + let mut line = 1; + while let Ok(n) = reader.read_line(&mut buf) { + if n == 0 { + break; + } + + if buf.starts_with("@") { + if let Ok(header) = buf.parse::
() { + headers.push(header) + } else { + return Err(ReadError(format!("Invalid header at line {}", line))); + } + } else if buf.starts_with("+") { + // ignore optional description + } else if line % 4 == 0 { + // check if quality values differs from sequence values + if Some(&buf.trim().len()) != read_lens.last() { + return Err(ReadError(format!( + "Invalid quality string length at line {}", + line + ))); + } + quality_lens.push(buf.trim().len()); + } else if line % 4 == 2 { + read_lens.push(buf.trim().len()); + } + + line += 1; + buf.clear(); + } + + if line == 1 { + return Err(ReadError("No valid input".to_string())); + } + + if line % 4 != 1 { + return Err(ReadError( + "File contains invalid or incomplete sequences".to_string(), + )); + } + + // Flowcell IDs + + let flowcell_ids = headers + .iter() + .filter_map(|header| header.flowcell_id()) + .sorted() + .chunk_by(|value| value.clone()) + .into_iter() + .map(|g| g.0) + .collect::>(); + + // Flowcell Lanes + + let flowcell_lanes = headers + .iter() + .map(|header| header.flowcell_lane()) + .sorted() + .chunk_by(|value| value.to_string()) + .into_iter() + .map(|g| g.0) + .collect::>(); + + // Read Orders + + let read_orders = headers + .iter() + .map(|header| match header.pair_member() { + Pair::PairedEnd => "R1", + Pair::MatePair => "R2", + }) + .sorted() + .chunk_by(|value| value.to_string()) + .into_iter() + .map(|g| g.0) + .collect::>(); + + // Read Lengths + + let read_leans = read_lens + .iter() + .sorted() + .chunk_by(|value| value.to_string()) + .into_iter() + .map(|g| g.0.parse::().unwrap()) + .collect::>(); + + Ok(MetadataFile { + checksum_type: Some(ChecksumType::Sha256), + file_checksum: String::new(), + file_path: String::new(), + file_size_in_bytes: 0, + file_type: FileType::Fastq, + flowcell_id: if flowcell_ids.len() == 1 { + Some(flowcell_ids.into_iter().nth(0).unwrap()) + } else { + return Err(ReadError("Cannot find single flowcell id".to_string())); + }, + lane_id: if flowcell_lanes.len() == 1 { + Some(flowcell_lanes.into_iter().nth(0).unwrap()) + } else { + return Err(ReadError("Cannot find single lane id".to_string())); + }, + read_length: if read_leans.len() == 1 { + Some(read_leans.into_iter().nth(0).unwrap()) + } else { + return Err(ReadError("Cannot find single lane id".to_string())); + }, + read_order: if read_orders.len() == 1 { + match read_orders.into_iter().nth(0) { + None => None, + Some(value) => match value.as_str() { + "R1" => Some(ReadOrder::R1), + "R2" => Some(ReadOrder::R2), + _ => None, + }, + } + } else { + return Err(ReadError("Cannot find single lane id".to_string())); + }, + }) + } +}