From 4e176e0b2adf366d98297b952d2b46ff6d4cf192 Mon Sep 17 00:00:00 2001
From: Tyler Chafin <tc25@sanger.ac.uk>
Date: Tue, 19 Nov 2024 15:34:38 +0000
Subject: [PATCH] docs update

---
 docs/usage.md | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/docs/usage.md b/docs/usage.md
index d2ed32b1..fa884eb0 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -78,6 +78,8 @@ The BlobToolKit pipeline can be run in many different ways. The default way requ
 
 It is a good idea to put a date suffix for each database location so you know at a glance whether you are using the latest version. We are using the `YYYY_MM` format as we do not expect the databases to be updated more frequently than once a month. However, feel free to use `DATE=YYYY_MM_DD` or a different format if you prefer.
 
+Note that all input databases may optionally be passed directly to the pipeline compressed as `.tar.gz` archives, and the pipeline will handle decompression.
+
 #### 1. NCBI taxdump database
 
 Create the database directory, retrieve and decompress the NCBI taxonomy:
@@ -85,8 +87,10 @@ Create the database directory, retrieve and decompress the NCBI taxonomy:
 ```bash
 DATE=2024_10
 TAXDUMP=/path/to/databases/taxdump_${DATE}
+TAXDUMP_TAR=/path/to/databases/taxdump_${DATE}.tar.gz
 mkdir -p "$TAXDUMP"
-curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz | tar -xzf - -C "$TAXDUMP"
+curl -L ftp://ftp.ncbi.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz -o "$TAXDUMP_TAR"
+tar -xzf "$TAXDUMP_TAR" -C "$TAXDUMP"
 ```
 
 #### 2. NCBI nucleotide BLAST database
@@ -96,6 +100,7 @@ Create the database directory and move into the directory:
 ```bash
 DATE=2024_10
 NT=/path/to/databases/nt_${DATE}
+NT_TAR=/path/to/databases/nt_${DATE}.tar.gz
 mkdir -p $NT
 cd $NT
 ```
@@ -113,6 +118,11 @@ done
 wget "https://ftp.ncbi.nlm.nih.gov/blast/db/v5/taxdb.tar.gz" &&
 tar xf taxdb.tar.gz -C $NT &&
 rm taxdb.tar.gz
+
+# Compress and clean up
+cd ..
+tar -cvzf "$NT_TAR" "$NT" &&
+rm -r "$NT"
 ```
 
 #### 3. UniProt reference proteomes database
@@ -126,6 +136,7 @@ Create the database directory and move into the directory:
 ```bash
 DATE=2024_10
 UNIPROT=/path/to/databases/uniprot_${DATE}
+UNIPROT_TAR=/path/to/databases/uniprot_${DATE}.tar.gz
 mkdir -p $UNIPROT
 cd $UNIPROT
 ```
@@ -152,6 +163,12 @@ diamond makedb -p 16 --in reference_proteomes.fasta.gz --taxonmap reference_prot
 # clean up
 mv extract/{README,STATS} .
 rm -r extract
+rm -r "$TAXDUMP"  # extracted taxdump only; the archive at $TAXDUMP_TAR is kept
+
+# Compress final database and clean up
+cd ..
+tar -cvzf "$UNIPROT_TAR" "$UNIPROT" &&
+rm -r "$UNIPROT"
 ```
 
 #### 4. BUSCO databases
@@ -161,6 +178,7 @@ Create the database directory and move into the directory:
 ```bash
 DATE=2024_10
 BUSCO=/path/to/databases/busco_${DATE}
+BUSCO_TAR=/path/to/databases/busco_${DATE}.tar.gz
 mkdir -p $BUSCO
 cd $BUSCO
 ```
@@ -181,6 +199,12 @@ If you have [GNU parallel](https://www.gnu.org/software/parallel/) installed, yo
 find v5/data -name "*.tar.gz" | parallel "cd {//}; tar -xzf {/}"
 ```
 
+Finally, re-compress and clean up the files:
+```bash
+tar -cvzf "$BUSCO_TAR" "$BUSCO" &&
+rm -r "$BUSCO"
+```
+
 ## Changes from Snakemake to Nextflow
 
 ### Commands