
Commit

Merge pull request opentargets#9 from opentargets/js29
Minor changes
Jeremy37 authored Mar 31, 2022
2 parents 23607fb + c228b0f commit 94be4da
Showing 6 changed files with 9 additions and 14 deletions.
5 changes: 1 addition & 4 deletions 5_combine_results.py
@@ -26,9 +26,6 @@ def main():
pyspark.sql.SparkSession.builder
.config("spark.sql.files.ignoreCorruptFiles", "true")
.config("spark.master", "local[*]")
#.config("spark.driver.maxResultSize", "80g")
#.config("spark.driver.memory", "150g")
#.config("spark.executor.memory", "2g")
.getOrCreate()
)
print('Spark version: ', spark.version)
@@ -88,7 +85,6 @@ def main():

# Coalesce
df = df.coalesce(200)
- df.explain()

# Write
(
@@ -102,6 +98,7 @@
)

# Somewhat slow - could fail if not enough memory on machine
+ # This could be done in a more efficient way, using spark to coalesce
(
df.toPandas().to_csv(
'/output/coloc_raw.csv.gz',
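The comment above suggests replacing the pandas export with a Spark-native write. A minimal sketch of that alternative, assuming the same df and output path as in the script (note that Spark writes a directory of part files, so the single gzipped part would still need to be renamed afterwards):

    # Write a single gzipped CSV with Spark instead of df.toPandas().to_csv()
    (
        df.coalesce(1)
        .write
        .mode('overwrite')
        .option('header', 'true')
        .option('compression', 'gzip')
        .csv('/output/coloc_raw.csv.gz')
    )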
2 changes: 0 additions & 2 deletions 6_process_results.py
@@ -32,8 +32,6 @@ def main():
spark = (
pyspark.sql.SparkSession.builder
.config("spark.master", "local[*]")
.config("spark.driver.memory", "20g")
.config("spark.executor.memory", "20g")
.getOrCreate()
)
# sc = spark.sparkContext
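With the hard-coded driver/executor memory removed from the builder, memory can instead be supplied at launch time, as the README's PYSPARK_SUBMIT_ARGS examples do. A minimal sketch of that pattern (the values here are illustrative, mirroring the README):

    import os
    # Must be set before pyspark starts the JVM, i.e. before the first SparkSession is created
    os.environ.setdefault(
        'PYSPARK_SUBMIT_ARGS',
        '--driver-memory 20g --executor-memory 2g pyspark-shell',
    )

    import pyspark
    spark = (
        pyspark.sql.SparkSession.builder
        .config('spark.master', 'local[*]')
        .getOrCreate()
    )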
1 change: 1 addition & 0 deletions 8_copy_results_to_gcs.sh
@@ -5,6 +5,7 @@ version_date=`date +%y%m%d`

# Copy current results
gsutil -m cp -r $HOME/output/coloc_raw.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
+ gsutil -m cp -r $HOME/output/coloc_raw.csv.gz gs://genetics-portal-dev-staging/coloc/$version_date/
gsutil -m cp -r $HOME/output/coloc_processed.parquet gs://genetics-portal-dev-staging/coloc/$version_date/

# Copy overlap table
6 changes: 3 additions & 3 deletions README.md
@@ -126,7 +126,7 @@ Start docker as above, and then either manually run individual commands in run_c
NCORES=95
export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell --executor-memory 2g pyspark-shell"
#NCORES=31
- #export PYSPARK_SUBMIT_ARGS="--driver-memory 50g --executor-memory 2g pyspark-shell"
+ #export PYSPARK_SUBMIT_ARGS="--driver-memory 20g --executor-memory 2g pyspark-shell"
# Run the full pipeline (or alternatively, run individual commands from this script)
dt=`date '+%Y_%m_%d.%H_%M'`
@@ -195,14 +195,14 @@ To run on google dataproc: (last run took XX hrs)
# Start a dataproc cluster
# Note that I had this fail multiple times, and had to try adjusting the number
# of executors, memory, cores, etc. to get it to work. More memory seems to be key.
- # Took ~30 min on last run, n2-highmem-64
+ # Took nearly 5 hrs on last run, n2-highmem-64
# This is probably mainly due to checking for duplicates. Without that would be < 1 hr.
gcloud beta dataproc clusters create \
js-coloc-beta-join \
--image-version=preview \
--properties=spark:spark.debug.maxToStringFields=100,spark:spark.driver.memory=25g,spark:spark.executor.memory=76g,spark:spark.executor.cores=8,spark:spark.executor.instances=6 \
--master-machine-type=n2-highmem-64 \
--master-boot-disk-size=2TB \
--num-master-local-ssds=8 \
--zone=europe-west1-d \
--initialization-action-timeout=20m \
--single-node \
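For reference, the spark: entries passed via --properties above are ordinary Spark settings; the equivalent in-session configuration would be (values copied from the gcloud command, shown only to make the mapping explicit):

    import pyspark
    spark = (
        pyspark.sql.SparkSession.builder
        .config('spark.debug.maxToStringFields', '100')
        .config('spark.driver.memory', '25g')
        .config('spark.executor.memory', '76g')
        .config('spark.executor.cores', '8')
        .config('spark.executor.instances', '6')
        .getOrCreate()
    )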
6 changes: 3 additions & 3 deletions join_results_with_betas.py
@@ -33,10 +33,10 @@ def main():
print('Spark version: ', spark.version)

# File args (dataproc)
- in_parquet = 'gs://genetics-portal-dev-staging/coloc/220127/coloc_processed.parquet'
+ in_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed.parquet'
in_sumstats = 'gs://genetics-portal-dev-sumstats/filtered/significant_window_2mb_union'
- out_parquet = 'gs://genetics-portal-dev-staging/coloc/220127/coloc_processed_w_betas.parquet'
- out_dups = 'gs://genetics-portal-dev-staging/coloc/220127/coloc_processed_w_betas_dups.parquet'
+ out_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas.parquet'
+ out_dups = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas_dups.parquet'

# # File args (local)
# in_parquet = '/home/ubuntu/results/coloc/results/coloc_processed.parquet'
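The out_dups output, together with the README's note about duplicate checking, refers to flagging coloc results that appear more than once after the join. A minimal sketch of such a check, assuming df is the joined table, out_dups is the path defined above, and the grouping keys are hypothetical (the script's actual key columns may differ):

    from pyspark.sql import functions as F

    key_cols = ['left_study', 'left_lead_variant', 'right_study', 'right_lead_variant']  # assumed keys
    dups = (
        df.groupBy(key_cols)
        .count()
        .filter(F.col('count') > 1)
    )
    dups.write.mode('overwrite').parquet(out_dups)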
3 changes: 1 addition & 2 deletions run_coloc_pipeline_opt.sh
@@ -59,9 +59,8 @@ time cat /configs/commands_todo_coloc_opt.txt | parallel -j $NCORES --joblog /ou
# Note: "--bar" can make things slower if there are millions of commands

# Combine the results of all the individual analyses
- # This step can be slow/inefficient due to Hadoop many small files problem
echo -e "\n5_combine_results.py"
- time python 5_combine_results.py # Took ~3 hrs last run (222 cores, 400 Gb)
+ time python 5_combine_results.py # Takes a few minutes

# Process the results for exporting. Renames or computes a few columns,
# e.g. coloc_h4_h3 ratio, filters based on number of overlapping vars,
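The processing step described above derives a ratio column and filters on the number of overlapping variants. An illustrative sketch of that kind of transformation, assuming raw is the combined results DataFrame (column names and the threshold are assumptions, not the script's actual values):

    from pyspark.sql import functions as F

    processed = (
        raw.withColumn('coloc_h4_h3', F.col('coloc_h4') / F.col('coloc_h3'))
        .filter(F.col('coloc_n_vars') >= 100)  # placeholder threshold
    )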

