
Commit

Merge pull request opentargets#9 from opentargets/js29
Minor changes
Jeremy37 authored Mar 31, 2022
2 parents 23607fb + c228b0f commit 94be4da
Showing 6 changed files with 9 additions and 14 deletions.
5 changes: 1 addition & 4 deletions 5_combine_results.py
@@ -26,9 +26,6 @@ def main():
pyspark.sql.SparkSession.builder
.config("spark.sql.files.ignoreCorruptFiles", "true")
.config("spark.master", "local[*]")
#.config("spark.driver.maxResultSize", "80g")
#.config("spark.driver.memory", "150g")
#.config("spark.executor.memory", "2g")
.getOrCreate()
)
print('Spark version: ', spark.version)
@@ -88,7 +85,6 @@ def main():

# Coalesce
df = df.coalesce(200)
- df.explain()

# Write
(
@@ -102,6 +98,7 @@
)

# Somewhat slow - could fail if not enough memory on machine
+ # This could be done in a more efficient way, using spark to coalesce
(
df.toPandas().to_csv(
'/output/coloc_raw.csv.gz',
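The comment above suggests replacing the pandas export with a Spark-native write. A minimal sketch of that alternative, assuming the same df and output path as in the script (note that Spark writes a directory of part files, so the single gzipped part would still need to be renamed afterwards):

    # Write a single gzipped CSV with Spark instead of df.toPandas().to_csv()
    (
        df.coalesce(1)
        .write
        .mode('overwrite')
        .option('header', 'true')
        .option('compression', 'gzip')
        .csv('/output/coloc_raw.csv.gz')
    )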
2 changes: 0 additions & 2 deletions 6_process_results.py
@@ -32,8 +32,6 @@ def main():
spark = (
pyspark.sql.SparkSession.builder
.config("spark.master", "local[*]")
.config("spark.driver.memory", "20g")
.config("spark.executor.memory", "20g")
.getOrCreate()
)
# sc = spark.sparkContext
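With the hard-coded driver/executor memory removed from the builder, memory can instead be supplied at launch time, as the README's PYSPARK_SUBMIT_ARGS examples do. A minimal sketch of that pattern (the values here are illustrative, mirroring the README):

    import os
    # Must be set before pyspark starts the JVM, i.e. before the first SparkSession is created
    os.environ.setdefault(
        'PYSPARK_SUBMIT_ARGS',
        '--driver-memory 20g --executor-memory 2g pyspark-shell',
    )

    import pyspark
    spark = (
        pyspark.sql.SparkSession.builder
        .config('spark.master', 'local[*]')
        .getOrCreate()
    )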
1 change: 1 addition & 0 deletions 8_copy_results_to_gcs.sh
@@ -5,6 +5,7 @@ version_date=`date +%y%m%d`

# Copy current results
gsutil -m cp -r $HOME/output/coloc_raw.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
+ gsutil -m cp -r $HOME/output/coloc_raw.csv.gz gs://genetics-portal-dev-staging/coloc/$version_date/
gsutil -m cp -r $HOME/output/coloc_processed.parquet gs://genetics-portal-dev-staging/coloc/$version_date/

# Copy overlap table
6 changes: 3 additions & 3 deletions README.md
@@ -126,7 +126,7 @@ Start docker as above, and then either manually run individual commands in run_c
NCORES=95
export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell --executor-memory 2g pyspark-shell"
#NCORES=31
- #export PYSPARK_SUBMIT_ARGS="--driver-memory 50g --executor-memory 2g pyspark-shell"
+ #export PYSPARK_SUBMIT_ARGS="--driver-memory 20g --executor-memory 2g pyspark-shell"
# Run the full pipeline (or alternatively, run individual commands from this script)
dt=`date '+%Y_%m_%d.%H_%M'`
@@ -195,14 +195,14 @@ To run on google dataproc: (last run took XX hrs)
# Start a dataproc cluster
# Note that I had this fail multiple times, and had to try adjusting the number
# of executors, memory, cores, etc. to get it to work. More memory seems to be key.
- # Took ~30 min on last run, n2-highmem-64
+ # Took nearly 5 hrs on last run, n2-highmem-64
# This is probably mainly due to checking for duplicates. Without that would be < 1 hr.
gcloud beta dataproc clusters create \
js-coloc-beta-join \
--image-version=preview \
--properties=spark:spark.debug.maxToStringFields=100,spark:spark.driver.memory=25g,spark:spark.executor.memory=76g,spark:spark.executor.cores=8,spark:spark.executor.instances=6 \
--master-machine-type=n2-highmem-64 \
--master-boot-disk-size=2TB \
--num-master-local-ssds=8 \
--zone=europe-west1-d \
--initialization-action-timeout=20m \
--single-node \
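For reference, the spark: entries passed via --properties above are ordinary Spark settings; the equivalent in-session configuration would be (values copied from the gcloud command, shown only to make the mapping explicit):

    import pyspark
    spark = (
        pyspark.sql.SparkSession.builder
        .config('spark.debug.maxToStringFields', '100')
        .config('spark.driver.memory', '25g')
        .config('spark.executor.memory', '76g')
        .config('spark.executor.cores', '8')
        .config('spark.executor.instances', '6')
        .getOrCreate()
    )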
6 changes: 3 additions & 3 deletions join_results_with_betas.py
@@ -33,10 +33,10 @@ def main():
print('Spark version: ', spark.version)

# File args (dataproc)
- in_parquet = 'gs://genetics-portal-dev-staging/coloc/220127/coloc_processed.parquet'
+ in_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed.parquet'
in_sumstats = 'gs://genetics-portal-dev-sumstats/filtered/significant_window_2mb_union'
- out_parquet = 'gs://genetics-portal-dev-staging/coloc/220127/coloc_processed_w_betas.parquet'
- out_dups = 'gs://genetics-portal-dev-staging/coloc/220127/coloc_processed_w_betas_dups.parquet'
+ out_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas.parquet'
+ out_dups = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas_dups.parquet'

# # File args (local)
# in_parquet = '/home/ubuntu/results/coloc/results/coloc_processed.parquet'
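The out_dups output, together with the README's note about duplicate checking, refers to flagging coloc results that appear more than once after the join. A minimal sketch of such a check, assuming df is the joined table, out_dups is the path defined above, and the grouping keys are hypothetical (the script's actual key columns may differ):

    from pyspark.sql import functions as F

    key_cols = ['left_study', 'left_lead_variant', 'right_study', 'right_lead_variant']  # assumed keys
    dups = (
        df.groupBy(key_cols)
        .count()
        .filter(F.col('count') > 1)
    )
    dups.write.mode('overwrite').parquet(out_dups)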
3 changes: 1 addition & 2 deletions run_coloc_pipeline_opt.sh
@@ -59,9 +59,8 @@ time cat /configs/commands_todo_coloc_opt.txt | parallel -j $NCORES --joblog /ou
# Note: "--bar" can make things slower if there are millions of commands

# Combine the results of all the individual analyses
- # This step can be slow/inefficient due to Hadoop many small files problem
echo -e "\n5_combine_results.py"
- time python 5_combine_results.py # Took ~3 hrs last run (222 cores, 400 Gb)
+ time python 5_combine_results.py # Takes a few minutes

# Process the results for exporting. Renames or computes a few columns,
# e.g. coloc_h4_h3 ratio, filters based on number of overlapping vars,
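The processing step described above derives a ratio column and filters on the number of overlapping variants. An illustrative sketch of that kind of transformation, assuming raw is the combined results DataFrame (column names and the threshold are assumptions, not the script's actual values):

    from pyspark.sql import functions as F

    processed = (
        raw.withColumn('coloc_h4_h3', F.col('coloc_h4') / F.col('coloc_h3'))
        .filter(F.col('coloc_n_vars') >= 100)  # placeholder threshold
    )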

