
Commit

Merge pull request opentargets#10 from opentargets/js29
Fix for process_results so that gene_id is extracted from phenotype_id
Jeremy37 authored Apr 8, 2022
2 parents 94be4da + 69ba2dd commit 14ffc07
Showing 4 changed files with 25 additions and 10 deletions.
19 changes: 17 additions & 2 deletions 6_process_results.py
@@ -131,11 +131,23 @@ def main():
     print('{} coloc tests removed that were duplicates'.format( int((last_n - df.count())/2) ))
 
     # Add gene_id using phenotype_id
+    # Need to handle both eQTLs, which may have phenotype_id as an array probe
+    # and sQTLs, which have the gene_id within the phenotype_id field
     phenotype_map = load_pheno_to_gene_map(in_phenotype_maps)
     biofeature_mapper = udf(lambda x: phenotype_map.get(x, x))
     df = (
-        df.withColumn('left_gene_id', biofeature_mapper(col('left_phenotype')))
-        .withColumn('right_gene_id', biofeature_mapper(col('right_phenotype')))
+        df.withColumn('left_gene_id',
+                      when(col('left_type') == 'eqtl', biofeature_mapper(col('left_phenotype')))
+                      .otherwise(lit(None)))
+        .withColumn('right_gene_id',
+                    when(col('right_type') == 'eqtl', biofeature_mapper(col('right_phenotype')))
+                    .otherwise(lit(None)))
+        .withColumn('left_gene_id',
+                    when(col('left_type') == 'sqtl', split(col('left_phenotype'), '\^').getItem(4))
+                    .otherwise(col('left_gene_id')))
+        .withColumn('right_gene_id',
+                    when(col('right_type') == 'sqtl', split(col('right_phenotype'), '\^').getItem(4))
+                    .otherwise(col('right_gene_id')))
     )
 
     # Set gene_id to null if it doesn't start with ENSG
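
For reference, a minimal plain-Python sketch of the mapping logic in this hunk: the eQTL branch resolves array probes through the phenotype map (IDs absent from the map pass through unchanged), and the sQTL branch takes the fifth caret-delimited field of phenotype_id. The probe ID and the sQTL field layout below are hypothetical examples, not values from this repository.

phenotype_map = {'ILMN_1234567': 'ENSG00000157764'}  # hypothetical probe -> gene

def gene_id_for(phenotype_id, qtl_type):
    # eQTLs: map array probes to genes; plain gene IDs fall through as-is
    if qtl_type == 'eqtl':
        return phenotype_map.get(phenotype_id, phenotype_id)
    # sQTLs: gene_id is embedded as the fifth '^'-separated field of phenotype_id
    if qtl_type == 'sqtl':
        parts = phenotype_id.split('^')
        return parts[4] if len(parts) > 4 else None
    return None

assert gene_id_for('ILMN_1234567', 'eqtl') == 'ENSG00000157764'
assert gene_id_for('1^100^200^clu_55^ENSG00000157764', 'sqtl') == 'ENSG00000157764'
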
@@ -184,6 +196,7 @@ def main():
 
     return 0
 
+
 def drop_duplicates_keep_first(df, subset, order_colname, ascending=True):
     ''' Implements the equivalent pd.drop_duplicates(keep='first')
     Args:
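
The function body is collapsed above. As a sketch only, the usual PySpark equivalent of pd.drop_duplicates(keep='first') ranks rows within each subset by the ordering column and keeps rank 1 — an assumed implementation pattern, not necessarily the one in this file:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame(
    [('a', 1, 0.9), ('a', 2, 0.5), ('b', 3, 0.7)],
    ['key', 'id', 'score'])

# Rank within each 'key' group by 'score' (ascending) and keep the first row
w = Window.partitionBy('key').orderBy(col('score').asc())
first_rows = (df.withColumn('_rn', row_number().over(w))
                .filter(col('_rn') == 1)
                .drop('_rn'))
first_rows.show()
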
@@ -215,6 +228,7 @@ def drop_duplicates_keep_first(df, subset, order_colname, ascending=True):
     )
     return res
 
+
 def load_pheno_to_gene_map(infs):
     ''' Loads a dictionary, mapping phenotype_ids to ensembl gene IDs.
     Input files should have 2 columns phenotype_id, gene_id
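
The loader's body is also collapsed. A minimal sketch under the docstring's contract (two-column files of phenotype_id, gene_id), assuming tab-delimited input with no header row — both are assumptions, since the real parsing isn't shown here:

import csv

def load_pheno_to_gene_map(paths):
    # Build {phenotype_id: gene_id} from two-column files (delimiter assumed)
    d = {}
    for path in paths:
        with open(path) as f:
            for row in csv.reader(f, delimiter='\t'):
                if len(row) >= 2:
                    d[row[0]] = row[1]
    return d
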
@@ -243,6 +257,7 @@ def load_pheno_to_gene_map(infs):
 
     return d
 
+
 if __name__ == '__main__':
 
     main()
8 changes: 4 additions & 4 deletions 8_copy_results_to_gcs.sh
@@ -4,15 +4,15 @@
 version_date=`date +%y%m%d`
 
 # Copy current results
-gsutil -m cp -r $HOME/output/coloc_raw.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
-gsutil -m cp -r $HOME/output/coloc_raw.csv.gz gs://genetics-portal-dev-staging/coloc/$version_date/
-gsutil -m cp -r $HOME/output/coloc_processed.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
+gsutil -m cp -r $HOME/disk1/output/coloc_raw.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
+gsutil -m cp -r $HOME/disk1/output/coloc_raw.csv.gz gs://genetics-portal-dev-staging/coloc/$version_date/
+gsutil -m cp -r $HOME/disk1/output/coloc_processed.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
 
 # Copy overlap table
 gsutil -m cp -r $HOME/output/overlap_table gs://genetics-portal-dev-staging/coloc/$version_date/overlap_table
 
 # Make a note as to what this coloc run contained. E.g.:
-echo "Recomputed from scratch: Data for Genetics portal R8, with GTEx-sQTL added and FinnGen R6" > README.txt
+echo "Recomputed from scratch: Data for Genetics portal R8, with GTEx-sQTL added and FinnGen R6. (Latest fix: made sure sQTLs have gene IDs filled.)" > README.txt
 gsutil cp README.txt gs://genetics-portal-dev-staging/coloc/$version_date/README.txt
 
 if [ -d "$HOME/output/merged" ]; then
2 changes: 1 addition & 1 deletion README.md
@@ -196,7 +196,7 @@ To run on google dataproc: (last run took XX hrs)
 # Note that I had this fail multiple times, and had to try adjusting the number
 # of executors, memory, cores, etc. to get it to work. More memory seems to be key.
 # Took nearly 5 hrs on last run, n2-highmem-64
-# This is probably mainly due to checking for duplicates. Without that would be < 1 hr.
+# This would probably be dramatically faster on BigQuery.
 gcloud beta dataproc clusters create \
     js-coloc-beta-join \
     --image-version=preview \
6 changes: 3 additions & 3 deletions join_results_with_betas.py
@@ -33,10 +33,10 @@ def main():
     print('Spark version: ', spark.version)
 
     # File args (dataproc)
-    in_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed.parquet'
+    in_parquet = 'gs://genetics-portal-dev-staging/coloc/220408/coloc_processed.parquet'
     in_sumstats = 'gs://genetics-portal-dev-sumstats/filtered/significant_window_2mb_union'
-    out_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas.parquet'
-    out_dups = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas_dups.parquet'
+    out_parquet = 'gs://genetics-portal-dev-staging/coloc/220408/coloc_processed_w_betas.parquet'
+    out_dups = 'gs://genetics-portal-dev-staging/coloc/220408/coloc_processed_w_betas_dups.parquet'
 
     # # File args (local)
     # in_parquet = '/home/ubuntu/results/coloc/results/coloc_processed.parquet'
