
Commit

Merge pull request opentargets#10 from opentargets/js29
Fix for process_results so that gene_id is extracted from phenotype_id
Jeremy37 authored Apr 8, 2022
2 parents 94be4da + 69ba2dd commit 14ffc07
Showing 4 changed files with 25 additions and 10 deletions.
19 changes: 17 additions & 2 deletions 6_process_results.py
@@ -131,11 +131,23 @@ def main():
     print('{} coloc tests removed that were duplicates'.format( int((last_n - df.count())/2) ))
 
     # Add gene_id using phenotype_id
+    # Need to handle both eQTLs, which may have phenotype_id as an array probe
+    # and sQTLs, which have the gene_id within the phenotype_id field
     phenotype_map = load_pheno_to_gene_map(in_phenotype_maps)
     biofeature_mapper = udf(lambda x: phenotype_map.get(x, x))
     df = (
-        df.withColumn('left_gene_id', biofeature_mapper(col('left_phenotype')))
-        .withColumn('right_gene_id', biofeature_mapper(col('right_phenotype')))
+        df.withColumn('left_gene_id',
+                      when(col('left_type') == 'eqtl', biofeature_mapper(col('left_phenotype')))
+                      .otherwise(lit(None)))
+        .withColumn('right_gene_id',
+                    when(col('right_type') == 'eqtl', biofeature_mapper(col('right_phenotype')))
+                    .otherwise(lit(None)))
+        .withColumn('left_gene_id',
+                    when(col('left_type') == 'sqtl', split(col('left_phenotype'), '\^').getItem(4))
+                    .otherwise(col('left_gene_id')))
+        .withColumn('right_gene_id',
+                    when(col('right_type') == 'sqtl', split(col('right_phenotype'), '\^').getItem(4))
+                    .otherwise(col('right_gene_id')))
     )
 
     # Set gene_id to null if it doesn't start with ENSG
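
For reference, a minimal plain-Python sketch of the mapping logic in this hunk: the eQTL branch resolves array probes through the phenotype map (IDs absent from the map pass through unchanged), and the sQTL branch takes the fifth caret-delimited field of phenotype_id. The probe ID and the sQTL field layout below are hypothetical examples, not values from this repository.

phenotype_map = {'ILMN_1234567': 'ENSG00000157764'}  # hypothetical probe -> gene

def gene_id_for(phenotype_id, qtl_type):
    # eQTLs: map array probes to genes; plain gene IDs fall through as-is
    if qtl_type == 'eqtl':
        return phenotype_map.get(phenotype_id, phenotype_id)
    # sQTLs: gene_id is embedded as the fifth '^'-separated field of phenotype_id
    if qtl_type == 'sqtl':
        parts = phenotype_id.split('^')
        return parts[4] if len(parts) > 4 else None
    return None

assert gene_id_for('ILMN_1234567', 'eqtl') == 'ENSG00000157764'
assert gene_id_for('1^100^200^clu_55^ENSG00000157764', 'sqtl') == 'ENSG00000157764'
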
@@ -184,6 +196,7 @@ def main():
 
     return 0
 
+
 def drop_duplicates_keep_first(df, subset, order_colname, ascending=True):
     ''' Implements the equivalent pd.drop_duplicates(keep='first')
     Args:
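
The function body is collapsed above. As a sketch only, the usual PySpark equivalent of pd.drop_duplicates(keep='first') ranks rows within each subset by the ordering column and keeps rank 1 — an assumed implementation pattern, not necessarily the one in this file:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.master('local[*]').getOrCreate()
df = spark.createDataFrame(
    [('a', 1, 0.9), ('a', 2, 0.5), ('b', 3, 0.7)],
    ['key', 'id', 'score'])

# Rank within each 'key' group by 'score' (ascending) and keep the first row
w = Window.partitionBy('key').orderBy(col('score').asc())
first_rows = (df.withColumn('_rn', row_number().over(w))
                .filter(col('_rn') == 1)
                .drop('_rn'))
first_rows.show()
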
@@ -215,6 +228,7 @@ def drop_duplicates_keep_first(df, subset, order_colname, ascending=True):
     )
     return res
 
+
 def load_pheno_to_gene_map(infs):
     ''' Loads a dictionary, mapping phenotype_ids to ensembl gene IDs.
     Input files should have 2 columns phenotype_id, gene_id
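
The loader's body is also collapsed. A minimal sketch under the docstring's contract (two-column files of phenotype_id, gene_id), assuming tab-delimited input with no header row — both are assumptions, since the real parsing isn't shown here:

import csv

def load_pheno_to_gene_map(paths):
    # Build {phenotype_id: gene_id} from two-column files (delimiter assumed)
    d = {}
    for path in paths:
        with open(path) as f:
            for row in csv.reader(f, delimiter='\t'):
                if len(row) >= 2:
                    d[row[0]] = row[1]
    return d
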
@@ -243,6 +257,7 @@ def load_pheno_to_gene_map(infs):
 
     return d
 
+
 if __name__ == '__main__':
 
     main()
8 changes: 4 additions & 4 deletions 8_copy_results_to_gcs.sh
@@ -4,15 +4,15 @@
 version_date=`date +%y%m%d`
 
 # Copy current results
-gsutil -m cp -r $HOME/output/coloc_raw.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
-gsutil -m cp -r $HOME/output/coloc_raw.csv.gz gs://genetics-portal-dev-staging/coloc/$version_date/
-gsutil -m cp -r $HOME/output/coloc_processed.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
+gsutil -m cp -r $HOME/disk1/output/coloc_raw.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
+gsutil -m cp -r $HOME/disk1/output/coloc_raw.csv.gz gs://genetics-portal-dev-staging/coloc/$version_date/
+gsutil -m cp -r $HOME/disk1/output/coloc_processed.parquet gs://genetics-portal-dev-staging/coloc/$version_date/
 
 # Copy overlap table
 gsutil -m cp -r $HOME/output/overlap_table gs://genetics-portal-dev-staging/coloc/$version_date/overlap_table
 
 # Make a note as to what this coloc run contained. E.g.:
-echo "Recomputed from scratch: Data for Genetics portal R8, with GTEx-sQTL added and FinnGen R6" > README.txt
+echo "Recomputed from scratch: Data for Genetics portal R8, with GTEx-sQTL added and FinnGen R6. (Latest fix: made sure sQTLs have gene IDs filled.)" > README.txt
 gsutil cp README.txt gs://genetics-portal-dev-staging/coloc/$version_date/README.txt
 
 if [ -d "$HOME/output/merged" ]; then
2 changes: 1 addition & 1 deletion README.md
@@ -196,7 +196,7 @@ To run on google dataproc: (last run took XX hrs)
 # Note that I had this fail multiple times, and had to try adjusting the number
 # of executors, memory, cores, etc. to get it to work. More memory seems to be key.
 # Took nearly 5 hrs on last run, n2-highmem-64
-# This is probably mainly due to checking for duplicates. Without that would be < 1 hr.
+# This would probably be dramatically faster on BigQuery.
 gcloud beta dataproc clusters create \
     js-coloc-beta-join \
     --image-version=preview \
6 changes: 3 additions & 3 deletions join_results_with_betas.py
@@ -33,10 +33,10 @@ def main():
     print('Spark version: ', spark.version)
 
     # File args (dataproc)
-    in_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed.parquet'
+    in_parquet = 'gs://genetics-portal-dev-staging/coloc/220408/coloc_processed.parquet'
     in_sumstats = 'gs://genetics-portal-dev-sumstats/filtered/significant_window_2mb_union'
-    out_parquet = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas.parquet'
-    out_dups = 'gs://genetics-portal-dev-staging/coloc/220331/coloc_processed_w_betas_dups.parquet'
+    out_parquet = 'gs://genetics-portal-dev-staging/coloc/220408/coloc_processed_w_betas.parquet'
+    out_dups = 'gs://genetics-portal-dev-staging/coloc/220408/coloc_processed_w_betas_dups.parquet'
 
     # # File args (local)
     # in_parquet = '/home/ubuntu/results/coloc/results/coloc_processed.parquet'
