Skip to content

Commit

Permalink
Merge pull request #17 from umccr/implement-psa-schema-spreadsheet-li…
Browse files Browse the repository at this point in the history
…brary-tracking-metadata

Implemented OrcaVault psa schema
  • Loading branch information
victorskl authored Jan 11, 2025
2 parents 78da3a7 + 1f62b55 commit 0c4dcff
Show file tree
Hide file tree
Showing 8 changed files with 276 additions and 2 deletions.
15 changes: 14 additions & 1 deletion dev/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,24 @@ tsa:
@docker compose cp ./src/tsa.sql postgres:/tmp/tsa.sql
@docker compose exec -e PGPASSWORD=dev -it postgres psql -h 0.0.0.0 -d orcavault -U dev -f /tmp/tsa.sql

all: ods tsa
psa:
@docker compose cp ./src/psa.sql postgres:/tmp/psa.sql
@docker compose exec -e PGPASSWORD=dev -it postgres psql -h 0.0.0.0 -d orcavault -U dev -f /tmp/psa.sql

all: ods tsa psa

load:
@docker compose cp ./src/load.sh postgres:/tmp/load.sh
@docker compose exec -it postgres bash /tmp/load.sh

wait:
@sleep 5

reload: down up wait all load

next:
@docker compose cp ./src/next.sh postgres:/tmp/next.sh
@docker compose exec -it postgres bash /tmp/next.sh

sync:
@aws s3 sync s3://orcabus-test-data-843407916570-ap-southeast-2/orcavault/data/ data/
7 changes: 7 additions & 0 deletions dev/src/next.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env bash

PGPASSWORD=dev psql -h 0.0.0.0 -d orcavault -U dev -c 'SELECT tsa.truncate_tables();'

PGPASSWORD=dev psql -h 0.0.0.0 -d orcavault -U dev <<EOF
\copy tsa.spreadsheet_library_tracking_metadata from '/data/orcavault_tsa_spreadsheet_library_tracking_metadata.next.csv' with (format csv, header true, delimiter ',');
EOF
38 changes: 38 additions & 0 deletions dev/src/psa.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
SELECT current_database();

-- create psa schema
CREATE SCHEMA IF NOT EXISTS psa AUTHORIZATION dev;
SET search_path TO psa;

SELECT current_schema();

CREATE TABLE IF NOT EXISTS orcavault.psa.spreadsheet_library_tracking_metadata
(
assay varchar,
comments varchar,
coverage varchar,
experiment_id varchar,
external_sample_id varchar,
external_subject_id varchar,
library_id varchar,
override_cycles varchar,
phenotype varchar,
project_name varchar,
project_owner varchar,
qpcr_id varchar,
quality varchar,
run varchar,
sample_id varchar,
sample_name varchar,
samplesheet_sample_id varchar,
source varchar,
subject_id varchar,
truseq_index varchar,
type varchar,
workflow varchar,
r_rna varchar,
study varchar,
sheet_name varchar,
load_datetime timestamptz,
record_source varchar(255)
);
23 changes: 23 additions & 0 deletions dev/src/tsa.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,29 @@ SET search_path TO tsa;

SELECT current_schema();

-- --

DROP FUNCTION IF EXISTS tsa.truncate_tables();

CREATE OR REPLACE FUNCTION tsa.truncate_tables()
RETURNS void
LANGUAGE 'sql'
COST 100
VOLATILE PARALLEL UNSAFE
AS $BODY$
DO $$ DECLARE
table_name text;
BEGIN
FOR table_name IN (SELECT tablename FROM pg_tables WHERE schemaname='tsa') LOOP
EXECUTE 'TRUNCATE TABLE tsa."' || table_name || '" CASCADE;';
END LOOP;
END $$;
$BODY$;

ALTER FUNCTION tsa.truncate_tables() OWNER TO dev;

-- --

CREATE TABLE IF NOT EXISTS orcavault.tsa.spreadsheet_library_tracking_metadata
(
assay varchar,
Expand Down
9 changes: 8 additions & 1 deletion orcavault/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,20 @@ ods:
tsa:
@(cd ../dev/ && $(MAKE) tsa)

psa:
@(cd ../dev/ && $(MAKE) psa)

all:
@(cd ../dev/ && $(MAKE) all)

load:
@(cd ../dev/ && $(MAKE) load)

reload: down up ods tsa load
reload:
@(cd ../dev/ && $(MAKE) reload)

next:
@(cd ../dev/ && $(MAKE) next)

sync:
@(cd ../dev/ && $(MAKE) sync)
3 changes: 3 additions & 0 deletions orcavault/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ clean-targets:
models:
orcavault:
+materialized: view
psa:
+schema: psa
+materialized: table
raw:
+schema: raw
+materialized: table
60 changes: 60 additions & 0 deletions orcavault/models/psa/schema.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
version: 2

models:

- name: spreadsheet_library_tracking_metadata
columns:
- name: assay
data_type: varchar
- name: comments
data_type: varchar
- name: coverage
data_type: varchar
- name: experiment_id
data_type: varchar
- name: external_sample_id
data_type: varchar
- name: external_subject_id
data_type: varchar
- name: library_id
data_type: varchar
- name: override_cycles
data_type: varchar
- name: phenotype
data_type: varchar
- name: project_name
data_type: varchar
- name: project_owner
data_type: varchar
- name: qpcr_id
data_type: varchar
- name: quality
data_type: varchar
- name: run
data_type: varchar
- name: sample_id
data_type: varchar
- name: sample_name
data_type: varchar
- name: samplesheet_sample_id
data_type: varchar
- name: source
data_type: varchar
- name: subject_id
data_type: varchar
- name: truseq_index
data_type: varchar
- name: type
data_type: varchar
- name: workflow
data_type: varchar
- name: r_rna
data_type: varchar
- name: study
data_type: varchar
- name: sheet_name
data_type: varchar
- name: load_datetime
data_type: timestamptz
- name: record_source
data_type: varchar(255)
123 changes: 123 additions & 0 deletions orcavault/models/psa/spreadsheet_library_tracking_metadata.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
{{
config(
materialized='incremental'
)
}}

with source as (

select * from {{ source('tsa', 'spreadsheet_library_tracking_metadata') }}

),

cleaned as (

select
trim(regexp_replace(assay, E'[\\n\\r]+', '', 'g')) as assay,
trim(regexp_replace(comments, E'[\\n\\r]+', '', 'g')) as comments,
trim(regexp_replace(coverage, E'[\\n\\r]+', '', 'g')) as coverage,
trim(regexp_replace(experiment_id, E'[\\n\\r]+', '', 'g')) as experiment_id,
trim(regexp_replace(external_sample_id, E'[\\n\\r]+', '', 'g')) as external_sample_id,
trim(regexp_replace(external_subject_id, E'[\\n\\r]+', '', 'g')) as external_subject_id,
trim(regexp_replace(library_id, E'[\\n\\r]+', '', 'g')) as library_id,
trim(regexp_replace(override_cycles, E'[\\n\\r]+', '', 'g')) as override_cycles,
trim(regexp_replace(phenotype, E'[\\n\\r]+', '', 'g')) as phenotype,
trim(regexp_replace(project_name, E'[\\n\\r]+', '', 'g')) as project_name,
trim(regexp_replace(project_owner, E'[\\n\\r]+', '', 'g')) as project_owner,
trim(regexp_replace(qpcr_id, E'[\\n\\r]+', '', 'g')) as qpcr_id,
trim(regexp_replace(quality, E'[\\n\\r]+', '', 'g')) as quality,
trim(regexp_replace(run, E'[\\n\\r]+', '', 'g')) as run,
trim(regexp_replace(sample_id, E'[\\n\\r]+', '', 'g')) as sample_id,
trim(regexp_replace(sample_name, E'[\\n\\r]+', '', 'g')) as sample_name,
trim(regexp_replace(samplesheet_sample_id, E'[\\n\\r]+', '', 'g')) as samplesheet_sample_id,
trim(regexp_replace(source, E'[\\n\\r]+', '', 'g')) as source,
trim(regexp_replace(subject_id, E'[\\n\\r]+', '', 'g')) as subject_id,
trim(regexp_replace(truseq_index, E'[\\n\\r]+', '', 'g')) as truseq_index,
trim(regexp_replace(type, E'[\\n\\r]+', '', 'g')) as type,
trim(regexp_replace(workflow, E'[\\n\\r]+', '', 'g')) as workflow,
trim(regexp_replace(r_rna, E'[\\n\\r]+', '', 'g')) as r_rna,
trim(regexp_replace(study, E'[\\n\\r]+', '', 'g')) as study,
trim(regexp_replace(sheet_name, E'[\\n\\r]+', '', 'g')) as sheet_name
from
source
where
coalesce
(
nullif(assay, ''),
nullif(comments, ''),
nullif(coverage, ''),
nullif(experiment_id, ''),
nullif(external_sample_id, ''),
nullif(external_subject_id, ''),
nullif(library_id, ''),
nullif(override_cycles, ''),
nullif(phenotype, ''),
nullif(project_name, ''),
nullif(project_owner, ''),
nullif(qpcr_id, ''),
nullif(quality, ''),
nullif(run, ''),
nullif(sample_id, ''),
nullif(sample_name, ''),
nullif(samplesheet_sample_id, ''),
nullif(source, ''),
nullif(subject_id, ''),
nullif(truseq_index, ''),
nullif(type, ''),
nullif(workflow, ''),
nullif(r_rna, ''),
nullif(study, '')
) is not null

),

transformed as (

select
*
from
cleaned
except
select
assay,
comments,
coverage,
experiment_id,
external_sample_id,
external_subject_id,
library_id,
override_cycles,
phenotype,
project_name,
project_owner,
qpcr_id,
quality,
run,
sample_id,
sample_name,
samplesheet_sample_id,
source,
subject_id,
truseq_index,
type,
workflow,
r_rna,
study,
sheet_name
from
{{ this }}

),

final as (

select
*,
cast('{{ run_started_at }}' as timestamptz) as load_datetime,
(select 'UMCCR_Library_Tracking_MetaData') as record_source
from
transformed

)

select * from final

0 comments on commit 0c4dcff

Please sign in to comment.