Skip to content

Commit

Permalink
update paper scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
clemgoub committed May 29, 2024
1 parent c61c336 commit 8f44c22
Show file tree
Hide file tree
Showing 8 changed files with 375 additions and 211 deletions.
44 changes: 40 additions & 4 deletions paper/Applications_Examples/CSATIVA/Cs_GraffiTE.R
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphis

#### CLUSTERS ####

ClusAll<-read.table("~/Documents/Csativa/GraffiTE/Cs_GraffiTE_280423_pangenome.variants_clustered_mode0_with_ANNOTATION", h = F)
ClusAll<-read.table("Cs_GraffiTE_280423_pangenome.variants_clustered_mode0_with_ANNOTATION", h = F)
#head(ClusAll)
names(ClusAll)<-c("cluster", "member", "SUPP", "SUPP_VEC", "SVLEN", "n_hits", "n_frags", "hit_length", "hit_names", "hit_class", "hit_strands", "frags", "cum_hits_len", "SV_TE_cov", "TSD")
Ncount<-tapply(ClusAll$member, ClusAll$cluster, length)
Expand All @@ -166,7 +166,7 @@ ClusN3<-ClusAll %>%
) %>%
filter(Count > 2) %>%
arrange(desc(Count), desc(cumSVLEN))
write.table(ClusN3, file = "~/Documents/Csativa/GraffiTE/ClusN3.tsv", quote = F, row.names = F)
write.table(ClusN3, file = "ClusN3.tsv", quote = F, row.names = F)

# All N >= 3
barplot(height = ClusN3$Count, width = ClusN3$cumSVLEN)
Expand All @@ -179,7 +179,43 @@ axis(side = 1, at = seq(0,250, by = 1))


#### VCF ####
vcf_N3<-read.vcfR("/Users/cgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/N3_200_40000bp_clusters_pangenome.vcf.recode.vcf")
vcf_N3<-read.vcfR("N3_200_40000bp_clusters_pangenome.vcf.recode.vcf")

# Saturation
plot_curves(vcf_N3, 1/9, 9/9, 100, "polymorphism", "clem")
plot_curves(vcf_N3, 1/9, 9/9, 100, "polymorphism", "clem")

#### Comparison with GT-svsn ####
# # gather the vcfs
# cp ../Cs_GT-sv/3_TSD_search/pangenome.vcf Cs_GT-sv_pangenome.vcf
# cp ../Cs_GT-sn/3_TSD_search/pangenome.vcf Cs_GT-sn_pangenome.vcf
# # list and merge with same parameteres as GraffiTE
# ls *.vcf > vcfs.txt
# ~/bin/SURVIVOR/Debug/SURVIVOR merge vcfs.txt 0.1 0 0 0 0 100 Cs_GT-svsn_merged.vcf
# # MMseqs
# # first gather the variants
# grep -v '#' Cs_GT-svsn_merged.vcf | awk '/SVTYPE=INS/ {print ">"$3; print $5; next} /SVTYPE=DEL/ {print ">"$3; print $4; next}' > Cs_GT-svsn_variants.fasta
# salloc --time=2:00:0 --cpus-per-task=20 --account=rrg-bourqueg-ad --mem-per-cpu=5Gb
# module load mmseqs2
# mmseqs easy-cluster Cs_GT-svsn_variants.fasta Cs_GT-svsn_variants.MMseqs2 tmp --cov-mode 0 -c 0.8 --min-seq-id 0.8 -s 100 --exact-kmer-matching 1 --threads 20
# # list the cluster with 3+ sequences
# cut -f 1 Cs_GT-svsn_variants.MMseqs2_cluster.tsv | sort | uniq -c | awk '$1 > 2' > Cs_GT-svsn_variants.MMseqs2_N3clusters
# # get the list of loci members of these clusters
# Cs_GT-svsn]$ join -12 -21 <(sort -k2,2 Cs_GT-svsn_variants.MMseqs2_N3clusters) <(sort -k1,1 Cs_GT-svsn_variants.MMseqs2_cluster.tsv) | awk '{print $3}' > Cs_GT-svsn_variants.MMseqs2_N3members
# # next I will filter the VCF line to keep these, and with size 200-40000
# join -11 -23 -o 1.1,2.8 <(sort -k1,1 Cs_GT-svsn_variants.MMseqs2_N3members) <(grep -v '#' Cs_GT-svsn_merged.vcf | sort -k3,3) | sed 's/;SVLEN=/\t/g;s/;SVTYPE=/\t/g;s/-//g;s/;SUPP_VEC=/\t/g' | awk '{print $1"\t"$3"\t"$4}' | awk '$3 >= 200 && $3 <= 40000' | cut -f 2 | sort | uniq -c
# # 10 is Sniffles only, 11 both and, 01 svim only
# 25142 01
# 19140 10
# 17442 11

GTsv<-25142+17442
GTsn<-19140+17442
both<-17442
#install.packages("VennDiagram")
library(VennDiagram)
VennDiagram::draw.pairwise.venn(GTsv, GTsn,both,
euler.d = T,
category = c("GT-sv\n(assemblies)", "GT-sn\n(long reads)"),
col = c("purple", "green3"),fill = c("purple", "green3"),
fontfamily = "sans",cat.fontfamily = "sans",scaled = T,
)
104 changes: 87 additions & 17 deletions paper/Applications_Examples/DMEL_RECH2022/GraffiTE_DM30.pME_Figures.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,50 @@
### Analysis of Rech's (DM30) pangenome with GraffiTE
## We will use here the full GT-sv-sn-GA and filter for 1 hits

################################################################################
# Data prep (bash commands used to prepare the data for R)
# The starting file is the output if Graffite mode GT-svsn-GA DM30_sv-sn_graphaligner.vcf
# 1. list variants with a single TE hit using the pre-genotyping VCF:
# grep -v '#' DM30_sv-sn_pangenome.vcf | grep 'n_hits=1;' | cut -f 3 > DM30_sv-sn_pangenome_1hits_IDs
# 2. filter for this variants and exclude fixed loci (same genotype as reference genome for all samples)
# vcftools --vcf DM30_sv-sn_graphaligner.vcf --snps DM30_sv-sn_pangenome_1hits_IDs --non-ref-ac-any 1 --recode --recode-INFO-all --out DM30_sv-sn_GA_1hits_03142024
# 3. convert filtered VCF into tsv for R analyses
# cat GA_tsv_head <(paste <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 1-7) <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 8 | sed 's/;/\t/g;s/[A-Za-z0-9_-]*=//g;s/\tlowdepth//g' | awk '$2~/>/ {if (NF == 29) {print $3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\tNA\t"$1"\t"$2"\t"} else {print $3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\t"$30"\t"$1"\t"$2}; next} {if (NF == 29) {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\tNA\t"$28"\t"$29} else {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\t"$30}}') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | awk '{ count = 0; for (i=1; i<=NF; i++) { if ($i == "0/0" || $i == "N/A") count++}; {print 30-count}}') | awk -v OFS="\t" '$1=$1') | awk '/POS/ {print $0"\thasMissing"; next} /N\/A/ {print $0"\tTRUE"; next} {print $0"\tFALSE"}' > DM30_sv-sn_GA_1hits_03142024.recode.NAomitCounts.tsv
# 4. header for this file is:
# CHROM POS ID REF ALT QUAL FILTER CIEND CIPOS CHR2 END SVLEN SVMETHOD SVTYPE SUPP_VEC SUPP STRANDS sniffles2_SUPP sniffles2_SVLEN sniffles2_SVTYPE sniffles2_ID svim-asm_SUPP svim-asm_SVLEN svim-asm_SVTYPE svim-asm_ID n_hits match_lengths TEname matching_classes fragmts RM_hit_strands RM_hit_IDs total_match_length total_match_span TSD DP AT AKA-017 AKA-018 COR-014 COR-018 COR-023 JUT-008 KIE-094 RAL-059 COR-025 GIM-012 GIM-024 JUT-011 LUN-004 LUN-007 MUN-008 MUN-009 MUN-013 MUN-015 MUN-020 RAL-091 RAL-176 RAL-177 RAL-375 RAL-426 RAL-737 RAL-855 SLA-001 STO-022 TEN-015 TOM-008 GraphAligner_genome_counts
################################################################################

### R analyses start here:

# Load R libraries
library(ggplot2)
library(ggrepel)
library(dplyr)
library(grDevices)

# load main table converted from the GraffiTE VCF
GTDM<-read.table("DM30_sv-sn_graphaligner_1hits_08212023_withCounts_withHeader.tsv",
#GTDM<-read.table("DM30_sv-sn_graphaligner_1hits_08212023_withCounts_withHeader.tsv",
# h = T)
options(scipen=999) # prevent scientific notation of genomic position and other large integers
GTDM<-read.table("DM30_sv-sn_GA_1hits_03142024.recode.NAomitCounts.tsv",
h = T)
length(subset(GTDM, (GTDM$n_hits == 1))[,1])
# # subset to trusted pME (n_hits = 1 and not fixed relative to dm6 after GraphAligner)
GTDM.f<-subset(GTDM, (GTDM$n_hits == 1 & GTDM$GraphAligner_genome_counts != 0))

# join the TE classification
# check that the input file is already filtered for 1 hits only
# length(GTDM[,1])
# length(subset(GTDM, (GTDM$n_hits == 1))[,1])
# length(subset(GTDM, (GTDM$n_hits == 1 & GTDM$GraphAligner_genome_counts != 0))[,1])
# for code legacy and to avoid bug, rename input with .f
GTDM.f<-GTDM
# load and join the TE classification
MCTE<-read.table("Rech_TE_list.txt", h = T)
colnames(GTDM.f)[31]<-"TEname"
GTDM.f<-merge(GTDM.f, MCTE, by = "TEname")
# list the chr we will use
DMchrs<-c("NC_004354.4", "NT_033779.5","NT_033778.4", "NT_037436.4", "NT_033777.3")
GTDM.f<-GTDM.f[GTDM.f$CHROM %in% DMchrs,]
# remove loci with missing genotypes
GTDM.f<-GTDM.f[GTDM.f$hasMissing == FALSE,]

# annotate euchromatic and heterochromatic pME
Euch.bed<-read.table("/Users/clementgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/GraffiTE/Comparison_GraffiTE_Rech_DM30_plus_ISO-1/GraffiTE_VCFS/Euchromatic_regions_RECH2022.bed")
Euch.bed<-read.table("Euchromatic_regions_RECH2022.bed")
GTDM.f.bed<-data.frame(cbind(GTDM.f$CHROM, GTDM.f$POS, GTDM.f$POS+1,GTDM.f$ID))
Euch.GTDM.f<-bedtoolsr::bt.intersect(GTDM.f.bed, Euch.bed)[,4]
GTDM.f$region<-ifelse(GTDM.f$ID %in% Euch.GTDM.f, "euch", "heter")
Expand All @@ -36,7 +60,7 @@ Het<-data.frame(chr = c("2L", "2R", "3L", "3R", "X"),
X2_s = c(18870000, 24972477, 19026900, 31614278, 21338973),
X2_e = c(23513712, 25286936, 28110227, 32079331, 23542271))
# get dm6 annotation for background densities
rpmsk<-read.table("/Users/clementgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/GraffiTE/Comparison_GraffiTE_Rech_DM30_plus_ISO-1/dm6_rpmsk_nochr.bed")
rpmsk<-read.table("dm6_rpmsk_nochr.bed")
names(rpmsk)<-c("chr", "start", "end", "strand", "name", "Order", "SuperFam")
rpmsk.TEs<-rpmsk[!rpmsk$Order %in% c("Simple_repeat", "Low_complexity", "Satellite"),]
rpmsk.reps<-rpmsk[rpmsk$Order %in% c("Simple_repeat", "Low_complexity", "Satellite"),]
Expand Down Expand Up @@ -85,7 +109,7 @@ DMpME<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",])+
)

cov_all$type<-factor(cov_all$type, levels = c("TEs", "simple_repeats"), labels = c("TEs", "other repeats"))
label(cov_all$type)
#label(cov_all$type)
DM_reps<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",])+
geom_rect(dat = Het, aes(xmin = X1_s/1000000, xmax = X1_e/1000000, ymin = 0, ymax = 1), fill = "grey90")+
geom_rect(dat = Het, aes(xmin = X2_s/1000000, xmax = X2_e/1000000, ymin = 0, ymax = 1), fill = "grey90")+
Expand Down Expand Up @@ -155,7 +179,22 @@ bpDM<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",], aes(x = factor(SuperFamily
reverse = T)
)

######### Saturation curves ##################
# export list of analysed variants
write.table(as.data.frame(GTDM.f$ID), "analyzed_variants", quote = F, row.names = F, col.names = F)

### Saturation curves:

################################################################################
# for this analysis, we are using a tool we created early on that require the
# variants to be encoded in the SUPP_VEC of the VCF. Since we are working on the
# genotyped variants (not the detected variants), we needed to recode the SUPP_VEC
# accordingly
#
# # first filter VCF to match the filtered list of variant
# vcftools --vcf DM30_sv-sn_GA_1hits_03142024.recode.vcf --snps analyzed_variants --recode --recode-INFO-all --out DM30_sv-sn_GA_1hits_03142024__analyzed
# # second replace the SUPP_VEC using GA genotypes
# cat <(grep '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf) <(paste <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | sed 's/SUPP_VEC=/SUPP_VEC=\t/g' | cut -f 1-8) <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | sed -e 's/0\/0/0/g' -e 's/N\/A/0/g' -e 's/.\/./1/g' -e 's/ //g') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | sed 's/[0-1]*;SUPP=/\t;SUPP=/g' | cut -f 9-) | sed 's/SUPP_VEC=\t/SUPP_VEC=/g;s/\t;SUPP/;SUPP/g') > DM30_sv-sn_GA_1hits_03142024__analyzed_GA_SUPP.recode.vcf
################################################################################

library(vcfR)
library(ggrepel)
Expand All @@ -166,9 +205,9 @@ library(reshape2)
library(ade4)
library(stringr)

# we need to load these functions first
theme_set(theme_bw(base_size = 14))

# functions to read VCF's support vector and plot the saturation curves
discovery_curve <- function(ref_vcf, type = c("polymorphism", "TE"), nperm = 100) {
vcf_matrix <- lapply(str_split(extract.info(ref_vcf, "SUPP_VEC"), ""), as.numeric)
na_entries <- is.na(vcf_matrix)
Expand Down Expand Up @@ -215,7 +254,6 @@ discovery_curve <- function(ref_vcf, type = c("polymorphism", "TE"), nperm = 100
pca = pca_matrix
))
}

filter_vcf_freq <- function(ref_vcf, min_freq = 0, max_freq = Inf, nhits = c("any", "single")) {
vcf_freq <- lapply(str_split(extract.info(ref_vcf, "SUPP_VEC"), ""), as.numeric) %>%
simplify2array() %>%
Expand All @@ -232,7 +270,6 @@ filter_vcf_freq <- function(ref_vcf, min_freq = 0, max_freq = Inf, nhits = c("an
ref_vcf[vcf_freq >= min_freq & vcf_freq <= max_freq]
}
}

plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphism", "TE"), mode = c("cristian", "clem")) {

n_genomes <- str_length(extract.info(vcf, "SUPP_VEC")[1])
Expand Down Expand Up @@ -303,9 +340,8 @@ plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphis
theme(legend.position = c(60, 1000))
}
}


vcfDM<-read.vcfR("DM30_sv-sn_graphaligner_1hits_08212023_GA_SUPP.recode.vcf")
# load the VCF
vcfDM<-read.vcfR("DM30_sv-sn_GA_1hits_03142024__analyzed_GA_SUPP.recode.vcf")
t_2_29<-plot_curves(vcfDM, rare_freq = 2/30, fixed_freq = 29/30, nperm = 100, type = "polymorphism", mode = "clem")


Expand All @@ -314,4 +350,38 @@ CP2<-cowplot::plot_grid(t_2_29, bpDM,
rel_widths = c(0.3, 0.7),
ncol = 2
)

# final plot
cowplot::plot_grid(CP1, CP2, ncol = 1)

### Comparison with short reads:

################################################################################
# data prep to match long-reads analyses
#
# 1. First, I need to filter the pangenie VCF in the same fashion: 1 hits, no fixed
# ```sh
# vcftools --vcf DM30_sv-sn_pangenie.vcf --snps DM30_sv-sn_pangenome_1hits_IDs --non-ref-ac-any 1 --recode --recode-INFO-all --out DM30_sv-sn_PA_1hits_03192024
# ```
# After filtering, kept 10950 out of a possible 32909 Site
#
# 2. Identify missing data -- we don't need to parse the info field this time (we just need to positions and the genotypes)
# ```sh
# paste <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 1-7) <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }') <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | awk '{ count = 0; for (i=1; i<=NF; i++) { if ($i == "0/0" || $i == "N/A") count++}; {print 30-count}}') | awk -v OFS="\t" '$1=$1' | awk '/POS/ {print $0"\thasMissing"; next} /N\/A/ {print $0"\tTRUE"; next} {print $0"\tFALSE"}' | awk '{print $1"\t"$2"\t"$3"\t"$(NF-1)"\t"$NF}' > DM30_sv-sn_PG_1hits_03192024.simple.NAomitCounts.tsv
# ```
################################################################################

GTDMshort<-read.table("DM30_sv-sn_PG_1hits_03192024.simple.NAomitCounts.tsv")
names(GTDMshort)<-c("CHROM", "POS", "ID", "PGcount", "hasMissing")
# filter for main chromosomes
GTDMshort<-GTDMshort[GTDMshort$CHROM %in% DMchrs,]
# remove loci with missing genotypes
GTDMshort<-GTDMshort[GTDMshort$hasMissing == FALSE,]

library(ggVennDiagram)
vlist<-list(
GTlong=GTDM.f$ID,
GTshort=GTDMshort$ID)
ggVennDiagram(vlist) + scale_fill_gradient(low="grey99",high = "red")

length(GTDMshort$ID)
43 changes: 41 additions & 2 deletions paper/Applications_Examples/HUMAN_HPRC/GraffiTE_HPRC_pangenome.R
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ HPRCvcf_F<-HPRCvcf[Lfilter & HitsFilter & TEclassFilter & SVAVNTF_filter]
## Saturation
Sat<-plot_curves(HPRCvcf_F, rare_freq = 0.05, fixed_freq = 0.95, nperm = 100, type = "polymorphism", mode = "clem")
## PCoA
HPRCmeta<-read.table("~/Documents/GraffiTE/HPRC_metadata.txt", h = T, sep = "\t")
HPRCmeta<-read.table("HPRC_metadata.txt", h = T, sep = "\t")
HPRCmeta2<-HPRCmeta[rep(seq_len(nrow(HPRCmeta)), each = 2), ]
HPRCmeta2$ID<-paste(HPRCmeta2$Sample, rep(c("mat", "pat"), 47), sep = "_")
HPRCmeta2<-HPRCmeta2[order(HPRCmeta2$Sample),]
Expand Down Expand Up @@ -290,4 +290,43 @@ Counts<-ggplot(TElonger, aes(x = SubCode, y = Count))+

Aplot<-cowplot::plot_grid(Sat, PCoA, rel_widths = c(0.35,0.65))
Bplot<-cowplot::plot_grid( NULL, Counts, rel_widths = c(0,1))
cowplot::plot_grid(Aplot, Bplot, nrow = 2)
cowplot::plot_grid(Aplot, Bplot, nrow = 2)

### Comparison with Minigraph-Cactus calls
# 1. We created a VCF derived from the GFA of Liao at al (HPRC paper)
# 2. We used the --vcf input of GraffiTE to annotate this VCF for pMEs.
# 3. We removed duplicates SV from the VCF: each SNP/INDEL replicate (which occurs often at poly-A tails).
# grep -v '#' MiniCac_HPRC_pangenome.vcf | cut -f 1,2,8,3 | sed 's/repeat_ids=/\t/g;s/;matching_classes=/\t/g' | awk '{print $1"_"$2"_"$5"\t"$3}'| sort -u -k1,1 | cut -f 2 > MiniCac_HPRC_pangenome__variants_to_keep.txt
# vcftools --vcf MiniCac_HPRC_pangenome_.vcf --snps MiniCac_HPRC_pangenome__variants_to_keep.txt --recode --recode-INFO-all --out MiniCac_HPRC_pangenome_preR_filter
# 4. This filtered VCF is the input for these analyses

MiniCacvcf<-read.vcfR("MiniCac_HPRC_pangenome_preR_filter.recode.vcf")
MiniCacvcffilter<-abs(as.numeric(extract.info(MiniCacvcf, "LEN"))) >= 250
MiniCacvcfHitsFilter<-as.numeric(extract.info(MiniCacvcf, "n_hits")) == 1 | extract.info(MiniCacvcf, "mam_filter_1") != "None"
MiniCacvcfSVAVNTF_filter<-extract.info(MiniCacvcf, "mam_filter_2") == "None"
MiniCacvcfTEclassFilter<-grepl(paste(c("Alu", "L1", "SVA"), collapse = "|"), extract.info(MiniCacvcf, "repeat_ids"))
MiniCacvcfHPRCvcf_F<-MiniCacvcf[MiniCacvcffilter & MiniCacvcfHitsFilter & MiniCacvcfSVAVNTF_filter & MiniCacvcfTEclassFilter]
write.vcf(MiniCacvcfHPRCvcf_F, "/Users/clementgoubert/Library/CloudStorage/[email protected]/Other computers/Zephyrantes/GraffiTE/paper_datasets/Applications_Examples/HUMAN_HPRC/MiniCac_HPRC_pangenome_R_filtered.vcf")
# export the GraffiTE one
write.vcf(HPRCvcf_F, "/Users/clementgoubert/Library/CloudStorage/[email protected]/Other computers/Zephyrantes/GraffiTE/paper_datasets/Applications_Examples/HUMAN_HPRC/HPRC_pangenome_R_filtered.vcf")

# prep for comparison with SURVIVOR
# bcftools view HPRC_pangenome_R_filtered.vcf > HPRC_pangenome_R_filtered_.vcf
# bcftools view MiniCac_HPRC_pangenome_R_filtered.vcf > MiniCac_HPRC_pangenome_R_filtered_.vcf
# ls HPRC_pangenome_R_filtered_.vcf MiniCac_HPRC_pangenome_R_filtered_.vcf > vcfs.txt
# ~/SURVIVOR/Debug/SURVIVOR merge vcfs.txt 0.1 0 0 0 0 100 HPRC_filtered_merged.vcf
# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=10' # This is the count of GraffiTE/svim-asm specific variants
# 6270
# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=01' # This is the count of Minigraph/Cactus specific variants
# 788
# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=11' # This is the count of shared variants between both methods using the same filters
# 8451

library(VennDiagram)
draw.pairwise.venn(area1 = 6270+8451,
area2 = 788+8451,
cross.area = 8451,category = c("minmap2/svim-asm", "Minigraph/Cactus"),
euler.d = T,
scaled = T,
fill = c("grey99", "red"),fontfamily = rep("sans", 3),cat.fontfamily = rep("sans",2))

Loading

0 comments on commit 8f44c22

Please sign in to comment.