update paper scripts

cgroza · May 29, 2024 · 8f44c22 · 8f44c22
1 parent c61c336
commit 8f44c22
Show file tree

Hide file tree

Showing 8 changed files with 375 additions and 211 deletions.
diff --git a/paper/Applications_Examples/CSATIVA/Cs_GraffiTE.R b/paper/Applications_Examples/CSATIVA/Cs_GraffiTE.R
@@ -148,7 +148,7 @@ plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphis
 
 #### CLUSTERS ####
 
-ClusAll<-read.table("~/Documents/Csativa/GraffiTE/Cs_GraffiTE_280423_pangenome.variants_clustered_mode0_with_ANNOTATION", h = F)
+ClusAll<-read.table("Cs_GraffiTE_280423_pangenome.variants_clustered_mode0_with_ANNOTATION", h = F)
 #head(ClusAll)
 names(ClusAll)<-c("cluster", "member", "SUPP", "SUPP_VEC", "SVLEN", "n_hits", "n_frags", "hit_length", "hit_names", "hit_class", "hit_strands", "frags", "cum_hits_len", "SV_TE_cov", "TSD")
 Ncount<-tapply(ClusAll$member, ClusAll$cluster, length)
@@ -166,7 +166,7 @@ ClusN3<-ClusAll %>%
     ) %>%
   filter(Count > 2) %>%
   arrange(desc(Count), desc(cumSVLEN))
-write.table(ClusN3, file = "~/Documents/Csativa/GraffiTE/ClusN3.tsv", quote = F, row.names = F)
+write.table(ClusN3, file = "ClusN3.tsv", quote = F, row.names = F)
 
 # All N >= 3
 barplot(height = ClusN3$Count, width = ClusN3$cumSVLEN)
@@ -179,7 +179,43 @@ axis(side = 1, at = seq(0,250, by = 1))
 
 
 #### VCF ####
-vcf_N3<-read.vcfR("/Users/cgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/N3_200_40000bp_clusters_pangenome.vcf.recode.vcf")
+vcf_N3<-read.vcfR("N3_200_40000bp_clusters_pangenome.vcf.recode.vcf")
 
 # Saturation
-plot_curves(vcf_N3, 1/9, 9/9, 100, "polymorphism", "clem")
+plot_curves(vcf_N3, 1/9, 9/9, 100, "polymorphism", "clem")
+
+#### Comparison with GT-svsn ####
+# # gather the vcfs
+# cp ../Cs_GT-sv/3_TSD_search/pangenome.vcf Cs_GT-sv_pangenome.vcf
+# cp ../Cs_GT-sn/3_TSD_search/pangenome.vcf Cs_GT-sn_pangenome.vcf
+# # list and merge with same parameteres as GraffiTE
+# ls *.vcf > vcfs.txt
+# ~/bin/SURVIVOR/Debug/SURVIVOR merge vcfs.txt 0.1 0 0 0 0 100 Cs_GT-svsn_merged.vcf
+# # MMseqs
+# # first gather the variants
+# grep -v '#' Cs_GT-svsn_merged.vcf | awk '/SVTYPE=INS/ {print ">"$3; print $5; next} /SVTYPE=DEL/ {print ">"$3; print $4; next}' > Cs_GT-svsn_variants.fasta
+# salloc --time=2:00:0 --cpus-per-task=20 --account=rrg-bourqueg-ad --mem-per-cpu=5Gb
+# module load mmseqs2
+# mmseqs easy-cluster Cs_GT-svsn_variants.fasta Cs_GT-svsn_variants.MMseqs2 tmp --cov-mode 0 -c 0.8 --min-seq-id 0.8 -s 100 --exact-kmer-matching 1 --threads 20
+# # list the cluster with 3+ sequences
+# cut -f 1 Cs_GT-svsn_variants.MMseqs2_cluster.tsv | sort | uniq -c | awk '$1 > 2' > Cs_GT-svsn_variants.MMseqs2_N3clusters
+# # get the list of loci members of these clusters
+# Cs_GT-svsn]$ join -12 -21 <(sort -k2,2 Cs_GT-svsn_variants.MMseqs2_N3clusters) <(sort -k1,1 Cs_GT-svsn_variants.MMseqs2_cluster.tsv) | awk '{print $3}' >  Cs_GT-svsn_variants.MMseqs2_N3members
+# # next I will filter the VCF line to keep these, and with size 200-40000
+# join -11 -23 -o 1.1,2.8 <(sort -k1,1 Cs_GT-svsn_variants.MMseqs2_N3members) <(grep -v '#' Cs_GT-svsn_merged.vcf | sort -k3,3) | sed 's/;SVLEN=/\t/g;s/;SVTYPE=/\t/g;s/-//g;s/;SUPP_VEC=/\t/g' | awk '{print $1"\t"$3"\t"$4}' | awk '$3 >= 200 && $3 <= 40000' | cut -f 2 | sort | uniq -c
+# # 10 is Sniffles only, 11 both and, 01 svim only
+# 25142 01
+# 19140 10
+# 17442 11
+
+GTsv<-25142+17442
+GTsn<-19140+17442
+both<-17442
+#install.packages("VennDiagram")
+library(VennDiagram)
+VennDiagram::draw.pairwise.venn(GTsv, GTsn,both,
+                                euler.d = T,
+                                category = c("GT-sv\n(assemblies)", "GT-sn\n(long reads)"),
+                                col = c("purple", "green3"),fill = c("purple", "green3"),
+                                fontfamily = "sans",cat.fontfamily = "sans",scaled = T,
+                                )
diff --git a/paper/Applications_Examples/DMEL_RECH2022/GraffiTE_DM30.pME_Figures.R b/paper/Applications_Examples/DMEL_RECH2022/GraffiTE_DM30.pME_Figures.R
@@ -1,26 +1,50 @@
 ### Analysis of Rech's (DM30) pangenome with GraffiTE
 ## We will use here the full GT-sv-sn-GA and filter for 1 hits 
+
+################################################################################
+# Data prep (bash commands used to prepare the data for R)
+# The starting file is the output if Graffite mode GT-svsn-GA DM30_sv-sn_graphaligner.vcf
+# 1. list variants with a single TE hit using the pre-genotyping VCF:
+# grep -v '#' DM30_sv-sn_pangenome.vcf | grep 'n_hits=1;' | cut -f 3 > DM30_sv-sn_pangenome_1hits_IDs
+# 2. filter for this variants and exclude fixed loci (same genotype as reference genome for all samples)
+# vcftools --vcf  DM30_sv-sn_graphaligner.vcf --snps DM30_sv-sn_pangenome_1hits_IDs --non-ref-ac-any 1 --recode --recode-INFO-all --out DM30_sv-sn_GA_1hits_03142024
+# 3. convert filtered VCF into tsv for R analyses
+# cat GA_tsv_head <(paste <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 1-7) <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 8 | sed 's/;/\t/g;s/[A-Za-z0-9_-]*=//g;s/\tlowdepth//g' | awk '$2~/>/ {if (NF == 29) {print $3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\tNA\t"$1"\t"$2"\t"} else {print $3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\t"$30"\t"$1"\t"$2}; next}  {if (NF == 29) {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\tNA\t"$28"\t"$29} else {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\t"$30}}') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | awk '{ count = 0; for (i=1; i<=NF; i++) { if ($i == "0/0" || $i == "N/A") count++}; {print 30-count}}') | awk -v OFS="\t" '$1=$1') | awk '/POS/ {print $0"\thasMissing"; next} /N\/A/ {print $0"\tTRUE"; next} {print $0"\tFALSE"}' > DM30_sv-sn_GA_1hits_03142024.recode.NAomitCounts.tsv
+# 4. header for this file is:
+# CHROM	POS	ID	REF	ALT	QUAL	FILTER	CIEND	CIPOS	CHR2	END	SVLEN	SVMETHOD	SVTYPE	SUPP_VEC	SUPP	STRANDS	sniffles2_SUPP	sniffles2_SVLEN	sniffles2_SVTYPE	sniffles2_ID	svim-asm_SUPP	svim-asm_SVLEN	svim-asm_SVTYPE	svim-asm_ID	n_hits	match_lengths	TEname	matching_classes	fragmts	RM_hit_strands	RM_hit_IDs	total_match_length	total_match_span	TSD	DP	AT	AKA-017	AKA-018	COR-014	COR-018	COR-023	JUT-008	KIE-094	RAL-059	COR-025	GIM-012	GIM-024	JUT-011	LUN-004	LUN-007	MUN-008	MUN-009	MUN-013	MUN-015	MUN-020	RAL-091	RAL-176	RAL-177	RAL-375	RAL-426	RAL-737	RAL-855	SLA-001	STO-022	TEN-015	TOM-008 GraphAligner_genome_counts
+################################################################################
+
+### R analyses start here:
+
+# Load R libraries
 library(ggplot2)
 library(ggrepel)
 library(dplyr)
 library(grDevices)
 
 # load main table converted from the GraffiTE VCF
-GTDM<-read.table("DM30_sv-sn_graphaligner_1hits_08212023_withCounts_withHeader.tsv", 
+#GTDM<-read.table("DM30_sv-sn_graphaligner_1hits_08212023_withCounts_withHeader.tsv", 
+#                 h = T)
+options(scipen=999) # prevent scientific notation of genomic position and other large integers
+GTDM<-read.table("DM30_sv-sn_GA_1hits_03142024.recode.NAomitCounts.tsv", 
                  h = T)
-length(subset(GTDM, (GTDM$n_hits == 1))[,1])
-# # subset to trusted pME (n_hits = 1 and not fixed relative to dm6 after GraphAligner)
-GTDM.f<-subset(GTDM, (GTDM$n_hits == 1 & GTDM$GraphAligner_genome_counts != 0))
-
-# join the TE classification
+# check that the input file is already filtered for 1 hits only
+# length(GTDM[,1])
+# length(subset(GTDM, (GTDM$n_hits == 1))[,1])
+# length(subset(GTDM, (GTDM$n_hits == 1 & GTDM$GraphAligner_genome_counts != 0))[,1])
+# for code legacy and to avoid bug, rename input with .f
+GTDM.f<-GTDM
+# load and join the TE classification
 MCTE<-read.table("Rech_TE_list.txt", h = T)
-colnames(GTDM.f)[31]<-"TEname"
 GTDM.f<-merge(GTDM.f, MCTE, by = "TEname")
 # list the chr we will use
 DMchrs<-c("NC_004354.4", "NT_033779.5","NT_033778.4", "NT_037436.4", "NT_033777.3")
 GTDM.f<-GTDM.f[GTDM.f$CHROM %in% DMchrs,]
+# remove loci with missing genotypes
+GTDM.f<-GTDM.f[GTDM.f$hasMissing == FALSE,]
+
 # annotate euchromatic and heterochromatic pME
-Euch.bed<-read.table("/Users/clementgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/GraffiTE/Comparison_GraffiTE_Rech_DM30_plus_ISO-1/GraffiTE_VCFS/Euchromatic_regions_RECH2022.bed")
+Euch.bed<-read.table("Euchromatic_regions_RECH2022.bed")
 GTDM.f.bed<-data.frame(cbind(GTDM.f$CHROM, GTDM.f$POS, GTDM.f$POS+1,GTDM.f$ID))
 Euch.GTDM.f<-bedtoolsr::bt.intersect(GTDM.f.bed, Euch.bed)[,4]
 GTDM.f$region<-ifelse(GTDM.f$ID %in% Euch.GTDM.f, "euch", "heter")
@@ -36,7 +60,7 @@ Het<-data.frame(chr = c("2L", "2R", "3L", "3R", "X"),
                 X2_s = c(18870000, 24972477, 19026900, 31614278, 21338973),
                 X2_e = c(23513712, 25286936, 28110227, 32079331, 23542271))
 # get dm6 annotation for background densities
-rpmsk<-read.table("/Users/clementgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/GraffiTE/Comparison_GraffiTE_Rech_DM30_plus_ISO-1/dm6_rpmsk_nochr.bed")
+rpmsk<-read.table("dm6_rpmsk_nochr.bed")
 names(rpmsk)<-c("chr", "start", "end", "strand", "name", "Order", "SuperFam")
 rpmsk.TEs<-rpmsk[!rpmsk$Order %in% c("Simple_repeat", "Low_complexity", "Satellite"),]
 rpmsk.reps<-rpmsk[rpmsk$Order %in% c("Simple_repeat", "Low_complexity", "Satellite"),]
@@ -85,7 +109,7 @@ DMpME<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",])+
         )
 
 cov_all$type<-factor(cov_all$type, levels = c("TEs", "simple_repeats"), labels = c("TEs", "other repeats"))
-label(cov_all$type)
+#label(cov_all$type)
 DM_reps<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",])+
   geom_rect(dat = Het, aes(xmin = X1_s/1000000, xmax = X1_e/1000000, ymin = 0, ymax = 1), fill = "grey90")+
   geom_rect(dat = Het, aes(xmin = X2_s/1000000, xmax = X2_e/1000000, ymin = 0, ymax = 1), fill = "grey90")+
@@ -155,7 +179,22 @@ bpDM<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",], aes(x = factor(SuperFamily
                              reverse = T)
          )
 
-######### Saturation curves ##################
+# export list of analysed variants
+write.table(as.data.frame(GTDM.f$ID), "analyzed_variants", quote = F, row.names = F, col.names = F)
+
+### Saturation curves:
+
+################################################################################
+# for this analysis, we are using a tool we created early on that require the
+# variants to be encoded in the SUPP_VEC of the VCF. Since we are working on the
+# genotyped variants (not the detected variants), we needed to recode the SUPP_VEC
+# accordingly
+#
+# # first filter VCF to match the filtered list of variant
+# vcftools --vcf DM30_sv-sn_GA_1hits_03142024.recode.vcf --snps analyzed_variants --recode --recode-INFO-all --out DM30_sv-sn_GA_1hits_03142024__analyzed
+# # second replace the SUPP_VEC using GA genotypes
+# cat <(grep '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf) <(paste <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | sed 's/SUPP_VEC=/SUPP_VEC=\t/g' | cut -f 1-8) <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | sed -e 's/0\/0/0/g' -e 's/N\/A/0/g' -e 's/.\/./1/g' -e 's/ //g') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | sed 's/[0-1]*;SUPP=/\t;SUPP=/g' | cut -f 9-) | sed 's/SUPP_VEC=\t/SUPP_VEC=/g;s/\t;SUPP/;SUPP/g') > DM30_sv-sn_GA_1hits_03142024__analyzed_GA_SUPP.recode.vcf
+################################################################################
 
 library(vcfR)
 library(ggrepel)
@@ -166,9 +205,9 @@ library(reshape2)
 library(ade4)
 library(stringr)
 
-# we need to load these functions first
 theme_set(theme_bw(base_size = 14))
 
+# functions to read VCF's support vector and plot the saturation curves
 discovery_curve <- function(ref_vcf, type = c("polymorphism", "TE"), nperm = 100) {
   vcf_matrix <- lapply(str_split(extract.info(ref_vcf, "SUPP_VEC"), ""), as.numeric)
   na_entries <- is.na(vcf_matrix)
@@ -215,7 +254,6 @@ discovery_curve <- function(ref_vcf, type = c("polymorphism", "TE"), nperm = 100
     pca = pca_matrix
   ))
 }
-
 filter_vcf_freq <- function(ref_vcf, min_freq = 0, max_freq = Inf, nhits = c("any", "single")) {
   vcf_freq <- lapply(str_split(extract.info(ref_vcf, "SUPP_VEC"), ""), as.numeric) %>%
     simplify2array() %>%
@@ -232,7 +270,6 @@ filter_vcf_freq <- function(ref_vcf, min_freq = 0, max_freq = Inf, nhits = c("an
     ref_vcf[vcf_freq >= min_freq & vcf_freq <= max_freq]
   }
 }
-
 plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphism", "TE"), mode = c("cristian", "clem")) {
 
   n_genomes <- str_length(extract.info(vcf, "SUPP_VEC")[1])
@@ -303,9 +340,8 @@ plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphis
       theme(legend.position = c(60, 1000))
   }
 }
-
-
-vcfDM<-read.vcfR("DM30_sv-sn_graphaligner_1hits_08212023_GA_SUPP.recode.vcf")
+# load the VCF
+vcfDM<-read.vcfR("DM30_sv-sn_GA_1hits_03142024__analyzed_GA_SUPP.recode.vcf")
 t_2_29<-plot_curves(vcfDM, rare_freq = 2/30, fixed_freq = 29/30, nperm = 100, type = "polymorphism", mode = "clem")
 
 
@@ -314,4 +350,38 @@ CP2<-cowplot::plot_grid(t_2_29, bpDM,
                    rel_widths = c(0.3, 0.7),
                    ncol = 2
                    )
+
+# final plot
 cowplot::plot_grid(CP1, CP2, ncol = 1)
+
+### Comparison with short reads:
+
+################################################################################
+# data prep to match long-reads analyses
+#
+# 1. First, I need to filter the pangenie VCF in the same fashion: 1 hits, no fixed
+# ```sh
+# vcftools --vcf  DM30_sv-sn_pangenie.vcf --snps DM30_sv-sn_pangenome_1hits_IDs --non-ref-ac-any 1 --recode --recode-INFO-all --out DM30_sv-sn_PA_1hits_03192024
+# ```
+# After filtering, kept 10950 out of a possible 32909 Site
+# 
+# 2. Identify missing data -- we don't need to parse the info field this time (we just need to positions and the genotypes)
+# ```sh
+# paste <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 1-7)  <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }') <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | awk '{ count = 0; for (i=1; i<=NF; i++) { if ($i == "0/0" || $i == "N/A") count++}; {print 30-count}}') | awk -v OFS="\t" '$1=$1' | awk '/POS/ {print $0"\thasMissing"; next} /N\/A/ {print $0"\tTRUE"; next} {print $0"\tFALSE"}' | awk '{print $1"\t"$2"\t"$3"\t"$(NF-1)"\t"$NF}' > DM30_sv-sn_PG_1hits_03192024.simple.NAomitCounts.tsv
+# ```
+################################################################################
+
+GTDMshort<-read.table("DM30_sv-sn_PG_1hits_03192024.simple.NAomitCounts.tsv")
+names(GTDMshort)<-c("CHROM", "POS", "ID", "PGcount", "hasMissing")
+# filter for main chromosomes
+GTDMshort<-GTDMshort[GTDMshort$CHROM %in% DMchrs,]
+# remove loci with missing genotypes
+GTDMshort<-GTDMshort[GTDMshort$hasMissing == FALSE,]
+
+library(ggVennDiagram)
+vlist<-list(
+  GTlong=GTDM.f$ID,
+  GTshort=GTDMshort$ID)
+ggVennDiagram(vlist) + scale_fill_gradient(low="grey99",high = "red")
+
+length(GTDMshort$ID)
diff --git a/paper/Applications_Examples/HUMAN_HPRC/GraffiTE_HPRC_pangenome.R b/paper/Applications_Examples/HUMAN_HPRC/GraffiTE_HPRC_pangenome.R
@@ -162,7 +162,7 @@ HPRCvcf_F<-HPRCvcf[Lfilter & HitsFilter & TEclassFilter & SVAVNTF_filter]
 ## Saturation
 Sat<-plot_curves(HPRCvcf_F, rare_freq = 0.05, fixed_freq = 0.95, nperm = 100, type = "polymorphism", mode = "clem")
 ## PCoA
-HPRCmeta<-read.table("~/Documents/GraffiTE/HPRC_metadata.txt", h = T, sep = "\t")
+HPRCmeta<-read.table("HPRC_metadata.txt", h = T, sep = "\t")
 HPRCmeta2<-HPRCmeta[rep(seq_len(nrow(HPRCmeta)), each = 2), ]
 HPRCmeta2$ID<-paste(HPRCmeta2$Sample, rep(c("mat", "pat"), 47), sep = "_")
 HPRCmeta2<-HPRCmeta2[order(HPRCmeta2$Sample),]
@@ -290,4 +290,43 @@ Counts<-ggplot(TElonger, aes(x = SubCode, y = Count))+
 
 Aplot<-cowplot::plot_grid(Sat, PCoA, rel_widths = c(0.35,0.65))
 Bplot<-cowplot::plot_grid( NULL, Counts, rel_widths = c(0,1))
-cowplot::plot_grid(Aplot, Bplot, nrow = 2)
+cowplot::plot_grid(Aplot, Bplot, nrow = 2)
+
+### Comparison with Minigraph-Cactus calls
+# 1. We created a VCF derived from the GFA of Liao at al (HPRC paper)
+# 2. We used the --vcf input of GraffiTE to annotate this VCF for pMEs.
+# 3. We removed duplicates SV from the VCF: each SNP/INDEL replicate (which occurs often at poly-A tails).
+# grep -v '#' MiniCac_HPRC_pangenome.vcf | cut -f 1,2,8,3 | sed 's/repeat_ids=/\t/g;s/;matching_classes=/\t/g' | awk '{print $1"_"$2"_"$5"\t"$3}'| sort -u -k1,1 | cut -f 2 > MiniCac_HPRC_pangenome__variants_to_keep.txt
+# vcftools --vcf MiniCac_HPRC_pangenome_.vcf --snps MiniCac_HPRC_pangenome__variants_to_keep.txt --recode --recode-INFO-all --out MiniCac_HPRC_pangenome_preR_filter
+# 4. This filtered VCF is the input for these analyses
+
+MiniCacvcf<-read.vcfR("MiniCac_HPRC_pangenome_preR_filter.recode.vcf")
+MiniCacvcffilter<-abs(as.numeric(extract.info(MiniCacvcf, "LEN"))) >= 250
+MiniCacvcfHitsFilter<-as.numeric(extract.info(MiniCacvcf, "n_hits")) == 1 | extract.info(MiniCacvcf, "mam_filter_1") != "None"
+MiniCacvcfSVAVNTF_filter<-extract.info(MiniCacvcf, "mam_filter_2") == "None"
+MiniCacvcfTEclassFilter<-grepl(paste(c("Alu", "L1", "SVA"), collapse = "|"), extract.info(MiniCacvcf, "repeat_ids"))
+MiniCacvcfHPRCvcf_F<-MiniCacvcf[MiniCacvcffilter & MiniCacvcfHitsFilter & MiniCacvcfSVAVNTF_filter & MiniCacvcfTEclassFilter]
+write.vcf(MiniCacvcfHPRCvcf_F, "/Users/clementgoubert/Library/CloudStorage/[email protected]/Other computers/Zephyrantes/GraffiTE/paper_datasets/Applications_Examples/HUMAN_HPRC/MiniCac_HPRC_pangenome_R_filtered.vcf")
+# export the GraffiTE one
+write.vcf(HPRCvcf_F, "/Users/clementgoubert/Library/CloudStorage/[email protected]/Other computers/Zephyrantes/GraffiTE/paper_datasets/Applications_Examples/HUMAN_HPRC/HPRC_pangenome_R_filtered.vcf")
+
+# prep for comparison with SURVIVOR
+# bcftools view HPRC_pangenome_R_filtered.vcf > HPRC_pangenome_R_filtered_.vcf
+# bcftools view MiniCac_HPRC_pangenome_R_filtered.vcf > MiniCac_HPRC_pangenome_R_filtered_.vcf
+# ls HPRC_pangenome_R_filtered_.vcf MiniCac_HPRC_pangenome_R_filtered_.vcf > vcfs.txt
+# ~/SURVIVOR/Debug/SURVIVOR merge vcfs.txt 0.1 0 0 0 0 100 HPRC_filtered_merged.vcf
+# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=10' # This is the count of GraffiTE/svim-asm specific variants
+# 6270
+# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=01' # This is the count of Minigraph/Cactus specific variants
+# 788
+# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=11' # This is the count of shared variants between both methods using the same filters
+# 8451 
+
+library(VennDiagram)
+draw.pairwise.venn(area1 = 6270+8451,
+                   area2 = 788+8451,
+                   cross.area = 8451,category = c("minmap2/svim-asm", "Minigraph/Cactus"),
+                   euler.d = T,
+                   scaled = T,
+                   fill = c("grey99", "red"),fontfamily = rep("sans", 3),cat.fontfamily = rep("sans",2))
+