-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
375 additions
and
211 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -148,7 +148,7 @@ plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphis | |
|
||
#### CLUSTERS #### | ||
|
||
ClusAll<-read.table("~/Documents/Csativa/GraffiTE/Cs_GraffiTE_280423_pangenome.variants_clustered_mode0_with_ANNOTATION", h = F) | ||
ClusAll<-read.table("Cs_GraffiTE_280423_pangenome.variants_clustered_mode0_with_ANNOTATION", h = F) | ||
#head(ClusAll) | ||
names(ClusAll)<-c("cluster", "member", "SUPP", "SUPP_VEC", "SVLEN", "n_hits", "n_frags", "hit_length", "hit_names", "hit_class", "hit_strands", "frags", "cum_hits_len", "SV_TE_cov", "TSD") | ||
Ncount<-tapply(ClusAll$member, ClusAll$cluster, length) | ||
|
@@ -166,7 +166,7 @@ ClusN3<-ClusAll %>% | |
) %>% | ||
filter(Count > 2) %>% | ||
arrange(desc(Count), desc(cumSVLEN)) | ||
write.table(ClusN3, file = "~/Documents/Csativa/GraffiTE/ClusN3.tsv", quote = F, row.names = F) | ||
write.table(ClusN3, file = "ClusN3.tsv", quote = F, row.names = F) | ||
|
||
# All N >= 3 | ||
barplot(height = ClusN3$Count, width = ClusN3$cumSVLEN) | ||
|
@@ -179,7 +179,43 @@ axis(side = 1, at = seq(0,250, by = 1)) | |
|
||
|
||
#### VCF #### | ||
vcf_N3<-read.vcfR("/Users/cgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/N3_200_40000bp_clusters_pangenome.vcf.recode.vcf") | ||
vcf_N3<-read.vcfR("N3_200_40000bp_clusters_pangenome.vcf.recode.vcf") | ||
|
||
# Saturation | ||
plot_curves(vcf_N3, 1/9, 9/9, 100, "polymorphism", "clem") | ||
plot_curves(vcf_N3, 1/9, 9/9, 100, "polymorphism", "clem") | ||
|
||
#### Comparison with GT-svsn #### | ||
# # gather the vcfs | ||
# cp ../Cs_GT-sv/3_TSD_search/pangenome.vcf Cs_GT-sv_pangenome.vcf | ||
# cp ../Cs_GT-sn/3_TSD_search/pangenome.vcf Cs_GT-sn_pangenome.vcf | ||
# # list and merge with same parameteres as GraffiTE | ||
# ls *.vcf > vcfs.txt | ||
# ~/bin/SURVIVOR/Debug/SURVIVOR merge vcfs.txt 0.1 0 0 0 0 100 Cs_GT-svsn_merged.vcf | ||
# # MMseqs | ||
# # first gather the variants | ||
# grep -v '#' Cs_GT-svsn_merged.vcf | awk '/SVTYPE=INS/ {print ">"$3; print $5; next} /SVTYPE=DEL/ {print ">"$3; print $4; next}' > Cs_GT-svsn_variants.fasta | ||
# salloc --time=2:00:0 --cpus-per-task=20 --account=rrg-bourqueg-ad --mem-per-cpu=5Gb | ||
# module load mmseqs2 | ||
# mmseqs easy-cluster Cs_GT-svsn_variants.fasta Cs_GT-svsn_variants.MMseqs2 tmp --cov-mode 0 -c 0.8 --min-seq-id 0.8 -s 100 --exact-kmer-matching 1 --threads 20 | ||
# # list the cluster with 3+ sequences | ||
# cut -f 1 Cs_GT-svsn_variants.MMseqs2_cluster.tsv | sort | uniq -c | awk '$1 > 2' > Cs_GT-svsn_variants.MMseqs2_N3clusters | ||
# # get the list of loci members of these clusters | ||
# Cs_GT-svsn]$ join -12 -21 <(sort -k2,2 Cs_GT-svsn_variants.MMseqs2_N3clusters) <(sort -k1,1 Cs_GT-svsn_variants.MMseqs2_cluster.tsv) | awk '{print $3}' > Cs_GT-svsn_variants.MMseqs2_N3members | ||
# # next I will filter the VCF line to keep these, and with size 200-40000 | ||
# join -11 -23 -o 1.1,2.8 <(sort -k1,1 Cs_GT-svsn_variants.MMseqs2_N3members) <(grep -v '#' Cs_GT-svsn_merged.vcf | sort -k3,3) | sed 's/;SVLEN=/\t/g;s/;SVTYPE=/\t/g;s/-//g;s/;SUPP_VEC=/\t/g' | awk '{print $1"\t"$3"\t"$4}' | awk '$3 >= 200 && $3 <= 40000' | cut -f 2 | sort | uniq -c | ||
# # 10 is Sniffles only, 11 both and, 01 svim only | ||
# 25142 01 | ||
# 19140 10 | ||
# 17442 11 | ||
|
||
GTsv<-25142+17442 | ||
GTsn<-19140+17442 | ||
both<-17442 | ||
#install.packages("VennDiagram") | ||
library(VennDiagram) | ||
VennDiagram::draw.pairwise.venn(GTsv, GTsn,both, | ||
euler.d = T, | ||
category = c("GT-sv\n(assemblies)", "GT-sn\n(long reads)"), | ||
col = c("purple", "green3"),fill = c("purple", "green3"), | ||
fontfamily = "sans",cat.fontfamily = "sans",scaled = T, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,50 @@ | ||
### Analysis of Rech's (DM30) pangenome with GraffiTE | ||
## We will use here the full GT-sv-sn-GA and filter for 1 hits | ||
|
||
################################################################################ | ||
# Data prep (bash commands used to prepare the data for R) | ||
# The starting file is the output if Graffite mode GT-svsn-GA DM30_sv-sn_graphaligner.vcf | ||
# 1. list variants with a single TE hit using the pre-genotyping VCF: | ||
# grep -v '#' DM30_sv-sn_pangenome.vcf | grep 'n_hits=1;' | cut -f 3 > DM30_sv-sn_pangenome_1hits_IDs | ||
# 2. filter for this variants and exclude fixed loci (same genotype as reference genome for all samples) | ||
# vcftools --vcf DM30_sv-sn_graphaligner.vcf --snps DM30_sv-sn_pangenome_1hits_IDs --non-ref-ac-any 1 --recode --recode-INFO-all --out DM30_sv-sn_GA_1hits_03142024 | ||
# 3. convert filtered VCF into tsv for R analyses | ||
# cat GA_tsv_head <(paste <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 1-7) <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 8 | sed 's/;/\t/g;s/[A-Za-z0-9_-]*=//g;s/\tlowdepth//g' | awk '$2~/>/ {if (NF == 29) {print $3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\tNA\t"$1"\t"$2"\t"} else {print $3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\t"$30"\t"$1"\t"$2}; next} {if (NF == 29) {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\tNA\t"$28"\t"$29} else {print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14"\t"$15"\t"$16"\t"$17"\t"$18"\t"$19"\t"$20"\t"$21"\t"$22"\t"$23"\t"$24"\t"$25"\t"$26"\t"$27"\t"$28"\t"$29"\t"$30}}') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | awk '{ count = 0; for (i=1; i<=NF; i++) { if ($i == "0/0" || $i == "N/A") count++}; {print 30-count}}') | awk -v OFS="\t" '$1=$1') | awk '/POS/ {print $0"\thasMissing"; next} /N\/A/ {print $0"\tTRUE"; next} {print $0"\tFALSE"}' > DM30_sv-sn_GA_1hits_03142024.recode.NAomitCounts.tsv | ||
# 4. header for this file is: | ||
# CHROM POS ID REF ALT QUAL FILTER CIEND CIPOS CHR2 END SVLEN SVMETHOD SVTYPE SUPP_VEC SUPP STRANDS sniffles2_SUPP sniffles2_SVLEN sniffles2_SVTYPE sniffles2_ID svim-asm_SUPP svim-asm_SVLEN svim-asm_SVTYPE svim-asm_ID n_hits match_lengths TEname matching_classes fragmts RM_hit_strands RM_hit_IDs total_match_length total_match_span TSD DP AT AKA-017 AKA-018 COR-014 COR-018 COR-023 JUT-008 KIE-094 RAL-059 COR-025 GIM-012 GIM-024 JUT-011 LUN-004 LUN-007 MUN-008 MUN-009 MUN-013 MUN-015 MUN-020 RAL-091 RAL-176 RAL-177 RAL-375 RAL-426 RAL-737 RAL-855 SLA-001 STO-022 TEN-015 TOM-008 GraphAligner_genome_counts | ||
################################################################################ | ||
|
||
### R analyses start here: | ||
|
||
# Load R libraries | ||
library(ggplot2) | ||
library(ggrepel) | ||
library(dplyr) | ||
library(grDevices) | ||
|
||
# load main table converted from the GraffiTE VCF | ||
GTDM<-read.table("DM30_sv-sn_graphaligner_1hits_08212023_withCounts_withHeader.tsv", | ||
#GTDM<-read.table("DM30_sv-sn_graphaligner_1hits_08212023_withCounts_withHeader.tsv", | ||
# h = T) | ||
options(scipen=999) # prevent scientific notation of genomic position and other large integers | ||
GTDM<-read.table("DM30_sv-sn_GA_1hits_03142024.recode.NAomitCounts.tsv", | ||
h = T) | ||
length(subset(GTDM, (GTDM$n_hits == 1))[,1]) | ||
# # subset to trusted pME (n_hits = 1 and not fixed relative to dm6 after GraphAligner) | ||
GTDM.f<-subset(GTDM, (GTDM$n_hits == 1 & GTDM$GraphAligner_genome_counts != 0)) | ||
|
||
# join the TE classification | ||
# check that the input file is already filtered for 1 hits only | ||
# length(GTDM[,1]) | ||
# length(subset(GTDM, (GTDM$n_hits == 1))[,1]) | ||
# length(subset(GTDM, (GTDM$n_hits == 1 & GTDM$GraphAligner_genome_counts != 0))[,1]) | ||
# for code legacy and to avoid bug, rename input with .f | ||
GTDM.f<-GTDM | ||
# load and join the TE classification | ||
MCTE<-read.table("Rech_TE_list.txt", h = T) | ||
colnames(GTDM.f)[31]<-"TEname" | ||
GTDM.f<-merge(GTDM.f, MCTE, by = "TEname") | ||
# list the chr we will use | ||
DMchrs<-c("NC_004354.4", "NT_033779.5","NT_033778.4", "NT_037436.4", "NT_033777.3") | ||
GTDM.f<-GTDM.f[GTDM.f$CHROM %in% DMchrs,] | ||
# remove loci with missing genotypes | ||
GTDM.f<-GTDM.f[GTDM.f$hasMissing == FALSE,] | ||
|
||
# annotate euchromatic and heterochromatic pME | ||
Euch.bed<-read.table("/Users/clementgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/GraffiTE/Comparison_GraffiTE_Rech_DM30_plus_ISO-1/GraffiTE_VCFS/Euchromatic_regions_RECH2022.bed") | ||
Euch.bed<-read.table("Euchromatic_regions_RECH2022.bed") | ||
GTDM.f.bed<-data.frame(cbind(GTDM.f$CHROM, GTDM.f$POS, GTDM.f$POS+1,GTDM.f$ID)) | ||
Euch.GTDM.f<-bedtoolsr::bt.intersect(GTDM.f.bed, Euch.bed)[,4] | ||
GTDM.f$region<-ifelse(GTDM.f$ID %in% Euch.GTDM.f, "euch", "heter") | ||
|
@@ -36,7 +60,7 @@ Het<-data.frame(chr = c("2L", "2R", "3L", "3R", "X"), | |
X2_s = c(18870000, 24972477, 19026900, 31614278, 21338973), | ||
X2_e = c(23513712, 25286936, 28110227, 32079331, 23542271)) | ||
# get dm6 annotation for background densities | ||
rpmsk<-read.table("/Users/clementgoubert/Library/CloudStorage/[email protected]/Other\ computers/Zephyrantes/GraffiTE/Comparison_GraffiTE_Rech_DM30_plus_ISO-1/dm6_rpmsk_nochr.bed") | ||
rpmsk<-read.table("dm6_rpmsk_nochr.bed") | ||
names(rpmsk)<-c("chr", "start", "end", "strand", "name", "Order", "SuperFam") | ||
rpmsk.TEs<-rpmsk[!rpmsk$Order %in% c("Simple_repeat", "Low_complexity", "Satellite"),] | ||
rpmsk.reps<-rpmsk[rpmsk$Order %in% c("Simple_repeat", "Low_complexity", "Satellite"),] | ||
|
@@ -85,7 +109,7 @@ DMpME<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",])+ | |
) | ||
|
||
cov_all$type<-factor(cov_all$type, levels = c("TEs", "simple_repeats"), labels = c("TEs", "other repeats")) | ||
label(cov_all$type) | ||
#label(cov_all$type) | ||
DM_reps<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",])+ | ||
geom_rect(dat = Het, aes(xmin = X1_s/1000000, xmax = X1_e/1000000, ymin = 0, ymax = 1), fill = "grey90")+ | ||
geom_rect(dat = Het, aes(xmin = X2_s/1000000, xmax = X2_e/1000000, ymin = 0, ymax = 1), fill = "grey90")+ | ||
|
@@ -155,7 +179,22 @@ bpDM<-ggplot(GTDM.f[GTDM.f$SuperFamily != "NewFam",], aes(x = factor(SuperFamily | |
reverse = T) | ||
) | ||
|
||
######### Saturation curves ################## | ||
# export list of analysed variants | ||
write.table(as.data.frame(GTDM.f$ID), "analyzed_variants", quote = F, row.names = F, col.names = F) | ||
|
||
### Saturation curves: | ||
|
||
################################################################################ | ||
# for this analysis, we are using a tool we created early on that require the | ||
# variants to be encoded in the SUPP_VEC of the VCF. Since we are working on the | ||
# genotyped variants (not the detected variants), we needed to recode the SUPP_VEC | ||
# accordingly | ||
# | ||
# # first filter VCF to match the filtered list of variant | ||
# vcftools --vcf DM30_sv-sn_GA_1hits_03142024.recode.vcf --snps analyzed_variants --recode --recode-INFO-all --out DM30_sv-sn_GA_1hits_03142024__analyzed | ||
# # second replace the SUPP_VEC using GA genotypes | ||
# cat <(grep '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf) <(paste <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | sed 's/SUPP_VEC=/SUPP_VEC=\t/g' | cut -f 1-8) <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | sed -e 's/0\/0/0/g' -e 's/N\/A/0/g' -e 's/.\/./1/g' -e 's/ //g') <(grep -v '#' DM30_sv-sn_GA_1hits_03142024__analyzed.recode.vcf | sed 's/[0-1]*;SUPP=/\t;SUPP=/g' | cut -f 9-) | sed 's/SUPP_VEC=\t/SUPP_VEC=/g;s/\t;SUPP/;SUPP/g') > DM30_sv-sn_GA_1hits_03142024__analyzed_GA_SUPP.recode.vcf | ||
################################################################################ | ||
|
||
library(vcfR) | ||
library(ggrepel) | ||
|
@@ -166,9 +205,9 @@ library(reshape2) | |
library(ade4) | ||
library(stringr) | ||
|
||
# we need to load these functions first | ||
theme_set(theme_bw(base_size = 14)) | ||
|
||
# functions to read VCF's support vector and plot the saturation curves | ||
discovery_curve <- function(ref_vcf, type = c("polymorphism", "TE"), nperm = 100) { | ||
vcf_matrix <- lapply(str_split(extract.info(ref_vcf, "SUPP_VEC"), ""), as.numeric) | ||
na_entries <- is.na(vcf_matrix) | ||
|
@@ -215,7 +254,6 @@ discovery_curve <- function(ref_vcf, type = c("polymorphism", "TE"), nperm = 100 | |
pca = pca_matrix | ||
)) | ||
} | ||
|
||
filter_vcf_freq <- function(ref_vcf, min_freq = 0, max_freq = Inf, nhits = c("any", "single")) { | ||
vcf_freq <- lapply(str_split(extract.info(ref_vcf, "SUPP_VEC"), ""), as.numeric) %>% | ||
simplify2array() %>% | ||
|
@@ -232,7 +270,6 @@ filter_vcf_freq <- function(ref_vcf, min_freq = 0, max_freq = Inf, nhits = c("an | |
ref_vcf[vcf_freq >= min_freq & vcf_freq <= max_freq] | ||
} | ||
} | ||
|
||
plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphism", "TE"), mode = c("cristian", "clem")) { | ||
|
||
n_genomes <- str_length(extract.info(vcf, "SUPP_VEC")[1]) | ||
|
@@ -303,9 +340,8 @@ plot_curves <- function(vcf, rare_freq, fixed_freq, nperm, type = c("polymorphis | |
theme(legend.position = c(60, 1000)) | ||
} | ||
} | ||
|
||
|
||
vcfDM<-read.vcfR("DM30_sv-sn_graphaligner_1hits_08212023_GA_SUPP.recode.vcf") | ||
# load the VCF | ||
vcfDM<-read.vcfR("DM30_sv-sn_GA_1hits_03142024__analyzed_GA_SUPP.recode.vcf") | ||
t_2_29<-plot_curves(vcfDM, rare_freq = 2/30, fixed_freq = 29/30, nperm = 100, type = "polymorphism", mode = "clem") | ||
|
||
|
||
|
@@ -314,4 +350,38 @@ CP2<-cowplot::plot_grid(t_2_29, bpDM, | |
rel_widths = c(0.3, 0.7), | ||
ncol = 2 | ||
) | ||
|
||
# final plot | ||
cowplot::plot_grid(CP1, CP2, ncol = 1) | ||
|
||
### Comparison with short reads: | ||
|
||
################################################################################ | ||
# data prep to match long-reads analyses | ||
# | ||
# 1. First, I need to filter the pangenie VCF in the same fashion: 1 hits, no fixed | ||
# ```sh | ||
# vcftools --vcf DM30_sv-sn_pangenie.vcf --snps DM30_sv-sn_pangenome_1hits_IDs --non-ref-ac-any 1 --recode --recode-INFO-all --out DM30_sv-sn_PA_1hits_03192024 | ||
# ``` | ||
# After filtering, kept 10950 out of a possible 32909 Site | ||
# | ||
# 2. Identify missing data -- we don't need to parse the info field this time (we just need to positions and the genotypes) | ||
# ```sh | ||
# paste <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 1-7) <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }') <(grep -v '#' DM30_sv-sn_PA_1hits_03192024.recode.vcf | cut -f 10- | sed 's/\.\/\.:/N\/A:/g' | awk '{ for (i=1; i<=NF; i++) printf "%s ", substr($i, 1, 3); print ""; }' | awk '{ count = 0; for (i=1; i<=NF; i++) { if ($i == "0/0" || $i == "N/A") count++}; {print 30-count}}') | awk -v OFS="\t" '$1=$1' | awk '/POS/ {print $0"\thasMissing"; next} /N\/A/ {print $0"\tTRUE"; next} {print $0"\tFALSE"}' | awk '{print $1"\t"$2"\t"$3"\t"$(NF-1)"\t"$NF}' > DM30_sv-sn_PG_1hits_03192024.simple.NAomitCounts.tsv | ||
# ``` | ||
################################################################################ | ||
|
||
GTDMshort<-read.table("DM30_sv-sn_PG_1hits_03192024.simple.NAomitCounts.tsv") | ||
names(GTDMshort)<-c("CHROM", "POS", "ID", "PGcount", "hasMissing") | ||
# filter for main chromosomes | ||
GTDMshort<-GTDMshort[GTDMshort$CHROM %in% DMchrs,] | ||
# remove loci with missing genotypes | ||
GTDMshort<-GTDMshort[GTDMshort$hasMissing == FALSE,] | ||
|
||
library(ggVennDiagram) | ||
vlist<-list( | ||
GTlong=GTDM.f$ID, | ||
GTshort=GTDMshort$ID) | ||
ggVennDiagram(vlist) + scale_fill_gradient(low="grey99",high = "red") | ||
|
||
length(GTDMshort$ID) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,7 +162,7 @@ HPRCvcf_F<-HPRCvcf[Lfilter & HitsFilter & TEclassFilter & SVAVNTF_filter] | |
## Saturation | ||
Sat<-plot_curves(HPRCvcf_F, rare_freq = 0.05, fixed_freq = 0.95, nperm = 100, type = "polymorphism", mode = "clem") | ||
## PCoA | ||
HPRCmeta<-read.table("~/Documents/GraffiTE/HPRC_metadata.txt", h = T, sep = "\t") | ||
HPRCmeta<-read.table("HPRC_metadata.txt", h = T, sep = "\t") | ||
HPRCmeta2<-HPRCmeta[rep(seq_len(nrow(HPRCmeta)), each = 2), ] | ||
HPRCmeta2$ID<-paste(HPRCmeta2$Sample, rep(c("mat", "pat"), 47), sep = "_") | ||
HPRCmeta2<-HPRCmeta2[order(HPRCmeta2$Sample),] | ||
|
@@ -290,4 +290,43 @@ Counts<-ggplot(TElonger, aes(x = SubCode, y = Count))+ | |
|
||
Aplot<-cowplot::plot_grid(Sat, PCoA, rel_widths = c(0.35,0.65)) | ||
Bplot<-cowplot::plot_grid( NULL, Counts, rel_widths = c(0,1)) | ||
cowplot::plot_grid(Aplot, Bplot, nrow = 2) | ||
cowplot::plot_grid(Aplot, Bplot, nrow = 2) | ||
|
||
### Comparison with Minigraph-Cactus calls | ||
# 1. We created a VCF derived from the GFA of Liao at al (HPRC paper) | ||
# 2. We used the --vcf input of GraffiTE to annotate this VCF for pMEs. | ||
# 3. We removed duplicates SV from the VCF: each SNP/INDEL replicate (which occurs often at poly-A tails). | ||
# grep -v '#' MiniCac_HPRC_pangenome.vcf | cut -f 1,2,8,3 | sed 's/repeat_ids=/\t/g;s/;matching_classes=/\t/g' | awk '{print $1"_"$2"_"$5"\t"$3}'| sort -u -k1,1 | cut -f 2 > MiniCac_HPRC_pangenome__variants_to_keep.txt | ||
# vcftools --vcf MiniCac_HPRC_pangenome_.vcf --snps MiniCac_HPRC_pangenome__variants_to_keep.txt --recode --recode-INFO-all --out MiniCac_HPRC_pangenome_preR_filter | ||
# 4. This filtered VCF is the input for these analyses | ||
|
||
MiniCacvcf<-read.vcfR("MiniCac_HPRC_pangenome_preR_filter.recode.vcf") | ||
MiniCacvcffilter<-abs(as.numeric(extract.info(MiniCacvcf, "LEN"))) >= 250 | ||
MiniCacvcfHitsFilter<-as.numeric(extract.info(MiniCacvcf, "n_hits")) == 1 | extract.info(MiniCacvcf, "mam_filter_1") != "None" | ||
MiniCacvcfSVAVNTF_filter<-extract.info(MiniCacvcf, "mam_filter_2") == "None" | ||
MiniCacvcfTEclassFilter<-grepl(paste(c("Alu", "L1", "SVA"), collapse = "|"), extract.info(MiniCacvcf, "repeat_ids")) | ||
MiniCacvcfHPRCvcf_F<-MiniCacvcf[MiniCacvcffilter & MiniCacvcfHitsFilter & MiniCacvcfSVAVNTF_filter & MiniCacvcfTEclassFilter] | ||
write.vcf(MiniCacvcfHPRCvcf_F, "/Users/clementgoubert/Library/CloudStorage/[email protected]/Other computers/Zephyrantes/GraffiTE/paper_datasets/Applications_Examples/HUMAN_HPRC/MiniCac_HPRC_pangenome_R_filtered.vcf") | ||
# export the GraffiTE one | ||
write.vcf(HPRCvcf_F, "/Users/clementgoubert/Library/CloudStorage/[email protected]/Other computers/Zephyrantes/GraffiTE/paper_datasets/Applications_Examples/HUMAN_HPRC/HPRC_pangenome_R_filtered.vcf") | ||
|
||
# prep for comparison with SURVIVOR | ||
# bcftools view HPRC_pangenome_R_filtered.vcf > HPRC_pangenome_R_filtered_.vcf | ||
# bcftools view MiniCac_HPRC_pangenome_R_filtered.vcf > MiniCac_HPRC_pangenome_R_filtered_.vcf | ||
# ls HPRC_pangenome_R_filtered_.vcf MiniCac_HPRC_pangenome_R_filtered_.vcf > vcfs.txt | ||
# ~/SURVIVOR/Debug/SURVIVOR merge vcfs.txt 0.1 0 0 0 0 100 HPRC_filtered_merged.vcf | ||
# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=10' # This is the count of GraffiTE/svim-asm specific variants | ||
# 6270 | ||
# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=01' # This is the count of Minigraph/Cactus specific variants | ||
# 788 | ||
# grep -v '#' HPRC_filtered_merged.vcf | grep -c 'SUPP_VEC=11' # This is the count of shared variants between both methods using the same filters | ||
# 8451 | ||
|
||
library(VennDiagram) | ||
draw.pairwise.venn(area1 = 6270+8451, | ||
area2 = 788+8451, | ||
cross.area = 8451,category = c("minmap2/svim-asm", "Minigraph/Cactus"), | ||
euler.d = T, | ||
scaled = T, | ||
fill = c("grey99", "red"),fontfamily = rep("sans", 3),cat.fontfamily = rep("sans",2)) | ||
|
Oops, something went wrong.