-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathadd_pangenome_matrices.pl
executable file
·123 lines (104 loc) · 3.53 KB
/
add_pangenome_matrices.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env perl
# 2012-2025 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: https://www.eead.csic.es/compbio (Estacion Experimental Aula Dei-CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)
# This script can be used to add pangenome matrices (-m files) generated by compare_clusters.pl,
# for instance to combine CDS and -a 'rRNA,tRNA' results derived from GenBank files
$|=1;
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
use FindBin '$Bin';
use lib "$Bin/lib";
use lib "$Bin/lib/bioperl-1.5.2_102/";
use phyTools;
my ($INP_tabfile1,$INP_tabfile2,$INP_out_tabfile,%opts) = ('','','');
getopts('hi:j:o:', \%opts);
if(($opts{'h'})||(scalar(keys(%opts))==0))
{
print "\n[options]: \n";
print "-h \t this message\n";
print "-i \t first input .tab data file (required, generated by compare_clusters.pl)\n";
print "-j \t second input .tab data file (required, generated by compare_clusters.pl)\n";
print "-o \t name of output .tab file (required)\n";
exit(-1);
}
if(defined($opts{'i'}))
{
$INP_tabfile1 = $opts{'i'};
}
else{ die "\n# $0 : need -i parameter, exit\n"; }
if(defined($opts{'j'}))
{
$INP_tabfile2 = $opts{'j'};
}
else{ die "\n# $0 : need -j parameter, exit\n"; }
if(defined($opts{'o'}))
{
$INP_out_tabfile = $opts{'o'};
}
else{ die "\n# $0 : need -o parameter, exit\n"; }
printf("\n# %s -i %s -j %s -o %s\n",$0,$INP_tabfile1,$INP_tabfile2,$INP_out_tabfile);
#################################### MAIN PROGRAM ################################################
my (@genomes1,@genomes2,%table1,%table2,@clusters1,@clusters2);
my ($source1,$source2,$genome);
# 1) read input file i
open(DATAI,$INP_tabfile1) || die "# add_pangenome_matrices : cannot read $INP_tabfile1\n";
while(<DATAI>)
{
#source:folder/ 35_tnpA.faa 36_beta-lactamase_CTX-M-65.faa 60_ant39.faa ...
#K_pneumoniae_9_plasmid_9.gb 0 2 0
#K_pneumoniae_12_plasmid_12.gb 0 2 1
#K_oxytoca_plasmid_pKOX105.gb 0 1 0
#...
chomp($_);
my @data = split(/\t/,$_);
if($data[0] =~ /^source:(.*)/)
{
$source1 = $1;
push(@clusters1,@data[1..$#data]);
}
else
{
$genome = (split(/\.gbk/,shift(@data)))[0];
push(@genomes1,$genome);
$table1{$genome} = \@data; #print "i $genome\n";
}
}
close(DATAI);
# 1) read input file j
open(DATAJ,$INP_tabfile2) || die "# add_pangenome_matrices : cannot read $INP_tabfile2\n";
while(<DATAJ>)
{
chomp($_);
my @data = split(/\t/,$_);
if($data[0] =~ /^source:(.*)/)
{
$source2 = $1;
push(@clusters2,@data[1..$#data]);
}
else
{
$genome = (split(/\.gbk/,shift(@data)))[0];
push(@genomes2,$genome);
$table2{$genome} = \@data; #print "j $genome\n";
}
}
close(DATAJ);
# 3) check dimensions
if($#genomes1 != $#genomes2)
{
die "# $0 : number of genomes in $INP_tabfile1 and $INP_tabfile2 does not match, please check input\n";
}
else{ print "# number of genomes = $#genomes1\n" }
# 4) add matrices
open(OUT,">$INP_out_tabfile\n") || die "# $0 : cannot create $INP_out_tabfile\n";
print OUT "source:$source1,$source2\t".join("\t",@clusters1)."\t".join("\t",@clusters2)."\n";
foreach my $i (0 .. $#genomes1)
{
$genome = $genomes1[$i];
print OUT "$genome\t".join("\t",@{$table1{$genome}})."\t".join("\t",@{$table2{$genome}})."\n";
}
close(OUT);
print "# output pangenome matrix file = $INP_out_tabfile\n";