-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClustering.m
166 lines (145 loc) · 5.49 KB
/
Clustering.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
%Bioc 570 Final Project.
clear all
[num,txt,raw] = xlsread('matlab_clustering.xlsx',1);
[m,n] = size(num);%697 genes, 10 Liquid Killing conditions
[x,y] = size(txt);%checked the size of my txt array
conditionname= {'lk16','lk32','lk34','lk35','lk38','lk56','Phenanthroline','RPW',...
'Hygromycin','Cadmium'};%Defines the names of each of my conditions
%(eg. the columns of num and txt)
numoclust = 15;
%input for the number of clusters
ids = kmeans(num,numoclust);
figure(1); hold on;
for ii = 1:numoclust
inds = ids == ii;
subplot(3,5,ii);
plot(1:n,mean(num(inds,:),1),'.-','LineWidth',3,'Markersize',18);
hold on;
title([' Cluster ' int2str(ii) , ' | #genes: ' int2str(sum(inds))],'FontSize',9);
end
hold off;
% figure;hold on;
% for ii = 1:length(n);
% scatter(1:length(num(:,1)),num(:,ii),4^2,ids,'filled');
% end
% hold off;
%%
clear conditionname z ii idscell sortedtxt idx
% z = 0;
% for ii = 1:length(ids)
% if ids(ii,:) == 15
% z = z+1;
% end
% end
idscell = num2cell(ids);
sortedtxt = [idscell txt];
[ranks_ordered, idx] = sort(cell2mat(sortedtxt(:,1)));
sortedtxt = sortedtxt(idx,:);
%Make a structure for my gene informaiton.
field1 = 'id'; value1 = {sortedtxt{:,1}};
field2 = 'geneName'; value2 = {sortedtxt{:,2}};
field3 = 'GenbankID'; value3 = {sortedtxt{:,3}};
field4 = 'GOterms'; value4 = {sortedtxt{:,4}};
field5 = 'GeneSequence'; value5 = cell([1,length(sortedtxt)]);
field6 = 'Promotor'; value6 = cell([1,length(sortedtxt)]);
field7 = 'PromotorComplement'; value7 = cell([1,length(sortedtxt)]);
Kgroup = struct(field1,value1,field2,value2,field3,...
value3,field4,value4,field5,value5,field6,value6,field7,value7);
clear field1 field2 field3 field4 field5 field6 field7
clear value1 value2 value3 value4 value5 value6 value7
%read in the C. elegans Genome
gunzip('caenorhabditis_elegans.PRJNA13758.WBPS9.genomic.fa.gz');
%[headers, totalgenome] = fastaread('GCA_0000029853_Celegans.fasta');
[headers, totalgenome] = fastaread('caenorhabditis_elegans.PRJNA13758.WBPS9.genomic.fa');
totalgenome = [totalgenome{1,1:end}];
totalgenome = [totalgenome seqcomplement(totalgenome)];
%%
%Find the starting position of each gene and gets the promotor region (1000
%bp upstream of start).
for q = 1:length(sortedtxt)
disp(['Getting gene data for : ' num2str(q)])
Kgroup(q).GeneSequence = getgenbank(Kgroup(q).GenbankID,'SEQUENCEONLY',true);
geneindx = strfind(totalgenome,Kgroup(q).GeneSequence(1,1:14));
%Note on unique nucleotide sequences: Assuming a C. elegans genome size of
%100,258,171 bp, we can calculate the number of bp required to make a
%unique sequence as n = log4(genomesize) or n = 13.7. This is the minimum
%number of base pairs needed to be uniqie in the total geneome size.
%Moveing forward I will use 14 bp as my representative unique sequence, to
%insure I will get the gene of intrest.
if geneindx-1000 <= 0
Kgroup(q).Promotor = totalgenome(0:geneindx);
Kgroup(q).PromotorComplement = seqcomplement(totalgenome(0:geneindx));
pause(2);
else
Kgroup(q).Promotor = totalgenome(geneindx-1000:geneindx);
Kgroup(q).PromotorComplement = seqcomplement(totalgenome(geneindx-1000:geneindx));
pause(2);
end
end
%%
%Find and recored # of esre motifs
esre = 'tctgcgtctct';
esre = upper(esre);
count = 0; foundPromos = 0; groupnum = 1;
for q = 2:length(Kgroup)
chck = strfind(Kgroup(q).Promotor,esre);%count esre Motifs
if isempty(chck) < 1
count = count+1;
end
if isempty(Kgroup(q).Promotor) ~= 1 %find empty promotor regions
foundPromos = foundPromos + 1;
end
if Kgroup(q).id == Kgroup(q-1).id %Assign counts to groups
GroupCount(groupnum) = count;
else
groupnum = groupnum+1;
GroupCount(groupnum) = count - GroupCount(groupnum-1);
count = GroupCount(groupnum)
end
end
TotalESRE = length([strfind(totalgenome,esre)]'); %Count all esre motifs
%%
%This code was in development in an attempt to compare each of the motifs
%to each other to search for common elements. I ran out of time during the
%semester to finish this, but have plans to continue to explore this as
%well as other methods, including incorporating the blastlocal search into
%my program. This code is not addreddesed in depth in the paper.
q = 1;
count = 1;
for ii = 2:length(Kgroup)
ii
if Kgroup(ii).id == Kgroup(ii-1).id
count = count+1
indexofgroups(q) = count
else
indexofgroups(q) = count
count = 1;
q = q+1
end
end
%%
%Make a structure to save the motif data
field1 = 'KgroupID'; value1 = cell(10^5,1);
field2 = 'Motifpool'; value2 = cell(10^5,1);
field3 = 'FrequencyScore'; value3 = cell(10^5,1);
field4 = 'Motifs'; value4 = cell(10^5,1);
MotifData = struct(field1,value1,field2,value2,field3,value3,field4,value4);
clear field1 field2 field3 field4; clear value1 value2 value3 value4;
%%
q = 1;
n = 1;
for i = 1:length(indexofgroups)
for ii = n:indexofgroups(q)
if ii == n
[nmersFound, freqDiffSorted] = motiffinder(Kgroup(ii).Promotor,[Kgroup(ii+1:indexofgroups(i)).Promotor],14)
%nxtln =
else
[nmersFound, freqDiffSorted] = motiffinder(Kgroup(ii).Promotor,...
[Kgroup(n:ii-1).Promotor Kgroup(ii+1:indexofgroups(i)).Promotor],14)
%struct
end
end
q = q+1;
n = indexofgroups(q);
end
disp('done')