48 #include "../utils/fast_math.hpp"
49 #include "../utils/Matrix.hpp"
// Forward recursion of the HMM over a pair of sequences (seq1, seq2), filling dp_mat.
// NOTE(review): the function name/signature, the declarations of i, j, k, l, m, c1, c2,
// tmp, tmp1, tmp2, matchProbs, transProbs and initDistr, all braces, some conditionals
// and the return statement are on lines not visible in this view. This is presumably
// the overload called as hmm_forward(seq1, seq2, hmm, forward_mat, insert_matrices).
65 template<
typename DataType>
// Five states: state 0 is stored in dp_mat, states 1..4 in insert_matrices.
69 unsigned short n_states=5;
// The matrices have one more row/column than the sequences; index 0 is the border.
70 size_t l_seq1 = seq1.size()+1;
71 size_t l_seq2 = seq2.size()+1;
// Emission values of the current seq1 / seq2 character in an insert state.
73 float single_c1, single_c2;
76 const std::vector<float> &insProbs = hmm.
ins_probs();
// Initialisation: cell (0,0) of every state takes its value from the initial distribution.
// State k (k>=1) owns the buffer pair insert_matrices[2k-2] / insert_matrices[2k-1].
81 dp_mat[0][0] =initDistr[0];
82 for (k=1; k<n_states; ++k)
83 insert_matrices[2*k-2][0]=initDistr[k];
// Border row 0: only states 2 and 4 (buffers 2 and 6) advance along seq2 here.
88 for (i=1; i<l_seq2; ++i)
90 c2=
static_cast<short>(seq2[i-1]);
91 single_c2 = insProbs[c2];
// Either stay in the same insert state or enter it from dp_mat, then emit the character.
// LOG_ADD is presumably log-space addition from fast_math.hpp - TODO confirm.
92 tmp1 = insert_matrices[2][i] = LOG_ADD(insert_matrices[2][i-1]+transProbs[2][2], dp_mat[0][i-1]+transProbs[0][2]) + single_c2;
93 tmp2 = insert_matrices[6][i] = LOG_ADD(insert_matrices[6][i-1]+transProbs[4][4], dp_mat[0][i-1]+transProbs[0][4]) + single_c2;
// The border cell of dp_mat stores the combination of both insert-state values.
94 dp_mat[0][i] = LOG_ADD(tmp1, tmp2);
// Main recursion, one iteration per character of seq1.
102 for (i=1; i<l_seq1; ++i)
104 c1=
static_cast<short>(seq1[i-1]);
105 single_c1 = insProbs[c1];
// Border column 0: states 1 and 3 advance along seq1. Buffers 0/4 are read and
// buffers 1/5 are written.
108 tmp1 = insert_matrices[1][0] = LOG_ADD(insert_matrices[0][0]+transProbs[1][1], dp_mat[i-1][0]+transProbs[0][1]) + single_c1;
109 tmp2 = insert_matrices[5][0] = LOG_ADD(insert_matrices[4][0]+transProbs[3][3], dp_mat[i-1][0]+transProbs[0][3]) + single_c1;
110 dp_mat[i][0] = LOG_ADD(tmp1, tmp2);
111 for (j=1; j<l_seq2; ++j)
113 c2=
static_cast<short>(seq2[j-1]);
114 single_c2 = insProbs[c2];
// Update every insert state. l and m are assigned on lines not visible here; from their
// use they are the step taken in seq1 (l) and in seq2 (m) by state k.
117 for (k=1; k<n_states; ++k)
// A non-zero l selects the seq1 emission, otherwise the seq2 emission is used.
121 tmp = (l) ? single_c1 : single_c2;
// Stepping back from index 1 lands on the border (index 0): only the dp_mat term is used there.
122 if (((i==1) && (l==1)) || ((j==1) && (m==1)))
123 insert_matrices[2*k-1][j] = dp_mat[i-l][j-m] + transProbs[0][k] + tmp;
// Otherwise (presumably the else branch; the keyword is not visible) combine entering
// the state from dp_mat with staying in it. With l==1 the partner buffer 2k-2 is read.
125 insert_matrices[2*k-1][j] = LOG_ADD(dp_mat[i-l][j-m] + transProbs[0][k], insert_matrices[2*k-1-l][j-m] + transProbs[k][k]) + tmp;
// State 0 cell: start with the diagonal predecessor in dp_mat ...
129 tmp = dp_mat[i-1][j-1] + transProbs[0][0];
// ... then add transitions from insert states back to state 0. The conditions choosing
// between the full loop and the two partial variants below are on lines not visible here.
132 for (k=1; k<n_states; ++k)
133 LOG_PLUS_EQUALS(tmp, insert_matrices[2*k-2][j-1] + transProbs[k][0]);
// Variant using only states 1 and 3.
137 LOG_PLUS_EQUALS(tmp, insert_matrices[0][j-1] + transProbs[1][0]);
138 LOG_PLUS_EQUALS(tmp, insert_matrices[4][j-1] + transProbs[3][0]);
// Variant using only states 2 and 4.
142 LOG_PLUS_EQUALS(tmp, insert_matrices[2][j-1] + transProbs[2][0]);
143 LOG_PLUS_EQUALS(tmp, insert_matrices[6][j-1] + transProbs[4][0]);
// Add the emission value for the character pair (c1, c2).
145 dp_mat[i][j] = tmp + matchProbs[c1][c2];
// Exchange the two buffers of each insert state (presumably once per seq1 row, so the
// row just written becomes the one read next - brace placement is not visible).
147 std::swap(insert_matrices[0], insert_matrices[1]);
148 std::swap(insert_matrices[2], insert_matrices[3]);
149 std::swap(insert_matrices[4], insert_matrices[5]);
150 std::swap(insert_matrices[6], insert_matrices[7]);
// Total: final dp_mat cell combined with the final cell of buffers 0, 2, 4 and 6.
// The return statement is not visible; callers assign the result to fw_p.
152 float total=dp_mat[l_seq1-1][l_seq2-1];
153 LOG_PLUS_EQUALS(total, insert_matrices[0][l_seq2-1]);
154 LOG_PLUS_EQUALS(total, insert_matrices[2][l_seq2-1]);
155 LOG_PLUS_EQUALS(total, insert_matrices[4][l_seq2-1]);
156 LOG_PLUS_EQUALS(total, insert_matrices[6][l_seq2-1]);
// Forward recursion using precomputed per-position values instead of raw sequences.
//   hmm             - supplies trans_probs() and init_distribution()
//   ins_probs1/2    - one insert emission value per position of the first / second input
//   match_probs     - value for pairing position i of the first input with j of the second
//   dp_mat          - output matrix for state 0, indexed [0..size1][0..size2]
//   insert_matrices - 8 row buffers, two per insert state (2k-2 and 2k-1 for state k)
// NOTE(review): the return type, the local declarations, braces, some conditionals and
// the return statement are on lines not visible in this view.
161 hmm_forward(
const HMM &hmm, std::vector<float> &ins_probs1, std::vector<float> &ins_probs2, Matrix<float> &match_probs, Matrix<float> &dp_mat,
float **insert_matrices)
// Five states: state 0 is stored in dp_mat, states 1..4 in insert_matrices.
163 unsigned short n_states=5;
// Matrix dimensions are one larger than the inputs; index 0 is the border.
164 size_t l_seq1 = ins_probs1.size()+1;
165 size_t l_seq2 = ins_probs2.size()+1;
167 float single_c1, single_c2;
169 const Matrix<float> &transProbs = hmm.trans_probs();
170 const float *initDistr = hmm.init_distribution();
// Initialisation of cell (0,0) for every state from the initial distribution.
173 dp_mat[0][0] =initDistr[0];
174 for (k=1; k<n_states; ++k)
175 insert_matrices[2*k-2][0]=initDistr[k];
// Border row 0: states 2 and 4 (buffers 2 and 6) advance along the second input.
179 for (i=1; i<l_seq2; ++i)
182 single_c2 = ins_probs2[i-1];
183 tmp1 = insert_matrices[2][i] = LOG_ADD(insert_matrices[2][i-1]+transProbs[2][2], dp_mat[0][i-1]+transProbs[0][2]) + single_c2;
184 tmp2 = insert_matrices[6][i] = LOG_ADD(insert_matrices[6][i-1]+transProbs[4][4], dp_mat[0][i-1]+transProbs[0][4]) + single_c2;
// The border cell of dp_mat stores the combination of both insert-state values.
185 dp_mat[0][i] = LOG_ADD(tmp1, tmp2);
// Main recursion, one iteration per position of the first input.
193 for (i=1; i<l_seq1; ++i)
195 single_c1 = ins_probs1[i-1];
// Border column 0: states 1 and 3 advance along the first input (read 0/4, write 1/5).
198 tmp1 = insert_matrices[1][0] = LOG_ADD(insert_matrices[0][0]+transProbs[1][1], dp_mat[i-1][0]+transProbs[0][1]) + single_c1;
199 tmp2 = insert_matrices[5][0] = LOG_ADD(insert_matrices[4][0]+transProbs[3][3], dp_mat[i-1][0]+transProbs[0][3]) + single_c1;
200 dp_mat[i][0] = LOG_ADD(tmp1, tmp2);
201 for (j=1; j<l_seq2; ++j)
203 single_c2 = ins_probs2[j-1];
// Update every insert state. l and m are set on lines not visible here; from their use
// they are the step state k takes in the first (l) and second (m) input.
206 for (k=1; k<n_states; ++k)
210 tmp = (l) ? single_c1 : single_c2;
// Predecessor lies on the border: only the dp_mat term contributes.
211 if (((i==1) && (l==1)) || ((j==1) && (m==1)))
212 insert_matrices[2*k-1][j] = dp_mat[i-l][j-m] + transProbs[0][k] + tmp;
// Otherwise (presumably the else branch) combine entering the state with staying in it.
214 insert_matrices[2*k-1][j] = LOG_ADD(dp_mat[i-l][j-m] + transProbs[0][k], insert_matrices[2*k-1-l][j-m] + transProbs[k][k]) + tmp;
// State 0 cell: diagonal predecessor plus transitions from insert states back to state 0.
// The conditions selecting among the three variants below are on lines not visible here.
218 tmp = dp_mat[i-1][j-1] + transProbs[0][0];
221 for (k=1; k<n_states; ++k)
222 LOG_PLUS_EQUALS(tmp, insert_matrices[2*k-2][j-1] + transProbs[k][0]);
// Variant using only states 1 and 3.
226 LOG_PLUS_EQUALS(tmp, insert_matrices[0][j-1] + transProbs[1][0]);
227 LOG_PLUS_EQUALS(tmp, insert_matrices[4][j-1] + transProbs[3][0]);
// Variant using only states 2 and 4.
231 LOG_PLUS_EQUALS(tmp, insert_matrices[2][j-1] + transProbs[2][0]);
232 LOG_PLUS_EQUALS(tmp, insert_matrices[6][j-1] + transProbs[4][0]);
// match_probs is indexed by 0-based input positions, hence the -1 offsets.
234 dp_mat[i][j] = tmp + match_probs[i-1][j-1];
// Exchange the two buffers of each insert state (presumably once per row of the first input).
236 std::swap(insert_matrices[0], insert_matrices[1]);
237 std::swap(insert_matrices[2], insert_matrices[3]);
238 std::swap(insert_matrices[4], insert_matrices[5]);
239 std::swap(insert_matrices[6], insert_matrices[7]);
// Total: final dp_mat cell combined with the final cell of buffers 0, 2, 4 and 6.
// The return statement is not visible; callers assign the result to fw_p.
241 float total=dp_mat[l_seq1-1][l_seq2-1];
242 LOG_PLUS_EQUALS(total, insert_matrices[0][l_seq2-1]);
243 LOG_PLUS_EQUALS(total, insert_matrices[2][l_seq2-1]);
244 LOG_PLUS_EQUALS(total, insert_matrices[4][l_seq2-1]);
245 LOG_PLUS_EQUALS(total, insert_matrices[6][l_seq2-1]);
// Backward recursion of the HMM over a pair of sequences, filling dp_mat from the
// bottom-right corner [seq1.size()][seq2.size()] towards [0][0].
//   seq1, seq2      - the two sequences; characters are cast to short and used as indices
//   hmm             - supplies match_probs(), ins_probs(), trans_probs(), init_distribution()
//   dp_mat          - output matrix for state 0
//   insert_matrices - 8 row buffers, two per insert state (2k-2 and 2k-1 for state k)
// NOTE(review): the return type, the declarations of i, j, k, l, m, c1, c2, tmp, tmp1, tmp2,
// all braces, some conditionals and the return statement are on lines not visible here.
249 template<
typename DataType>
251 hmm_backward(
const DataType &seq1,
const DataType &seq2,
const HMM &hmm, Matrix<float> &dp_mat,
float **insert_matrices)
// Five states: state 0 is stored in dp_mat, states 1..4 in insert_matrices.
253 unsigned short n_states=5;
// Lengths are held as int; the loops below count down and test `>= 0`.
254 int l_seq1 = seq1.size();
255 int l_seq2 = seq2.size();
257 float single_c1, single_c2;
259 const Matrix<float> &matchProbs = hmm.match_probs();
260 const std::vector<float> &insProbs = hmm.ins_probs();
261 const Matrix<float> &transProbs = hmm.trans_probs();
262 const float *initDistr = hmm.init_distribution();
// Initialisation of the bottom-right corner for every state from the initial distribution.
265 dp_mat[l_seq1][l_seq2] =initDistr[0];
266 for (k=1; k<n_states; ++k)
267 insert_matrices[2*k-2][l_seq2]=initDistr[k];
// Border row l_seq1: states 2 and 4 (buffers 2 and 6) move backwards along seq2.
272 for (i=l_seq2-1; i>=0; --i)
274 c2=
static_cast<short>(seq2[i]);
275 single_c2 = insProbs[c2];
276 tmp1 = insert_matrices[2][i] = LOG_ADD(insert_matrices[2][i+1]+transProbs[2][2], dp_mat[l_seq1][i+1]+transProbs[0][2]) + single_c2;
277 tmp2 = insert_matrices[6][i] = LOG_ADD(insert_matrices[6][i+1]+transProbs[4][4], dp_mat[l_seq1][i+1]+transProbs[0][4]) + single_c2;
// The border cell of dp_mat stores the combination of both insert-state values.
278 dp_mat[l_seq1][i] = LOG_ADD(tmp1, tmp2);
// Main recursion, from the last character of seq1 to the first.
286 for (i=l_seq1-1; i>=0; --i)
288 c1=
static_cast<short>(seq1[i]);
289 single_c1 = insProbs[c1];
// Border column l_seq2: states 1 and 3 move backwards along seq1 (read 0/4, write 1/5).
292 tmp1 = insert_matrices[1][l_seq2] = LOG_ADD(insert_matrices[0][l_seq2]+transProbs[1][1], dp_mat[i+1][l_seq2]+transProbs[0][1]) + single_c1;
293 tmp2 = insert_matrices[5][l_seq2] = LOG_ADD(insert_matrices[4][l_seq2]+transProbs[3][3], dp_mat[i+1][l_seq2]+transProbs[0][3]) + single_c1;
294 dp_mat[i][l_seq2] = LOG_ADD(tmp1, tmp2);
295 for (j=l_seq2-1; j>=0; --j)
297 c2=
static_cast<short>(seq2[j]);
298 single_c2 = insProbs[c2];
// Update every insert state. l and m are set on lines not visible here; from their use
// they are the step state k takes in seq1 (l) and seq2 (m).
301 for (k=1; k<n_states; ++k)
305 tmp = (l) ? single_c1 : single_c2;
// Successor lies on the border: only the dp_mat term contributes.
306 if (((i==(l_seq1-1)) && (l==1)) || ((j==(l_seq2-1)) && (m==1)))
307 insert_matrices[2*k-1][j] = dp_mat[i+l][j+m] + transProbs[0][k] + tmp;
// Otherwise (presumably the else branch) combine entering the state with staying in it.
309 insert_matrices[2*k-1][j] = LOG_ADD(dp_mat[i+l][j+m] + transProbs[0][k], insert_matrices[2*k-1-l][j+m] + transProbs[k][k]) + tmp;
// State 0 cell: diagonal successor in dp_mat ...
313 tmp = dp_mat[i+1][j+1] + transProbs[0][0];
// ... plus all four insert states when neither index is at the last position.
314 if ((i<l_seq1-1) && (j<l_seq2-1))
316 for (k=1; k<n_states; ++k)
317 LOG_PLUS_EQUALS(tmp, insert_matrices[2*k-2][j+1] + transProbs[k][0]);
// Partial variants; the conditions guarding them are on lines not visible here.
// States 1 and 3 only:
321 LOG_PLUS_EQUALS(tmp, insert_matrices[0][j+1] + transProbs[1][0]);
322 LOG_PLUS_EQUALS(tmp, insert_matrices[4][j+1] + transProbs[3][0]);
// States 2 and 4 only:
326 LOG_PLUS_EQUALS(tmp, insert_matrices[2][j+1] + transProbs[2][0]);
327 LOG_PLUS_EQUALS(tmp, insert_matrices[6][j+1] + transProbs[4][0]);
// Add the emission value for the character pair (c1, c2).
329 dp_mat[i][j] = tmp + matchProbs[c1][c2];
// Exchange the two buffers of each insert state (presumably once per seq1 row).
331 std::swap(insert_matrices[0], insert_matrices[1]);
332 std::swap(insert_matrices[2], insert_matrices[3]);
333 std::swap(insert_matrices[4], insert_matrices[5]);
334 std::swap(insert_matrices[6], insert_matrices[7]);
// Total: cell (0,0) of dp_mat combined with cell 0 of buffers 0, 2, 4 and 6.
// The return statement is not visible; callers assign the result to bw_p.
338 float total=dp_mat[0][0];
339 LOG_PLUS_EQUALS(total, insert_matrices[0][0]);
340 LOG_PLUS_EQUALS(total, insert_matrices[2][0]);
341 LOG_PLUS_EQUALS(total, insert_matrices[4][0]);
342 LOG_PLUS_EQUALS(total, insert_matrices[6][0]);
// Backward recursion using precomputed per-position values instead of raw sequences.
//   ins_probs1/2    - one insert emission value per position of the first / second input
//   match_probs     - value for pairing position i of the first input with j of the second
//   dp_mat          - output matrix for state 0, filled from [size1][size2] towards [0][0]
//   insert_matrices - 8 row buffers, two per insert state (2k-2 and 2k-1 for state k)
// NOTE(review): the return type, the declarations of transProbs, initDistr and the loop
// variables, all braces, some conditionals and the return statement are not visible here.
358 hmm_backward(
const HMM &hmm, std::vector<float> &ins_probs1, std::vector<float> &ins_probs2,
Matrix<float> &match_probs,
Matrix<float> &dp_mat,
float **insert_matrices)
// Five states: state 0 is stored in dp_mat, states 1..4 in insert_matrices.
360 unsigned short n_states=5;
// Lengths are held as int; the loops below count down and test `>= 0`.
361 int l_seq1 = ins_probs1.size();
362 int l_seq2 = ins_probs2.size();
364 float single_c1, single_c2;
// Initialisation of the bottom-right corner for every state from the initial distribution.
370 dp_mat[l_seq1][l_seq2] =initDistr[0];
371 for (k=1; k<n_states; ++k)
372 insert_matrices[2*k-2][l_seq2]=initDistr[k];
// Border row l_seq1: states 2 and 4 (buffers 2 and 6) move backwards along the second input.
376 for (i=l_seq2-1; i>=0; --i)
378 single_c2 = ins_probs2[i];
379 tmp1 = insert_matrices[2][i] = LOG_ADD(insert_matrices[2][i+1]+transProbs[2][2], dp_mat[l_seq1][i+1]+transProbs[0][2]) + single_c2;
380 tmp2 = insert_matrices[6][i] = LOG_ADD(insert_matrices[6][i+1]+transProbs[4][4], dp_mat[l_seq1][i+1]+transProbs[0][4]) + single_c2;
// The border cell of dp_mat stores the combination of both insert-state values.
381 dp_mat[l_seq1][i] = LOG_ADD(tmp1, tmp2);
// Main recursion, from the last position of the first input to the first.
389 for (i=l_seq1-1; i>=0; --i)
391 single_c1 = ins_probs1[i];
// Border column l_seq2: states 1 and 3 move backwards along the first input (read 0/4, write 1/5).
394 tmp1 = insert_matrices[1][l_seq2] = LOG_ADD(insert_matrices[0][l_seq2]+transProbs[1][1], dp_mat[i+1][l_seq2]+transProbs[0][1]) + single_c1;
395 tmp2 = insert_matrices[5][l_seq2] = LOG_ADD(insert_matrices[4][l_seq2]+transProbs[3][3], dp_mat[i+1][l_seq2]+transProbs[0][3]) + single_c1;
396 dp_mat[i][l_seq2] = LOG_ADD(tmp1, tmp2);
397 for (j=l_seq2-1; j>=0; --j)
399 single_c2 = ins_probs2[j];
// Update every insert state. l and m are set on lines not visible here; from their use
// they are the step state k takes in the first (l) and second (m) input.
402 for (k=1; k<n_states; ++k)
406 tmp = (l) ? single_c1 : single_c2;
// Successor lies on the border: only the dp_mat term contributes.
407 if (((i==(l_seq1-1)) && (l==1)) || ((j==(l_seq2-1)) && (m==1)))
408 insert_matrices[2*k-1][j] = dp_mat[i+l][j+m] + transProbs[0][k] + tmp;
// Otherwise (presumably the else branch) combine entering the state with staying in it.
410 insert_matrices[2*k-1][j] = LOG_ADD(dp_mat[i+l][j+m] + transProbs[0][k], insert_matrices[2*k-1-l][j+m] + transProbs[k][k]) + tmp;
// State 0 cell: diagonal successor in dp_mat ...
414 tmp = dp_mat[i+1][j+1] + transProbs[0][0];
// ... plus all four insert states when neither index is at the last position.
415 if ((i<l_seq1-1) && (j<l_seq2-1))
417 for (k=1; k<n_states; ++k)
418 LOG_PLUS_EQUALS(tmp, insert_matrices[2*k-2][j+1] + transProbs[k][0]);
// Partial variants; the conditions guarding them are on lines not visible here.
// States 1 and 3 only:
422 LOG_PLUS_EQUALS(tmp, insert_matrices[0][j+1] + transProbs[1][0]);
423 LOG_PLUS_EQUALS(tmp, insert_matrices[4][j+1] + transProbs[3][0]);
// States 2 and 4 only:
427 LOG_PLUS_EQUALS(tmp, insert_matrices[2][j+1] + transProbs[2][0]);
428 LOG_PLUS_EQUALS(tmp, insert_matrices[6][j+1] + transProbs[4][0]);
// match_probs is indexed directly by the 0-based positions i and j.
430 dp_mat[i][j] = tmp + match_probs[i][j];
// Exchange the two buffers of each insert state (presumably once per row of the first input).
432 std::swap(insert_matrices[0], insert_matrices[1]);
433 std::swap(insert_matrices[2], insert_matrices[3]);
434 std::swap(insert_matrices[4], insert_matrices[5]);
435 std::swap(insert_matrices[6], insert_matrices[7]);
// Total: cell (0,0) of dp_mat combined with cell 0 of buffers 0, 2, 4 and 6.
// The return statement is not visible; callers assign the result to bw_p.
437 float total=dp_mat[0][0];
438 LOG_PLUS_EQUALS(total, insert_matrices[0][0]);
439 LOG_PLUS_EQUALS(total, insert_matrices[2][0]);
440 LOG_PLUS_EQUALS(total, insert_matrices[4][0]);
441 LOG_PLUS_EQUALS(total, insert_matrices[6][0]);
// Constructor of hmm_match: stores a score `m` together with the coordinate pair (x_, y_).
// NOTE(review): the enclosing struct/class header and member declarations are not visible.
453 hmm_match(
float m,
size_t x_,
size_t y_):match(m), x(x_), y(y_)
// Body of a comparison on two hmm_match values (its signature is not visible): true when
// a has the larger score. If this is operator<, std::sort orders matches by descending
// score - TODO confirm against the full file.
458 return(a.match > b.match);
// Converts forward/backward matrices of one sequence pair into library entries.
// NOTE(review): the function name/signature, the declarations of i, j, tmp, braces and
// the body of the `if` in the scan loop are on lines not visible in this view. This is
// presumably the hmm2lib overload taking objects that provide id() and length().
463 template<
typename DataType,
typename LibraryDataType>
467 size_t seq_id1 = seq1.id();
468 size_t seq_id2 = seq2.id();
469 size_t l_seq1 = seq1.length();
470 size_t l_seq2 = seq2.length();
// Candidate list; capacity is reserved for every cell of the l_seq1 x l_seq2 grid.
472 std::vector<hmm_match> matches;
473 matches.reserve(l_seq1*l_seq2);
476 for (i=0; i<l_seq1; ++i)
478 for (j=0; j<l_seq2; ++j)
// Cell value: EXP(forward + backward - total), with the exponent capped at LOG_ONE.
// Cells scoring at least 0.01 pass the test; what is done with them is not visible here
// (presumably pushed into `matches`, as in the sibling overload).
480 if ((tmp=EXP(std::min(LOG_ONE,(forward_mat[i][j] + backward_mat[i][j] - total_probability)))) >= 0.01)
486 std::sort(matches.begin(), matches.end());
// Keep at most four candidates per position of the shorter sequence.
487 size_t max2 = std::min(4*std::min(l_seq1,l_seq2), matches.size());
// Score of the last candidate inside the limit; ties with it are also added below.
// NOTE(review): if max2 == 0 (no candidates) max2-1 wraps around - confirm that an
// elided line guards against this.
491 float min_score=matches[max2-1].match;
// i is presumably reset to 0 on a line not visible here.
493 while ((matches.size()!=i)&&(matches[i].match>=min_score))
495 lib.
add(seq_id1, seq_id2, matches[i].x, matches[i].y, matches[i].match);
// Converts forward/backward matrices of one sequence pair into library entries.
//   seq1, seq2        - the two sequences; only their size() is used in the visible code
//   id1, id2          - identifiers passed through to lib.add()
//   forward_mat       - result of the forward pass
//   backward_mat      - result of the backward pass
//   lib               - library receiving (id1, id2, x, y, score) entries
//   total_probability - value subtracted from forward+backward before exponentiation
// NOTE(review): the return type, the declarations of i, j, tmp and all braces are on
// lines not visible in this view.
501 template<
typename LibraryType>
503 hmm2lib(
const Sequence &seq1,
int id1,
const Sequence &seq2,
int id2,
const Matrix<float> &forward_mat,
const Matrix<float> &backward_mat, Library<LibraryType> &lib,
float total_probability)
505 size_t l_seq1 = seq1.size();
506 size_t l_seq2 = seq2.size();
// Candidate list; capacity is reserved for every cell of the l_seq1 x l_seq2 grid.
508 std::vector<hmm_match> matches;
509 matches.reserve(l_seq1*l_seq2);
512 for (i=0; i<l_seq1; ++i)
514 for (j=0; j<l_seq2; ++j)
// Cell value: EXP(forward + backward - total), with the exponent capped at LOG_ONE.
// Only cells scoring at least 0.01 become candidates.
516 if ((tmp=EXP(std::min(LOG_ONE,(forward_mat[i][j] + backward_mat[i][j] - total_probability)))) >= 0.01)
517 matches.push_back(hmm_match(tmp, i,j));
// Keep at most one candidate per position of the shorter sequence.
523 size_t max2 = std::min(std::min(l_seq1,l_seq2), matches.size());
526 std::sort(matches.begin(), matches.end());
// Score of the last candidate inside the limit; ties with it are also added below.
// NOTE(review): if max2 == 0 (no candidates) max2-1 wraps around - confirm that an
// elided line guards against this.
527 float min_score=matches[max2-1].match;
// i is presumably reset to 0 on a line not visible here.
529 while ((matches.size()!=i)&&(matches[i].match>=min_score))
531 lib.add(id1, id2, matches[i].x, matches[i].y, matches[i].match);
// Runs forward/backward for every pair of entries of `set` in the index range
// [start, end] (inclusive), using precomputed per-entry probabilities.
// NOTE(review): the function name/signature, the declarations of i, j, max_len, hmm,
// match_probs, forward_mat, backward_mat, fw_p, bw_p, the loop bodies at 596/600, braces
// and any cleanup code are on lines not visible in this view.
581 template<
typename DataType>
// Number of entries in the inclusive range.
585 size_t n_seqs=end-start+1;
// Find the longest entry.
// NOTE(review): this scans set[0..n_seqs-1], not set[start..end] as the loops below do -
// confirm this is intended when start != 0.
588 for (i=0; i<n_seqs; ++i)
590 if (max_len<
set[i].length())
591 max_len=
set[i].length();
// Body of this loop is not visible.
596 for (i=0; i< n_seqs; ++i)
// One vector of insert values per entry, addressed as ins_probs[index - start] below;
// it is presumably filled in the loop that follows (body not visible).
599 std::vector<std::vector<float> > ins_probs(n_seqs);
600 for (i=start; i<=end; ++i)
// Eight row buffers shared by the forward and backward passes.
// NOTE(review): allocated with raw new[]; no matching delete[] is visible in this view.
// The passes index these buffers up to the entry length, so max_len must exceed the
// longest length - confirm max_len is adjusted on an elided line.
607 float **insert_matrices =
new float*[8];
609 insert_matrices[i] =
new float[max_len];
// All unordered pairs (i, j) with start <= i < j <= end.
613 for (i=start; i<= end; ++i)
615 const DataType &aln1 =
set[i];
616 for (j=i+1; j<=end; ++j)
618 const DataType &aln2 =
set[j];
620 fw_p =
hmm_forward(hmm, ins_probs[i-start], ins_probs[j-start], match_probs, forward_mat, insert_matrices);
621 bw_p = hmm_backward(hmm, ins_probs[i-start], ins_probs[j-start], match_probs, backward_mat, insert_matrices);
// The mean of the forward and backward totals is used as the total.
622 hmm2lib(aln1, aln2, forward_mat, backward_mat, lib, (bw_p+fw_p)/2);
// Symmetric distance entry: 1 - EXP(mean total), indexed relative to start.
623 dist_mat[i-start][j-start] = dist_mat[j-start][i-start] = 1-EXP(((fw_p+bw_p)/2));
// all_hmm_pairs overload taking a std::vector of DataType, a library over
// std::vector<DataType> and a distance matrix.
// NOTE(review): the return type and the entire body are on lines not visible in this
// view, so its behaviour cannot be described from here.
629 template<
typename DataType>
631 all_hmm_pairs(
const std::vector<DataType> &
set, Library<std::vector<DataType> > &lib, Matrix<float> &dist_mat)
// Runs forward/backward for every pair of sequences in `set`, adds the resulting
// matches to `lib` and fills the symmetric distance matrix `dist_mat`.
//   set      - container providing n_seqs(), operator[] and DataType::value_type
//   lib      - library receiving the pairwise matches via hmm2lib
//   dist_mat - receives 1 - EXP(mean of forward and backward totals) at [i][j] and [j][i]
// NOTE(review): the return type, the declarations of i, j, max_len, hmm, fw_p, bw_p,
// the loop body at 652, braces and any cleanup code are on lines not visible here.
636 template<
typename DataType>
638 all_hmm_pairs(
const DataType &
set, Library<DataType> &lib, Matrix<float> &dist_mat)
642 size_t n_seqs =
set.n_seqs();
// Find the longest sequence.
643 for (i=0; i<n_seqs; ++i)
645 if (
set[i].size() > max_len)
646 max_len=
set[i].size();
// Body of this loop is not visible.
652 for (i=0; i< n_seqs; ++i)
// Work matrices sized by max_len in both dimensions.
// NOTE(review): the passes index up to size() inclusive, so max_len must exceed the
// longest size - confirm max_len is adjusted on an elided line.
656 Matrix<float> forward_mat = Matrix<float>(max_len, max_len);
657 Matrix<float> backward_mat = Matrix<float>(max_len, max_len);
// Eight row buffers shared by the forward and backward passes.
// NOTE(review): allocated with raw new[]; no matching delete[] is visible in this view.
658 float **insert_matrices =
new float*[8];
660 insert_matrices[i] =
new float[max_len];
// All unordered pairs (i, j) with i < j.
662 for (i=0; i< n_seqs; ++i)
664 for (j=i+1; j<n_seqs; ++j)
666 const typename DataType::value_type &seq1 =
set[i];
667 const typename DataType::value_type &seq2 =
set[j];
668 fw_p =
hmm_forward(seq1, seq2, hmm, forward_mat, insert_matrices);
669 bw_p = hmm_backward(seq1, seq2, hmm, backward_mat, insert_matrices);
// The mean of the forward and backward totals is used as the total.
670 hmm2lib(seq1, i, seq2, j, forward_mat, backward_mat, lib, (bw_p+fw_p)/2);
671 dist_mat[i][j] = dist_mat[j][i] = 1-EXP(((fw_p+bw_p)/2));
// Runs forward/backward for pairs of sequences of `set` selected by start/end, adds the
// resulting matches to `lib` and fills `dist_mat` with indices relative to `start`.
//   set        - container providing seq_type() and operator[] yielding Sequence
//   lib        - library receiving the pairwise matches via hmm2lib
//   dist_mat   - receives 1 - EXP(mean of forward and backward totals), symmetric
//   start, end - index range of the sequences to process
// NOTE(review): the return type, the declarations of i, j, max_len, fw_p, bw_p, the loop
// body at 694, braces and any cleanup code are on lines not visible in this view.
679 template<
typename DataType,
typename LibraryType>
681 all_hmm_pairs(
const DataType &
set, Library<LibraryType> &lib, Matrix<float> &dist_mat,
size_t start,
size_t end)
// NOTE(review): n_seqs treats `end` as inclusive, but the pair loops below use `< end`
// (exclusive) - confirm which convention is intended.
683 size_t n_seqs=end-start+1;
// Find the longest sequence.
// NOTE(review): this scans set[0..n_seqs-1], not the start..end range - confirm.
686 for (i=0; i<n_seqs; ++i)
688 if (max_len<
set[i].size())
689 max_len=
set[i].size();
// The HMM is constructed from the sequence type reported by the set.
692 HMM hmm(
set.seq_type());
// Body of this loop is not visible.
694 for (i=0; i< n_seqs; ++i)
// Work matrices sized by max_len in both dimensions.
// NOTE(review): the passes index up to size() inclusive, so max_len must exceed the
// longest size - confirm max_len is adjusted on an elided line.
698 Matrix<float> forward_mat = Matrix<float>(max_len, max_len);
699 Matrix<float> backward_mat = Matrix<float>(max_len, max_len);
// Eight row buffers shared by the forward and backward passes.
// NOTE(review): allocated with raw new[]; no matching delete[] is visible in this view.
700 float **insert_matrices =
new float*[8];
702 insert_matrices[i] =
new float[max_len];
// Pairs (i, j) with start <= i < j < end.
704 for (i=start; i< end; ++i)
706 for (j=i+1; j<end; ++j)
708 const Sequence &seq1 =
set[i];
709 const Sequence &seq2 =
set[j];
710 fw_p =
hmm_forward(seq1, seq2, hmm, forward_mat, insert_matrices);
711 bw_p = hmm_backward(seq1, seq2, hmm, backward_mat, insert_matrices);
// NOTE(review): this call passes a literal 5 before `lib` and no ids; it does not match
// the hmm2lib(seq1, id1, seq2, id2, ...) overload visible in this file - confirm that a
// matching overload exists.
712 hmm2lib(seq1, seq2, forward_mat, backward_mat, 5, lib, (bw_p+fw_p)/2);
713 dist_mat[i-start][j-start] = dist_mat[j-start][i-start] = 1-EXP(((fw_p+bw_p)/2));