MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
seq_align.hpp
1 /*
2  * seq_align.hpp
3  *
4  * Created on: Sep 17, 2013
5  * Author: ckeme_01
6  */
7 
8 #ifndef SEQ_ALIGN_HPP_
9 #define SEQ_ALIGN_HPP_
10 
11 #include <algorithm>
12 #include <cmath>
13 #include <limits>
14 #include <unordered_map>
15 //#include "Library.hpp"
16 //#include "fw_bw.hpp"
17 #include "../clustering/Tree.hpp"
18 #include "../clustering/Vector.hpp"
19 #include "../utils/MatrixStack.hpp"
20 #include "../Sequence/SplitSet.hpp"
21 #include "../utils/ThreadPool.hpp"
22 
23 #include <condition_variable>
24 #include <thread>
25 #include <mutex>
26 
27 namespace MDAT
28 {
29 
/**
 * \brief Predicate that tells whether one position of a sequence-like object is a gap.
 *
 * NOTE(review): the line carrying the `struct` keyword and the type name was
 * missing from this listing, which left the definition syntactically invalid.
 * The name `SequenceGap` is a reconstruction - confirm it against the real header.
 *
 * \tparam Data Any type whose operator[] yields something comparable with a char.
 */
template<typename Data>
struct SequenceGap
{
	/**
	 * \param data The sequence to inspect.
	 * \param pos  Index of the position to test.
	 * \return true if data[pos] is the gap character '-'.
	 */
	bool operator() (const Data &data, int pos) const
	{
		return (data[pos]=='-');
	}
};
38 
39 
40 
42  bool operator() (const ProteinSequenceSet<Default> &set, int column) const
43  {
44  size_t n_seqs=set.size();
45  for (size_t i=0; i<n_seqs; ++i)
46  {
47  if (set[i][column] != '-')
48  return false;
49  }
50  return true;
51  }
52 } SequenceSetGap_obj;
53 
54 
64 void
65 gotoh_align_banded(int dim1, int dim2, MatrixStack<3, std::pair<float, char> > &matrices, int gop, int gep, size_t band_width)
66 {
67  int diff = abs(dim1-dim2);
68  Matrix<std::pair<float, char> > &mat_m = matrices[0];
69  Matrix<std::pair<float, char> > &mat_h = matrices[1];
70  Matrix<std::pair<float, char> > &mat_v = matrices[2];
71  ++dim1;
72  ++dim2;
73  int use_gop;
74  int match_score;
75  // 0=match, 1=insert, 2=deletion
76 
77  mat_m[0][0].first= 0;
78  mat_h[0][0].first= 0;
79  mat_v[0][0].first= gep;
80  int i,j;
81 
82  for (i=1; i<dim1; ++i)
83  {
84  mat_h[i][0].first=INT_MIN+900;
85  mat_m[i][0].first=mat_v[i][0].first=mat_v[i-1][0].first+gep;
86  mat_h[i][0].second = 'm';
87  mat_v[i][0].second = 'v';
88  mat_m[i][0].second = 'v';
89  }
90  for (j=1; j<dim2; ++j)
91  {
92  mat_m[0][j].first=mat_h[0][j].first=mat_h[0][j-1].first+gep;
93  mat_v[0][j].first=INT_MIN+900;
94  mat_h[0][j].second = 'h';
95  mat_v[0][j].second = 'm';
96  mat_m[0][j].second = 'h';
97  }
98 
99 
100  int start, end;
101  for (i=1; i<dim1; ++i)
102  {
103  end = std::min<int>(dim2, i+diff+band_width);
104  if (end<dim2)
105  mat_v[i][end].first=mat_h[i][end].first=mat_m[i][end].first =INT_MIN+900;
106  start = std::max<int>(1,i-diff-band_width);
107  if (start>1)
108  mat_v[i][start-1].first=mat_h[i][start-1].first=mat_m[i][start-1].first =INT_MIN+900;
109  for (j=start; j<end; ++j)
110  {
111  //calculate insert value
112  use_gop = (j==(dim2-1))? 0 : gop;
113  if (mat_v[i-1][j].first > (mat_m[i-1][j].first +use_gop))
114  {
115  mat_v[i][j].second = 'v';
116  mat_v[i][j].first = mat_v[i-1][j].first;
117  }else
118  {
119  mat_v[i][j].second = 'm';
120  mat_v[i][j].first = mat_m[i-1][j].first+use_gop;
121  }
122  mat_v[i][j].first += gep;
123 
124  //calculate deletion value
125  use_gop = (i==(dim1-1))? 0 : gop;
126  if (mat_h[i][j-1].first > (mat_m[i][j-1].first +use_gop))
127  {
128  mat_h[i][j].second = 'h';
129  mat_h[i][j].first = mat_h[i][j-1].first;
130  } else
131  {
132  mat_h[i][j].second = 'm';
133  mat_h[i][j].first = mat_m[i][j-1].first+use_gop;
134  }
135  mat_h[i][j].first += gep;
136 
137  //calculate match value
138  match_score=mat_m[i][j].first;
139  if (mat_v[i][j].first > mat_h[i][j].first)
140  {
141  mat_m[i][j].second = 'v';
142  mat_m[i][j].first = mat_v[i][j].first;
143  }
144  else
145  {
146  mat_m[i][j].second = 'h';
147  mat_m[i][j].first = mat_h[i][j].first;
148  }
149 
150  if (mat_m[i-1][j-1].first + match_score >= mat_m[i][j].first)
151  {
152  mat_m[i][j].second = 'm';
153  mat_m[i][j].first = mat_m[i-1][j-1].first + match_score;
154  }
155  }
156  }
157 
158 }
159 
160 
169 void
170 gotoh_align(int dim1, int dim2, MatrixStack<3, std::pair<float, char> > &matrices, int gop, int gep)
171 {
172  Matrix<std::pair<float, char> > &mat_m = matrices[0];
173  Matrix<std::pair<float, char> > &mat_h = matrices[1];
174  Matrix<std::pair<float, char> > &mat_v = matrices[2];
175  ++dim1;
176  ++dim2;
177  int use_gop;
178  int match_score;
179  // 0=match, 1=insert, 2=deletion
180 
181  mat_m[0][0].first= 0;
182  mat_h[0][0].first= 0;
183  mat_v[0][0].first= gep;
184  int i,j;
185  for (i=1; i<dim1; ++i)
186  {
187  mat_h[i][0].first=INT_MIN+900;
188  mat_m[i][0].first=mat_v[i][0].first=mat_v[i-1][0].first+gep;
189  mat_h[i][0].second = 'm';
190  mat_v[i][0].second = 'v';
191  mat_m[i][0].second = 'v';
192  }
193  for (j=1; j<dim2; ++j)
194  {
195  mat_m[0][j].first=mat_h[0][j].first=mat_h[0][j-1].first+gep;
196  mat_v[0][j].first=INT_MIN+900;
197  mat_h[0][j].second = 'h';
198  mat_v[0][j].second = 'm';
199  mat_m[0][j].second = 'h';
200  }
201 
202 
203  for (i=1; i<dim1; ++i)
204  {
205  for (j=1; j<dim2; ++j)
206  {
207  //calculate insert value
208  use_gop = (j==(dim2-1))? 0 : gop;
209  if (mat_v[i-1][j].first > (mat_m[i-1][j].first +use_gop))
210  {
211  mat_v[i][j].second = 'v';
212  mat_v[i][j].first = mat_v[i-1][j].first;
213  }else
214  {
215  mat_v[i][j].second = 'm';
216  mat_v[i][j].first = mat_m[i-1][j].first+use_gop;
217  }
218  mat_v[i][j].first += gep;
219 
220  //calculate deletion value
221  use_gop = (i==(dim1-1))? 0 : gop;
222  if (mat_h[i][j-1].first > (mat_m[i][j-1].first +use_gop))
223  {
224  mat_h[i][j].second = 'h';
225  mat_h[i][j].first = mat_h[i][j-1].first;
226  } else
227  {
228  mat_h[i][j].second = 'm';
229  mat_h[i][j].first = mat_m[i][j-1].first+use_gop;
230  }
231  mat_h[i][j].first += gep;
232 
233  //calculate match value
234  match_score=mat_m[i][j].first;
235  if (mat_v[i][j].first > mat_h[i][j].first)
236  {
237  mat_m[i][j].second = 'v';
238  mat_m[i][j].first = mat_v[i][j].first;
239  }
240  else
241  {
242  mat_m[i][j].second = 'h';
243  mat_m[i][j].first = mat_h[i][j].first;
244  }
245 
246  if (mat_m[i-1][j-1].first + match_score >= mat_m[i][j].first)
247  {
248  mat_m[i][j].second = 'm';
249  mat_m[i][j].first = mat_m[i-1][j-1].first + match_score;
250  }
251  }
252  }
253 }
254 
255 /*
256 void
257 gotoh_align(int dim1, int dim2, MatrixStack<3, std::pair<float, char> > &matrices, const std::vector<float> &gop1, const std::vector<float> &gop2, const std::vector<float> &gep1, const std::vector<float> &gep2)
258 {
259  Matrix<std::pair<float, char> > &mat_m = matrices[0];
260  Matrix<std::pair<float, char> > &mat_h = matrices[1];
261  Matrix<std::pair<float, char> > &mat_v = matrices[2];
262 
263  ++dim1;
264  ++dim2;
265  int match_score;
266  // 0=match, 1=insert, 2=deletion
267 
268  mat_m[0][0].first= 0;
269  mat_h[0][0].first= 0;
270  mat_v[0][0].first= 0;
271  int i,j;
272  for (i=1; i<dim1; ++i)
273  {
274  mat_h[i][0].first=INT_MIN+900;
275  mat_m[i][0].first=mat_v[i][0].first=mat_v[i-1][0].first+gep1[i-1];
276  mat_h[i][0].second = 'm';
277  mat_v[i][0].second = 'v';
278  mat_m[i][0].second = 'v';
279  }
280  for (j=1; j<dim2; ++j)
281  {
282  mat_m[0][j].first=mat_h[0][j].first=mat_h[0][j-1].first+gep2[j-1];
283  mat_v[0][j].first=INT_MIN+900;
284  mat_h[0][j].second = 'h';
285  mat_v[0][j].second = 'm';
286  mat_m[0][j].second = 'h';
287  }
288 
289  float gop_j, gop_i;
290  for (i=1; i<dim1; ++i)
291  {
292 
293  for (j=1; j<dim2; ++j)
294  {
295  //calculate insert value
296  gop_j = (j==dim2-1)? 0 : gop1[i-1];
297  if (mat_v[i-1][j].first > mat_m[i-1][j].first +gop_j)
298  {
299  mat_v[i][j].second = 'v';
300  mat_v[i][j].first = mat_v[i-1][j].first;
301  }else
302  {
303  mat_v[i][j].second = 'm';
304  mat_v[i][j].first = mat_m[i-1][j].first+gop_j;
305  }
306  mat_v[i][j].first += gep1[i-1];
307 
308  //calculate deletion value
309  gop_i = (i==dim1-1)? 0 : gop2[j-1];
310  if (mat_h[i][j-1].first > mat_m[i][j-1].first +gop_i)
311  {
312  mat_h[i][j].second = 'h';
313  mat_h[i][j].first = mat_h[i][j-1].first;
314  } else
315  {
316  mat_h[i][j].second = 'm';
317  mat_h[i][j].first = mat_m[i][j-1].first+gop_i;
318  }
319  mat_h[i][j].first += gep2[j-1];
320 
321  //calculate match value
322  match_score=mat_m[i][j].first;
323  if (mat_v[i][j].first > mat_h[i][j].first)
324  {
325  mat_m[i][j].second = 'v';
326  mat_m[i][j].first = mat_v[i][j].first;
327  }
328  else
329  {
330  mat_m[i][j].second = 'h';
331  mat_m[i][j].first = mat_h[i][j].first;
332  }
333 
334  if (mat_m[i-1][j-1].first + match_score >= mat_m[i][j].first)
335  {
336  mat_m[i][j].second = 'm';
337  mat_m[i][j].first = mat_m[i-1][j-1].first + match_score;
338  }
339  }
340  }
341 }
342 */
343 
/**
 * \brief Walks back through filled Gotoh matrices and records the alignment path.
 *
 * The walk starts in cell (dim1, dim2) of matrices[0] and follows the tags
 * stored in the .second member of each cell ('m', 'h' or 'v'). Both edit
 * strings are written in traceback order, i.e. from the end of the alignment
 * towards its start: 'm' keeps a position of the respective input, '-'
 * stands for a gap to insert.
 *
 * \tparam MatrixStackType Anything indexable as matrices[k][i][j].second.
 * \param dim1         Row of the cell the traceback starts in.
 * \param dim2         Column of the cell the traceback starts in.
 * \param matrices     The filled matrices (0 = match, 1 = horizontal, 2 = vertical).
 * \param edit_string1 Receives the edit string of the first input (cleared first).
 * \param edit_string2 Receives the edit string of the second input (cleared first).
 */
template <typename MatrixStackType>
void
gotoh_traceback(int dim1, int dim2, const MatrixStackType &matrices, std::string &edit_string1, std::string &edit_string2)
{
	edit_string1.clear();
	edit_string2.clear();

	size_t row = dim1;
	size_t col = dim2;
	int layer = 0; // matrix the walk is currently in

	while (row != 0 && col != 0)
	{
		const char origin = matrices[layer][row][col].second;
		switch (layer)
		{
		case 0:
			if (origin == 'm')
			{
				// diagonal step: both inputs advance
				--row;
				--col;
				edit_string1.push_back('m');
				edit_string2.push_back('m');
			}
			else
			{
				// change into the gap matrix without consuming anything
				layer = (origin == 'v') ? 2 : 1;
			}
			break;
		case 2:
			// vertical step: only the first input advances
			--row;
			edit_string1.push_back('m');
			edit_string2.push_back('-');
			if (origin == 'm')
				layer = 0;
			break;
		default:
			// horizontal step: only the second input advances
			--col;
			edit_string1.push_back('-');
			edit_string2.push_back('m');
			if (origin == 'm')
				layer = 0;
			break;
		}
	}

	// One border has been reached; the remainder is a single run of gaps.
	edit_string1.append(col, '-');
	edit_string2.append(col, 'm');
	edit_string1.append(row, 'm');
	edit_string2.append(row, '-');
}
416 
417 
426 template<typename DataType>
427 void
428 fillGotohMatrix(const DataType &set, std::vector<size_t> ids1, std::vector<size_t> ids2, MatrixStack<3,std::pair<float, char> > &matrixStack, const Matrix<int> &sim_mat)
429 {
430  size_t i,j;
431  std::unordered_map<short, int>::iterator it;
432  short c;
433  size_t n_seqs1=ids1.size();
434  size_t length1 = set[ids1[0]].size();
435  std::vector<std::unordered_map<short, int> > prof1(length1);
436  for (i=0; i<n_seqs1; ++i)
437  {
438  const typename DataType::value_type &seq = set[ids1[i]];
439  for (j=0; j<length1; ++j)
440  {
441  if (seq[j] != '-')
442  {
443  c=toupper(seq[j])-65;
444  if ((it =prof1[j].find(c)) != prof1[j].end())
445  ++it->second;
446  else
447  prof1[j][c]=1;
448  }
449  }
450  }
451 
452  size_t length2 = set[ids2[0]].size();
453  size_t n_seqs2=ids2.size();
454  std::vector<std::unordered_map<short, int> > prof2(length2);
455 
456  for (i=0; i<n_seqs2; ++i)
457  {
458  const typename DataType::value_type &seq = set[ids2[i]];
459  for (j=0; j<length2; ++j)
460  {
461  if (seq[j] != '-')
462  {
463  c=toupper(seq[j])-65;
464  if ((it =prof2[j].find(c)) != prof2[j].end())
465  ++it->second;
466  else
467  prof2[j][c]=1;
468  }
469  }
470  }
471 
472  matrixStack.ensure(length1+1, length2+1);
473  Matrix<std::pair<float, char> > &matrix = matrixStack[0];
474  std::unordered_map<short, int>::iterator it1,it2,it1_end,it2_end;
475 
476  double tmp;
477  for (i=0; i<length1; ++i)
478  {
479  it1_end=prof1[i].end();
480  for (j=0; j<length2; ++j)
481  {
482 
483  matrix[i+1][j+1].first=0;
484  it2_end=prof2[i].end();
485  tmp=0;
486  for (it1=prof1[i].begin(); it1!=it1_end; ++it1)
487  {
488  const std::vector<int> &vec=sim_mat[it1->first];
489  for (it2=prof2[j].begin(); it2!=it2_end; ++it2)
490  {
491  matrix[i+1][j+1].first += vec[it2->first] * it1->second * it2->second;
492  tmp += it1->second * it2->second;
493  }
494  }
495  matrix[i+1][j+1].first /= tmp;
496  }
497  }
498 }
499 
500 
510 template<typename DataType>
511 void
512 fillGotohMatrix_banded(const DataType &set, std::vector<size_t> ids1, std::vector<size_t> ids2, MatrixStack<3,std::pair<float, char> > &matrixStack, const Matrix<int> &sim_mat, int band_width)
513 {
514  struct helper
515  {
516  std::vector<std::pair<int, short> > counter;
517  std::vector<int> char2counter;
518  int pos;
519  helper():counter(), char2counter(26,0), pos(-1)
520  {}
521  };
522  size_t i,j;
523  int pos2;
524  short c;
525 
526  size_t n_seqs1=ids1.size();
527  size_t length1 = set[ids1[0]].size();
528  std::vector<helper> prof1(length1);
529  for (i=0; i<n_seqs1; ++i)
530  {
531  const typename DataType::value_type &seq = set[ids1[i]];
532  for (j=0; j<length1; ++j)
533  {
534  if (seq[j] != '-')
535  {
536  c=toupper(seq[j])-65;
537  if ((pos2=prof1[j].char2counter[c])!=0)
538  ++prof1[j].counter[pos2].first;
539  else
540  {
541  prof1[j].char2counter[c]=++prof1[j].pos;
542  prof1[j].counter.emplace_back(std::pair<int, short>(1,c));
543  }
544  }
545  }
546  }
547 
548  size_t length2 = set[ids2[0]].size();
549  size_t n_seqs2=ids2.size();
550  std::vector<helper> prof2(length2);
551  for (i=0; i<n_seqs2; ++i)
552  {
553  const typename DataType::value_type &seq = set[ids2[i]];
554  for (j=0; j<length2; ++j)
555  {
556  if (seq[j] != '-')
557  {
558  c=toupper(seq[j])-65;
559  if ((pos2=prof2[j].char2counter[c])!=0)
560  ++prof2[j].counter[pos2].first;
561  else
562  {
563  prof2[j].char2counter[c]=++prof2[j].pos;
564  prof2[j].counter.emplace_back(std::pair<int, short>(1,c));
565  }
566  }
567  }
568  }
569 
570 
571  matrixStack.ensure(length1+50, length2+50);
572  Matrix<std::pair<float, char> > &matrix = matrixStack[0];
573  std::vector<std::pair<int, short> >::const_iterator it1,it2,it1_end,it2_end;
574 
575  double tmp;
576  size_t start, end;
577  size_t diff = (length1 > length2) ? (length1-length2) : (length2-length1);
578  for (i=0; i<length1; ++i)
579  {
580  it1_end=prof1[i].counter.cend();
581  end = std::min<int>(length2, i+diff+band_width);
582  start = std::max<int>(0,i-diff-band_width);
583  for (j=start; j<end; ++j)
584  {
585  matrix[i+1][j+1].first=0;
586  it2_end=prof2[j].counter.cend();
587  tmp=0;
588  for (it1=prof1[i].counter.cbegin(); it1!=it1_end; ++it1)
589  {
590  const std::vector<int> &vec=sim_mat[it1->second];
591  for (it2=prof2[j].counter.cbegin(); it2!=it2_end; ++it2)
592  {
593  matrix[i+1][j+1].first += vec[it2->second] * it1->first * it2->first;
594  tmp += it1->first * it2->first;
595  }
596  }
597  matrix[i+1][j+1].first /= tmp;
598  }
599  }
600 }
601 
602 
603 
613 template<typename DataType>
614 void
615 seq_progressive_align(DataType &set, std::shared_ptr<Tree> guide_tree, const Matrix<int> &sim_mat, int gop, int gep, bool banded, MatrixStack<3, std::pair<float, char> > &dyn_matrix)
616 {
617  size_t n_elems = set.size();
618  std::stack<std::pair<const TreeNode*, unsigned int> > to_do;
619  to_do.push(std::pair<const TreeNode*, int>(guide_tree->root(), 0));
620  std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
621 
622  Matrix<size_t> ids(n_elems, 1);
623  for (size_t i=0; i<n_elems; ++i)
624  ids[i][0]=i;
625  int *tree_helper = new int[guide_tree->n_species()*2];
626  for (size_t i=0; i<guide_tree->n_species()*2; ++i)
627  tree_helper[i] = i;
628  size_t dim1, dim2, i;
629  std::string edit_string1, edit_string2;
630  std::vector<float> pos_gops1, pos_gops2;
631  std::vector<float> pos_geps1, pos_geps2;
632  while (!to_do.empty())
633  {
634  std::pair<const TreeNode*, unsigned int> &current = to_do.top();
635  if (current.first->children.empty())
636  to_do.pop();
637  else if (current.second == current.first->children.size())
638  {
639  pos_geps1.clear();
640  pos_gops1.clear();
641  pos_geps2.clear();
642  pos_gops2.clear();
643  std::vector<size_t> &ids1 = ids[tree_helper[current.first->children[0]->id]];
644  dim1=set[ids1[0]].length();
645  std::vector<size_t> &ids2 = ids[tree_helper[current.first->children[1]->id]];
646  dim2=set[ids2[0]].length();
647 
648  if (banded)
649  {
650  fillGotohMatrix_banded(set, ids1, ids2, dyn_matrix, sim_mat, 15);
651  gotoh_align_banded(dim1, dim2, dyn_matrix, gop, gep, 15);
652  }
653  else
654  {
655  fillGotohMatrix(set, ids1, ids2, dyn_matrix, sim_mat);
656  gotoh_align(dim1, dim2, dyn_matrix, gop, gep);
657  }
658  gotoh_traceback(dim1, dim2, dyn_matrix, edit_string1, edit_string2);
659  for (i=0; i<ids1.size(); ++i)
660  set[ids1[i]].insert_gaps(edit_string1);
661  for (i=0; i<ids2.size(); ++i)
662  set[ids2[i]].insert_gaps(edit_string2);
663  ids1.reserve( ids1.size() + ids2.size() );
664  ids1.insert(ids1.end(), ids2.begin(), ids2.end());
665  tree_helper[current.first->id] = tree_helper[current.first->children[0]->id];
666  to_do.pop();
667  }
668  else
669  {
670  to_do.push(std::pair<const TreeNode*, int>(&(*current.first->children[current.second]), 0));
671  ++current.second;
672  }
673  }
674  delete[] tree_helper;
675 }
676 
677 
678 template<typename DataType>
679 void
680 seq_progressive_align(DataType &set, const Tree &guide_tree, const Matrix<int> &sim_mat, int gop, int gep, bool banded)
681 {
682  MatrixStack<3, std::pair<float, char> > dyn_matrix;
683  seq_progressive_align(set, guide_tree, dyn_matrix, sim_mat, gop, gep, banded);
684 }
685 
686 
696 template<typename DataType>
697 void
698 seq_prof_prof_align(DataType &set1, DataType &set2, const Matrix<int> &sim_mat, int gop, int gep, MatrixStack<3, std::pair<float, char> > &matrix)
699 {
700  size_t i;
701  std::vector<size_t> ids1, ids2;
702  size_t n_seqs1 = set1.size();
703  for (i=0; i<n_seqs1; ++i)
704  ids1.push_back(i);
705  size_t n_seqs2 = set2.size();
706  for (i=0; i<n_seqs2; ++i)
707  ids2.push_back(n_seqs1+i);
708 
709  std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
710 
711  std::string edit_string1, edit_string2;
712  std::vector<float> pos_gops1, pos_gops2;
713  std::vector<float> pos_geps1, pos_geps2;
714  set1.transfer(set2);
715  size_t dim1=set1[ids1[0]].length();
716  size_t dim2=set1[ids2[0]].length();
717  fillGotohMatrix(set1, ids1, ids2, matrix, sim_mat);
718  gotoh_align(dim1, dim2, matrix, gop, gep);
719  gotoh_traceback(dim1, dim2, matrix, edit_string1, edit_string2);
720  for (i=0; i<ids1.size(); ++i)
721  set1[ids1[i]].insert_gaps(edit_string1);
722  for (i=0; i<ids2.size(); ++i)
723  set1[ids2[i]].insert_gaps(edit_string2);
724 }
725 
726 
736 template<typename DataType>
737 void
738 seq_prof_prof_align_banded(DataType &set1, DataType &set2, const Matrix<int> &sim_mat, int gop, int gep, MatrixStack<3, std::pair<float, char> > &matrix)
739 {
740  size_t i;
741  std::vector<size_t> ids1, ids2;
742  size_t n_seqs1 = set1.size();
743  for (i=0; i<n_seqs1; ++i)
744  ids1.push_back(i);
745  size_t n_seqs2 = set2.size();
746  for (i=0; i<n_seqs2; ++i)
747  ids2.push_back(n_seqs1+i);
748 
749  std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
750 
751  std::string edit_string1, edit_string2;
752  std::vector<float> pos_gops1, pos_gops2;
753  std::vector<float> pos_geps1, pos_geps2;
754  set1.transfer(set2);
755  size_t dim1=set1[ids1[0]].length();
756  size_t dim2=set1[ids2[0]].length();
757  fillGotohMatrix_banded(set1, ids1, ids2, matrix, sim_mat, 15);
758  gotoh_align_banded(dim1, dim2, matrix, gop, gep, 15);
759  gotoh_traceback(dim1, dim2, matrix, edit_string1, edit_string2);
760  for (i=0; i<ids1.size(); ++i)
761  set1[ids1[i]].insert_gaps(edit_string1);
762  for (i=0; i<ids2.size(); ++i)
763  set1[ids2[i]].insert_gaps(edit_string2);
764 }
765 
766 
771 template<typename DataType>
772 void
773 seq_progressive_align(DataType &set, MatrixStack<3, std::pair<float, char> > &matrices, const Matrix<int> &sim_mat, int gop, int gep, bool banded)
774 {
775  size_t n_elems=set.size();
776  if (n_elems>1)
777  {
778  Matrix<float> *dist_mat = kmer_dist_mat(set);
779  Tree guide_tree;
780  std::vector<std::string> names(n_elems, "");
781  guide_tree.nj(*dist_mat, names);
782  delete dist_mat;
783  seq_progressive_align(set, matrices, guide_tree, sim_mat, gop, gep, banded);
784  }
785 }
786 
787 
788 template<typename MemoryType>
789 void
790 same_architecture_aln(ProteinSequenceSet<MemoryType> &set, const Matrix<int> &sim_mat, int gop, int gep, SplitSet<ProteinSequenceSet<Default> > &splitSet, MatrixStack<3, std::pair<float, char> > &matrix)
791 {
792  // Split sequences into domain and interdomain parts and align each separately
793  domain_column_split(set, splitSet);
794  size_t n_seqs=set.n_seqs();
795  // If only one sequence, than already aligned.
796  if (n_seqs==1)
797  return;
798 
799  // Construct guide tree
800  Matrix<float> *dist_mat = kmer_dist_mat(set);
801  std::shared_ptr<Tree> guide_tree(new Tree());
802  std::vector<std::string> names(set.size(), "");
803  std::vector<int> n_members(set.size(), 1);
804  guide_tree->upgma(*dist_mat, names, n_members);
805  delete dist_mat;
806 
807  // Align each piece
808  size_t n_pieces = splitSet.size();
809  size_t i;
810  bool banded=true;
811  for (i=0; i<n_pieces; ++i)
812  {
813  banded=!banded;
814  if (splitSet[i].size() != 0)
815  seq_progressive_align(splitSet[i], guide_tree, sim_mat, gop, gep, banded, matrix);
816  }
817 }
818 
819 
820 template<typename MemoryType>
821 void
822 same_architecture_aln(ProteinSequenceSet<MemoryType> &set, const Matrix<int> &sim_mat, int gop, int gep, SplitSet<ProteinSequenceSet<Default> > &splitSet, ThreadPool<MatrixStack<3, std::pair<float, char> > > &pool)
823 {
824  // Split sequences into domain and interdomain parts and align each separately
825  domain_column_split(set, splitSet);
826  size_t n_seqs=set.n_seqs();
827 
828  // If only one sequence, than already aligned.
829  if (n_seqs==1)
830  return;
831 
832  // Construct guide tree
833  Matrix<float> *dist_mat = kmer_dist_mat(set);
834  std::shared_ptr<Tree>guide_tree(new Tree());
835  std::vector<std::string> names(set.size(), "");
836  std::vector<int> n_members(set.size(), 1);
837  guide_tree->upgma(*dist_mat, names, n_members);
838  delete dist_mat;
839 
840  // Align each piece
841  size_t n_pieces = splitSet.size();
842  size_t i;
843  bool banded=true;
844  for (i=0; i<n_pieces; ++i)
845  {
846  banded=!banded;
847  if (splitSet[i].size() != 0)
848  pool.addTask(static_cast<void (*)(ProteinSequenceSet<MemoryType> &, std::shared_ptr<Tree>, const Matrix<int> &, int, int, bool, MatrixStack<3, std::pair<float, char> > &)>(&seq_progressive_align), std::ref(splitSet[i]), guide_tree, std::ref(sim_mat), gop, gep, banded);
849  }
850 }
851 
852 
853 template<typename MemoryType>
854 void
855 merge_sequences(SplitSet<ProteinSequenceSet<MemoryType> > &set, size_t start, size_t end)
856 {
857  size_t i,j;
858  size_t n_seqs=-1;
859  for (j=start; j<=end; ++j)
860  {
861  if (set[j].size()>0)
862  {
863  n_seqs=set[j].size();
864  break;
865  }
866  }
867  ProteinSequenceSet<MemoryType> tmp_set;
868  for (i=0; i<n_seqs; ++i)
869  tmp_set.transfer(set[j]);
870 
871  for (j=j+1; j<=end; ++j)
872  {
873  if (set[j].size()>0)
874  {
875  for (i=0; i<n_seqs; ++i)
876  tmp_set[i].append(set[j][i].sequence());
877  set[j].clear();
878  }
879  }
880  set[end].transfer(tmp_set);
881 }
882 
883 template<typename MemoryType>
884 void
885 aln_different_architectures(SplitSet<ProteinSequenceSet<MemoryType> > &set1, SplitSet<ProteinSequenceSet<MemoryType> > &set2, const Matrix<int> &sim_mat, int gop, int gep, ThreadPool<MatrixStack<3, std::pair<float, char> > > &pool)
886 {
887  size_t n_pieces=set1.size();
888  size_t i, pos=0;
889  std::vector<int> pattern1 = set1.pattern();
890  std::vector<int> pattern2 = set2.pattern();
891  for (i=0; i<n_pieces; ++i)
892  {
893  if ((pattern1[i]!=0) && (pattern2[i]!=0))
894  {
895  merge_sequences(set1, pos, i-1);
896  merge_sequences(set2, pos, i-1);
897  pool.addTask(static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &, const Matrix<int> &, int, int, MatrixStack<3, std::pair<float, char> > &)>(seq_prof_prof_align), std::ref(set1[i-1]), std::ref(set2[i-1]), std::cref(sim_mat), gop, gep);
898 
899  if (pattern1[i] == pattern2[i])
900  pool.addTask(static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &, const Matrix<int> &, int, int, MatrixStack<3, std::pair<float, char> > &)>(seq_prof_prof_align_banded), std::ref(set1[i]), std::ref(set2[i]), std::cref(sim_mat), gop, gep);
901  else
902  pool.addTask(static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &, const Matrix<int> &, int, int, MatrixStack<3, std::pair<float, char> > &)>(seq_prof_prof_align), std::ref(set1[i]), std::ref(set2[i]), std::cref(sim_mat), gop, gep);
903  pos=i+1;
904  }
905  else
906  {
907  pattern1[i]=0;
908  pattern2[i]=0;
909  }
910  }
911  merge_sequences(set1, pos, i-1);
912  merge_sequences(set2, pos, i-1);
913  pool.addTask(static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &, const Matrix<int> &, int, int, MatrixStack<3, std::pair<float, char> > &)>(seq_prof_prof_align), std::ref(set1[i-1]), std::ref(set2[i-1]), std::cref(sim_mat), gop, gep);
914  set1.pattern(pattern1);
915  set2.pattern(pattern2);
916 }
917 
918 template<typename MemoryType>
919 void
920 msa(ProteinSequenceSet<MemoryType> &set, const Tree &arch_guide_tree, const Matrix<int> &sim_mat, int gop, int gep, size_t n_threads)
921 {
922  int last=0;
923 
924  std::vector<SplitSet<ProteinSequenceSet<MemoryType> > > splitted_sets(set.n_architectures());
925  MatrixStack<3, std::pair<float, char> > matrix(10,10);
926 
927  if (set.n_architectures()==1)
928  {
929  ThreadPool<MatrixStack<3, std::pair<float, char> > > pool(n_threads, matrix);
930  same_architecture_aln(set, sim_mat, gop, gep, splitted_sets[0], pool);
931  pool.stop();
932  }
933  else
934  {
935  // Align sequences with same architecture
936  std::vector<ProteinSequenceSet<MemoryType> > architectureSplits;// = new std::vector<ProteinSequenceSet<MemoryType> >();
937  splitByArchitecture(set, architectureSplits);
938  size_t n_sets=architectureSplits.size();
939  ThreadPool<MatrixStack<3, std::pair<float, char> > > pool(n_threads, matrix);
940  for (size_t i=0; i<n_sets; ++i)
941  {
942  same_architecture_aln(architectureSplits[i], sim_mat, gop, gep, splitted_sets[i], pool);
943  }
944  pool.wait();
945  //std::cout << architectureSplits.size() << std::endl;
946  // Align sequence sets with different architecture
947  size_t n_elems = set.size();
948  std::stack<std::pair<const TreeNode*, unsigned int> > to_do;
949  to_do.push(std::pair<const TreeNode*, int>(arch_guide_tree.root(), 0));
950  int *tree_helper = new int[arch_guide_tree.n_species()*2];
951  for (size_t i=0; i<arch_guide_tree.n_species()*2; ++i)
952  tree_helper[i]=i;
953  std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
954  Matrix<size_t> ids(n_elems, 1);
955  for (size_t i=0; i<n_elems; ++i)
956  ids[i][0]=i;
957 
958  std::string edit_string1, edit_string2;
959  std::vector<float> pos_gops1, pos_gops2;
960  std::vector<float> pos_geps1, pos_geps2;
961  while (!to_do.empty())
962  {
963  std::pair<const TreeNode*, unsigned int> &current = to_do.top();
964  if (current.first->children.empty())
965  to_do.pop();
966  else if (current.second == current.first->children.size())
967  {
968  int first = tree_helper[current.first->children[0]->id];
969  int second = tree_helper[current.first->children[1]->id];
970  tree_helper[current.first->id] = tree_helper[current.first->children[0]->id];
971  last=first;//current.first->id;
972  aln_different_architectures(splitted_sets[first], splitted_sets[second], sim_mat, gop, gep, pool);
973  pool.wait();
974  to_do.pop();
975  }
976  else
977  {
978  to_do.push(std::pair<const TreeNode*, int>(&(*current.first->children[current.second]), 0));
979  ++current.second;
980  }
981  }
982  delete[] tree_helper;
983  // pool.stop();
984  }
985 
986 
987  // Merge
988  merge_sequences(splitted_sets[last], 0, splitted_sets[last].size()-1);
989  set.clear();
990  set.transfer(splitted_sets[last][splitted_sets[last].size()-1]);
991 }
992 
993 } // namespace MDAT
994 
995 #endif /* SEQ_ALIGN_HPP_ */