14 #include <unordered_map>
17 #include "../clustering/Tree.hpp"
18 #include "../clustering/Vector.hpp"
19 #include "../utils/MatrixStack.hpp"
20 #include "../Sequence/SplitSet.hpp"
21 #include "../utils/ThreadPool.hpp"
23 #include <condition_variable>
// Fragment of a templated, const call operator (the enclosing type is on
// lines not shown). Given an indexable `data` and a position `pos`, it
// returns true exactly when the element at that position is the gap
// character '-'.
30 template<
typename Data>
33 bool operator() (
const Data &data,
int pos)
const
35 return (data[pos]==
'-');
// Fragment (signature not visible): visits every entry of `set` and tests
// whether its character at index `column` differs from the gap character '-'.
// What is done when the test succeeds is on lines not shown here.
44 size_t n_seqs=
set.size();
45 for (
size_t i=0; i<n_seqs; ++i)
47 if (
set[i][column] !=
'-')
// Fragment of a banded three-matrix fill (signature, braces and `else`
// keywords are on lines not shown). mat_m, mat_v and mat_h hold a score in
// `.first` and a one-character back-pointer ('m', 'v' or 'h') in `.second`.
// The band is widened by the difference between the two dimensions.
67 int diff = abs(dim1-dim2);
79 mat_v[0][0].first= gep;
// First column: mat_v and mat_m accumulate one `gep` per row; mat_h is set to
// INT_MIN+900 so it never wins a comparison (the +900 is presumably headroom
// so that adding penalties cannot wrap around -- TODO confirm).
82 for (i=1; i<dim1; ++i)
84 mat_h[i][0].first=INT_MIN+900;
85 mat_m[i][0].first=mat_v[i][0].first=mat_v[i-1][0].first+gep;
86 mat_h[i][0].second =
'm';
87 mat_v[i][0].second =
'v';
88 mat_m[i][0].second =
'v';
// First row: mirror image of the first column, with mat_h/mat_m accumulating
// `gep` and mat_v receiving the sentinel.
90 for (j=1; j<dim2; ++j)
92 mat_m[0][j].first=mat_h[0][j].first=mat_h[0][j-1].first+gep;
93 mat_v[0][j].first=INT_MIN+900;
94 mat_h[0][j].second =
'h';
95 mat_v[0][j].second =
'm';
96 mat_m[0][j].second =
'h';
// Main fill: for each row only the columns in [start, end) are computed.
101 for (i=1; i<dim1; ++i)
// Right edge of the band; the cell at `end` is set to the sentinel in all
// three matrices so the next row cannot read a stale value from it.
// NOTE(review): `end` can equal dim2, so this writes column index dim2 --
// confirm the matrices are allocated with at least dim2+1 columns.
103 end = std::min<int>(dim2, i+diff+band_width);
105 mat_v[i][end].first=mat_h[i][end].first=mat_m[i][end].first =INT_MIN+900;
// Left edge of the band (never below 1); the cell just left of it gets the
// sentinel as well.
106 start = std::max<int>(1,i-diff-band_width);
108 mat_v[i][start-1].first=mat_h[i][start-1].first=mat_m[i][start-1].first =INT_MIN+900;
109 for (j=start; j<end; ++j)
// Vertical state: no gap-open penalty is applied in the last column.
112 use_gop = (j==(dim2-1))? 0 : gop;
// Compare continuing from mat_v against coming from mat_m plus the opening
// penalty; the two assignment pairs below are the two outcomes (the `else`
// separating them is not visible in this excerpt).
113 if (mat_v[i-1][j].first > (mat_m[i-1][j].first +use_gop))
115 mat_v[i][j].second =
'v';
116 mat_v[i][j].first = mat_v[i-1][j].first;
119 mat_v[i][j].second =
'm';
120 mat_v[i][j].first = mat_m[i-1][j].first+use_gop;
// The extension penalty is added whichever predecessor was chosen.
122 mat_v[i][j].first += gep;
// Horizontal state: no gap-open penalty is applied in the last row.
125 use_gop = (i==(dim1-1))? 0 : gop;
126 if (mat_h[i][j-1].first > (mat_m[i][j-1].first +use_gop))
128 mat_h[i][j].second =
'h';
129 mat_h[i][j].first = mat_h[i][j-1].first;
132 mat_h[i][j].second =
'm';
133 mat_h[i][j].first = mat_m[i][j-1].first+use_gop;
135 mat_h[i][j].first += gep;
// mat_m[i][j].first is read before it is overwritten, so the cell must have
// been pre-filled with the score for pairing position i with position j.
138 match_score=mat_m[i][j].first;
// Take the better of the two gap states first ...
139 if (mat_v[i][j].first > mat_h[i][j].first)
141 mat_m[i][j].second =
'v';
142 mat_m[i][j].first = mat_v[i][j].first;
146 mat_m[i][j].second =
'h';
147 mat_m[i][j].first = mat_h[i][j].first;
// ... then let the diagonal move replace it; `>=` means a tie goes to 'm'.
150 if (mat_m[i-1][j-1].first + match_score >= mat_m[i][j].first)
152 mat_m[i][j].second =
'm';
153 mat_m[i][j].first = mat_m[i-1][j-1].first + match_score;
// Fragment of the unbanded three-matrix fill (signature, braces and `else`
// keywords are on lines not shown). The recurrences match the banded variant
// visible earlier in this file, but every column 1..dim2-1 of every row is
// computed.
// Origin cell: match and horizontal states start at 0, vertical at `gep`.
181 mat_m[0][0].first= 0;
182 mat_h[0][0].first= 0;
183 mat_v[0][0].first= gep;
// First column: mat_v/mat_m accumulate `gep`; mat_h is set to the INT_MIN+900
// sentinel so it never wins a comparison.
185 for (i=1; i<dim1; ++i)
187 mat_h[i][0].first=INT_MIN+900;
188 mat_m[i][0].first=mat_v[i][0].first=mat_v[i-1][0].first+gep;
189 mat_h[i][0].second =
'm';
190 mat_v[i][0].second =
'v';
191 mat_m[i][0].second =
'v';
// First row: mat_h/mat_m accumulate `gep`; mat_v gets the sentinel.
193 for (j=1; j<dim2; ++j)
195 mat_m[0][j].first=mat_h[0][j].first=mat_h[0][j-1].first+gep;
196 mat_v[0][j].first=INT_MIN+900;
197 mat_h[0][j].second =
'h';
198 mat_v[0][j].second =
'm';
199 mat_m[0][j].second =
'h';
203 for (i=1; i<dim1; ++i)
205 for (j=1; j<dim2; ++j)
// Vertical state: the gap-open penalty is dropped in the last column.
208 use_gop = (j==(dim2-1))? 0 : gop;
// Two outcomes follow (the `else` between them is not visible here):
// continue from mat_v, or come from mat_m and add the opening penalty.
209 if (mat_v[i-1][j].first > (mat_m[i-1][j].first +use_gop))
211 mat_v[i][j].second =
'v';
212 mat_v[i][j].first = mat_v[i-1][j].first;
215 mat_v[i][j].second =
'm';
216 mat_v[i][j].first = mat_m[i-1][j].first+use_gop;
// Extension penalty applies to either outcome.
218 mat_v[i][j].first += gep;
// Horizontal state: the gap-open penalty is dropped in the last row.
221 use_gop = (i==(dim1-1))? 0 : gop;
222 if (mat_h[i][j-1].first > (mat_m[i][j-1].first +use_gop))
224 mat_h[i][j].second =
'h';
225 mat_h[i][j].first = mat_h[i][j-1].first;
228 mat_h[i][j].second =
'm';
229 mat_h[i][j].first = mat_m[i][j-1].first+use_gop;
231 mat_h[i][j].first += gep;
// The cell is expected to hold the pairwise score on entry; it is saved here
// before being overwritten with the best total.
234 match_score=mat_m[i][j].first;
// Best of the two gap states first ...
235 if (mat_v[i][j].first > mat_h[i][j].first)
237 mat_m[i][j].second =
'v';
238 mat_m[i][j].first = mat_v[i][j].first;
242 mat_m[i][j].second =
'h';
243 mat_m[i][j].first = mat_h[i][j].first;
// ... then the diagonal move replaces it when at least as good (ties -> 'm').
246 if (mat_m[i-1][j-1].first + match_score >= mat_m[i][j].first)
248 mat_m[i][j].second =
'm';
249 mat_m[i][j].first = mat_m[i-1][j-1].first + match_score;
// Walks back through the filled matrices and writes one edit string per
// input. Both output strings are cleared first. Each step appends one
// character to each string: 'm' where that side contributes a position and
// '-' where it receives a gap.
//
// dim1, dim2     dimensions of the filled matrices
// matrices       indexed as matrices[mat][i][j]; `.second` holds the stored
//                back-pointer character
// edit_string1/2 outputs; existing content is discarded
//
// The return type, the initial values of i, j and mat, and the conditions
// that choose between the push_back pairs below are on lines not shown.
353 template <
typename MatrixStackType>
355 gotoh_traceback(
int dim1,
int dim2,
const MatrixStackType &matrices, std::string &edit_string1, std::string &edit_string2)
357 edit_string1.clear();
358 edit_string2.clear();
// The loop ends as soon as either index reaches 0.
364 while ((i!=0) && (j!=0))
366 state = matrices[mat][i][j].second;
// Both sides contribute a position.
373 edit_string1.push_back(
'm');
374 edit_string2.push_back(
'm');
// Side 1 contributes a position, side 2 receives a gap.
389 edit_string1.push_back(
'm');
390 edit_string2.push_back(
'-');
// Side 1 receives a gap, side 2 contributes a position.
395 edit_string1.push_back(
'-');
396 edit_string2.push_back(
'm');
// The same two kinds of step appear again below, reached through branch
// conditions that are not visible in this excerpt.
406 edit_string1.push_back(
'-');
407 edit_string2.push_back(
'm');
412 edit_string1.push_back(
'm');
413 edit_string2.push_back(
'-');
// Fragment of a templated routine (signature, braces and some statements are
// on lines not shown) that builds a per-column character profile for two
// groups of sequences and fills matrix[i+1][j+1].first with the
// count-weighted average substitution score of column i of group 1 against
// column j of group 2.
426 template<
typename DataType>
431 std::unordered_map<short, int>::iterator it;
// Profile of group 1: one map per column, keyed by character code.
433 size_t n_seqs1=ids1.size();
434 size_t length1 =
set[ids1[0]].size();
435 std::vector<std::unordered_map<short, int> > prof1(length1);
436 for (i=0; i<n_seqs1; ++i)
438 const typename DataType::value_type &seq =
set[ids1[i]];
439 for (j=0; j<length1; ++j)
// 65 is ASCII 'A', so upper-cased letters map to 0..25.
443 c=toupper(seq[j])-65;
444 if ((it =prof1[j].find(c)) != prof1[j].end())
// Profile of group 2, built the same way.
452 size_t length2 =
set[ids2[0]].size();
453 size_t n_seqs2=ids2.size();
454 std::vector<std::unordered_map<short, int> > prof2(length2);
456 for (i=0; i<n_seqs2; ++i)
458 const typename DataType::value_type &seq =
set[ids2[i]];
459 for (j=0; j<length2; ++j)
463 c=toupper(seq[j])-65;
464 if ((it =prof2[j].find(c)) != prof2[j].end())
// One extra row and column: cell (i+1, j+1) belongs to column pair (i, j).
472 matrixStack.ensure(length1+1, length2+1);
474 std::unordered_map<short, int>::iterator it1,it2,it1_end,it2_end;
477 for (i=0; i<length1; ++i)
479 it1_end=prof1[i].end();
480 for (j=0; j<length2; ++j)
483 matrix[i+1][j+1].first=0;
// The end iterator must come from the same map that `it2` walks (prof2[j]).
// It was previously taken from prof2[i], which compared iterators of two
// different containers and could index prof2 past its last column.
484 it2_end=prof2[j].end();
486 for (it1=prof1[i].begin(); it1!=it1_end; ++it1)
488 const std::vector<int> &vec=sim_mat[it1->first];
489 for (it2=prof2[j].begin(); it2!=it2_end; ++it2)
// Score weighted by how often each character occurs in the two columns;
// `tmp` accumulates the total weight used for normalisation below.
491 matrix[i+1][j+1].first += vec[it2->first] * it1->second * it2->second;
492 tmp += it1->second * it2->second;
495 matrix[i+1][j+1].first /= tmp;
// Fragment of the banded counterpart of the profile scoring routine
// (signature, braces and some statements are on lines not shown). Each
// column profile is a small `helper` record instead of a hash map:
//   counter       list of (occurrence count, character code) entries
//   char2counter  26 slots mapping a character code to its index in `counter`
//   pos           index of the most recently added entry, starting at -1
510 template<
typename DataType>
516 std::vector<std::pair<int, short> > counter;
517 std::vector<int> char2counter;
519 helper():counter(), char2counter(26,0), pos(-1)
// Profile of group 1.
526 size_t n_seqs1=ids1.size();
527 size_t length1 =
set[ids1[0]].size();
528 std::vector<helper> prof1(length1);
529 for (i=0; i<n_seqs1; ++i)
531 const typename DataType::value_type &seq =
set[ids1[i]];
532 for (j=0; j<length1; ++j)
// 65 is ASCII 'A', so upper-cased letters map to 0..25.
536 c=toupper(seq[j])-65;
// A non-zero slot means the character already has a counter entry.
// NOTE(review): slots start at 0 and the first entry of a column is also
// given index 0 (pos goes -1 -> 0), so that character still looks "unseen"
// on its next occurrence and appears to get a second entry -- assuming the
// two statements after the increment form the else branch, which is not
// visible here. The weighted sums further down add up the same either way;
// confirm whether the duplicate entry is intended.
537 if ((pos2=prof1[j].char2counter[c])!=0)
538 ++prof1[j].counter[pos2].first;
541 prof1[j].char2counter[c]=++prof1[j].pos;
542 prof1[j].counter.emplace_back(std::pair<int, short>(1,c));
// Profile of group 2, built the same way.
548 size_t length2 =
set[ids2[0]].size();
549 size_t n_seqs2=ids2.size();
550 std::vector<helper> prof2(length2);
551 for (i=0; i<n_seqs2; ++i)
553 const typename DataType::value_type &seq =
set[ids2[i]];
554 for (j=0; j<length2; ++j)
558 c=toupper(seq[j])-65;
559 if ((pos2=prof2[j].char2counter[c])!=0)
560 ++prof2[j].counter[pos2].first;
563 prof2[j].char2counter[c]=++prof2[j].pos;
564 prof2[j].counter.emplace_back(std::pair<int, short>(1,c));
// Storage is requested with 50 extra rows and columns (presumably slack for
// the band-edge cells written by the banded fill -- TODO confirm).
571 matrixStack.ensure(length1+50, length2+50);
573 std::vector<std::pair<int, short> >::const_iterator it1,it2,it1_end,it2_end;
// Absolute length difference, computed without unsigned wrap-around.
577 size_t diff = (length1 > length2) ? (length1-length2) : (length2-length1);
578 for (i=0; i<length1; ++i)
580 it1_end=prof1[i].counter.cend();
// Only columns inside the band [start, end) around the diagonal are scored.
581 end = std::min<int>(length2, i+diff+band_width);
582 start = std::max<int>(0,i-diff-band_width);
583 for (j=start; j<end; ++j)
585 matrix[i+1][j+1].first=0;
586 it2_end=prof2[j].counter.cend();
588 for (it1=prof1[i].counter.cbegin(); it1!=it1_end; ++it1)
590 const std::vector<int> &vec=sim_mat[it1->second];
591 for (it2=prof2[j].counter.cbegin(); it2!=it2_end; ++it2)
// Count-weighted score; `tmp` accumulates the total weight.
593 matrix[i+1][j+1].first += vec[it2->second] * it1->first * it2->first;
594 tmp += it1->first * it2->first;
// Normalise to an average score per character pair.
597 matrix[i+1][j+1].first /= tmp;
// Fragment of the progressive alignment driver that takes the guide tree
// through a pointer-like handle (signature, braces and several statements
// are on lines not shown). The tree is walked with an explicit stack; each
// stack entry pairs a node with the index of the next child to visit.
613 template<
typename DataType>
617 size_t n_elems =
set.size();
618 std::stack<std::pair<const TreeNode*, unsigned int> > to_do;
619 to_do.push(std::pair<const TreeNode*, int>(guide_tree->root(), 0));
620 std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
623 for (
size_t i=0; i<n_elems; ++i)
// tree_helper maps a tree node id to the index of its sequence group in
// `ids`; it is sized for twice the number of species and freed at the end.
625 int *tree_helper =
new int[guide_tree->n_species()*2];
626 for (
size_t i=0; i<guide_tree->n_species()*2; ++i)
628 size_t dim1, dim2, i;
629 std::string edit_string1, edit_string2;
630 std::vector<float> pos_gops1, pos_gops2;
631 std::vector<float> pos_geps1, pos_geps2;
632 while (!to_do.empty())
// Bind by reference so the child counter of the stack entry itself can be
// advanced. (The declarator had been corrupted to a currency sign followed
// by 't'; the following lines all use the name `current`.)
634 std::pair<const TreeNode*, unsigned int> &current = to_do.top();
// Leaf node: nothing to align.
635 if (current.first->children.empty())
// All children visited: align the groups of child 0 and child 1.
637 else if (current.second == current.first->children.size())
643 std::vector<size_t> &ids1 = ids[tree_helper[current.first->children[0]->id]];
644 dim1=
set[ids1[0]].length();
645 std::vector<size_t> &ids2 = ids[tree_helper[current.first->children[1]->id]];
646 dim2=
set[ids2[0]].length();
// Apply the computed edit strings to every sequence of both groups.
659 for (i=0; i<ids1.size(); ++i)
660 set[ids1[i]].insert_gaps(edit_string1);
661 for (i=0; i<ids2.size(); ++i)
662 set[ids2[i]].insert_gaps(edit_string2);
// Fold group 2 into group 1 and let the parent node point at the merged group.
663 ids1.reserve( ids1.size() + ids2.size() );
664 ids1.insert(ids1.end(), ids2.begin(), ids2.end());
665 tree_helper[current.first->id] = tree_helper[current.first->children[0]->id];
// Otherwise descend into the next unvisited child.
670 to_do.push(std::pair<const TreeNode*, int>(&(*current.first->children[current.second]), 0));
674 delete[] tree_helper;
// Overload of seq_progressive_align that receives the guide tree by const
// reference and creates its own three-layer MatrixStack (`dyn_matrix`)
// locally. The return type and the rest of the body are on lines not shown.
//
// set         sequences to align (modifiable)
// guide_tree  tree describing the merge order
// sim_mat     integer similarity matrix
// gop, gep    gap penalties (the code elsewhere in this file adds `gop` once
//             when a gap state is entered and `gep` for every gap step)
// banded      selects the banded variant (how it is used is not visible here)
678 template<
typename DataType>
680 seq_progressive_align(DataType &
set,
const Tree &guide_tree,
const Matrix<int> &sim_mat,
int gop,
int gep,
bool banded)
682 MatrixStack<3, std::pair<float, char> > dyn_matrix;
// Fragment of a routine that aligns two sequence sets against each other
// (signature, braces and the alignment calls are on lines not shown;
// presumably the non-banded profile/profile aligner -- TODO confirm).
696 template<
typename DataType>
// ids1 / ids2 hold the positions of the two groups inside one combined set.
701 std::vector<size_t> ids1, ids2;
702 size_t n_seqs1 = set1.size();
703 for (i=0; i<n_seqs1; ++i)
705 size_t n_seqs2 = set2.size();
// Members of the second group are numbered after those of the first.
706 for (i=0; i<n_seqs2; ++i)
707 ids2.push_back(n_seqs1+i);
709 std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
711 std::string edit_string1, edit_string2;
712 std::vector<float> pos_gops1, pos_gops2;
713 std::vector<float> pos_geps1, pos_geps2;
// NOTE(review): both lengths are read from set1, including the one addressed
// through ids2 -- this only works if set2's sequences have been moved into
// set1 on a line not shown here; confirm.
715 size_t dim1=set1[ids1[0]].length();
716 size_t dim2=set1[ids2[0]].length();
// Apply the resulting edit strings to every member of each group.
720 for (i=0; i<ids1.size(); ++i)
721 set1[ids1[i]].insert_gaps(edit_string1);
722 for (i=0; i<ids2.size(); ++i)
723 set1[ids2[i]].insert_gaps(edit_string2);
// Fragment of a second routine with the same visible shape as the one before
// it (signature, braces and the alignment calls are on lines not shown;
// presumably the banded profile/profile aligner -- TODO confirm).
736 template<
typename DataType>
// ids1 / ids2 hold the positions of the two groups inside one combined set.
741 std::vector<size_t> ids1, ids2;
742 size_t n_seqs1 = set1.size();
743 for (i=0; i<n_seqs1; ++i)
745 size_t n_seqs2 = set2.size();
// Members of the second group are numbered after those of the first.
746 for (i=0; i<n_seqs2; ++i)
747 ids2.push_back(n_seqs1+i);
749 std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
751 std::string edit_string1, edit_string2;
752 std::vector<float> pos_gops1, pos_gops2;
753 std::vector<float> pos_geps1, pos_geps2;
// NOTE(review): both lengths are read from set1, including the one addressed
// through ids2 -- presumably set2 was merged into set1 on a line not shown;
// confirm.
755 size_t dim1=set1[ids1[0]].length();
756 size_t dim2=set1[ids2[0]].length();
// Apply the resulting edit strings to every member of each group.
760 for (i=0; i<ids1.size(); ++i)
761 set1[ids1[i]].insert_gaps(edit_string1);
762 for (i=0; i<ids2.size(); ++i)
763 set1[ids2[i]].insert_gaps(edit_string2);
// Fragment (signature and the distance-matrix computation are on lines not
// shown): builds a guide tree for `set` by calling the tree's `nj` method on
// a distance matrix, passing one empty name per element.
771 template<
typename DataType>
775 size_t n_elems=
set.size();
780 std::vector<std::string> names(n_elems,
"");
781 guide_tree.
nj(*dist_mat, names);
// Aligns sequences of one architecture, using a caller-supplied MatrixStack.
// Visible steps: split the set into pieces with domain_column_split, build a
// guide tree with `upgma` from a k-mer distance matrix, then visit every
// non-empty piece. The return type and the per-piece work are on lines not
// shown.
//
// set       sequences of one architecture
// sim_mat   integer similarity matrix
// gop, gep  gap penalties
// splitSet  receives the pieces produced by domain_column_split
// matrix    reusable dynamic-programming storage
788 template<
typename MemoryType>
790 same_architecture_aln(ProteinSequenceSet<MemoryType> &
set,
const Matrix<int> &sim_mat,
int gop,
int gep, SplitSet<ProteinSequenceSet<Default> > &splitSet, MatrixStack<3, std::pair<float, char> > &matrix)
793 domain_column_split(
set, splitSet);
794 size_t n_seqs=
set.n_seqs();
// NOTE(review): kmer_dist_mat hands back a raw pointer; where (or whether) it
// is released is not visible in this excerpt -- confirm.
800 Matrix<float> *dist_mat = kmer_dist_mat(
set);
801 std::shared_ptr<Tree> guide_tree(
new Tree());
// Every leaf starts with an empty name and a member count of 1.
802 std::vector<std::string> names(
set.size(),
"");
803 std::vector<int> n_members(
set.size(), 1);
804 guide_tree->upgma(*dist_mat, names, n_members);
808 size_t n_pieces = splitSet.size();
// Empty pieces are skipped.
811 for (i=0; i<n_pieces; ++i)
814 if (splitSet[i].size() != 0)
// Thread-pool overload of same_architecture_aln: the visible set-up is the
// same as in the MatrixStack overload, but each non-empty piece is queued on
// `pool` as a seq_progressive_align task instead of being handled inline.
// The return type and the definition of `banded` are on lines not shown.
//
// set       sequences of one architecture
// sim_mat   integer similarity matrix
// gop, gep  gap penalties
// splitSet  receives the pieces produced by domain_column_split
// pool      task pool parameterised on the MatrixStack type
820 template<
typename MemoryType>
822 same_architecture_aln(ProteinSequenceSet<MemoryType> &
set,
const Matrix<int> &sim_mat,
int gop,
int gep, SplitSet<ProteinSequenceSet<Default> > &splitSet,
ThreadPool<MatrixStack<3, std::pair<float, char> > > &pool)
825 domain_column_split(
set, splitSet);
826 size_t n_seqs=
set.n_seqs();
// NOTE(review): raw pointer result; its release is not visible here -- confirm.
833 Matrix<float> *dist_mat = kmer_dist_mat(
set);
// The tree is held by shared_ptr and passed by value to each queued task.
834 std::shared_ptr<Tree>guide_tree(
new Tree());
835 std::vector<std::string> names(
set.size(),
"");
836 std::vector<int> n_members(
set.size(), 1);
837 guide_tree->upgma(*dist_mat, names, n_members);
841 size_t n_pieces = splitSet.size();
844 for (i=0; i<n_pieces; ++i)
847 if (splitSet[i].size() != 0)
// The static_cast picks the seq_progressive_align overload that takes a
// shared_ptr<Tree> and a trailing MatrixStack reference. That last argument
// is not passed here (presumably the pool supplies it to the task -- TODO
// confirm against ThreadPool).
848 pool.addTask(
static_cast<void (*)(ProteinSequenceSet<MemoryType> &, std::shared_ptr<Tree>,
const Matrix<int> &,
int,
int,
bool, MatrixStack<3, std::pair<float, char>
> &)>(&
seq_progressive_align), std::ref(splitSet[i]), guide_tree, std::ref(sim_mat), gop, gep, banded);
// Joins the pieces set[start] .. set[end] (both inclusive) into one piece and
// stores the result in set[end]. The return type, braces and some statements
// are on lines not shown.
//
// set    split set whose pieces are concatenated
// start  index of the first piece to consider
// end    index of the last piece, which also receives the result
853 template<
typename MemoryType>
855 merge_sequences(SplitSet<ProteinSequenceSet<MemoryType> > &
set,
size_t start,
size_t end)
// Scan the range for a starting piece (presumably the first non-empty one;
// the condition and loop exit are not visible -- TODO confirm).
859 for (j=start; j<=end; ++j)
863 n_seqs=
set[j].size();
// Move that piece's sequences into a temporary set, one transfer call per
// sequence.
867 ProteinSequenceSet<MemoryType> tmp_set;
868 for (i=0; i<n_seqs; ++i)
869 tmp_set.transfer(
set[j]);
// Continue with the pieces after it and append each one's i-th sequence to
// the i-th sequence collected so far.
871 for (j=j+1; j<=end; ++j)
875 for (i=0; i<n_seqs; ++i)
876 tmp_set[i].append(
set[j][i].sequence());
// The merged sequences end up in the last piece of the range.
880 set[end].transfer(tmp_set);
// Aligns two split sets with different architectures piece by piece, queuing
// the actual alignments on `pool`. The return type, braces, `else` keywords
// and the maintenance of `pos` are on lines not shown.
//
// set1, set2  the two split sets; pieces are merged and aligned in place
// sim_mat     integer similarity matrix
// gop, gep    gap penalties
// pool        task pool that runs the queued alignment functions
883 template<
typename MemoryType>
885 aln_different_architectures(SplitSet<ProteinSequenceSet<MemoryType> > &set1, SplitSet<ProteinSequenceSet<MemoryType> > &set2,
const Matrix<int> &sim_mat,
int gop,
int gep,
ThreadPool<MatrixStack<3, std::pair<float, char> > > &pool)
887 size_t n_pieces=set1.size();
// Local copies of the per-piece patterns; they are written back at the end.
889 std::vector<int> pattern1 = set1.pattern();
890 std::vector<int> pattern2 = set2.pattern();
891 for (i=0; i<n_pieces; ++i)
// Piece i has a non-zero pattern entry on both sides.
893 if ((pattern1[i]!=0) && (pattern2[i]!=0))
// Collapse the pieces from `pos` up to i-1 into piece i-1 on both sides and
// queue an unbanded alignment of that merged piece (any extra condition
// guarding this step is not visible here).
895 merge_sequences(set1, pos, i-1);
896 merge_sequences(set2, pos, i-1);
897 pool.addTask(
static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &,
const Matrix<int> &,
int,
int, MatrixStack<3, std::pair<float, char>
> &)>(
seq_prof_prof_align), std::ref(set1[i-1]), std::ref(set2[i-1]), std::cref(sim_mat), gop, gep);
// Equal pattern entries: piece i is aligned with the banded routine.
899 if (pattern1[i] == pattern2[i])
900 pool.addTask(
static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &,
const Matrix<int> &,
int,
int, MatrixStack<3, std::pair<float, char>
> &)>(
seq_prof_prof_align_banded), std::ref(set1[i]), std::ref(set2[i]), std::cref(sim_mat), gop, gep);
// Unbanded alignment of piece i (presumably the else branch of the test
// above; the keyword is not visible -- TODO confirm).
902 pool.addTask(
static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &,
const Matrix<int> &,
int,
int, MatrixStack<3, std::pair<float, char>
> &)>(
seq_prof_prof_align), std::ref(set1[i]), std::ref(set2[i]), std::cref(sim_mat), gop, gep);
// Same merge-then-align step again for the run of pieces ending at i-1
// (presumably the trailing run after the loop -- TODO confirm).
911 merge_sequences(set1, pos, i-1);
912 merge_sequences(set2, pos, i-1);
913 pool.addTask(
static_cast<void (*)(ProteinSequenceSet<MemoryType> &, ProteinSequenceSet<MemoryType> &,
const Matrix<int> &,
int,
int, MatrixStack<3, std::pair<float, char>
> &)>(
seq_prof_prof_align), std::ref(set1[i-1]), std::ref(set2[i-1]), std::cref(sim_mat), gop, gep);
// Store the pattern vectors back into both split sets.
914 set1.pattern(pattern1);
915 set2.pattern(pattern2);
// Top-level multiple sequence alignment. Sequences are first aligned within
// each architecture, then the per-architecture alignments are combined by
// walking `arch_guide_tree`, and finally the pieces of the last split set are
// merged and moved back into `set`. The return type, braces, and the
// definitions of `pool` and `last` are on lines not shown.
//
// set              input sequences; receives the final alignment
// arch_guide_tree  tree giving the order in which architectures are combined
// sim_mat          integer similarity matrix
// gop, gep         gap penalties
// n_threads        thread count (its use is not visible in this excerpt)
918 template<
typename MemoryType>
920 msa(ProteinSequenceSet<MemoryType> &
set,
const Tree &arch_guide_tree,
const Matrix<int> &sim_mat,
int gop,
int gep,
size_t n_threads)
// One split set per architecture.
924 std::vector<SplitSet<ProteinSequenceSet<MemoryType> > > splitted_sets(
set.n_architectures());
925 MatrixStack<3, std::pair<float, char> > matrix(10,10);
// Single architecture: align the whole set directly.
927 if (
set.n_architectures()==1)
930 same_architecture_aln(
set, sim_mat, gop, gep, splitted_sets[0], pool);
// Several architectures: split by architecture and align each part.
936 std::vector<ProteinSequenceSet<MemoryType> > architectureSplits;
937 splitByArchitecture(
set, architectureSplits);
938 size_t n_sets=architectureSplits.size();
940 for (
size_t i=0; i<n_sets; ++i)
942 same_architecture_aln(architectureSplits[i], sim_mat, gop, gep, splitted_sets[i], pool);
// Walk the architecture tree with an explicit stack; each entry pairs a node
// with the index of the next child to visit.
947 size_t n_elems =
set.size();
948 std::stack<std::pair<const TreeNode*, unsigned int> > to_do;
949 to_do.push(std::pair<const TreeNode*, int>(arch_guide_tree.root(), 0));
// tree_helper maps a tree node id to an index into splitted_sets; it is sized
// for twice the number of species and freed after the walk.
950 int *tree_helper =
new int[arch_guide_tree.n_species()*2];
951 for (
size_t i=0; i<arch_guide_tree.n_species()*2; ++i)
953 std::vector<std::pair<unsigned int, unsigned int> > gap1, gap2;
954 Matrix<size_t> ids(n_elems, 1);
955 for (
size_t i=0; i<n_elems; ++i)
958 std::string edit_string1, edit_string2;
959 std::vector<float> pos_gops1, pos_gops2;
960 std::vector<float> pos_geps1, pos_geps2;
961 while (!to_do.empty())
// Bind by reference so the child counter of the stack entry itself can be
// advanced. (The declarator had been corrupted to a currency sign followed
// by 't'; the following lines all use the name `current`.)
963 std::pair<const TreeNode*, unsigned int> &current = to_do.top();
// Leaf node: nothing to combine.
964 if (current.first->children.empty())
// All children visited: combine the alignments of child 0 and child 1; the
// parent then refers to the split set of child 0.
966 else if (current.second == current.first->children.size())
968 int first = tree_helper[current.first->children[0]->id];
969 int second = tree_helper[current.first->children[1]->id];
970 tree_helper[current.first->id] = tree_helper[current.first->children[0]->id];
972 aln_different_architectures(splitted_sets[first], splitted_sets[second], sim_mat, gop, gep, pool);
// Otherwise descend into the next unvisited child.
978 to_do.push(std::pair<const TreeNode*, int>(&(*current.first->children[current.second]), 0));
982 delete[] tree_helper;
// Concatenate all pieces of the final split set into its last piece and move
// that piece into the caller's set.
988 merge_sequences(splitted_sets[last], 0, splitted_sets[last].size()-1);
990 set.transfer(splitted_sets[last][splitted_sets[last].size()-1]);