MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
HMM.hpp
Go to the documentation of this file.
1 /*
2  * HMM.hpp
3  *
4  * Created on: Apr 12, 2012
5  * Author: Carsten Kemena
6  *
7  * This file is part of MDAT.
8  *
9  * MDAT is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * MDAT is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with MDAT. If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23 
28 #ifndef HMM_H_
29 #define HMM_H_
30 
31 
32 // C header
33 #include <cctype>
34 #include <cfloat>
35 
36 // C++ header
37 #include <cmath>
38 #include <unordered_map>
39 
40 
41 // MDAT header
42 #include "../utils/Matrix.hpp"
43 
44 
45 namespace MDAT {
46 
47 
48 
/// Container for the parameters of a hidden Markov model (transition, insertion
/// and match probabilities, an initial distribution and two state counts) plus
/// template helpers that derive per-column insertion probabilities and pairwise
/// column match probabilities from alignments.
/// NOTE(review): this listing comes from a Doxygen source view; the leading
/// numbers are Doxygen line numbers and the original doc comments were stripped.
52 class HMM {
53 
54 private:
/// Transition probabilities; only exposed through trans_probs() in this header.
55  Matrix<float> _transProb;
/// Insertion probabilities; the helpers below index this by the character value of a residue.
56  std::vector<float> _insProb;
/// Match probabilities; the helpers below index this by the codes of two lower-cased residues.
57  Matrix<float> _matchProb;
/// Raw pointer to the initial distribution.
/// NOTE(review): allocation and release are not visible in this header; presumably
/// owned by the class and freed in ~HMM() - confirm, and check copy behaviour.
58  float *_initDistr;
/// Value returned by num_states().
59  short _num_states;
/// Value returned by num_ins_states().
60  short _num_ins_states;
61 
62 
63 public:
/// Constructs the model. NOTE(review): the accepted values of `type` are defined
/// in the implementation file, not here - confirm before documenting them.
68  HMM(char type);
/// Virtual destructor (defined outside this header).
69  virtual ~HMM();
70 
/// @return The stored value of _num_states.
75  short num_states() const
76  {
77  return _num_states;
78  }
79 
/// @return The stored value of _num_ins_states.
84  short num_ins_states() const
85  {
86  return _num_ins_states;
87  }
88 
/// @return Read-only reference to the transition probability matrix.
93  const Matrix<float>&
94  trans_probs() const
95  {
96  return _transProb;
97  }
98 
/// @return Read-only reference to the insertion probability vector.
103  const std::vector<float>&
104  ins_probs() const
105  {
106  return _insProb;
107  }
108 
/// @return Read-only reference to the match probability matrix.
113  const Matrix<float>&
114  match_probs() const
115  {
116  return _matchProb;
117  }
118 
119 
/// Accessor returning the raw _initDistr pointer.
/// NOTE(review): Doxygen line 125 (the function name and parameter list) is
/// missing from this listing; restore it from the real header before compiling.
124  const float *
126  {
127  return _initDistr;
128  }
129 
130 
/// Fills ins_probs with one averaged insertion probability per column of aln
/// (definition below the class).
137  template<typename DataType>
138  void
139  calculate_insertion_probs(const DataType &aln, std::vector<float> &ins_probs);
140 
/// Same as calculate_insertion_probs, but for an alignment given as a sequence
/// of pieces whose columns are concatenated (definition below the class).
146  template<typename DataType>
147  void
148  calculate_insertion_probs_splitted(const DataType &aln_vec, std::vector<float> &ins_probs);
149 
/// Fills match_probs with one value per (column of aln1, column of aln2) pair
/// (definition below the class).
157  template<typename DataType>
158  void
159  calculate_match_probs(const DataType &aln1, const DataType &aln2, Matrix<float> &match_probs);
160 
/// Piece-wise variant of calculate_match_probs; start and end are used to
/// derive how many pieces of aln_vec1 are processed (definition below the class).
168  template<typename DataType>
169  void
170  calculate_match_probs_splitted(const DataType &aln_vec1, const DataType &aln_vec2, Matrix<float> &match_probs, size_t start, size_t end);
171 
172 
173 
174 
175 };
176 
177 
178 template<typename DataType>
179 void
180 HMM::calculate_insertion_probs(const DataType &aln, std::vector<float> &ins_probs)
181 {
182  size_t aln_len = aln.length();
183  size_t i, j;
184 
185  //calculate new insertion scores
186  ins_probs.resize(aln_len);
187  for (j=0; j<aln_len; ++j)
188  ins_probs[j]=0;
189  std::vector<int> non_gap_counter(aln_len);
190 
191  size_t n_seqs = aln.n_seqs();
192  char c;
193  for (i=0; i<n_seqs; ++i)
194  {
195  for (j=0; j<aln_len; ++j)
196  {
197  c=aln[i][j];
198  if (c!='-')
199  {
200  ++non_gap_counter[j];
201  ins_probs[j] += _insProb[c];
202  }
203  }
204 
205  }
206  for (j=0; j<aln_len; ++j)
207  ins_probs[j] /= non_gap_counter[j];
208 }
209 
210 template<typename DataType>
211 void
212 HMM::calculate_insertion_probs_splitted(const DataType &aln_vec, std::vector<float> &ins_probs)
213 {
214  size_t n_pieces = aln_vec.size();
215  size_t total_len = 0;
216  size_t i, j, k;
217  for (i=0; i<n_pieces; ++i)
218  total_len+=aln_vec[i].length();
219 
220  //calculate new insertion scores
221  ins_probs.resize(total_len);
222  for (j=0; j<total_len; ++j)
223  ins_probs[j]=0;
224  std::vector<int> non_gap_counter(total_len);
225  typedef typename DataType::value_type SetType;
226  typedef typename SetType::value_type SeqType;
227 
228  size_t n_seqs;
229  char c;
230  size_t aln_length;
231  size_t overall_pos=0;
232  size_t pos=0;
233  for (k=0; k<n_pieces; ++k)
234  {
235  const SetType &aln = aln_vec[k];
236  aln_length=aln.length();
237  n_seqs=aln.size();
238  for (i=0; i<n_seqs; ++i)
239  {
240  pos=overall_pos;
241  const SeqType &seq=aln[i];
242  for (j=0; j<aln_length; ++j)
243  {
244  c=seq[j];
245  if (c!='-')
246  {
247  ++non_gap_counter[pos];
248  ins_probs[pos] += _insProb[c];
249  }
250  ++pos;
251  }
252  }
253  overall_pos+=aln_length;
254  }
255 
256  for (j=0; j<total_len; ++j)
257  ins_probs[j] /= non_gap_counter[j];
258 }
259 
260 
261 template<typename DataType>
262 void
263 HMM::calculate_match_probs(const DataType &aln1, const DataType &aln2, Matrix<float> &match_probs)
264 {
265  size_t aln_len1=aln1.length();
266  size_t aln_len2=aln2.length();
267 // Matrix<float> prof1(aln_len1, 26, 0);
268 // Matrix<float> prof2(aln_len2, 26, 0);
269  std::vector<std::unordered_map<short, int> > prof1(aln_len1);
270  size_t i,j;
271  std::vector<std::unordered_map<short, int> > prof2(aln_len2);
272  std::unordered_map<short, int>::iterator it;
273 
274  size_t n_seq1=aln1.size();
275  size_t n_seq2=aln2.size();
276  short c;
277  std::vector<int> observed(26,0);
278  for (i=0; i<n_seq1; ++i)
279  {
280  const typename DataType::value_type &seq = aln1[i];
281  for (j=0; j<aln_len1; ++j)
282  {
283  if (seq[j] != '-')
284  {
285  c=tolower(seq[j]);
286  if ((it =prof1[j].find(c)) != prof1[j].end())
287  ++it->second;
288  else
289  prof1[j][c]=1;
290  }
291  }
292  }
293 
294  for (i=0; i<n_seq2; ++i)
295  {
296  const typename DataType::value_type &seq = aln2[i];
297  for (j=0; j<aln_len2; ++j)
298  if (seq[j] != '-')
299  {
300  c=tolower(seq[j]);
301  if ((it =prof2[j].find(c)) != prof2[j].end())
302  ++it->second;
303  else
304  prof2[j][c]=1;
305  }
306  }
307 
308  match_probs.resize(aln_len1, aln_len2);
309  std::unordered_map<short, int>::iterator it1,it2,it1_end,it2_end;
310  double tmp;
311  for (i=0; i<aln_len1; ++i)
312  {
313  it1_end=prof1[i].end();
314  for (j=0; j<aln_len2; ++j)
315  {
316  it2_end=prof2[i].end();
317  tmp=0;
318  for (it1=prof1[i].begin(); it1!=it1_end; ++it1)
319  {
320  for (it2=prof2[j].begin(); it2!=it2_end; ++it2)
321  {
322  match_probs[i][j] += _matchProb[it1->first][it2->first] * it1->second * it2->second;
323  tmp += it1->second * it2->second;
324  }
325  }
326  match_probs[i][j] /= tmp;
327  }
328  }
329 }
330 
331 
332 template<typename DataType>
333 void
334 HMM::calculate_match_probs_splitted(const DataType &aln_vec1, const DataType &aln_vec2, Matrix<float> &match_probs, size_t start, size_t end)
335 {
336  typedef typename DataType::value_type SetType;
337  typedef typename SetType::value_type SeqType;
338  size_t n_pieces1=end-start+1;//aln_vec1.size();
339  size_t n_pieces2=aln_vec2.size();
340  size_t aln_len;
341 
342  size_t i,j,k,l;
343  size_t n_seqs,complete_len1=0, complete_len2=0;
344  for (k=0; k<n_pieces1; ++k)
345  complete_len1 += aln_vec1[k].length();
346  for (k=0; k<n_pieces2; ++k)
347  complete_len2 += aln_vec2[k].length();
348 
349  std::vector<std::unordered_map<short, int> > prof2(complete_len1);
350  std::vector<std::unordered_map<short, int> > prof1(complete_len2);
351  std::unordered_map<short, int>::iterator it1,it2,it1_end,it2_end;
352  size_t overall_pos=0;
353  size_t pos=0;
354  char c;
355  std::vector<bool> is_domain1(complete_len1, false);
356  std::vector<bool> is_domain2(complete_len2, false);
357  std::vector<int> observed(26,0);
358  for (k=0; k<n_pieces1; ++k)
359  {
360  const SetType &aln=aln_vec1[k];
361  aln_len = aln.length();
362  n_seqs = aln.size();
363  if (k%2 == 1)
364  {
365  for (l=overall_pos; l<overall_pos+aln_len; ++l)
366  is_domain1[l]=true;
367  }
368  for (i=0; i<n_seqs; ++i)
369  {
370  pos=overall_pos;
371  const SeqType &seq = aln[i];
372  for (j=0; j<aln_len; ++j)
373  {
374  if (seq[j] != '-')
375  {
376  c=tolower(seq[j]);
377  if ((it1 =prof1[pos].find(c)) != prof1[pos].end())
378  ++it1->second;
379  else
380  prof1[pos][c]=1;
381  }
382  }
383  }
384  overall_pos+=aln_len;
385  }
386 
387  overall_pos=0;
388  for (k=0; k<n_pieces2; ++k)
389  {
390  const SetType &aln=aln_vec2[k];
391  aln_len = aln.length();
392  n_seqs = aln.size();
393  if (k%2 == 1)
394  {
395  for (l=overall_pos; l<overall_pos+aln_len; ++l)
396  is_domain2[l]=true;
397  }
398  for (i=0; i<n_seqs; ++i)
399  {
400  pos=overall_pos;
401  const SeqType &seq = aln[i];
402  for (j=0; j<aln_len; ++j)
403  {
404  if (seq[j] != '-')
405  {
406  c=tolower(seq[j]);
407  if ((it1 =prof2[pos].find(c)) != prof2[pos].end())
408  ++it1->second;
409  else
410  prof1[pos][c]=1;
411  }
412  }
413  }
414  overall_pos+=aln_len;
415  }
416 
417  match_probs.resize(complete_len1, complete_len2);
418  double tmp;
419  for (i=0; i<complete_len1; ++i)
420  {
421  it1_end=prof1[i].end();
422  for (j=0; j<complete_len2; ++j)
423  {
424  it2_end=prof2[i].end();
425  tmp=0;
426  for (it1=prof1[i].begin(); it1!=it1_end; ++it1)
427  {
428  for (it2=prof2[j].begin(); it2!=it2_end; ++it2)
429  {
430  match_probs[i][j] += _matchProb[it1->first][it2->first] * it1->second * it2->second;
431  tmp += it1->second * it2->second;
432  }
433  }
434  match_probs[i][j] /= tmp;
435  }
436  }
437 }
438 
439 
440 
441 
442 
443 
444 
445 
446 }
447 
448 
449 #endif /* HMM_H_ */