MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
Library.hpp
Go to the documentation of this file.
1 /*
2  * Library.hpp
3  *
4  * Created on: Apr 6, 2012
5  * Author: Carsten Kemena
6  *
7  * This file is part of MDAT++.
8  *
9  * MDAT++ is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * MDAT++ is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with MDAT++. If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23 
24 
29 #ifndef LIBRARY_H_
30 #define LIBRARY_H_
31 
32 // C header
33 #include <cstdlib>
34 
35 // C++ header
36 #include <algorithm>
37 #include <limits>
38 #include <map>
39 #include <memory>
40 #include <utility>
41 #include <vector>
42 
43 
44 #include <iostream>
45 // MDAT header
46 #include "../utils/Matrix.hpp"
47 #include "../utils/ThreadPool.hpp"
48 
49 namespace MDAT {
50 
51 
/**
 * \brief A pair of positions, used as key when match points are handed out.
 */
typedef std::pair<unsigned int, unsigned int> Match;


/**
 * \brief A single scored match between two positions.
 *
 * Match points are ordered by \c x first and \c y second; the score does not
 * take part in the ordering.
 */
struct Match_point
{
	/**
	 * \brief Creates a match point at position (0,0) with score 0.
	 */
	Match_point():x(0),y(0),score(0.0)
	{}

	/**
	 * \brief Creates a match point.
	 * \param x_ The first position.
	 * \param y_ The second position.
	 * \param score_ The score of the match.
	 */
	Match_point(unsigned int x_, unsigned int y_, float score_):x(x_),y(y_),score(score_)
	{}

	/**
	 * \brief Lexicographic comparison on (x, y).
	 * \param a The left match point.
	 * \param b The right match point.
	 * \return true if \a a has the smaller x, or the same x and a smaller y.
	 */
	friend bool operator <(const Match_point &a, const Match_point &b)
	{
		if (a.x != b.x)
			return (a.x < b.x);
		else
			return (a.y < b.y);
	}

	unsigned int x;  ///< The first position.
	unsigned int y;  ///< The second position.
	float score;     ///< The score of the match.
};
98 
99 
111 template <typename DataType>
112 class Library
113 {
114 
115 private:
116 
117  const DataType &_data;
118  size_t _n_elems;
119  size_t _max_length;
120  size_t _n_pairs;
121 
122  std::auto_ptr<std::vector<std::vector<Match_point> > > _pairs;
123 
124  struct RelaxHelper
125  {
126  Matrix<float> final_mat;
127  Matrix<float> tmp_mat;
128  std::vector<std::pair<int,int> > used;
129  std::vector<int> n_hits;
130 
131  RelaxHelper(size_t max_length): final_mat(Matrix<float>(max_length, max_length)), tmp_mat(Matrix<float>(max_length, max_length*2)), used(max_length*max_length), n_hits(std::vector<int>(max_length*2, 0))
132  {
133  }
134 
135  };
136 
137  template <typename RelaxFunc>
138  void
139  _relax_pair(size_t i, std::vector<std::vector<Match_point> > *new_vals, RelaxFunc relax_function, RelaxHelper &helper);
140 
141 public:
142  // Constructors & Destructors
143 
145 
146 
151  Library(const DataType &data);
152 
153 
157  virtual ~Library();
162 
163 
164 
175  void add(unsigned int id1, unsigned int id2, unsigned int pos1, unsigned int pos2, float score=1.0);
176 
183  void get(size_t id1, size_t id2, std::map<Match, int> &match_points) const;
184 
199  template<typename GapFunc>
200  void get(const std::vector<size_t> &ids1, const std::vector<size_t> &ids2, std::map<Match, int> &match_points, const GapFunc gap_func) const;
201 
202 
209  void reserve_add_memory(unsigned int id1, unsigned int id2, size_t n_entries);
214 
215 
224  template <typename RelaxFunc>
225  void
226  relax(RelaxFunc relax_function);
227 
228  template<typename RelaxFunc>
229  void
230  relax(RelaxFunc relax_function, int n_threads);
231 
232 /*
233  void
234  sort();*/
235 
236  void
237  print()
238  {
239  int i,j,k;
240  int pos;
241  for (i=0; i<_n_elems; ++i)
242  {
243  for (j=i+1; j<_n_elems; ++j)
244  {
245  printf("%i %i\n", i, j);
246  pos= i*_n_elems-(i*i+i)/2 + j -i-1;
247  std::vector<Match_point> &matches=(*_pairs)[pos];
248  for (k=0; k<matches.size(); ++k)
249  printf("(%i %i %f) ", matches[k].x, matches[k].y, matches[k].score);
250  printf("\n");
251  }
252  }
253  }
255 };
256 
257 
258 
259 
260 template<typename DataType>
261 Library<DataType>::Library(const DataType &set):_data(set), _n_elems(set.size()), _max_length(0),_n_pairs((_n_elems*(_n_elems-1))/2), _pairs(new std::vector<std::vector<Match_point> >(_n_pairs))
262 {
263  for (size_t i = 0; i<_n_elems; ++i)
264  {
265  if (set[i].length() > _max_length)
266  _max_length = set[i].length();
267  }
268 }
269 
270 
271 template<typename DataType>
273  // TODO Auto-generated destructor stub
274 }
275 
276 
277 template<typename DataType>
278 void
279 Library<DataType>::add(unsigned int id1, unsigned int id2, unsigned int pos1, unsigned int pos2, float score)
280 {
281  // the index of a pair i/j (with i<j) is calculated as following: i*n_seq - (i*(i+1)/2) +j-i-1
282  // i*n_seq - (i*(i+1)/2) gives the first index having i at the beginning
283  // j-i-1 adds the shift necessary for the pair i/j
284  score=100;
285  unsigned int id;
286  if (id1 < id2)
287  {
288  id = id1*_n_elems - (id1*(id1+1)/2) +id2-id1-1;
289  (*_pairs)[id].push_back(Match_point(pos1, pos2, score));
290  }
291  else
292  {
293  id = id2*_n_elems - (id2*(id2+1)/2) +id1-id2-1;
294  (*_pairs)[id].push_back(Match_point(pos2, pos1, score));
295  }
296 }
297 
298 template<typename DataType>
299 void
300 Library<DataType>::reserve_add_memory(unsigned int id1, unsigned int id2, size_t n_entries)
301 {
302  unsigned int id;
303  if (id1 < id2)
304  id = id1*_n_elems - (id1*(id1+1)/2) +id2-id1-1;
305  else
306  id = id2*_n_elems - (id2*(id2+1)/2) +id1-id2-1;
307  (*_pairs)[id].reserve((*_pairs)[id].size()+n_entries);
308 }
309 
310 template<typename DataType>
311 void
312 Library<DataType>::get(size_t id1, size_t id2, std::map<Match, int> &match_points) const
313 {
314  unsigned int id;
315  if (id1 < id2)
316  {
317  id = id1*_n_elems - (id1*(id1+1)/2) +id2-id1-1;
318  std::vector<Match_point> &matches=(*_pairs)[id];
319  size_t n_matches=matches.size();
320  for (size_t i=0; i<n_matches; ++i)
321  match_points[Match(matches[i].x, matches[i].y)] = matches[i].score;
322  }
323  else
324  {
325  id = id2*_n_elems - (id2*(id2+1)/2) +id1-id2-1;
326  std::vector<Match_point> &matches=(*_pairs)[id];
327  size_t n_matches=matches.size();
328  for (size_t i=0; i<n_matches; ++i)
329  match_points[Match(matches[i].y, matches[i].x)] = matches[i].score;
330  }
331 
332 
333 }
334 
335 template<typename DataType>
336 template<typename GapFunc>
337 void Library<DataType>::get(const std::vector<size_t> &ids1, const std::vector<size_t> &ids2, std::map<Match, int> &match_points, const GapFunc gap_func) const
338 {
339  match_points.clear();
340  size_t n_ids1=ids1.size();
341  size_t n_ids2=ids2.size();
342  size_t n_elems1=_data[ids1[0]].length();
343  size_t n_elems2=_data[ids2[0]].length();
344  size_t i,j,k,pos;
345  std::map<Match, int> tmp_match_points;
346  std::map<Match, int>::iterator it, it_end;
347  std::vector<unsigned int> convert1, convert2;
348  convert1.resize(n_elems1);
349  convert2.resize(n_elems2);
350  std::pair<std::map<Match, int>::iterator,bool> ret;
351  for (i=0; i<n_ids1; ++i)
352  {
353  pos=0;
354  for (k=0; k<n_elems1; ++k)
355  {
356  if (!gap_func(_data[ids1[i]],k))
357  convert1[pos++] = k;
358  }
359 
360  for (j=0; j<n_ids2; ++j)
361  {
362  pos=0;
363  for (k=0; k<n_elems2; ++k)
364  {
365  if (!gap_func(_data[ids2[j]],k))
366  convert2[pos++] = k;
367  }
368 
369  get(_data[ids1[i]].id(), _data[ids2[j]].id(), tmp_match_points);
370  it_end=tmp_match_points.end();
371  for (it=tmp_match_points.begin(); it!=it_end; ++it)
372  {
373  Match tmp_m(convert1[it->first.first], convert2[it->first.second]);
374  ret=match_points.insert(std::pair<Match, int>(tmp_m, it->second));
375  if (!ret.second)
376  ret.first->second+=it->second;
377 
378  }
379  }
380  }
381  it_end = match_points.end();
382  size_t overall=n_ids1*n_ids2;
383  for (it = match_points.begin(); it != it_end; ++it )
384  it->second/=overall;
385 
386 }
387 
388 
389 
390 template<typename DataType>
391 template<typename RelaxFunc>
392 void
393 Library<DataType>::relax(RelaxFunc relax_function, int n_threads)
394 {
395  if (n_threads <=1)
396  {
397  relax(relax_function);
398  return;
399  }
400  size_t i;
401  std::vector<std::vector<Match_point> > *new_vals = new std::vector<std::vector<Match_point> >();
402  new_vals->resize(_n_pairs);
403  ThreadPool<RelaxHelper> pool(n_threads, RelaxHelper(_max_length));
404  for (i=0; i<_n_elems; ++i)
405  pool.addTask(&Library<DataType>::_relax_pair<RelaxFunc>, this, i, new_vals, relax_function);
406  pool.stop();
407  _pairs.reset(new_vals);
408 }
409 
410 
411 
412 template<typename DataType>
413 template<typename RelaxFunc>
414 void
415 Library<DataType>::relax(RelaxFunc relax_function)
416 {
417  std::vector<std::vector<Match_point> > *new_vals = new std::vector<std::vector<Match_point> >();
418  new_vals->resize(_n_pairs);
419  Matrix<float> final_mat(_max_length, _max_length);
420  Matrix<float> tmp_mat(_max_length, _max_length*2);
421  std::vector<std::pair<int,int> > used;
422  used.resize(_max_length*_max_length);
423  std::vector<int> n_hits(_max_length*2, 0);
424  size_t use=0;
425 
426  std::vector<Match_point>::const_iterator it,it_end;
427 
428  size_t i,j,k,m,id, current_pair_id, pos;
429  int l;
430  for (i=0; i<_n_elems; ++i)
431  {
432  for (j=i+1; j<_n_elems; ++j)
433  {
434  current_pair_id=i*_n_elems - (i*(i+1)/2) +j-i-1;
435  it_end=(*_pairs)[current_pair_id].end();
436  use=0;
437  for (it=(*_pairs)[current_pair_id].begin(); it!=it_end; ++it)
438  {
439  final_mat[it->x][it->y]+=2*it->score;
440  used[use].first=it->x;
441  used[use].second=it->y;
442  ++use;
443  }
444 
445  for (k=0; k<_n_elems; ++k)
446  {
447  if ((k==i) || (k==j))
448  continue;
449 // tmp_mat.fill(0);
450  if (k<i)
451  {
452 
453  id=k*_n_elems - (k*(k+1)/2) +i-k-1;
454  it=(*_pairs)[id].begin();
455  it_end=(*_pairs)[id].end();
456  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
457  {
458  pos=n_hits[it->x]*2;
459  ++n_hits[it->x];
460  tmp_mat[it->x][pos]=it->y;
461  tmp_mat[it->x][++pos]=it->score;
462  }
463  }
464  else
465  {
466  id=i*_n_elems - (i*(i+1)/2) +k-i-1;
467  it=(*_pairs)[id].begin();
468  it_end=(*_pairs)[id].end();
469  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
470  {
471  pos=n_hits[it->y]*2;
472  ++n_hits[it->y];
473  tmp_mat[it->y][pos]=it->x;
474  tmp_mat[it->y][++pos]=it->score;
475  }
476  }
477  if (k<j)
478  {
479  id=k*_n_elems - (k*(k+1)/2) +j-k-1;
480  it=(*_pairs)[id].begin();
481  it_end=(*_pairs)[id].end();
482  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
483  for(l=0; l<n_hits[it->x]; ++l)
484  if (final_mat[tmp_mat[it->x][l*2]][it->y])
485  final_mat[tmp_mat[it->x][l*2]][it->y]+= relax_function(tmp_mat[it->x][l*2+1], it->score);
486  /*else
487  {
488  used[use].first=tmp_mat[it->x][l*2];
489  used[use].second=it->y;
490  ++use;
491  final_mat[tmp_mat[it->x][l*2]][it->y] = relax_function(tmp_mat[it->x][l*2+1], it->score);
492  }*/
493  }
494  else
495  {
496  id=j*_n_elems - (j*(j+1)/2) +k-j-1;
497  it=(*_pairs)[id].begin();
498  it_end=(*_pairs)[id].end();
499  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
500  for(l=0; l<n_hits[it->y]; ++l)
501  if (final_mat[tmp_mat[it->y][l*2]][it->x])
502  final_mat[tmp_mat[it->y][l*2]][it->x]+= relax_function(tmp_mat[it->y][l*2+1], it->score);
503  /*else
504  {
505  used[use].first=tmp_mat[it->y][l*2];
506  used[use].second=it->x;
507  ++use;
508  final_mat[tmp_mat[it->y][l*2]][it->x]= relax_function(tmp_mat[it->y][l*2+1], it->score);
509  }*/
510 
511  }
512  if (k<i)
513  {
514 
515  id=k*_n_elems - (k*(k+1)/2) +i-k-1;
516  it=(*_pairs)[id].begin();
517  it_end=(*_pairs)[id].end();
518  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
519  {
520  pos=n_hits[it->x]*2;
521  ++n_hits[it->x];
522  tmp_mat[it->x][pos]=it->y;
523  tmp_mat[it->x][++pos]=0;
524  }
525  }
526  else
527  {
528  id=i*_n_elems - (i*(i+1)/2) +k-i-1;
529  it=(*_pairs)[id].begin();
530  it_end=(*_pairs)[id].end();
531  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
532  {
533  pos=n_hits[it->y]*2;
534  ++n_hits[it->y];
535  tmp_mat[it->y][pos]=it->x;
536  tmp_mat[it->y][++pos]=0;
537  }
538  }
539  for (m=0; m<2*_max_length; ++m)
540  n_hits[m]=0;
541  }
542 
543  for (k=0; k<use; ++k)
544  {
545  (*new_vals)[current_pair_id].push_back(Match_point(used[k].first, used[k].second, (final_mat[used[k].first][used[k].second])));
546  final_mat[used[k].first][used[k].second] = 0;
547  }
548  use=0;
549  }
550  }
551  _pairs.reset(new_vals);
552 }
553 
554 
555 template<typename DataType>
556 template<typename RelaxFunc>
557 void
558 Library<DataType>::_relax_pair(size_t i, std::vector<std::vector<Match_point> > *new_vals, RelaxFunc relax_function, RelaxHelper &helper)
559 {
560  Matrix<float> &final_mat = helper.final_mat;
561  Matrix<float> &tmp_mat = helper.tmp_mat;
562  std::vector<std::pair<int,int> > used = helper.used;
563  std::vector<int> &n_hits = helper.n_hits;
564  size_t use=0;
565  std::vector<Match_point>::const_iterator it,it_end;
566  size_t k,m,id, current_pair_id, pos;
567  int l;
568  for (size_t j=i+1; j<_n_elems; ++j)
569  {
570  current_pair_id=i*_n_elems - (i*(i+1)/2) +j-i-1;
571  it_end=(*_pairs)[current_pair_id].end();
572  use=0;
573  for (it=(*_pairs)[current_pair_id].begin(); it!=it_end; ++it)
574  {
575  final_mat[it->x][it->y]+=2*it->score;
576  used[use].first=it->x;
577  used[use].second=it->y;
578  ++use;
579  }
580 
581  for (k=0; k<_n_elems; ++k)
582  {
583  if ((k==i) || (k==j))
584  continue;
585  // tmp_mat.fill(0);
586  if (k<i)
587  {
588 
589  id=k*_n_elems - (k*(k+1)/2) +i-k-1;
590  it=(*_pairs)[id].begin();
591  it_end=(*_pairs)[id].end();
592  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
593  {
594  pos=n_hits[it->x]*2;
595  ++n_hits[it->x];
596  tmp_mat[it->x][pos]=it->y;
597  tmp_mat[it->x][++pos]=it->score;
598  }
599  }
600  else
601  {
602  id=i*_n_elems - (i*(i+1)/2) +k-i-1;
603  it=(*_pairs)[id].begin();
604  it_end=(*_pairs)[id].end();
605  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
606  {
607  pos=n_hits[it->y]*2;
608  ++n_hits[it->y];
609  tmp_mat[it->y][pos]=it->x;
610  tmp_mat[it->y][++pos]=it->score;
611  }
612  }
613  if (k<j)
614  {
615  id=k*_n_elems - (k*(k+1)/2) +j-k-1;
616  it=(*_pairs)[id].begin();
617  it_end=(*_pairs)[id].end();
618  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
619  for(l=0; l<n_hits[it->x]; ++l)
620  if (final_mat[tmp_mat[it->x][l*2]][it->y])
621  final_mat[tmp_mat[it->x][l*2]][it->y]+= relax_function(tmp_mat[it->x][l*2+1], it->score);
622  /*else
623  {
624  used[use].first=tmp_mat[it->x][l*2];
625  used[use].second=it->y;
626  ++use;
627  final_mat[tmp_mat[it->x][l*2]][it->y] = relax_function(tmp_mat[it->x][l*2+1], it->score);
628  }*/
629  }
630  else
631  {
632  id=j*_n_elems - (j*(j+1)/2) +k-j-1;
633  it=(*_pairs)[id].begin();
634  it_end=(*_pairs)[id].end();
635  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
636  for(l=0; l<n_hits[it->y]; ++l)
637  if (final_mat[tmp_mat[it->y][l*2]][it->x])
638  final_mat[tmp_mat[it->y][l*2]][it->x]+= relax_function(tmp_mat[it->y][l*2+1], it->score);
639  /*else
640  {
641  used[use].first=tmp_mat[it->y][l*2];
642  used[use].second=it->x;
643  ++use;
644  final_mat[tmp_mat[it->y][l*2]][it->x]= relax_function(tmp_mat[it->y][l*2+1], it->score);
645  }*/
646 
647  }
648  if (k<i)
649  {
650 
651  id=k*_n_elems - (k*(k+1)/2) +i-k-1;
652  it=(*_pairs)[id].begin();
653  it_end=(*_pairs)[id].end();
654  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
655  {
656  pos=n_hits[it->x]*2;
657  ++n_hits[it->x];
658  tmp_mat[it->x][pos]=it->y;
659  tmp_mat[it->x][++pos]=0;
660  }
661  }
662  else
663  {
664  id=i*_n_elems - (i*(i+1)/2) +k-i-1;
665  it=(*_pairs)[id].begin();
666  it_end=(*_pairs)[id].end();
667  for (it=(*_pairs)[id].begin(); it!=it_end; ++it)
668  {
669  pos=n_hits[it->y]*2;
670  ++n_hits[it->y];
671  tmp_mat[it->y][pos]=it->x;
672  tmp_mat[it->y][++pos]=0;
673  }
674  }
675  for (m=0; m<2*_max_length; ++m)
676  n_hits[m]=0;
677  }
678 
679  for (k=0; k<use; ++k)
680  {
681  (*new_vals)[current_pair_id].push_back(Match_point(used[k].first, used[k].second, (final_mat[used[k].first][used[k].second])));
682  final_mat[used[k].first][used[k].second] = 0;
683  }
684  }
685 }
686 
687 
688 
689 /*
690 void
691 Library::print(const MDAT::SequenceSet &set, std::string out_f)
692 {
693  FILE *out_F = my_fopen(out_f, "w");
694  vector<Match_point>::const_iterator it, it_end;
695  size_t i, j;
696 
697  // add information from direct pairwise alignment
698  // the index of a pair i/j (with i<j) is calculated as following: i*n_seq - (i*(i+1)/2) +j-i-1
699  // i*n_seq - (i*(i+1)/2) gives the first index having i at the beginning
700  // j-i-1 adds the shift necessary for the pair i/j
701  unsigned int id;
702  for (i=0; i<_n_elems; ++i)
703  {
704  for (j=i+1; j<_n_elems; ++j)
705  {
706  fprintf(out_F, "#%li %li\n", i, j);
707  id = i*_n_elems - (i*(i+1)/2) +j-i-1;
708  it = (*_pairs)[id].begin();
709  it_end = (*_pairs)[id].end();
710 
711  while (it != it_end)
712  {
713  fprintf(out_F, "%u %u %f\n", it->x, it->y, it->score);
714  ++it;
715  }
716 
717  }
718  }
719  fclose(out_F);
720 }
721 */
722 
723 
724 
725 
726 
727 
728 } /* namespace MDAT */
729 #endif /* LIBRARY_H_ */