MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
Sequence.hpp
Go to the documentation of this file.
1 /*
2  * Sequence.hpp
3  *
4  * Created on: Jun 2, 2013
5  * Author: Carsten Kemena
6  *
7  * This file is part of MDAT.
8  *
9  * MDAT is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * MDAT is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with MDAT. If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23 
29 #ifndef SEQUENCE_HPP_
30 #define SEQUENCE_HPP_
31 
32 // C++ header
33 #include <algorithm>
34 #include <map>
35 #include <memory>
36 #include <string>
37 #include <utility>
38 #include <vector>
39 #include <iostream>
40 
41 // Boost header
42 #include <boost/lexical_cast.hpp>
43 
44 // MDAT header
45 #include "Sequence_Interface.hpp"
46 
47 
48 namespace MDAT
49 {
50 
51 
60 class Sequence
61 {
62 private:
63  std::string _name;
64  std::string _sequence;
65  std::string _comment;
66  size_t _id;
67 
68 public:
69 
70  /******************************************************
71  * Constructors & Destructors *
72  ******************************************************/
73 
75 
76 
/**
 * Constructs a sequence from a name, the sequence string itself, an optional
 * comment (defaults to the empty string) and an optional numeric id (defaults to 0).
 * Declaration only: the definition is not part of this header.
 */
83  Sequence(const std::string &seq_name, const std::string &seq, const std::string &comment_="", size_t seq_id=0);
/**
 * Constructs a sequence from a name, a comment, a length value and an optional id.
 * NOTE(review): dna2prot() in this header calls this constructor and then append()s
 * the full translated string, so seq_length presumably only reserves capacity rather
 * than pre-filling the sequence -- confirm against the implementation file.
 */
93  Sequence(const std::string &seq_name, const std::string &comment_, unsigned int seq_length, size_t seq_id=0);
94 
/** Copy constructor (declaration only; defined outside this header). */
100  Sequence(const Sequence &seq);
// The move constructor is intentionally left commented out in the original source.
101  //Sequence(Sequence &&seq);
102 
/** Virtual destructor, so the class can be destroyed through a base-class pointer. */
106  virtual ~Sequence();
112 
113 
118  char &operator[](unsigned int index)
119  {
120  return _sequence[index];
121  }
122 
126  const char &operator[](unsigned int index) const
127  {
128  return _sequence[index];
129  }
130 
136  friend bool operator ==(const Sequence& a, const Sequence& b)
137  {
138  return(a.sequence() == b.sequence());
139  }
140 
146  friend bool operator !=(const Sequence& a, const Sequence& b)
147  {
148  return(a.sequence() != b.sequence());
149  }
150 
156  friend bool operator <(const Sequence& a, const Sequence& b)
157  {
158  return(a.sequence() < b.sequence());
159  }
160 
166  friend bool operator >(const Sequence& a, const Sequence& b)
167  {
168  return(a.sequence() > b.sequence());
169  }
170 
176  friend bool operator <=(const Sequence& a, const Sequence& b)
177  {
178  return(a.sequence() <= b.sequence());
179  }
180 
186  friend bool operator >=(const Sequence& a, const Sequence& b)
187  {
188  return(a.sequence()==b.sequence());
189  }
190 
191 
192 
193  Sequence & operator= (const Sequence & other)
194  {
195  if (this != &other) // protect against invalid self-assignment
196  {
197  _name=other._name;
198  _comment=other._comment;
199  _sequence=other._sequence;
200  _id=other._id;
201  }
202  // by convention, always return *this
203  return *this;
204  }
205 
// Stream output operator for a Sequence. Declared as a friend here; the definition
// (and therefore the exact output format) is not part of this header.
206  friend std::ostream& operator<< (std::ostream &out, const Sequence &seq);
208  //Functions
209 
211 
212 
217  const
218  std::string & name() const
219  {
220  return _name;
221  }
222 
223  void name(const std::string &na)
224  {
225  _name=na;
226  }
227 
228 
233  const
234  std::string & sequence() const
235  {
236  return _sequence;
237  }
238 
239 
244  void
245  sequence(const std::string &seq)
246  {
247  _sequence=seq;
248  }
249 
254  const
255  std::string & comment() const
256  {
257  return _comment;
258  }
259 
260 
265  void
266  comment(const std::string &com)
267  {
268  _comment=com;
269  }
270 
275  size_t size() const
276  {
277  return _sequence.size();
278  }
279 
284  size_t length() const
285  {
286  return _sequence.size();
287  }
288 
// Declaration only; the definition is not part of this header.
// NOTE(review): judging by the name this presumably returns the number of
// non-gap characters -- confirm against the implementation file.
293  size_t ungapped_size() const;
294 
301  size_t id() const
302  {
303  return _id;
304  }
305 
310  void id(size_t val)
311  {
312  _id=val;
313  }
319 
320 
321 
325  template<class T>
326  void append(const T &seq)
327  {
328  _sequence.append(seq);
329  }
330 
334  void append(const Sequence &seq)
335  {
336  _sequence.append(seq.sequence());
337  }
338 
339 
343  void append(char c)
344  {
345  _sequence.push_back(c);
346  }
347 
352  void
353  resize(unsigned int new_length)
354  {
355  _sequence.resize(new_length);
356  }
357 
361  void
363  {
364  std::string::iterator it, it_end =_sequence.end();
365  for (it = _sequence.begin(); it != it_end; ++it)
366  *it = toupper(*it);
367  }
368 
372  void
374  {
375  std::string::iterator it, it_end =_sequence.end();
376  for (it = _sequence.begin(); it != it_end; ++it)
377  *it = tolower(*it);
378  }
379 
383  void
385  {
386  std::reverse(_sequence.begin(), _sequence.end());
387  }
388 
// Two overloads that insert gaps into the sequence; both are declarations only and
// are defined outside this header.
// NOTE(review): the meaning of the (unsigned int, unsigned int) pairs and the format
// of edit_string cannot be determined from this header -- confirm against the
// implementation. Also note that the vector overload takes its argument by const
// value, which copies the vector on every call.
393  void
394  insert_gaps(const std::vector<std::pair<unsigned int, unsigned int> > vec);
395  void
396  insert_gaps(const std::string &edit_string);
397 
400 };
401 
/**
 * @brief Classifies a sequence as nucleotide or protein.
 *
 * A sequence is reported as nucleotide only if every character is one of
 * a, c, g, t or u (case-insensitive). Any other character, including a gap
 * character, makes the result 'P'. An empty sequence yields 'N'.
 *
 * @tparam SeqType Type providing size() and operator[] returning a character.
 * @param seq The sequence to inspect.
 * @return 'N' for nucleotide, 'P' otherwise.
 */
template<typename SeqType>
char
identify_seq_type(const SeqType &seq)
{
    const size_t seq_len = seq.size();
    // size_t index: an unsigned int counter could wrap before reaching a very large seq_len.
    for (size_t i = 0; i < seq_len; ++i)
    {
        // tolower() requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour.
        switch (tolower(static_cast<unsigned char>(seq[i])))
        {
        case 'a':
        case 'c':
        case 'g':
        case 't':
        case 'u':
            break;
        default:
            return 'P';
        }
    }
    return 'N';
}
423 
424 
/**
 * @brief Counts how many columns of a pairwise alignment are covered by both sequences.
 *
 * Columns in which both sequences have a gap ('-') are ignored.
 *
 * @tparam SeqType Type providing size() and operator[] returning a character.
 * @param seq1 First aligned sequence.
 * @param seq2 Second aligned sequence.
 * @return Pair (columns where neither sequence has a gap,
 *               columns where at least one sequence has no gap).
 *         If the two sequences differ in length, both values are
 *         static_cast<size_t>(-1).
 */
template<typename SeqType>
std::pair<size_t, size_t>
coverage(const SeqType &seq1, const SeqType &seq2)
{
    const size_t n_cols = seq1.size();
    if (seq2.size() != n_cols)
        return std::make_pair(static_cast<size_t>(-1), static_cast<size_t>(-1));

    size_t both = 0;
    size_t either = 0;
    for (size_t col = 0; col < n_cols; ++col)
    {
        const bool gap1 = (seq1[col] == '-');
        const bool gap2 = (seq2[col] == '-');
        if (gap1 && gap2)
            continue;           // gap/gap column: counted nowhere
        ++either;
        if (!gap1 && !gap2)
            ++both;
    }
    return std::make_pair(both, either);
}
443 
/**
 * @brief Counts identical columns of a pairwise alignment.
 *
 * Columns in which both sequences have a gap ('-') are ignored completely:
 * they count neither as aligned columns nor as matches. Previously such
 * columns were counted as matches (because '-' == '-') while being excluded
 * from the column count, so the match count could exceed the column count.
 * The comparison is case-sensitive.
 *
 * @tparam SeqType Type providing size() and operator[] returning a character.
 * @param seq1 First aligned sequence.
 * @param seq2 Second aligned sequence.
 * @return Pair (number of columns with identical non-gap characters,
 *               number of columns where at least one sequence has no gap).
 *         If the two sequences differ in length, both values are
 *         static_cast<size_t>(-1).
 */
template<typename SeqType>
std::pair<size_t, size_t>
id(const SeqType &seq1, const SeqType &seq2)
{
    const size_t len = seq1.size();
    if (len != seq2.size())
        return std::pair<size_t, size_t>(static_cast<size_t>(-1), static_cast<size_t>(-1));

    size_t pair_len = 0, pos = 0;
    for (size_t i = 0; i < len; ++i)
    {
        // Skip gap/gap columns so they cannot be counted as identities.
        if ((seq1[i] == '-') && (seq2[i] == '-'))
            continue;
        ++pair_len;
        if (seq1[i] == seq2[i])
            ++pos;
    }

    return std::pair<size_t, size_t>(pos, pair_len);
}
462 
463 
/**
 * @brief Checks whether two (possibly gapped) sequences contain the same residues.
 *
 * Gap characters ('-') are skipped in both sequences and the remaining
 * characters are compared case-insensitively, position by position.
 *
 * @tparam SeqType Type providing size() and operator[] returning a character.
 * @param seq1 First sequence.
 * @param seq2 Second sequence.
 * @return true if both sequences are identical once gaps and letter case are
 *         ignored, false otherwise.
 */
template<typename SeqType>
bool
seq_check(const SeqType &seq1, const SeqType &seq2)
{
    const size_t seq1_l = seq1.size();
    const size_t seq2_l = seq2.size();
    size_t j = 0;
    for (size_t i = 0; i < seq1_l; ++i)
    {
        if (seq1[i] == '-')
            continue;

        // Advance to the next non-gap character of seq2.
        while ((j < seq2_l) && (seq2[j] == '-'))
            ++j;

        // seq2 has run out of residues although seq1 still has one.
        if (j == seq2_l)
            return false;

        // Cast through unsigned char: tolower() on a negative char is undefined behaviour.
        if (tolower(static_cast<unsigned char>(seq1[i])) != tolower(static_cast<unsigned char>(seq2[j])))
            return false;
        ++j;
    }

    // Whatever is left of seq2 must consist of gaps only.
    for (; j < seq2_l; ++j)
    {
        if (seq2[j] != '-')
            return false;
    }
    return true;
}
490 
491 
/**
 * @brief Checks that a sequence consists only of letters and gap characters.
 *
 * A character is accepted if it is '-' or, after conversion to lower case,
 * lies in the range 'a'..'z'. An empty sequence is accepted.
 *
 * @tparam SeqType Type providing size() and operator[] returning a character.
 * @param seq The sequence to check.
 * @return true if every character is accepted, false at the first one that is not.
 */
template<typename SeqType>
bool
bio_seq(const SeqType &seq)
{
    const size_t len = seq.size();
    for (size_t i = 0; i < len; ++i)
    {
        // Cast through unsigned char: tolower() on a negative char is undefined behaviour.
        const int c = tolower(static_cast<unsigned char>(seq[i]));
        const bool is_gap = (c == '-');
        const bool is_letter = (c >= 'a') && (c <= 'z');
        if (!is_gap && !is_letter)
            return false;
    }

    return true;
}
507 
508 
509 
510 
// Forward declaration of the primary template so that the explicit
// specialization below can be declared before it is first used.
template<typename SequenceTypeIn, typename SequenceTypeOut>
SequenceTypeOut
dna2prot(const SequenceTypeIn &seq);


/**
 * @brief String-to-string specialization that performs the actual translation.
 *
 * Declaration only: the definition is not part of this header. It has to be
 * declared before the generic template body below, because that body uses this
 * specialization in a non-dependent call; declaring an explicit specialization
 * after such a use is ill-formed (no diagnostic required).
 *
 * @param seq The input sequence as a plain string.
 * @return The translated sequence as a plain string.
 */
template<>
std::string
dna2prot<std::string, std::string>(const std::string &seq);


/**
 * @brief Translates a sequence object by delegating to the string specialization.
 *
 * The result object is constructed from the name, comment and id of @p seq and a
 * length argument of seq.size()/3; the translated string is then appended to it.
 *
 * @tparam SequenceTypeIn  Input type providing name(), comment(), size(), id()
 *                         and sequence().
 * @tparam SequenceTypeOut Output type constructible from (name, comment, length, id)
 *                         and providing append(). It cannot be deduced and must be
 *                         given explicitly by the caller.
 * @param seq The sequence to translate.
 * @return A new sequence object holding the translated sequence.
 */
template<typename SequenceTypeIn, typename SequenceTypeOut>
SequenceTypeOut
dna2prot(const SequenceTypeIn &seq)
{
    SequenceTypeOut new_seq(seq.name(), seq.comment(), seq.size()/3, seq.id());
    const std::string &seq_string = seq.sequence();
    new_seq.append(dna2prot<std::string,std::string>(seq_string));
    return new_seq;
}
533 
534 
535 } /* namespace MDAT */
536 #endif /* SEQUENCE_HPP_ */