MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
SequenceSet_basic.hpp
1 /*
2  * SequenceSetBase.hpp
3  *
4  * Created on: Jun 6, 2013
5  * Author: Carsten Kemena
6  *
7  * This file is part of MDAT.
8  *
9  * MDAT is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * MDAT is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with MDAT. If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23 
24 
25 #ifndef SequenceSetBase_HPP_
26 #define SequenceSetBase_HPP_
27 
28 // C header
29 #include <cstdlib>
30 #include <cstring>
31 #include <cstdio>
32 #include <iostream>
33 // C++ header
34 #include <algorithm>
35 #include <fstream>
36 #include <map>
37 #include <memory>
38 #include <string>
39 #include <vector>
40 #include <utility>
41 
42 
43 // MDAT header
44 #include "../Basics/basics.hpp"
45 #include "../Basics/utils.hpp"
46 #include "../utils/MDAT_Exceptions.hpp"
47 #include "../utils/filesystem.h"
48 
49 
50 // Boost header
51 
52 namespace MDAT
53 {
54 
55 
56 
57 
68 template<typename SequenceType, typename MemoryType>
70 {
71 
72 
73 typedef std::shared_ptr<SequenceType> Seq_ptr;
74 
75 private:
76  std::vector<Seq_ptr> _seqs;
77  char _seq_type; // Sequence type
78  std::string _file; // File the alignment was read from
79  size_t _id;
80  mutable std::map<std::string, size_t> _name2index;
81 
82  // read multiple sequence alignment file
83  short _identify_aln_format(FILE *aln_F);
84  void _read_fasta_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
85  void _read_clustalw_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
86  void _read_msf_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
87  void _read_stockholm_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
88  void _read_codata_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
89  void _read_amps_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
90  void _read_phylip_f(FILE *aln_F, const std::map<std::string, short> &seq_names);
91 
92 
93  // write alignment to file
94  void _write_fasta(FILE *aln_f, unsigned int line_break=99999999) const;
95  void _gap_replace();
96  bool _seq_check();
97 
98 
99 public:
103  typedef SequenceType value_type;
104 
105  /***********************************************************************
106  * Constructors & Destructors *
107  ***********************************************************************/
108 
110 
111 
112 
116  SequenceSetBase();
117  SequenceSetBase(size_t value);
118  //SequenceSetBase(SequenceSetBase &seqSet);
119  //SequenceSetBase(SequenceSetBase &&seqSet);
123  virtual ~SequenceSetBase();
126  /***********************************************************************
127  * Simple Access functions *
128  ***********************************************************************/
129 
130  // Operators
132 
133 
139  SequenceType &operator[](unsigned int index)
140  {
141  return *_seqs[index];
142  }
143 
147  const SequenceType &operator[](unsigned int index) const
148  {
149  return *_seqs[index];
150  }
151 
152 
158  SequenceType &operator[](const std::string &seq_name)
159  {
160  if (_name2index.size() != _seqs.size())
161  {
162  size_t n_seqs = _seqs.size();
163  for (size_t i=0; i<n_seqs; ++i)
164  _name2index[_seqs[i]->name()]=i;
165  }
166  return *_seqs[_name2index[seq_name]];
167  }
168 
172  const SequenceType &operator[](const std::string &seq_name) const
173  {
174  if (_name2index.size() != _seqs.size())
175  {
176  size_t n_seqs = _seqs.size();
177  for (size_t i=0; i<n_seqs; ++i)
178  _name2index[_seqs[i]->name()]=i;
179  }
180  return *_seqs[_name2index[seq_name]];
181  }
182 
183 
190  template<typename SeqType, typename MemType>
191  friend std::ostream& operator<< (std::ostream &out, const SequenceSetBase<SeqType, MemType> &seqSet);
192 
193 
198 
199 
205  const SequenceType* seq(unsigned int index) const
206  {
207  return &(*(_seqs[index]));
208  }
209 
210 
215  size_t n_seqs() const
216  {
217  return _seqs.size();
218  }
219 
224  size_t size() const
225  {
226  return _seqs.size();
227  }
228 
229 
235  size_t
236  length() const
237  {
238  if (_seqs.empty())
239  return 0;
240  else
241  return _seqs[0]->size();
242  }
243 
248  double avg_size() const;
249 
253  bool empty() const
254  {
255  return (_seqs.empty());
256  }
257 
261  std::string file() const
262  {
263  return _file;
264  }
265 
270  char seq_type() const throw()
271  {
272  return _seq_type;
273  }
274 
279  void seq_type(char seq_type_) throw()
280  {
281  _seq_type = seq_type_;
282  }
283 
284 
289  int id() const throw()
290  {
291  return _id;
292  }
293 
298  void id(int val)
299  {
300  _id=val;
301  }
302 
306  void clear()
307  {
308  _seqs.clear();
309  _file.clear();
310  }
314  //****** read/write SequenceSetBases ******
316 
317 
327  virtual void read(const std::string &seq_f, const std::vector<std::string> &seq_names, bool check=false, short format =-1);
328 
337  virtual void read(const std::string &seq_f, bool check=false, short format =-1)
338  {
339  read(seq_f, std::vector<std::string>(), check, format);
340  }
341 
342 
350  virtual void write(const std::string &seq_f, const std::string format) const;
351 
352 
357  void add_seq(SequenceType *seq)
358  {
359  _seqs.push_back(std::shared_ptr<SequenceType> (seq));
360  }
361 
362 
366  //****** Manipulation methods ******
368 
369 
370 
374  void to_upper()
375  {
376  for (size_t i = 0; i<_seqs.size(); ++i)
377  _seqs[i]->to_upper();
378  }
379 
383  void to_lower()
384  {
385  for (size_t i = 0; i<_seqs.size(); ++i)
386  _seqs[i]->to_lower();
387  }
388 
389 
390 
395  virtual void delete_seqs(const std::map<std::string,bool> &names);
396 
401  virtual void delete_seqs(std::vector<size_t> &indices);
402 
407  virtual void keep_seqs(std::vector<size_t> &indices);
408 
409 
415  void
417  {
418  _seqs.push_back(Seq_ptr());
419  _seqs[_seqs.size()-1] = set._seqs[id];
420  }
421 
422 
428  void
430  {
431  _seqs.push_back(Seq_ptr(set._seqs[id]));
432  set._seqs.erase(set._seqs.begin()+id);
433  }
434 
439  void
441  {
442  size_t n_seqs=set.n_seqs();
443  _seqs.reserve(this->n_seqs()+n_seqs);
444  for (size_t i=0; i<n_seqs; ++i)
445  _seqs.push_back(Seq_ptr(set._seqs[i]));
446  set.clear();
447  }
448 
449 
454  void sort(std::string type);
455 
456 
461  void
462  insert_gaps(const std::string &edit_string);
463 
470 };
471 
477 /***********************************************************************
478  * Constructors & Destructors *
479  ***********************************************************************/
480 
481 template<typename SequenceType, typename MemoryType>
482 SequenceSetBase<SequenceType, MemoryType>::SequenceSetBase():_seqs(), _seq_type('x'), _file(""), _id(-1), _name2index()
483 {}
484 
485 template<typename SequenceType, typename MemoryType>
486 SequenceSetBase<SequenceType, MemoryType>::SequenceSetBase(size_t value):_seqs(), _seq_type('x'), _file(""), _id(value), _name2index()
487 {}
488 
489 /*template<typename SequenceType, typename MemoryType>
490 SequenceSetBase<SequenceType, MemoryType>::SequenceSetBase(SequenceSetBase<SequenceType, MemoryType> &&seqSet)
491 {
492  *this=std::move(seqSet);
493 }*/
494 
495 template<typename SequenceType, typename MemoryType>
497 {
498 
499 }
500 
501 
502 
503 template<typename SequenceType, typename MemoryType>
504 void
506 {
507  size_t num_seqs = this->n_seqs();
508  size_t n_cols;
509  size_t j;
510 
511  for (size_t i=0; i<num_seqs; ++i)
512  {
513  SequenceType &tmp_seq = *_seqs[i];
514  n_cols = tmp_seq.size();
515  for (j=0; j<n_cols; ++j)
516  {
517  if (!isalpha(tmp_seq[j]))
518  tmp_seq[j]='-';
519  }
520  }
521 }
522 
523 
524 template<typename SequenceType, typename MemoryType>
525 short
526 SequenceSetBase<SequenceType, MemoryType>::_identify_aln_format(FILE *aln_F)
527 {
528  const unsigned int LINE_LENGTH = 501;
529  char line[LINE_LENGTH];
530  char *tmp, *pos;
531  while (fgets(line, LINE_LENGTH, aln_F) != nullptr)
532  {
533  if (line[0] == '>')
534  {
535  if (fgets(line, LINE_LENGTH, aln_F)==nullptr)
536  {
537  fprintf(stderr, "Error occurred when trying to identify format!\n");
538  exit(EXIT_FAILURE);
539  }
540  if (line[0] == '>')
541  return 6; // AMPS
542  else
543  return 1; // Fasta
544  }
545  if (strstr(line, "CLUSTAL ") != nullptr)
546  return 2; // Clustal
547  if (strstr(line, "MSF:") != nullptr)
548  return 3; // MFS
549  if (strstr(line, "# STOCKHOLM 1.0") != nullptr)
550  return 4; // Stockholm
551  if (strstr(line, "ENTRY") != nullptr )
552  return 5; // Codata
553 
554  //check Phylip
555  StrTok strTok(line);
556  tmp = strTok.next(" \n\t");
557  if (tmp == nullptr)
558  continue;
559  pos=tmp;
560  while (*pos != '\0')
561  if (!isdigit(*(pos++)))
562  continue;
563  tmp = strTok.next(" \n\t");
564  if (tmp == nullptr)
565  continue;
566  pos=tmp;
567  while (*pos != '\0')
568  if (!isdigit(*(pos++)))
569  continue;
570  tmp = strTok.next(" \n\t");
571  if (tmp == nullptr)
572  return 7; // Phylip
573  }
574  return 0;
575 }
576 
577 
578 template<typename SequenceType, typename MemoryType>
579 void
580 SequenceSetBase<SequenceType, MemoryType>::read(const std::string &seq_f, const std::vector<std::string> &seq_names, bool check, short format)
581 {
582  FILE *aln_F = my_fopen(seq_f.c_str(), "r");
583 
584  _file = seq_f;
585  if (format <0)
586  format = _identify_aln_format(aln_F);
587  std::map<std::string, short> extract_only;
588  size_t n_extract_seqs = seq_names.size();
589  for (unsigned int i = 0; i < n_extract_seqs; ++i)
590  extract_only.insert(std::pair<std::string,short>(seq_names[i],1));
591 
592  fseek ( aln_F , 0 , SEEK_SET );
593  switch (format)
594  {
595  case 1:
596  _read_fasta_f(aln_F, extract_only);
597  break;
598  case 2:
599  _read_clustalw_f(aln_F, extract_only);
600  break;
601  case 3:
602  _read_msf_f(aln_F, extract_only);
603  break;
604  case 4:
605  _read_stockholm_f(aln_F, extract_only);
606  break;
607  case 5:
608  _read_codata_f(aln_F, extract_only);
609  break;
610  case 6:
611  _read_amps_f(aln_F, extract_only);
612  break;
613  case 7:
614  _read_phylip_f(aln_F, extract_only);
615  break;
616  default:
617  throw Alignment_Exception("Format could not be identified");
618  }
619  fclose(aln_F);
620 
621  if ((check) && (!check_set(*this)))
622  {
623  throw Alignment_Exception("Sequence contains bad character");
624  }
625  this->seq_type(identify_seq_type(*(this->_seqs)[0]));
626 }
627 
628 
629 template<typename SequenceType, typename MemoryType>
630 void
631 SequenceSetBase<SequenceType, MemoryType>::_read_fasta_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
632 {
633  const unsigned int LINE_LENGTH = 500;
634  char line[LINE_LENGTH];
635  SequenceType *tmp_seq = nullptr;
636  char *comment = nullptr, *name = nullptr;
637  bool read_sequence = true;
638  size_t id=0;
639  size_t seq_length=0;
640  while (fgets(line, LINE_LENGTH, aln_F))
641  {
642  if (line[0] == '>')
643  {
644  if (this->n_seqs())
645  seq_length = tmp_seq->size();
646  StrTok tokenizer(&line[1]);
647  name = tokenizer.next(" \n");
648  comment = tokenizer.next("\n");
649  if (seq_names.empty() || (seq_names.count(name)>0))
650  {
651  read_sequence = true;
652  if (comment == nullptr)
653  tmp_seq = new SequenceType(name, "", seq_length, id++);
654  else
655  tmp_seq = new SequenceType(name, comment, seq_length, id++);
656  _seqs.push_back(Seq_ptr(tmp_seq));
657  }
658  else
659  read_sequence = false;
660  }
661  else if (line[0] == '/')
662  break;
663  else
664  {
665  if (read_sequence)
666  {
667  tmp_seq->append(line);
668  if ((*tmp_seq)[tmp_seq->size()-1] == '\n')
669  tmp_seq->resize(tmp_seq->size()-1);
670  }
671  }
672  }
673 }
674 
675 
676 template<typename SequenceType, typename MemoryType>
677 void
678 SequenceSetBase<SequenceType, MemoryType>::_read_clustalw_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
679 {
680  const unsigned int LINE_LENGTH = 200;
681  char line[LINE_LENGTH];
682  if (fgets(line, LINE_LENGTH, aln_F)==nullptr)
683  {
684  fprintf(stderr, "Error when reading file.");
685  exit(EXIT_FAILURE);
686  }
687  long t_pos = 0;
688  size_t id = 0;
689  while ((fgets(line, LINE_LENGTH, aln_F) != nullptr) && (line[0] == '\n'))
690  t_pos = ftell(aln_F);
691  fseek(aln_F, t_pos, SEEK_SET);
692 
693  //read first block in format
694  SequenceType *seq_p = nullptr;
695  char *name = nullptr, *tmp_seq = nullptr;
696  std::vector<short> read_sequence;
697  StrTok tokenizer;
698  while (fgets(line, LINE_LENGTH, aln_F) != nullptr)
699  {
700  if ((line[0] == ' ') || (line[0] == '\n'))
701  break;
702  else
703  {
704  tokenizer.set(line);
705  name = tokenizer.next(" \n");
706  if (seq_names.empty() || seq_names.count(name) > 0)
707  {
708  read_sequence.push_back(1);
709  tmp_seq = tokenizer.next(" \n");
710  seq_p = new SequenceType(name, "", tmp_seq, id++);
711  _seqs.push_back(Seq_ptr(seq_p));
712  }
713  else
714  read_sequence.push_back(0);
715  }
716  }
717 
718  // Read rest of the alignment
719  unsigned int counter = 0;
720  unsigned int seq_id = 0;
721  while (fgets(line, LINE_LENGTH, aln_F) != nullptr)
722  {
723  if ((line[0] == ' ') || (line[0] == '\n'))
724  {
725  seq_id = 0;
726  counter = 0;
727  }
728  else
729  {
730  if (read_sequence[counter])
731  {
732  tokenizer.set(line);
733  tokenizer.next(" \n");
734  tmp_seq = tokenizer.next(" \n");
735  _seqs[seq_id++]->append(tmp_seq);
736  }
737  ++counter;
738  }
739  }
740 }
741 
742 
743 template<typename SequenceType, typename MemoryType>
744 void
745 SequenceSetBase<SequenceType, MemoryType>::_read_msf_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
746 {
747  char *msf;
748  const unsigned int LINE_LENGTH = 501;
749  char line[LINE_LENGTH];
750  while ((fgets(line, LINE_LENGTH, aln_F) != nullptr) && ((msf=strstr(line, "MSF:")) == nullptr));
751  msf +=4;
752  size_t seq_length=atoi(msf);
753  char *seq_name;
754  SequenceType *tmp_seq;
755  StrTok tokenizer;
756  std::vector<short> read_sequence;
757  std::size_t id = 0;
758 
759  //read comment block
760  while (fgets(line, LINE_LENGTH, aln_F) != nullptr)
761  {
762  if ((line[0] == '/') && (line[1] == '/'))
763  break;
764  if (((seq_name=strstr(line, "Name:")) != nullptr) || ((seq_name=strstr(line, "NAME:")) != nullptr))
765  {
766  tokenizer.set(seq_name+5);
767  seq_name = tokenizer.next(" \n");
768  if (seq_names.empty() || (seq_names.count(seq_name)>0))
769  {
770  read_sequence.push_back(1);
771  tmp_seq = new SequenceType(seq_name, "", seq_length, id++);
772  _seqs.push_back(Seq_ptr(tmp_seq));
773 
774  }
775  else
776  read_sequence.push_back(0);
777  }
778  }
779 
780  //read alignment
781  char *pos;
782  char c;
783  unsigned int counter = 0;
784  unsigned int seq_id = 0;
785  while (fgets(line, LINE_LENGTH, aln_F) != nullptr)
786  {
787  if (line[0] == '\n')
788  {
789  counter = 0;
790  seq_id = 0;
791  }
792  else
793  {
794  if (read_sequence[counter]==1)
795  {
796  pos = line;
797  while (*(++pos) != ' ');
798  while (1)
799  {
800  while ((c=*pos) != '\0')
801  {
802  if ((c=='-') || (c=='.') || (isalpha(c)))
803  _seqs[seq_id]->append(c);
804  ++pos;
805  }
806  if (*(pos-1)=='\n')
807  break;
808  else
809  {
810  if(fgets(line, LINE_LENGTH, aln_F)==nullptr)
811  {
812  fprintf(stderr, "Erorr occured\n");
813  exit(EXIT_FAILURE);
814  }
815  pos = &line[0];
816  }
817  }
818  ++seq_id;
819  }
820  else
821  {
822  while (1)
823  {
824  pos = line;
825  while (*pos != '\0')
826  ++pos;
827  if (*(pos-1)=='\n')
828  break;
829  if (fgets(line, LINE_LENGTH, aln_F)==nullptr)
830  {
831  fprintf(stderr, "Erorr occured\n");
832  exit(EXIT_FAILURE);
833  }
834  }
835  }
836  ++counter;
837  }
838  }
839  _gap_replace();
840 }
841 
842 
843 template<typename SequenceType, typename MemoryType>
844 void
845 SequenceSetBase<SequenceType, MemoryType>::_read_stockholm_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
846 {
847  const unsigned int LINE_LENGTH = 501;
848  char line[LINE_LENGTH];
849  char *part1_p = nullptr, *part2_p =nullptr;
850  StrTok tokenizer;
851  SequenceType *seq_p = nullptr;
852  char *end=&line[LINE_LENGTH-1];
853  bool use = false;
854  bool append = false;
855  size_t seq_id = 0;
856  std::vector<short> read_seq;
857  *end=8;
858  size_t id = 0;
859  while (fgets(line, LINE_LENGTH, aln_F))
860  {
861  if (line[0] == '\n')
862  {
863  append = true;
864  seq_id = 0;
865  }
866  else if (line[0] == '#')
867  {
868  while ((*end != 8) && (*end != '\n'))
869  {
870  *end = 8;
871  if (!fgets(line, LINE_LENGTH, aln_F))
872  {
873  fprintf(stderr, "Error occurred\n");
874  exit(EXIT_FAILURE);
875  }
876  }
877  }
878  else if (line[0] != '/')
879  {
880  tokenizer.set(line);
881  part1_p = tokenizer.next(" \n");
882  part2_p = tokenizer.next(" \n");
883  if (part2_p != nullptr)
884  {
885  if (seq_names.empty() || (seq_names.count(part1_p)))
886  {
887  use=true;
888  if (append)
889  {
890  seq_p = &(*_seqs[seq_id]);
891  seq_p->append(part2_p);
892  } else {
893  seq_p = new SequenceType(part1_p, part2_p, " ", id++);
894  _seqs.push_back(Seq_ptr(seq_p));
895  }
896  ++seq_id;
897  }
898  else
899  use=false;
900  }
901  else
902  {
903  if (use)
904  seq_p->append(part1_p);
905  }
906  } else
907  use=false;
908  *end=8;
909  }
910 
911  //Turn "." into "-"
912  _gap_replace();
913 }
914 
915 
916 template<typename SequenceType, typename MemoryType>
917 void
918 SequenceSetBase<SequenceType, MemoryType>::_read_codata_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
919 {
920  const unsigned int LINE_LENGTH = 501;
921  char line[LINE_LENGTH];
922  char *name=nullptr;
923  size_t seq_length = 0;
924  StrTok tokenizer;
925  SequenceType *seq_p = nullptr;
926  size_t id = 0;
927  char c;
928  unsigned int pos1 = 0, pos2 = 1;
929  bool extract_seq = false;
930  while (fgets(line, LINE_LENGTH, aln_F))
931  {
932  if (!strncmp(line, "ENTRY", 5))
933  {
934  tokenizer.set(&line[6]);
935 
936  name = tokenizer.next(" \n");
937  }
938  else if (!strncmp(line, "SEQUENCE", 8))
939  {
940  if (seq_names.empty() || seq_names.count(name))
941  {
942  seq_p = new SequenceType(name, "", seq_length, id++);
943  _seqs.push_back(Seq_ptr(seq_p));
944  if (!fgets(line, LINE_LENGTH, aln_F))
945  {
946  fprintf(stderr, "Error occurred\n");
947  exit(EXIT_FAILURE);
948  }
949  extract_seq = true;
950  }
951  else
952  extract_seq = false;
953  }
954  else if (strncmp(line, "///", 3) == 0)
955  {
956  if (extract_seq)
957  seq_length = seq_p->size();
958  }
959  else
960  {
961  if (extract_seq)
962  {
963  pos1 = 0;
964  pos2 = 0;
965  while ((c=line[pos1++]) != '\0')
966  {
967  if (isalpha(c) || (c == '-'))
968  line[pos2++] = c;
969  }
970  line[pos2] = '\0';
971  seq_p->append(line);
972  }
973  }
974  }
975 }
976 
977 
978 template<typename SequenceType, typename MemoryType>
979 void
980 SequenceSetBase<SequenceType, MemoryType>::_read_amps_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
981 {
982  const unsigned int LINE_LENGTH = 501;
983  char line[LINE_LENGTH];
984  std::vector<short> extract_seq;
985  StrTok tokenizer;
986  SequenceType *seq_p;
987  size_t id = 0;
988  char *name = nullptr;
989  unsigned int num_seqs = 0;
990  while (fgets(line, LINE_LENGTH, aln_F))
991  {
992  if (line[0] == '*')
993  break;
994  else if (line[0] == '>')
995  {
996  ++num_seqs;
997  tokenizer.set(&line[1]);
998  name = tokenizer.next("\n");
999  if (seq_names.empty() || seq_names.count(name))
1000  {
1001  seq_p = new SequenceType(name, "", 0, id++);
1002  _seqs.push_back(Seq_ptr(seq_p));
1003  extract_seq.push_back(1);
1004  }
1005  else
1006  extract_seq.push_back(0);
1007  }
1008  }
1009  char c;
1010  char *seq_line = new char[num_seqs+2];
1011  unsigned int n_seqs2 = num_seqs+2;
1012  unsigned int i;
1013  unsigned int seq_id = 0;
1014  while (fgets(seq_line, n_seqs2, aln_F))
1015  {
1016  if (seq_line[0] == '*')
1017  break;
1018  else
1019  {
1020  seq_id = 0;
1021  for (i=0; i<num_seqs; ++i)
1022  {
1023  if (extract_seq[i])
1024  {
1025  if ((c = seq_line[i]) == ' ')
1026  _seqs[seq_id]->append('-');
1027  else
1028  _seqs[seq_id]->append(c);
1029  ++seq_id;
1030  }
1031  }
1032  }
1033  }
1034  delete[] seq_line;
1035 }
1036 
1037 
1038 // reads interleaved and sequential Phylip format
1039 template<typename SequenceType, typename MemoryType>
1040 void
1041 SequenceSetBase<SequenceType, MemoryType>::_read_phylip_f(FILE *aln_F, const std::map<std::string, short> &seq_names)
1042 {
1043  const unsigned int LINE_LENGTH = 500;
1044  char line[LINE_LENGTH];
1045  std::vector<int> use_seq;
1046  size_t num_seqs, seq_length;
1047  if (fgets(line, LINE_LENGTH, aln_F))
1048  {
1049  fprintf(stderr, "Error when reading file.");
1050  exit(EXIT_FAILURE);
1051  }
1052  sscanf(line, "%lu %lu", &num_seqs, &seq_length);
1053  char name[11];
1054  name[10] = '\0';
1055  char *pos;
1056  SequenceType *seq_p;
1057  size_t seq_num = num_seqs;
1058  size_t id = 0;
1059  char c;
1060  size_t line_num = 0;
1061  int x =-1;
1062  while (fgets(line, LINE_LENGTH, aln_F))
1063  {
1064  if (line[0] == '\n')
1065  continue;
1066 
1067  //Read sequence name
1068  if (line_num < num_seqs)
1069  {
1070  strncpy(name, line, 10);
1071  pos=name;
1072  while ((*pos != '\0') && (*pos != ' '))
1073  ++pos;
1074  *pos = '\0';
1075  if ((seq_names.empty()) || (seq_names.count(name)))
1076  {
1077  use_seq.push_back(++x);
1078  seq_p = new SequenceType(name, "", seq_length, id++);
1079  _seqs.push_back(Seq_ptr(seq_p));
1080  }
1081  else
1082  {
1083  use_seq.push_back(-1);
1084  }
1085  pos=&line[10];
1086  ++line_num;
1087  }
1088  else
1089  pos=&line[0];
1090 
1091  //read_sequence
1092  ++seq_num;
1093  if (seq_num >= num_seqs)
1094  seq_num=0;
1095  if (use_seq[seq_num] >= 0)
1096  {
1097 
1098  seq_p=&(*_seqs[use_seq[seq_num]]);
1099  while (1)
1100  {
1101  while ((c=*pos) != '\0')
1102  {
1103  if ((isalpha(c)) || (c=='-'))
1104  seq_p->append(c);
1105  ++pos;
1106  }
1107 
1108  if (*(pos-1) != '\n')
1109  {
1110  if (!fgets(line, LINE_LENGTH, aln_F))
1111  break;
1112  pos=&line[0];
1113  }
1114  else
1115  break;
1116  }
1117  }
1118  else
1119  {
1120  while (line[strlen(line)-1] != '\n')
1121  {
1122  if (!fgets(line, LINE_LENGTH, aln_F))
1123  break;
1124  }
1125  }
1126 
1127  }
1128 }
1129 
1130 
1131 
1132 // Write functions
1133 
1134 template<typename SequenceType, typename MemoryType>
1135 void
1136 SequenceSetBase<SequenceType, MemoryType>::write(const std::string &aln_f, const std::string format) const
1137 {
1138  std::string format_lc;
1139  format_lc.reserve(format.size());
1140  for (size_t i= 0; i<format.size(); ++i)
1141  format_lc.push_back(tolower(format[i]));
1142 
1143  FILE *aln_F;
1144  if ((format_lc == "fasta"))
1145  aln_F = my_fopen(aln_f.c_str(), "w");
1146  else
1147  throw(Alignment_Exception("Unknown alignment format!"));
1148 
1149  if (format_lc == "fasta")
1150  _write_fasta(aln_F);
1151 
1152  fclose(aln_F);
1153 }
1154 
1155 
1156 template<typename SequenceType, typename MemoryType>
1157 void
1158 SequenceSetBase<SequenceType, MemoryType>::_write_fasta(FILE *aln_F, unsigned int line_break) const
1159 {
1160  unsigned int num_seqs = this->n_seqs();
1161  size_t current_length, seq_length;
1162  std::string tmp_seq;
1163 // char symbol='\0';
1164 
1165  for (unsigned int i = 0; i < num_seqs; ++i)
1166  {
1167  if (_seqs[i]->comment().empty())
1168  fprintf(aln_F, ">%s\n", _seqs[i]->name().c_str());
1169  else
1170  fprintf(aln_F, ">%s %s\n", _seqs[i]->name().c_str(), _seqs[i]->comment().c_str());
1171  current_length = 0;
1172  tmp_seq = _seqs[i]->sequence();
1173  seq_length=_seqs[i]->size();
1174  while (current_length < seq_length)
1175  {
1176  fprintf(aln_F, "%.*s\n", line_break, &tmp_seq.c_str()[current_length]);
1177  current_length += line_break;
1178  }
1179 
1180  }
1181 }
1182 
1183 
1184 
1185 
1186 
1187 template<typename SequenceType, typename MemoryType>
1188 void
1190 {
1191  std::sort(indices.begin(), indices.end());
1192  size_t n_dels = indices.size();
1193  size_t num_seqs = this->n_seqs();
1194  size_t index_pos = 0;
1195  size_t seq_pos = 0;
1196  for (size_t i = 0; i < num_seqs; ++i)
1197  {
1198  if ((index_pos==n_dels) || ((index_pos<n_dels) && (indices[index_pos] != i)))
1199  _seqs[seq_pos++] = _seqs[i];
1200  else
1201  ++index_pos;
1202  }
1203  _seqs.resize(seq_pos);
1204 }
1205 
1206 template<typename SequenceType, typename MemoryType>
1207 void
1209 {
1210  std::sort(indices.begin(), indices.end());
1211  size_t n_dels = indices.size();
1212  size_t num_seqs = this->n_seqs();
1213  size_t index_pos = 0;
1214  size_t seq_pos = 0;
1215  for (size_t i = 0; i < num_seqs; ++i)
1216  {
1217  if (indices[index_pos] == i)
1218  {
1219  _seqs[seq_pos++] = _seqs[i];
1220  ++index_pos;
1221  }
1222  if (index_pos==n_dels)
1223  break;
1224  }
1225  _seqs.resize(seq_pos);
1226 }
1227 
1228 template<typename SequenceType, typename MemoryType>
1229 void
1230 SequenceSetBase<SequenceType, MemoryType>::delete_seqs(const std::map<std::string,bool> &names)
1231 {
1232  size_t num_seqs = this->n_seqs();
1233  size_t seq_pos = 0;
1234  for (size_t i = 0; i < num_seqs; ++i)
1235  {
1236  if (names.count(_seqs[i]->name()))
1237  _seqs[seq_pos++] = _seqs[i];
1238  }
1239  _seqs.resize(seq_pos);
1240 }
1241 
1242 
1243 
1244 
/// Comparator ordering pointer-like objects by the value of their id().
///
/// std::binary_function was deprecated in C++11 and removed in C++17; the
/// member typedefs it used to provide are declared directly so the functor
/// stays usable with adaptors that rely on them.
template <typename T>
struct Input_Sort
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;

    /// @return true if @p o1 has a smaller id than @p o2
    bool operator()(const T& o1, const T& o2) const
    {
        return o1->id() < o2->id();
    }
};
1253 
1254 
/// Comparator ordering pointer-like objects by comparing the pointees with
/// operator<.
///
/// std::binary_function was deprecated in C++11 and removed in C++17; the
/// member typedefs it used to provide are declared directly so the functor
/// stays usable with adaptors that rely on them.
template <typename T>
struct Seq_Sort
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;

    /// @return true if the object behind @p o1 is less than the one behind @p o2
    bool operator()(const T& o1, const T& o2) const
    {
        return *o1<*o2;
    }
};
1263 
1264 
/// Comparator ordering pointer-like objects by the value of their name().
///
/// std::binary_function was deprecated in C++11 and removed in C++17; the
/// member typedefs it used to provide are declared directly so the functor
/// stays usable with adaptors that rely on them.
template <typename T>
struct Name_Sort
{
    typedef T first_argument_type;
    typedef T second_argument_type;
    typedef bool result_type;

    /// @return true if the name of @p o1 orders before the name of @p o2
    bool operator()(const T& o1, const T& o2) const
    {
        return o1->name() < o2->name();
    }
};
1273 
1274 template<typename SequenceType, typename MemoryType>
1275 void
1277 {
1278  if (type == "id")
1279  {
1280  std::sort(_seqs.begin(), _seqs.end(), Input_Sort<std::shared_ptr<SequenceType> >());
1281  }
1282  else if (type == "seq")
1283  {
1284  std::sort(_seqs.begin(), _seqs.end(), Seq_Sort<std::shared_ptr<SequenceType> >());
1285  }
1286  else if (type == "name")
1287  {
1288  std::sort(_seqs.begin(), _seqs.end(), Name_Sort<std::shared_ptr<SequenceType> >());
1289  }
1290 }
1291 
1292 
1293 template<typename SequenceType, typename MemoryType>
1294 void
1296 {
1297  size_t n_seqs=_seqs.size();
1298  for (size_t i=0; i<n_seqs; ++i)
1299  _seqs[i]->insert_gaps(edit_string);
1300 }
1301 
1302 
1303 template<typename SequenceType, typename MemoryType>
1304 bool
1305 check_set(const SequenceSetBase<SequenceType, MemoryType> &set)
1306 {
1307  size_t n_seqs = set.n_seqs();
1308  size_t j, len;
1309  size_t *val_counting = new size_t[256];
1310  for (j=0; j<256; ++j)
1311  val_counting[j] = 0;
1312  // counting occurrences of characters
1313  for (size_t i=0; i<n_seqs; ++i)
1314  {
1315 
1316  const SequenceType &seq = set[i];
1317 
1318  len=seq.size();
1319  for (j=0; j<len; ++j)
1320  ++val_counting[static_cast<int>(seq[j])];
1321  }
1322 
1323  // see if strange character has been found
1324  for (j=0; j<45; ++j)
1325  {
1326  if (val_counting[j] != 0)
1327  return false;
1328  }
1329  for (j=46; j<65; ++j)
1330  {
1331  if (val_counting[j] != 0)
1332  return false;
1333  }
1334  for (j=91; j<97; ++j)
1335  {
1336  if (val_counting[j] != 0)
1337  return false;
1338  }
1339  for (j=123; j<256; ++j)
1340  {
1341  if (val_counting[j] != 0)
1342  return false;
1343  }
1344  delete[] val_counting;
1345  return true;
1346 }
1347 
1348 template<typename SeqType, typename MemType>
1349 std::ostream& operator<< (std::ostream &out, const SequenceSetBase<SeqType, MemType> &seqSet)
1350 {
1351  size_t n_seqs = seqSet.n_seqs();
1352  for (size_t i=0; i<n_seqs; ++i)
1353  out << seqSet[i] << std::endl;
1354  return out;
1355 }
1356 
1357 
1358 
1359 template <typename SequenceType, typename MemoryType>
1360 using SequenceSet = SequenceSetBase<SequenceType, MemoryType>;
1361 
1362 
1363 
1364 
1365 
1366 }
1367 
1368 #endif /* SequenceSetBase_HPP_ */