25 #ifndef SequenceSetBase_HPP_
26 #define SequenceSetBase_HPP_
44 #include "../Basics/basics.hpp"
45 #include "../Basics/utils.hpp"
46 #include "../utils/MDAT_Exceptions.hpp"
47 #include "../utils/filesystem.h"
68 template<
typename SequenceType,
typename MemoryType>
73 typedef std::shared_ptr<SequenceType> Seq_ptr;
76 std::vector<Seq_ptr> _seqs;
80 mutable std::map<std::string, size_t> _name2index;
// Inspects the already opened alignment file and returns a short code for the
// detected format. The mapping from code to format is not visible in this
// declaration.
83 short _identify_aln_format(FILE *aln_F);
// Format-specific readers. Each one receives the opened file and a map of
// sequence names to extract; the definitions in this file treat an empty map
// as "read every sequence".
84 void _read_fasta_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
85 void _read_clustalw_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
86 void _read_msf_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
87 void _read_stockholm_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
88 void _read_codata_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
89 void _read_amps_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
90 void _read_phylip_f(FILE *aln_F,
const std::map<std::string, short> &seq_names);
// Writes all sequences in FASTA format to the given file. line_break is the
// maximum number of sequence characters per output line; the default is large
// enough that sequences are normally written on a single line.
94 void _write_fasta(FILE *aln_f,
unsigned int line_break=99999999)
const;
141 return *_seqs[index];
149 return *_seqs[index];
// Rebuild the name -> index cache whenever its size differs from the number
// of stored sequences. Existing entries are overwritten, not cleared.
160 if (_name2index.size() != _seqs.size())
162 size_t n_seqs = _seqs.size();
163 for (
size_t i=0; i<
n_seqs; ++i)
164 _name2index[_seqs[i]->name()]=i;
// NOTE(review): std::map::operator[] inserts a value-initialised entry (index 0)
// when seq_name is unknown, so a miss returns the first sequence and leaves a
// bogus entry in the cache. Consider find()/at() instead.
166 return *_seqs[_name2index[seq_name]];
172 const SequenceType &
operator[](
const std::string &seq_name)
const
174 if (_name2index.size() != _seqs.size())
176 size_t n_seqs = _seqs.size();
177 for (
size_t i=0; i<
n_seqs; ++i)
178 _name2index[_seqs[i]->name()]=i;
180 return *_seqs[_name2index[seq_name]];
190 template<
typename SeqType,
typename MemType>
191 friend std::ostream& operator<< (std::ostream &out, const SequenceSetBase<SeqType, MemType> &seqSet);
205 const SequenceType*
seq(
unsigned int index)
const
207 return &(*(_seqs[index]));
241 return _seqs[0]->size();
255 return (_seqs.empty());
281 _seq_type = seq_type_;
289 int id()
const throw()
327 virtual void read(
const std::string &seq_f,
const std::vector<std::string> &seq_names,
bool check=
false,
short format =-1);
337 virtual void read(
const std::string &seq_f,
bool check=
false,
short format =-1)
339 read(seq_f, std::vector<std::string>(), check, format);
350 virtual void write(
const std::string &seq_f,
const std::string format)
const;
359 _seqs.push_back(std::shared_ptr<SequenceType> (seq));
376 for (
size_t i = 0; i<_seqs.size(); ++i)
385 for (
size_t i = 0; i<_seqs.size(); ++i)
395 virtual void delete_seqs(
const std::map<std::string,bool> &names);
401 virtual void delete_seqs(std::vector<size_t> &indices);
407 virtual void keep_seqs(std::vector<size_t> &indices);
418 _seqs.push_back(Seq_ptr());
419 _seqs[_seqs.size()-1] =
set._seqs[
id];
431 _seqs.push_back(Seq_ptr(
set._seqs[
id]));
432 set._seqs.erase(
set._seqs.begin()+
id);
442 size_t n_seqs=
set.n_seqs();
443 _seqs.reserve(this->
n_seqs()+n_seqs);
444 for (
size_t i=0; i<
n_seqs; ++i)
445 _seqs.push_back(Seq_ptr(
set._seqs[i]));
454 void sort(std::string type);
481 template<
typename SequenceType,
typename MemoryType>
485 template<
typename SequenceType,
typename MemoryType>
495 template<
typename SequenceType,
typename MemoryType>
503 template<
typename SequenceType,
typename MemoryType>
507 size_t num_seqs = this->n_seqs();
511 for (
size_t i=0; i<num_seqs; ++i)
513 SequenceType &tmp_seq = *_seqs[i];
514 n_cols = tmp_seq.size();
515 for (j=0; j<n_cols; ++j)
517 if (!isalpha(tmp_seq[j]))
524 template<
typename SequenceType,
typename MemoryType>
526 SequenceSetBase<SequenceType, MemoryType>::_identify_aln_format(FILE *aln_F)
528 const unsigned int LINE_LENGTH = 501;
529 char line[LINE_LENGTH];
531 while (fgets(line, LINE_LENGTH, aln_F) !=
nullptr)
535 if (fgets(line, LINE_LENGTH, aln_F)==
nullptr)
537 fprintf(stderr,
"Error occurred when trying to identify format!\n");
545 if (strstr(line,
"CLUSTAL ") !=
nullptr)
547 if (strstr(line,
"MSF:") !=
nullptr)
549 if (strstr(line,
"# STOCKHOLM 1.0") !=
nullptr)
551 if (strstr(line,
"ENTRY") !=
nullptr )
556 tmp = strTok.next(
" \n\t");
561 if (!isdigit(*(pos++)))
563 tmp = strTok.next(
" \n\t");
568 if (!isdigit(*(pos++)))
570 tmp = strTok.next(
" \n\t");
578 template<
typename SequenceType,
typename MemoryType>
582 FILE *aln_F =
my_fopen(seq_f.c_str(),
"r");
586 format = _identify_aln_format(aln_F);
587 std::map<std::string, short> extract_only;
588 size_t n_extract_seqs = seq_names.size();
589 for (
unsigned int i = 0; i < n_extract_seqs; ++i)
590 extract_only.insert(std::pair<std::string,short>(seq_names[i],1));
592 fseek ( aln_F , 0 , SEEK_SET );
596 _read_fasta_f(aln_F, extract_only);
599 _read_clustalw_f(aln_F, extract_only);
602 _read_msf_f(aln_F, extract_only);
605 _read_stockholm_f(aln_F, extract_only);
608 _read_codata_f(aln_F, extract_only);
611 _read_amps_f(aln_F, extract_only);
614 _read_phylip_f(aln_F, extract_only);
621 if ((check) && (!check_set(*
this)))
625 this->seq_type(identify_seq_type(*(this->_seqs)[0]));
629 template<
typename SequenceType,
typename MemoryType>
633 const unsigned int LINE_LENGTH = 500;
634 char line[LINE_LENGTH];
635 SequenceType *tmp_seq =
nullptr;
636 char *comment =
nullptr, *name =
nullptr;
637 bool read_sequence =
true;
640 while (fgets(line, LINE_LENGTH, aln_F))
645 seq_length = tmp_seq->
size();
646 StrTok tokenizer(&line[1]);
647 name = tokenizer.next(
" \n");
648 comment = tokenizer.next(
"\n");
649 if (seq_names.empty() || (seq_names.count(name)>0))
651 read_sequence =
true;
652 if (comment ==
nullptr)
653 tmp_seq =
new SequenceType(name,
"", seq_length,
id++);
655 tmp_seq =
new SequenceType(name, comment, seq_length,
id++);
656 _seqs.push_back(Seq_ptr(tmp_seq));
659 read_sequence =
false;
661 else if (line[0] ==
'/')
667 tmp_seq->append(line);
668 if ((*tmp_seq)[tmp_seq->size()-1] ==
'\n')
669 tmp_seq->resize(tmp_seq->size()-1);
676 template<
typename SequenceType,
typename MemoryType>
678 SequenceSetBase<SequenceType, MemoryType>::_read_clustalw_f(FILE *aln_F,
const std::map<std::string, short> &seq_names)
680 const unsigned int LINE_LENGTH = 200;
681 char line[LINE_LENGTH];
682 if (fgets(line, LINE_LENGTH, aln_F)==
nullptr)
684 fprintf(stderr,
"Error when reading file.");
689 while ((fgets(line, LINE_LENGTH, aln_F) !=
nullptr) && (line[0] ==
'\n'))
690 t_pos = ftell(aln_F);
691 fseek(aln_F, t_pos, SEEK_SET);
694 SequenceType *seq_p =
nullptr;
695 char *name =
nullptr, *tmp_seq =
nullptr;
696 std::vector<short> read_sequence;
698 while (fgets(line, LINE_LENGTH, aln_F) !=
nullptr)
700 if ((line[0] ==
' ') || (line[0] ==
'\n'))
705 name = tokenizer.next(
" \n");
706 if (seq_names.empty() || seq_names.count(name) > 0)
708 read_sequence.push_back(1);
709 tmp_seq = tokenizer.next(
" \n");
710 seq_p =
new SequenceType(name,
"", tmp_seq,
id++);
711 _seqs.push_back(Seq_ptr(seq_p));
714 read_sequence.push_back(0);
719 unsigned int counter = 0;
720 unsigned int seq_id = 0;
721 while (fgets(line, LINE_LENGTH, aln_F) !=
nullptr)
723 if ((line[0] ==
' ') || (line[0] ==
'\n'))
730 if (read_sequence[counter])
733 tokenizer.next(
" \n");
734 tmp_seq = tokenizer.next(
" \n");
735 _seqs[seq_id++]->append(tmp_seq);
743 template<
typename SequenceType,
typename MemoryType>
745 SequenceSetBase<SequenceType, MemoryType>::_read_msf_f(FILE *aln_F,
const std::map<std::string, short> &seq_names)
748 const unsigned int LINE_LENGTH = 501;
749 char line[LINE_LENGTH];
750 while ((fgets(line, LINE_LENGTH, aln_F) !=
nullptr) && ((msf=strstr(line,
"MSF:")) ==
nullptr));
752 size_t seq_length=atoi(msf);
754 SequenceType *tmp_seq;
756 std::vector<short> read_sequence;
760 while (fgets(line, LINE_LENGTH, aln_F) !=
nullptr)
762 if ((line[0] ==
'/') && (line[1] ==
'/'))
764 if (((seq_name=strstr(line,
"Name:")) !=
nullptr) || ((seq_name=strstr(line,
"NAME:")) !=
nullptr))
766 tokenizer.set(seq_name+5);
767 seq_name = tokenizer.next(
" \n");
768 if (seq_names.empty() || (seq_names.count(seq_name)>0))
770 read_sequence.push_back(1);
771 tmp_seq =
new SequenceType(seq_name,
"", seq_length,
id++);
772 _seqs.push_back(Seq_ptr(tmp_seq));
776 read_sequence.push_back(0);
783 unsigned int counter = 0;
784 unsigned int seq_id = 0;
785 while (fgets(line, LINE_LENGTH, aln_F) !=
nullptr)
794 if (read_sequence[counter]==1)
797 while (*(++pos) !=
' ');
800 while ((c=*pos) !=
'\0')
802 if ((c==
'-') || (c==
'.') || (isalpha(c)))
803 _seqs[seq_id]->append(c);
810 if(fgets(line, LINE_LENGTH, aln_F)==
nullptr)
812 fprintf(stderr,
"Erorr occured\n");
829 if (fgets(line, LINE_LENGTH, aln_F)==
nullptr)
831 fprintf(stderr,
"Erorr occured\n");
843 template<
typename SequenceType,
typename MemoryType>
845 SequenceSetBase<SequenceType, MemoryType>::_read_stockholm_f(FILE *aln_F,
const std::map<std::string, short> &seq_names)
847 const unsigned int LINE_LENGTH = 501;
848 char line[LINE_LENGTH];
849 char *part1_p =
nullptr, *part2_p =
nullptr;
851 SequenceType *seq_p =
nullptr;
852 char *end=&line[LINE_LENGTH-1];
856 std::vector<short> read_seq;
859 while (fgets(line, LINE_LENGTH, aln_F))
866 else if (line[0] ==
'#')
868 while ((*end != 8) && (*end !=
'\n'))
871 if (!fgets(line, LINE_LENGTH, aln_F))
873 fprintf(stderr,
"Error occurred\n");
878 else if (line[0] !=
'/')
881 part1_p = tokenizer.next(
" \n");
882 part2_p = tokenizer.next(
" \n");
883 if (part2_p !=
nullptr)
885 if (seq_names.empty() || (seq_names.count(part1_p)))
890 seq_p = &(*_seqs[seq_id]);
891 seq_p->append(part2_p);
893 seq_p =
new SequenceType(part1_p, part2_p,
" ",
id++);
894 _seqs.push_back(Seq_ptr(seq_p));
904 seq_p->append(part1_p);
916 template<
typename SequenceType,
typename MemoryType>
918 SequenceSetBase<SequenceType, MemoryType>::_read_codata_f(FILE *aln_F,
const std::map<std::string, short> &seq_names)
920 const unsigned int LINE_LENGTH = 501;
921 char line[LINE_LENGTH];
923 size_t seq_length = 0;
925 SequenceType *seq_p =
nullptr;
928 unsigned int pos1 = 0, pos2 = 1;
929 bool extract_seq =
false;
930 while (fgets(line, LINE_LENGTH, aln_F))
932 if (!strncmp(line,
"ENTRY", 5))
934 tokenizer.set(&line[6]);
936 name = tokenizer.next(
" \n");
938 else if (!strncmp(line,
"SEQUENCE", 8))
940 if (seq_names.empty() || seq_names.count(name))
942 seq_p =
new SequenceType(name,
"", seq_length,
id++);
943 _seqs.push_back(Seq_ptr(seq_p));
944 if (!fgets(line, LINE_LENGTH, aln_F))
946 fprintf(stderr,
"Error occurred\n");
954 else if (strncmp(line,
"///", 3) == 0)
957 seq_length = seq_p->size();
965 while ((c=line[pos1++]) !=
'\0')
967 if (isalpha(c) || (c ==
'-'))
978 template<
typename SequenceType,
typename MemoryType>
980 SequenceSetBase<SequenceType, MemoryType>::_read_amps_f(FILE *aln_F,
const std::map<std::string, short> &seq_names)
982 const unsigned int LINE_LENGTH = 501;
983 char line[LINE_LENGTH];
984 std::vector<short> extract_seq;
988 char *name =
nullptr;
989 unsigned int num_seqs = 0;
990 while (fgets(line, LINE_LENGTH, aln_F))
994 else if (line[0] ==
'>')
997 tokenizer.set(&line[1]);
998 name = tokenizer.next(
"\n");
999 if (seq_names.empty() || seq_names.count(name))
1001 seq_p =
new SequenceType(name,
"", 0,
id++);
1002 _seqs.push_back(Seq_ptr(seq_p));
1003 extract_seq.push_back(1);
1006 extract_seq.push_back(0);
1010 char *seq_line =
new char[num_seqs+2];
1011 unsigned int n_seqs2 = num_seqs+2;
1013 unsigned int seq_id = 0;
1014 while (fgets(seq_line, n_seqs2, aln_F))
1016 if (seq_line[0] ==
'*')
1021 for (i=0; i<num_seqs; ++i)
1025 if ((c = seq_line[i]) ==
' ')
1026 _seqs[seq_id]->append(
'-');
1028 _seqs[seq_id]->append(c);
// Reader for the PHYLIP format (per the function name). Sequences are stored
// only when seq_names is empty or contains their name.
1039 template<
typename SequenceType,
typename MemoryType>
1041 SequenceSetBase<SequenceType, MemoryType>::_read_phylip_f(FILE *aln_F,
const std::map<std::string, short> &seq_names)
1043 const unsigned int LINE_LENGTH = 500;
1044 char line[LINE_LENGTH];
// One entry per sequence in the file: an index for extracted sequences,
// -1 for sequences that are skipped (see the push_back calls below).
1045 std::vector<int> use_seq;
1046 size_t num_seqs, seq_length;
// NOTE(review): as written the error message is printed when fgets SUCCEEDS.
// The other readers in this file test `== nullptr` / `!fgets(...)`, so this
// condition looks inverted - confirm against the full source.
1047 if (fgets(line, LINE_LENGTH, aln_F))
1049 fprintf(stderr,
"Error when reading file.");
// Header line: number of sequences followed by the sequence length.
// NOTE(review): "%lu" is paired with size_t arguments ("%zu" is the matching
// conversion) and the sscanf result is not checked here, so both values stay
// uninitialised if the header cannot be parsed.
1052 sscanf(line,
"%lu %lu", &num_seqs, &seq_length);
1056 SequenceType *seq_p;
1057 size_t seq_num = num_seqs;
1060 size_t line_num = 0;
1062 while (fgets(line, LINE_LENGTH, aln_F))
1064 if (line[0] ==
'\n')
// The first num_seqs non-empty lines carry the sequence names.
1068 if (line_num < num_seqs)
// Copies the first 10 characters of the line as the name.
// NOTE(review): strncpy does not null-terminate when the source has 10 or
// more characters; termination is presumably handled in lines not shown here.
1070 strncpy(name, line, 10);
1072 while ((*pos !=
'\0') && (*pos !=
' '))
1075 if ((seq_names.empty()) || (seq_names.count(name)))
1077 use_seq.push_back(++x);
1078 seq_p =
new SequenceType(name,
"", seq_length,
id++);
1079 _seqs.push_back(Seq_ptr(seq_p));
1083 use_seq.push_back(-1);
1093 if (seq_num >= num_seqs)
1095 if (use_seq[seq_num] >= 0)
1098 seq_p=&(*_seqs[use_seq[seq_num]]);
// Only letters and '-' are considered sequence characters.
1101 while ((c=*pos) !=
'\0')
1103 if ((isalpha(c)) || (c==
'-'))
// A buffer that does not end in '\n' means the physical line was longer than
// LINE_LENGTH, so more of it is fetched.
1108 if (*(pos-1) !=
'\n')
1110 if (!fgets(line, LINE_LENGTH, aln_F))
1120 while (line[strlen(line)-1] !=
'\n')
1122 if (!fgets(line, LINE_LENGTH, aln_F))
1134 template<
typename SequenceType,
typename MemoryType>
1138 std::string format_lc;
1139 format_lc.reserve(format.size());
1140 for (
size_t i= 0; i<format.size(); ++i)
1141 format_lc.push_back(tolower(format[i]));
1144 if ((format_lc ==
"fasta"))
1145 aln_F =
my_fopen(aln_f.c_str(),
"w");
1149 if (format_lc ==
"fasta")
1150 _write_fasta(aln_F);
// FASTA writer body: one header line per sequence followed by the sequence,
// wrapped every line_break characters.
1156 template<
typename SequenceType,
typename MemoryType>
1160 unsigned int num_seqs = this->n_seqs();
// NOTE(review): current_length has no initialiser here; it must be reset to 0
// for every sequence before the while loop below - confirm that this happens
// in the lines not visible in this view.
1161 size_t current_length, seq_length;
1162 std::string tmp_seq;
1165 for (
unsigned int i = 0; i < num_seqs; ++i)
// Header is ">name" or ">name comment", depending on whether a comment exists.
1167 if (_seqs[i]->comment().empty())
1168 fprintf(aln_F,
">%s\n", _seqs[i]->name().c_str());
1170 fprintf(aln_F,
">%s %s\n", _seqs[i]->name().c_str(), _seqs[i]->comment().c_str());
1172 tmp_seq = _seqs[i]->sequence();
1173 seq_length=_seqs[i]->size();
1174 while (current_length < seq_length)
// "%.*s" prints at most line_break characters starting at current_length.
// NOTE(review): the precision argument of "%.*s" is specified as int, while
// line_break is unsigned int.
1176 fprintf(aln_F,
"%.*s\n", line_break, &tmp_seq.c_str()[current_length]);
1177 current_length += line_break;
1187 template<
typename SequenceType,
typename MemoryType>
1191 std::sort(indices.begin(), indices.end());
1192 size_t n_dels = indices.size();
1193 size_t num_seqs = this->n_seqs();
1194 size_t index_pos = 0;
1196 for (
size_t i = 0; i < num_seqs; ++i)
1198 if ((index_pos==n_dels) || ((index_pos<n_dels) && (indices[index_pos] != i)))
1199 _seqs[seq_pos++] = _seqs[i];
1203 _seqs.resize(seq_pos);
// Index-based filter that keeps only the sequences whose position is listed
// in indices (presumably the definition of keep_seqs - the signature is not
// visible here). Kept sequences are compacted to the front of _seqs, which is
// then truncated.
1206 template<
typename SequenceType,
typename MemoryType>
// Sorts the caller's vector in place so it can be walked in step with the loop.
1210 std::sort(indices.begin(), indices.end());
1211 size_t n_dels = indices.size();
1212 size_t num_seqs = this->n_seqs();
1213 size_t index_pos = 0;
1215 for (
size_t i = 0; i < num_seqs; ++i)
// NOTE(review): indices[index_pos] is read before index_pos is compared with
// n_dels, so an empty indices vector is accessed out of bounds on the first
// iteration unless that case is guarded in lines not shown here.
1217 if (indices[index_pos] == i)
1219 _seqs[seq_pos++] = _seqs[i];
1222 if (index_pos==n_dels)
1225 _seqs.resize(seq_pos);
// Name-based filter over the stored sequences (presumably the definition of
// delete_seqs(const std::map<std::string,bool>&) - the signature is not
// visible here).
1228 template<
typename SequenceType,
typename MemoryType>
1232 size_t num_seqs = this->n_seqs();
1234 for (
size_t i = 0; i < num_seqs; ++i)
// NOTE(review): as written, sequences whose name IS in names are the ones
// copied forward and therefore kept; every other sequence is dropped by the
// resize below. If this is delete_seqs, the condition looks inverted - confirm.
1236 if (names.count(_seqs[i]->name()))
1237 _seqs[seq_pos++] = _seqs[i];
1239 _seqs.resize(seq_pos);
1245 template <
typename T>
1248 bool operator()(
const T& o1,
const T& o2)
1250 return o1->id() < o2->id();
1255 template <
typename T>
1256 struct Seq_Sort :
public std::binary_function<T,T,bool>
1258 bool operator()(
const T& o1,
const T& o2)
1265 template <
typename T>
1268 bool operator()(
const T& o1,
const T& o2)
1270 return o1->name() < o2->name();
1274 template<
typename SequenceType,
typename MemoryType>
1282 else if (type ==
"seq")
1286 else if (type ==
"name")
1293 template<
typename SequenceType,
typename MemoryType>
1297 size_t n_seqs=_seqs.size();
1298 for (
size_t i=0; i<n_seqs; ++i)
1299 _seqs[i]->insert_gaps(edit_string);
// Builds a histogram of the character codes used by all sequences of a set
// and then inspects selected code ranges. The function signature and the
// actions taken on a non-zero count are not visible in this view.
1303 template<
typename SequenceType,
typename MemoryType>
1307 size_t n_seqs =
set.n_seqs();
// One counter per possible 8-bit character code.
// NOTE(review): raw new[] paired with the delete[] at the end; any early
// return or exception in between would leak the buffer.
1309 size_t *val_counting =
new size_t[256];
1310 for (j=0; j<256; ++j)
1311 val_counting[j] = 0;
1313 for (
size_t i=0; i<n_seqs; ++i)
1316 const SequenceType &seq =
set[i];
// NOTE(review): if seq[j] yields a signed char, codes above 127 turn into a
// negative index here; casting through unsigned char would avoid that.
1319 for (j=0; j<len; ++j)
1320 ++val_counting[static_cast<int>(seq[j])];
// The four scans below cover every code except 45 ('-'), 65-90 ('A'-'Z') and
// 97-122 ('a'-'z').
1324 for (j=0; j<45; ++j)
1326 if (val_counting[j] != 0)
1329 for (j=46; j<65; ++j)
1331 if (val_counting[j] != 0)
1334 for (j=91; j<97; ++j)
1336 if (val_counting[j] != 0)
1339 for (j=123; j<256; ++j)
1341 if (val_counting[j] != 0)
1344 delete[] val_counting;
1348 template<
typename SeqType,
typename MemType>
1349 std::ostream& operator<< (std::ostream &out, const SequenceSetBase<SeqType, MemType> &seqSet)
1351 size_t n_seqs = seqSet.n_seqs();
1352 for (
size_t i=0; i<n_seqs; ++i)
1353 out << seqSet[i] << std::endl;
// Alias template: SequenceSet<S, M> names exactly the same type as
// SequenceSetBase<S, M>.
1359 template <
typename SequenceType,
typename MemoryType>
1360 using SequenceSet = SequenceSetBase<SequenceType, MemoryType>;