24 #ifndef PROTEINSEQUENCESET_HPP_
25 #define PROTEINSEQUENCESET_HPP_
32 #include <boost/algorithm/string.hpp>
35 #include "SequenceSet.hpp"
37 #include "SplitSet.hpp"
39 #include "../Annotation/DomainArchitecture.hpp"
// Shared-ownership pointer alias for ProteinSequence objects.
58 typedef std::shared_ptr<ProteinSequence> Seq_ptr;
// Accession strings of the domains registered so far. The annotation readers
// in this file push_back an accession here right after assigning it a new id.
60 std::vector<std::string> _domain_accessions;
// Internal helpers that all receive an already-opened annotation stream.
// Judging by the names there is one reader per annotation format plus a
// format sniffer -- TODO confirm against the definitions.
62 std::string _identify_domain_file_format(std::ifstream &domain_F);
63 void _read_hmmscan_pfam(std::ifstream &domain_F);
64 void _read_pfamscan(std::ifstream &domain_F);
65 void _read_sff(std::ifstream &domain_F);
// Accessor body fragment (the enclosing signature is not visible in this
// view): yields the number of stored domain architectures.
108 return _dom_archis.
size();
// Accessor body fragment: yields the number of stored domain accessions.
117 return _domain_accessions.size();
// Accessor body fragment: yields the accession at index i. operator[] does
// no bounds checking, so i must be a valid index.
127 return _domain_accessions[i];
// Setter body fragment: replaces the stored domain architectures with dom_arch.
152 _dom_archis=dom_arch;
// Fragment of a member template (signature, braces and several declarations
// are not visible in this view). The visible statements collect the distinct
// domain architectures of all sequences in _dom_archis and then give every
// collected architecture averaged domain coordinates.
171 template<
typename MemoryType>
176 size_t n_seqs = this->n_seqs();
// lengths[j][k] sums seq_length() of domain k over the sequences assigned to
// architecture j.
177 std::vector<std::vector<size_t> > lengths;
179 for (
size_t i = 0; i<n_seqs; ++i)
// Look for an already collected architecture equal to that of sequence i.
181 for (j=0; j<_dom_archis.size(); ++j)
183 if ((*
this)[i].dom_archi() == _dom_archis[j])
// `arch` and `archlen` are declared on lines that are not shown here.
186 archlen = arch.
size();
// Add the domain lengths of this sequence to the running sums.
187 for (k=0; k<archlen; ++k)
188 lengths[j][k] += arch[k].seq_length();
// Store index j as the id of sequence i's architecture.
192 (*this)[i].dom_archi().
id(j);
// j == size() presumably means the search loop above finished without a
// match (the loop exit is on a line not shown): register a new architecture.
193 if (j==_dom_archis.size())
195 _dom_archis.push_back((*
this)[i].dom_archi());
196 lengths.push_back(std::vector<size_t>());
197 for (k=0; k<_dom_archis[j].size(); ++k)
// Record each domain's position k as its original position and seed the
// length sum with this first member's domain length.
199 _dom_archis[j][k].ori_pos(k);
200 lengths[j].push_back(_dom_archis[j][k].seq_length());
// One more sequence counted for architecture j.
203 _dom_archis[j].n_members(_dom_archis[j].n_members()+1);
// Second pass over the collected architectures: each domain is set to start
// at 0 and to end at (summed length / member count) - 1. Both operands are
// integral, so the division truncates.
205 for (j=0; j<_dom_archis.size(); ++j)
209 for (k=0; k<archlen; ++k)
211 arch[k].seq_start(0);
212 arch[k].seq_end((lengths[j][k]/arch.
n_members())-1);
// Fragment of a member template (signature and braces are not visible in
// this view). It opens the annotation file named by domain_f, passes the
// stream to a format-specific reader and finally calls sort() on the domain
// architecture of every sequence.
218 template<
typename MemoryType>
222 std::ifstream domain_F;
// failbit exceptions are enabled only around open(), so a file that cannot
// be opened raises an exception instead of failing silently ...
223 domain_F.exceptions( std::ifstream::failbit | std::ifstream::badbit);
224 domain_F.open(domain_f.c_str(), std::ifstream::in);
// ... afterwards only badbit throws, so getline() hitting end-of-file (which
// sets failbit) simply ends the read loops.
225 domain_F.exceptions(std::ifstream::badbit);
// The first line selects the format: a pfam_scan.pl banner dispatches to the
// pfam_scan reader.
227 getline(domain_F, line);
228 if (line.substr(0,14)==
"# pfam_scan.pl")
229 _read_pfamscan(domain_F);
// Another line is read; a "# target name" header dispatches to the hmmscan
// reader. The else/brace structure around these calls is on lines not shown.
232 getline(domain_F, line);
234 if (line.substr(0,13)==
"# target name")
235 _read_hmmscan_pfam(domain_F);
// Sort the domains inside every sequence's architecture.
239 size_t n_seqs=this->n_seqs();
240 for (
size_t i=0; i<n_seqs; ++i)
241 (*
this)[i].dom_archi().sort();
// Fragment of a reader member template; the signature is not visible, but the
// column layout suggests this is _read_hmmscan_pfam -- TODO confirm.
// Each data line is split on spaces and turned into one Domain that is added
// to a sequence `seq` (bound on a line not shown here).
253 template<
typename MemoryType>
259 size_t aln_start, env_start, hmm_start;
// Maps a domain accession to the integer id handed out for it.
261 std::map<std::string, int> accession2id;
262 std::map<std::string, int>::iterator it;
263 std::string accession;
264 while (getline(domain_F, line))
// Comment lines and empty lines are detected here; the action taken
// (presumably skipping the line) is on a line not shown.
266 if ((line[0] ==
'#') || (line[0] ==
'\0'))
// Runs of spaces count as a single separator (token_compress_on).
// NOTE(review): the fields are indexed without checking splitted.size(); a
// short or malformed line would index out of range.
268 std::vector<std::string> splitted;
269 boost::split(splitted, line, boost::is_any_of(
" "), boost::token_compress_on);
// Start coordinates are read from the file and decremented by one.
271 aln_start=boost::lexical_cast<
size_t>(splitted[17])-1;
272 env_start=boost::lexical_cast<
size_t>(splitted[19])-1;
273 hmm_start=boost::lexical_cast<
size_t>(splitted[15])-1;
// Only the first 7 characters of the accession field are kept.
274 accession=splitted[1].
substr(0,7);
275 it=accession2id.find(accession);
// Known accession: reuse the id stored in the map.
// NOTE(review): the double argument is parsed from splitted[17], the very
// field used for aln_start above. If the input is HMMER --domtblout output,
// 0-based field 17 is "ali from" and the E-values sit in fields 6/11/12, so
// this looks like a wrong column index -- confirm before changing.
276 if (it != accession2id.end())
277 seq.
add_domain(
Domain(splitted[0], accession, aln_start, boost::lexical_cast<size_t>(splitted[18])-1, env_start, boost::lexical_cast<size_t>(splitted[20])-1, hmm_start, boost::lexical_cast<size_t>(splitted[16])-1, boost::lexical_cast<double>(splitted[17]),it->second));
// Presumably the else branch (the else/brace lines are not shown): hand out
// the next id, remember the accession and add the domain with that new id.
// max_id is declared on a line not shown. The same splitted[17] concern
// applies to the call below.
280 accession2id[accession] = ++max_id;
281 _domain_accessions.push_back(accession);
282 seq.
add_domain(
Domain(splitted[0], accession, aln_start, boost::lexical_cast<size_t>(splitted[18])-1, env_start, boost::lexical_cast<size_t>(splitted[20])-1, hmm_start, boost::lexical_cast<size_t>(splitted[16])-1, boost::lexical_cast<double>(splitted[17]),max_id));
// Reader for the stream handed over after the "# pfam_scan.pl" banner was
// recognised. Each data line is split on spaces and becomes one Domain that
// is added to the sequence looked up by the first field. The return type and
// braces of this definition are on lines not shown in this view.
291 template<
typename MemoryType>
293 ProteinSequenceSet<MemoryType>::_read_pfamscan(std::ifstream &domain_F)
297 size_t aln_start, env_start, hmm_start;
// Despite its name, this map is keyed by the accession string (see the
// find() and operator[] calls below); the value is the assigned id.
299 std::map<std::string, int> name2id;
300 std::map<std::string, int>::iterator it;
301 std::string name, accession;
302 while (getline(domain_F, line))
// Comment lines and empty lines are detected here; the action taken
// (presumably skipping the line) is on a line not shown.
305 if ((line[0] ==
'#') || (line[0] ==
'\0'))
// Runs of spaces count as a single separator (token_compress_on).
// NOTE(review): the fields are indexed without checking splitted.size().
307 std::vector<std::string> splitted;
308 boost::split(splitted, line, boost::is_any_of(
" "), boost::token_compress_on);
// Field 0 is used as the key to select the target sequence in this set.
309 ProteinSequence &seq = (*this)[splitted[0]];
// Start coordinates are read from the file and decremented by one.
310 aln_start=boost::lexical_cast<
size_t>(splitted[1])-1;
311 env_start=boost::lexical_cast<
size_t>(splitted[3])-1;
312 hmm_start=boost::lexical_cast<
size_t>(splitted[8])-1;
// Only the first 7 characters of the accession field are kept.
313 accession=splitted[5].
substr(0,7);
// `name` is passed to Domain below; its assignment is presumably on one of
// the lines not shown here -- TODO confirm it is set before use.
316 it=name2id.find(accession);
// Known accession: reuse the id stored in the map.
317 if (it != name2id.end())
318 seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[2])-1, env_start, boost::lexical_cast<size_t>(splitted[4])-1, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[12]),it->second ));
// Presumably the else branch (the else/brace lines are not shown): hand out
// the next id, remember the accession and add the domain with that new id.
// max_id is declared on a line not shown.
321 name2id[accession] = ++max_id;
322 _domain_accessions.push_back(accession);
323 seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[2])-1, env_start, boost::lexical_cast<size_t>(splitted[4])-1, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[12]),max_id ));
// Reader for a third annotation layout. Each data line is split on spaces
// and becomes one Domain that is added to the sequence looked up by the
// first field. The return type and braces of this definition are on lines
// not shown in this view.
329 template<
typename MemoryType>
331 ProteinSequenceSet<MemoryType>::_read_sff(std::ifstream &domain_F)
358 size_t aln_start, hmm_start;
// Despite its name, this map is keyed by the accession string (see the
// find() and operator[] calls below); the value is the assigned id.
360 std::map<std::string, int> name2id;
361 std::map<std::string, int>::iterator it;
362 std::string name, accession;
363 while (getline(domain_F, line))
// Comment lines and empty lines are detected here; the action taken
// (presumably skipping the line) is on a line not shown.
365 if ((line[0] ==
'#') || (line[0] ==
'\0'))
// Runs of spaces count as a single separator (token_compress_on).
// NOTE(review): the fields are indexed without checking splitted.size().
367 std::vector<std::string> splitted;
368 boost::split(splitted, line, boost::is_any_of(
" "), boost::token_compress_on);
// Field 0 is used as the key to select the target sequence in this set.
369 ProteinSequence &seq = (*this)[splitted[0]];
// Start coordinates are read from the file and decremented by one.
370 aln_start=boost::lexical_cast<
size_t>(splitted[6])-1;
371 hmm_start=boost::lexical_cast<
size_t>(splitted[8])-1;
// `name` and `accession` are used below; their assignments are presumably on
// lines not shown here -- TODO confirm they are set before use.
375 it=name2id.find(accession);
// Known accession: reuse the stored id. Unlike the other readers, the two
// envelope coordinates are passed as the constants 0, 0.
376 if (it != name2id.end())
377 seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[7])-1, 0, 0, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[10]),it->second ));
// Presumably the else branch (the else/brace lines are not shown): hand out
// the next id, remember the accession and add the domain with that new id.
// max_id is declared on a line not shown.
380 name2id[accession] = ++max_id;
381 _domain_accessions.push_back(accession);
382 seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[7])-1, 0, 0, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[10]),max_id ));
// Fragment of a member template (signature not visible in this view): calls
// clean_up(options) on the domain architecture of every sequence in the set.
// What clean_up does with `options` is defined elsewhere.
391 template<
typename MemoryType>
395 size_t n_seqs = this->n_seqs();
396 for (
size_t i=0; i<n_seqs; ++i)
397 (*
this)[i].dom_archi().clean_up(options);
400 template<
typename MemoryType>
// Fragment of a member template (signature not visible in this view) that
// writes the collected domain architectures and their member sequences to
// the file out_f in a text layout headed "## MDA FORMAT 1".
408 template<
typename MemoryType>
// NOTE(review): the stream state is not checked after opening in the lines
// visible here.
412 std::ofstream out_F(out_f);
// Header, an empty line, then the "#MDA" section marker.
413 out_F <<
"## MDA FORMAT 1" << std::endl << std::endl;
414 out_F <<
"#MDA" << std::endl;
// The container size is stored in an int; indices below are int as well.
415 int n_archis = _dom_archis.size();
// One entry per architecture, prefixed by its 1-based index right-aligned
// in a field of width 2.
416 for (
int i=0; i<n_archis; ++i)
417 out_F << std::right << std::setw(2)<< i+1 <<
" " << _dom_archis[i];
// Second section: which sequences belong to which architecture.
419 out_F << std::endl <<
"# Arch-ID: Sequences" << std::endl;
422 size_t n_seqs=this->n_seqs();
423 for (
int i=0; i<n_archis; ++i)
// `id` and `j` are declared on lines not shown here.
425 id = _dom_archis[i].id();
// Linear scan over all sequences for every architecture; a sequence is
// listed when its architecture id equals the current one.
427 for (j=0; j<n_seqs; ++j)
429 if ((*
this)[j].dom_archi().
id() ==
id)
430 out_F <<
" " << (*this)[j].name();
// Fragment of a function template (signature, braces and several
// declarations are not visible in this view). It cuts every sequence of
// `set` into alternating pieces and stores them column-wise in `splitSet`:
// column 2*j receives the stretch in front of domain j, column 2*j+1 the
// domain itself, and the column used after the loop receives the remaining
// tail of the sequence.
449 template<
typename MemoryType>
453 size_t n_seqs=
set.n_seqs();
// Layout is taken from the first entry of dom_arch_set: n domains give
// 2*n+1 columns.
455 size_t n_domains=dom_arch_set[0].length();
456 size_t n_columns = n_domains*2+1;
// pattern holds one int per column, initialised to 0. Odd (domain) columns
// whose domain id is not -1 are set to id+1; all other entries stay 0.
458 std::vector<int> pattern(n_columns, 0);
459 for (j=0; j<n_domains; ++j)
461 if (dom_arch_set[0][j].
id() != -1)
462 pattern[j*2+1]=dom_arch_set[0][j].id()+1;
464 splitSet.pattern(pattern);
// The body of this loop over all columns is on lines not shown here.
466 for (i=0; i<n_columns; ++i)
472 for (i=0; i<n_seqs; ++i)
// `dom_arch`, `arch_id`, `start`, `seq_id` and `id` are declared or assigned
// on lines not shown here.
476 arch_id=dom_arch.
id();
478 for (j=0; j<n_domains; ++j)
// Domains with id -1 are not processed by the visible statements.
480 if (dom_arch[j].
id() != -1)
// Look up the concrete domain of sequence i through the original position
// stored in the reference architecture.
482 const Domain &dom =
set[i].dom_archi()[dom_arch[j].ori_pos()];
// Stretch from `start` up to the residue before the domain ...
484 splitSet[j*2].add_seq(substr_end_no_renaming(
set[i], start, dom.
seq_start()-1));
// ... or a ProteinSequence built from empty strings instead; the condition
// choosing between the two calls is on a line not shown.
// NOTE(review): a raw `new` result is passed to add_seq -- presumably add_seq
// takes ownership; confirm.
486 splitSet[j*2].add_seq(
new ProteinSequence(
set[i].name(),
"",
"", seq_id));
// The domain itself, from seq_start() to seq_end().
487 splitSet[j*2+1].add_seq(substr_end_no_renaming(
set[i], dom.
seq_start(), dom.
seq_end()));
// Tag both new pieces with the id of the source architecture.
489 splitSet[j*2+1][id].dom_archi().id(arch_id);
490 splitSet[j*2][id].dom_archi().id(arch_id);
// After the domain loop: if residues remain behind `start`, add that tail ...
495 if (start!=
set[i].size())
496 splitSet[j*2].add_seq(substr_end_no_renaming(
set[i], start,
set[i].size()-1));
// ... presumably otherwise (the else line is not shown) add a ProteinSequence
// built from empty strings, so the column still gets an entry.
498 splitSet[j*2].add_seq(
new ProteinSequence(
set[i].name(),
"",
"",
set[i].
id()));
// Set the sequence id and architecture id on the piece at index `id`.
499 splitSet[j*2][id].id(seq_id);
500 splitSet[j*2][id].dom_archi().id(arch_id);
// Fragment of a function template (signature and braces are not visible in
// this view). It groups the sequences of `set` by domain architecture: one
// entry of architectureSplits per architecture, each sharing the sequences
// whose architecture id equals the entry's index.
512 template<
typename MemoryType>
516 size_t nArchis =
set.dom_archis().size();
517 size_t nSeqs =
set.size();
// Start from an empty result. reserve() only allocates capacity and does not
// create elements.
518 architectureSplits.clear();
519 architectureSplits.reserve(nArchis);
// The body of this loop is on lines not shown; presumably it appends one
// element per architecture, which the indexing below relies on -- TODO confirm.
522 for (i=0; i<nArchis; ++i)
// The architecture id of each sequence is used directly as the index of the
// split that shares it.
525 for (
size_t i=0; i<nSeqs; ++i)
526 architectureSplits[
set[i].dom_archi().id()].share(
set, i);
// `arch` is built on lines not shown here and is stored as the domain
// architectures of split i.
528 for (
size_t i=0; i<nArchis; ++i)
532 architectureSplits[i].dom_archis(arch);