MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
ProteinSequenceSet.hpp
1 /*
2  * ProteinSequenceSet.hpp
3  *
4  * Created on: 19 Jul 2013
5  * Author: Carsten Kemena
6  *
7  * This file is part of MDAT.
8  *
9  * MDAT is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * MDAT is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with MDAT. If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23 
24 #ifndef PROTEINSEQUENCESET_HPP_
25 #define PROTEINSEQUENCESET_HPP_
26 
27 
28 // C++ header
29 #include <fstream>
30 
31 // Boost header
32 #include <boost/algorithm/string.hpp>
33 
34 // BioSeqoolBox header
35 #include "SequenceSet.hpp"
36 #include "Seq_functs.hpp"
37 #include "SplitSet.hpp"
38 #include "ProteinSequence.hpp"
39 #include "../Annotation/DomainArchitecture.hpp"
40 
41 namespace MDAT
42 {
43 
44 
// Sequence set holding ProteinSequence objects. On top of what
// SequenceSetBase provides it stores the set of domain architectures
// (_dom_archis) and the list of domain accessions read from annotation
// files (_domain_accessions).
// NOTE(review): this listing comes from a rendered source view; several
// declarator lines inside the class are not present (see notes below).
54 template<typename MemoryType> class ProteinSequenceSet : public SequenceSetBase<ProteinSequence, MemoryType>
55 {
56 
57 private:
 // Shared pointer to a sequence; used for local variables in the readers.
58  typedef std::shared_ptr<ProteinSequence> Seq_ptr;
 // Domain architectures of this set; filled by extract_architectures().
59  DomainArchitectureSet _dom_archis;
 // Accessions of the domains added by the _read_* functions; the id handed
 // to each Domain is the position of its accession in this vector.
60  std::vector<std::string> _domain_accessions;
61 
 // Declared only: no definition or call of this function is visible in this file.
62  std::string _identify_domain_file_format(std::ifstream &domain_F);
 // Format specific readers, dispatched to by add_pfam_domains().
63  void _read_hmmscan_pfam(std::ifstream &domain_F);
64  void _read_pfamscan(std::ifstream &domain_F);
65  void _read_sff(std::ifstream &domain_F);
66 
67 public:
68 
 // Creates an empty set without architectures or accessions.
69  ProteinSequenceSet() : _dom_archis(), _domain_accessions()
70  {}
 // Creates an empty set and forwards id_val to the SequenceSetBase constructor.
71  ProteinSequenceSet(size_t id_val): SequenceSetBase<ProteinSequence, MemoryType>(id_val), _dom_archis(), _domain_accessions()
72  {}
73 
 // Copy construction is disabled.
74  ProteinSequenceSet(const ProteinSequenceSet&) = delete;
76 
78 
79 
 // Reads domain annotations from the file domain_f and attaches them to the sequences.
84  void add_pfam_domains(const std::string &domain_f);
85 
 // Calls clean_up(options) on the domain architecture of every sequence.
89  void clean_up_domains(unsigned char options);
90 
91 
 // The definition in this file has an empty body.
95  void refine_boundaries();
96 
 // Collects the distinct architectures of all sequences into _dom_archis.
100  void extract_architectures();
101 
 // Number of architectures currently stored in _dom_archis.
106  size_t n_architectures() const
107  {
108  return _dom_archis.size();
109  }
110 
 // Number of accessions currently stored in _domain_accessions.
115  size_t n_domains() const
116  {
117  return _domain_accessions.size();
118  }
119 
 // Accession stored at index i; the index is not range checked.
124  const std::string&
125  domain_name(size_t i) const
126  {
127  return _domain_accessions[i];
128  }
129 
130 
 // NOTE(review): the declarator of this member is missing from the listing.
 // Presumably the const accessor dom_archis(), which is called on const sets
 // further down in this file - confirm against the original header.
136  {
137  return _dom_archis;
138  }
139 
 // NOTE(review): declarator missing as well; presumably the non-const
 // overload of the accessor above - confirm.
144  {
145  return _dom_archis;
146  }
147 
148 
 // NOTE(review): the line carrying name and parameter is missing; the body
 // assigns a parameter called dom_arch to _dom_archis, which matches the
 // call dom_archis(arch) in splitByArchitecture - confirm.
149  void
151  {
152  _dom_archis=dom_arch;
153  }
154 
 // Writes the architectures and the sequences belonging to each of them to the file out_f.
159  void
160  write_domArchitecture(const std::string &out_f) const;
161 
163 };
164 
171 template<typename MemoryType>
172 void
174 {
175  size_t j,k;
176  size_t n_seqs = this->n_seqs();
177  std::vector<std::vector<size_t> > lengths;
178  size_t archlen;
179  for (size_t i = 0; i<n_seqs; ++i)
180  {
181  for (j=0; j<_dom_archis.size(); ++j)
182  {
183  if ((*this)[i].dom_archi() == _dom_archis[j])
184  {
185  const DomainArchitecture &arch =(*this)[i].dom_archi();
186  archlen = arch.size();
187  for (k=0; k<archlen; ++k)
188  lengths[j][k] += arch[k].seq_length();
189  break;
190  }
191  }
192  (*this)[i].dom_archi().id(j);
193  if (j==_dom_archis.size())
194  {
195  _dom_archis.push_back((*this)[i].dom_archi());
196  lengths.push_back(std::vector<size_t>());
197  for (k=0; k<_dom_archis[j].size(); ++k)
198  {
199  _dom_archis[j][k].ori_pos(k);
200  lengths[j].push_back(_dom_archis[j][k].seq_length());
201  }
202  }
203  _dom_archis[j].n_members(_dom_archis[j].n_members()+1);
204  }
205  for (j=0; j<_dom_archis.size(); ++j)
206  {
207  DomainArchitecture &arch = _dom_archis[j];
208  archlen=arch.size();
209  for (k=0; k<archlen; ++k)
210  {
211  arch[k].seq_start(0);
212  arch[k].seq_end((lengths[j][k]/arch.n_members())-1);
213  }
214  }
215 }
216 
217 
218 template<typename MemoryType>
219 void
221 {
222  std::ifstream domain_F;
223  domain_F.exceptions( std::ifstream::failbit | std::ifstream::badbit);
224  domain_F.open(domain_f.c_str(), std::ifstream::in);
225  domain_F.exceptions(std::ifstream::badbit);
226  std::string line;
227  getline(domain_F, line);
228  if (line.substr(0,14)== "# pfam_scan.pl")
229  _read_pfamscan(domain_F);
230  else
231  {
232  getline(domain_F, line);
233  domain_F.seekg(0);
234  if (line.substr(0,13)=="# target name")
235  _read_hmmscan_pfam(domain_F);
236  else
237  _read_sff(domain_F);
238  }
239  size_t n_seqs=this->n_seqs();
240  for (size_t i=0; i<n_seqs; ++i)
241  (*this)[i].dom_archi().sort();
242  domain_F.close();
243 }
244 
245 
246 
247 /*
248 # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
249 # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
250 #------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
251 HIPIP PF01355.12 64 A0YD43_9GAMM/35-102 - 68 2.5e-21 75.5 0.6 1 1 1.9e-25 2.8e-21 75.4 0.6 2 64 2 68 1 68 0.94 High potential iron-sulfur protein
252 */
253 template<typename MemoryType>
254 void
256 {
257  std::string line;
258  Seq_ptr tmp;
259  size_t aln_start, env_start, hmm_start;
260  int max_id=-1;
261  std::map<std::string, int> accession2id;
262  std::map<std::string, int>::iterator it;
263  std::string accession;
264  while (getline(domain_F, line))
265  {
266  if ((line[0] == '#') || (line[0] == '\0'))
267  continue;
268  std::vector<std::string> splitted;
269  boost::split(splitted, line, boost::is_any_of(" "), boost::token_compress_on);
270  ProteinSequence &seq = (*this)[splitted[3]];
271  aln_start=boost::lexical_cast<size_t>(splitted[17])-1;
272  env_start=boost::lexical_cast<size_t>(splitted[19])-1;
273  hmm_start=boost::lexical_cast<size_t>(splitted[15])-1;
274  accession=splitted[1].substr(0,7);
275  it=accession2id.find(accession);
276  if (it != accession2id.end())
277  seq.add_domain(Domain(splitted[0], accession, aln_start, boost::lexical_cast<size_t>(splitted[18])-1, env_start, boost::lexical_cast<size_t>(splitted[20])-1, hmm_start, boost::lexical_cast<size_t>(splitted[16])-1, boost::lexical_cast<double>(splitted[17]),it->second));
278  else
279  {
280  accession2id[accession] = ++max_id;
281  _domain_accessions.push_back(accession);
282  seq.add_domain(Domain(splitted[0], accession, aln_start, boost::lexical_cast<size_t>(splitted[18])-1, env_start, boost::lexical_cast<size_t>(splitted[20])-1, hmm_start, boost::lexical_cast<size_t>(splitted[16])-1, boost::lexical_cast<double>(splitted[17]),max_id));
283  }
284  }
285 }
286 
287 
288 //# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
289 //
290 //1aab_ 5 77 5 78 PF09011.5 HMG_box_2 Domain 1 72 73 94.7 3e-27 1 CL0114
291 template<typename MemoryType>
292 void
293 ProteinSequenceSet<MemoryType>::_read_pfamscan(std::ifstream &domain_F)
294 {
295  std::string line;
296  Seq_ptr tmp;
297  size_t aln_start, env_start, hmm_start;
298  int max_id=-1;
299  std::map<std::string, int> name2id;
300  std::map<std::string, int>::iterator it;
301  std::string name, accession;
302  while (getline(domain_F, line))
303  {
304 
305  if ((line[0] == '#') || (line[0] == '\0'))
306  continue;
307  std::vector<std::string> splitted;
308  boost::split(splitted, line, boost::is_any_of(" "), boost::token_compress_on);
309  ProteinSequence &seq = (*this)[splitted[0]];
310  aln_start=boost::lexical_cast<size_t>(splitted[1])-1;
311  env_start=boost::lexical_cast<size_t>(splitted[3])-1;
312  hmm_start=boost::lexical_cast<size_t>(splitted[8])-1;
313  accession=splitted[5].substr(0,7);
314  name=splitted[6];
315 
316  it=name2id.find(accession);
317  if (it != name2id.end())
318  seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[2])-1, env_start, boost::lexical_cast<size_t>(splitted[4])-1, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[12]),it->second ));
319  else
320  {
321  name2id[accession] = ++max_id;
322  _domain_accessions.push_back(accession);
323  seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[2])-1, env_start, boost::lexical_cast<size_t>(splitted[4])-1, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[12]),max_id ));
324  }
325  }
326 }
327 
328 
329 template<typename MemoryType>
330 void
331 ProteinSequenceSet<MemoryType>::_read_sff(std::ifstream &domain_F)
332 {
333  /*
334  SSF Format Description
335  #---------------------------------------------------------------------------------
336  # FILE FORMAT: Sam Summary File (SSF) Format 2.1
337  #
338  # Column 1: Sequence accession code e.g. gi|159385
339  # Column 2: HMM model name (not taken from stat or align file; set externally by user)
340  # Column 3: Total sequence length
341  # Column 4: Total HMM Length
342  # Column 5: Length of alignment (upper case letters AND gaps that appear within segments)
343  # Column 6: Number of matched residues (upper case letters)
344  # Column 7: First residue in sequence to match model (first upper case letter)
345  # Column 8: Last residue in sequence to match model (last upper case letter)
346  # Column 9: First matched position in model
347  # Column 10: Last matched position in model
348  # Column 11: Evalue
349  # Column 12: Simple score
350  # Column 13: Reverse score
351  # Column 14: Number of matched sequence segments (2+: discontinuous match)
352  # Column 15: Colon-separated segment boundaries e.g. 1:51:99:163
353  # Column 16 (Extended only): 1 or 0 indicating DomainFinder 3 mode (Cliquer or BMS).
354  #==============================================================================*/
355 
356  std::string line;
357  Seq_ptr tmp;
358  size_t aln_start, hmm_start;
359  int max_id=-1;
360  std::map<std::string, int> name2id;
361  std::map<std::string, int>::iterator it;
362  std::string name, accession;
363  while (getline(domain_F, line))
364  {
365  if ((line[0] == '#') || (line[0] == '\0'))
366  continue;
367  std::vector<std::string> splitted;
368  boost::split(splitted, line, boost::is_any_of(" "), boost::token_compress_on);
369  ProteinSequence &seq = (*this)[splitted[0]];
370  aln_start=boost::lexical_cast<size_t>(splitted[6])-1;
371  hmm_start=boost::lexical_cast<size_t>(splitted[8])-1;
372 
373  name=splitted[7];
374 
375  it=name2id.find(accession);
376  if (it != name2id.end())
377  seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[7])-1, 0, 0, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[10]),it->second ));
378  else
379  {
380  name2id[accession] = ++max_id;
381  _domain_accessions.push_back(accession);
382  seq.add_domain(Domain(name, accession, aln_start, boost::lexical_cast<size_t>(splitted[7])-1, 0, 0, hmm_start, boost::lexical_cast<size_t>(splitted[9])-1, boost::lexical_cast<double>(splitted[10]),max_id ));
383  }
384  }
385 }
386 
391 template<typename MemoryType>
392 void
394 {
395  size_t n_seqs = this->n_seqs();
396  for (size_t i=0; i<n_seqs; ++i)
397  (*this)[i].dom_archi().clean_up(options);
398 }
399 
400 template<typename MemoryType>
401 void
403 {
404 
405 }
406 
407 
408 template<typename MemoryType>
409 void
411 {
412  std::ofstream out_F(out_f);
413  out_F << "## MDA FORMAT 1" << std::endl << std::endl;
414  out_F << "#MDA" << std::endl;
415  int n_archis = _dom_archis.size();
416  for (int i=0; i<n_archis; ++i)
417  out_F << std::right << std::setw(2)<< i+1 << " " << _dom_archis[i];
418 
419  out_F << std::endl << "# Arch-ID: Sequences" << std::endl;
420  size_t j;
421  int id;
422  size_t n_seqs=this->n_seqs();
423  for (int i=0; i<n_archis; ++i)
424  {
425  id = _dom_archis[i].id();
426  out_F << i+1 << ":";
427  for (j=0; j<n_seqs; ++j)
428  {
429  if ((*this)[j].dom_archi().id() == id)
430  out_F << " " << (*this)[j].name();
431  }
432  out_F << std::endl;
433  }
434 }
435 
436 
437 
449 template<typename MemoryType>
450 void
451 domain_column_split(const ProteinSequenceSet<MemoryType> &set, SplitSet<ProteinSequenceSet<Default> > &splitSet)
452 {
453  size_t n_seqs=set.n_seqs();
454  const DomainArchitectureSet &dom_arch_set= set.dom_archis();
455  size_t n_domains=dom_arch_set[0].length();
456  size_t n_columns = n_domains*2+1;
457  size_t i,j;
458  std::vector<int> pattern(n_columns, 0);
459  for (j=0; j<n_domains; ++j)
460  {
461  if (dom_arch_set[0][j].id() != -1)
462  pattern[j*2+1]=dom_arch_set[0][j].id()+1;
463  }
464  splitSet.pattern(pattern);
465 
466  for (i=0; i<n_columns; ++i)
467  splitSet.push_back(new ProteinSequenceSet<Default>());
468 
469  size_t start;
470  int id =0;
471  int seq_id, arch_id;
472  for (i=0; i<n_seqs; ++i)
473  {
474  start =0;
475  const DomainArchitecture &dom_arch = dom_arch_set[0];//set[i].dom_archi().id()];
476  arch_id=dom_arch.id();
477  seq_id=set[i].id();
478  for (j=0; j<n_domains; ++j)
479  {
480  if (dom_arch[j].id() != -1)
481  {
482  const Domain &dom = set[i].dom_archi()[dom_arch[j].ori_pos()];
483  if (dom.seq_start()>0)
484  splitSet[j*2].add_seq(substr_end_no_renaming(set[i], start, dom.seq_start()-1));
485  else
486  splitSet[j*2].add_seq(new ProteinSequence(set[i].name(), "", "", seq_id));
487  splitSet[j*2+1].add_seq(substr_end_no_renaming(set[i], dom.seq_start(), dom.seq_end()));
488  start=dom.seq_end()+1;
489  splitSet[j*2+1][id].dom_archi().id(arch_id);
490  splitSet[j*2][id].dom_archi().id(arch_id);
491  }
492  }
493 
494  // the last column
495  if (start!=set[i].size())
496  splitSet[j*2].add_seq(substr_end_no_renaming(set[i], start, set[i].size()-1));
497  else
498  splitSet[j*2].add_seq(new ProteinSequence(set[i].name(), "", "", set[i].id()));
499  splitSet[j*2][id].id(seq_id);
500  splitSet[j*2][id].dom_archi().id(arch_id);
501  ++id;
502  }
503 }
504 
505 
512 template<typename MemoryType>
513 void
514 splitByArchitecture(const ProteinSequenceSet<MemoryType> &set, std::vector<ProteinSequenceSet<MemoryType> > &architectureSplits)
515 {
516  size_t nArchis = set.dom_archis().size();
517  size_t nSeqs = set.size();
518  architectureSplits.clear();
519  architectureSplits.reserve(nArchis);
520 
521  size_t i;
522  for (i=0; i<nArchis; ++i)
523  architectureSplits.push_back(ProteinSequenceSet<MemoryType>(i));
524 
525  for (size_t i=0; i<nSeqs; ++i)
526  architectureSplits[set[i].dom_archi().id()].share(set, i);
527 
528  for (size_t i=0; i<nArchis; ++i)
529  {
531  arch.push_back(set.dom_archis()[i]);
532  architectureSplits[i].dom_archis(arch);
533  }
534 }
535 
536 }
537 
538 #endif /* PROTEINSEQUENCESET_HPP_ */