MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
SequenceSet_memsafe.hpp
1 /*
2  * SequenceSet_memsafe.hpp
3  *
4  * Created on: Jun 2, 2013
5  * Author: Carsten Kemena
6  *
7  * This file is part of MDAT.
8  *
9  * MDAT is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU Lesser General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * MDAT is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  * GNU Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public License
20  * along with MDAT. If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23 
24 #ifndef SequenceSetBase_IO_MEMSAFE_HPP_
25 #define SequenceSetBase_IO_MEMSAFE_HPP_
26 
27 
// C header
#include <cstdlib>
#include <cstring>
#include <cstdio>

// C++ header
#include <algorithm>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
38 
39 
40 // MDAT header
41 #include "../Basics/basics.hpp"
42 #include "../Basics/utils.hpp"
43 #include "../utils/filesystem.h"
44 
45 
46 // Boost header
47 #include <boost/algorithm/string/split.hpp>
48 #include <boost/algorithm/string.hpp>
49 #include <boost/lexical_cast.hpp>
50 
51 
52 
53 using boost::bad_lexical_cast;
54 
55 namespace MDAT
56 {
67 template<typename SequenceType>
68 class SequenceSetBase<SequenceType, MemSafe>
69 {
70 
71 typedef std::shared_ptr<SequenceType> Seq_ptr;
72 
73 private:
74  Seq_ptr _seq;
75 
76  mutable std::vector<std::streampos> _id_index;
77  mutable std::map<std::string, size_t> _name_index;
78  mutable bool _complete;
79  mutable bool _index_changed;
80  mutable size_t _current_id;
81  mutable std::string _current_name;
82  mutable std::ifstream _seq_F;
83  mutable size_t _n_seqs;
84 
85  char _seq_type; // Sequence type
86  std::string _file; // File the alignment was read from
87 
88  void
89  _read_single_seq(const std::streampos &pos) const
90  {
91  std::string line;
92  _seq_F.seekg(pos);
93  getline(_seq_F, line);
94  size_t length=line.size();
95  size_t i;
96  for (i=0; i<length; ++i)
97  {
98  if (line[i]==' ')
99  break;
100  }
101  StrTok tokenizer(&line[1]);
102  _seq->name(tokenizer.next(" \n"));
103  char *comment = tokenizer.next("\n");
104  _seq->comment((comment != NULL)?comment: "" );
105 
106  _seq->sequence("");
107  while (std::getline(_seq_F, line))
108  {
109  if (line[0] == '>')
110  break;
111  _seq->append(line);
112  }
113  if (!_seq_F.good())
114  {
115  _complete=true;
116  _seq_F.clear();
117  }
118  }
119 
120  Seq_ptr
121  _access_value(size_t index_val) const
122  {
123 
124  if (index_val!=_current_id)
125  {
126  if (index_val >= _n_seqs)
127  {
128  std::string line;
129  size_t i;
130  size_t length;
131  _seq_F.seekg(_id_index[_n_seqs-1]);
132  std::getline(_seq_F, line);
133  std::streampos pos=0;
134  while (std::getline(_seq_F, line))
135  {
136  if (line[0] =='>')
137  {
138  length=line.size();
139  for (i=0; i<length; ++i)
140  {
141  if (line[i]==' ')
142  break;
143  }
144 
145  std::string seq_name = line.substr(1,i-1);
146  _id_index.push_back(pos);
147  _name_index[seq_name]=_n_seqs;
148  ++_n_seqs;
149  if ((_n_seqs) == index_val+1)
150  break;
151  }
152  pos=_seq_F.tellg();
153  }
154  _index_changed=true;
155  }
156  _current_id=index_val;
157  _current_name=_seq->name();
158  _read_single_seq(_id_index[index_val]);
159  }
160  return _seq;
161  }
162 
163  Seq_ptr
164  _access_value(const std::string seq_id) const
165  {
166  std::map<std::string, size_t>::iterator it_end= _name_index.end();
167  if (_name_index.find(seq_id)==it_end)
168  {
169  std::string line;
170  size_t i;
171  size_t length;
172  _seq_F.seekg(_id_index[_n_seqs-1]);
173  std::getline(_seq_F, line);
174  std::streampos pos=0;
175  while (std::getline(_seq_F, line))
176  {
177  if (line[0] =='>')
178  {
179  length=line.size();
180  for (i=0; i<length; ++i)
181  {
182  if (line[i]==' ')
183  break;
184  }
185 
186  std::string seq_name = line.substr(1,i-1);
187  _id_index.push_back(pos);
188  _name_index[seq_name]=pos;
189  ++_n_seqs;
190  if (seq_name==seq_id)
191  break;
192  }
193  pos=_seq_F.tellg();
194  }
195  _read_single_seq(_id_index[_n_seqs-1]);
196  _index_changed=true;
197  }
198  else
199  _read_single_seq(_id_index[_name_index[seq_id]]);
200  return _seq;
201  }
202 
203 
204 public:
205 
206  /***********************************************************************
207  * Constructors & Destructors *
208  ***********************************************************************/
209 
210 
211  SequenceSetBase():_complete(false),_index_changed(false),_current_id(-1),_n_seqs(0),_seq_type('x')
212  {
213  SequenceType *seq = new SequenceType("","",0);
214  _seq.reset(seq);
215  }
216 
217  SequenceSetBase(const std::string &seq_f)
218  {
219  SequenceType *seq = new SequenceType("","",0);
220  _seq.reset(seq);
221  set_file(seq_f);
222  }
223 
224  SequenceSetBase(const std::string &seq_f, const std::string &index_f)
225  {
226  SequenceType *seq = new SequenceType("","",0);
227  _seq.reset(seq);
228  set_file(seq_f, index_f);
229  }
230 
231  virtual ~SequenceSetBase()
232  {
233  _seq_F.close();
234  }
235 
236 
237  /***********************************************************************
238  * Simple Access functions *
239  ***********************************************************************/
240 
241  // Operators
243 
244 
245  Seq_ptr
246  next()
247  {
248  if (((_current_id+1)==_n_seqs) && (_complete))
249  return Seq_ptr(NULL);
250  else
251  return _access_value(_current_id+1);
252  }
253 
254 
260  SequenceType &operator[](unsigned int index)
261  {
262  return *_access_value(index);
263  }
264  SequenceType &operator[](std::string &seq_id)
265  {
266  return *_access_value(seq_id);
267  }
268 
272  const SequenceType &operator[](unsigned int index) const
273  {
274  return *_access_value(index);
275  }
276 
277  const SequenceType &operator[](const std::string &seq_id) const
278  {
279  return *_access_value(seq_id);
280  }
285 
286 
287  void
288  build_index() const
289  {
290  std::streampos pos;
291  std::string line;
292  _seq_F.seekg(_id_index[_n_seqs-1]);
293  std::getline(_seq_F, line);
294  size_t i, length;
295  while (!_complete)
296  {
297  while (std::getline(_seq_F, line))
298  {
299  if (line[0] == '>')
300  break;
301  pos=_seq_F.tellg();
302  }
303  if (!_seq_F.good())
304  {
305  _complete=true;
306  break;
307  }
308  length=line.size();
309  for (i=0; i<length; ++i)
310  {
311  if (line[i]==' ')
312  break;
313  }
314  _id_index.push_back(pos);
315  _name_index[line.substr(1,i-1)] = _n_seqs;
316  ++_n_seqs;
317  _index_changed=true;
318  }
319  _seq_F.clear();
320  }
321 
322  size_t
323  n_seqs() const
324  {
325  if (!_complete)
326  build_index();
327  return _n_seqs;
328  }
329 
330  void
331  read(const std::string &seq_f)
332  {
333  set_file(seq_f);
334  }
335 
336  void
337  set_file(const std::string &seq_f)
338  {
339  _current_id=0;
340  _complete=false;
341  _seq_F.open(seq_f);
342  _id_index.clear();
343  _name_index.clear();
344 
345  std::string line;
346  std::streampos pos=_seq_F.tellg();
347  while (std::getline(_seq_F, line))
348  {
349  if (line[0] == '>')
350  break;
351  pos=_seq_F.tellg();
352 
353  }
354  _n_seqs=1;
355  size_t i;
356  size_t length=line.size();
357  for (i=0; i<length; ++i)
358  {
359  if (line[i]==' ')
360  break;
361  }
362 
363  std::string seq_name = line.substr(1,i-1);
364  _id_index.push_back(pos);
365  _name_index[seq_name]=pos;
366  _read_single_seq(pos);
367  _index_changed=true;
368  }
369 
370  void
371  set_file(const std::string &seq_f, const std::string &index_f)
372  {
373  _current_id=0;
374  _seq_F.open(seq_f);
375  _id_index.clear();
376  _name_index.clear();
377  read_index(index_f);
378  _index_changed=false;
379  _read_single_seq(_id_index[0]);
380  }
381 
382  const SequenceType* seq(unsigned int index) const
383  {
384  return &(*(_access_value(index)));
385  }
386 
387  void
388  read_index(const std::string &index_f)
389  {
390  std::ifstream index_F(index_f);
391  std::string line;
392  std::vector<std::string> list;
393  size_t seq_id;
394  getline(index_F, line); //Version check
395  if (line=="#MDAT - SeqIndex v1.0")
396  {
397  getline(index_F, line); // is completed
398  _n_seqs=boost::lexical_cast<size_t>(line);
399  _id_index.resize(_n_seqs);
400  getline(index_F, line);
401  if (line=="complete")
402  _complete=true;
403  else
404  _complete=false;
405  while(getline(index_F, line))
406  {
407  split(list, line, boost::is_any_of(" "));
408  std::streampos pos = boost::lexical_cast<size_t>(list[2]);
409  seq_id = boost::lexical_cast<size_t>(list[1]);
410  _id_index[seq_id] = pos;
411  _name_index[list[0]] = seq_id;
412  }
413  }
414  else
415  {
416  std::cerr << "Problem reading the index file" << std::endl;
417  }
418  index_F.close();
419  }
420 
421 
422  void
423  write_index(const std::string &index_f) const
424  {
425  std::ofstream index_F(index_f);
426  std::map<std::string, size_t>::const_iterator it,it_end=_name_index.end();
427  index_F << "#MDAT - SeqIndex v1.0" << std::endl;
428  index_F << _n_seqs << std::endl;
429  if (_complete)
430  index_F << "complete" << std::endl;
431  else
432  index_F << "incomplete" << std::endl;
433  for (it=_name_index.begin(); it!=it_end; ++it)
434  {
435  index_F << it->first << " " << it->second << " " << _id_index[it->second] << std::endl;
436  }
437  }
438 
439  bool
440  index_changed() const
441  {
442  return _index_changed;
443  }
444 
445 };
446 
452 }
453 
454 #endif