MDA
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups
SequenceFeatures.hpp
1 /*
2  * SequenceFeatures.hpp
3  *
4  * Created on: Jun 8, 2013
5  * Author: ck
6  */
7 
8 #ifndef GENOMEFEATURES_H_
9 #define GENOMEFEATURES_H_
10 
11 // C++ header
12 #include <algorithm>
13 #include <fstream>
14 #include <limits>
15 #include <map>
16 #include <string>
17 #include <vector>
18 
19 
20 // Boost header
21 #include <boost/tokenizer.hpp>
22 #include <boost/lexical_cast.hpp>
23 #include <boost/algorithm/string.hpp>
24 #include <boost/iterator.hpp>
25 
26 
27 #include "../Basics/basics.hpp"
28 #include "../utils/MDAT_Exceptions.hpp"
29 
30 namespace MDAT
31 {
32 
33 struct List_;
34 typedef Tag<List_> List;
35 
36 
/**
 * \brief A single annotated feature of a sequence, mirroring one record of a GFF file.
 *
 * All scalar members carry default member initialisers, so a default-initialised
 * object (`SingleSequenceFeature f;`) has the same well-defined state as a
 * value-initialised one (`SingleSequenceFeature()`).
 */
struct SingleSequenceFeature
{
    std::string seq_id;   ///< Identifier of the sequence the feature lies on.
    std::string source;   ///< Source column of the record.
    std::string type;     ///< Feature type, e.g. "gene", "mRNA", "exon".
    size_t start = 0;     ///< First position of the feature (the GFF reader stores it 0-based).
    size_t end = 0;       ///< Last position of the feature, inclusive.
    double score = 0.0;   ///< Score of the feature.
    char strand = '\0';   ///< Strand character as found in the record.
    short phase = 0;      ///< Phase of the feature.

    /// Attribute key/value pairs of the record (e.g. "ID", "Parent").
    std::map<std::string, std::string> attributes;

    /**
     * \brief Number of positions covered by the feature.
     * \return end - start + 1; both borders are inclusive.
     */
    size_t length() const
    {
        return end - start + 1;
    }
};
58 
59 
60 typedef std::vector<SingleSequenceFeature>::iterator SequenceFeatures_iter;
61 
62 
63 template<typename FeatureType>
65 {
66 private:
67  typedef std::map<std::string, std::vector<SingleSequenceFeature> >Feature_container;
68  Feature_container _feature_list;
69 
70  struct sort_features
71  {
72  inline bool operator() (const SingleSequenceFeature& feature1, const SingleSequenceFeature& feature2)
73  {
74  if ((feature1.type!="gene") && (feature2.type!="gene") && (feature1.type!="contig") && (feature2.type!="contig"))
75  {
76  if (feature1.attributes.at("Parent") != feature2.attributes.at("Parent"))
77  return feature1.attributes.at("Parent") < feature2.attributes.at("Parent");
78  }
79  if (feature1.start < feature2.start)
80  return true;
81  else if (feature1.start == feature2.start)
82  return (feature1.length()>feature2.length());
83  else
84  return false;
85  }
86  };
87 
88 
89 public:
90 
92 
93  virtual ~SequenceFeatures();
94 
99  void
100  read_gff(const std::string &gff_f);
101 
106  void
107  write_gff(const std::string &out_f);
108 
109  SequenceFeatures_iter
110  begin()
111  {
112  return _feature_list[_feature_list.begin()->first].begin();
113  }
114 
115  SequenceFeatures_iter
116  end()
117  {
118  return _feature_list[_feature_list.begin()->first].end();
119  }
120 
121 
127  SequenceFeatures_iter
128  begin(const std::string &seq_name)
129  {
130  return _feature_list[seq_name].begin();
131  }
132 
138  SequenceFeatures_iter
139  end(const std::string &seq_name)
140  {
141  return _feature_list[seq_name].end();
142  }
143 
144 
145 /* void
146  sort()
147  {
148  std::map<std::string, std::vector<SingleSequenceFeature> >::iterator it,it_end=_feature_list.end();
149  for (it=_feature_list.begin(); it!=it_end; ++it)
150  std::sort(it->second.begin(), it->second.end(), sort_features());
151  }*/
152 
153  void
154  append(const std::string &seq_id, const SingleSequenceFeature &feature)
155  {
156  _feature_list[seq_id].push_back(feature);
157  }
158 
162  void
163  add_introns();
164 
169  bool
170  empty() const
171  {
172  return _feature_list.empty();
173  }
174 /*
175  void
176  add_promoter(size_t length);
177 */
178 };
179 
180 template<typename FeatureType>
181 SequenceFeatures<FeatureType>::SequenceFeatures()
182 {
183  // TODO Auto-generated constructor stub
184 
185 }
186 
187 template<typename FeatureType>
188 SequenceFeatures<FeatureType>::~SequenceFeatures()
189 {
190  // TODO Auto-generated destructor stub
191 }
192 
193 
194 
195 template<typename FeatureType>
196 void
198 {
199  Feature_container::iterator cont_it, cont_it_end;
200  std::ifstream gff_F(gff_f);
201  if(!gff_F) throw MDAT_IO_Exception("Could not open file\n");
202  std::string line;
203  size_t pos=0;
204  typedef boost::tokenizer< boost::char_separator<char> > t_tokenizer;
205  boost::char_separator<char> sep("\t");
206  t_tokenizer tok(std::string(" "), sep);
207  t_tokenizer::iterator it;
208  std::string attribute, tmp_string;
209  std::vector<std::string> attribute_list;
210  std::vector<SingleSequenceFeature>::iterator feature_it;
211  int n_elements;
212  int i;
213  std::string seq_id;
214  //for(boost::tokenizer<>::iterator beg= tok.begin(); beg!=tok.end(); ++beg)
215  //scf7180000350264 pbar_OGSv1.2 gene 77142 87005 . + . ID=PB18755;Name=PB18755;Alias=PB18755;
216  std::string last_id ="";
217  while (!gff_F.eof())
218  {
219  getline(gff_F, line);
220  if (line.empty() || (line[0] == '#'))
221  continue;
222 
223  tok.assign(line);
224  it = tok.begin();
225  seq_id = *it;
226  if (seq_id!=last_id)
227  {
228  cont_it = _feature_list.find(seq_id);
229  if (cont_it==_feature_list.end())
230  {
231  cont_it=_feature_list.insert(std::pair<std::string, std::vector<SingleSequenceFeature> >(seq_id, std::vector<SingleSequenceFeature>())).first;
232  }
233  last_id=seq_id;
234  }
235 
236  cont_it->second.push_back(SingleSequenceFeature());
237  feature_it=cont_it->second.end()-1;
238  feature_it->seq_id=seq_id;
239  feature_it->source = *(++it);
240  feature_it->type = *(++it);
241  feature_it->start = boost::lexical_cast<size_t>(*(++it))-1; // gff starts at 1 in file
242  feature_it->end = boost::lexical_cast<size_t>(*(++it))-1;
243  if (*(++it) == ".")
244  feature_it->score = std::numeric_limits<double>::min();
245  else
246  feature_it->score = boost::lexical_cast<double>(*it);
247  feature_it->strand=(*(++it))[0];
248  if (*(++it) == ".")
249  feature_it->phase = std::numeric_limits<short>::min();
250  else
251  feature_it->phase = boost::lexical_cast<short>(*it);
252 
253  tmp_string = *(++it);
254  boost::split(attribute_list, tmp_string, boost::is_any_of(";="), boost::algorithm::token_compress_on);
255  n_elements = attribute_list.size();
256  for (i=0; i<n_elements; i+=2)
257  {
258  if (!attribute_list[i].empty())
259  feature_it->attributes[attribute_list[i]] = attribute_list[i+1];
260  else
261  break;
262  }
263  ++pos;
264  }
265 }
266 
267 
268 template<typename FeatureType>
269 void
271 {
272  std::ofstream out_F(out_f);
273  out_F << "##gff-version" << std::endl;
274  std::map<std::string, std::vector<SingleSequenceFeature> >::iterator it,it_end=_feature_list.end();
275  std::vector<SingleSequenceFeature>::iterator source_it, source_it_end;
276  std::map<std::string, std::string>::iterator attr_it,attr_it_end;
277  for (it=_feature_list.begin(); it!=it_end; ++it)
278  {
279  source_it_end=it->second.end();
280  for (source_it=it->second.begin(); source_it!=source_it_end; ++source_it)
281  {
282  out_F << source_it->seq_id << "\t" << source_it->source << "\t" << source_it->type << "\t"<< source_it->start+1 << "\t"<< source_it->end+1 << "\t";
283  if (source_it->score==std::numeric_limits<double>::min())
284  out_F << ".";
285  else
286  out_F << source_it->score;
287  out_F << "\t"<< source_it->strand<< "\t";
288  if (source_it->phase==std::numeric_limits<short>::min())
289  out_F<< ".\t";
290  else
291  out_F<< source_it->phase << "\t";
292  attr_it_end=source_it->attributes.end();
293  for (attr_it=source_it->attributes.begin(); attr_it!=attr_it_end; ++attr_it)
294  out_F << attr_it->first << "=" <<attr_it->second<<";";
295 
296  out_F << std::endl;
297 
298  }
299  }
300  out_F.close();
301 }
302 
303 template<typename FeatureType>
304 void
306 {
307  std::map<std::string, std::vector<SingleSequenceFeature> >::iterator it,it_end=_feature_list.end();
308  size_t last;
309  size_t i;
310  bool new_mRNA=false;
311  SingleSequenceFeature new_feat;
312 
313  for (it=_feature_list.begin(); it!=it_end; ++it)
314  {
315  std::vector<SingleSequenceFeature> &vec = it->second;
316  last=0;
317  for (i=0; i<vec.size(); ++i)
318  {
319  if (vec[i].type=="mRNA")
320  new_mRNA = true;
321  else if (vec[i].type=="exon")
322  {
323  if (new_mRNA)
324  new_mRNA=false;
325  else
326  {
327  new_feat=vec[i];
328 
329  if (vec[i].start>vec[last].start)
330  {
331  new_feat.start=vec[last].end+1;
332  new_feat.end = vec[i].start-1;
333  new_feat.type="intron";
334  }
335  else
336  {
337  new_feat.start=vec[i].end+1;
338  new_feat.end=vec[last].start-1;
339  new_feat.type="intron";
340  }
341  new_feat.attributes["ID"]=new_feat.attributes["Parent"]+":intron";
342  vec.insert(vec.begin()+i,new_feat);
343  ++i;
344  }
345  last=i;
346  }
347 
348  }
349  }
350  //sort();
351 }
352 
353 
354 
355 /*
356 template<typename FeatureType>
357 void
358 SequenceFeatures<FeatureType>::add_promoter(size_t length)
359 {
360  std::map<std::string, std::vector<SingleSequenceFeature> >::iterator it,it_end=_feature_list.end();
361  bool first_CDS=false;
362  SingleSequenceFeature promoter_feat;
363  size_t i;
364 
365  for (it=_feature_list.begin(); it!=it_end; ++it)
366  {
367  std::vector<SingleSequenceFeature> &vec = it->second;
368  for (i=0; i<it->second.size(); ++i)
369  {
370  if (vec[i].type=="mRNA")
371  first_CDS=true;
372  if ((first_CDS) && (vec[i].type=="CDS"))
373  {
374 
375  promoter_feat=vec[i];
376  if (promoter_feat.start>length)
377  {
378  promoter_feat.length=length;
379  promoter_feat.start-=length;
380  }
381  else
382  {
383  promoter_feat.length=promoter_feat.start;
384  promoter_feat.start=0;
385  }
386  promoter_feat.attributes["ID"]=promoter_feat.attributes["Parent"]+":promoter";
387  promoter_feat.type="promoter";
388  it->second.insert(it->second.begin()+i,promoter_feat);
389  first_CDS=false;
390  }
391  }
392  }
393  //sort();
394 }
395 */
396 
397 
398 
399 } /* namespace MDAT */
400 #endif /* GENOMEFEATURES_H_ */