8 #ifndef GENOMEFEATURES_H_
9 #define GENOMEFEATURES_H_
21 #include <boost/tokenizer.hpp>
22 #include <boost/lexical_cast.hpp>
23 #include <boost/algorithm/string.hpp>
24 #include <boost/iterator.hpp>
27 #include "../Basics/basics.hpp"
28 #include "../utils/MDAT_Exceptions.hpp"
// Shorthand name List for the tag type Tag<List_>.
// Tag and List_ are declared outside this excerpt.
34 typedef Tag<List_> List;
// Key/value annotations of a feature, keyed by attribute name.
// Elsewhere in this file the GFF reader fills it from key=value pairs, and
// the ordering predicate and intron code look up the Parent and ID keys.
49 std::map<std::string, std::string> attributes;
// Iterator over a vector of SingleSequenceFeature; this is the return type
// of the begin()/end() accessors further down in this file.
60 typedef std::vector<SingleSequenceFeature>::iterator SequenceFeatures_iter;
// Template head with a single type parameter FeatureType. The declaration
// it introduces is on a line not visible in this excerpt.
63 template<
typename FeatureType>
// Storage layout: sequence identifier -> all features recorded for it.
67 typedef std::map<std::string, std::vector<SingleSequenceFeature> >Feature_container;
// The container instance that the accessors, reader and writer below use.
68 Feature_container _feature_list;
// Ordering test between feature1 and feature2 (the signature and some of the
// return statements are on lines not visible in this excerpt).
// First rule: applies only when neither feature has type gene or contig.
74 if ((feature1.type!=
"gene") && (feature2.type!=
"gene") && (feature1.type!=
"contig") && (feature2.type!=
"contig"))
// Features with different Parent values are ordered by comparing the Parent
// strings with operator<.
// NOTE(review): if attributes is the std::map declared earlier, at() throws
// std::out_of_range when a feature has no Parent key - confirm that every
// feature reaching this branch carries one.
76 if (feature1.attributes.at(
"Parent") != feature2.attributes.at(
"Parent"))
77 return feature1.attributes.at(
"Parent") < feature2.attributes.at(
"Parent");
// Second rule: compare start coordinates. The value returned for the
// strictly-smaller case is on a line not shown here.
79 if (feature1.start < feature2.start)
// Equal starts: the result is true when feature1 is the longer feature.
81 else if (feature1.start == feature2.start)
82 return (feature1.length()>feature2.length());
// Accessor returning an iterator to the first feature of the first sequence
// in _feature_list, i.e. the entry with the smallest key in map order.
// Only that one sequence is addressed, not the whole collection.
// NOTE(review): _feature_list.begin()->first is dereferenced unconditionally,
// which is undefined behaviour on an empty map - confirm that callers check
// for emptiness first.
109 SequenceFeatures_iter
112 return _feature_list[_feature_list.begin()->first].begin();
// Accessor returning the past-the-end iterator of the feature vector that
// belongs to the first sequence in _feature_list (smallest key in map order).
// NOTE(review): _feature_list.begin()->first is dereferenced unconditionally,
// which is undefined behaviour on an empty map - confirm that callers check
// for emptiness first.
115 SequenceFeatures_iter
118 return _feature_list[_feature_list.begin()->first].end();
// Returns an iterator to the first feature stored under seq_name.
// The lookup uses operator[] on the map, so asking for a sequence name that
// is not present inserts an empty feature vector for it as a side effect.
127 SequenceFeatures_iter
128 begin(
const std::string &seq_name)
130 return _feature_list[seq_name].begin();
// Returns the past-the-end iterator of the features stored under seq_name.
// The lookup uses operator[] on the map, so asking for a sequence name that
// is not present inserts an empty feature vector for it as a side effect.
138 SequenceFeatures_iter
139 end(
const std::string &seq_name)
141 return _feature_list[seq_name].end();
// Appends feature to the vector kept for seq_id; operator[] creates the map
// entry first when seq_id has not been seen before.
156 _feature_list[seq_id].push_back(feature);
// True when _feature_list holds no sequence entry at all. An entry whose
// feature vector is empty still makes this false.
172 return _feature_list.empty();
// Out-of-class definition of the default constructor of
// SequenceFeatures<FeatureType>; its body is not visible in this excerpt.
180 template<
typename FeatureType>
181 SequenceFeatures<FeatureType>::SequenceFeatures()
// Out-of-class definition of the destructor of
// SequenceFeatures<FeatureType>; its body is not visible in this excerpt.
187 template<
typename FeatureType>
188 SequenceFeatures<FeatureType>::~SequenceFeatures()
// Member definition (signature, loop head and several statements are on
// lines not visible here) that reads the file named by gff_f line by line,
// splits each line on tab characters and stores one feature per line in
// _feature_list under its sequence identifier.
195 template<
typename FeatureType>
199 Feature_container::iterator cont_it, cont_it_end;
200 std::ifstream gff_F(gff_f);
// Columns are separated by tab characters only.
204 typedef boost::tokenizer< boost::char_separator<char> > t_tokenizer;
205 boost::char_separator<char> sep(
"\t");
// The tokenizer is constructed on a one-blank placeholder string;
// presumably it is re-assigned to each input line on a line not shown.
206 t_tokenizer tok(std::string(
" "), sep);
207 t_tokenizer::iterator it;
208 std::string attribute, tmp_string;
209 std::vector<std::string> attribute_list;
210 std::vector<SingleSequenceFeature>::iterator feature_it;
216 std::string last_id =
"";
219 getline(gff_F, line);
// Empty lines and lines starting with the hash character are tested here;
// the action taken is not visible (presumably they are skipped).
220 if (line.empty() || (line[0] ==
'#'))
// Find the feature vector of this sequence, creating an empty one on first
// sight of seq_id.
228 cont_it = _feature_list.find(seq_id);
229 if (cont_it==_feature_list.end())
231 cont_it=_feature_list.insert(std::pair<std::string, std::vector<SingleSequenceFeature> >(seq_id, std::vector<SingleSequenceFeature>())).first;
// feature_it addresses the last element of that vector; presumably a fresh
// element was appended on a line not shown, otherwise end()-1 on an empty
// vector would be invalid - TODO confirm.
237 feature_it=cont_it->second.end()-1;
238 feature_it->seq_id=seq_id;
239 feature_it->source = *(++it);
240 feature_it->type = *(++it);
// start and end are stored as the file value minus one; the writer in this
// file adds the one back on output.
// NOTE(review): the subtraction happens on a size_t, so a file value of 0
// wraps around to a huge number - confirm inputs never contain 0 here.
241 feature_it->start = boost::lexical_cast<
size_t>(*(++it))-1;
242 feature_it->end = boost::lexical_cast<
size_t>(*(++it))-1;
// Marker for a missing score (the condition selecting it is not shown).
// NOTE(review): numeric_limits<double>::min() is the smallest positive
// normalised double, not the most negative value. It works as a marker only
// because the writer compares against the same constant.
244 feature_it->score = std::numeric_limits<double>::min();
246 feature_it->score = boost::lexical_cast<
double>(*it);
// Only the first character of the strand column is kept.
247 feature_it->strand=(*(++it))[0];
// Marker for a missing phase (the condition selecting it is not shown).
249 feature_it->phase = std::numeric_limits<short>::min();
251 feature_it->phase = boost::lexical_cast<
short>(*it);
// The attribute column is cut at every semicolon and equals sign (runs of
// separators are merged) and then consumed as alternating key, value items.
253 tmp_string = *(++it);
254 boost::split(attribute_list, tmp_string, boost::is_any_of(
";="), boost::algorithm::token_compress_on);
255 n_elements = attribute_list.size();
256 for (i=0; i<n_elements; i+=2)
// Empty keys are ignored.
// NOTE(review): when the split yields an odd number of items and the last
// one is not empty (for example a key without an equals sign),
// attribute_list[i+1] indexes one past the end - confirm, and consider
// guarding with i+1 < n_elements.
258 if (!attribute_list[i].empty())
259 feature_it->attributes[attribute_list[i]] = attribute_list[i+1];
// Member definition (signature and some statements are on lines not visible
// here) that writes every feature of every sequence in _feature_list to the
// file named by out_f, one feature per record with tab-separated columns.
268 template<
typename FeatureType>
272 std::ofstream out_F(out_f);
// Header line of the output.
// NOTE(review): the directive is written without a version number, whereas
// the GFF3 specification expects a version after it - confirm intended.
273 out_F <<
"##gff-version" << std::endl;
274 std::map<std::string, std::vector<SingleSequenceFeature> >::iterator it,it_end=_feature_list.end();
275 std::vector<SingleSequenceFeature>::iterator source_it, source_it_end;
276 std::map<std::string, std::string>::iterator attr_it,attr_it_end;
// Outer loop: one pass per sequence, in map key order.
277 for (it=_feature_list.begin(); it!=it_end; ++it)
279 source_it_end=it->second.end();
// Inner loop: the features of that sequence in stored order.
280 for (source_it=it->second.begin(); source_it!=source_it_end; ++source_it)
// start and end are written plus one, undoing the minus one that the reader
// in this file applies when loading.
282 out_F << source_it->seq_id <<
"\t" << source_it->source <<
"\t" << source_it->type <<
"\t"<< source_it->start+1 <<
"\t"<< source_it->end+1 <<
"\t";
// A score equal to the missing-score marker gets alternative output on a
// line not shown; otherwise the numeric score is written.
283 if (source_it->score==std::numeric_limits<double>::min())
286 out_F << source_it->score;
287 out_F <<
"\t"<< source_it->strand<<
"\t";
// Same scheme for the phase column and its missing-phase marker.
288 if (source_it->phase==std::numeric_limits<short>::min())
291 out_F<< source_it->phase <<
"\t";
// Attributes are written as key=value, each pair followed by a semicolon,
// in the key order of the std::map.
292 attr_it_end=source_it->attributes.end();
293 for (attr_it=source_it->attributes.begin(); attr_it!=attr_it_end; ++attr_it)
294 out_F << attr_it->first <<
"=" <<attr_it->second<<
";";
// Member definition (signature and several statements are on lines not
// visible here) that walks the feature vector of every sequence and inserts
// new features of type intron covering the gap between two stored features.
303 template<
typename FeatureType>
307 std::map<std::string, std::vector<SingleSequenceFeature> >::iterator it,it_end=_feature_list.end();
313 for (it=_feature_list.begin(); it!=it_end; ++it)
// vec is a reference, so insertions below modify the stored vector itself.
315 std::vector<SingleSequenceFeature> &vec = it->second;
// vec.size() is re-read on every pass, so elements inserted below are
// included in the iteration range.
317 for (i=0; i<vec.size(); ++i)
// mRNA and exon features are handled separately; what happens for an mRNA
// is on a line not shown.
319 if (vec[i].type==
"mRNA")
321 else if (vec[i].type==
"exon")
// Current feature starts after vec[last]: the intron runs from one past the
// end of vec[last] to one before the start of vec[i].
329 if (vec[i].start>vec[last].start)
331 new_feat.start=vec[last].end+1;
332 new_feat.end = vec[i].start-1;
333 new_feat.type=
"intron";
// Otherwise the roles are swapped: the intron runs from one past the end of
// vec[i] to one before the start of vec[last].
337 new_feat.start=vec[i].end+1;
338 new_feat.end=vec[last].start-1;
339 new_feat.type=
"intron";
// The ID is the Parent value followed by a fixed suffix, so every intron of
// the same parent receives the same ID. operator[] creates an empty Parent
// entry if new_feat has none; where new_feat gets its attributes is not
// visible here.
341 new_feat.attributes[
"ID"]=new_feat.attributes[
"Parent"]+
":intron";
// Inserting at position i moves the current feature and everything after it
// one slot to the right and may reallocate the vector.
// NOTE(review): the adjustment of i and last after the insertion is on lines
// not shown - confirm that the moved exon is not processed a second time.
342 vec.insert(vec.begin()+i,new_feat);