SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_embl.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <string_view>
18 #include <vector>
19 
36 #include <seqan3/std/algorithm>
37 #include <seqan3/std/ranges>
38 
39 namespace seqan3
40 {
71 {
72 public:
76  format_embl() noexcept = default; // Default constructor; defaulted.
77  format_embl(format_embl const &) noexcept = default; // Copy constructor; defaulted.
78  format_embl & operator=(format_embl const &) noexcept = default; // Copy assignment; defaulted.
79  format_embl(format_embl &&) noexcept = default; // Move constructor; defaulted.
80  format_embl & operator=(format_embl &&) noexcept = default; // Move assignment; defaulted.
81  ~format_embl() noexcept = default; // Destructor; defaulted.
83 
85  static inline std::vector<std::string> file_extensions // The valid file extensions for this format; mutable on purpose, so callers may modify this value.
86  {
87  { "embl" },
88  };
89 
90 protected:
92  template <typename stream_type, // constraints checked by file
93  typename seq_legal_alph_type, bool seq_qual_combined,
94  typename seq_type, // other constraints checked inside function
95  typename id_type,
96  typename qual_type>
97  void read_sequence_record(stream_type & stream,
99  seq_type & sequence,
100  id_type & id,
101  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
102  {
103  auto stream_view = views::istreambuf(stream);
104  auto stream_it = std::ranges::begin(stream_view);
105 
106  std::string idbuffer;
107  std::ranges::copy(stream_view | views::take_until_or_throw(is_cntrl || is_blank),
108  std::cpp20::back_inserter(idbuffer));
109  if (idbuffer != "ID")
110  throw parse_error{"An entry has to start with the code word ID."};
111 
112  if constexpr (!detail::decays_to_ignore_v<id_type>)
113  {
114  if (options.embl_genbank_complete_header)
115  {
116  std::ranges::copy(idbuffer | views::char_to<std::ranges::range_value_t<id_type>>, std::cpp20::back_inserter(id));
117  do
118  {
119  std::ranges::copy(stream_view | views::take_until_or_throw(is_char<'S'>)
120  | views::char_to<std::ranges::range_value_t<id_type>>,
121  std::cpp20::back_inserter(id));
122  id.push_back(*stream_it);
123  ++stream_it;
124  } while (*stream_it != 'Q');
125  id.pop_back(); // remove 'S' from id
126  idbuffer = "SQ";
127  }
128  else
129  {
130  // ID
131  detail::consume(stream_view | views::take_until(!is_blank));
132 
133  // read id
134  if (options.truncate_ids)
135  {
136  std::ranges::copy(stream_view | views::take_until_or_throw(is_blank || is_char<';'> || is_cntrl)
137  | views::char_to<std::ranges::range_value_t<id_type>>,
138  std::cpp20::back_inserter(id));
139  }
140  else
141  {
142  std::ranges::copy(stream_view | views::take_until_or_throw(is_char<';'>)
143  | views::char_to<std::ranges::range_value_t<id_type>>,
144  std::cpp20::back_inserter(id));
145  }
146  }
147  }
148 
149  // Jump to sequence
150  if (idbuffer !="SQ")
151  {
152  do
153  {
154  detail::consume(stream_view | views::take_until_or_throw(is_char<'S'>));
155  ++stream_it;
156  } while (*stream_it != 'Q');
157  }
158  detail::consume(stream_view | views::take_line_or_throw); //Consume line with infos to sequence
159 
160  // Sequence
161  auto constexpr is_end = is_char<'/'> ;
162  if constexpr (!detail::decays_to_ignore_v<seq_type>)
163  {
164  auto seq_view = stream_view | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
165  | views::take_until_or_throw(is_end); // until //
166 
167  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
168  std::ranges::copy(seq_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
169  {
170  if (!is_legal_alph(c))
171  {
172  throw parse_error{std::string{"Encountered an unexpected letter: "} +
173  is_legal_alph.msg +
174  " evaluated to false on " +
175  detail::make_printable(c)};
176  }
177  return c;
178  })
179  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
180  std::cpp20::back_inserter(sequence));
181  }
182  else
183  {
184  detail::consume(stream_view | views::take_until(is_end));
185  }
186  //Jump over // and cntrl
187  ++stream_it;
188  ++stream_it;
189  ++stream_it;
190  }
191 
193  template <typename stream_type, // constraints checked by file
194  typename seq_type, // other constraints checked inside function
195  typename id_type,
196  typename qual_type>
197  void write_sequence_record(stream_type & stream,
198  sequence_file_output_options const & options,
199  seq_type && sequence,
200  id_type && id,
201  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
202  {
203  seqan3::detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
204 
205  [[maybe_unused]] size_t sequence_size = 0;
206 
207  if constexpr (!detail::decays_to_ignore_v<seq_type>)
208  sequence_size = std::ranges::distance(sequence);
209 
210  // ID
211  if constexpr (detail::decays_to_ignore_v<id_type>)
212  {
213  throw std::logic_error{"The ID field may not be set to ignore when writing embl files."};
214  }
215  else
216  {
217  if (ranges::empty(id)) //[[unlikely]]
218  throw std::runtime_error{"The ID field may not be empty when writing embl files."};
219 
220  if (options.embl_genbank_complete_header)
221  {
222  stream_it.write_range(id);
223  }
224  else
225  {
226  stream_it.write_range(std::string_view{"ID "});
227  stream_it.write_range(id);
228  stream_it.write_range(std::string_view{"; "});
229  stream_it.write_number(sequence_size);
230  stream_it.write_range(std::string_view{" BP.\n"});
231  }
232  }
233 
234  // Sequence
235  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
236  {
237  throw std::logic_error{"The SEQ field may not be set to ignore when writing embl files."};
238  }
239  else
240  {
241  if (std::ranges::empty(sequence)) //[[unlikely]]
242  throw std::runtime_error{"The SEQ field may not be empty when writing embl files."};
243 
244  // write beginning of sequence record
245  stream_it.write_range(std::string_view{"SQ Sequence "});
246  stream_it.write_number(sequence_size);
247  stream_it.write_range(std::string_view{" BP;\n"});
248 
249  // write sequence in chunks of 60 bp's with a space after 10 bp's
250  auto char_sequence = sequence | views::to_char;
251  auto it = std::ranges::begin(char_sequence);
252  size_t written_chars{0};
253  uint8_t chunk_size{10u};
254 
255  while (it != std::ranges::end(char_sequence))
256  {
257  auto current_end = it;
258  size_t steps = std::ranges::advance(current_end, chunk_size, std::ranges::end(char_sequence));
259 
260  using subrange_t = std::ranges::subrange<decltype(it), decltype(it), std::ranges::subrange_kind::sized>;
261  it = stream_it.write_range(subrange_t{it, current_end, chunk_size - steps});
262  stream_it = ' ';
263  written_chars += chunk_size;
264 
265  if (written_chars % 60 == 0)
266  {
267  stream_it.write_number(written_chars);
268  stream_it.write_end_of_line(options.add_carriage_return);
269  }
270  }
271 
272  // fill last line
273  auto characters_in_last_line = sequence_size % 60;
274  auto number_of_padding_needed = 65 - characters_in_last_line - characters_in_last_line / chunk_size;
275  stream_it.write_range(views::repeat_n(' ', number_of_padding_needed));
276  stream_it.write_number(sequence_size);
277  stream_it.write_end_of_line(options.add_carriage_return);
278 
279  // write end-of-record-symbol
280  stream_it.write_range(std::string_view{"//"});
281  stream_it.write_end_of_line(options.add_carriage_return);
282  }
283  }
284 };
285 
286 } // namespace seqan3
Adaptations of algorithms from the Ranges TS.
Provides seqan3::views::char_to.
The EMBL format.
Definition: format_embl.hpp:71
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_embl.hpp:86
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_embl.hpp:197
format_embl() noexcept=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_embl.hpp:97
Provides seqan3::dna5, container aliases and string literals.
constexpr auto is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:163
constexpr auto is_digit
Checks whether c is a digit character.
Definition: predicate.hpp:287
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition: predicate.hpp:83
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
constexpr auto is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:110
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:65
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (throws if the end of the underlying range is reached first).
Definition: take_until.hpp:624
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
constexpr auto take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (or the end of the underlying range is reached).
Definition: take_until.hpp:610
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:69
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
constexpr auto take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-line).
Definition: take_line.hpp:90
The generic concept for a sequence.
Provides various utility functions.
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
Provides seqan3::views::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
Provides various utility functions.
Provides various transformation traits used by the range module.
Adaptations of concepts from the Ranges TS.
Provides seqan3::views::repeat_n.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_input_options.
Provides seqan3::sequence_file_output_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
bool embl_genbank_complete_header
Read the complete_header into the seqan3::field::id for embl or genbank format.
Definition: input_options.hpp:30
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:28
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommended.
Definition: output_options.hpp:39
bool embl_genbank_complete_header
Complete header given for embl or genbank.
Definition: output_options.hpp:42
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to_char.