SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_fasta.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <string_view>
18 #include <vector>
19 
41 #include <seqan3/std/algorithm>
42 #include <seqan3/std/ranges>
43 
44 namespace seqan3
45 {
46 
80 {
81 public:
85  format_fasta() noexcept = default; //!< Defaulted.
86  format_fasta(format_fasta const &) noexcept = default; //!< Defaulted.
87  format_fasta & operator=(format_fasta const &) noexcept = default; //!< Defaulted.
88  format_fasta(format_fasta &&) noexcept = default; //!< Defaulted.
89  format_fasta & operator=(format_fasta &&) noexcept = default; //!< Defaulted.
90  ~format_fasta() noexcept = default; //!< Defaulted.
92 
//!\brief The valid file extensions for this format; note that you can modify this value.
static inline std::vector<std::string> file_extensions{"fasta", "fa", "fna", "ffn", "faa", "frn", "fas"};
104 
105 protected:
107  template <typename stream_type, // constraints checked by file
108  typename legal_alph_type, bool seq_qual_combined,
109  typename seq_type, // other constraints checked inside function
110  typename id_type,
111  typename qual_type>
112  void read_sequence_record(stream_type & stream,
114  seq_type & sequence,
115  id_type & id,
116  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
117  {
118  auto stream_view = views::istreambuf(stream);
119 
120  // ID
121  read_id(stream_view, options, id);
122 
123  // Sequence
124  read_seq(stream_view, options, sequence);
125  }
126 
128  template <typename stream_type, // constraints checked by file
129  typename seq_type, // other constraints checked inside function
130  typename id_type,
131  typename qual_type>
132  void write_sequence_record(stream_type & stream,
133  sequence_file_output_options const & options,
134  seq_type && sequence,
135  id_type && id,
136  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
137  {
138  seqan3::detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
139 
140  // ID
141  if constexpr (detail::decays_to_ignore_v<id_type>)
142  {
143  throw std::logic_error{"The ID field may not be set to ignore when writing FASTA files."};
144  }
145  else
146  {
147  if (std::ranges::empty(id)) //[[unlikely]]
148  throw std::runtime_error{"The ID field may not be empty when writing FASTA files."};
149 
150  write_id(stream_it, options, id);
151  }
152 
153  // Sequence
154  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
155  {
156  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTA files."};
157  }
158  else
159  {
160  if (std::ranges::empty(sequence)) //[[unlikely]]
161  throw std::runtime_error{"The SEQ field may not be empty when writing FASTA files."};
162 
163  write_seq(stream_it, options, sequence);
164  }
165  }
166 
167 private:
170  template <typename stream_view_t,
171  typename seq_legal_alph_type, bool seq_qual_combined,
172  typename id_type>
173  void read_id(stream_view_t & stream_view,
175  id_type & id)
176  {
177  auto const is_id = is_char<'>'> || is_char<';'>;
178 
179  if (!is_id(*begin(stream_view)))
180  throw parse_error{std::string{"Expected to be on beginning of ID, but "} + is_id.msg +
181  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
182 
183  // read id
184  if constexpr (!detail::decays_to_ignore_v<id_type>)
185  {
186  if (options.truncate_ids)
187  {
188  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
189  auto it = stream_view.begin();
190  auto e = stream_view.end();
191  for (; (it != e) && (is_id || is_blank)(*it); ++it)
192  {}
193 
194  bool at_delimiter = false;
195  for (; it != e; ++it)
196  {
197  if ((is_cntrl || is_blank)(*it))
198  {
199  at_delimiter = true;
200  break;
201  }
202  id.push_back(assign_char_to(*it, std::ranges::range_value_t<id_type>{}));
203  }
204 
205  if (!at_delimiter)
206  throw unexpected_end_of_input{"FastA ID line did not end in newline."};
207 
208  for (; (it != e) && ((!is_char<'\n'>)(*it)); ++it)
209  {}
210 
211  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
212 
213  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
214  | views::take_until_or_throw(is_cntrl || is_blank) // read ID until delimiter…
215  | views::char_to<std::ranges::range_value_t<id_type>>,
216  std::cpp20::back_inserter(id)); // … ^A is old delimiter
217 
218  // consume rest of line
219  detail::consume(stream_view | views::take_line_or_throw);
220  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
221 
222  }
223  else
224  {
225  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
226  auto it = stream_view.begin();
227  auto e = stream_view.end();
228  for (; (it != e) && (is_id || is_blank)(*it); ++it)
229  {}
230 
231  bool at_delimiter = false;
232  for (; it != e; ++it)
233  {
234  if ((is_char<'\n'>)(*it))
235  {
236  at_delimiter = true;
237  break;
238  }
239  id.push_back(assign_char_to(*it, std::ranges::range_value_t<id_type>{}));
240  }
241 
242  if (!at_delimiter)
243  throw unexpected_end_of_input{"FastA ID line did not end in newline."};
244 
245  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
246 
247  std::ranges::copy(stream_view | views::take_line_or_throw // read line
248  | std::views::drop_while(is_id || is_blank) // skip leading >
249  | views::char_to<std::ranges::range_value_t<id_type>>,
250  std::cpp20::back_inserter(id));
251  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
252  }
253  }
254  else
255  {
256  detail::consume(stream_view | views::take_line_or_throw);
257  }
258  }
259 
261  template <typename stream_view_t,
262  typename seq_legal_alph_type, bool seq_qual_combined,
263  typename seq_type>
264  void read_seq(stream_view_t & stream_view,
265  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const &,
266  seq_type & seq)
267  {
268  auto constexpr is_id = is_char<'>'> || is_char<';'>;
269 
270  if constexpr (!detail::decays_to_ignore_v<seq_type>)
271  {
272  auto constexpr not_in_alph = !is_in_alphabet<seq_legal_alph_type>;
273 
274  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
275  auto it = stream_view.begin();
276  auto e = stream_view.end();
277  for (; (it != e) && ((!is_id)(*it)); ++it)
278  {
279  if ((is_space || is_digit)(*it))
280  continue;
281  else if (not_in_alph(*it))
282  {
283  throw parse_error{std::string{"Encountered an unexpected letter: "} +
284  not_in_alph.msg +
285  " evaluated to true on " +
286  detail::make_printable(*it)};
287  }
288 
289  seq.push_back(assign_char_to(*it, std::ranges::range_value_t<seq_type>{}));
290  }
291 
292  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
293 
294  std::ranges::copy(stream_view | views::take_until(is_id) // until next header (or end)
295  | std::views::filter(!(is_space || is_digit))// ignore whitespace and numbers
296  | std::views::transform([not_in_alph] (char const c)
297  {
298  if (not_in_alph(c))
299  {
300  throw parse_error{std::string{"Encountered an unexpected letter: "} +
301  not_in_alph.msg +
302  " evaluated to false on " +
303  detail::make_printable(c)};
304  }
305  return c;
306  }) // enforce legal alphabet
307  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
308  std::cpp20::back_inserter(seq));
309  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
310  }
311  else
312  {
313  detail::consume(stream_view | views::take_until(is_id));
314  }
315  }
316 
318  template <typename stream_it_t, typename id_type>
319  void write_id(stream_it_t & stream_it, sequence_file_output_options const & options, id_type && id)
320  {
321  if (options.fasta_legacy_id_marker)
322  stream_it = ';';
323  else
324  stream_it = '>';
325 
326  if (options.fasta_blank_before_id)
327  stream_it = ' ';
328 
329  stream_it.write_range(id);
330  stream_it.write_end_of_line(options.add_carriage_return);
331  }
332 
334  template <typename stream_it_t, typename seq_type>
335  void write_seq(stream_it_t & stream_it, sequence_file_output_options const & options, seq_type && seq)
336  {
337  auto char_sequence = seq | views::to_char;
338 
339  if (options.fasta_letters_per_line > 0)
340  {
341  /* Using `views::interleave` is probably the way to go but that needs performance-tuning.*/
342  auto it = std::ranges::begin(char_sequence);
343  auto end = std::ranges::end(char_sequence);
344 
345  while (it != end)
346  {
347  /* Note: This solution is slightly suboptimal for sized but non-random-access ranges.*/
348  auto current_end = it;
349  size_t steps = std::ranges::advance(current_end, options.fasta_letters_per_line, end);
350  using subrange_t = std::ranges::subrange<decltype(it), decltype(it), std::ranges::subrange_kind::sized>;
351  it = stream_it.write_range(subrange_t{it, current_end, (options.fasta_letters_per_line - steps)});
352  stream_it.write_end_of_line(options.add_carriage_return);
353  }
354  }
355  else
356  {
357  stream_it.write_range(char_sequence);
358  stream_it.write_end_of_line(options.add_carriage_return);
359  }
360  }
361 };
362 
363 } // namespace seqan3
Adaptations of algorithms from the Ranges TS.
Provides aliases for qualified.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The FastA format.
Definition: format_fasta.hpp:80
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fasta.hpp:95
format_fasta() noexcept=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_fasta.hpp:112
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_fasta.hpp:132
Provides seqan3::dna5, container aliases and string literals.
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
The generic concept for a sequence.
Provides various utility functions.
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
Provides seqan3::views::istreambuf.
Provides seqan3::views::join.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
Provides various utility functions.
Provides various transformation traits used by the range module.
Adaptations of concepts from the Ranges TS.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_input_options.
Provides seqan3::sequence_file_output_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
Provides seqan3::views::take.
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to_char.