1 #ifndef BFG_COMPACTED_DBG_HPP 2 #define BFG_COMPACTED_DBG_HPP 16 #include <unordered_map> 17 #include <unordered_set> 24 #include "BlockedBloomFilter.hpp" 26 #include "File_Parser.hpp" 27 #include "FASTX_Parser.hpp" 28 #include "GFA_Parser.hpp" 30 #include "KmerHashTable.hpp" 31 #include "KmerIterator.hpp" 32 #include "KmerStream.hpp" 34 #include "minHashIterator.hpp" 35 #include "RepHash.hpp" 36 #include "TinyVector.hpp" 43 #define MASK_CONTIG_ID (0xffffffff00000000) 44 #define MASK_CONTIG_TYPE (0x80000000) 45 #define MASK_CONTIG_POS (0x7fffffff) 46 #define RESERVED_ID (0xffffffff) 162 nb_bits_unique_kmers_bf(14), nb_bits_non_unique_kmers_bf(14), read_chunksize(64),
163 build(false), update(false), clipTips(false), deleteIsolated(false), useMercyKmers(false),
164 outputGFA(true), verbose(false) {}
203 template<
typename Unitig_data_t,
typename Graph_data_t =
void>
296 template<
typename Unitig_data_t =
void,
typename Graph_data_t =
void>
300 "Type of data associated with vertices of class CompactedDBG must be void (no data) or a class extending class CDBG_Data_t");
302 typedef Unitig_data_t U;
303 typedef Graph_data_t G;
307 template<
typename U,
typename G,
bool is_const>
friend class UnitigMap;
308 template<
typename U,
typename G,
bool is_const>
friend class unitigIterator;
309 template<
typename U,
typename G,
bool is_const>
friend class neighborIterator;
318 CompactedDBG(
const int kmer_length = DEFAULT_K,
const int minimizer_length = DEFAULT_G);
/**
 * Simplify the graph.
 * NOTE(review): only the declaration is visible here; the parameter semantics
 * below are read off the parameter names and must be confirmed against the
 * implementation.
 * @param delete_short_isolated_unitigs presumably requests the removal of short isolated unitigs (default: true).
 * @param clip_short_tips presumably requests the clipping of short tips (default: true).
 * @param verbose presumably enables information messages (default: false).
 * @return a boolean status; its exact meaning is defined by the implementation.
 */
bool simplify(const bool delete_short_isolated_unitigs = true,
              const bool clip_short_tips = true,
              const bool verbose = false);
404 bool write(
const string& output_filename,
const size_t nb_threads = 1,
const bool GFA_output =
true,
const bool verbose =
false)
const;
414 bool read(
const string& input_filename,
const bool verbose =
false);
442 UnitigMap<U, G> findUnitig(
const char* s,
const size_t pos,
const size_t len);
456 vector<pair<size_t, UnitigMap<U, G>>> searchSequence(
const string& seq,
const bool exact,
const bool insertion,
const bool deletion,
457 const bool substitution,
const bool or_exclusive_match =
false)
const;
466 bool add(
const string& seq,
const bool verbose =
false);
487 bool merge(
const CompactedDBG& o,
const size_t nb_threads = 1,
const bool verbose =
false);
499 bool merge(
const vector<CompactedDBG>& v,
const size_t nb_threads = 1,
const bool verbose =
false);
509 const_iterator begin()
const;
519 const_iterator end()
const;
524 size_t length()
const;
526 size_t nbKmers()
const;
536 inline int getK()
const {
return k_; }
541 inline size_t size()
const {
return v_unitigs.size() + v_kmers.size() + h_kmers_ccov.size(); }
546 inline G*
getData() {
return data.getData(); }
551 inline const G*
getData()
const {
return data.getData(); }
555 bool annotateSplitUnitigs(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
557 pair<size_t, size_t> splitAllUnitigs();
558 pair<size_t, size_t> getSplitInfoAllUnitigs()
const;
560 inline size_t joinUnitigs(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1) {
562 return joinUnitigs_<is_void<U>::value>(v_joins, nb_threads);
565 bool mergeData(
const CompactedDBG<U, G>& o,
const size_t nb_threads = 1,
const bool verbose =
false);
566 bool mergeData(
CompactedDBG<U, G>&& o,
const size_t nb_threads = 1,
const bool verbose =
false);
570 bool filter(
const CDBG_Build_opt& opt,
const size_t nb_unique_kmers,
const size_t nb_non_unique_kmers);
571 bool construct(
const CDBG_Build_opt& opt,
const size_t nb_unique_minimizers,
const size_t nb_non_unique_minimizers);
573 bool addUnitigSequenceBBF(
const Kmer km,
const string& seq,
const size_t pos_match_km,
const size_t len_match_km, LockGraph& lck_g);
575 size_t findUnitigSequenceBBF(
Kmer km,
string& s,
bool& isIsolated, vector<Kmer>& l_ignored_km_tip);
576 bool bwStepBBF(
const Kmer km,
Kmer& front,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
577 bool fwStepBBF(
const Kmer km,
Kmer& end,
char& c,
bool& has_no_neighbor, vector<Kmer>& l_ignored_km_tip,
const bool check_fp_cand =
true)
const;
579 inline size_t find(
const preAllocMinHashIterator<RepHash>& it_min_h)
const {
581 const int pos = it_min_h.getPosition();
582 return (hmap_min_unitigs.find(Minimizer(&it_min_h.s[pos]).rep()) != hmap_min_unitigs.end() ? 0 : pos - it_min_h.p);
585 UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false);
586 const_UnitigMap<U, G> find(
const char* s,
const size_t pos_km,
const minHashIterator<RepHash>& it_min,
const bool extremities_only =
false)
const;
590 vector<const_UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false)
const;
591 vector<const_UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false)
const;
593 vector<UnitigMap<U, G>> findPredecessors(
const Kmer& km,
const bool extremities_only =
false);
594 vector<UnitigMap<U, G>> findSuccessors(
const Kmer& km,
const size_t limit = 4,
const bool extremities_only =
false);
597 UnitigMap<U, G> findUnitig(
const Kmer& km,
const char* s,
const size_t pos,
const preAllocMinHashIterator<RepHash>& it_min_h);
598 UnitigMap<U, G> findUnitig(
const char* s,
const size_t pos,
const size_t len,
const minHashIterator<RepHash>& it_min);
600 bool addUnitig(
const string& str_unitig,
const size_t id_unitig);
601 bool addUnitig(
const string& str_unitig,
const size_t id_unitig,
const size_t id_unitig_r,
const size_t is_short_r);
602 void swapUnitigs(
const bool isShort,
const size_t id_a,
const size_t id_b);
604 bool mergeUnitig(
const string& seq,
const bool verbose =
false);
605 bool annotateSplitUnitig(
const string& seq,
const bool verbose =
false);
606 bool annotateSplitUnitig(
const string& seq, LockGraph& lck_g,
const bool verbose =
false);
608 template<
bool is_
void>
614 template<
bool is_
void>
/**
 * Delete a unitig. This overload only participates in overload resolution
 * when the template argument is_void is false.
 * NOTE(review): callers presumably pass is_void<U>::value as template
 * argument, as joinUnitigs() does for joinUnitigs_ — confirm.
 * @param isShort presumably selects the container of short unitigs — confirm.
 * @param isAbundant presumably selects the container of abundant k-mers — confirm.
 * @param id_unitig is the identifier of the unitig to delete.
 * @param delete_data defaults to true; presumably controls the deletion of the associated data.
 */
template<bool is_void>
typename std::enable_if<!is_void, void>::type deleteUnitig_(const bool isShort,
                                                            const bool isAbundant,
                                                            const size_t id_unitig,
                                                            const bool delete_data = true);

/**
 * Delete a unitig. This overload only participates in overload resolution
 * when the template argument is_void is true. Same parameters as the overload
 * above.
 */
template<bool is_void>
typename std::enable_if<is_void, void>::type deleteUnitig_(const bool isShort,
                                                           const bool isAbundant,
                                                           const size_t id_unitig,
                                                           const bool delete_data = true);
/**
 * Extract a unitig. This overload only participates in overload resolution
 * when the template argument is_void is false.
 * NOTE(review): parameter roles are inferred from their names — confirm.
 * @param pos_v_unitigs is a position in v_unitigs that the function may update.
 * @param nxt_pos_insert_v_unitigs is an insertion position in v_unitigs that the function may update.
 * @param v_unitigs_sz is a size that the function may update (presumably the size of v_unitigs).
 * @param v_kmers_sz is a size that the function may update (presumably the size of v_kmers).
 * @param sp is a vector of pairs of integers (read-only), presumably the intervals to extract.
 * @return a boolean; its exact meaning is defined by the implementation.
 */
template<bool is_void>
typename std::enable_if<!is_void, bool>::type extractUnitig_(size_t& pos_v_unitigs,
                                                             size_t& nxt_pos_insert_v_unitigs,
                                                             size_t& v_unitigs_sz,
                                                             size_t& v_kmers_sz,
                                                             const vector<pair<int,int>>& sp);

/**
 * Extract a unitig. This overload only participates in overload resolution
 * when the template argument is_void is true. Same parameters as the overload
 * above.
 */
template<bool is_void>
typename std::enable_if<is_void, bool>::type extractUnitig_(size_t& pos_v_unitigs,
                                                            size_t& nxt_pos_insert_v_unitigs,
                                                            size_t& v_unitigs_sz,
                                                            size_t& v_kmers_sz,
                                                            const vector<pair<int,int>>& sp);
632 pair<size_t, size_t> extractAllUnitigs();
634 template<
bool is_
void>
635 typename std::enable_if<!is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
637 template<
bool is_
void>
638 typename std::enable_if<is_void, size_t>::type joinUnitigs_(vector<Kmer>* v_joins =
nullptr,
const size_t nb_threads = 1);
640 void createJoinHT(vector<Kmer>* v_joins, KmerHashTable<Kmer>& joins,
const size_t nb_threads)
const;
642 void check_fp_tips(KmerHashTable<bool>& ignored_km_tips);
643 size_t removeUnitigs(
bool rmIsolated,
bool clipTips, vector<Kmer>& v);
645 size_t joinTips(
string filename_MBBF_uniq_kmers,
const size_t nb_threads = 1,
const bool verbose =
false);
646 vector<Kmer> extractMercyKmers(BlockedBloomFilter& bf_uniq_km,
const size_t nb_threads = 1,
const bool verbose =
false);
648 void writeGFA(
const string& graphfilename,
const size_t nb_threads = 1)
const;
649 void writeFASTA(
const string& graphfilename)
const;
651 void readGFA(
const string& graphfilename);
652 void readFASTA(
const string& graphfilename);
654 template<
bool is_
void>
655 typename std::enable_if<!is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
656 template<
bool is_
void>
657 typename std::enable_if<is_void, void>::type writeGFA_sequence_(GFA_Parser& graph, KmerHashTable<size_t>& idmap)
const;
663 void setKmerGmerLength(
const int kmer_length,
const int minimizer_length);
666 vector<Minimizer> test(
const Minimizer minz)
const;
// Compile-time constants of the graph; where they are used is not visible in
// this part of the file.
static const int tiny_vector_sz = 2;

// NOTE(review): the minimum and the maximum abundance limits have the same
// value (15) — confirm this is intended.
static const int min_abundance_lim = 15;
static const int max_abundance_lim = 15;
677 typedef KmerHashTable<CompressedCoverage_t<U>> h_kmers_ccov_t;
678 typedef MinimizerHashTable_2Val hmap_min_unitigs_t;
680 typedef typename hmap_min_unitigs_t::iterator hmap_min_unitigs_iterator;
681 typedef typename hmap_min_unitigs_t::const_iterator hmap_min_unitigs_const_iterator;
683 vector<Unitig<U>*> v_unitigs;
684 vector<pair<Kmer, CompressedCoverage_t<U>>> v_kmers;
686 hmap_min_unitigs_t hmap_min_unitigs;
688 h_kmers_ccov_t h_kmers_ccov;
690 BlockedBloomFilter bf;
695 #include "CompactedDBG.tcc" bool operator!=(const neighborIterator &o) const
Inequality operator: check whether two neighborIterator objects are different.
bool deleteIsolated
Remove short isolated unitigs (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:152
void merge(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Merge the data of a sub-unitig B to the data of a sub-unitig A.
Definition: CompactedDBG.hpp:240
unitigIterator< U, G, true > const_iterator
A constant iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:312
Iterator for the neighbors (predecessors or successors) of a reference unitig used in a UnitigMap obj...
Definition: NeighborIterator.hpp:34
string outFilenameBBF
String containing the name of a Bloom filter file that will be generated by CompactedDBG<U, G>::filter.
Definition: CompactedDBG.hpp:136
Iterator for the unitigs of a Compacted de Bruijn graph.
Definition: UnitigIterator.hpp:36
size_t k
Length of k-mers (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:146
unitigIterator< U, G, false > iterator
An iterator for the unitigs of the graph.
Definition: CompactedDBG.hpp:311
string prefixFilenameOut
Prefix for the name of the file to which the graph must be written.
Definition: CompactedDBG.hpp:157
bool outputGFA
Boolean indicating if the graph is written to a GFA file (true) or if the unitigs are written to a FA...
Definition: CompactedDBG.hpp:155
size_t nb_bits_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least once in the FASTA/FASTQ/GFA files of CDBG_Bu...
Definition: CompactedDBG.hpp:132
Unitig_data_ptr_t getData() const
Get a pointer to the data associated with the reference unitig used in the mapping.
int getK() const
Return the length of k-mers of the graph.
Definition: CompactedDBG.hpp:536
G * getData()
Return a pointer to the graph data.
Definition: CompactedDBG.hpp:546
const G * getData() const
Return a constant pointer to the graph data.
Definition: CompactedDBG.hpp:551
bool build
Boolean indicating if the graph must be built.
Definition: CompactedDBG.hpp:148
vector< string > filename_seq_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:138
Most members of this structure are parameters for CompactedDBG<U, G>::build(), except for: ...
Definition: CompactedDBG.hpp:124
Represent a Compacted de Bruijn graph.
Definition: CompactedDBG.hpp:297
UnitigMap type interface.
The unitigIterator type interface.
size_t nb_bits_non_unique_kmers_bf
Number of Bloom filter bits per k-mer occurring at least twice in the FASTA/FASTQ/GFA files of CDBG_B...
Definition: CompactedDBG.hpp:133
bool verbose
Print information messages during execution if true.
Definition: CompactedDBG.hpp:126
void extract(const UnitigMap< Unitig_data_t, Graph_data_t > &um_src, bool last_extraction)
Extract data corresponding to a sub-unitig of a unitig A.
Definition: CompactedDBG.hpp:256
Interface to store and manipulate k-mers.
Definition: Kmer.hpp:40
bool clipTips
Clip short tips (length < 2k) of the graph (not used by CompactedDBG<U, G>::build).
Definition: CompactedDBG.hpp:151
bool operator==(const neighborIterator &o) const
Equality operator: check whether two neighborIterator objects are the same.
Contains all the information for the mapping of a k-mer or a sequence to a unitig of a Compacted de Br...
Definition: NeighborIterator.hpp:12
string filename_graph_in
String containing the name of a GFA file to read using CompactedDBG<U, G>::read.
Definition: CompactedDBG.hpp:159
If data are to be associated with the unitigs of the compacted de Bruijn graph, those data must be wr...
Definition: CompactedDBG.hpp:204
void clear(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest)
Clear the data associated with a unitig.
Definition: CompactedDBG.hpp:213
size_t size() const
Return the number of unitigs in the graph.
Definition: CompactedDBG.hpp:541
Interface for the class Kmer:
bool isInvalid() const
Return a boolean indicating if the graph is invalid (wrong input parameters/files, error occurring during a method, etc.).
Definition: CompactedDBG.hpp:531
vector< string > filename_ref_in
Vector of strings, each string is the name of a FASTA/FASTQ/GFA file to use for the graph constructio...
Definition: CompactedDBG.hpp:139
bool update
Boolean indicating if the graph must be updated.
Definition: CompactedDBG.hpp:149
size_t read_chunksize
Number of reads a thread can read and process at a time.
Definition: CompactedDBG.hpp:128
size_t nb_threads
Number of threads to use for building the graph.
Definition: CompactedDBG.hpp:127
void concat(const UnitigMap< Unitig_data_t, Graph_data_t > &um_dest, const UnitigMap< Unitig_data_t, Graph_data_t > &um_src)
Join data of two unitigs which are going to be concatenated.
Definition: CompactedDBG.hpp:228
bool useMercyKmers
Keep in the graph low coverage k-mers (cov=1) connecting tips of the graph.
Definition: CompactedDBG.hpp:153
string inFilenameBBF
String containing the name of a Bloom filter file that is generated by CompactedDBG<U, G>::filter.
Definition: CompactedDBG.hpp:135
string serialize(const const_UnitigMap< Unitig_data_t, Graph_data_t > &um_src) const
Serialize the data to a GFA-formatted string.
Definition: CompactedDBG.hpp:266