BFGraph
CompressedSequence.hpp
1 #ifndef BFG_COMPRESSED_SEQUENCE_HPP
2 #define BFG_COMPRESSED_SEQUENCE_HPP
3 
4 #include <cstring>
5 #include <string>
6 #include <stdint.h>
7 
8 #include "Kmer.hpp"
9 
10 /* Short description:
11  * - Compress a DNA string by using 2 bits per base instead of 8
12  * - Easily get the DNA string back from the compressed format
13  * - Create a sequence from a kmer
14  * - Get kmers from a sequence
15  * - Get length of a sequence
16  * - Easily get length of matching substring from a given string
17  * */
19 
20  public:
21 
24 
26  CompressedSequence& operator=(const CompressedSequence& o);
27 
28  // Move constructors
30  CompressedSequence& operator=(CompressedSequence&& o);
31 
32  explicit CompressedSequence(const char *s); // Build a compressed sequence from a C string of DNA bases; explicit, so no implicit conversion. Presumably s is NUL-terminated -- TODO confirm in the .cpp
33  explicit CompressedSequence(const string& s); // Build a compressed sequence from a DNA string; explicit, so no implicit conversion
34  explicit CompressedSequence(const Kmer& km); // Build a compressed sequence from a k-mer; explicit, so no implicit conversion
35 
36  const char operator[](size_t index) const;
37 
38  void clear();
39 
40  size_t size() const;
41 
42  void toString(char *s, const size_t offset, const size_t length) const; // Get DNA text back from the compressed form into the caller-supplied buffer s. NOTE(review): presumably `length` bases starting at base `offset`; buffer size and terminator handling are not visible here -- confirm in the .cpp
43  string toString(const size_t offset, const size_t length) const; // Same sub-range selection, returned as a new string. NOTE(review): presumably `length` bases starting at base `offset` -- confirm in the .cpp
44 
45 
46  inline string toString() const { const size_t whole = size(); return toString(0, whole); } // Convenience overload: the full sequence, i.e. toString(0, size())
47  inline void toString(char *s) const { const size_t whole = size(); toString(s, 0, whole); } // Convenience overload: the full sequence into s, i.e. toString(s, 0, size())
48 
49  Kmer getKmer(size_t offset) const; // Extract a k-mer from this sequence. NOTE(review): presumably the k-mer starting at base `offset` -- confirm in the .cpp
50 
51  bool compareKmer(const size_t offset, const Kmer& km) const; // NOTE(review): presumably true when the k-mer at base `offset` equals km -- confirm in the .cpp
52 
53  // void setSequence(const CompressedSequence &o, size_t length, size_t offset = 0, bool reversed=false);
54  void setSequence(const CompressedSequence& o, const size_t start, const size_t length, const size_t offset = 0, const bool reversed = false);
55  void setSequence(const char *s, const size_t length, const size_t offset = 0, const bool reversed = false);
56  void setSequence(const string& s, const size_t length, const size_t offset = 0, const bool reversed=false);
57  void setSequence(const Kmer& km, const size_t length, const size_t offset = 0, const bool reversed=false);
58 
59  void reserveLength(const size_t new_length);
60 
61  CompressedSequence rev() const;
62 
63  size_t jump(const char *s, const size_t i, int pos, const bool reversed) const;
64  size_t bw_jump(const char *s, const size_t i, int pos, const bool reversed) const;
65 
66  int64_t findKmer(const Kmer& km) const;
67 
68  bool isShort() const;
69 
70  private:
71 
72  inline size_t round_to_bytes(const size_t len) const { return (len + 3) >> 2; } // Bytes needed for len bases at 2 bits per base (4 bases per byte): len / 4 rounded up
73  void _resize_and_copy(const size_t new_cap, const size_t copy_limit);
74  void initShort();
75  void setSize(const size_t size);
76 
77  size_t capacity() const;
78  const unsigned char *getPointer() const;
79 
80  static const uint8_t shortMask = 1;
81 
82  union { // Storage for the sequence: two alternative layouts overlaid on the same bytes
83  // Pointer-based layout: length, capacity and a pointer to the packed bases.
84  struct {
85  uint32_t _length; // size of sequence
86  uint32_t _capacity; // capacity of array allocated in bytes
87  unsigned char *_data; // 0-based 2bit compressed dna string
88  unsigned char padding[16]; // filler bytes; NOTE(review): with an 8-byte pointer this struct is 4 + 4 + 8 + 16 = 32 bytes, the same as asBits -- confirm for other targets
89  } asPointer;
90  // In-place layout: one size byte followed by 31 payload bytes (32 bytes in total).
91  struct {
92  uint8_t _size; // NOTE(review): presumably one bit of this byte is the shortMask flag, leaving 7 bits for the length (values 0..127) -- confirm in the .cpp
93  unsigned char _arr[31]; // 31 bytes at 4 bases per byte: can store 124 nucleotides
94  } asBits;
95  };
96 };
97 
98 #endif // BFG_COMPRESSED_SEQUENCE_HPP
Definition: Kmer.hpp:31
Definition: CompressedSequence.hpp:18