RDKit
Open-source cheminformatics and machine learning.
FingerprintGenerator.h
Go to the documentation of this file.
1//
2// Copyright (C) 2018 Boran Adas, Google Summer of Code
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10
11#include <RDGeneral/export.h>
12#ifndef RD_FINGERPRINTGEN_H_2018_05
13#define RD_FINGERPRINTGEN_H_2018_05
14
18#include <utility>
19#include <vector>
20#include <memory>
21#include <cstdint>
22
23namespace RDKit {
24class ROMol;
25
27 using atomToBitsType = std::vector<std::vector<std::uint64_t>>;
29 std::map<std::uint64_t,
30 std::vector<std::pair<std::uint32_t, std::uint32_t>>>;
31 using bitPathsType = std::map<std::uint64_t, std::vector<std::vector<int>>>;
32 using atomCountsType = std::vector<unsigned int>;
33
34 // numAtoms long
35 atomToBitsType *atomToBits = nullptr;
36
37 // bitId -> vector of (atomId, radius) for morgan
38 // bitId -> (atom1, atom2) for atom pairs
39 bitInfoMapType *bitInfoMap = nullptr;
40
41 // rdkit fp
42 // maps bitId -> vector of bond paths
43 bitPathsType *bitPaths = nullptr;
44
45 // number of paths that set bits for each atom, must have the same size as
46 // atom count for molecule
47 atomCountsType *atomCounts = nullptr;
48
50 atomToBitsHolder.reset(new atomToBitsType);
51 atomToBits = atomToBitsHolder.get();
52 }
54 bitInfoMapHolder.reset(new bitInfoMapType);
55 bitInfoMap = bitInfoMapHolder.get();
56 }
58 bitPathsHolder.reset(new bitPathsType);
59 bitPaths = bitPathsHolder.get();
60 }
62 atomCountsHolder.reset(new atomCountsType);
63 atomCounts = atomCountsHolder.get();
64 }
65
66 private:
67 std::unique_ptr<atomToBitsType> atomToBitsHolder;
68 std::unique_ptr<bitInfoMapType> bitInfoMapHolder;
69 std::unique_ptr<bitPathsType> bitPathsHolder;
70 std::unique_ptr<atomCountsType> atomCountsHolder;
71};
72
73/*!
74 \brief Abstract base class that holds molecule-independent arguments that are
75 common amongst all fingerprint types; classes inherited from this hold
76 fingerprint-type-specific arguments
77
78 */
79template <typename OutputType>
81 : private boost::noncopyable {
82 public:
83 FingerprintArguments(bool countSimulation,
84 const std::vector<std::uint32_t> countBounds,
85 std::uint32_t fpSize,
86 std::uint32_t numBitsPerFeature = 1);
88 const std::vector<std::uint32_t> d_countBounds;
89 const std::uint32_t d_fpSize;
90 const std::uint32_t d_numBitsPerFeature;
91
92 /*!
93 \brief Returns the size of the fingerprint based on arguments
94
95 \return OutputType size of the fingerprint
96 */
97 virtual OutputType getResultSize() const = 0;
98
99 /**
100 \brief method that returns information string about the fingerprint specific
101 argument set and the arguments themselves
102
103 \return std::string information string
104 */
105 virtual std::string infoString() const = 0;
106
107 /**
108 \brief method that returns information string about common fingerprinting
109 arguments' values
110
111 \return std::string information string
112 */
113 std::string commonArgumentsString() const;
114
116};
117
118/*!
119 \brief abstract base class that holds atom-environments that will be hashed to
120 generate the fingerprint
121
122 */
123template <typename OutputType>
124class RDKIT_FINGERPRINTS_EXPORT AtomEnvironment : private boost::noncopyable {
125 public:
126 /*!
127 \brief calculates and returns the bit id to be set for this atom-environment
128
129 \param arguments Fingerprinting type specific molecule independent
130 arguments
131 \param atomInvariants Atom-invariants to be used during hashing
132 \param bondInvariants Bond-invariants to be used during hashing
133 \param hashResults if set, results will be ready to be modded
134
135 \return OutputType calculated bit id for this environment
136 */
137 virtual OutputType getBitId(FingerprintArguments<OutputType> *arguments,
138 const std::vector<std::uint32_t> *atomInvariants,
139 const std::vector<std::uint32_t> *bondInvariants,
141 const bool hashResults = false,
142 const std::uint64_t fpSize = 0) const = 0;
143
144 virtual ~AtomEnvironment() {}
145};
146
147/*!
148 \brief abstract base class that generates atom-environments from a molecule
149
150 */
151template <typename OutputType>
153 : private boost::noncopyable {
154 public:
155 /*!
156 \brief generate and return all atom-environments from a molecule
157
158 \param mol molecule to generate the atom-environments from
159 \param arguments fingerprint type specific molecule independent
160 arguments
161 \param fromAtoms atoms to be used during environment generation,
162 usage of this parameter depends on the implementation of different
163 fingerprint types
164 \param ignoreAtoms atoms to be ignored during environment generation,
165 usage of this parameter depends on the implementation of different
166 fingerprint types
167 \param confId which conformation to use during environment
168 generation, needed for some fingerprint types
169 \param additionalOutput contains pointers for additional outputs of
170 fingerprinting operation, usage depends on implementation of the fingerprint
171 type
172 \param atomInvariants atom invariants to be used during environment
173 generation, in some cases some of the hashing can be done during environment
174 generation so it is also passed here
175 \param bondInvariants bond invariants to be used during environment
176 generation, same as atomInvariants it might be needed
177 \param hashResults if set, results will be ready to be modded
178
179 \return std::vector<AtomEnvironment *> atom-environments generated from
180 this molecule
181 */
182 virtual std::vector<AtomEnvironment<OutputType> *> getEnvironments(
183 const ROMol &mol, FingerprintArguments<OutputType> *arguments,
184 const std::vector<std::uint32_t> *fromAtoms = nullptr,
185 const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
186 const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
187 const std::vector<std::uint32_t> *atomInvariants = nullptr,
188 const std::vector<std::uint32_t> *bondInvariants = nullptr,
189 const bool hashResults = false) const = 0;
190
191 /**
192 \brief method that returns information about this \c AtomEnvironmentGenerator
193 and its arguments if any
194
195 \return std::string information string
196 */
197 virtual std::string infoString() const = 0;
198
200};
201
202/*!
203 \brief abstract base class for atom invariants generators
204
205 */
207 : private boost::noncopyable {
208 public:
209 /*!
210 \brief get atom invariants from a molecule
211
212 \param mol molecule to generate the atom invariants for
213
214 \return std::vector<std::uint32_t> atom invariants generated for the given
215 molecule
216 */
217 virtual std::vector<std::uint32_t> *getAtomInvariants(
218 const ROMol &mol) const = 0;
219
220 /**
221 \brief method that returns information about this \c AtomInvariantsGenerator
222 and its arguments
223
224 \return std::string information string
225 */
226 virtual std::string infoString() const = 0;
227
229 virtual AtomInvariantsGenerator *clone() const = 0;
230};
231
232/*!
233 \brief abstract base class for bond invariants generators
234
235 */
237 : private boost::noncopyable {
238 public:
239 /*!
240 \brief get bond invariants from a molecule
241
242 \param mol molecule to generate the bond invariants for
243
244 \return std::vector<std::uint32_t> bond invariants generated for the given
245 molecule
246 */
247 virtual std::vector<std::uint32_t> *getBondInvariants(
248 const ROMol &mol) const = 0;
249
250 /**
251 \brief method that returns information about this \c BondInvariantsGenerator
252 and its arguments
253
254 \return std::string information string
255 */
256 virtual std::string infoString() const = 0;
257
259 virtual BondInvariantsGenerator *clone() const = 0;
260}; // class BondInvariantsGenerator
261
262/*!
263 \brief class that generates same fingerprint style for different output
264 formats
265
266 */
267template <typename OutputType>
269 : private boost::noncopyable {
270 FingerprintArguments<OutputType> *dp_fingerprintArguments;
271 AtomEnvironmentGenerator<OutputType> *dp_atomEnvironmentGenerator;
272 AtomInvariantsGenerator *dp_atomInvariantsGenerator;
273 BondInvariantsGenerator *dp_bondInvariantsGenerator;
274 const bool df_ownsAtomInvGenerator;
275 const bool df_ownsBondInvGenerator;
276
277 SparseIntVect<OutputType> *getFingerprintHelper(
278 const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
279 const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
280 const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
281 const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
282 const std::vector<std::uint32_t> *customBondInvariants = nullptr,
283 const std::uint64_t fpSize = 0) const;
284
285 public:
287 AtomEnvironmentGenerator<OutputType> *atomEnvironmentGenerator,
288 FingerprintArguments<OutputType> *fingerprintArguments,
289 AtomInvariantsGenerator *atomInvariantsGenerator = nullptr,
290 BondInvariantsGenerator *bondInvariantsGenerator = nullptr,
291 bool ownsAtomInvGenerator = false, bool ownsBondInvGenerator = false);
292
294
296 const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
297 const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
298 const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
299 const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
300 const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
301
303 const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
304 const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
305 const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
306 const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
307 const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
308
310 const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
311 const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
312 const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
313 const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
314 const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
315
317 const ROMol &mol, const std::vector<std::uint32_t> *fromAtoms = nullptr,
318 const std::vector<std::uint32_t> *ignoreAtoms = nullptr,
319 const int confId = -1, const AdditionalOutput *additionalOutput = nullptr,
320 const std::vector<std::uint32_t> *customAtomInvariants = nullptr,
321 const std::vector<std::uint32_t> *customBondInvariants = nullptr) const;
322
323 std::string infoString() const;
324};
325
327
328//! used to indicate errors for unimplemented fp types in convenience functions
330 : public std::exception {
331 public:
332 //! construct with an error message
333 UnimplementedFPException(const char *msg) : _msg(msg) {}
334 //! construct with an error message
335 UnimplementedFPException(std::string msg) : _msg(std::move(msg)) {}
336 //! get the error message
337 const char *what() const noexcept override { return _msg.c_str(); }
338 ~UnimplementedFPException() noexcept override = default;
339
340 private:
341 std::string _msg;
342};
343
344// convenience functions, fingerprint generation with default values
345
347 const ROMol &mol, FPType fPType);
348
350 FPType fPType);
351
353 const ROMol &mol, FPType fPType);
354
356 FPType fPType);
357
358RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint64_t> *> *
359getSparseCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
360
362 const std::vector<const ROMol *> molVector, FPType fPType);
363
364RDKIT_FINGERPRINTS_EXPORT std::vector<SparseIntVect<std::uint32_t> *>
365 *getCountFPBulk(const std::vector<const ROMol *> molVector, FPType fPType);
366
368 const std::vector<const ROMol *> molVector, FPType fPType);
369
370} // namespace RDKit
371
372#endif
a class for bit vectors that are densely occupied
abstract base class that generates atom-environments from a molecule
virtual std::vector< AtomEnvironment< OutputType > * > getEnvironments(const ROMol &mol, FingerprintArguments< OutputType > *arguments, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *atomInvariants=nullptr, const std::vector< std::uint32_t > *bondInvariants=nullptr, const bool hashResults=false) const =0
generate and return all atom-environments from a molecule
virtual std::string infoString() const =0
method that returns information about this AtomEnvironmentGenerator and its arguments, if any
abstract base class that holds atom-environments that will be hashed to generate the fingerprint
virtual OutputType getBitId(FingerprintArguments< OutputType > *arguments, const std::vector< std::uint32_t > *atomInvariants, const std::vector< std::uint32_t > *bondInvariants, const AdditionalOutput *AdditionalOutput, const bool hashResults=false, const std::uint64_t fpSize=0) const =0
calculates and returns the bit id to be set for this atom-environment
abstract base class for atom invariants generators
virtual std::string infoString() const =0
method that returns information about this AtomInvariantsGenerator and its arguments
virtual AtomInvariantsGenerator * clone() const =0
virtual std::vector< std::uint32_t > * getAtomInvariants(const ROMol &mol) const =0
get atom invariants from a molecule
abstract base class for bond invariants generators
virtual std::string infoString() const =0
method that returns information about this BondInvariantsGenerator and its arguments
virtual BondInvariantsGenerator * clone() const =0
virtual std::vector< std::uint32_t > * getBondInvariants(const ROMol &mol) const =0
get bond invariants from a molecule
Abstract base class that holds molecule independent arguments that are common amongst all fingerprint...
virtual std::string infoString() const =0
method that returns information string about the fingerprint specific argument set and the arguments ...
virtual OutputType getResultSize() const =0
Returns the size of the fingerprint based on arguments.
const std::vector< std::uint32_t > d_countBounds
FingerprintArguments(bool countSimulation, const std::vector< std::uint32_t > countBounds, std::uint32_t fpSize, std::uint32_t numBitsPerFeature=1)
std::string commonArgumentsString() const
method that returns information string about common fingerprinting arguments' values
const std::uint32_t d_numBitsPerFeature
class that generates same fingerprint style for different output formats
SparseIntVect< OutputType > * getSparseCountFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
std::string infoString() const
SparseIntVect< std::uint32_t > * getCountFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
SparseBitVect * getSparseFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
FingerprintGenerator(AtomEnvironmentGenerator< OutputType > *atomEnvironmentGenerator, FingerprintArguments< OutputType > *fingerprintArguments, AtomInvariantsGenerator *atomInvariantsGenerator=nullptr, BondInvariantsGenerator *bondInvariantsGenerator=nullptr, bool ownsAtomInvGenerator=false, bool ownsBondInvGenerator=false)
ExplicitBitVect * getFingerprint(const ROMol &mol, const std::vector< std::uint32_t > *fromAtoms=nullptr, const std::vector< std::uint32_t > *ignoreAtoms=nullptr, const int confId=-1, const AdditionalOutput *additionalOutput=nullptr, const std::vector< std::uint32_t > *customAtomInvariants=nullptr, const std::vector< std::uint32_t > *customBondInvariants=nullptr) const
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:28
used to indicate errors for unimplemented fp types in convenience functions
UnimplementedFPException(const char *msg)
construct with an error message
~UnimplementedFPException() noexcept override=default
UnimplementedFPException(std::string msg)
construct with an error message
const char * what() const noexcept override
get the error message
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:177
Std stuff.
Definition: Abbreviations.h:19
RDKIT_FINGERPRINTS_EXPORT SparseBitVect * getSparseFP(const ROMol &mol, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseBitVect * > * getSparseFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< ExplicitBitVect * > * getFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint64_t > * getSparseCountFP(const ROMol &mol, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint32_t > * > * getCountFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT std::vector< SparseIntVect< std::uint64_t > * > * getSparseCountFPBulk(const std::vector< const ROMol * > molVector, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT SparseIntVect< std::uint32_t > * getCountFP(const ROMol &mol, FPType fPType)
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)
std::vector< std::vector< std::uint64_t > > atomToBitsType
std::vector< unsigned int > atomCountsType
std::map< std::uint64_t, std::vector< std::vector< int > > > bitPathsType
std::map< std::uint64_t, std::vector< std::pair< std::uint32_t, std::uint32_t > > > bitInfoMapType