RDKit
Open-source cheminformatics and machine learning.
RGroupData.h
Go to the documentation of this file.
1//
2// Copyright (C) 2017 Novartis Institutes for BioMedical Research
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#ifndef RGROUP_DATA
11#define RGROUP_DATA
12
13#include "../RDKitBase.h"
14#include "RGroupUtils.h"
19#include <boost/scoped_ptr.hpp>
20#include <set>
21#include <vector>
22#include <regex>
23
24namespace RDKit {
25
26//! A single rgroup attached to a given core.
27struct RGroupData {
28 boost::shared_ptr<RWMol> combinedMol;
29 std::vector<boost::shared_ptr<ROMol>> mols; // All the mols in the rgroup
30 std::vector<std::string> smilesVect; // used for rgroup equivalence
31 std::string
32 smiles; // smiles for all the mols in the rgroup (with attachments)
33 std::set<int> attachments; // core attachment points
34 std::unique_ptr<ExplicitBitVect>
35 fingerprint; // fingerprint for score calculations
36 std::vector<int> fingerprintOnBits;
37 bool is_hydrogen = false;
38 bool single_fragment = true;
40 bool is_linker = false;
41 bool labelled = false;
42
43 private:
44 RGroupData(const RGroupData &rhs);
45
46 public:
48
49 void add(boost::shared_ptr<ROMol> newMol,
50 const std::vector<int> &rlabel_attachments) {
51 // some fragments can be add multiple times if they are cyclic
52 for (auto &mol : mols) {
53 if (newMol.get() == mol.get()) {
54 return;
55 }
56 }
57
58 if (mols.size() > 0) {
59 // don't add extraneous hydrogens
60 if (isMolHydrogen(*newMol)) {
61 return;
62 }
63 if (is_hydrogen) {
64 // if we are adding a heavy attachment to hydrogens, discard the
65 // hydrogen and start over
66 combinedMol = nullptr;
67 smilesVect.clear();
68 attachments.clear();
69 mols.clear();
70 }
71 }
72
73 labelled = false;
74 std::copy(rlabel_attachments.begin(), rlabel_attachments.end(),
75 std::inserter(attachments, attachments.end()));
76
77 mols.push_back(newMol);
78 static const std::regex remove_isotopes_regex("\\[\\d*\\*\\]");
79 // remove the isotope labels from the SMILES string to avoid
80 // that identical R-group are perceived as different when
81 // MCS alignment is not used (NoAlign flag)
82 smilesVect.push_back(std::regex_replace(MolToSmiles(*newMol, true),
83 remove_isotopes_regex, "*"));
84 if (!combinedMol.get()) {
85 combinedMol = boost::shared_ptr<RWMol>(new RWMol(*mols[0].get()));
86 } else {
87 ROMol *m = combineMols(*combinedMol.get(), *newMol.get());
88 single_fragment = false;
89 m->updateProps(*combinedMol.get());
90 combinedMol.reset(new RWMol(*m));
91 delete m;
92 }
93 smiles = getSmiles();
95 computeIsHydrogen();
97 }
98
99 std::map<int, int> getNumBondsToRlabels() const {
100 std::map<int, int> rlabelsUsedCount;
101
102 for (ROMol::AtomIterator atIt = combinedMol->beginAtoms();
103 atIt != combinedMol->endAtoms(); ++atIt) {
104 Atom *atom = *atIt;
105 int rlabel;
106 if (atom->getPropIfPresent<int>(RLABEL, rlabel)) {
107 rlabelsUsedCount[rlabel] += 1;
108 }
109 }
110 return rlabelsUsedCount;
111 }
112
113 std::string toString() const {
114 auto attachmentString = std::accumulate(
115 attachments.cbegin(), attachments.cend(), std::string(),
116 [](std::string s, int a) {
117 return s.empty() ? std::to_string(a)
118 : std::move(s) + ',' + std::to_string(a);
119 });
120 std::stringstream ss;
121 ss << "RG " << attachmentString << " " << getSmiles();
122 return ss.str();
123 }
124
125 private:
126 void computeIsHydrogen() { // is the rgroup all Hs
127 for (const auto &mol : mols) {
128 if (!isMolHydrogen(*mol)) {
129 is_hydrogen = false;
130 return;
131 }
132 }
133 is_hydrogen = true;
134 }
135
136 bool isMolHydrogen(ROMol &mol) {
137 for (ROMol::AtomIterator atIt = mol.beginAtoms(); atIt != mol.endAtoms();
138 ++atIt) {
139 auto atom = *atIt;
140 if (atom->getAtomicNum() > 1) {
141 return false;
142 } else if (atom->getAtomicNum() == 0 &&
143 !atom->hasProp(SIDECHAIN_RLABELS)) {
144 return false;
145 }
146 }
147 return true;
148 }
149
150 //! compute the canonical smiles for the attachments (bug: removes dupes since
151 //! we are using a set...)
152 std::string getSmiles() const {
153 std::string s;
154 for (const auto &it : smilesVect) {
155 if (s.length()) {
156 s += ".";
157 }
158 s += it;
159 }
160 return s;
161 }
162};
163} // namespace RDKit
164
165#endif
The class for representing atoms.
Definition: Atom.h:68
bool getPropIfPresent(const std::string &key, T &res) const
Definition: RDProps.h:121
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
RDKIT_RDGENERAL_EXPORT const std::string internalRgroupSmiles
Std stuff.
Definition: Abbreviations.h:19
RDKIT_CHEMTRANSFORMS_EXPORT ROMol * combineMols(const ROMol &mol1, const ROMol &mol2, RDGeom::Point3D offset=RDGeom::Point3D(0, 0, 0))
Combines two molecules to create a new one.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_RGROUPDECOMPOSITION_EXPORT const std::string RLABEL
RDKIT_RGROUPDECOMPOSITION_EXPORT const std::string SIDECHAIN_RLABELS
A single rgroup attached to a given core.
Definition: RGroupData.h:27
boost::shared_ptr< RWMol > combinedMol
Definition: RGroupData.h:28
std::vector< boost::shared_ptr< ROMol > > mols
Definition: RGroupData.h:29
std::set< int > attachments
Definition: RGroupData.h:33
std::vector< int > fingerprintOnBits
Definition: RGroupData.h:36
std::map< int, int > getNumBondsToRlabels() const
Definition: RGroupData.h:99
bool multiple_attachments
Definition: RGroupData.h:39
std::string toString() const
Definition: RGroupData.h:113
std::vector< std::string > smilesVect
Definition: RGroupData.h:30
std::string smiles
Definition: RGroupData.h:32
void add(boost::shared_ptr< ROMol > newMol, const std::vector< int > &rlabel_attachments)
Definition: RGroupData.h:49
std::unique_ptr< ExplicitBitVect > fingerprint
Definition: RGroupData.h:35