% [r12728]: cdk-fingerprint-paper / trunk / paper / bmc_article.tex
% bmc_article.tex    445 lines (340 with data), 17.2 kB
%% BioMed_Central_Tex_Template_v1.05
%% %
% bmc_article.tex ver: 1.05 %
% %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% LaTeX template for BioMed Central %%
%% journal article submissions %%
%% %%
%% <27 January 2006> %%
%% %%
%% %%
%% Uses: %%
%% cite.sty, url.sty, bmc_article.cls %%
%% ifthen.sty, multicol.sty %%
%% %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For instructions on how to fill out this Tex template %%
%% document please refer to Readme.pdf and the instructions for %%
%% authors page on the biomed central website %%
%% http://www.biomedcentral.com/info/authors/ %%
%% %%
%% Please do not use \input{...} to include other tex files. %%
%% Submit your LaTeX manuscript as one .tex document. %%
%% %%
%% All additional figures and files should be attached %%
%% separately and not embedded in the \TeX\ document itself. %%
%% %%
%% BioMed Central currently use the MiKTeX distribution %%
%% (TeX for Windows) of TeX and LaTeX. This is available from %%
%% http://www.miktex.org %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}[1995/12/01]
\documentclass[10pt]{bmc_article}
% Load packages
\usepackage{cite} % Make references as [1-4], not [1,2,3,4]
\usepackage{url} % Formatting web addresses
\usepackage{ifthen} % Conditional
\usepackage{multicol} %Columns
% \usepackage[utf8]{inputenc} %unicode support
\usepackage[applemac]{inputenc} %applemac support if unicode package fails
% \usepackage[latin1]{inputenc} %UNIX support if unicode package fails
\urlstyle{rm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% If you wish to display your graphics for %%
%% your own use using includegraphic or %%
%% includegraphics, then comment out the %%
%% following two lines of code. %%
%% NB: These lines *must* be included when %%
%% submitting to BMC. %%
%% All figure files must be submitted as %%
%% separate graphics through the BMC %%
%% submission process, not included in the %%
%% submitted article. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\def\includegraphic{}
\def\includegraphics{}
\setlength{\topmargin}{0.0cm}
\setlength{\textheight}{21.5cm}
\setlength{\oddsidemargin}{0cm}
\setlength{\textwidth}{16.5cm}
\setlength{\columnsep}{0.6cm}
\newboolean{publ}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% You may change the following style settings %%
%% Should you wish to format your article %%
%% in a publication style for printing out and %%
%% sharing with colleagues, but ensure that %%
%% before submitting to BMC that the style is %%
%% returned to the Review style setting. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Review style settings
\newenvironment{bmcformat}{\begin{raggedright}\baselineskip20pt\sloppy\setboolean{publ}{false}}{\end{raggedright}\baselineskip20pt\sloppy}
% Publication style settings
%\newenvironment{bmcformat}{\fussy\setboolean{publ}{true}}{\fussy}
% Begin ...
\begin{document}
\begin{bmcformat}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{A Benchmark Study of the CDK Fingerprints}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors here %%
%% %%
%% Ensure \and is entered between all but %%
%% the last two authors. This will be %%
%% replaced by a comma in the final article %%
%% %%
%% Ensure there are no trailing spaces at %%
%% the ends of the lines %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{Rajarshi Guha\correspondingauthor$^{1}$%
\email{Rajarshi Guha\correspondingauthor - rguha@indiana.edu}%
\and
Jane E Doe\correspondingauthor$^2$%
\email{Jane E Doe\correspondingauthor - jane.e.doe@cambridge.co.uk}
and
John RS Smith$^3$%
\email{John RS Smith - john.RS.Smith@cambridge.co.uk}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors' addresses here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address{%
\iid(1)School of Informatics, Indiana University, Bloomington, IN 47408\\
\iid(2)Department of Zoology, Cambridge, Waterloo Road, London, UK\\
\iid(3)Marine Ecology Department, Institute of Marine Sciences Kiel, %
D\"{u}sternbrooker Weg 20, 24105 Kiel, Germany
}%
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Abstract begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the Instructions for %%
%% authors on http://www.biomedcentral.com %%
%% and change the section headings %%
%% accordingly. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
% Do not use inserted blank lines (ie \\) until main body of text.
\paragraph*{Background:} Text for this section of the abstract.
\paragraph*{Results:} Text for this section of the abstract \ldots
\paragraph*{Conclusions:} Text for this section of the abstract \ldots
\end{abstract}
\ifthenelse{\boolean{publ}}{\begin{multicols}{2}}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Main Body begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the instructions for %%
%% authors on: %%
%% http://www.biomedcentral.com/info/authors%%
%% and change the section headings %%
%% accordingly. %%
%% %%
%% See the Results and Discussion section %%
%% for details on how to create sub-sections%%
%% %%
%% use \cite{...} to cite references %%
%% \cite{koon} and %%
%% \cite{oreg,khar,zvai,xjon,schn,pond} %%
%% \nocite{smith,marg,hunn,advi,koha,mouse}%%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%
%% Background %%
%%
\section*{Background}
Binary fingerprints are bit string representations of molecular
structures and come in a variety of types. In the most common
type, each bit of the fingerprint corresponds to a specific
substructural feature (say an aromatic ring or an aldehyde
group). Other forms of fingerprints include hashed fingerprints
and atom environment fingerprints. While these representations
were initially designed for similarity searching in databases,
they have become an important component of virtual screening
pipelines. That this is possible is due to the ``similarity
principle''\cite{Martin:2002ab}, underlying much of virtual
screening in drug discovery scenarios, which states that similar
molecules will have similar activities. While there have been many
counter-examples\cite{Maggiora:2006aa}, this approach has been
fruitful in a number of cases.
A number of fingerprint implementations are available from
commercial vendors and a few from academic groups. The Chemistry
Development Kit (CDK) is an Open Source Java
library\cite{Steinbeck:2003bh,Steinbeck:2006aa} for
cheminformatics and provides several fingerprint
implementations. More specifically, it provides two structural key
type fingerprints and two hashed fingerprints. While the library
has been used in a number of projects, there has been no formal
testing of how well the CDK fingerprints perform in a virtual
screening scenario. It should be noted that the two structural key
fingerprints are implementations of well studied schemes
(MACCS\cite{Durant:2002aa} and EState keys) and their performance
is well known. However, the two hashed fingerprints, while based on
the well known Daylight specification, have never been formally
benchmarked. The goal of this study is to compare the performance
of the CDK hashed fingerprints to other well known fingerprint
types.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Results and Discussion %%
%%
\section*{Results and Discussion}
\subsection*{Enrichment curves}
\subsection*{Information content}
\subsection*{Effect of fingerprint size}
\subsection*{Influence of ring systems}
%%%%%%%%%%%%%%%%%%%%%%
\section*{Conclusions}
Text for this section \ldots
%%%%%%%%%%%%%%%%%%
\section*{Methods}
\subsection*{Fingerprints}
Summary of how CDK FP's are calculated.
In addition to the path based fingerprints described above, we
also considered structural key type fingerprints. In these types,
each bit position corresponds explicitly to a substructural
feature. In this study we employed the MACCS 166 bit
keys\cite{Durant:2002aa} (implemented in the CDK) and the BCI 1052
bit keys\cite{Barnard:1997aa}.
Finally, we also considered atom environment fingerprints,
specifically the extended connectivity fingerprints (ECFP) as
implemented in Pipeline Pilot (Scitegic, Inc.). These types of
fingerprints characterize each atom in terms of the environment
around it, usually going up to 6 or 8 bonds from the atom in
question. The ECFPs characterize the atoms using features such as
hydrogen bonding donor capability, lipophilicity and so on. In
this study we considered the ECFP-6 type, which considers atoms up
to 6 bonds away from a central atom.
\subsection*{Measures of effectiveness}
Use of enrichment curves, enrichment factors. Note that they are
not the best of
measures\cite{Bender:2005aa,Truchon:2007aa,Nicholls:2008aa,Clark:2008aa}.
Use of ROC curves and AUC.
\subsection*{Time efficiency}
\subsection*{Benchmark Datasets}
A number of datasets have been employed for benchmarking
fingerprint methods including ZINC\cite{Irwin:2005aa} and the MDL
Drug Discovery Report (MDDR). For the purposes of this study we
employed the 17 virtual screening benchmark datasets described by
Rohrer and Baumann\cite{Rohrer:2008ab}, collectively termed the
Maximum Unbiased Validation (MUV) datasets. These datasets are
derived from PubChem bioassays, each dataset corresponding to a
specific bioassay. Examples of the targets considered by these
datasets include FXIa inhibitors, FXIIa inhibitors, SF1 and HIV
RT-RNase inhibitors. More broadly, the datasets cover several
target classes including proteases, GPCRs, kinases and nuclear
receptors. These datasets were constructed to specifically avoid
the problem encountered with other datasets, namely, that many
datasets lend an unfair advantage to 2D methods over 3D
methods. More specifically, the actives in each of the datasets
exhibit a wide variety of scaffold classes, thus avoiding the
problems of analog bias\cite{Good:2008aa} and artificial
enrichment\cite{Verdonk:2004aa}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Authors' contributions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgements}
\ifthenelse{\boolean{publ}}{\small}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% The Bibliography %%
%% %%
%% Bmc_article.bst will be used to %%
%% create a .BBL file for submission, which includes %%
%% XML structured for BMC. %%
%% %%
%% %%
%% Note that the displayed Bibliography will not %%
%% necessarily be rendered by Latex exactly as specified %%
%% in the online Instructions for Authors. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\ifthenelse{\boolean{publ}}{\footnotesize}{\small}
\bibliographystyle{bmc_article} % Style BST file
\bibliography{bmc_article} } % Bibliography file (usually '*.bib' )
%%%%%%%%%%%
\ifthenelse{\boolean{publ}}{\end{multicols}}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Figures %%
%% %%
%% NB: this is for captions and %%
%% Titles. All graphics must be %%
%% submitted separately and NOT %%
%% included in the Tex document %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Do not use \listoffigures as most will be included as separate files
\section*{Figures}
\subsection*{Figure 1 - Sample figure title}
A short description of the figure content
should go here.
\subsection*{Figure 2 - Sample figure title}
Figure legend text.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Tables %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Use of \listoftables is discouraged.
%%
\section*{Tables}
\subsection*{Table 1 - Sample table title}
Here is an example of a \emph{small} table in \LaTeX\ using
\verb|\tabular{...}|. This is where the description of the table
should go. \par \mbox{}
\par
\mbox{
\begin{tabular}{|c|c|c|}
\hline \multicolumn{3}{|c|}{My Table}\\ \hline
A1 & B2 & C3 \\ \hline
A2 & ... & .. \\ \hline
A3 & .. & . \\ \hline
\end{tabular}
}
\subsection*{Table 2 - Sample table title}
Large tables are attached as separate files but should
still be described here.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Additional Files %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Additional Files}
\subsection*{Additional file 1 --- Sample additional file title}
Additional file descriptions text (including details of how to
view the file, if it is in a non-standard format or the file extension). This might
refer to a multi-page table or a figure.
\subsection*{Additional file 2 --- Sample additional file title}
Additional file descriptions text.
\end{bmcformat}
\end{document}