Work at SourceForge, help us to make it a better place! We have an immediate need for a Support Technician in our San Francisco or Denver office.

Close

[r15481]: cdk-taverna-paper / trunk / cdk-taverna / bmc_article.tex Maximize Restore History

Download this file

bmc_article.tex    837 lines (707 with data), 40.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
%% BioMed_Central_Tex_Template_v1.06
%% %
% bmc_article.tex ver: 1.06 %
% %
%%IMPORTANT: do not delete the first line of this template
%%It must be present to enable the BMC Submission system to
%%recognise this template!!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% LaTeX template for BioMed Central %%
%% journal article submissions %%
%% %%
%% <14 August 2007> %%
%% %%
%% %%
%% Uses: %%
%% cite.sty, url.sty, bmc_article.cls %%
%% ifthen.sty. multicol.sty %%
%% %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For instructions on how to fill out this Tex template %%
%% document please refer to Readme.pdf and the instructions for %%
%% authors page on the biomed central website %%
%% http://www.biomedcentral.com/info/authors/ %%
%% %%
%% Please do not use \input{...} to include other tex files. %%
%% Submit your LaTeX manuscript as one .tex document. %%
%% %%
%% All additional figures and files should be attached %%
%% separately and not embedded in the \TeX\ document itself. %%
%% %%
%% BioMed Central currently use the MikTex distribution of %%
%% TeX for Windows) of TeX and LaTeX. This is available from %%
%% http://www.miktex.org %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}[1995/12/01]
\documentclass[10pt]{bmc_article}
% Load packages
\usepackage{cite} % Make references as [1-4], not [1,2,3,4]
\usepackage{url} % Formatting web addresses
\usepackage{ifthen} % Conditional
\usepackage{multicol} %Columns
\usepackage[utf8]{inputenc} %unicode support
%\usepackage[applemac]{inputenc} %applemac support if unicode package fails
%\usepackage[latin1]{inputenc} %UNIX support if unicode package fails
\usepackage{graphicx}
\usepackage{color}
\usepackage[english]{babel}
\urlstyle{rm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% If you wish to display your graphics for %%
%% your own use using includegraphic or %%
%% includegraphics, then comment out the %%
%% following two lines of code. %%
%% NB: These line *must* be included when %%
%% submitting to BMC. %%
%% All figure files must be submitted as %%
%% separate graphics through the BMC %%
%% submission process, not included in the %%
%% submitted article. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\def\includegraphic{}
\def\includegraphics{}
\setlength{\topmargin}{0.0cm}
\setlength{\textheight}{21.5cm}
\setlength{\oddsidemargin}{0cm}
\setlength{\textwidth}{16.5cm}
\setlength{\columnsep}{0.6cm}
\newboolean{publ}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% You may change the following style settings %%
%% Should you wish to format your article %%
%% in a publication style for printing out and %%
%% sharing with colleagues, but ensure that %%
%% before submitting to BMC that the style is %%
%% returned to the Review style setting. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\remark}[1]{\marginpar[\sf\tiny#1\hfil$\rightarrow$]{\sf\tiny$\leftarrow$#1}}
\newcommand{\todo}[1]{\textcolor{red}{\#\#\#}\remark{Todo #1}}
%Review style settings
\newenvironment{bmcformat}{\begin{raggedright}\baselineskip20pt\sloppy\setboolean{publ}{false}}{\end{raggedright}\baselineskip20pt\sloppy}
%Publication style settings
%\newenvironment{bmcformat}{\fussy\setboolean{publ}{true}}{\fussy}
% Begin ...
\begin{document}
\begin{bmcformat}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{CDK-Taverna: An open workflow environment for cheminformatics}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors here %%
%% %%
%% Ensure \and is entered between all but %%
%% the last two authors. This will be %%
%% replaced by a comma in the final article %%
%% %%
%% Ensure there are no trailing spaces at %%
%% the ends of the lines %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{Thomas Kuhn$^{1}$%
\email{Thomas Kuhn - thomas.kuhn@fh-gelsenkirchen.de}%
\and
Egon L. Willighagen$^2$%
\email{Egon L. Willighagen - egon.willighagen@farmbio.uu.se}
\and
Achim Zielesny\correspondingauthor$^1$%
\email{Achim Zielesny\correspondingauthor - achim.zielesny@fh-gelsenkirchen.de}
and
Christoph Steinbeck\correspondingauthor$^{3}$%
\email{Christoph Steinbeck\correspondingauthor - steinbeck@ebi.ac.uk}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors' addresses here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address{%
\iid(1) Institute for Bioinformatics and Cheminformatics, University of Applied Sciences of Gelsenkirchen, Recklinghausen, Germany\\
\iid(2) Department of Pharmaceutical Biosciences, Uppsala University, Uppsala, Sweden\\
\iid(3) Chemoinformatics and Metabolism, European Bioinformatics Institute (EBI), Cambridge, UK
}%
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Abstract begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the Instructions for %%
%% authors on http://www.biomedcentral.com %%
%% and change the section headings %%
%% accordingly. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
% Do not use inserted blank lines (ie \\) until main body of text.
\paragraph*{Background:} Small molecules are of increasing interest
for bioinformatics in areas such as metabolomics and drug discovery.
The recent release of large open access chemistry
databases generates a demand for flexible tools to process
them and discover new knowledge. To freely support open science based on these data resources, it is desirable
for the processing tools to be open source and available for everyone.
\paragraph*{Results:} Here we describe a novel combination of the workflow engine
Taverna and the cheminformatics library Chemistry Development Kit (CDK)
resulting in an open source workflow solution for cheminformatics.
We have implemented more than 160 different workers to handle specific cheminformatics tasks. We describe
the applications of CDK-Taverna in various usage scenarios.
\paragraph*{Conclusions:} The combination of the workflow engine
Taverna and the Chemistry Development Kit provides the first open source cheminformatics workflow
solution for the biosciences. With the Taverna-community working towards a more powerful workflow engine and a more user-friendly user interface,
CDK-Taverna has the potential to become a free alternative to existing proprietary workflow tools.
\end{abstract}
\ifthenelse{\boolean{publ}}{\begin{multicols}{2}}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Main Body begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the instructions for %%
%% authors on: %%
%% http://www.biomedcentral.com/info/authors%%
%% and change the section headings %%
%% accordingly. %%
%% %%
%% See the Results and Discussion section %%
%% for details on how to create sub-sections%%
%% %%
%% use \cite{...} to cite references %%
%% \cite{koon} and %%
%% \cite{oreg,khar,zvai,xjon,schn,pond} %%
%% \nocite{smith,marg,hunn,advi,koha,mouse}%%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%
%% Background %%
%%
\section*{Background}
Small molecules are of increasing interest
for bioinformatics in areas such as metabolomics and drug discovery.
The recent release of large open chemistry databases into the public
domain~\cite{PubChem,IrwinJ2005,ChEMBL,Williams2008} calls for flexible, open toolkits to process them.
These databases and tools will, for the first time, create opportunities for academia and third-world countries
to perform state-of-the-art open drug discovery and translational research -- endeavors so far a domain of the
pharmaceutical industry.
Commonly used in this context are workflow engines for cheminformatics, where numerous recurring tasks can be automated, including
tasks for
\begin{itemize}
\item chemical data filtering, transformation, curation and migration workflows
\item chemical documentation and information retrieval related workflows (structures, reactions, pharmacophores, object relational data etc.)
\item data analysis workflows (statistics and clustering/machine learning for QSAR, diversity analysis etc.)
\end{itemize}
The workflow paradigm allows scientists to flexibly create generic
workflows using different kinds of data sources, filters and algorithms, which can later be adapted to changing needs.
In order to achieve this, library methods are encapsulated in Lego\texttrademark-like building blocks
which can be manipulated with a mouse or any pointing device in a graphical environment,
relieving the scientist from the need to learn a programming language.
Building blocks are connected by data pipelines to enable data flow between them, which is why
\emph{pipelining} is often used interchangeably with \emph{workflow}. Workflows are
increasingly used in cheminformatics research~\cite{Hassan2006,Shon2008}.
Existing proprietary or semi-proprietary implementations of the workflow or pipeline paradigm
in molecular informatics include Pipeline Pilot~\cite{PipelinePilotWeb} from SciTegic, a subsidiary of
Accelrys, and the InforSense platform from InforSense~\cite{InforSenseWeb}. Both
are commercially well established but closed source products with a large variety
of different functionality. KNIME~\cite{KNIMEWeb} is a modular data
exploration platform which uses a dual licensing model with the Aladdin free
public license. It is developed by the group of Michael Berthold at the
University of Konstanz, Germany. KNIME is based on the open source Eclipse platform.
An overview of workflow systems in life sciences was recently given by Tiwari
\emph{et al.}~\cite{Tiwari2007}.
In 2005 we started to integrate our open source cheminformatics
library, the Chemistry Development Kit (CDK)~\cite{Steinbeck2004a,Steinbeck2003a}
with Taverna~\cite{Chemblaics2005,CDKTaverna2005,TomOinn2004}, a
workflow environment with an extensible architecture, to produce CDK-Taverna,
the first completely free~\cite{free-software-foundation} workflow solution for
cheminformatics, which we present here. It makes additional use of other
open source components such as Bioclipse~\cite{Spjuth2007} for visualization of
workflow results, and Pgchem::tigress~\cite{PGChemWeb} as an interface to the
database back-end for storage of large data sets.
\section*{Implementation}
The CDK-Taverna plug-in introduced here takes advantage of the plug-in detection
manager of Taverna for its installation. This manager requires a plug-in
description XML file containing a plug-in name, a version number, a target Taverna
version number, a repository location and a Maven-like Java package description,
all provided by the plug-in's installation website: \url{http://www.cdk-taverna.de/plugin/}.
After adding this URL, the manager presents all available plug-in versions
graphically to the user. In order to install the CDK-Taverna plug-in the user selects
the desired version after which all necessary Java libraries are installed on-the-fly
from the given installation website.
The CDK-Taverna plug-in is written in Java and is published under the GNU Lesser
General Public License (LGPL). Version 0.5.1.1 uses CDK revision 12084.
Like Taverna itself the CDK-Taverna plug-in uses Maven 2~\cite{MavenWeb} as a build system.
To integrate the CDK functionality, the plug-in makes use of the extension points
provided by Taverna allowing dynamic discovery of the provided functionality. The following
sections describe what extension points are used, and how molecular data is
represented when flowing through the workflow.
\subsection*{Taverna's extension points}
Taverna allows the execution of workflows linking together heterogeneous
open services, applications or databases (remote or local, private or public, third-party or
home-grown)~\cite{Taylor2007}. For the integration of these
different resource types Taverna provides various interfaces and
protocols for its extension. For example, it allows for easy access
to webservices through WSDL~\cite{WSDLWeb} and SOAP~\cite{SOAPWeb}.
The CDK-Taverna plug-in, on the other hand, uses Taverna's local extension mechanism. For
local extensions, Taverna provides a list of different Service Provider Interfaces (SPI),
as given in Table~\ref{tab:SPIs}.
CDK-Taverna implements several of these, integrating CDK functionality as so-called Local Workers
which run on the same machine as the Taverna installation.
Full JavaDoc documentation of the plug-in's source code is
available at \url{http://cdk.sourceforge.net/cdk-taverna/api/}.
All workers in CDK-Taverna implement the
\texttt{CDKLocalWorker} interface. It is used for the detection of
workers by the \texttt{CDKScavenger} class which itself implements the Taverna SPI
\texttt{org.embl.ebi.escience.scuflui.workbench.Scavenger} interface.
Adding user interfaces for some of the workers requires an extension of the
\texttt{AbstractCDKProcessorAction} which again implements the Taverna SPI
\texttt{org.embl.ebi.escience.scuflui.spi.ProcessorActionSPI}. The use of this SPI
allows the addition of, for example, file chooser dialogs for workers like file
reader or writer.
\subsection*{The anatomy of a CDK-Taverna worker}
To create a CDK-Taverna worker the Java class of this worker has to implement
the \texttt{CDKLocalWorker} interface. This interface requires every worker to
provide the following methods:
\begin{verbatim}
public Map<String, DataThing> execute(Map<String, DataThing> inputMap)
throws TaskExecutionException;
public String[] inputNames();
public String[] inputTypes();
public String[] outputNames();
public String[] outputTypes();
\end{verbatim}
The methods \texttt{inputNames} and \texttt{outputNames} return the names of the ports of each
worker, whereas the \texttt{inputTypes} and \texttt{outputTypes} methods return the names of the
Java object types including their package declaration, e.g.\ \texttt{java/java.util.List}
for a List. Within CDK-Taverna, chemical structures are passed around using the
Java object \texttt{java/org.openscience.cdk.applications.taverna.CMLChemfile}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Results and Discussion %%
%%
\section*{Results}
The CDK-Taverna plug-in currently provides 164 different cheminformatics
workers. The fields of application of these workers are described in
Table~\ref{tab:cheminfoWorkers}. These include workers for input and output
(I/O) of various chemical file and line notation formats, databases, and
descriptors for atoms, bonds and molecules. The miscellaneous workers include, e.g., a
substructure filter, an aromaticity detector, an atom typer or a reaction
enumerator. Some of the workers are outlined as part of example workflows
described below (for a complete list see~\cite{KuhnDissertation} or
\url{http://cdk.sourceforge.net/cdk-taverna/workers.html}). In the
following, we outline the application of CDK-Taverna with selected workflow
scenarios. A larger list of workflows is available from
MyExperiment.org~\cite{MyExperimentOrg}.
\subsubsection*{Iteration over large data sets}
Cheminformatics by definition deals with the discovery of chemical knowledge from large data collections.
Because these data sources are usually too large to be loaded into memory as a whole, it is necessary to
loop over all data entries to process them one by one. Unfortunately, the architecture of Taverna
1.7 does not support such loops. CDK-Taverna, therefore, provides workers which
act like FOR or WHILE loops, making use of Taverna's iteration-and-retry mechanism to allow workflows
to process large data sets.
%see section iterative qsar workflow
\subsubsection*{Database I/O}
For database support the CDK-Taverna project uses the
PostgreSQL~\cite{PostgreSQLWeb} relational database management system (RDBMS)
with the open source Pgchem::tigress~\cite{PGChemWeb} extension. This
combination allows storage and fast retrieval of up to a million molecules without
running into memory limitations. The Pgchem::tigress extension uses an implementation
of the Generalized Search Tree (GiST)~\cite{GISTWeb} of the PostgreSQL database.
CDK-Taverna can use a local installation of the PostgreSQL Database with the
Pgchem::tigress extension or can connect to a remote instance.
\subsection*{Scenario 1: Substructure Search}
We may want to design workflows for performing substructure searches in
different ways depending on the type of input. In a first example the
substructure workflow performs a topological substructure search on a list of
given molecules and a given molecular substructure (see
Figure~\ref{fig:substructureworkflow.ps}). The workflow inputs are a molecular
substructure represented in the
SMILES line notation~\cite{WeiningerDavid1988} and a list of
structures stored in a MDL SDfile~\cite{DalbyArthur1992}. The structures which
match the substructure are stored as MDL Molfiles~\cite{DalbyArthur1992}. The
non-matching structures are converted into the Chemical Markup Language
(CML) file format~\cite{Murray-RustP1999,Willighagen2001,KuhnS2007}. This small example
workflow already combines four different molecular structure representations and
the use of a topological substructure filter.
A related workflow performs a substructure search directly
on a database: It uses functionality provided by the Pgchem extension
of the PostgreSQL database. This extension allows the use of SQL commands to perform a
substructure search.
A demonstration is given with the workflow in Figure~\ref{fig:database_substructure_search}.
The molecules containing the substructure are reported in tabular form in a PDF file.
\subsection*{Scenario 2: Descriptor Calculation}
The descriptor calculation workflow, depicted in
Figure~\ref{fig:QSARWorkflow}, starts with loading its molecules from a
PostgreSQL database. The recognition of the atom types is the next step,
followed by the addition of implicit hydrogens for each molecule as well as the
detection of H\"uckel aromaticity. Each molecule is tagged for the descriptor
calculation process. The tagging of the molecules is used to add a universal
identifier to each molecule. This allows the identification of the corresponding
Quantitative Structure-Activity Relationship (QSAR) descriptor values
within the table of calculated descriptor values. The QSAR worker provides a user
interface based selection of multiple QSAR descriptors (see
Figure~\ref{fig:QSARWorkerUI}) for the calculation of a molecule's property
vector based on the CDK.
The result of the workflow is a comma-separated value (CSV) text file which
contains the ID of the molecule and the calculated property values. The property
vectors may then be used for statistical analysis, clustering or machine
learning purposes. With the PostgreSQL Database back-end CDK-Taverna is able to
calculate a large number of descriptors for many thousands of molecules in a
reasonable time (see Figure~\ref{fig:TimeCalculateDescriptors}).
\subsection*{Scenario 3: Iterative Descriptor Calculation}
The iterative descriptor calculation workflow is a \emph{work-around} which
allows the treatment of hundreds of thousands of molecules.
This workflow (see
Figure~\ref{fig:IterativeQSARWorkflow}) processes each molecule in the same
manner as the non-iterative descriptor calculation workflow
but it uses different database workers.
Instead of the single database worker \texttt{Get\_Molecules\_From\_Database} three database
workers are applied: \texttt{Iterative\_Molecule\_From\_Database\_Reader},
\texttt{Get\_Molecule\_From\_Database} and \texttt{Has\_Next\_Molecule\_From\_Database}.
The first of the three workers is used to configure the database
connection and store it within an internal object registry. The second worker
gets the ID of the database connection as an input and loads molecules from the
database. Only a subset of the original query is loaded using the SQL functions LIMIT
and OFFSET. The last database worker checks whether the set of loaded molecules is
the last of this query or if further molecules must be loaded. If the latter applies
the output of this last worker would be the text value \texttt{true}. A last but
essential worker is \texttt{Fail\_if\_true}. This worker throws an exception if it gets
the value \texttt{true} as input. This worker is crucial for the nested workflow:
If it fails the whole nested workflow fails. Taverna then provides a retry mechanism
for a failing worker or nested workflow. This mechanism is used to re-run the
nested workflow as often as necessary. This dirty workaround might become obsolete in
later versions of Taverna.
\subsection*{Scenario 4: Validation of CDK Atom Types}
The calculation of physicochemical properties in the Chemistry Development Kit
(CDK) relies on the detection of atom types for all atoms in each molecule. The
atom types describe basic atomic properties needed by the various
cheminformatics algorithms implemented in the CDK. If no atom type is recognized
for an atom, the atom is flagged as \emph{unknown}. Based on the CDK's atom type
perception functionality, we devised an example workflow (see
Figure~\ref{fig:AtomTypingWF}) for the validation of the CDK atom typing
procedures. The detection of an unknown atom type by the CDK
indicates that either the CDK lacks this specific atom type or the molecule
contains chemically nonsensical atom types. In Figure~\ref{fig:AtomTypingWF} the
\texttt{Perceive\_atom\_types} worker performs an atom type detection, followed
by the retrieval of the database ID for those molecules with unknown atom type
by the \texttt{Extract\_the\_databaseID\_from\_the\_molecule} worker. The
workflow creates two text files, one containing the identifier of all molecules
with unknown atom types, created by the \texttt{Iterative\_File\_Writer}, and a
second one containing information about which atom of which molecule is unknown
to the CDK. An analysis~\cite{KuhnDissertation} of the atom type detection was
performed on three different databases, two proprietary natural products
databases and the open access database of Chemical Entities of Biological
Interest (ChEBI)~\cite{ChEBIWeb,Degtyarenko2008}, maintained at the European
Bioinformatics Institute (EBI). The workflow was run with more than 600 thousand
molecules and showed that the CDK algorithms match the atom types quite well,
but that the atom type list is not complete for metals and other heavy
atoms (see Figure~\ref{fig:AtomTypingResults}).
Missing atom type definitions are a general problem for many cheminformatics algorithms
and not unique to the CDK: they lead to severe problems and computation
errors. Therefore, initial atom type perception is an important filter for
cheminformatics workflows.
\subsection*{Scenario 5: Reaction Enumeration}
Markush structures are chemical drawings which represent a series
of molecules by indicating locations where differences occur.
These locations are marked as \emph{Heterocyclic}, \emph{Alkyl}, or identified
by an \emph{R} group, enumerating a series of possible groups, such
as \emph{Methyl}, \emph{Isopropyl}, and \emph{Pentyl}.
Markush structures are commonly used in patents for
describing whole compound classes and are named after Eugene A. Markush who
first described this kind of structure in his US patent in the 1920s.
In the
process of reaction enumeration, Markush structures are used to design generic
reactions. These reactions are usable for the enumeration of large chemical
spaces, which includes the generation of chemical target libraries.
The results of the enumeration have important applications in
patent formulation and in High Throughput Screening (HTS). HTS
experiments screen large amounts of small molecules, called molecule libraries,
against one or more assays for testing for biological activity. A
couple of years ago, the libraries used for a single HTS experiment
consisted of up to 100,000 molecules. Nowadays, more
targeted libraries of a reduced size of up to 1,000 molecules are used,
but still commonly defined using Markush structures.
For reaction enumeration, a given reaction contains different building blocks,
which are needed for the enumeration. Each reactant of the reaction represents a
building block. A scientist then selects a number of molecules for each reactant
and the reaction enumeration creates a list of all possible products. The list
of products then passes a virtual screening before at last a scientist decides
which products will be synthesized. Results can be visualized and inspected at
the end in Bioclipse~\cite{Spjuth2007}. CDK-Taverna contains workers which
support an enumeration task based on a generic reaction (see
Figures~\ref{fig:ReactionEnumerationWorkflow}
and~\ref{fig:ReactionEnumerationSchema}).
\subsection*{Scenario 6: Clustering Workflows}
During the work on this project the majority of workflows used unsupervised
clustering with an implementation of the ART 2-A algorithm~\cite{Carpenter1991}.
This algorithm was chosen because of its capability to automatically cluster
open-categorical problems. Compared
to traditional clustering methods like the k-means, the
ART 2-A algorithm is computationally less demanding and therefore
applicable
especially to large data sets. Within a typical clustering workflow the
\texttt{Get\_QSAR\_vector\_from\_database} worker loads the molecules' data vectors
(compare descriptor calculation workflow above) for a specific molecular SQL
query from the database. This worker provides options to inspect the result
vector which includes checks for values such as Not a Number (NaN) or Infinity.
In addition, different thresholds may be specified for components or
complete vector removal, e.g.\ for the removal of components whose minimum value
equals their maximum value. After the loading and cleaning of a data
vector, the clustering task is performed using the \texttt{ART2A\_Classificator}
worker, as depicted in Figure~\ref{fig:ART2AClassification}.
For the configuration of this worker different options are available:
\begin{itemize}
\item linear scaling of the input vector to values between 0 and 1,
\item a switch between \textit{deterministic random} and \textit{random random} for the selection of the vectors to process,
\item the definition of the convergence criteria of the clustering process,
\item the required similarity for the convergence criteria,
\item the maximum clustering time, and
\item a limit for the number of clustering steps and a range for the vigilance parameter
that guides the ART 2-A algorithm.
\end{itemize}
The implemented ART 2-A algorithm contains two possible convergence criteria. It
converges if the classification does not change after one epoch or if the scalar
product of the classes between two epochs is less than the required similarity.
The clustering result is stored in form of a compressed XML document. This XML
result document can be processed with different workers to create different
visualizations depending on the aim of the clustering task. For chemical
diversity analysis the ART 2-A worker was used for a successive top-down
clustering of three chemical databases (two proprietary databases containing
natural products and the ChEBI database~\cite{Degtyarenko2008} containing
molecules of biological interest, see
Figure~\ref{fig:ART2AClassificationResult}). The occupancies of the different clusters
show the similarity of the natural product databases in contrast to the ChEBI
database which differs in chemical space occupation~\cite{KuhnDissertation}.
These findings will be outlined in a subsequent publication.
%%%%%%%%%%%%%%%%%%%%%%
\section*{Conclusions}
With CDK-Taverna we have presented the first free and open cheminformatics workflow solution for the biosciences.
It allows users to link and process data from various sources in visually accessible workflow diagrams without any deeper
programming experience. Processing of hundreds of thousands of molecules has been demonstrated and the upper boundary
is only limited by the amount of available memory. The currently implemented workers allow the processing of chemical data in
various formats, provide the possibility to calculate chemical properties and allow
cluster analysis of molecular descriptor vectors.
The use of the PostgreSQL database with the Pgchem::tigress cheminformatics cartridge provides
access to chemical databases with up to a million molecules.
\section*{Availability and requirements}
\begin{itemize}
\item \textbf{Project name:} CDK-Taverna
\item \textbf{Project home page:} \url{http://www.cdk-taverna.de}
\item \textbf{Operating system(s):} Platform independent
\item \textbf{Programming language:} Java
\item \textbf{Other requirements:} Java 1.6.0 or higher (\url{http://java.sun.com/}), Taverna 1.7.2 (\url{http://sourceforge.net/projects/taverna/files/taverna/1.7.2/})
\item \textbf{License:} GNU Library or Lesser General Public License (LGPL)
\item \textbf{Any restrictions to use by non-academics:} none
\end{itemize}
%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Authors' contributions}
EW initiated the integration of Taverna and the CDK. CS and AZ conceived the project,
and led the further development. TK did the majority of CDK-Taverna development and
developed the project to its current state. All co-authors contributed to the manuscript.
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgements}
\ifthenelse{\boolean{publ}}{\small}{}
The authors express their gratitude to the Taverna team as well as the CDK
community for creating these great open tools, and would like to thank Ernst-Georg
Schmid for his support concerning the Pgchem::tigress functionality.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% The Bibliography %%
%% %%
%% Bmc_article.bst will be used to %%
%% create a .BBL file for submission, which includes %%
%% XML structured for BMC. %%
%% After submission of the .TEX file, %%
%% you will be prompted to submit your .BBL file. %%
%% %%
%% %%
%% Note that the displayed Bibliography will not %%
%% necessarily be rendered by Latex exactly as specified %%
%% in the online Instructions for Authors. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\ifthenelse{\boolean{publ}}{\footnotesize}{\small}
\bibliographystyle{bmc_article} % Style BST file
\bibliography{bmc_article} } % Bibliography file (usually '*.bib' )
%%%%%%%%%%%
\ifthenelse{\boolean{publ}}{\end{multicols}}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Figures %%
%% %%
%% NB: this is for captions and %%
%% Titles. All graphics must be %%
%% submitted separately and NOT %%
%% included in the Tex document %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Do not use \listoffigures as most will be included as separate files
\clearpage
\section*{Figures}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.6]{pics/substructureworkflow.pdf}
\caption{Workflow performing a topological substructure search (Scenario~1) on
molecules from a MDL SDfile~\cite{MyExperiementWFSubstructure}.
The input of this workflow is a SMILES string
which represents the substructure.}
\label{fig:substructureworkflow.ps}
\end{figure*}
\begin{figure*}[h!]
\centering
\includegraphics[angle=0,clip=false,scale=.2]{pics/databaseSubstructureSearch.png}
\caption{Workflow performing a substructure search (Scenario~1)
in a database~\cite{MyExperiementWFSubstructureDB}.
The substructure is defined with a SMILES string. The output is a PDF
file with a tabular view of the molecules from the database containing the
substructure.}
\label{fig:database_substructure_search}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/QSARWorkflow.pdf}
\caption{Workflow calculating various QSAR descriptors (Scenario~2) for molecules
from a PostgreSQL database. The results of the calculation are stored in a CSV file.}
\label{fig:QSARWorkflow}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=true,scale=.5]{pics/QSARWorkerUIsmall.pdf}
\caption{User interface to select QSAR descriptors to be calculated for each molecule during
the execution of the descriptor calculation workflow shown in Figure~\ref{fig:QSARWorkflow}.}
\label{fig:QSARWorkerUI}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.3]{pics/timeneededtocalculatemoleculardescriptors.png}
\caption{Overview of the time needed to calculate different molecular descriptors for 1000
molecules~\cite{CDKTavernaBlogWeb}.}
\label{fig:TimeCalculateDescriptors}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/IterativeQSARWorkflow.png}
\caption{Workflow iteratively calculating different QSAR descriptors (Scenario~3)
for molecules loaded from a PostgreSQL
database~\cite{MyExperiementWFQSAR}. The results are stored in a CSV file.}
\label{fig:IterativeQSARWorkflow}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/AtomTypingWF.pdf}
\caption{Workflow that iteratively loads molecules from a database and searches for molecules
with atom types unknown to the Chemistry Development Kit (Scenario~4).}
\label{fig:AtomTypingWF}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/AtomTypingResults.png}
\caption{Allocation of the unknown atom types detected during the analysis of the ChEBI database
(12367 molecules). A total of 2414 atoms in 1035 molecules (8.36\%) did not have a
recognized atom type. X1 summarizes unrecognized atom types for the elements
Am, Cf, Cm, Dy, Es, Fm, Ga, Lr, Md, Na, Nb, No, Np, Pm, Pu, Sm, Tb, Tc, Th, and Ti.}
\label{fig:AtomTypingResults}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.4]{pics/reactionEnumerationWF2.png}
\caption{Reaction enumeration (Scenario~5) loading a generic reaction from an MDL RXNfile and two reactant lists
from MDL SDfiles. The products from the enumeration are stored as MDL Molfiles. Besides these files,
a PDF document showing the 2D structure of the products is created. At the end, Bioclipse will start up
to allow visualization and analysis of the
results~\cite{MyExperiementWFReactionEnum}.}
\label{fig:ReactionEnumerationWorkflow}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/reactionenumeration.png}
\caption{Reaction enumeration example with two building blocks. For each building block, a list of three reactants is defined. This enumeration results in nine different products.}
\label{fig:ReactionEnumerationSchema}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/ATR2AClassificationWF.png}
\caption{Workflow for loading molecular descriptor data vectors from a database,
followed by an ART 2-A clustering (Scenario~6).}
\label{fig:ART2AClassification}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=.5]{pics/ART2ACluster.png}
\caption{Occupancies of six different detected clusters for
two proprietary natural product databases (yellow and red)
with the ChEBI database (blue), highlighting the unique character of
the ChEBI database.}
\label{fig:ART2AClassificationResult}
\end{figure*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Tables %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Use of \listoftables is discouraged.
%%
\clearpage
\section*{Tables}
\begin{table}[htbp]
\begin{flushleft}
\caption{List of available Service Provider Interfaces
that can be used to create plug-ins for Taverna to provide
additional functionality.}
\label{tab:SPIs}
\begin{tabular}{|c|}
\hline \textbf{Interfaces}\\ \hline
org.embl.ebi.escience.scuflworkers.java.LocalWorker \\ \hline
net.sf.taverna.perspectives.PerspectiveSPI \\ \hline
org.embl.ebi.escience.scuflui.spi.ProcessorActionSPI \\ \hline
org.embl.ebi.escience.scuflworkers.ProcessorInfoBean \\ \hline
org.embl.ebi.escience.scuflui.spi.RendererSPI \\ \hline
org.embl.ebi.escience.scuflui.spi.ResultMapSaveSPI \\ \hline
org.embl.ebi.escience.scuflui.workbench.scavenger.spi.ScavengerActionSPI \\ \hline
org.embl.ebi.escience.scuflworkers.ScavengerHelper \\ \hline
org.embl.ebi.escience.scuflui.workbench.Scavenger \\ \hline
org.embl.ebi.escience.scuflui.actions.ScuflModelActionSPI \\ \hline
org.embl.ebi.escience.scuflui.spi.UIComponentFactorySPI \\ \hline
\end{tabular}
\end{flushleft}
\end{table}
\begin{table}[htbp]
\begin{flushleft}
\caption{The worker allocation of CDK-Taverna by function.}
\label{tab:cheminfoWorkers}
\begin{tabular}{|l|c|l|}
\hline
\textbf{Workers by function} & \multicolumn{1}{l|}{\textbf{Number of workers}} & \textbf{Examples} \\ \hline
File I/O & 15 & SDFParser, CML Reader \& Writer \\ \hline
SMILES tools & 2 & SMILES Parser, SMILES Writer\\ \hline
InChI parser & 2 & InChI Parser, InChI Generator\\ \hline
Database I/O & 7 & Insert Molecules Into Database,\\
& & Read Molecules From Database\\ \hline
Molecular descriptors & 42 & AtomCount \& LargestChain \\ \hline
Atom descriptors & 27 & AtomHybridization \& BondsToAtom \\ \hline
Bond descriptors & 6 & PartialPiCharge\\ \hline
Clustering & 13 & K-Means, ART 2-A Classification\\ \hline
Miscellaneous & 50 & Substructure Search, Reaction Enumeration\\ \hline
\end{tabular}
\end{flushleft}
\end{table}
\end{bmcformat}
\end{document}