[r15627]: cdk-taverna-2-paper / bmc_article.tex  Maximize  Restore  History

Download this file

623 lines (461 with data), 47.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
%% BioMed_Central_Tex_Template_v1.06
%% %
% bmc_article.tex ver: 1.06 %
% %
%%IMPORTANT: do not delete the first line of this template
%%It must be present to enable the BMC Submission system to
%%recognise this template!!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% LaTeX template for BioMed Central %%
%% journal article submissions %%
%% %%
%% <14 August 2007> %%
%% %%
%% %%
%% Uses: %%
%% cite.sty, url.sty, bmc_article.cls %%
%% ifthen.sty, multicol.sty %%
%% %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% For instructions on how to fill out this Tex template %%
%% document please refer to Readme.pdf and the instructions for %%
%% authors page on the biomed central website %%
%% http://www.biomedcentral.com/info/authors/ %%
%% %%
%% Please do not use \input{...} to include other tex files. %%
%% Submit your LaTeX manuscript as one .tex document. %%
%% %%
%% All additional figures and files should be attached %%
%% separately and not embedded in the \TeX\ document itself. %%
%% %%
%% BioMed Central currently use the MikTex distribution %%
%% (for Windows) of TeX and LaTeX. This is available from %%
%% http://www.miktex.org %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\NeedsTeXFormat{LaTeX2e}[1995/12/01]
\documentclass[10pt]{bmc_article}
% Load packages
\usepackage{cite} % Make references as [1-4], not [1,2,3,4]
\usepackage{url} % Formatting web addresses
\usepackage{ifthen} % Conditional
\usepackage{multicol} %Columns
\usepackage[utf8]{inputenc} %unicode support
%\usepackage[applemac]{inputenc} %applemac support if unicode package fails
%\usepackage[latin1]{inputenc} %UNIX support if unicode package fails
\usepackage{graphicx}
\usepackage{array}
\urlstyle{rm}
\usepackage{placeins}
\newcommand{\origttfamily}{}
\let\origttfamily=\ttfamily % save the previous \ttfamily
\renewcommand{\ttfamily}{\origttfamily \hyphenchar\font=`\.}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% If you wish to display your graphics for %%
%% your own use using includegraphic or %%
%% includegraphics, then comment out the %%
%% following two lines of code. %%
%% NB: These line *must* be included when %%
%% submitting to BMC. %%
%% All figure files must be submitted as %%
%% separate graphics through the BMC %%
%% submission process, not included in the %%
%% submitted article. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\def\includegraphic{}
%\def\includegraphics{}
\setlength{\topmargin}{0.0cm}
\setlength{\textheight}{21.5cm}
\setlength{\oddsidemargin}{0cm}
\setlength{\textwidth}{16.5cm}
\setlength{\columnsep}{0.6cm}
\newboolean{publ}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% You may change the following style settings %%
%% Should you wish to format your article %%
%% in a publication style for printing out and %%
%% sharing with colleagues, but ensure that %%
%% before submitting to BMC that the style is %%
%% returned to the Review style setting. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%Review style settings
\newenvironment{bmcformat}{\begin{raggedright}\baselineskip20pt\sloppy\setboolean{publ}{false}}{\end{raggedright}\baselineskip20pt\sloppy}
%Publication style settings
%\newenvironment{bmcformat}{\fussy\setboolean{publ}{true}}{\fussy}
% Begin ...
\begin{document}
\begin{bmcformat}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the title of your article here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{New developments on the cheminformatics open workflow environment CDK-Taverna}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors here %%
%% %%
%% Ensure \and is entered between all but %%
%% the last two authors. This will be %%
%% replaced by a comma in the final article %%
%% %%
%% Ensure there are no trailing spaces at %%
%% the ends of the lines %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\author{
Andreas Truszkowski$^1$%
\email{Andreas Truszkowski - andreas.truszkowski@fh-gelsenkirchen.de}%
\and
Kalai Vanii Jayaseelan$^2$%
\email{Kalai Vanii Jayaseelan - Kalai@ebi.ac.uk}%
\and
Stefan Neumann$^3$%
\email{Stefan Neumann - stefan.neumann@gnwi.de}%
\and
Egon L Willighagen$^4$%
\email{Egon L Willighagen - egon.willighagen@ki.se}%
\and
Achim Zielesny$^1$%
\email{Achim Zielesny - achim.zielesny@fh-gelsenkirchen.de}%
and
Christoph Steinbeck\correspondingauthor$^{2}$%
\email{Christoph Steinbeck\correspondingauthor - steinbeck@ebi.ac.uk}%
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Enter the authors' addresses here %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\address{%
\iid(1)Institute for Bioinformatics and Cheminformatics, University of Applied Sciences of Gelsenkirchen, Recklinghausen, Germany\\
\iid(2)Chemoinformatics and Metabolism, European Bioinformatics Institute (EBI), Cambridge, UK\\
\iid(3)GNWI - Gesellschaft fuer naturwissenschaftliche Informatik mbH, Oer-Erkenschwick, Germany\\
\iid(4)Division of Molecular Toxicology, Institute of Environmental Medicine, Karolinska Institutet, Stockholm, Sweden
}%
\maketitle
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Abstract begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the Instructions for %%
%% authors on http://www.biomedcentral.com %%
%% and change the section headings %%
%% accordingly. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
% Do not use inserted blank lines (ie \\) until main body of text.
\paragraph*{Background:} The computational processing and analysis of small molecules is at the heart of cheminformatics and structural bioinformatics and their application in e.g. metabolomics or drug discovery. Pipelining or workflow tools allow for the Lego(TM)-like, graphical assembly of I/O modules and algorithms into a complex workflow which can be easily deployed, modified and tested without the hassle of implementing it into a monolithic application. The CDK-Taverna project aims at building a free open-source cheminformatics pipelining solution through combination of different open-source projects such as Taverna, the Chemistry Development Kit (CDK) or the Waikato Environment for Knowledge Analysis (WEKA). A first integrated version 1.0 of CDK-Taverna was recently released to the public.
\paragraph*{Results:} The CDK-Taverna project was migrated to the most up-to-date versions of its foundational software libraries with a complete re-engineering of its worker's architecture (version 2.0). 64-bit computing and multi-core usage by paralleled threads are now supported to allow for fast in-memory processing and analysis of large sets of molecules. Earlier deficiencies like workarounds for iterative data reading are removed. The combinatorial chemistry related reaction enumeration features are considerably enhanced. Additional functionality for calculating a natural product likeness score for small molecules is implemented to identify possible drug candidates. Finally the data analysis capabilities are extended with new workers that provide access to the open-source WEKA library for clustering and machine learning as well as training and test set partitioning. The new features are outlined with usage scenarios.
\paragraph*{Conclusions:} CDK-Taverna 2.0 as an open-source cheminformatics workflow solution matured to become a freely available and increasingly powerful tool for the biosciences. The combination of the new CDK-Taverna worker family with the already available workflows developed by a lively Taverna community and published on myexperiment.org enables molecular scientists to quickly calculate, process and analyse molecular data as typically found in e.g. today's systems biology scenarios.
\end{abstract}
\ifthenelse{\boolean{publ}}{\begin{multicols}{2}}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% The Main Body begins here %%
%% %%
%% The Section headings here are those for %%
%% a Research article submitted to a %%
%% BMC-Series journal. %%
%% %%
%% If your article is not of this type, %%
%% then refer to the instructions for %%
%% authors on: %%
%% http://www.biomedcentral.com/info/authors%%
%% and change the section headings %%
%% accordingly. %%
%% %%
%% See the Results and Discussion section %%
%% for details on how to create sub-sections%%
%% %%
%% use \cite{...} to cite references %%
%% \cite{koon} and %%
%% \cite{oreg,khar,zvai,xjon,schn,pond} %%
%% \nocite{smith,marg,hunn,advi,koha,mouse}%%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%
%% Background %%
%%
\section*{Background}
Current problems in the biosciences typically involve several domains of research. They require a scientist to work with different and diverse sets of data. The reconstruction of a metabolic network from sequencing data, for example, employs many of the data types found along the axis of the central dogma, including reconstruction of genome sequences, gene prediction, determination of encoded protein families, and from there to the substrates of enzymes, which then form the metabolic network. In order to work with such a processing pipeline, a scientist has to copy/paste and often transform the data between several bioinformatics web portals by hand. The manual approach involves repetitive tasks and cannot be considered effective or scalable.
Especially the processing and analysis of small molecules comprises tasks like filtering, transformation, curation or migration of chemical data, information retrieval with substructures, reactions, or pharmacophores as well as the analysis of molecular data with statistics, clustering or machine learning to support chemical diversity requirements or to generate quantitative structure activity/property relationships (QSAR/QSPR models). These processing and analysis procedures themselves are of increasing importance for research areas like metabolomics or drug discovery. The power and flexibility of the corresponding computational tools become essential success factors for the whole research process.
The workflow paradigm addresses the above issues with the supply of sets of elementary workers (activities) that can be flexibly assembled in a graphical manner to allow complex procedures to be performed in an effective manner -- without the need for specific code development or software programming skills. Scientific workflows allow the combination of a wide spectrum of algorithms and resources in a single workspace \cite{Hassan,Shon,Oinn2007}.
Earlier problems with iterations over large data sets \cite{Kuhn} are completely resolved in version 2.0 due to new implementations in Taverna. Taverna 2 allows control structures such as ``while'' loops or ``if-then-else'' constructs. Termination criteria for loops may now be evaluated by listening to a state port \cite{Missier}. In addition the user interface of the Taverna 2 workbench has clearly improved: The design and manipulation of workflows in a graphical workflow editor is now supported. Features like copy/paste and undo/redo simplify workflow creation and maintenance \cite{TavernaWeb}.
The CDK-Taverna project aims at building a free open-source cheminformatics pipelining solution through combination of different open-source projects such as Taverna \cite{Oinn}, the Chemistry Development Kit (CDK) \cite{Steinbeck1,Steinbeck2}, or the Waikato Environment for Knowledge Analysis (WEKA) \cite{Hall}. A first integrated version 1.0 of CDK-Taverna was recently released to the public \cite{Kuhn}. To extend usability and power of CDK-Taverna for different molecular research purposes the development of version 2.0 was initiated.
%%%%%%%%%%%%%%%%%%
\section*{Implementation}
The CDK-Taverna 2.0 plug-in makes use of the Taverna plug-in manager for its installation. The manager fetches all necessary information about the plug-in from a XML file which is located at \url{http://www.ts-concepts.de/cdk-taverna2/plugin/}. The information provided therein contains the name of the plug-in, its version, the repository location and the required Taverna version. Upon submitting the URL to the plug-in manager it downloads all necessary dependencies automatically from the web. After a subsequent restart the plug-in is enabled and the workers are visible in the services. The plug-in uses Taverna version 2.2.1 \cite{TavernaWeb}, CDK version 1.3.8 \cite{CDKWeb} and WEKA version 3.6.4 \cite{WekaWeb}. Like its predecessor it uses the Maven 2 build system \cite{MavenWeb} as well as the Taverna workbench for automated dependency management.
\subsection*{CDK-Taverna 2.0 worker implementation}
The CDK-Taverna 2.0 plug-in is designed to be easily extendible: The implementation allows to create new workers by simply inheriting from the single abstract class \texttt{org.openscience.cdk.applications.taverna.AbstractCDKActivity} (which is the analogue of the CDKLocalWorker interface of CDK-Taverna version 1.0). The class is located in the \texttt{cdk-taverna-2-activity} module. It provides all necessary data for the underlying worker registration mechanism which frees the software developer from handling these tasks manually. The methods which need to be overwritten in order to implement a worker are:
\begin{itemize}
\item \texttt{public void addInputPorts(), public void addOutputPorts()}\\Specify the ports for passing data between workers.
\item \texttt{public String getActivityName(), public String getFolderName()}
\\Return name and folder of a worker.
\item \texttt{public void work()}
\\Entry point for the worker's central algorithm that performs its core function.
\item \texttt{public String getDescription()}
\\Provides descriptive text that explains a worker's function.
\item \texttt{public HashMap<String, Object> getAdditionalProperties()}
\\Specifies additional properties like file extensions, the number of concurrent threads to use, etc.
\end{itemize}
Finally a new worker has to be registered to be available in the Taverna workbench. For this purpose Taverna offers the class \texttt{net.sf.taverna.t2.spi.SPIRegistry.SPIRegistry} to register Service Provider Interfaces (SPI). It is necessary to add the new worker's full name including its package declaration to the file \texttt{org.openscience.cdk.applications.taverna.AbstractCDKActivity} which contains the names and packages of all available workers. This file is located at \url{cdk-taverna-2-activity-ui/src/main/resources/META-INF/services}.
Besides the basic implementation it is possible to define a configuration panel for a worker which allows the specification of parameters. A configuration panel has to inherit from the abstract class \texttt{org.openscience.cdk.applications.taverna.ActivityConfigurationPanel}. The GUI element itself has to be defined in the constructor of the class and may contain any Java Swing element. The following methods are the backbone of a configuration panel:
\begin{itemize}
\item \texttt{public boolean checkValues()}
\\Validates all GUI values.
\item \texttt{public boolean isConfigurationChanged()}
\\After the validity check this method is used to compare the current worker settings with the GUI settings to detect changes.
\item \texttt{public void noteConfiguration()}
\\The properties of the worker are saved in a bean structure. The changes of the configuration bean object are updated by this method.
\item \texttt{public void refreshConfiguration()}
\\Updates the GUI values itself.
\item \texttt{public CDKActivityConfigurationBean getConfiguration()}
\\Access to the configuration bean.
\end{itemize}
The configuration panel has to be registered in the \texttt{CDKConfigurationPanelFactory} class of the \texttt{org.openscience.cdk.applications.taverna.ui.view} package. More details on how to write workers and their configuration panels are provided at the project's wiki page \url{http://cdk-taverna-2.ts-concepts.de/wiki/index.php?title=Main_Page}.
\subsection*{Requirements}
CDK-Taverna 2.0 supports 64-bit computing by use with a Java 64-bit virtual machine. The CDK-Taverna 2.0 plug-in is written in Java and requires Java 6 or higher. The latest Java version is available at \url{http://www.java.com/de/download/}. The CDK-Taverna 2.0 plug-in is developed and tested on Microsoft Windows 7 as well as Linux and Mac OS X (32 and 64 bit).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Results and Discussion %%
%%
\section*{Results and Discussion}
The CDK-Taverna 2.0 plug-in provides 192 workers for input and output (I/O) of various chemical file and line notation formats, substructure filtering, aromaticity detection, atom typing, reaction enumeration, molecular descriptor calculation and data analysis. Parallel computing with multi-core processors by use of multiple concurrent threads is flexibly implemented for many workers where operations scale nearly linear with the number of cores. Especially the machine learning and the molecular descriptor calculation workers benefit from parallel computation. An overview is given in Tables 1 and 2. Many workers are described by example workflows available at \url{http://cdk-taverna-2.ts-concepts.de/wiki/index.php?title=Main_Page}. Additionally, the workflows can be found at \url{http://www.myexperiment.org/}.
CDK-Taverna 1.0 was confined to a 32-bit Java virtual machine and thus was restricted to in-memory processing of data volumes of at most 2 gigabytes in practice. Version 2.0 also supports 64-bit computing by use of a 64-bit Java virtual machine so that the processable data volume is only limited by hardware constraints (memory, speed): 64-bit in-memory workflows were successfully performed with data sets of about 1 million small molecules. Since the memory restrictions of version 1.0 were a main reason to use Pgchem::tigress as a molecular database backend \cite{Kuhn} the corresponding version 1.0 workers were not migrated to the current version 2.0 yet.
\subsection*{Advanced reaction enumeration}
CDK-Taverna 1.0 provided basic functions for combinatorial chemistry related reaction enumeration: They supported the use of two reactants, a single product and one generic group per reactant. The new enumeration options used by CDK-Taverna 2.0 offer major enhancements like multi-match detection, any number of reactants, products or generic groups as well as variable R-groups, ring sizes and atom definitions. The extended functionality was developed and applied in industrial cooperation projects.
Advanced reaction enumeration features are illustrated in Figure \ref{fig:ReactionEnumerationFeatures}. The \emph{Variable RGroup} feature allows the definition of chemical groups which can be flexibly attached to predefined atoms with syntax \emph{[A:B,B,B...-RC]} where \emph{A} is a freely selectable identifier, \emph{B} are numbers from an \emph{Atom-to-Atom-Mapping} defining the atoms to which the generic group can be attached and \emph{C} is the chemical group identifier which can be any number. The \emph{Atom Alias} feature offers the possibility to define a wild card for preconfigured elements. The syntax is \emph{[A:B,B,B...]} where \emph{A} is a freely selectable identifier and \emph{B} are the string representations of the possible elements. The \emph{Expandable Atom} feature enables the definition of freely sizeable rings or aliphatic chains with syntax \emph{[A:[]B]} where \emph{A} is a freely selectable identifier and \emph{B} is the maximum number of atoms to insert. Figure \ref{fig:ReactionEnumerationWorkflow} depicts a workflow for reaction enumeration. The capabilities of the advanced reaction enumerator implementation are summarized in Figure \ref{fig:ReactionEnumerationResults} which also demonstrates multi-match detection, i.e. multiple reaction centers within one molecule.
\subsection*{Evaluation of small molecules for natural product likeness}
In recent years, computer assisted drug design studies use natural product (NP) likeness as a criterion to screen compound libraries for potential drug candidates \cite{Ertl, Dobson}. The reason to estimate NP likeness during candidate screening is to facilitate the selection of those compounds that mimic structural features that are naturally evolved to best interact with biological targets.
Version 2.0 of CDK-Taverna provides two groups of workers that re-implement the work of \emph{Ertl et al} to score small molecules for NP-likeness \cite{Ertl}. The workers in the \texttt{Molecule Curation} folder are dedicated to the pre-processing of chemical structures: The \texttt{Molecule Connectivity Checker} worker removes counter ions and disconnects fragments, the \texttt{Remove Sugar Groups} worker removes all sugar rings and linear sugars from structures and the \texttt{Curate Strange Elements} worker discards structures that are composed of elements other than non-metals. This set of curation workers finally creates scaffolds and sub structures. From these structures atom signatures \cite{Faulon} are generated using the \texttt{Generate Atom Signatures} worker and exploited as structural descriptors in charting the compound's region in the chemical structure space. The combined workflow of curation and atom signature generation workers is illustrated in Figure \ref{fig:Curation and signature descriptor generation}. Using this workflow, atom signatures can be generated for user-defined training (Natural products and synthetics) and testing (compound libraries) structural dataset. Workers of the \texttt{Signature Scoring} folder use atom signatures generated from compound libraries and rank them for NP-likeness based on the statistics suggested by \emph{Ertl et al} \cite{Ertl}. This scoring workflow is illustrated in Figure \ref{fig:ScoringActivity}. The whole package of workflows is available for free download at \url{http://www.myexperiment.org/users/10069/packs}.
The curation and signature scoring workers may not only be applied in evaluating the NP-likeness of compound libraries but also in evaluating the metabolite-likeness of theoretical metabolites for predicting whole metabolomes. The latter application was the original purpose for the worker development and corresponding results will be presented in a subsequent publication.
\subsection*{Clustering and machine learning applications}
Unsupervised clustering tries to partition input data into a number of groups smaller than the number of data whereas supervised machine learning tries to construct model functions that map the input data onto their corresponding output data. If the output codes continuous quantities a regression task is defined. Alternatively the output may code classes so that a classification task is addressed. Molecular data sets for clustering consist of input vectors where each vector represents a molecular entity and consists of a set of molecular descriptors itself. Molecular data sets for machine learning add to each input vector a corresponding output vector with features to be learned - thus they consist of I/O pairs of input and output vectors.
The clustering and machine learning workers of CDK-Taverna 2.0 allow the use of distinct WEKA functionality. As far as clustering is concerned the ART-2a worker of version 1.0 is supplemented with five additional WEKA-based workers which offer
\begin{itemize}
\item \texttt{Expectation Maximisation (EM)}
\\Expectation maximisation algorithm for iterative maximum likelihood estimation of cluster memberships \cite{Dempster1977}.
\item \texttt{Farthest First}
\\Heuristic 2-approximation algorithm for solving the k-center problem \cite{Hochbaum1985}.
\item \texttt{Hierarchical Clusterer}
\\Hierarchical clustering methods: The distance function and the linkage type are freely selectable \cite{WekaAPI}.
\item \texttt{Simple KMeans}
\\Simple k-means clustering algorithm \cite{MacQueen1967}.
\item \texttt{XMeans}
\\Extended k-means clustering with an efficient estimation of the number of clusters \cite{Pelleg2000}.
\end{itemize}
Machine learning workers support the significance analysis of single components (i.e. features) of an input vector to obtain smaller inputs with a reduced set of components/features, the partitioning of machine learning data into training and test sets, the construction of input/output mapping model functions and model based predictions as well as result visualization. There is a total of six WEKA-based machine learning methods available: Two workers allow regression as well as classification procedures ...
\begin{itemize}
\item \texttt{Three-Layer Perceptron-Type Neural Networks}
\\Neural network implementation using the backpropagation algorithm for weight optimisation \cite{Mitchell1997}.
\item \texttt{Support Vector Machines}
\\Support Vector Machine implementation using the LibSVM library \cite{Chang2001}.
\end{itemize}
... two workers do only support regression ...
\begin{itemize}
\item \texttt{Multiple Linear Regression}
\\ Multiple linear regression algorithm.
\item \texttt{M5P regression trees}
\\M5 regression algorithm for constructing tree-based linear models \cite{Quinlan1992, Wang1996}.
\end{itemize}
... and two workers are restricted to classification tasks:
\begin{itemize}
\item \texttt{Naive Bayes}
\\Bayesian classifier for the estimation of continuous variables \cite{John1995}.
\item \texttt{J48 C4.5 decision tree}
\\Decision tree implementation based on the C4.5 classification algorithm \cite{Quinlan1993}.
\end{itemize}
For selection of an optimum reduced set of input vector components there are two workers available. The \texttt{GA Attribute Selection} worker generates an optimum reduced set of input components of predefined length (smaller than the full input vector length) on the basis of a genetic algorithm. The initial random population is refined by mutation and cross-over steps plus Roulette Wheel selection in each generation. A mutation switches an input component between an ``on'' or ``off'' state and a cross-over interchanges a random interval of ``on/off'' states between two randomly chosen chromosomes (where the number of attributes with ``on'' state remains fixed). As a fitness function the inverse square root mean squared error $\left(\frac{1}{\mathit{RMSE}}\right)^{2}$ is used -- based on the complete dataset or using $n$-fold cross-validation. Figure \ref{fig:GASelection} illustrates the procedure. The \texttt{Leave-One-Out Attribute Selection} worker uses a ``Leave-One-Out'' strategy for evaluating the significance of each input vector component \cite{Zielesny2011}. In each iteration the single component is discarded that has the smallest influence on the RMSE -- up to a last ``most significant'' component. Figure \ref{fig:LeaveOneOutResults} shows a result of a ``leave-one-out'' analysis and Figure \ref{fig:LeaveOneOutWorkflow} depicts the related workflow.
For training and test set partitioning the \texttt{Split Dataset Into Train-/Testset} worker is available which offers three strategies \cite{Zielesny2011}:
\begin{itemize}
\item \texttt{Random}
\\Data are split randomly into a training and test set of defined sizes.
\item \texttt{Cluster Representatives}
\\First the input data of the I/O pairs are clustered with the number of clusters to be equal to the number of training data by application of the Simple KMeans algorithm. Then a single input point of each cluster is chosen randomly as a representative and the corresponding I/O pair is inserted into the training set. The remaining I/O pairs are transferred to the test set.
\item \texttt{Single Global Max}
\\Cluster representatives are evaluated in a first step. These representatives are then refined by an iterative procedure that exchanges data between training and test set that belong to the same cluster. The latter constraint assures that the input data of training and test set have a similar spatial diversity. A single iteration determines the test set I/O pair with the largest deviation between data and model. This I/O pair is then transferred to the training set while the best predicted I/O pair of the same cluster in the training set is transferred to the test set in exchange. Oscillations during the refinement steps may be suppressed by blacklisting exchanged I/O pairs.
\end{itemize}
Figure \ref{fig:RegressionSplitWorkflow} shows a workflow using the \texttt{Split Dataset Into Train-/Testset} worker. The \texttt{Weka Regression} worker is used to build machine learning models which may be evaluated and visualized by the \texttt{Evaluate Regression Results as PDF} worker. The \texttt{Weka Regression} worker provides a configuration menu as shown in Figure \ref{fig:RegressionConfUI}. Classification workers may be used in an equivalent manner. Figure \ref{fig:RegressionVisualisation} depicts diagrams and output of a QSPR analysis to predict HPLC retention times for small molecules: The experimental dataset consists of 183 I/O pairs with a set of molecular descriptors for each small molecule as an input and the corresponding retention time as an output. The molecular descriptors were calculated with the \texttt{QSAR Descriptor Threaded} worker. Afterwards the \texttt{GA Attribute Selection} worker was used to determine an optimized minimum subset of 75 molecular descriptors (from an original 155) with maximum predictability. For machine learning a three-layer perceptron type neural network worker with three hidden neurons was used. The diagrams shown for the regression analysis are a scatter plot with experimental versus predicted output values and two kinds of residual plots. In addition characteristic quantities like the root mean squared error or the correlation coefficient are calculated for the generated model.
\subsection*{CDK-Taverna 2.0 Wiki}
Based on the free MediaWiki framework, a Wiki was developed for the CDK-Taverna 2.0 project \cite{MediaWikiWeb}. The web page provides general information about the project, documentation about available workers/workflows and on how to create them as well as about installation procedures. The Wiki can be found at \url{http://cdk-taverna-2.ts-concepts.de/wiki/index.php?title=Main_Page}.
%%%%%%%%%%%%%%%%%%%%%%
\section*{Conclusions}
CDK-Taverna 2.0 provides an enhanced and matured free open cheminformatics workflow solution for the biosciences. It was successfully applied and tested in academic and industrial environments with data volumes of hundreds of thousands of small molecules. Combined with available workers and workflows from bioinformatics, image analysis or statistics CDK-Taverna supports the construction of complex systems biology oriented workflows for processing diverse sets of biological data.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Competing interests}
The authors declare that they have no competing interests.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Authors' contributions}
EW initiated the integration of Taverna and CDK and supported deployment and architecture.
CS and AZ conceived the project and led the further development.
SN supported the reaction enumeration enhancements.
KV provided workers for molecular fragmentation.
AT did the majority of CDK-Taverna re-engineering and enhancements and developed the project to its current state.
All co-authors contributed to the manuscript.
All authors read and approved the final manuscript.
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section*{Acknowledgements}
\ifthenelse{\boolean{publ}}{\small}{}
The authors express their gratitude to the teams and communities of Taverna, CDK and WEKA for creating and developing these open tools.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% The Bibliography %%
%% %%
%% Bmc_article.bst will be used to %%
%% create a .BBL file for submission, which includes %%
%% XML structured for BMC. %%
%% After submission of the .TEX file, %%
%% you will be prompted to submit your .BBL file. %%
%% %%
%% %%
%% Note that the displayed Bibliography will not %%
%% necessarily be rendered by Latex exactly as specified %%
%% in the online Instructions for Authors. %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\ifthenelse{\boolean{publ}}{\footnotesize}{\small}
\bibliographystyle{bmc_article} % Style BST file
\bibliography{bmc_article} } % Bibliography file (usually '*.bib' )
%%%%%%%%%%%
\ifthenelse{\boolean{publ}}{\end{multicols}}{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Figures %%
%% %%
%% NB: this is for captions and %%
%% Titles. All graphics must be %%
%% submitted separately and NOT %%
%% included in the Tex document %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% Do not use \listoffigures as most will included as separate files
\section*{Figures}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.4]{pics/arefeatures.jpg}
\caption{Advanced reaction enumeration features: (left) The \emph{Variable RGroup} feature allows the definition of chemical groups which can be flexibly attached to predefined atoms. (middle) The \emph{Atom Alias} feature offers the possibility to define a wild card for preconfigured elements. (right) The \emph{Expandable Atom} feature enables the definition of freely sizeable rings or aliphatic chains.}
\label{fig:ReactionEnumerationFeatures}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.8]{pics/reworkflow.jpg}
\caption{Workflow for reaction enumeration: After loading a generic reaction (\texttt{IN REACTION}, from a MDL RXN file) and two educt lists (\texttt{IN REACTANTS 1}, \texttt{IN REACTANTS 2}, from MDL SD files) the \texttt{Reaction Enumerator} worker performs the enumeration with the results stored as MDL RXN files. An additional PDF file is created which shows all enumerated reactions in a tabular manner. The results are stored in the output folder determined by the \texttt{OUT} input port.}
\label{fig:ReactionEnumerationWorkflow}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.4]{pics/reresults.jpg}
\caption{Capabilities of the advanced reaction enumerator: The sketched generic reaction contains three different generic groups labelled X, Y and Z. Group X defines a \emph{Variable RGroup} which can freely attach to all atoms of the ring. The \emph{Atom Alias} group labelled Y is a wild card for the elements carbon, oxygen and nitrogen. The \emph{Expandable Atom} group Z defines a variable ring size: The ring can be expanded by up to two additional carbon atoms. The enumerated products with the small letters $a$ and $b$ originate from \emph{multi-match detection}.}
\label{fig:ReactionEnumerationResults}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.5]{pics/curation_signature.jpg}
\caption{Molecule curation and atom signature descriptor generation workflow: The \texttt{Iterative SDfile Reader} takes the Structure Data Format (SDF) file of compounds \texttt{(Input SDF)} as input and passes the structures down the workflow for molecule curation and atom signature generation. The number of structures to be read and pumped down the workflow can be configured (\texttt{Iterations}). As soon as a molecule is read, the \texttt{Tag Molecules with UUID} worker tags the molecule with a Universal Unique IDentifier (UUID) to keep track of it during the process. The \texttt{Molecule connectivity checker} worker checks the connectedness of the structure and removes counter ions and disconnected fragments. The \texttt{Remove sugar groups} worker removes linear and ring sugars from the structures. The \texttt{Curate Strange Elements} worker removes structures containing elements other than non-metals. Finally, the \texttt{Generate Atom Signatures} worker generates an atom signature for each atom in a curated compound, tagged with the respective UUID of the compound. The generated atom signatures are written out to a text file (\texttt{signatures file}) using the \texttt{Text File Writer} worker. The SDF of compound structures can be written out to a file, after tagging with UUID (\texttt{Tagged SDFile}), and also after any curation step (\texttt{Curated SDF}) using the \texttt{SDFile Writer} worker. This workflow is available for free download at \url{http://www.myexperiment.org/workflows/2120.html}
}
\label{fig:Curation and signature descriptor generation}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.5]{pics/ScoringActivity.jpg}
\caption{NP-likeness scoring workflow: This workflow takes as input the atom signature files generated from the user-defined natural products library (\texttt{NP file}) as well as synthetics (\texttt{SM file}) and compound libraries (\texttt{Query file}) and scores the compound libraries (\texttt{Query file}) for NP-likeness. The higher the score, the greater the NP-likeness of a molecule. The \texttt{Query fragments scorer} worker generates a score for each compound in the \texttt{Query file} tagged with the corresponding UUID of the compound. Pairs of compound UUID and score are written out to a text file (\texttt{Score file}) which can also be passed to the \texttt{Plot Distribution As PDF} worker to see the distribution of the score density of the complete query dataset. The \texttt{Query fragments scorer} worker also regenerates the structure for every atom signature and tags it with its corresponding fragment score and the UUID of the compound to which it belongs. These fragment structures with scores are written out to an SDF file (\texttt{Fragments SDF}), as they are helpful in identifying fragments with high NP-likeness. This workflow is available for free download at \url{http://www.myexperiment.org/workflows/2121.html}}
\label{fig:ScoringActivity}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.4]{pics/gaalgorithm.jpg}
\caption{Genetic algorithm for selection of an optimum reduced set of input vector components: The algorithm starts with a random population in which each chromosome consists of a random distribution of enabled/disabled (on/off) input vector components denoted $A_1$ to $A_n$ (where the number of components with "on" status remains fixed during evolution). This distribution is changed by mutation and cross-over. The fitness of each chromosome is evaluated by the inverse square RMSE. The selection process for each generation is performed by Roulette wheel selection where chromosomes are inherited with probabilities that correspond to their particular fitness.}
\label{fig:GASelection}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.4]{pics/looattrevaluation.jpg}
\caption{"Leave-One-Out" analysis to estimate the significance of input vector components: The root mean square error (RMSE) rises with an increasing number of discarded components (i.e. a decreasing number of input vector components used for the machine learning procedure). The relative RMSE shift from step to step may be correlated with the significance of the discarded component. In this case it is shown that the first fifty components do only have a negligible influence on the machine learning result and thus may be excluded from further analysis.}
\label{fig:LeaveOneOutResults}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.5]{pics/looattrworkflow.jpg}
\caption{Workflow for "Leave-One-Out" analysis : First a regression dataset is generated from a CSV file with UUID and molecular descriptor input data for each molecule (\texttt{IN QSAR}) and a CSV file containing the UUID of the molecule and the corresponding output (regression) value (\texttt{IN RTID}). Then the \texttt{Leave-One-Out Attribute Selection} worker evaluates the significance of the input components and generates a dataset for each evaluation step. Afterwards the composed datasets are coded as XRFF files. A CSV file with the sequence of discarded input vector components is generated. In addition the results are visualised with a PDF output file. Instead of the \texttt{Leave-One-Out Attribute Selection} worker a \texttt{GA Attribute Selection} worker may be used to determine a minimum molecular descriptor subset with maximum predictability. The results are stored in the output folder determined by the \texttt{OUT} input port.}
\label{fig:LeaveOneOutWorkflow}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.5]{pics/splitworkflow.jpg}
\caption{Partitioning into training and test set: A regression dataset is split into a training and a test set which is performed by the \texttt{Split Dataset Into Train-/Testset}. Then a regression model is created by the \texttt{Weka Regression} worker and evaluated by the \texttt{Evaluate Regression Results as PDF} which stores the results in a PDF file. The dataset is read from a XRFF file \texttt{(IN XRFF)}. The generated test and training sets are coded as XRFF files and stored on hard disk. The \texttt{OUT} input port determines the result output folder.}
\label{fig:RegressionSplitWorkflow}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.75]{pics/regressionUI.jpg}
\caption{Configuration panel for the \texttt{Weka Regression} worker: The configuration for a three-layer perceptron neural networks is selected. Each machine learning method consists of a parameter panel for individual configuration.}
\label{fig:RegressionConfUI}
\end{figure*}
\begin{figure*}[hbt]
\centering
\includegraphics[angle=0,clip=false,scale=0.4]{pics/revisresults.jpg}
\caption{Diagrams for machine learning results: (upper left) Scatter plot with experimental versus predicted output values. (upper right) Residuals plot with differences between the predicted and experimental output values. (lower left) Experimental output data are plotted over corresponding sorted predicted output data. (lower right) Characteristic quantities of the predicted model.}
\label{fig:RegressionVisualisation}
\end{figure*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%
%% Tables %%
%% %%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Use of \listoftables is discouraged.
%%
% New column type
\FloatBarrier
\newcolumntype{C}[1]{>{\centering}p{#1}}
\renewcommand{\arraystretch}{1.2}
\section*{Tables}
\subsection*{Table 1 - CDK-Taverna 2.0 workers}
Overview of CDK-Taverna 2.0 workers categorized by their function. \par \mbox{}
\par
\begin{table}[hbt]
\mbox{
\begin{tabular}{|C{0.34\textwidth}|C{0.15\textwidth}|C{0.42\textwidth}|}\hline
\textbf{Function} & \textbf{\# workers} & \textbf{Examples} \tabularnewline \hline
File I/O & 18 & \texttt{SDFReader, SmilesReader} \tabularnewline \hline
Iterative File I/O & 8 & \texttt{IterativeSDFileReader, LoopSDFileReaderActivity} \tabularnewline \hline
String Converter & 10 & \texttt{CMLStringToStructureConverter} \tabularnewline \hline
Molecular descriptor calculation & 99 & \texttt{AtomCount, LargestChain, WienerIndex} \tabularnewline \hline
Machine learning & 30 & \texttt{kMeans, Perceptron, SVM} \tabularnewline \hline
Miscellaneous & 27 & \texttt{JChemPaint, ReactionEnumerator} \tabularnewline \hline
\end{tabular}
}
\end{table}
\subsection*{Table 2 - Overview on multi-threading CDK-Taverna 2.0 workers}
Overview of CDK-Taverna 2.0 workers which are capable of using multiple threads for their calculations. \par \mbox{}
\par
\begin{table}[hbt]
\mbox{
\begin{tabular}{|C{0.53\textwidth}|C{0.40\textwidth}|}\hline
\textbf{Function} & \textbf{Worker} \tabularnewline \hline
Calculation of molecular descriptors & \texttt{QSAR Descriptor Threaded} \tabularnewline \hline
Significance of input components evaluation using a genetic algorithm & \texttt{GA Attribute Selection} \tabularnewline \hline
Significance of input components evaluation using a `Leave-One-Out' strategy & \texttt{Leave-One-Out Attribute Selection} \tabularnewline \hline
Partitioning datasets into training and test sets & \texttt{Split Dataset Into Train-/Testset} \tabularnewline \hline
Construction of clustering models & \texttt{Weka Clustering} \tabularnewline \hline
Construction of regression models & \texttt{Weka Regression} \tabularnewline \hline
Construction of classification models & \texttt{Weka Classification} \tabularnewline \hline
\end{tabular}
\label{tab:ThreadedWorker}
}
\end{table}
\end{bmcformat}
\end{document}

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks