@Article{ SchweikertZZBDOPDHBKSR2009, title = {mGene: Accurate SVM-based gene finding with an application to nematode genomes}, journal = {Genome Research}, year = {2009}, month = {11}, volume = {19}, number = {11}, pages = {2133-2143}, abstract = {We present a highly accurate gene-prediction system for eukaryotic genomes, called mGene. It combines in an unprecedented manner the flexibility of generalized hidden Markov models (gHMMs) with the predictive power of modern machine learning methods, such as Support Vector Machines (SVMs). Its excellent performance was proved in an objective competition based on the genome of the nematode Caenorhabditis elegans. Considering the average of sensitivity and specificity, the developmental version of mGene exhibited the best prediction performance on nucleotide, exon, and transcript level for ab initio and multiple-genome gene-prediction tasks. The fully developed version shows superior performance in 10 out of 12 evaluation criteria compared with the other participating gene finders, including Fgenesh++ and Augustus. An in-depth analysis of mGene's genome-wide predictions revealed that ≈2200 predicted genes were not contained in the current genome annotation. Testing a subset of 57 of these genes by RT-PCR and sequencing, we confirmed expression for 24 (42%) of them. mGene missed 300 annotated genes, out of which 205 were unconfirmed. RT-PCR testing of 24 of these genes resulted in a success rate of merely 8%. These findings suggest that even the gene catalog of a well-studied organism such as C. elegans can be substantially improved by mGene's predictions. We also provide gene predictions for the four nematodes C. briggsae, C. brenneri, C. japonica, and C. remanei. Comparing the resulting proteomes among these organisms and to the known protein universe, we identified many species-specific gene inventions. 
In a quality assessment of several available annotations for these genomes, we find that mGene's predictions are most accurate.}, web_url = {http://genome.cshlp.org/content/19/11/2133.full.pdf+html}, state = {published}, DOI = {10.1101/gr.090597.108}, author = {Schweikert G{schweike}{Department Empirical Inference}; Zien A{zien}{Department Empirical Inference}; Zeller G{zeller}; Behr J{jonas}; Dieterich C{chrisd}; Ong CS{ong}{Department Empirical Inference}; Philips P{philips}; De Bona F{fabio}; Hartmann L{hartmann}; Bohlen A{bohlen}; Kr\"uger N; Sonnenburg S{sonne}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Article{ SchweikertBZZOSR2009, title = {mGene.web: a web service for accurate computational gene finding}, journal = {Nucleic Acids Research}, year = {2009}, month = {7}, volume = {37}, number = {Supplement 2}, pages = {W312-W316}, abstract = {We describe mGene.web, a web service for the genome-wide prediction of protein coding genes from eukaryotic DNA sequences. It offers pre-trained models for the recognition of gene structures including untranslated regions in an increasing number of organisms. With mGene.web, users have the additional possibility to train the system with their own data for other organisms on the push of a button, a functionality that will greatly accelerate the annotation of newly sequenced genomes. The system is built in a highly modular way, such that individual components of the framework, like the promoter prediction tool or the splice site predictor, can be used autonomously. The underlying gene finding system mGene is based on discriminative machine learning techniques and its high accuracy has been demonstrated in an international competition on nematode genomes. 
mGene.web is available at http://www.mgene.org/web, it is free of charge and can be used for eukaryotic genomes of small to moderate size (several hundred Mbp).}, web_url = {http://nar.oxfordjournals.org/content/37/suppl_2/W312.full.pdf+html}, state = {published}, DOI = {10.1093/nar/gkp479}, author = {Schweikert G{schweike}{Department Empirical Inference}; Behr J{jonas}; Zien A{zien}{Department Empirical Inference}; Zeller G{zeller}; Ong CS{ong}{Department Empirical Inference}; Sonnenburg S{sonne}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Article{ 5586, title = {Support Vector Machines and Kernels for Computational Biology}, journal = {PLoS Computational Biology}, year = {2008}, month = {10}, volume = {4}, number = {10}, pages = {1-10}, file_url = {/fileadmin/user_upload/files/publications/benhur08svm-tutorial_[0].pdf}, web_url = {http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info%3Adoi%2F10.1371%2Fjournal.pcbi.1000173&representation=PDF}, state = {published}, DOI = {10.1371/journal.pcbi.1000173}, EPUB = {e1000173}, author = {Ben-Hur A; Ong CS{ong}{Department Empirical Inference}; Sonnenburg S{sonne}; Sch\"olkopf B{bs}{Department Empirical Inference}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Article{ SonnenburgZPR2008, title = {POIMs: positional oligomer importance matrices - understanding support vector machine-based signal detectors}, journal = {Bioinformatics}, year = {2008}, month = {7}, volume = {24}, number = {13}, pages = {i6-i14}, abstract = {Motivation: At the heart of many important bioinformatics problems, such as gene finding and function prediction, is the classification of biological sequences. Frequently the most accurate classifiers are obtained by training support vector machines (SVMs) with complex sequence kernels. However, a cumbersome shortcoming of SVMs is that their learned decision rules are very hard to understand for humans and cannot easily be related to biological facts. 
Results: To make SVM-based sequence classifiers more accessible and profitable, we introduce the concept of positional oligomer importance matrices (POIMs) and propose an efficient algorithm for their computation. In contrast to the raw SVM feature weighting, POIMs take the underlying correlation structure of k-mer features induced by overlaps of related k-mers into account. POIMs can be seen as a powerful generalization of sequence logos: they allow to capture and visualize sequence patterns that are relevant for the investigated biological phenomena.}, web_url = {http://bioinformatics.oxfordjournals.org/content/24/13/i6.full.pdf+html}, state = {published}, DOI = {10.1093/bioinformatics/btn170}, author = {Sonnenburg S{sonne}; Zien A{zien}{Department Empirical Inference}; Philips P{philips}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Article{ 4809, title = {Accurate Splice site Prediction Using Support Vector Machines}, journal = {BMC Bioinformatics}, year = {2007}, month = {12}, volume = {8}, number = {Supplement 10}, pages = {1-16}, abstract = {Background: For splice site recognition, one has to solve two classification problems: discriminating true from decoy splice sites for both acceptor and donor sites. Gene finding systems typically rely on Markov Chains to solve these tasks. Results: In this work we consider Support Vector Machines for splice site recognition. We employ the so-called weighted degree kernel which turns out well suited for this task, as we will illustrate in several experiments where we compare its prediction accuracy with that of recently proposed systems. We apply our method to the genome-wide recognition of splice sites in Caenorhabditis elegans, Drosophila melanogaster, Arabidopsis thaliana, Danio rerio, and Homo sapiens. 
Our performance estimates indicate that splice sites can be recognized very accurately in these genomes and that our method outperforms many other methods including Markov Chains, GeneSplicer and SpliceMachine. We provide genome-wide predictions of splice sites and a stand-alone prediction tool ready to be used for incorporation in a gene finder. Availability: Data, splits, additional information on the model selection, the whole genome predictions, as well as the stand-alone prediction tool are available for download at http://www.fml.mpg.de/raetsch/projects/splice.}, web_url = {http://www.biomedcentral.com/content/pdf/1471-2105-8-S10-S7.pdf}, state = {published}, DOI = {10.1186/1471-2105-8-S10-S7}, author = {Sonnenburg S{sonne}; Schweikert G{schweike}{Department Empirical Inference}; Philips P; Behr J; R\"atsch G{raetsch}{Department Empirical Inference}} } @Article{ 4768, title = {The Need for Open Source Software in Machine Learning}, journal = {Journal of Machine Learning Research}, year = {2007}, month = {10}, volume = {8}, pages = {2443-2466}, abstract = {Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not realized, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. 
We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.}, file_url = {/fileadmin/user_upload/files/publications/JMLR-8-Sonnenburg_4768[0].pdf}, web_url = {http://jmlr.csail.mit.edu/papers/v8/sonnenburg07a.html}, state = {published}, author = {Sonnenburg S{sonne}; Braun ML; Ong CS{ong}{Department Empirical Inference}; Bengio S; Bottou L; Holmes G; LeCun Y; M\"uller K-R{klaus}{Department Empirical Inference}; Pereira F; Rasmussen CE{carl}{Department Empirical Inference}; R\"atsch G{raetsch}{Department Empirical Inference}; Sch\"olkopf B{bs}{Department Empirical Inference}; Smola A{smola}; Vincent P; Weston J{weston}{Department Empirical Inference}; Williamson RC{bwilliamson}{Department Empirical Inference}} } @Article{ 4378, title = {Improving the Caenorhabditis elegans Genome Annotation Using Machine Learning}, journal = {PLoS Computational Biology}, year = {2007}, month = {2}, volume = {3}, number = {2: e20}, pages = {0313-0322}, abstract = {For modern biology, precise genome annotations are of prime importance, as they allow the accurate definition of genic regions. We employ state-of-the-art machine learning methods to assay and improve the accuracy of the genome annotation of the nematode Caenorhabditis elegans. The proposed machine learning system is trained to recognize exons and introns on the unspliced mRNA, utilizing recent advances in support vector machines and label sequence learning. In 87% (coding and untranslated regions) and 95% (coding regions only) of all genes tested in several out-of-sample evaluations, our method correctly identified all exons and introns. Notably, only 37% and 50%, respectively, of the presently unconfirmed genes in the C. elegans genome annotation agree with our predictions, thus we hypothesize that a sizable fraction of those genes are not correctly annotated. 
A retrospective evaluation of the Wormbase WS120 annotation [1] of C. elegans reveals that splice form predictions on unconfirmed genes in WS120 are inaccurate in about 18% of the considered cases, while our predictions deviate from the truth only in 10%–13%. We experimentally analyzed 20 controversial genes on which our system and the annotation disagree, confirming the superiority of our predictions. While our method correctly predicted 75% of those cases, the standard annotation was never completely correct. The accuracy of our system is further corroborated by a comparison with two other recently proposed systems that can be used for splice form prediction: SNAP and ExonHunter. We conclude that the genome annotation of C. elegans and other organisms can be greatly enhanced using modern machine learning technology.}, web_url = {http://compbiol.plosjournals.org/archive/1553-7358/3/2/pdf/10.1371_journal.pcbi.0030020-S.pdf}, state = {published}, DOI = {10.1371/journal.pcbi.0030020}, author = {R\"atsch G{raetsch}{Department Empirical Inference}; Sonnenburg S{sonne}; Srinivasan J; Witte H; M\"uller K-R{kaim}; Sommer R-J; Sch\"olkopf B{bs}{Department Empirical Inference}} } @Article{ 3994, title = {Large Scale Multiple Kernel Learning}, journal = {Journal of Machine Learning Research}, year = {2006}, month = {7}, volume = {7}, pages = {1531-1565}, abstract = {While classical kernel-based learning algorithms are based on a single kernel, in practice it is often desirable to use multiple kernels. Lanckriet et al. (2004) considered conic combinations of kernel matrices for classification, leading to a convex quadratically constrained quadratic program. We show that it can be rewritten as a semi-infinite linear program that can be efficiently solved by recycling the standard SVM implementations. Moreover, we generalize the formulation and our method to a larger class of problems, including regression and one-class classification. 
Experimental results show that the proposed algorithm works for hundred thousands of examples or hundreds of kernels to be combined, and helps for automatic model selection, improving the interpretability of the learning result. In a second part we discuss general speed up mechanism for SVMs, especially when used with sparse feature maps as appear for string kernels, allowing us to train a string kernel SVM on a 10 million real-world splice data set from computational biology. We integrated multiple kernel learning in our machine learning toolbox SHOGUN for which the source code is publicly available at http://www.fml.tuebingen.mpg.de/raetsch/projects/shogun.}, web_url = {http://jmlr.csail.mit.edu/papers/volume7/sonnenburg06a/sonnenburg06a.pdf}, state = {published}, author = {Sonnenburg S{sonne}; R\"atsch G{raetsch}{Department Empirical Inference}; Sch\"afer C; Sch\"olkopf B{bs}{Department Empirical Inference}} } @Inproceedings{ 3960, title = {ARTS: Accurate Recognition of Transcription Starts in Human}, journal = {Bioinformatics}, year = {2006}, month = {7}, volume = {22}, number = {14}, pages = {e472-e480}, abstract = {Motivation: One of the most important features of genomic DNA are the protein-coding genes. While it is of great value to identify those genes and the encoded proteins, it is also crucial to understand how their transcription is regulated. To this end one has to identify the corresponding promoters and the contained transcription factor binding sites. TSS finders can be used to locate potential promoters. They may also be used in combination with other signal and content detectors to resolve entire gene structures. Results: We have developed a novel kernel based method - called ARTS - that accurately recognizes transcription start sites in human. The application of otherwise too computationally expensive Support Vector Machines was made possible due to the use of efficient training and evaluation techniques using suffix tries. 
In a carefully designed experimental study, we compare our TSS finder to state-of-the-art methods from the literature: McPromoter, Eponine and FirstEF. For given false positive rates within a reasonable range, we consistently achieve considerably higher true positive rates. For instance, ARTS finds about 24% true positives at a false positive rate of 1/1000, where the other methods find less than half (10.5%). Availability: Datasets, model selection results, whole genome predictions, and additional experimental results are available at http://www.fml.tuebingen.mpg.de/raetsch/projects/arts}, web_url = {http://www2.fml.tuebingen.mpg.de/raetsch/projects/arts}, event_name = {14th International Conference on Intelligent Systems for Molecular Biology (ISMB 2006)}, event_place = {Fortaleza, Brazil}, state = {published}, DOI = {10.1093/bioinformatics/btl250}, author = {Sonnenburg S{sonne}; Zien A{zien}{Department Empirical Inference}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Inproceedings{ 3497, title = {RASE: recognition of alternatively spliced exons in C.elegans}, journal = {Bioinformatics}, year = {2005}, month = {6}, volume = {21}, number = {Supplement 1}, pages = {i369-i377}, abstract = {Motivation: Eukaryotic pre-mRNAs are spliced to form mature mRNA. Pre-mRNA alternative splicing greatly increases the complexity of gene expression. Estimates show that more than half of the human genes and at least one-third of the genes of less complex organisms, such as nematodes or flies, are alternatively spliced. In this work, we consider one major form of alternative splicing, namely the exclusion of exons from the transcript. It has been shown that alternatively spliced exons have certain properties that distinguish them from constitutively spliced exons. Although most recent computational studies on alternative splicing apply only to exons which are conserved among two species, our method only uses information that is available to the splicing machinery, i.e. 
the DNA sequence itself. We employ advanced machine learning techniques in order to answer the following two questions: (1) Is a certain exon alternatively spliced? (2) How can we identify yet unidentified exons within known introns? Results: We designed a support vector machine (SVM) kernel well suited for the task of classifying sequences with motifs having positional preferences. In order to solve the task (1), we combine the kernel with additional local sequence information, such as lengths of the exon and the flanking introns. The resulting SVM-based classifier achieves a true positive rate of 48.5% at a false positive rate of 1%. By scanning over single EST confirmed exons we identified 215 potential alternatively spliced exons. For 10 randomly selected such exons we successfully performed biological verification experiments and confirmed three novel alternatively spliced exons. To answer question (2), we additionally used SVM-based predictions to recognize acceptor and donor splice sites. Combined with the above mentioned features we were able to identify 85.2% of skipped exons within known introns at a false positive rate of 1%.}, file_url = {/fileadmin/user_upload/files/publications/pdf3497.pdf}, web_url = {http://bioinformatics.oxfordjournals.org/content/21/suppl_1/i369.full.pdf+html}, event_name = {Thirteenth International Conference on Intelligent Systems for Molecular Biology (ISMB 2005)}, event_place = {Detroit, MI, USA}, state = {published}, DOI = {10.1093/bioinformatics/bti1053}, author = {R\"atsch G{raetsch}{Department Empirical Inference}; Sonnenburg S{sonne}; Sch\"olkopf B{bs}{Department Empirical Inference}} } @Poster{ RatschSSMSS2006, title = {Splice Form Prediction using Machine Learning}, year = {2006}, month = {8}, abstract = {Accurate ab initio gene finding is still a major challenge in computational biology. We employ cutting edge machine learning similar to Hidden-Markov-SVMs to assay and improve the accuracy of genome annotations. 
We applied our system on the C. elegans genome and were able to drastically improve its annotation.}, web_url = {http://www.iscb.org/cms_addon/conferences/ismb2006/archive/ismb2006.cbi.cnptia.embrapa.br/posters_list.php}, event_name = {14th International Conference on Intelligent Systems for Molecular Biology (ISMB 2006)}, event_place = {Fortaleza, Brazil}, state = {published}, author = {R\"atsch G{raetsch}{Department Empirical Inference}; Sonnenburg S{sonne}; Srinivasan J{jagan}; M\"uller K-R{klaus}{Department Empirical Inference}; Sommer R; Sch\"olkopf B{bs}{Department Empirical Inference}} } @Conference{ 5403, title = {mGene: A Novel Discriminative Gene Finder}, year = {2008}, month = {7}, day = {25}, web_url = {http://blog.wormbase.org/2008/03/12/2008-topics-meeting-worm-genomics-and-systems-biology/}, event_name = {2008 Topics Meeting: Worm Genomics and Systems Biology}, event_place = {Cambridge, USA}, state = {published}, author = {Schweikert G{schweike}{Department Empirical Inference}; Zeller G; Zien A{zien}{Department Empirical Inference}; Behr J; Sonnenburg S{sonne}; Philips P; Ong CS{ong}{Department Empirical Inference}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Conference{ SchweikertZBZOdSPRW2008, title = {mGene: A Novel Discriminative Gene Finding System}, year = {2008}, month = {7}, day = {18}, pages = {20}, abstract = {The acceleration of genome sequencing has put further emphasis on the need for accurate computational gene finders. We present our improved system, mGene, which combines state-of-the-art structure prediction algorithms with SVM classifiers. 
As it performed excellent in the nGASP challenge, it was recently employed to annotate new nematode genomes.}, web_url = {http://www.biomedcentral.com/content/pdf/1471-2105-9-S10-info.pdf}, event_name = {4th ISCB Student Council Symposium at ISMB 2008}, event_place = {Toronto, Canada}, state = {published}, author = {Schweikert G{schweike}{Department Empirical Inference}; Zeller G{zeller}; Behr J{jonas}; Zien A{zien}{Department Empirical Inference}; Ong CS{ong}{Department Empirical Inference}; De Bona F{fabio}; Sonnenburg S{sonne}; Philips P{philips}; R\"atsch G{raetsch}{Department Empirical Inference}; Widmer C{cwidmer}} } @Conference{ 5033, title = {Positional Oligomer Importance Matrices}, year = {2007}, month = {12}, day = {8}, abstract = {At the heart of many important bioinformatics problems, such as gene finding and function prediction, is the classification of biological sequences, above all of DNA and proteins. In many cases, the most accurate classifiers are obtained by training SVMs with complex sequence kernels, for instance for transcription starts or splice sites. However, an often criticized downside of SVMs with complex kernels is that it is very hard for humans to understand the learned decision rules and to derive biological insights from them. To close this gap, we introduce the concept of positional oligomer importance matrices (POIMs) and develop an efficient algorithm for their computation. We demonstrate how they overcome the limitations of sequence logos, and how they can be used to find relevant motifs for different biological phenomena in a straight-forward way. 
Note that the concept of POIMs is not limited to interpreting SVMs, but is applicable to general k-mer based scoring systems.}, web_url = {http://www.mlcb.org/previous/MLCB2007/program/abstracts}, event_name = {NIPS 2007 Workshop on Machine Learning in Computational Biology (MLCB 2007)}, event_place = {Whistler, BC, Canada}, state = {published}, author = {Sonnenburg S{sonne}; Zien A{zien}{Department Empirical Inference}; Philips P{philips}; R\"atsch G{raetsch}{Department Empirical Inference}} } @Conference{ 4237, title = {Ab-initio gene finding using machine learning}, year = {2006}, month = {12}, day = {8}, web_url = {http://www.mlcb.org/previous/MLCB2006/schedule06}, event_name = {NIPS 2006 Workshop on New Problems and Methods in Computational Biology (MLCB 2006)}, event_place = {Whistler, BC, Canada}, state = {published}, author = {Schweikert G{schweike}{Department Empirical Inference}; Zeller G; Zien A{zien}{Department Empirical Inference}; Ong CS{ong}{Department Empirical Inference}; De Bona F; Sonnenburg S{sonne}; Philips P; R\"atsch G{raetsch}{Department Empirical Inference}} } @Conference{ RatschSOS2005, title = {Accurate prediction of alternative splicing events}, year = {2005}, month = {12}, day = {9}, web_url = {http://raetschlab.org:10080/nipscompbio/previous/MLCB2005/program.pdf}, event_name = {NIPS Workshop on Computational Biology and the Analysis of Heterogeneous Data (MLCB 2005)}, event_place = {Whistler, BC, Canada}, state = {published}, author = {R\"atsch G{raetsch}{Department Empirical Inference}; Sonnenburg S{sonne}; Ong CS{ong}{Department Empirical Inference}; Sch\"olkopf B{bs}{Department Empirical Inference}} }