% Publication records in BibTeX syntax, keyed by numeric record ID.
%
% Field conventions used in the entries below:
%   author            standard BibTeX name list: "Last, Initials" joined by " and ".
%   author_annotated  the author string exactly as it was originally stored. The
%                     brace groups that follow some names look like database
%                     user IDs and department tags (assumption, to be confirmed
%                     with whoever maintains the exporting database).
%   eid               electronic article locator, split out of "number" so that
%                     "number" holds only the issue designation.
%   booktitle         on Conference entries, a copy of event_name (standard
%                     styles require booktitle for this entry type).
%   file_url, web_url, state, day, event_name, event_place
%                     non-standard fields; standard styles ignore them.
%
% Abstracts are kept on one line each; only LaTeX-unsafe characters were changed.

@Article{5586,
  author           = {Ben-Hur, A. and Ong, C. S. and Sonnenburg, S. and Sch{\"o}lkopf, B. and R{\"a}tsch, G.},
  title            = {Support Vector Machines and Kernels for Computational Biology},
  journal          = {PLoS Computational Biology},
  year             = {2008},
  month            = oct,
  volume           = {4},
  number           = {10},
  eid              = {e1000173},
  pages            = {1--10},
  file_url         = {/fileadmin/user_upload/files/publications/benhur08svm-tutorial_[0].pdf},
  web_url          = {http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info%3Adoi%2F10.1371%2Fjournal.pcbi.1000173&representation=PDF},
  state            = {published},
  DOI              = {10.1371/journal.pcbi.1000173},
  author_annotated = {Ben-Hur A, Ong CS{ong}{Department Empirical Inference}, Sonnenburg S{sonne}, Sch\"olkopf B{bs}{Department Empirical Inference} and R\"atsch G{raetsch}{Department Empirical Inference}},
}

@Article{4809,
  author           = {Sonnenburg, S. and Schweikert, G. and Philips, P. and Behr, J. and R{\"a}tsch, G.},
  title            = {Accurate Splice site Prediction Using Support Vector Machines},
  journal          = {BMC Bioinformatics},
  year             = {2007},
  month            = dec,
  volume           = {8},
  number           = {Supplement 10},
  pages            = {1--16},
  abstract         = {Background: For splice site recognition, one has to solve two classification problems: discriminating true from decoy splice sites for both acceptor and donor sites. Gene finding systems typically rely on Markov Chains to solve these tasks. Results: In this work we consider Support Vector Machines for splice site recognition. We employ the so-called weighted degree kernel which turns out well suited for this task, as we will illustrate in several experiments where we compare its prediction accuracy with that of recently proposed systems. We apply our method to the genome-wide recognition of splice sites in Caenorhabditis elegans, Drosophila melanogaster, Arabidopsis thaliana, Danio rerio, and Homo sapiens. Our performance estimates indicate that splice sites can be recognized very accurately in these genomes and that our method outperforms many other methods including Markov Chains, GeneSplicer and SpliceMachine. We provide genome-wide predictions of splice sites and a stand-alone prediction tool ready to be used for incorporation in a gene finder. Availability: Data, splits, additional information on the model selection, the whole genome predictions, as well as the stand-alone prediction tool are available for download at http://www.fml.mpg.de/raetsch/projects/splice.},
  web_url          = {http://www.biomedcentral.com/content/pdf/1471-2105-8-S10-S7.pdf},
  state            = {published},
  DOI              = {10.1186/1471-2105-8-S10-S7},
  author_annotated = {Sonnenburg S{sonne}, Schweikert G{schweike}{Department Empirical Inference}, Philips P, Behr J and R\"atsch G{raetsch}{Department Empirical Inference}},
}

@Article{4768,
  author           = {Sonnenburg, S. and Braun, M. L. and Ong, C. S. and Bengio, S. and Bottou, L. and Holmes, G. and LeCun, Y. and M{\"u}ller, K.-R. and Pereira, F. and Rasmussen, C. E. and R{\"a}tsch, G. and Sch{\"o}lkopf, B. and Smola, A. and Vincent, P. and Weston, J. and Williamson, R. C.},
  title            = {The Need for Open Source Software in Machine Learning},
  journal          = {Journal of Machine Learning Research},
  year             = {2007},
  month            = oct,
  volume           = {8},
  pages            = {2443--2466},
  abstract         = {Open source tools have recently reached a level of maturity which makes them suitable for building large-scale real-world systems. At the same time, the field of machine learning has developed a large body of powerful learning algorithms for diverse applications. However, the true potential of these methods is not realized, since existing implementations are not openly shared, resulting in software with low usability, and weak interoperability. We argue that this situation can be significantly improved by increasing incentives for researchers to publish their software under an open source model. Additionally, we outline the problems authors are faced with when trying to publish algorithmic implementations of machine learning methods. We believe that a resource of peer reviewed software accompanied by short articles would be highly valuable to both the machine learning and the general scientific community.},
  file_url         = {/fileadmin/user_upload/files/publications/JMLR-8-Sonnenburg_4768[0].pdf},
  web_url          = {http://jmlr.csail.mit.edu/papers/v8/sonnenburg07a.html},
  state            = {published},
  author_annotated = {Sonnenburg S{sonne}, Braun ML, Ong CS{ong}{Department Empirical Inference}, Bengio S, Bottou L, Holmes G, LeCun Y, M\"uller K-R{klaus}{Department Empirical Inference}, Pereira F, Rasmussen CE{carl}{Department Empirical Inference}, R\"atsch G{raetsch}{Department Empirical Inference}, Sch\"olkopf B{bs}{Department Empirical Inference}, Smola A, Vincent P, Weston J{weston}{Department Empirical Inference} and Williamson RC},
}

@Article{4378,
  author           = {R{\"a}tsch, G. and Sonnenburg, S. and Srinivasan, J. and Witte, H. and M{\"u}ller, K.-R. and Sommer, R.-J. and Sch{\"o}lkopf, B.},
  title            = {Improving the {Caenorhabditis} elegans Genome Annotation Using Machine Learning},
  journal          = {PLoS Computational Biology},
  year             = {2007},
  month            = feb,
  volume           = {3},
  number           = {2},
  eid              = {e20},
  pages            = {0313--0322},
  web_url          = {http://compbiol.plosjournals.org/archive/1553-7358/3/2/pdf/10.1371_journal.pcbi.0030020-S.pdf},
  state            = {published},
  DOI              = {10.1371/journal.pcbi.0030020},
  author_annotated = {R\"atsch G{raetsch}{Department Empirical Inference}, Sonnenburg S{sonne}, Srinivasan J, Witte H, M\"uller K-R{kaim}, Sommer R-J and Sch\"olkopf B{bs}{Department Empirical Inference}},
}

@Article{3960,
  author           = {Sonnenburg, S. and Zien, A. and R{\"a}tsch, G.},
  title            = {{ARTS}: Accurate Recognition of Transcription Starts in Human},
  journal          = {Bioinformatics},
  year             = {2006},
  month            = jul,
  volume           = {22},
  number           = {14},
  pages            = {e472--e480},
  abstract         = {Motivation: One of the most important features of genomic DNA are the protein-coding genes. While it is of great value to identify those genes and the encoded proteins, it is also crucial to understand how their transcription is regulated. To this end one has to identify the corresponding promoters and the contained transcription factor binding sites. TSS finders can be used to locate potential promoters. They may also be used in combination with other signal and content detectors to resolve entire gene structures. Results: We have developed a novel kernel based method - called ARTS - that accurately recognizes transcription start sites in human. The application of otherwise too computationally expensive Support Vector Machines was made possible due to the use of efficient training and evaluation techniques using suffix tries. In a carefully designed experimental study, we compare our TSS finder to state-of-the-art methods from the literature: McPromoter, Eponine and FirstEF. For given false positive rates within a reasonable range, we consistently achieve considerably higher true positive rates. For instance, ARTS finds about 24\% true positives at a false positive rate of 1/1000, where the other methods find less than half (10.5\%). Availability: Datasets, model selection results, whole genome predictions, and additional experimental results are available at http://www.fml.tuebingen.mpg.de/raetsch/projects/arts},
  web_url          = {http://www2.fml.tuebingen.mpg.de/raetsch/projects/arts},
  state            = {published},
  DOI              = {10.1093/bioinformatics/btl250},
  author_annotated = {Sonnenburg S{sonne}, Zien A{zien}{Department Empirical Inference} and R\"atsch G{raetsch}{Department Empirical Inference}},
}

@Article{3994,
  author           = {Sonnenburg, S. and R{\"a}tsch, G. and Sch{\"a}fer, C. and Sch{\"o}lkopf, B.},
  title            = {Large Scale Multiple Kernel Learning},
  journal          = {Journal of Machine Learning Research},
  year             = {2006},
  month            = jul,
  volume           = {7},
  pages            = {1531--1565},
  abstract         = {While classical kernel-based learning algorithms are based on a single kernel, in practice it is often desirable to use multiple kernels. Lanckriet et al. (2004) considered conic combinations of kernel matrices for classification, leading to a convex quadratically constrained quadratic program. We show that it can be rewritten as a semi-infinite linear program that can be efficiently solved by recycling the standard SVM implementations. Moreover, we generalize the formulation and our method to a larger class of problems, including regression and one-class classification. Experimental results show that the proposed algorithm works for hundred thousands of examples or hundreds of kernels to be combined, and helps for automatic model selection, improving the interpretability of the learning result. In a second part we discuss general speed up mechanism for SVMs, especially when used with sparse feature maps as appear for string kernels, allowing us to train a string kernel SVM on a 10 million real-world splice data set from computational biology. We integrated multiple kernel learning in our machine learning toolbox SHOGUN for which the source code is publicly available at http://www.fml.tuebingen.mpg.de/raetsch/projects/shogun.},
  web_url          = {http://jmlr.csail.mit.edu/papers/volume7/sonnenburg06a/sonnenburg06a.pdf},
  state            = {published},
  author_annotated = {Sonnenburg S{sonne}, R\"atsch G{raetsch}{Department Empirical Inference}, Sch\"afer C and Sch\"olkopf B{bs}{Department Empirical Inference}},
}

@Article{3497,
  author           = {R{\"a}tsch, G. and Sonnenburg, S. and Sch{\"o}lkopf, B.},
  title            = {{RASE}: recognition of alternatively spliced exons in {C. elegans}},
  journal          = {Bioinformatics},
  year             = {2005},
  month            = jun,
  volume           = {21},
  number           = {Suppl. 1},
  pages            = {i369--i377},
  file_url         = {/fileadmin/user_upload/files/publications/pdf3497.pdf},
  web_url          = {http://bioinformatics.oxfordjournals.org/cgi/reprint/21/suppl_1/i369},
  state            = {published},
  DOI              = {10.1093/bioinformatics/bti1053},
  author_annotated = {R\"atsch G{raetsch}{Department Empirical Inference}, Sonnenburg S{sonne} and Sch\"olkopf B{bs}{Department Empirical Inference}},
}

@Conference{5403,
  author           = {Schweikert, G. and Zeller, G. and Zien, A. and Behr, J. and Sonnenburg, S. and Philips, P. and Ong, C. S. and R{\"a}tsch, G.},
  title            = {{mGene}: A Novel Discriminative Gene Finder},
  booktitle        = {Worm Genomics and Systems Biology meeting},
  year             = {2008},
  month            = jul,
  day              = {25},
  event_name       = {Worm Genomics and Systems Biology meeting},
  event_place      = {Cambridge, USA},
  state            = {published},
  author_annotated = {Schweikert G{schweike}{Department Empirical Inference}, Zeller G, Zien A{zien}{Department Empirical Inference}, Behr J, Sonnenburg S{sonne}, Philips P, Ong CS{ong}{Department Empirical Inference} and R\"atsch G{raetsch}{Department Empirical Inference}},
}

@Conference{5033,
  author           = {Sonnenburg, S. and Zien, A. and Philips, P. and R{\"a}tsch, G.},
  title            = {Positional Oligomer Importance Matrices},
  booktitle        = {{NIPS} 2007 Workshop on Machine Learning in Computational Biology},
  year             = {2007},
  month            = dec,
  abstract         = {At the heart of many important bioinformatics problems, such as gene finding and function prediction, is the classification of biological sequences, above all of DNA and proteins. In many cases, the most accurate classifiers are obtained by training SVMs with complex sequence kernels, for instance for transcription starts or splice sites. However, an often criticized downside of SVMs with complex kernels is that it is very hard for humans to understand the learned decision rules and to derive biological insights from them. To close this gap, we introduce the concept of positional oligomer importance matrices (POIMs) and develop an efficient algorithm for their computation. We demonstrate how they overcome the limitations of sequence logos, and how they can be used to find relevant motifs for different biological phenomena in a straight-forward way. Note that the concept of POIMs is not limited to interpreting SVMs, but is applicable to general k-mer based scoring systems.},
  web_url          = {http://nips.cc/Conferences/2007/Program/schedule.php?Session=Workshops},
  event_name       = {NIPS 2007 Workshop on Machine Learning in Computational Biology},
  event_place      = {Whistler, BC, Canada},
  state            = {published},
  author_annotated = {Sonnenburg S{sonne}, Zien A{zien}{Department Empirical Inference}, Philips P and R\"atsch G{raetsch}{Department Empirical Inference}},
}

@Conference{4237,
  author           = {Schweikert, G. and Zeller, G. and Zien, A. and Ong, C. S. and de Bona, F. and Sonnenburg, S. and Philips, P. and R{\"a}tsch, G.},
  title            = {Ab-initio gene finding using machine learning},
  booktitle        = {{NIPS} 2006 Workshop on New Problems and Methods in Computational Biology},
  year             = {2006},
  month            = dec,
  web_url          = {http://nips.cc/Conferences/2006/Program/event.php?ID=528},
  event_name       = {NIPS 2006 Workshop on New Problems and Methods in Computational Biology},
  event_place      = {Vancouver, BC, Canada},
  state            = {published},
  author_annotated = {Schweikert G{schweike}{Department Empirical Inference}, Zeller G, Zien A{zien}{Department Empirical Inference}, Ong CS{ong}{Department Empirical Inference}, de Bona F, Sonnenburg S{sonne}, Phillips P and R\"atsch G{raetsch}{Department Empirical Inference}},
}