% Bibliography entries for long-read sequencing / genome assembly papers.
% Conventions used below:
%   - citation keys are the numeric ids from the original export and are
%     kept unchanged so existing citations keep resolving;
%   - month uses the standard three-letter macros (unquoted);
%   - page ranges use a double hyphen;
%   - titles carry no trailing period (the bibliography style adds punctuation);
%   - journal names are kept in the abbreviated form used by the export.
% NOTE(review): the keywords fields are comma-separated, but several headings
% themselves contain a comma (e.g. "Genome, Human"); tools that split on commas
% will mis-split them. Left byte-identical here -- confirm how keywords are consumed.

% Garg et al. 2021 -- DipAsm, chromosome-scale haplotype-resolved assembly.
@article{124,
  author   = {Garg, Shilpa and Fungtammasan, Arkarachai and Carroll, Andrew
              and Chou, Mike and Schmitt, Anthony and Zhou, Xiang
              and Mac, Stephen and Peluso, Paul and Hatas, Emily
              and Ghurye, Jay and Maguire, Jared and Mahmoud, Medhat
              and Cheng, Haoyu and Heller, David and Zook, Justin M
              and Moemke, Tobias and Marschall, Tobias and Sedlazeck, Fritz J
              and Aach, John and Chin, Chen-Shan and Church, George M
              and Li, Heng},
  title    = {Chromosome-scale, haplotype-resolved assembly of human genomes},
  journal  = {Nat Biotechnol},
  year     = {2021},
  month    = mar,
  volume   = {39},
  pages    = {309--312},
  doi      = {10.1038/s41587-020-0711-0},
  issn     = {1546-1696},
  keywords = {Algorithms, Chromosomes, Human, Genome, Human, Haplotypes, Heterozygote, Humans, Polymorphism, Single Nucleotide},
  abstract = {Haplotype-resolved or phased genome assembly provides a complete picture of genomes and their complex genetic variations. However, current algorithms for phased assembly either do not generate chromosome-scale phasing or require pedigree information, which limits their application. We present a method named diploid assembly (DipAsm) that uses long, accurate reads and long-range conformation data for single individuals to generate a chromosome-scale phased assembly within 1 day. Applied to four public human genomes, PGP1, HG002, NA12878 and HG00733, DipAsm produced haplotype-resolved assemblies with minimum contig length needed to cover 50\% of the known genome (NG50) up to 25 Mb and phased {\textasciitilde}99.5\% of heterozygous sites at 98--99\% accuracy, outperforming other approaches in terms of both contiguity and phasing completeness. We demonstrate the importance of chromosome-scale phased assemblies for the discovery of structural variants (SVs), including thousands of new transposon insertions, and of highly polymorphic and medically important regions such as the human leukocyte antigen (HLA) and killer cell immunoglobulin-like receptor (KIR) regions. DipAsm will facilitate high-quality precision medicine and studies of individual haplotype variation and population diversity.},
}

% De Coster et al. 2021 -- review of population-scale long-read sequencing.
@article{147,
  author   = {De Coster, Wouter and Weissensteiner, Matthias H and Sedlazeck, Fritz J},
  title    = {Towards population-scale long-read sequencing},
  journal  = {Nat Rev Genet},
  year     = {2021},
  month    = sep,
  volume   = {22},
  pages    = {572--587},
  doi      = {10.1038/s41576-021-00367-3},
  issn     = {1471-0064},
  keywords = {Computational Biology, Genome, Human, Genomics, High-Throughput Nucleotide Sequencing, Humans, Industrial Development, Sequence Analysis, DNA},
  abstract = {Long-read sequencing technologies have now reached a level of accuracy and yield that allows their application to variant detection at a scale of tens to thousands of samples. Concomitant with the development of new computational tools, the first population-scale studies involving long-read sequencing have emerged over the past 2 years and, given the continuous advancement of the field, many more are likely to follow. In this Review, we survey recent developments in population-scale long-read sequencing, highlight potential challenges of a scaled-up approach and provide guidance regarding experimental design. We provide an overview of current long-read sequencing platforms, variant calling methodologies and approaches for de novo assemblies and reference-based mapping approaches. Furthermore, we summarize strategies for variant validation, genotyping and predicting functional impact and emphasize challenges remaining in achieving long-read sequencing at a population scale.},
}

% Luo et al. 2019 -- Clairvoyante variant caller. The export gave the date as
% "2019 03 01"; the day is preserved in the (style-ignored) day field.
@article{52,
  author   = {Luo, Ruibang and Sedlazeck, Fritz J and Lam, Tak-Wah and Schatz, Michael C},
  title    = {A multi-task convolutional deep neural network for variant calling in single molecule sequencing},
  journal  = {Nat Commun},
  year     = {2019},
  month    = mar,
  day      = {1},
  volume   = {10},
  pages    = {998},
  doi      = {10.1038/s41467-019-09025-z},
  issn     = {2041-1723},
  keywords = {Base Sequence, Computational Biology, DNA Mutational Analysis, Genome, Human, Genome-Wide Association Study, Genomics, Genotype, Genotyping Techniques, Humans, INDEL Mutation, Nanopores, Neural Networks (Computer), Polymorphism, Single Nucleotide, Sequence Analysis, DNA, Software},
  abstract = {The accurate identification of DNA sequence variants is an important, but challenging task in genomics. It is particularly difficult for single molecule sequencing, which has a per-nucleotide error rate of {\textasciitilde}5--15\%. Meeting this demand, we developed Clairvoyante, a multi-task five-layer convolutional neural network model for predicting variant type (SNP or indel), zygosity, alternative allele and indel length from aligned reads. For the well-characterized NA12878 human sample, Clairvoyante achieves 99.67, 95.78, 90.53\% F1-score on 1KP common variants, and 98.65, 92.57, 87.26\% F1-score for whole-genome analysis, using Illumina, PacBio, and Oxford Nanopore data, respectively. Training on a second human sample shows Clairvoyante is sample agnostic and finds variants in less than 2 h on a standard server. Furthermore, we present 3,135 variants that are missed using Illumina but supported independently by both PacBio and Oxford Nanopore reads. Clairvoyante is available open-source ( https://github.com/aquaskyline/Clairvoyante ), with modules to train, utilize and visualize the model.},
}

% Sedlazeck et al. 2018 -- NGMLR aligner and Sniffles structural-variant caller.
@article{37,
  author   = {Sedlazeck, Fritz J and Rescheneder, Philipp and Smolka, Moritz
              and Fang, Han and Nattestad, Maria and von Haeseler, Arndt
              and Schatz, Michael C},
  title    = {Accurate detection of complex structural variations using single-molecule sequencing},
  journal  = {Nat Methods},
  year     = {2018},
  month    = jun,
  volume   = {15},
  pages    = {461--468},
  doi      = {10.1038/s41592-018-0001-7},
  issn     = {1548-7105},
  keywords = {DNA Mutational Analysis, Genome, Human, Genomics, High-Throughput Nucleotide Sequencing, Humans, Sequence Analysis, DNA},
  abstract = {Structural variations are the greatest source of genetic variation, but they remain poorly understood because of technological limitations. Single-molecule long-read sequencing has the potential to dramatically advance the field, although high error rates are a challenge with existing methods. Addressing this need, we introduce open-source methods for long-read alignment (NGMLR; https://github.com/philres/ngmlr ) and structural variant identification (Sniffles; https://github.com/fritzsedlazeck/Sniffles ) that provide unprecedented sensitivity and precision for variant detection, even in repeat-rich regions and for complex nested events that can have substantial effects on human health. In several long-read datasets, including healthy and cancerous human genomes, we discovered thousands of novel variants and categorized systematic errors in short-read approaches. NGMLR and Sniffles can automatically filter false events and operate on low-coverage data, thereby reducing the high costs that have hindered the application of long reads in clinical and research settings.},
}

% Nattestad et al. 2018 -- long-read DNA/RNA sequencing of the SK-BR-3 cell line.
% NOTE(review): the exported abstract read "oncogene (also known as )" with the
% gene symbols dropped; restored below as ERBB2 / HER2 -- confirm against the
% publisher record.
@article{40,
  author   = {Nattestad, Maria and Goodwin, Sara and Ng, Karen
              and Baslan, Timour and Sedlazeck, Fritz J and Rescheneder, Philipp
              and Garvin, Tyler and Fang, Han and Gurtowski, James
              and Hutton, Elizabeth and Tseng, Elizabeth and Chin, Chen-Shan
              and Beck, Timothy and Sundaravadanam, Yogi and Kramer, Melissa
              and Antoniou, Eric and McPherson, John D and Hicks, James
              and McCombie, W Richard and Schatz, Michael C},
  title    = {Complex rearrangements and oncogene amplifications revealed by long-read {DNA} and {RNA} sequencing of a breast cancer cell line},
  journal  = {Genome Res},
  year     = {2018},
  month    = aug,
  volume   = {28},
  pages    = {1126--1135},
  doi      = {10.1101/gr.231100.117},
  issn     = {1549-5469},
  keywords = {Breast Neoplasms, Female, Gene Amplification, Gene Rearrangement, Genome, Human, Genomic Structural Variation, High-Throughput Nucleotide Sequencing, Humans, MCF-7 Cells, Oncogenes, Receptor, ErbB-2, Repetitive Sequences, Nucleic Acid, Transcriptome},
  abstract = {The SK-BR-3 cell line is one of the most important models for HER2+ breast cancers, which affect one in five breast cancer patients. SK-BR-3 is known to be highly rearranged, although much of the variation is in complex and repetitive regions that may be underreported. Addressing this, we sequenced SK-BR-3 using long-read single molecule sequencing from Pacific Biosciences and develop one of the most detailed maps of structural variations (SVs) in a cancer genome available, with nearly 20,000 variants present, most of which were missed by short-read sequencing. Surrounding the important ERBB2 oncogene (also known as HER2), we discover a complex sequence of nested duplications and translocations, suggesting a punctuated progression. Full-length transcriptome sequencing further revealed several novel gene fusions within the nested genomic variants. Combining long-read genome and transcriptome sequencing enables an in-depth analysis of how SVs disrupt the genome and sheds new light on the complex mechanisms involved in cancer genome evolution.},
}