# %pip install --quiet --force-reinstall --upgrade git+https://github.com/MultiQC/MultiQC.git
# %pip install --quiet pandas
# %reset -f

import multiqc

print(multiqc.__version__)

1.22.dev0

!test -d "data" || (wget https://multiqc.info/examples/viralrecon/data.zip && unzip -qq data.zip && rm data.zip)

%ls data

assembly/                         pipeline_info/
fastp/                            summary_assembly_metrics_mqc.csv
fastqc/                           summary_variants_metrics_mqc.csv
kraken2/                          variants/

multiqc.parse_logs('./data/fastp')

/// https://multiqc.info 🔍 v1.22.dev0

       file_search | Search path: /Users/vlad/git/website/public/examples/jupyter/data/fastp

             fastp | Found 3 reports

multiqc.list_modules()

['fastp']

multiqc.list_samples()

['SAMPLE1_PE', 'SAMPLE2_PE', 'SAMPLE3_SE']

multiqc.list_data_sources()

['/Users/vlad/git/website/public/examples/jupyter/data/fastp/SAMPLE3_SE.fastp.json',
 '/Users/vlad/git/website/public/examples/jupyter/data/fastp/SAMPLE1_PE.fastp.json',
 '/Users/vlad/git/website/public/examples/jupyter/data/fastp/SAMPLE2_PE.fastp.json']

sample1_result = multiqc.get_module_data(module="fastp", sample="SAMPLE1_PE")
sample1_result["summary"]

{'fastp_version': '0.23.2',
 'sequencing': 'paired end (301 cycles + 301 cycles)',
 'before_filtering': {'total_reads': 55442,
  'total_bases': 16571632,
  'q20_bases': 16267224,
  'q30_bases': 15853021,
  'q20_rate': 0.981631,
  'q30_rate': 0.956636,
  'read1_mean_length': 298,
  'read2_mean_length': 298,
  'gc_content': 0.38526},
 'after_filtering': {'total_reads': 48270,
  'total_bases': 14363465,
  'q20_bases': 14323363,
  'q30_bases': 14199841,
  'q20_rate': 0.997208,
  'q30_rate': 0.988608,
  'read1_mean_length': 298,
  'read2_mean_length': 296,
  'gc_content': 0.383991}}

multiqc.list_plots()

{'fastp': ['Filtered Reads',
  'Insert Sizes',
  {'Sequence Quality': ['Read 1: Before filtering',
    'Read 1: After filtering',
    'Read 2: Before filtering',
    'Read 2: After filtering']},
  {'GC Content': ['Read 1: Before filtering',
    'Read 1: After filtering',
    'Read 2: Before filtering',
    'Read 2: After filtering']},
  {'N content': ['Read 1: Before filtering',
    'Read 1: After filtering',
    'Read 2: Before filtering',
    'Read 2: After filtering']}]}

multiqc.show_plot("fastp", "GC Content", dataset="Read 2: Before filtering")

import multiqc
multiqc.parse_logs(
    "./data/assembly",
    "./data/kraken2",
    "./data/variants",
    "./data/fastqc",
    module_order=[
        dict(
            quast=dict(
                name="VARIANTS: QUAST",
                anchor="quast_variants",
                info="This section of the report shows QUAST QC results for the consensus sequence.",
                path_filters=["*/variants/*"],
            )
        ),
        dict(
            quast=dict(
                name="ASSEMBLY: QUAST (SPAdes)",
                anchor="quast_spades",
                info="This section of the report shows QUAST results from SPAdes de novo assembly.",
                path_filters=["*/spades/*"],
            )
        ),
    ],
    extra_fn_clean_exts=[".unclassified", ".pangolin", " MN908947.3", " MN908947.3"]
)
multiqc.list_modules()

/// https://multiqc.info 🔍 v1.22.dev0
       file_search | Search path: /Users/vlad/git/website/public/examples/jupyter/data/assembly

       file_search | Search path: /Users/vlad/git/website/public/examples/jupyter/data/kraken2

       file_search | Search path: /Users/vlad/git/website/public/examples/jupyter/data/variants

       file_search | Search path: /Users/vlad/git/website/public/examples/jupyter/data/fastqc

          bcftools | Found 3 stats reports

           bowtie2 | Found 3 reports

          cutadapt | Found 3 reports

            fastqc | Found 7 reports

            kraken | Found 3 reports

          mosdepth | Found reports for 3 samples

multiqc.list_samples()

['SAMPLE1_PE',
 'SAMPLE1_PE_1',
 'SAMPLE1_PE_2',
 'SAMPLE2_PE',
 'SAMPLE2_PE_1',
 'SAMPLE2_PE_2',
 'SAMPLE2_PE_R1',
 'SAMPLE2_PE_R2',
 'SAMPLE3_SE']

from pprint import pprint
pprint(multiqc.list_plots())

{'ASSEMBLY: QUAST (SPAdes)': ['Assembly Statistics', 'Number of Contigs'],
 'Bcftools': ['Variant Substitution Types',
              {'Variant Quality': ['Count SNP',
                                   'Count Transitions',
                                   'Count Transversions',
                                   'Count Indels']},
              'Indel Distribution',
              'Variant depths'],
 'Bowtie 2 / HiSAT2': ['Single-end alignments', 'Paired-end alignments'],
 'Cutadapt': ['Filtered Reads',
              {'Trimmed Sequence Lengths': ['Counts', 'Obs/Exp']}],
 'FastQC': ['Sequence Counts',
            'Sequence Quality Histograms',
            'Per Sequence Quality Scores',
            {'Per Sequence GC Content': ['Percentages', 'Counts']},
            'Per Base N Content',
            'Sequence Length Distribution',
            'Sequence Duplication Levels',
            'Overrepresented sequences by sample',
            'Top overrepresented sequences',
            'Status Checks'],
 'Kraken': [{'Top taxa': ['Genus',
                          'Family',
                          'Order',
                          'Class',
                          'Phylum',
                          'Domain',
                          'Root',
                          'Unclassified']}],
 'Mosdepth': [],
 'Nextclade': ['Run table'],
 'Pangolin': ['Run table'],
 'Picard': [{'Alignment Summary': ['Aligned Reads', 'Aligned Bases']},
            'Mean read length',
            {'Base Distribution': ['% Adenine',
                                   '% Cytosine',
                                   '% Guanine',
                                   '% Thymine',
                                   '% Undetermined']},
            {'Insert Size': ['Counts', 'Percentages']},
            'Mean Base Quality by Cycle',
            'Base Quality Distribution'],
 'Samtools': ['Percent mapped',
              'Alignment stats',
              {'Flagstat': ['Read counts', 'Percentage of total']},
              {'Mapped reads per contig': ['Normalised Counts',
                                           'Observed over Expected Counts',
                                           'Raw Counts']}],
 'SnpEff': ['Variants by Genomic Region',
            'Variant Effects by Impact',
            'Variants by Effect Types',
            'Variants by Functional Class'],
 'VARIANTS: QUAST': ['Assembly Statistics', 'Number of Contigs'],
 'fastp': ['Filtered Reads',
           'Insert Sizes',
           {'Sequence Quality': ['Read 1: Before filtering',
                                 'Read 1: After filtering',
                                 'Read 2: Before filtering',
                                 'Read 2: After filtering']},
           {'GC Content': ['Read 1: Before filtering',
                           'Read 1: After filtering',
                           'Read 2: Before filtering',
                           'Read 2: After filtering']},
           {'N content': ['Read 1: Before filtering',
                          'Read 1: After filtering',
                          'Read 2: Before filtering',
                          'Read 2: After filtering']}]}

multiqc.show_plot(module="ASSEMBLY: QUAST (SPAdes)", section='Number of Contigs')

headers = {
    "input_reads": {
        "title": "# Input reads",
        "description": "Total number of reads in raw fastq file",
    },
    "trimmed_read_pairs": {
        "title": "# Trimmed reads (Cutadapt)",
        "description": "Total number of reads remaining after adapter/quality trimming with fastp",
    },
    "num_mapped_reads": {
        "title": "# Mapped reads",
        "description": "Total number of Bowtie2 mapped reads relative to the viral genome",
    },
    "pct_mapped_reads": {
        "title": "% Mapped reads",
        "description": "Percentage of Bowtie2 mapped reads relative to the viral genome",
        "suffix": "%",
    },
    "non_host_reads": {
        "title": "% Non-host reads (Kraken 2)",
        "description": "Total number of non-host reads identified by Kraken2",
    },
    "cov_mean": {
        "title": "Coverage",
        "description": "Mean coverage calculated by mosdepth"
    },
    "number_of_SNPs": {
        "title": "# SNPs",
        "description": "Total number of SNPs",
    },
    "number_of_indels": {
        "title": "# SNPs",
        "description": "Total number of INDELs",
    },
    "missense": {
        "title": "# Missense variants",
        "description": "Total number of variants identified as missense mutations with SnpEff",
    },
    "n_contigs": {
        "title": "# Contigs",
        "description": "Total number of contigs in SPAdes assembly as calculated by QUAST",
    },
    "largest_contig": {
        "title": "Largest contig",
        "description": "Size of largest contig in SPAdes assembly as calculated by QUAST"
    },
    "genome_fraction": {
        "title": "Genome fraction",
        "suffix": "%",
        "description": "% genome fraction for SPAdes assembly as calculated by QUAST",
    },
    "n50": {
        "title": "N50",
        "description": "N50 metric for SPAdes assembly as calculated by QUAST",
    },
    "lineage": {
        "title": "Pangolin lineage",
        "description": "Pangolin lineage inferred from the consensus sequence",
    },
    "clade": {
        "title": "Nextclade clade",
        "description": "Nextclade clade inferred from the consensus sequence",
    },
}

from collections import defaultdict
summarized_data = defaultdict(dict)
for s in multiqc.list_samples():
    if data := multiqc.get_module_data(sample=s, module="fastp"):
        summarized_data[s]["input_reads"] = data["summary"]["before_filtering"]["total_reads"]

    if data := multiqc.get_module_data(sample=s, module="Cutadapt"):
        summarized_data[s]["trimmed_read_pairs"] = data.get("pairs_written")

    if data := multiqc.get_module_data(sample=s, module="Bowtie 2 / HiSAT2"):
        summarized_data[s]["pct_mapped_reads"] = data.get("overall_alignment_rate")

    if data := multiqc.get_module_data(sample=s, module="Samtools", key="multiqc_samtools_stats"):
        summarized_data[s]["num_mapped_reads"] = data.get("reads_mapped")

    if data := multiqc.get_general_stats_data(sample=s):
        summarized_data[s]["non_host_reads"] = data.get("Kraken.pct_unclassified")
        summarized_data[s]["mean_coverage"] = data.get("Mosdepth.mean_coverage")

    if data := multiqc.get_module_data(sample=s, module="Mosdepth"):
        summarized_data[s]["cov_mean"] = data["mean_coverage"]
        
    if data := multiqc.get_module_data(sample=s, module="SnpEff"):
        summarized_data[s]["missense_variants"] = data["MISSENSE"]

    if data := multiqc.get_module_data(sample=s, module="Bcftools"):
        summarized_data[s]["number_of_SNPs"] = data["number_of_SNPs"]
        summarized_data[s]["number_of_indels"] = data["number_of_indels"]

    if data := multiqc.get_module_data(sample=s, module="VARIANTS: QUAST"):
        summarized_data[s]["n_contigs"] = data["# contigs (>= 0 bp)"]
        summarized_data[s]["largest_contig"] = data["Largest contig"]
        summarized_data[s]["genome_fraction"] = data["Genome fraction (%)"]
        summarized_data[s]["n50"] = data["N50"]

    if data := multiqc.get_module_data(sample=s, module="Pangolin"):
        summarized_data[s]["lineage"] = data["lineage"]

    if data := multiqc.get_module_data(sample=s, module="Nextclade"):
        summarized_data[s]["clade"] = data["clade"]

summarized_data = {
    s: {
        k: v for k, v in d.items() if v is not None
    } for s, d in summarized_data.items()
}
summarized_data = {s: d for s, d in summarized_data.items() if d}
pprint(dict(summarized_data))

{'SAMPLE1_PE': {'clade': '20A',
                'genome_fraction': 98.094,
                'input_reads': 55442,
                'largest_contig': 29903.0,
                'lineage': 'B.1',
                'mean_coverage': 433.22,
                'missense_variants': 3.0,
                'n50': 29903.0,
                'n_contigs': 1.0,
                'non_host_reads': 99.95856639734825,
                'num_mapped_reads': 48013.0,
                'number_of_SNPs': 7,
                'number_of_indels': 1,
                'pct_mapped_reads': 99.53,
                'trimmed_read_pairs': 24125},
 'SAMPLE2_PE': {'clade': '19B',
                'genome_fraction': 89.804,
                'input_reads': 42962,
                'largest_contig': 29903.0,
                'lineage': 'A.2',
                'mean_coverage': 343.94,
                'missense_variants': 6.0,
                'n50': 29903.0,
                'n_contigs': 1.0,
                'non_host_reads': 99.78127278408499,
                'num_mapped_reads': 37942.0,
                'number_of_SNPs': 7,
                'number_of_indels': 1,
                'pct_mapped_reads': 98.9,
                'trimmed_read_pairs': 19160},
 'SAMPLE3_SE': {'clade': '19A',
                'genome_fraction': 97.432,
                'input_reads': 49202,
                'largest_contig': 29903.0,
                'lineage': 'B',
                'mean_coverage': 411.97,
                'missense_variants': 37.0,
                'n50': 29903.0,
                'n_contigs': 1.0,
                'non_host_reads': 99.90141237489016,
                'num_mapped_reads': 46278.0,
                'number_of_SNPs': 54,
                'number_of_indels': 0,
                'pct_mapped_reads': 99.18}}

from multiqc.plots import table
plot = table.plot(
    data=summarized_data,
    headers=headers,
    pconfig={
        "id": "summary_assembly_metrics",
        "title": "Summary Assembly Metrics",
    },
)
plot.show()

plot.show(violin=True)

module = multiqc.BaseMultiqcModule(
    name="nf-core/viralrecon summary",
    anchor="custom_data",
)
module.add_section(
    name="De novo assembly metrics",
    anchor="de_novo_assembly_metrics",
    description="Summary of input reads, trimmed reads, and non-host reads. Generated by the nf-core/viralrecon pipeline",
    plot=plot,
)
multiqc.report.modules = [module] + multiqc.report.modules

multiqc.write_report(
    force=True,
    title="nf-core/viralrecon report",
    filename="multiqc_report",
    exclude_modules=["general_stats"],   
)
%ls

/// https://multiqc.info 🔍 v1.22.dev0
     update_config | Report title: nf-core/viralrecon report

       multiqc_ngi | Could not find a statusdb config file

       file_search | Excluding modules 'general_stats'

     write_results | Data        : multiqc_report_data

     write_results | Report      : multiqc_report.html

data/                        multiqc_report_data/
data.zip                     notebook.html
multiqc_config_illumina.yml  notebook.ipynb
multiqc_report.html

import IPython

# Best if using Google Colab
# IPython.display.HTML(filename='./multiqc_report.html')

# Best if running locally
IPython.display.IFrame('./multiqc_report.html', '100%', 600)

multiqc.reset()

multiqc.load_config("multiqc_config_illumina.yml")

/// https://multiqc.info 🔍 v1.22.dev0
            config | Loading config settings from: multiqc_config_illumina.yml

MultiQC Jupyter Notebook Example¶

Parsing data¶

Adding more modules¶

	input_reads	trimmed_read_pairs	num_mapped_reads	pct_mapped_reads	non_host_reads	number_of_SNPs	number_of_indels	n_contigs	largest_contig	genome_fraction	n50	lineage-1	clade-1
SAMPLE1_PE	55442	24125	48013.0	99.53	99.958566	7	1	1.0	29903.0	98.094	29903.0	B.1	20A
SAMPLE2_PE	42962	19160	37942.0	98.9	99.781273	7	1	1.0	29903.0	89.804	29903.0	A.2	19B
SAMPLE3_SE	49202	NaN	46278.0	99.18	99.901412	54	0	1.0	29903.0	97.432	29903.0	B	19A