# Source code for cnv_db.db

#!/usr/bin/env python2.7

import os
import sqlite3
import gzip
import atexit

import cnv_struct

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

# Path of the sqlite database of known CNVs; it lives next to this module.
DB_FILE = os.path.join(os.path.dirname(__file__), "known_cnvs.db")

# Module-wide connection, opened once when the module is imported.
CONN = sqlite3.connect(DB_FILE)

def create_dgv():
    """Build and populate the ``dgv`` table of the known CNVs database.

    A ``CREATE TABLE`` statement matching the DGV (Database of Genomic
    Variants) schema is executed first. The table is then filled from the
    gzip-compressed, tab-delimited dump named 'dgv-variants-hg19.txt.gz',
    which is opened relative to the current working directory: every line
    of the dump is split on tabs and inserted as one row. The changes are
    committed before returning.

    """

    # One "name type" entry per column, in the order used by the dump file.
    columns = (
        "variantaccession text",
        "chr int",
        "start int",
        "end int",
        "varianttype text",
        "variantsubtype text",
        "reference text",
        "pubmedid text",
        "method text",
        "platform text",
        "mergedvariants text",
        "supportingvariants text",
        "mergedorsample text",
        "frequency float",
        "samplesize int",
        "observedgains int",
        "observedlosses int",
        "cohortdescription text",
        "genes text",
        "samples text",
    )

    cursor = CONN.cursor()
    cursor.execute("CREATE TABLE dgv (" + ", ".join(columns) + ")")

    # The whole dump is read into memory before anything is inserted.
    with gzip.open("dgv-variants-hg19.txt.gz", "rb") as dump:
        rows = [tuple(record.rstrip("\r\n").split("\t")) for record in dump]

    insert_query = ("INSERT INTO dgv VALUES "
                    "(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
    cursor.executemany(insert_query, rows)

    CONN.commit()

def query_dgv_overlap(cnv):
    """Queries the dgv table for known overlapping CNVs.

    :param cnv: The cnv object to query for overlap.
    :type cnv: :py:class:`cnv_struct.cnv`

    :returns: A list of overlapping CNV objects.
    :rtype: list

    Internally, this generates the corresponding SQL ``SELECT`` from the cnv
    object and queries the sqlite database. A DGV entry on the same
    chromosome overlaps the query when it starts at or before the query's end
    and ends at or after the query's start.

    Entries with the ``Gain`` or ``Loss`` subtype yield one CNV object of the
    matching type. Entries with the ``Gain+Loss`` subtype yield two distinct
    objects, a gain followed by a loss. Every other subtype is a structural
    variant that is not a CNV and is ignored. The variant accession is stored
    in the ``meta`` attribute of every returned object.

    .. note::
        The elegant method for detecting overlaps is from `Stack Overflow <http://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap>`_.

    """

    # DGV variant subtype -> cnv types to emit for it. Subtypes missing from
    # this map are SVs that are not CNVs and produce nothing.
    type_map = {
        "Loss": ("loss",),
        "Gain": ("gain",),
        "Gain+Loss": ("gain", "loss"),
    }

    c = CONN.cursor()
    c.execute("SELECT chr, start, end, variantsubtype, variantaccession "
              "FROM dgv "
              "WHERE chr == ? AND start <= ? AND end >= ?",
              (cnv.chr, cnv.end, cnv.start))

    cnvs = []
    for tu in c:
        for cnv_type in type_map.get(tu[3], ()):
            # A fresh object is built for every emitted type. Appending one
            # shared object twice would leave both list entries pointing at
            # the same instance, carrying only the last assigned type.
            #
            # The default 'loss' cnv type is overridden right below,
            # it's simply meant to avoid a runtime warning.
            a_cnv = cnv_struct.cnv(
                chr = tu[0],
                start = tu[1],
                end = tu[2],
                algo = "dgv",
                source = "dgv",
                type = "loss",
            )
            a_cnv.meta = tu[4]
            a_cnv.type = cnv_type
            cnvs.append(a_cnv)

    return cnvs

def goodbye():
    """Commit any pending changes and close the module-wide connection.

    This function is registered with :py:mod:`atexit` so that it is called
    when the interpreter terminates.

    """
    CONN.commit()
    CONN.close()

atexit.register(goodbye)
Navigation

Source code for cnv_db.db

Quick search

Navigation