# Source code for cnv_db.db

#!/usr/bin/env python2.7

import os
import sqlite3
import gzip
import atexit

import cnv_struct

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

# The sqlite database of known CNVs is stored in the same directory as this
# module.
DB_FILE = os.path.join(os.path.dirname(__file__), "known_cnvs.db")

# Module-wide connection, opened as a side effect of importing this module.
CONN = sqlite3.connect(DB_FILE)

def create_dgv():
    """Builds and populates the ``dgv`` table of the known CNVs database.

    A ``CREATE TABLE`` statement matching the DGV (Database of Genomic
    Variants) schema is executed first. The table is then filled from the
    gzip-compressed, tab-delimited dump 'dgv-variants-hg19.txt.gz', which is
    opened relative to the current working directory. Every line of the dump
    becomes one row (20 values expected), and the transaction is committed
    at the end.

    """
    create_sql = ("CREATE TABLE dgv (variantaccession text, "
                  "chr int, "
                  "start int, "
                  "end int, "
                  "varianttype text, "
                  "variantsubtype text, "
                  "reference text, "
                  "pubmedid text, "
                  "method text, "
                  "platform text, "
                  "mergedvariants text, "
                  "supportingvariants text, "
                  "mergedorsample text, "
                  "frequency float, "
                  "samplesize int, "
                  "observedgains int, "
                  "observedlosses int, "
                  "cohortdescription text, "
                  "genes text, "
                  "samples text)")
    insert_sql = ("INSERT INTO dgv VALUES "
                  "(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")

    cursor = CONN.cursor()
    cursor.execute(create_sql)

    # Strip the line terminator, then split each record on tabs.
    with gzip.open("dgv-variants-hg19.txt.gz", "rb") as dump:
        rows = [tuple(record.rstrip("\r\n").split("\t")) for record in dump]

    cursor.executemany(insert_sql, rows)
    CONN.commit()
def query_dgv_overlap(cnv):
    """Queries the dgv table for known overlapping CNVs.

    :param cnv: The cnv object to query for overlap.
    :type cnv: :py:class:`cnv_struct.cnv`

    :returns: A list of overlapping CNV objects.
    :rtype: list

    Internally, this generates the corresponding SQL ``SELECT`` from the cnv
    object and queries the sqlite database. A row overlaps the query when it
    is on the same chromosome, starts at or before ``cnv.end`` and ends at
    or after ``cnv.start``.

    Rows whose ``variantsubtype`` is ``Gain`` or ``Loss`` yield one CNV of
    the matching type. ``Gain+Loss`` rows yield two *distinct* CNV objects
    (a gain, then a loss). Rows with any other subtype are ignored.

    .. note::
        The elegant method for detecting overlaps is from `Stack Overflow
        <http://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap>`_.

    """
    # Maps the DGV variant subtype to the cnv type(s) to emit, in order.
    type_map = {
        "Loss": ("loss",),
        "Gain": ("gain",),
        "Gain+Loss": ("gain", "loss"),
    }

    c = CONN.cursor()
    c.execute("SELECT chr, start, end, variantsubtype, variantaccession "
              "FROM dgv "
              "WHERE chr == ? AND start <= ? AND end >= ?",
              (cnv.chr, cnv.end, cnv.start))

    cnvs = []
    for tu in c:
        # We ignore SVs that are not CNVs (subtype absent from the map).
        for cnv_type in type_map.get(tu[3], ()):
            # A new object is built for every emitted type: appending one
            # shared object twice and mutating its type would leave both
            # list entries with the last type assigned.
            #
            # The default 'loss' cnv type is overridden right after,
            # it's simply meant to avoid a runtime warning.
            a_cnv = cnv_struct.cnv(
                chr=tu[0],
                start=tu[1],
                end=tu[2],
                algo="dgv",
                source="dgv",
                type="loss",
            )
            a_cnv.type = cnv_type
            a_cnv.meta = tu[4]
            cnvs.append(a_cnv)

    return cnvs
@atexit.register
def goodbye():
    """Commits pending changes and closes the database connection.

    This function is registered with :py:mod:`atexit`. The connection is
    closed even if the final commit raises.

    """
    try:
        CONN.commit()
    finally:
        CONN.close()