#!/usr/bin/env python2.7
import os
import sqlite3
import gzip
import atexit
import cnv_struct
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

# The sqlite database of known CNVs is stored in the same directory as this
# module file.
DB_FILE = os.path.join(os.path.dirname(__file__), "known_cnvs.db")

# Module-wide connection, opened as a side effect of importing this module.
CONN = sqlite3.connect(DB_FILE)
def create_dgv(filename="dgv-variants-hg19.txt.gz"):
    """Creates the DGV table in the database of known cnvs.

    :param filename: Path to the gzip-compressed, tab-delimited text dump of
                     the DGV variants. Defaults to
                     'dgv-variants-hg19.txt.gz', resolved against the current
                     working directory.
    :type filename: str

    This executes the SQL ``CREATE`` query corresponding to the DGV (Database
    of Genomic Variants) schema, then inserts one row per non-empty line of
    the dump and commits. Every line is split on tabs and must yield exactly
    20 fields (one per column); otherwise sqlite3 raises a
    ``ProgrammingError`` for the mismatched number of bindings.

    The ``CREATE TABLE`` statement is unconditional, so calling this function
    on a database that already contains a ``dgv`` table raises
    ``sqlite3.OperationalError``.

    """
    c = CONN.cursor()
    c.execute("CREATE TABLE dgv (variantaccession text, "
              "chr int, "
              "start int, "
              "end int, "
              "varianttype text, "
              "variantsubtype text, "
              "reference text, "
              "pubmedid text, "
              "method text, "
              "platform text, "
              "mergedvariants text, "
              "supportingvariants text, "
              "mergedorsample text, "
              "frequency float, "
              "samplesize int, "
              "observedgains int, "
              "observedlosses int, "
              "cohortdescription text, "
              "genes text, "
              "samples text)")

    with gzip.open(filename, "rb") as f:
        # The file is opened in binary mode, so each line is a byte string.
        # Decoding it once here gives sqlite3 real text on both Python 2 and
        # Python 3 (where stripping bytes with a str argument would raise a
        # TypeError). Only the line terminator is stripped, so trailing empty
        # fields are preserved.
        variants = (
            tuple(line.decode("utf-8").rstrip("\r\n").split("\t"))
            for line in f
            # Skip blank lines (e.g. a trailing newline at the end of the
            # dump): they would yield a single field and abort the insert.
            if line.rstrip(b"\r\n")
        )
        # The generator is consumed lazily by executemany, so the dump is
        # never fully held in memory. It has to be consumed while the file
        # is still open.
        c.executemany("INSERT INTO dgv VALUES "
                      "(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", variants)

    CONN.commit()
def query_dgv_overlap(cnv):
    """Queries the dgv table for known overlapping CNVs.

    :param cnv: The cnv object to query for overlap. Its ``chr``, ``start``
                and ``end`` attributes are used to build the query.
    :type cnv: :py:class:`cnv_struct.cnv`

    :returns: A list of overlapping CNV objects. Each one has ``algo`` and
              ``source`` set to ``"dgv"`` and carries the DGV variant
              accession in its ``meta`` attribute.
    :rtype: list

    Internally, this generates the corresponding SQL ``SELECT`` from the cnv
    object and queries the sqlite database.

    Variants whose subtype is ``Gain+Loss`` are reported twice: once as a
    gain and once as a loss, as two distinct objects. Structural variants
    that are neither gains nor losses are ignored.

    .. note::
        The elegant method for detecting overlaps is from `Stack Overflow <http://stackoverflow.com/questions/325933/determine-whether-two-date-ranges-overlap>`_.

    """
    # Maps the DGV variant subtype to the cnv type(s) it stands for. Subtypes
    # missing from this map are SVs that are not CNVs.
    type_map = {
        "Loss": ("loss",),
        "Gain": ("gain",),
        "Gain+Loss": ("gain", "loss"),
    }

    c = CONN.cursor()
    # Two closed intervals overlap when each one starts before the other
    # ends.
    c.execute("SELECT chr, start, end, variantsubtype, variantaccession "
              "FROM dgv "
              "WHERE chr == ? AND start <= ? AND end >= ?",
              (cnv.chr, cnv.end, cnv.start))

    cnvs = []
    for chrom, start, end, subtype, accession in c:
        for cnv_type in type_map.get(subtype, ()):
            # A fresh object is built for every type. Appending a single
            # object twice and reassigning its type in between would leave
            # two references to the same CNV, both ending up as "loss".
            known_cnv = cnv_struct.cnv(
                chr=chrom,
                start=start,
                end=end,
                algo="dgv",
                source="dgv",
                type=cnv_type,
            )
            known_cnv.meta = accession
            cnvs.append(known_cnv)

    return cnvs
@atexit.register
def goodbye():
    """Commits pending changes and closes the database connection.

    Registered with :py:mod:`atexit`, so it runs when the interpreter exits.

    """
    connection = CONN
    connection.commit()
    connection.close()