# Source code for samples_db
#!/usr/bin/env python2.7
import sys
import os
import os.path
import sqlite3
DB_FILE = os.path.join(os.path.split(os.path.abspath(__file__))[0], "samples.db")
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
def create_db(trios_file, coverage_file, delete=False):
    """Creates a database containing trio information.

    The database holds a single ``quatuors`` table with the columns ``id``,
    ``family``, ``status`` and ``avg_coverage``. Only the rows of the trios
    file whose ``Project`` column is ``Twins`` are inserted, and the value of
    the ``Link`` column is lowercased before being stored as ``status``.

    :param trios_file: The path to a tab-delimited file containing trio
                       information. Its header line has to contain the
                       ``#ID``, ``Family``, ``Link`` and ``Project`` columns.
    :type trios_file: str

    :param coverage_file: The path to a comma-separated file containing mean
                          coverage for samples (sample id in the first field,
                          coverage in the second one).
    :type coverage_file: str

    :param delete: Flag that determines if the sqlite database file will be
                   overridden when running this function.
    :type delete: bool

    :raises Exception: If the database file already exists and ``delete`` is
                       not set.
    :raises ValueError: If a required column is missing from the trios header.
    :raises KeyError: If a sample from the ``Twins`` project has no entry in
                      the coverage file.

    Samples files are available in the ``sample_db_files`` directory from this
    repository.

    """
    db_file = DB_FILE

    if os.path.isfile(db_file) and not delete:
        raise Exception("Database file already exists. run with "
                        "delete=True to overwrite or delete the samples.db "
                        "file manually.")

    # Both input files are parsed before the database file is touched, so
    # that a malformed input neither destroys an existing database nor leaves
    # an empty one behind.

    # Sample -> coverage dict.
    coverage = {}
    with open(coverage_file) as f:
        for line in f:
            line = line.rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines.
                continue
            fields = line.split(",")
            coverage[fields[0]] = fields[1]

    samples_info = []
    with open(trios_file) as f:
        header = f.readline().rstrip("\r\n").split("\t")

        # The column positions are the same for every row: resolve them once.
        id_col = header.index("#ID")
        family_col = header.index("Family")
        status_col = header.index("Link")
        project_col = header.index("Project")

        for line in f:
            line = line.rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines.
                continue
            fields = line.split("\t")

            if fields[project_col] != "Twins":
                continue

            # The coverage is only looked up for the rows that are kept, so
            # samples from other projects do not need a coverage entry.
            sample_id = fields[id_col]
            samples_info.append((
                sample_id,
                fields[family_col],
                fields[status_col].lower(),
                coverage[sample_id],
            ))

    if delete and os.path.isfile(db_file):
        os.remove(db_file)

    # Create the database and insert the corresponding rows.
    conn = sqlite3.connect(db_file)
    try:
        c = conn.cursor()
        c.execute(("CREATE TABLE quatuors (id text, family text, "
                   "status text, avg_coverage float)"))
        c.executemany("INSERT INTO quatuors VALUES (?,?,?,?)", samples_info)
        # Save
        conn.commit()
    finally:
        # The connection is released even if one of the statements fails.
        conn.close()
def get_sample_ids_for_family(f):
    """Executes a query over the database to find the samples from a given family.

    :param f: The family (e.g. 1443)
    :type f: str

    :returns: A dictionary containing the familial status (e.g. mother,
              father, twin1 or twin2) as a key and the sample id as a value.
              The dictionary is empty if no row matches the family. If
              several rows share a status, the last one returned by the
              query is kept.
    :rtype: dict

    """
    c = __get_cursor()
    try:
        response = c.execute(
            "SELECT id, status FROM quatuors WHERE family=?", (f, )
        )
        # The rows have to be consumed before the connection is closed.
        return {status: sample_id for sample_id, status in response}
    finally:
        # The cursor is the only handle on the connection that was opened
        # for this query: close it explicitly instead of leaking it.
        c.connection.close()
def get_coverage_for_sample(s):
    """Executes a query over the database to find the coverage for a given sample.

    :param s: The sample (e.g. LP6005057-DNA_F03)
    :type s: str

    :returns: The mean coverage for this sample, as stored in the
              ``avg_coverage`` column of the first matching row.
    :rtype: NOTE(review): previously documented as ``str``; since the column
            is declared as ``float``, SQLite presumably hands numeric values
            back as ``float`` -- confirm against a real database.

    :raises TypeError: If no row matches the sample (the query then yields
                       ``None``, which can not be indexed).

    """
    c = __get_cursor()
    try:
        coverage = c.execute(
            "SELECT avg_coverage FROM quatuors WHERE id=?", (s, )
        ).fetchone()
    finally:
        # The cursor is the only handle on the connection that was opened
        # for this query: close it explicitly instead of leaking it.
        c.connection.close()
    return coverage[0]
def __get_cursor():
    """Opens a connection to the samples database and returns a cursor on it.

    The connection itself is not returned.

    :returns: A cursor bound to a new connection to ``DB_FILE``.

    """
    return sqlite3.connect(DB_FILE).cursor()
if __name__ == "__main__":
    # Command line entry point: builds the database from the two files given
    # as arguments.
    if len(sys.argv) != 3:
        # Parenthesized, single-argument print calls give the same output
        # under both Python 2 and Python 3.
        print("Usage: {} trios coverage".format(__file__))
        print("Where `trios` is the `complete_trios` file to create the database "
              "and `coverage` is the coverage csv file to get the average coverage "
              "values from.")
        print("For examples, see the `sample_db_files` directory.")
        # A wrong invocation is reported with a non-zero exit status.
        sys.exit(1)

    create_db(sys.argv[1], sys.argv[2])