#!/usr/bin/env python2.7
import sys
import argparse
import pickle
import numpy as np
import bed
import validation
import parsers
import cnv_struct
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
[docs]def main(args):
"""Filters out the germline de novo CNVs from a given dataset and writes
different summary files.
Multiple files are generated: Pickle files after merge and BED files for
visualization will be created for every family.
The number of de novo CNVs will be appended to the ``de_novo_germinal_summary.txt``
file in the form "family_id [TAB] number of gains [TAB] number of losses".
"""
# Hard code Filtering values TODO: integrate to argparse.
if (args.format == "cnver" and
args.doc_over == float("-infinity") and
args.doc_under == float("+infinity")):
# Default values for cnver
doc_lower_bound = 0.25
doc_upper_bound = 3.0
else:
doc_lower_bound = args.doc_over
doc_upper_bound = args.doc_under
ro_over = args.ro_twins # Upper bound for ro between twins.
ro_under = args.ro_parents # Upper boud for re between twin and parents.
# done
p = validation.get_parser_if_valid(args)
gains = 0
losses = 0
# Get the CNVs
cnvs = p.get_cnvs()
de_novos = []
counts = {"gain": 0, "loss": 0}
other = "twin2"
for sample in cnvs:
if sample == "twin1":
for chr in cnvs[sample]:
for cnv in cnvs[sample][chr]:
twin_ro, twin_cnv = cnv_struct.ro_to_best_in_list(
cnv,
cnvs[other][chr],
return_cnv = True
)
twin_ro = min(twin_ro)
father_ro = cnv_struct.ro_to_best_in_list(
cnv,
cnvs["father"][chr]
)
mother_ro = cnv_struct.ro_to_best_in_list(
cnv,
cnvs["mother"][chr]
)
max_max_parent_ro = max([max(father_ro), max(mother_ro)])
add = (twin_ro >= ro_over and
max_max_parent_ro <= ro_under and
cnv.size / 1000.0 > args.size_over)
if args.confidence_over is not None and add:
print cnv.confidence
print args.confidence_over
print cnv.confidence > args.confidence_over
add = add and (cnv.confidence > args.confidence_over)
if args.doc_under is not None and args.doc_over is not None:
add = add and (cnv.doc > doc_lower_bound and
cnv.doc < doc_upper_bound)
if add:
# Create a new CNV representing this de novo event.
assert cnv.type == twin_cnv.type
assert cnv.chr == twin_cnv.chr
pos = "chr{}:{}-{}".format(
chr,
min(twin_cnv.start, cnv.start),
max(twin_cnv.end, cnv.end),
)
denovo = cnv_struct.cnv(
pos = pos,
type = cnv.type,
algo = cnv.algo,
source = "denovo",
)
de_novos.append(denovo)
counts[denovo.type] += 1
# Write pickle
with open("de_novo_germinal_{}.pickle".format(p.family_id), "w") as f:
pickle.dump(de_novos, f)
# Write beds
with open("de_novo_germinal_{}.bed".format(p.family_id), "w") as f:
header_line = ("track name=\"germinal_{algo}_{family}\" "
"description=\"Germinal de novo CNVs for "
"family {family}\" visibility=2 itemRgb=\"On\"\n")
f.write(header_line.format(family = p.family_id,
algo = de_novos[0].algo))
line_template = "\t".join([
"chr{chr}",
"{start}",
"{end}",
"{info}",
"0", "+",
"{start}",
"{end}",
"{color}\n"])
for cnv in de_novos:
l = line_template.format(
chr = cnv.chr,
start = cnv.start,
end = cnv.end,
info = "type={}".format(cnv.type),
color = bed.colors[cnv.cn]
)
f.write(l)
# Write the report.
with open("de_novo_germinal_summary.txt", "a") as f:
# family,gains,losses
f.write("{}\t{}\t{}\n".format(
p.family_id,
counts["gain"],
counts["loss"]
))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Computes the average and std CNV size.")
parser = validation.add_pickle_args(parser)
parser.add_argument("--format",
type=str,
choices=parsers.__all__,
help="The file format for the parser.")
parser.add_argument("--base_dir",
type=str,
help=("The base directory for the calls. A directory with "
"four subfolders starting with father, mother, twin1 "
"and twin2 are expected."))
parser.add_argument("--doc_under",
type=float,
help="Maximum doc value (for filtering)",
default=None,
)
parser.add_argument("--doc_over",
type=float,
help="Minimum doc value (for filtering)",
default=None,
)
parser.add_argument("--ro_twins",
type=float,
help=("Only CNVs with RO between twins over this value"
" will be considered."),
default=0.9
)
parser.add_argument("--ro_parents",
type=float,
help=("Only CNVs with maximum parent RO under this "
"value will be considered."),
default=0.01
)
parser.add_argument("--size_over",
type=float,
help=("Minimum size for CNVs."),
default=5
)
parser.add_argument("--confidence_over",
type=float,
help="Minimum confidence for CNVs.",
default=None,
)
main(parser.parse_args())