#!/usr/bin/env python2.7
# Creates the mendelian CNVs bed and pickle.
import sys
import argparse
import pickle
import numpy as np
import bed
import validation
import parsers
import cnv_struct
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
[docs]def main(args):
"""Displays statistics about the rate of inherited CNVs for a particular
algorithm and write a pickle file containing those variants.
The statistics are the following:
- The proportion of twin-overlapping CNVs: The proportion of CNVs which
are shared between both twins (The closer to 1 the better as the twins
have identical genomes, when neglecting somatic `de novo` variations).
- The proportion of twin-overlapping CNVs that are inherited: The
proportion of CNVs that are shared with a parent, when considering
only CNVs that are shared between both twins. This expected to be a value
close to one as Mendelian inheritance dictates that CNVs that are in
the twins are inherited from the parents (if we neglect the `de novo`
rate).
- The proportion of inherited CNVs: The proportion of CNVs which are shared
amongst both twins and at least one parent.
.. note::
The script's ``--help`` option will show the user how to customize the
reciprocal overlap thresholds for comparison of CNVs between twins and
with their parents.
"""
# if (args.format == "cnver" and
# args.doc_over == float("-infinity") and
# args.doc_under == float("+infinity")):
# # Default values for cnver
# doc_lower_bound = 0.25
# doc_upper_bound = 3.0
# else:
# doc_lower_bound = args.doc_over
# doc_upper_bound = args.doc_under
doc_lower_bound = args.doc_over
doc_upper_bound = args.doc_under
parents_ro = args.parents_ro
twins_ro = args.twins_ro
p = validation.get_parser_if_valid(args)
gains = 0
losses = 0
# Get the CNVs
cnvs = p.get_cnvs()
inherited, counts = get_inherited(cnvs, twins_ro, parents_ro,
doc_lower_bound, doc_upper_bound)
i = 2.0 * counts["good_twin_ro"] / counts["total"]
print ("The proportion of twin-overlapping CNVs is {}".format(i))
i = float(len(inherited)) / counts["good_twin_ro"]
print ("The proportion of twin-overlapping "
"CNVs that are inherited is: {}").format(i)
i = 2.0 * len(inherited) / counts["total"]
print "The proportion of inherited CNVs is: {} ({}/{})".format(i,
2 * len(inherited), counts["total"])
# Write pickle
if args.format:
fn = "inherited_{}_{}.pickle".format(p.family_id, args.format)
else:
fn = "inherited_{}.pickle".format(p.family_id)
with open(fn, "w") as f:
pickle.dump(inherited, f)
[docs]def get_inherited(cnvs,
twins_ro,
parents_ro,
doc_lower_bound = float("-infinity"),
doc_upper_bound = float("+infinity")):
"""Filters the input CNV list to detect and count the inherited variants.
:param cnvs: A dict of CNVs from a same algorithm and family to
filter for inherited CNVs (of the usual form: sample,
chromosome, cnvs.
:type cnvs: dict
:param twins_ro: Minimum reciprocal overlap value to consider two CNVs to
be the same between twins.
:type twins_ro: float
:param parents_ro: Minimum reciprocal overlap value to consider two CNVs to
be the same between a twin and a parent.
:type parents_ro: float
:param doc_lower_bound: Minimum ``doc`` parameter for the filtered CNVs,
only used in very specific cases.
:type doc_lower_bound: float
:param doc_upper_bound: Maximum ``doc`` parameter for the filtered CNVs,
only used in very specific cases.
:type doc_upper_bound: float
:return: A tuple representing the list of inherited CNVs and a dictionary
informations about the frequency of specific scenarios for display
to the user.
:rtype: tuple
CNVs are considered to be inherited if they are shared between the two twins
and at least one parent, with respect to the `ro` parameters. DOC values
are allowed for filtering of CNVs with respect to the values in the CNV's
object ``doc`` attribute.
"""
inherited = []
counts = {"gain": 0,
"loss": 0,
"good_twin_ro": 0,
"total": 0}
for chr in cnvs['twin1']:
counts["total"] += (len(cnvs['twin1'][chr]) +
len(cnvs['twin2'][chr]))
for cnv in cnvs['twin1'][chr]:
twin_ro, twin_cnv = cnv_struct.ro_to_best_in_list(
cnv,
cnvs['twin2'][chr],
return_cnv = True
)
twin_ro = min(twin_ro)
father_ro = cnv_struct.ro_to_best_in_list(
cnv,
cnvs["father"][chr]
)
mother_ro = cnv_struct.ro_to_best_in_list(
cnv,
cnvs["mother"][chr]
)
# This should be above the ro threshold.
max_min_parent_ro = max(min(mother_ro),
min(father_ro))
add = True
if (cnv.doc and
cnv.doc < doc_lower_bound or
cnv.doc > doc_upper_bound):
add = False
elif twin_ro < twins_ro:
add = False
elif max_min_parent_ro < parents_ro:
add = False
# Statistics.
if twin_ro > twins_ro:
counts["good_twin_ro"] += 1
if add:
# Create a new CNV representing this inherited CNV.
assert cnv.type == twin_cnv.type
assert cnv.chr == twin_cnv.chr
pos = "chr{}:{}-{}".format(
chr,
min(twin_cnv.start, cnv.start),
max(twin_cnv.end, cnv.end),
)
new_cnv = cnv_struct.cnv(
pos = pos,
type = cnv.type,
algo = cnv.algo,
source = None,
confidence = getattr(cnv,
"confidence", None)
)
inherited.append(new_cnv)
if new_cnv.type in ("gain", "loss"):
counts[new_cnv.type] += 1
return inherited, counts
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Computes the average and std CNV size.")
parser = validation.add_pickle_args(parser)
parser.add_argument("--format",
type=str,
choices=parsers.__all__,
help="The file format for the parser.")
parser.add_argument("--base_dir",
type=str,
help=("The base directory for the calls. A directory with "
"four subfolders starting with father, mother, twin1 "
"and twin2 are expected."))
parser.add_argument("--doc_under",
type=float,
help="Maximum doc value (for filtering)",
default=float("+infinity"),
)
parser.add_argument("--doc_over",
type=float,
help="Minimum doc value (for filtering)",
default=float("-infinity"),
)
parser.add_argument("--parents_ro",
type=float,
help=("Only consider CNVs that overlap with this "
"minimum threshold between the twins and "
"only one parent (to reduce false positives) "
"default %(default).1f."),
default=0.5
)
parser.add_argument("--twins_ro",
type=float,
help=("Only consider CNVs that overlap with this "
"minimum threshold between the two twins "
"default %(default).1f."),
default=0.9
)
main(parser.parse_args())