Source code for mendelian

#!/usr/bin/env python2.7

# Creates the mendelian CNVs bed and pickle.

import sys
import argparse
import pickle

import numpy as np

import bed
import validation
import parsers
import cnv_struct

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]def main(args): """Displays statistics about the rate of inherited CNVs for a particular algorithm and write a pickle file containing those variants. The statistics are the following: - The proportion of twin-overlapping CNVs: The proportion of CNVs which are shared between both twins (The closer to 1 the better as the twins have identical genomes, when neglecting somatic `de novo` variations). - The proportion of twin-overlapping CNVs that are inherited: The proportion of CNVs that are shared with a parent, when considering only CNVs that are shared between both twins. This expected to be a value close to one as Mendelian inheritance dictates that CNVs that are in the twins are inherited from the parents (if we neglect the `de novo` rate). - The proportion of inherited CNVs: The proportion of CNVs which are shared amongst both twins and at least one parent. .. note:: The script's ``--help`` option will show the user how to customize the reciprocal overlap thresholds for comparison of CNVs between twins and with their parents. """ # if (args.format == "cnver" and # args.doc_over == float("-infinity") and # args.doc_under == float("+infinity")): # # Default values for cnver # doc_lower_bound = 0.25 # doc_upper_bound = 3.0 # else: # doc_lower_bound = args.doc_over # doc_upper_bound = args.doc_under doc_lower_bound = args.doc_over doc_upper_bound = args.doc_under parents_ro = args.parents_ro twins_ro = args.twins_ro p = validation.get_parser_if_valid(args) gains = 0 losses = 0 # Get the CNVs cnvs = p.get_cnvs() inherited, counts = get_inherited(cnvs, twins_ro, parents_ro, doc_lower_bound, doc_upper_bound) i = 2.0 * counts["good_twin_ro"] / counts["total"] print ("The proportion of twin-overlapping CNVs is {}".format(i)) i = float(len(inherited)) / counts["good_twin_ro"] print ("The proportion of twin-overlapping " "CNVs that are inherited is: {}").format(i) i = 2.0 * len(inherited) / counts["total"] print "The proportion of inherited CNVs is: {} ({}/{})".format(i, 2 * len(inherited), counts["total"]) # Write pickle if args.format: fn = "inherited_{}_{}.pickle".format(p.family_id, args.format) else: fn = "inherited_{}.pickle".format(p.family_id) with open(fn, "w") as f: pickle.dump(inherited, f)
[docs]def get_inherited(cnvs, twins_ro, parents_ro, doc_lower_bound = float("-infinity"), doc_upper_bound = float("+infinity")): """Filters the input CNV list to detect and count the inherited variants. :param cnvs: A dict of CNVs from a same algorithm and family to filter for inherited CNVs (of the usual form: sample, chromosome, cnvs. :type cnvs: dict :param twins_ro: Minimum reciprocal overlap value to consider two CNVs to be the same between twins. :type twins_ro: float :param parents_ro: Minimum reciprocal overlap value to consider two CNVs to be the same between a twin and a parent. :type parents_ro: float :param doc_lower_bound: Minimum ``doc`` parameter for the filtered CNVs, only used in very specific cases. :type doc_lower_bound: float :param doc_upper_bound: Maximum ``doc`` parameter for the filtered CNVs, only used in very specific cases. :type doc_upper_bound: float :return: A tuple representing the list of inherited CNVs and a dictionary informations about the frequency of specific scenarios for display to the user. :rtype: tuple CNVs are considered to be inherited if they are shared between the two twins and at least one parent, with respect to the `ro` parameters. DOC values are allowed for filtering of CNVs with respect to the values in the CNV's object ``doc`` attribute. """ inherited = [] counts = {"gain": 0, "loss": 0, "good_twin_ro": 0, "total": 0} for chr in cnvs['twin1']: counts["total"] += (len(cnvs['twin1'][chr]) + len(cnvs['twin2'][chr])) for cnv in cnvs['twin1'][chr]: twin_ro, twin_cnv = cnv_struct.ro_to_best_in_list( cnv, cnvs['twin2'][chr], return_cnv = True ) twin_ro = min(twin_ro) father_ro = cnv_struct.ro_to_best_in_list( cnv, cnvs["father"][chr] ) mother_ro = cnv_struct.ro_to_best_in_list( cnv, cnvs["mother"][chr] ) # This should be above the ro threshold. max_min_parent_ro = max(min(mother_ro), min(father_ro)) add = True if (cnv.doc and cnv.doc < doc_lower_bound or cnv.doc > doc_upper_bound): add = False elif twin_ro < twins_ro: add = False elif max_min_parent_ro < parents_ro: add = False # Statistics. if twin_ro > twins_ro: counts["good_twin_ro"] += 1 if add: # Create a new CNV representing this inherited CNV. assert cnv.type == twin_cnv.type assert cnv.chr == twin_cnv.chr pos = "chr{}:{}-{}".format( chr, min(twin_cnv.start, cnv.start), max(twin_cnv.end, cnv.end), ) new_cnv = cnv_struct.cnv( pos = pos, type = cnv.type, algo = cnv.algo, source = None, confidence = getattr(cnv, "confidence", None) ) inherited.append(new_cnv) if new_cnv.type in ("gain", "loss"): counts[new_cnv.type] += 1 return inherited, counts
if __name__ == "__main__": parser = argparse.ArgumentParser(description="Computes the average and std CNV size.") parser = validation.add_pickle_args(parser) parser.add_argument("--format", type=str, choices=parsers.__all__, help="The file format for the parser.") parser.add_argument("--base_dir", type=str, help=("The base directory for the calls. A directory with " "four subfolders starting with father, mother, twin1 " "and twin2 are expected.")) parser.add_argument("--doc_under", type=float, help="Maximum doc value (for filtering)", default=float("+infinity"), ) parser.add_argument("--doc_over", type=float, help="Minimum doc value (for filtering)", default=float("-infinity"), ) parser.add_argument("--parents_ro", type=float, help=("Only consider CNVs that overlap with this " "minimum threshold between the twins and " "only one parent (to reduce false positives) " "default %(default).1f."), default=0.5 ) parser.add_argument("--twins_ro", type=float, help=("Only consider CNVs that overlap with this " "minimum threshold between the two twins " "default %(default).1f."), default=0.9 ) main(parser.parse_args())