Source code for de_novo_germinal

#!/usr/bin/env python2.7

import sys
import argparse
import pickle

import numpy as np

import bed
import validation
import parsers
import cnv_struct

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]def main(args): """Filters out the germline de novo CNVs from a given dataset and writes different summary files. Multiple files are generated: Pickle files after merge and BED files for visualization will be created for every family. The number of de novo CNVs will be appended to the ``de_novo_germinal_summary.txt`` file in the form "family_id [TAB] number of gains [TAB] number of losses". """ # Hard code Filtering values TODO: integrate to argparse. if (args.format == "cnver" and args.doc_over == float("-infinity") and args.doc_under == float("+infinity")): # Default values for cnver doc_lower_bound = 0.25 doc_upper_bound = 3.0 else: doc_lower_bound = args.doc_over doc_upper_bound = args.doc_under ro_over = args.ro_twins # Upper bound for ro between twins. ro_under = args.ro_parents # Upper boud for re between twin and parents. # done p = validation.get_parser_if_valid(args) gains = 0 losses = 0 # Get the CNVs cnvs = p.get_cnvs() de_novos = [] counts = {"gain": 0, "loss": 0} other = "twin2" for sample in cnvs: if sample == "twin1": for chr in cnvs[sample]: for cnv in cnvs[sample][chr]: twin_ro, twin_cnv = cnv_struct.ro_to_best_in_list( cnv, cnvs[other][chr], return_cnv = True ) twin_ro = min(twin_ro) father_ro = cnv_struct.ro_to_best_in_list( cnv, cnvs["father"][chr] ) mother_ro = cnv_struct.ro_to_best_in_list( cnv, cnvs["mother"][chr] ) max_max_parent_ro = max([max(father_ro), max(mother_ro)]) add = (twin_ro >= ro_over and max_max_parent_ro <= ro_under and cnv.size / 1000.0 > args.size_over) if args.confidence_over is not None and add: print cnv.confidence print args.confidence_over print cnv.confidence > args.confidence_over add = add and (cnv.confidence > args.confidence_over) if args.doc_under is not None and args.doc_over is not None: add = add and (cnv.doc > doc_lower_bound and cnv.doc < doc_upper_bound) if add: # Create a new CNV representing this de novo event. assert cnv.type == twin_cnv.type assert cnv.chr == twin_cnv.chr pos = "chr{}:{}-{}".format( chr, min(twin_cnv.start, cnv.start), max(twin_cnv.end, cnv.end), ) denovo = cnv_struct.cnv( pos = pos, type = cnv.type, algo = cnv.algo, source = "denovo", ) de_novos.append(denovo) counts[denovo.type] += 1 # Write pickle with open("de_novo_germinal_{}.pickle".format(p.family_id), "w") as f: pickle.dump(de_novos, f) # Write beds with open("de_novo_germinal_{}.bed".format(p.family_id), "w") as f: header_line = ("track name=\"germinal_{algo}_{family}\" " "description=\"Germinal de novo CNVs for " "family {family}\" visibility=2 itemRgb=\"On\"\n") f.write(header_line.format(family = p.family_id, algo = de_novos[0].algo)) line_template = "\t".join([ "chr{chr}", "{start}", "{end}", "{info}", "0", "+", "{start}", "{end}", "{color}\n"]) for cnv in de_novos: l = line_template.format( chr = cnv.chr, start = cnv.start, end = cnv.end, info = "type={}".format(cnv.type), color = bed.colors[cnv.cn] ) f.write(l) # Write the report. with open("de_novo_germinal_summary.txt", "a") as f: # family,gains,losses f.write("{}\t{}\t{}\n".format( p.family_id, counts["gain"], counts["loss"] ))
if __name__ == "__main__": parser = argparse.ArgumentParser(description="Computes the average and std CNV size.") parser = validation.add_pickle_args(parser) parser.add_argument("--format", type=str, choices=parsers.__all__, help="The file format for the parser.") parser.add_argument("--base_dir", type=str, help=("The base directory for the calls. A directory with " "four subfolders starting with father, mother, twin1 " "and twin2 are expected.")) parser.add_argument("--doc_under", type=float, help="Maximum doc value (for filtering)", default=None, ) parser.add_argument("--doc_over", type=float, help="Minimum doc value (for filtering)", default=None, ) parser.add_argument("--ro_twins", type=float, help=("Only CNVs with RO between twins over this value" " will be considered."), default=0.9 ) parser.add_argument("--ro_parents", type=float, help=("Only CNVs with maximum parent RO under this " "value will be considered."), default=0.01 ) parser.add_argument("--size_over", type=float, help=("Minimum size for CNVs."), default=5 ) parser.add_argument("--confidence_over", type=float, help="Minimum confidence for CNVs.", default=None, ) main(parser.parse_args())