Source code for cnv_counter

#!/usr/bin/env python2.7

import os
import sys
import re
import argparse

import numpy as np

import parsers
import samples_db
import cnv_struct
import validation

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]def main(args): """This script is used for quick assessement of the amount and size distribution of CNVs according to a reciprocal overlap filter. The possible arguments for this scripts are threshold for lower and/or upper RO thresholds, or an exact RO value for filtering. A ``--size_stats`` argument can be used to display the mean and std size for the filtered CNVs. The output is of the form: FamID, number of match, total, percentage, mean_size, std_size """ counter = 0 total = 0 sizes = [] p = validation.get_parser_if_valid(args) cnvs = p.get_cnvs() sample_ids = samples_db.get_sample_ids_for_family(p.family_id) under = None over = None if args.under: under = args.under else: # Not restrictive if no constraint supplied. under = 1.1 if args.over: over = args.over else: over = -1.0 for sample in cnvs: other_sample = None if sample == "twin1": other_sample = "twin2" else: other_sample = "twin1" if sample.startswith("twin"): sample_id = sample_ids[sample] status = sample for chr in cnvs[sample]: for cnv in cnvs[sample][chr]: ro, _ = cnv_struct.ro_to_best_in_list(cnv, cnvs[other_sample][chr]) if not args.exact is None: if ro == args.exact: counter += 1 sizes.append(cnv.size / 1000.0) else: if ro <= under and ro >= over: counter += 1 sizes.append(cnv.size / 1000.0) total += 1 if args.size_stats: sizes = np.array(sizes) mean = np.mean(sizes) std = np.std(sizes) print "\t".join([ "family_{}".format(p.family_id), str(counter), str(total), "{:.2%}".format(1.0 * counter / total), str(mean), str(std) ]) else: # Prints FamID, number of match, total, percentage. print "family_{}\t{}\t{}\t{:.2%}".format(p.family_id, counter, total, 1.0 * counter / total)
if __name__ == "__main__": parser = argparse.ArgumentParser(description = "Fast filtering from RO") parser = validation.add_pickle_args(parser) parser.add_argument( "--format", type=str, choices=parsers.__all__, help="The file format for the parser." ) parser.add_argument( "--base_dir", "-i", type=str, help=("The base directory for the calls. A directory " "with four subfodlers starting with father, " "mother, twin1 and twin2 are expected.") ) parser.add_argument( "--under", type=float, required=False, default=None, help=("Counts CNVs with reciprocal overlap under the " "given threshold.") ) parser.add_argument( "--over", type=float, required=False, default=None, help=("Counts CNVs with reciprocal overlap over the " "given threshold.") ) parser.add_argument( "--exact", type=float, required=False, default=None, help=("Counts CNVs with reciprocal overlap at this " "exact threshold (over and under will be " "ignored.") ) parser.add_argument( "--size_stats", default=False, action="store_true", help=("Shows mean size and stdev with respect to " "the provided constraints.") ) main(parser.parse_args())