#!/usr/bin/env python2.7
import os
import sys
import re
import argparse
import numpy as np
import parsers
import samples_db
import cnv_struct
import validation
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
[docs]def main(args):
"""This script is used for quick assessement of the amount and size
distribution of CNVs according to a reciprocal overlap filter.
The possible arguments for this scripts are threshold for lower and/or upper
RO thresholds, or an exact RO value for filtering.
A ``--size_stats`` argument can be used to display the mean and std size for
the filtered CNVs.
The output is of the form:
FamID, number of match, total, percentage, mean_size, std_size
"""
counter = 0
total = 0
sizes = []
p = validation.get_parser_if_valid(args)
cnvs = p.get_cnvs()
sample_ids = samples_db.get_sample_ids_for_family(p.family_id)
under = None
over = None
if args.under:
under = args.under
else:
# Not restrictive if no constraint supplied.
under = 1.1
if args.over:
over = args.over
else:
over = -1.0
for sample in cnvs:
other_sample = None
if sample == "twin1":
other_sample = "twin2"
else:
other_sample = "twin1"
if sample.startswith("twin"):
sample_id = sample_ids[sample]
status = sample
for chr in cnvs[sample]:
for cnv in cnvs[sample][chr]:
ro, _ = cnv_struct.ro_to_best_in_list(cnv,
cnvs[other_sample][chr])
if not args.exact is None:
if ro == args.exact:
counter += 1
sizes.append(cnv.size / 1000.0)
else:
if ro <= under and ro >= over:
counter += 1
sizes.append(cnv.size / 1000.0)
total += 1
if args.size_stats:
sizes = np.array(sizes)
mean = np.mean(sizes)
std = np.std(sizes)
print "\t".join([
"family_{}".format(p.family_id),
str(counter),
str(total),
"{:.2%}".format(1.0 * counter / total),
str(mean),
str(std)
])
else:
# Prints FamID, number of match, total, percentage.
print "family_{}\t{}\t{}\t{:.2%}".format(p.family_id,
counter,
total,
1.0 * counter / total)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Fast filtering from RO")
parser = validation.add_pickle_args(parser)
parser.add_argument(
"--format",
type=str,
choices=parsers.__all__,
help="The file format for the parser."
)
parser.add_argument(
"--base_dir", "-i",
type=str,
help=("The base directory for the calls. A directory "
"with four subfodlers starting with father, "
"mother, twin1 and twin2 are expected.")
)
parser.add_argument(
"--under",
type=float,
required=False,
default=None,
help=("Counts CNVs with reciprocal overlap under the "
"given threshold.")
)
parser.add_argument(
"--over",
type=float,
required=False,
default=None,
help=("Counts CNVs with reciprocal overlap over the "
"given threshold.")
)
parser.add_argument(
"--exact",
type=float,
required=False,
default=None,
help=("Counts CNVs with reciprocal overlap at this "
"exact threshold (over and under will be "
"ignored.")
)
parser.add_argument(
"--size_stats",
default=False,
action="store_true",
help=("Shows mean size and stdev with respect to "
"the provided constraints.")
)
main(parser.parse_args())