Source code for cnv_counter

#!/usr/bin/env python2.7

import os
import sys
import re
import argparse

import numpy as np

import parsers
import samples_db
import cnv_struct
import validation

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]def main(args):
    """This script is used for quick assessement of the amount and size
    distribution of CNVs according to a reciprocal overlap filter.

    The possible arguments for this scripts are threshold for lower and/or upper
    RO thresholds, or an exact RO value for filtering.
    A ``--size_stats`` argument can be used to display the mean and std size for
    the filtered CNVs.

    The output is of the form:

        FamID, number of match, total, percentage, mean_size, std_size


    """
    counter = 0
    total = 0
    sizes = []

    p = validation.get_parser_if_valid(args)

    cnvs = p.get_cnvs()

    sample_ids = samples_db.get_sample_ids_for_family(p.family_id)

    under = None
    over = None
    if args.under:
        under = args.under
    else:
        # Not restrictive if no constraint supplied.
        under = 1.1

    if args.over:
        over = args.over
    else:
        over = -1.0

    for sample in cnvs:
        other_sample = None
        if sample == "twin1":
            other_sample = "twin2"
        else:
            other_sample = "twin1"

        if sample.startswith("twin"):
            sample_id = sample_ids[sample]
            status = sample

            for chr in cnvs[sample]:
                for cnv in cnvs[sample][chr]:
                    ro, _ = cnv_struct.ro_to_best_in_list(cnv, 
                                                       cnvs[other_sample][chr])
                    if not args.exact is None:
                        if ro == args.exact:
                            counter += 1
                            sizes.append(cnv.size / 1000.0)

                    else:
                        if ro <= under and ro >= over:
                            counter += 1
                            sizes.append(cnv.size / 1000.0)

                    total += 1
                    
    if args.size_stats:
        sizes = np.array(sizes)
        mean = np.mean(sizes)
        std = np.std(sizes)
        print "\t".join([ 
                            "family_{}".format(p.family_id),
                            str(counter),
                            str(total),
                            "{:.2%}".format(1.0 * counter / total),      
                            str(mean),
                            str(std)
                        ])

    else:
        # Prints FamID, number of match, total, percentage.
        print "family_{}\t{}\t{}\t{:.2%}".format(p.family_id, 
                                                 counter,
                                                 total,
                                                 1.0 * counter / total)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description = "Fast filtering from RO")

    parser = validation.add_pickle_args(parser)

    parser.add_argument(
                            "--format",
                            type=str,
                            choices=parsers.__all__,
                            help="The file format for the parser."
                       )

    parser.add_argument(
                            "--base_dir", "-i",
                            type=str,
                            help=("The base directory for the calls. A directory "
                                  "with four subfodlers starting with father, "
                                  "mother, twin1 and twin2 are expected.")
                       )

    parser.add_argument(
                            "--under",
                            type=float,
                            required=False,
                            default=None,
                            help=("Counts CNVs with reciprocal overlap under the "
                                  "given threshold.")
                       )

    parser.add_argument(
                            "--over",
                            type=float,
                            required=False,
                            default=None,
                            help=("Counts CNVs with reciprocal overlap over the "
                                  "given threshold.")
                       )

    parser.add_argument(
                            "--exact",
                            type=float,
                            required=False,
                            default=None,
                            help=("Counts CNVs with reciprocal overlap at this "
                                  "exact threshold (over and under will be "
                                  "ignored.")
                       )

    parser.add_argument(
                            "--size_stats",
                            default=False,
                            action="store_true",
                            help=("Shows mean size and stdev with respect to "
                                  "the provided constraints.")
                       )

    main(parser.parse_args())
Navigation

Source code for cnv_counter

Quick search

Navigation