Source code for plot_cnv_size_distribution

#!/usr/bin/env python2.7

# Computes the average CNV size and std.

import argparse
import numpy as np
import matplotlib.pyplot as plt

import validation
import parsers

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]def main(args): """Simple script to plot the histogram of CNV sizes over a dataset. The mean size and the standard deviation are also displayed. """ sizes = [] locs = [] source = [] p = validation.get_parser_if_valid(args) # Get the CNVs cnvs = p.get_cnvs() for sample in cnvs: for chr in cnvs[sample]: for cnv in cnvs[sample][chr]: sizes.append(cnv.size / 1000.0) locs.append(cnv.get_pos()) source.append(cnv.source) sizes = np.array(sizes) mu = np.mean(sizes) sigma = np.std(sizes) print "Mean size is: {} kb".format(mu) print "sigma = {}".format(sigma) # Print outliers (2 std) print print "Outlier CNVs are:" for i in xrange(len(sizes)): if sizes[i] < mu - 2 * sigma or sizes[i] > mu + 2 * sigma: print "{},{},{}".format(source[i], locs[i], sizes[i]) fig = plt.figure() ax = fig.add_subplot(111) sizes = sizes[sizes <= 100] n, bins, patches = ax.hist( sizes, 50, weights=np.zeros_like(sizes) + 1.0 / len(sizes), fc="#33B5E5" ) ax.set_xticks([int(i) for i in bins]) ax.set_xlabel("Taille (kilobases)") ax.set_ylabel("Proportion") if args.save is None: plt.show() else: plt.savefig(args.save)
if __name__ == "__main__": parser = argparse.ArgumentParser(description="Computes the average and std CNV size.") parser = validation.add_pickle_args(parser) parser.add_argument("--format", type=str, choices=parsers.__all__, help="The file format for the parser.") parser.add_argument("--base_dir", type=str, help=("The base directory for the calls. A directory with " "four subfolders starting with father, mother, twin1 " "and twin2 are expected.")) parser.add_argument("--save", type=str, help="Filename of the output graph file.", default=None ) main(parser.parse_args())