Source code for plot_cnv_size_distribution
#!/usr/bin/env python2.7
# Computes the average CNV size and std.
import argparse
import numpy as np
import matplotlib.pyplot as plt
import validation
import parsers
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
[docs]def main(args):
"""Simple script to plot the histogram of CNV sizes over a dataset.
The mean size and the standard deviation are also displayed.
"""
sizes = []
locs = []
source = []
p = validation.get_parser_if_valid(args)
# Get the CNVs
cnvs = p.get_cnvs()
for sample in cnvs:
for chr in cnvs[sample]:
for cnv in cnvs[sample][chr]:
sizes.append(cnv.size / 1000.0)
locs.append(cnv.get_pos())
source.append(cnv.source)
sizes = np.array(sizes)
mu = np.mean(sizes)
sigma = np.std(sizes)
print "Mean size is: {} kb".format(mu)
print "sigma = {}".format(sigma)
# Print outliers (2 std)
print
print "Outlier CNVs are:"
for i in xrange(len(sizes)):
if sizes[i] < mu - 2 * sigma or sizes[i] > mu + 2 * sigma:
print "{},{},{}".format(source[i], locs[i], sizes[i])
fig = plt.figure()
ax = fig.add_subplot(111)
sizes = sizes[sizes <= 100]
n, bins, patches = ax.hist(
sizes, 50,
weights=np.zeros_like(sizes) + 1.0 / len(sizes),
fc="#33B5E5"
)
ax.set_xticks([int(i) for i in bins])
ax.set_xlabel("Taille (kilobases)")
ax.set_ylabel("Proportion")
if args.save is None:
plt.show()
else:
plt.savefig(args.save)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Computes the average and std CNV size.")
parser = validation.add_pickle_args(parser)
parser.add_argument("--format",
type=str,
choices=parsers.__all__,
help="The file format for the parser.")
parser.add_argument("--base_dir",
type=str,
help=("The base directory for the calls. A directory with "
"four subfolders starting with father, mother, twin1 "
"and twin2 are expected."))
parser.add_argument("--save",
type=str,
help="Filename of the output graph file.",
default=None
)
main(parser.parse_args())