Source code for plot_distance_distribution

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

# Plot the distance between CNVs in all samples.
import re
import argparse 
import numpy as np
import matplotlib.pyplot as plt

import parsers
import validation

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]def main(args): """Plots the distribution of the distance separating CNVs. This tool is essential to observe fragmentation, which can occur if an algorithm does not verify if the region between calls is actually part of the genotyped CNV. In this case, the CNVs will be separated by a very small genomic distance on the CNV scale. It is recommended that a threshold be selected and that a merge of the CNVs separated by a distance under this threshold be merged. .. figure:: _static/fragmentation.png :width: 800px :alt: Example of fragmentation in a family. **Figure 1.** Sample graphs illustrating a fragmentation problem. The cumulative histogram shows that 55% of CNVs are separated by less than 5 kilobases, which is a small distance on the CNV scale. """ p = validation.get_parser_if_valid(args) cnvs = p.get_cnvs() distances = {} if args.base_dir: family = re.search(r"family_([0-9]+)", args.base_dir) if family: family = family.group(1) else: family = args.family for sample in cnvs: for chromo in cnvs[sample]: sorted_cnvs = sorted(cnvs[sample][chromo], key=lambda x: x.start) i = 0 while i < len(sorted_cnvs) - 1: cnv = sorted_cnvs[i] next_cnv = sorted_cnvs[i + 1] if cnv.type == next_cnv.type: if not distances.get(sample): distances[sample] = [] # Distance in kilobases dist = (next_cnv.start - cnv.end - 1) / 1000.0 distances[sample].append(dist) if dist < 0: print cnv, next_cnv print i += 1 mean_distances = [] for sample in distances: distances[sample] = np.array(distances[sample]) mean_dist_for_sample = np.mean(distances[sample]) mean_distances.append(mean_dist_for_sample) mean_distances = np.array(mean_distances) print np.mean(mean_distances) samples_list = ("mother", "father", "twin1", "twin2") # mpp stands for merge_proportion_plot mppxs = {} mppys = {} for sample in samples_list: mppxs[sample], mppys[sample] = merge_proportion_plot(distances[sample]) fig = plt.figure() fig.subplots_adjust(wspace = 1) ax_hist1 = plt.subplot2grid((4, 5), (0, 0), colspan=2) ax_hist2 = plt.subplot2grid((4, 5), (1, 0), colspan=2) ax_hist3 = plt.subplot2grid((4, 5), (2, 0), colspan=2) ax_hist4 = plt.subplot2grid((4, 5), (3, 0), colspan=2) hists = (ax_hist1, ax_hist2, ax_hist3, ax_hist4) ax_curvs = plt.subplot2grid((4, 5), (0, 2), colspan=3, rowspan=4) ax_hist4.set_xlabel("Distance entre les CNV (kilobases)") colors = ("#33B5E5", "#FF4444", "#99CC00", "#FFBB33") colors = dict(zip(distances.keys(), colors)) i = 0 for sample in samples_list: dist_list = distances[sample] dist_list = dist_list[dist_list <= 25] hists[i].hist( dist_list, fc=colors[sample], bins=30, weights=np.zeros_like(dist_list) + 1.0 / len(dist_list) ) i += 1 ax_curvs.set_xlabel(u"Distance entre les CNV (kilobases)") ax_curvs.set_ylabel(u"Proportion cumulée") plot_obj = [] for sample in samples_list: o, = ax_curvs.plot(mppxs[sample], mppys[sample], color=colors[sample]) plot_obj.append(o) ax_curvs.legend(plot_obj, samples_list, loc='lower right') if family: plt.suptitle("Distribution des distances entre les CNV pour " "la famille {}".format(family)) if args.save is None: plt.show() else: plt.savefig(args.save)
def merge_proportion_plot(distances): x = np.arange(1, 100) y = np.zeros(len(x)) for i in xrange(len(y)): y[i] = float(sum(distances <= x[i])) / len(distances) return (x, y) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Computes the distance between CNVs.") parser = validation.add_pickle_args(parser) parser.add_argument("--format", type=str, choices=parsers.__all__, help="The file format for the parser.") parser.add_argument("--base_dir", type=str, help=("The base directory for the calls. A directory with " "four subfolders starting with father, mother, twin1 " "and twin2 are expected.")) parser.add_argument("--threshold", type=float, default=4.6e5, help=("A minimum distance thershold to easily identify the amount " "of CNVs that are too close and need to be merged.")) parser.add_argument("--save", type=str, help="Filename of the output graph file.", default=None ) main(parser.parse_args())