Source code for venn

#!/usr/bin/python

# Module to generate 5 way Venn diagrams in Python.

import argparse
import pickle
import pprint

import cnv_struct
import parsers
import cnv_db.db as db
from generate_cnv_graph import Vertex, Edge, ConnectedComponent, merge_ccs

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

__doc__ = """

This module generates a svg file containing a 5 way venn diagram. It was used
to generate the diagrams representing the overlap between the different methods.
You can see a sample diagram in Figure 1.

.. figure:: _static/venn.png
    :width: 800px
    :alt: Example of Venn diagram.

    **Figure 1.** A Venn diagram generated by the different methods from the
    :py:mod:`venn` module. The labels were modified using image processing
    software and the file was rasterized and saved to the png format.

"""

# Coordinates are of the form x, y.
DELTA_Y = "0.7ex"

letter = lambda x: chr(x + 1 + 96)

label_to_coors = {
    "a": (30, -300),
    "b": (300, -60),
    "c": (160, 280),
    "d": (-220, 220),
    "e": (-280, -130),
    "ab": (180, -130),
    "ac": (40, 230),
    "ad": (100, -200),
    "ae": (-80, -215),
    "bc": (190, 125),
    "bd": (-190, 120),
    "be": (230, 40),
    "cd": (-60, 220),
    "ce": (-170, -150),
    "de": (-222, 0),
    "abc": (90, 150),
    "abd": (148, -153),
    "abe": (170, -20),
    "acd": (-33, 208),
    "ace": (-93, -193),
    "ade": (20, -180),
    "bcd": (-120, 120),
    "bce": (190, 100),
    "bde": (-211, 32),
    "cde": (-150, -80),
    "abcd": (-30, 160),
    "abce": (140, 80),
    "abde": (120, -100),
    "acde": (-60, -140),
    "bcde": (-160, 20),
    "abcde": (0, 0),
}

[docs]def create_label(x, y, label, dy=DELTA_Y, font_size=None): """Creates the html tag to display text on the Venn diagram. :param x: The x coordinate. :type x: str :param y: The y coordinate. :type y: str :param label: The text to display. :type label: str :param dy: The y axis offset (default: 0.7ex). :type dy: str :param font_size: The font size for the label. :type font_size: str :returns: A string representing the html for this label. :rtype: str """ if font_size: font_size = ' font-size="{}"'.format(font_size) else: font_size = '' s = " <text x='{x}' y='{y}' dy='{dy}'{fs}>{label}</text>\n" s = s.format( x = x, y = y, dy = dy, fs = font_size, label = label, ) return s
[docs]def create_venn_from_dict(d, file_name = 'venn_diagram.svg'): """Creates a Venn diagram from a dictionary of counts for the different sets. :param d: The dictionary, as described below. :type d: dict :param file_name: The output svg filename. :type file_name: str Takes a dictionary containing elements of the power set of ``{a,b,c,d,e}`` as keys and the count for every category as values. """ s = ('<?xml version="1.0" encoding="UTF-8"?>\n' '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">\n' '<svg version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="746" height="742" viewBox="-362 -388 746 742">\n' ' <title>Radially-symmetrical Five-set Venn Diagram</title>\n' ' <desc>Devised by Branko Gruenbaum and rendered by CMG Lee.</desc>\n' ' <defs>\n' ' <ellipse id="ellipse" cx="36" cy="-56" rx="160" ry="320" />\n' ' <g id="ellipses">\n' ' <use xlink:href="#ellipse" fill="#0000ff" />\n' ' <use xlink:href="#ellipse" fill="#0099ff" transform="rotate(72)" />\n' ' <use xlink:href="#ellipse" fill="#00cc00" transform="rotate(144)" />\n' ' <use xlink:href="#ellipse" fill="#cc9900" transform="rotate(216)" />\n' ' <use xlink:href="#ellipse" fill="#ff0000" transform="rotate(288)" />\n' ' </g>\n' ' </defs>\n' ' <use xlink:href="#ellipses" fill-opacity="0.3" />\n' ' <use xlink:href="#ellipses" fill-opacity="0" stroke="#000" stroke-width="2" />\n' ' <g text-anchor="middle" font-family="sans-serif" font-size="16">\n') text_fields = [] for set_id in d: # Coors x, y = label_to_coors[set_id] # The count label = d[set_id] fs = None # Set specific font sizes for larger and smaller regions. if set_id in ('a', 'b', 'c', 'd', 'e'): fs = 40 elif set_id in ('abd', 'acd', 'ace', 'bce', 'bde'): fs = 14 text_field = create_label(x, y, label, font_size = fs) text_fields.append(text_field) s += ''.join(text_fields) s += '\n </g>\n</svg>\n' with open(file_name, 'wb') as f: f.write(s)
[docs]def position_based_venn(cnv_lists, threshold): """Creates the Venn diagram from the lists of CNVs for the different methods. :param cnv_lists: The list containing the CNVs from the different methods. (i.e. a 2D list where li[0] is a list of CNVs from method 0, li[1] is a list of CNVs from method 1, etc.) :type cnv_lists: list :param threshold: The reciprocal overlap threshold to consider CNVs to be identical (``0 < threshold <= 1``). :type threshold: float :returns: The counts dictionary that can be used by the :py:func:`create_venn_from_dict` method. :rtype: dict Since the equivalence between two CNVs is hard to define, we used the method described below to generate the counts for every method. CNVs from the same algorithm that are linked through chaining with other CNVs are discarded from the Venn. :: e.g. 1. ======= ======== 2. ========= 3. ======== 4. ========== In the above example, the two distinct CNVs from algorithm one are in the same cluster when considering `R.O. > 70%`. Thus, we will discard the second CNV from algorithm 1 from the analysis, as incrementing the count would affect the absolute counts from the other algorithms. We will remember the ignored CNVs. """ ignored = {} # Set the special property and merge the lists. li = [] for i in xrange(len(cnv_lists)): print 'Expect set {} to have {} CNVs (total).'.format(i, len(cnv_lists[i])) for cnv in cnv_lists[i]: cnv.__parent_set = i li.append(cnv) # Sort in growing order. li = sorted(li, key = lambda x: (x.chr, x.start)) # Initialize the counts counts = {} for k in label_to_coors: counts[k] = 0 # Walk i = 0 while i + 1 < len(li): # Start a new signature block. signature = set() signature.add(li[i].__parent_set) cnvs = [li[i], ] while i + 1 < len(li) and overlap(li[i], li[i + 1], threshold): next_cnv = li[i + 1] # Look for duplicates withing an algorithm. if next_cnv.__parent_set in signature: l = letter(next_cnv.__parent_set) if ignored.get(l) is None: ignored[l] = [] ignored[l].append(next_cnv) signature.add(next_cnv.__parent_set) cnvs.append(next_cnv) i += 1 # Check if this is the last iteration (add the last CNV because he has # no successor. if i + 1 == len(li) - 1: counts[letter(li[i + 1].__parent_set)] += 1 # The signature for this block is ready. # Check for dgv. in_dgv = False for cnv in cnvs: if not in_dgv: for dgv_cnv in db.query_dgv_overlap(cnv): if cnv.type == dgv_cnv.type: if cnv_struct.overlap(cnv, dgv_cnv, threshold): in_dgv = True break signature = "".join(sorted([letter(s) for s in signature])) if in_dgv: signature += 'e' counts[signature] += 1 i += 1 if len(ignored) > 0: print 'Duplicate CNVs within an algorithm were ignored: ' pprint.pprint(ignored) return counts
def overlap(cnv1, cnv2, threshold): if cnv1.type != cnv2.type: return False else: return cnv_struct.overlap(cnv1, cnv2, threshold) def main(args): # Load the CNV lists. cnv_lists = [] pickles = [args.pickle1, args.pickle2, args.pickle3, args.pickle4] for p in pickles: with open(p, 'rb') as f: cnv_lists.append(pickle.load(f)) # Counting stuff... counts = position_based_venn(cnv_lists, args.threshold) print "Writing the SVG file..." create_venn_from_dict(counts, args.out) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Generate a 5 way Venn diagram' 'from 4 input pickles (lists of cnvs) and the DGV.') parser.add_argument( '--pickle1', '-p1', type=str, required=True, help='First pickle containing a list of CNVs.' ) parser.add_argument( '--pickle2', '-p2', type=str, required=True, help='Second pickle containing a list of CNVs.' ) parser.add_argument( '--pickle3', '-p3', type=str, required=True, help='Third pickle containing a list of CNVs.' ) parser.add_argument( '--pickle4', '-p4', type=str, required=True, help='Fourth pickle containing a list of CNVs.' ) parser.add_argument( '--label1', '-l1', type=str, default='1', help='Label for the first list of cnvs.' ) parser.add_argument( '--label2', '-l2', type=str, default='2', help='Label for the second list of cnvs.' ) parser.add_argument( '--label3', '-l3', type=str, default='3', help='Label for the third list of cnvs.' ) parser.add_argument( '--label4', '-l4', type=str, default='4', help='Label for the fourth list of cnvs.' ) parser.add_argument( '--out', '-o', type=str, default='venn_diagram.svg', help='Output svg filename.' ) parser.add_argument( '--threshold', '-ro', type=float, default=0.7, help='Minimum reciprocal overlap for the Venn.' ) main(parser.parse_args())