#!/usr/bin/env python2.7
import sys
import time
import argparse
import pprint
import pickle
import gzip
import numpy as np
import validation
import parsers
import cnv_struct
# This file is part of CNVAnalysisToolkit.
#
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
def main(args):
    """Generates a graph structure representing the familial status for a given
    locus.

    The signature matrix represents the status of every individual of a given
    family at a given locus. The status can be ``+``, ``-`` or ``0``,
    representing a gain, a loss or a `no call`, respectively. This being said,
    given a particular locus, the Mendelian inheritance of a variant can be
    quickly assessed by contemplating the status of every individual from a
    family. This is why we generate matrices with the status symbol for every
    individual in the family and count the number of times a signature occurs.

    As an example, if both twins and the mother have a deletion and the father
    had no CNV called at the given region, the signature would be
    ``(-, -, -, 0)`` as the arbitrary order for signatures is always
    ``(twin1, twin2, mother, father)``.

    The goal of such an analysis is to quickly assess the amount of inherited
    CNVs and to detect any algorithm-specific bias.

    A pileup file parsing utility is also integrated with this tool, allowing
    the validation of the regions by comparing the coverage inside and outside
    of the CNV loci.
    """
    p = validation.get_parser_if_valid(args)
    # Get the CNVs
    cnvs = p.get_cnvs()
    # Create the main graph
    print("Creating the main graph.")
    connected_components = create_family_graph(cnvs, args.threshold)
    # Merge CNVs from same type and same sample / merge overlapping CNVs.
    print("Cleaning connected components (merge same sample vertices).")
    connected_components = clean_ccs(connected_components)
    # Save the graph pickle. Bug fix: the message used to be printed even
    # when no --save path was provided.
    if args.save is not None:
        print("Saving the basic graph pickle.")
        with open(args.save, "wb") as f:
            pickle.dump(connected_components, f)
    # If all four pileup paths are provided, coverage information is fetched.
    pileups = (args.twin1_pileup,
               args.twin2_pileup,
               args.mother_pileup,
               args.father_pileup)
    if None not in pileups:
        print("Loading the pileups.")
        print("Fetching coverage information for samples in family.")
        get_coverage(connected_components, *pileups)
        print("Writing the graph with coverage information.")
        with open("ccs_with_coverage.pickle", "wb") as f:
            pickle.dump(connected_components, f)
    print("Building the final matrix")
    # Build the signature matrix.
    check_profiles(connected_components)
[docs]def get_coverage(cc_list, twin1_pileup, twin2_pileup,
mother_pileup, father_pileup, window_size = 1000):
"""Computes the coverage inside and outside of every CNV loci represented
by a connected component in the cc_list graph.
:param cc_list: Graph represented by a list of its connected components.
:type cc_list: list
:param twin1_pileup: Path to the pileup for twin1.
:type twin1_pileup: str
:param twin2_pileup: Path to the pileup for twin2.
:type twin2_pileup: str
:param mother_pileup: Path to the pileup for mother.
:type mother_pileup: str
:param father_pileup: Path to the pileup for father.
:type father_pileup: str
:param window_size: The size of genomic window around the region for
coverage computation.
:type window_size: int
Concretely, this script adds the region_doc and cc_doc attributes to every
connected component in the graph. The difference between those values
can then be included in the printed matrices.
"""
cc_list = sorted(cc_list, key = lambda cc: (cc.chr, cc.start))
t1 = open_pileup(twin1_pileup)
t2 = open_pileup(twin2_pileup)
mother = open_pileup(mother_pileup)
father = open_pileup(father_pileup)
pileups = [t1, t2, mother, father]
normalize_pileups_starting_position(*pileups)
indexes = create_seek_index(pileups)
count = 0
for cc in cc_list:
count += 1
if count % int(float(len(cc_list)) / 100) == 0:
print int(round(100.0 * count / len(cc_list))), "%"
# Seek to beginning of region - window size.
eof = seek_to(cc.start - window_size, cc.chr, pileups, indexes)
if eof:
# We don't have coverage information for this cc region.
cc.region_doc = None
cc.out_doc = None
continue
# Average coverage until region start.
ones = np.ones(4)
cov_prefix = np.zeros(4)
prefix_count = np.ones(4)
lines = [i.readline().split("\t") for i in pileups]
positions = [int(i[1]) for i in lines]
while int(lines[0][1]) < cc.start:
cov_prefix += np.array([int(i[3]) for i in lines])
prefix_count += ones
lines = [i.readline().split("\t") for i in pileups]
# Average coverage in region.
cov_region = np.zeros(4)
region_count = np.ones(4)
lines = [i.readline().split("\t") for i in pileups]
while int(lines[0][1]) < cc.end:
cov_region += np.array([int(i[3]) for i in lines])
region_count += ones
lines = [i.readline().split("\t") for i in pileups]
# Average coverage after region.
cov_postfix = np.zeros(4)
postfix_count = np.ones(4)
lines = [i.readline().split("\t") for i in pileups]
while int(lines[0][1]) < cc.end + window_size:
cov_postfix += np.array([int(i[3]) for i in lines])
postfix_count += ones
lines = [i.readline().split("\t") for i in pileups]
# Compute the averages.
cov_prefix /= prefix_count
cov_region /= region_count
cov_postfix /= postfix_count
cov_out = np.zeros(4)
for i in xrange(len(cov_prefix)):
cov_out[i] = (cov_prefix[i] + cov_postfix[i]) / 2.0
cc.region_doc = cov_region
cc.out_doc = cov_out
for f in pileups:
f.close()
def create_seek_index(pileups):
    """Builds, for every pileup, a coarse map from genomic position to file
    offset (``tell``) so that later seeks can jump close to a target.

    Use it only on unzipped files (``seek``/``tell`` on gzip streams would be
    meaningless for this purpose). Every handle is rewound to offset 0 before
    returning.

    :returns: A list of four lists of ``(chromosome, position, offset)``
              tuples, one list per sample.
    """
    indexes = [[], [], [], []]
    print("Creating seek index.")
    step = int(5e6)
    for idx, handle in enumerate(pileups):
        # Sample the file every `step` bytes until EOF or a sex chromosome.
        while True:
            handle.seek(handle.tell() + step)
            # Discard the partial line we landed in.
            handle.readline()
            offset = handle.tell()
            fields = handle.readline().split("\t")
            try:
                position = int(fields[1])
                chromosome = int(fields[0][3:])
            except IndexError:
                # EOF.
                break
            except ValueError:
                # Non-numeric (sexual) chromosome name.
                break
            indexes[idx].append((chromosome, position, offset))
    for handle in pileups:
        handle.seek(0)
    print("Finished creating seek index.")
    return indexes
def normalize_pileups_starting_position(*pileups):
    """Advances every pileup handle (with readline) so that all files are
    positioned at the same starting point: the highest starting position
    among them.
    """
    first_rows = [handle.readline().split("\t") for handle in pileups]
    # All files must start on the same, numeric (autosomal) chromosome.
    chromosomes = [int(row[0][3:]) for row in first_rows]
    assert len(set(chromosomes)) == 1 and type(chromosomes[0]) is int
    target = max(int(row[1]) for row in first_rows)
    for handle, row in zip(pileups, first_rows):
        position = int(row[1])
        # Consume rows until this file catches up with the latest starter.
        while position < target:
            position = int(handle.readline().split("\t")[1])
def seek_to(position, chromo, pileups, indexes, seek_profiling = False):
    """Seeks to a given position for all pileups.

    :param position: Target genomic position.
    :type position: int
    :param chromo: Target chromosome number (autosomes only).
    :type chromo: int
    :param pileups: Open pileup file handles, advanced in lockstep.
    :type pileups: list
    :param indexes: Seek index from :py:func:`create_seek_index`: one list of
                    (chromosome, position, offset) tuples per pileup.
    :type indexes: list
    :param seek_profiling: When True, prints seek tracing/timing information.
    :type seek_profiling: bool

    Returns True when a non-numeric (sexual) chromosome was parsed.
    NOTE(review): the returned flag is the value computed for the last pileup
    of the final pass only -- confirm this is the intended EOF/end-of-autosome
    signal for all samples.
    """
    # Compute metrics for the number of back-seeks as a function of the
    # distance.
    first = True
    stop = False
    ti = time.time()
    if seek_profiling: print "\tseek {}:{}".format(chromo, position)
    # Read current lines
    lines = [i.readline() for i in pileups]
    # The indexes are of the form: chromo => genomic position => seek position.
    # Start seeking
    while not stop:
        stop = True
        # Foreach pilup
        for i in xrange(len(pileups)):
            # We make sure that the parsed chromosome is still before
            # the position we're looking for.
            sexual = False
            try:
                this_chromo = int(lines[i].split("\t")[0][3:])
            except ValueError:
                # Non-numeric chromosome name (e.g. chrX/chrY).
                # NOTE(review): if this triggers on the very first pass,
                # this_chromo is read below while still unbound -- TODO confirm
                # pileups always start on an autosome.
                sexual = True
            this_pos = int(lines[i].split("\t")[1])
            if not sexual:
                assert this_chromo <= chromo
            # If we're too early or too far from destination, check the
            # index.
            if (first and
                    abs(this_chromo * this_pos - chromo * position) > 100000):
                if seek_profiling: sys.stdout.write("\tINDEX LOOKUP... ")
                # Sorted genomic positions
                milestones = sorted(indexes[i],
                                    key = lambda x: (x[0], x[1]))
                # Find closes 'floored' milestone and seek there.
                # TODO: log(n) lookups since milestones are sorted.
                j = 0
                # Seek to beginning of the right chromosome
                # NOTE(review): milestones[j] is evaluated before the
                # j < len(milestones) guard in both loops below; this can
                # IndexError at the end of the index -- TODO confirm.
                if chromo != 1:
                    while milestones[j][0] < chromo and j < len(milestones):
                        j += 1
                # Now seek to the right position.
                while (milestones[j][1] < position and
                       milestones[j][0] == chromo and
                       j < len(milestones)):
                    j += 1
                if seek_profiling:
                    s = "INDEX HIT: ({}:{}) -> ".format(chromo, position)
                    s += str(milestones[j])
                    print s
                this_chromo = milestones[j][0]
                this_pos = milestones[j][1]
                pileups[i].seek(milestones[j][2])
                stop = False
            # Check if we're too far: back up in fixed 10 kB byte steps until
            # we are at or before the requested locus again.
            if first and (this_pos > position or this_chromo > chromo):
                go_back = True
                while go_back:
                    # if seek_profiling: print "<<<< "
                    pileups[i].seek(pileups[i].tell() - 10000)
                    pileups[i].readline()
                    lines[i] = pileups[i].readline()
                    l = lines[i].split("\t")
                    this_pos = int(l[1])
                    this_chromo = int(l[0][3:])
                    if (this_pos <= position and
                            this_chromo <= chromo):
                        go_back = False
                        stop = False
            if this_chromo < chromo or this_pos < position:
                stop = False
                # Seek individually.
                lines[i] = pileups[i].readline()
        first = False
    # True if we reached the end of the autosomes in at least one pileup.
    positions = [int(i.split("\t")[1]) for i in lines]
    tf = time.time()
    if seek_profiling: print "\tendseek {} ({})".format(positions, tf - ti)
    return sexual
def open_pileup(path):
    """Opens a pileup file, transparently handling gzip compression based on
    the ``.gz`` file extension. """
    is_gzipped = path.endswith(".gz")
    opener = gzip.open if is_gzipped else open
    return opener(path, "rb" if is_gzipped else "r")
def check_profiles(graph):
    """Counts the signatures and prints the matrix given a signature graph.

    As described in the :py:func:`generate_cnv_graph.main` method, the
    signatures represent the status of every member of the family at a given
    locus. A sample matrix could be:

    +-------+-------+--------+--------+-------+
    | Twin1 | Twin2 | Mother | Father | Count |
    +=======+=======+========+========+=======+
    | \+    | \+    | \-     | 0      | 52    |
    +-------+-------+--------+--------+-------+

    which says that at 52 loci, both twins had gains, the mother had a
    deletion and the father had no detected CNV.

    :param graph: The graph as a list of connected components.
    :type graph: list
    :returns: dict mapping signature strings (e.g. ``"+0-0"``) to their
              number of occurrences (also printed to stdout).
    """
    profiles = {}
    # profile (e.g. +0+0) to array of mean coverage deltas.
    coverage = {}
    # Arbitrary, fixed column order for the signatures.
    indexes = {
        "twin1": 0,
        "twin2": 1,
        "mother": 2,
        "father": 3,
    }
    symbols = {
        "loss": '-',
        "gain": '+'
    }
    # Coverage columns are only printed when every cc carries coverage info.
    depth_mode = True
    for cc in graph:
        if not hasattr(cc, "region_doc"):
            depth_mode = False
        if depth_mode:
            # NOTE(review): region_doc can be None when get_coverage found no
            # data for the region; this subtraction would then raise -- TODO
            # confirm such ccs are filtered upstream.
            delta = cc.region_doc - cc.out_doc
        # All no calls at first.
        profile = ['0', ] * 4
        for vertex in cc.adj_list:
            cnv = vertex.cnv
            profile[indexes[cnv.source]] = symbols[cnv.type]
        profile = "".join(profile)
        if depth_mode:
            if coverage.get(profile) is None:
                coverage[profile] = delta
            else:
                coverage[profile] = np.mean(
                    np.array([coverage[profile], delta]),
                    axis = 0
                )
        # Bug fix: the first occurrence used to be initialised to 1 and then
        # immediately incremented, over-counting every signature by one.
        profiles[profile] = profiles.get(profile, 0) + 1
    if depth_mode:
        # Dcov is the difference in coverage from in region to out of the
        # region: cov_in - cov_out => negative if deletion.
        print("Twin1\tTwin2\tMother\tFather\tCount\tDcov_twin1\t"
              "Dcov_twin2\tDcov_mother\tDcov_father")
        for key in profiles:
            print("{}\t{}\t{}".format(
                "\t".join(list(key)),
                profiles[key],
                "\t".join([str(i) for i in coverage[key]])
            ))
    else:
        print("Twin1\tTwin2\tMother\tFather")
        for key in profiles:
            print("{}\t{}".format(
                "\t".join(list(key)),
                profiles[key],
            ))
    return profiles
def clean_ccs(ccs):
    """Merges CNVs that have the same source (sample) within each connected
    component.

    :param ccs: The graph as a list of Connected Components.
    :type ccs: list
    :returns: The cleaned list (``ccs`` is mutated in place; components left
              empty by the merging are removed).

    This is used so that connected components represent families with a single
    representation for every individual. Thus, we merge indirectly overlapping
    loci, meaning that if two CNVs from an individual are both overlapped by
    CNVs from another individual within a family, they will be merged.
    """
    empty_ccs = set()
    for cc in ccs:
        # Bug fix: 'inspected' stores plain list indexes and used to be shared
        # across components, so every index consumed by the first component
        # blocked the inspection of the same indexes in all later components
        # (only the first cc was actually cleaned). It must be per-component.
        inspected = set()
        to_remove = set()
        to_add = set()
        for i in xrange(len(cc.adj_list)):
            v1 = cc.adj_list[i]
            cnv1 = v1.cnv
            inspected.add(i)
            # Only look at pairs (i, j) not yet inspected.
            for j in xrange(len(cc.adj_list)):
                if j not in inspected:
                    v2 = cc.adj_list[j]
                    cnv2 = v2.cnv
                    if cnv1.source == cnv2.source and cnv1.type == cnv2.type:
                        assert cnv1.chr == cnv2.chr
                        # Replace both CNVs by one spanning their union.
                        params = {
                            "start": min(cnv1.start, cnv2.start),
                            "end": max(cnv1.end, cnv2.end),
                            "algo": "merged",
                            "source": cnv1.source,
                            "chr": cnv1.chr,
                            "type": cnv1.type,
                        }
                        new_cnv = cnv_struct.cnv(**params)
                        new_vertex = Vertex(new_cnv)
                        to_remove.add(v1)
                        to_remove.add(v2)
                        to_add.add(new_vertex)
                        inspected.add(j)
        for v in to_remove:
            cc.remove(v)
        for v in to_add:
            cc.add(v)
        if len(cc.adj_list) == 0:
            empty_ccs.add(cc)
    for cc in empty_ccs:
        ccs.remove(cc)
    return ccs
def create_family_graph(cnvs, threshold):
    """Creates the graph representing the CNVs from a given family as a list
    of connected components.

    Nodes represent CNVs and edges represent overlap between CNVs, so each
    connected component of the complete graph corresponds to one locus.
    """
    components = []
    # Flatten a per-member dict of CNVs into a plain list if necessary.
    if type(cnvs) is not list:
        cnvs = parsers.family_to_list(cnvs)
    for cnv in cnvs:
        # Sex chromosomes are ignored.
        if cnv.chr in ("chrX", "chrY"):
            continue
        # Every component this cnv could belong to.
        matching = [cc for cc in components if cc.suitable(cnv)]
        if not matching:
            # No suitable component: start a new one with this cnv.
            components.append(ConnectedComponent(cnv, threshold))
        elif len(matching) == 1:
            # Exactly one match: simply add the cnv to it.
            matching[0].add(cnv)
        else:
            # The cnv bridges several components: merge them all into one.
            master = merge_ccs(matching, cnv)
            for cc in matching:
                components.remove(cc)
            components.append(master)
    return components
def merge_ccs(cc_list, cnv):
    """Merges connected components linked together by the given cnv.

    The first component absorbs the vertices of all the others; its span is
    widened to cover every merged component and the bridging cnv is then
    added to it.
    """
    master = cc_list[0]
    for other in cc_list[1:]:
        master.adj_list += other.adj_list
    master.start = min(cc.start for cc in cc_list)
    master.end = max(cc.end for cc in cc_list)
    master.add(cnv)
    return master
class ConnectedComponent(object):
    """A class representing a graph-theoretic connected component.

    :param first_vertex: The first vertex: either the first CNV in the
                         connected component or the first
                         :py:class:`generate_cnv_graph.Vertex` object.
                         Internally, whenever a CNV is added to a connected
                         component, it is wrapped in a Vertex.
    :type first_vertex: Either :py:class:`cnv_struct.cnv` or
                        :py:class:`generate_cnv_graph.Vertex`
    :param overlap_threshold: An overlap threshold that defines if two CNVs
                              will be considered identical.
    :type overlap_threshold: float (between 0 and 1)

    Internally, a ConnectedComponent object knows where the represented locus
    starts, ends, on which chromosome, the overlap threshold used, and stores
    its set of vertices as a list (the ``adj_list`` attribute).
    """
    def __init__(self, first_vertex, overlap_threshold = 0.7):
        if type(first_vertex) is cnv_struct.cnv:
            first_vertex = Vertex(first_vertex)
        elif not type(first_vertex) is Vertex:
            # Bug fix: the message previously lacked the "{}" placeholder, so
            # the offending type was silently dropped from the error text.
            raise TypeError("Invalid type for new ConnectedComponent: {}".format(
                str(type(first_vertex))))
        self.start = first_vertex.cnv.start
        self.end = first_vertex.cnv.end
        self.chr = first_vertex.cnv.chr
        self.adj_list = [first_vertex, ]
        self.overlap_threshold = overlap_threshold

    def cnv_generator(self):
        """Returns a generator (iterable) of cnv objects from the vertices of
        this connected component. """
        return (i.cnv for i in self.adj_list)

    def suitable(self, cnv):
        """Determines if the given cnv should be added to this connected
        component. """
        # NOTE(review): assumes ro_to_best_in_list returns an iterable of
        # reciprocal-overlap ratios -- confirm against cnv_struct.
        ov = cnv_struct.ro_to_best_in_list(cnv, self.cnv_generator())
        return (min(ov) > self.overlap_threshold)

    def remove(self, vertex):
        """Removes a vertex (and every edge touching it) from this connected
        component. """
        # Remove any edge to this vertex.
        for other_vertex in self.adj_list:
            to_remove = set()
            for edge in other_vertex.edges:
                if edge.v1 == vertex:
                    to_remove.add(edge)
                elif edge.v2 == vertex:
                    to_remove.add(edge)
            for edge in to_remove:
                other_vertex.edges.remove(edge)
        self.adj_list.remove(vertex)

    def add(self, cnv):
        """Adds the given cnv (or Vertex) to the adjacency list, creating a
        weighted edge to every overlapping vertex.

        The vertex is only kept when it overlaps at least one existing
        vertex; otherwise a warning is printed and the cnv is dropped.
        """
        add = False
        if type(cnv) is cnv_struct.cnv:
            new_vertex = Vertex(cnv)
        elif type(cnv) is Vertex:
            new_vertex = cnv
        else:
            raise TypeError("Cannot add a '{}' object to the ConnectedComponent "
                            "only Vertex and cnv_struct.cnv classes are "
                            "allowed".format(type(cnv)))
        # For every vertex already in the graph.
        for v in self.adj_list:
            # If it overlaps with the new guy, link them.
            if cnv_struct.overlap(new_vertex.cnv,
                                  v.cnv,
                                  self.overlap_threshold):
                # Build the edge, add it to both vertices; the weight is the
                # global overlap between both CNVs.
                weight = cnv_struct.overlap(
                    new_vertex.cnv,
                    v.cnv,
                    global_ov = True
                )
                e = Edge(new_vertex, v, weight)
                new_vertex.add_edge(e)
                v.add_edge(e)
                add = True
        if add:
            self.adj_list.append(new_vertex)
            self.start = min(self.start, new_vertex.cnv.start)
            self.end = max(self.end, new_vertex.cnv.end)
        if not add:
            print ("Warning: No overlapping vertex in connected component. "
                   "the cnv was not added.")

    def __repr__(self):
        s = "ConnectedComponent:\n"
        s += pprint.pformat(self.adj_list)
        return s
class Vertex(object):
    """A container class for CNVs which allows Edge objects to link
    overlapping CNVs together.
    """
    def __init__(self, cnv_obj):
        """Initializes this vertex around the given CNV object.

        :param cnv_obj: The CNV object for this Vertex.
        :type cnv_obj: :py:class:`cnv_struct.cnv`
        """
        self.cnv = cnv_obj
        # Edges to overlapping vertices; filled through add_edge.
        self.edges = list()

    def add_edge(self, e):
        """Registers an edge on this vertex.

        :param e: The edge object to add to this vertex.
        :type e: :py:class:`Edge`

        Concretely, this is used to link overlapping CNVs together.
        """
        self.edges.append(e)

    def __repr__(self):
        return "<Vertex ({})>".format(self.cnv)
class Edge(object):
    """A simple weighted edge linking two vertices together.

    Edges should be generated with this class and registered on both
    endpoints using :py:func:`Vertex.add_edge`.
    """
    def __init__(self, v1, v2, w):
        # The two Vertex endpoints and the overlap-derived weight.
        self.v1, self.v2, self.weight = v1, v2, w
def test():
cnv1 = cnv_struct.cnv(pos = "chr1:2-5", type="gain", source="father")
cnv2 = cnv_struct.cnv(pos = "chr1:3-10", type="gain", source="twin1")
cnv3 = cnv_struct.cnv(pos = "chr1:20-25", type="gain", source="twin1")
cnv4 = cnv_struct.cnv(pos = "chr1:27-30", type="gain", source="twin2")
cnv5 = cnv_struct.cnv(pos = "chr1:24-28", type="gain", source="mother")
cnv6 = cnv_struct.cnv(pos = "chr2:24-28", type="gain", source="mother")
cnv7 = cnv_struct.cnv(pos = "chr2:19-24", type="gain", source="father")
cnv8 = cnv_struct.cnv(pos = "chr2:25-35", type="gain", source="twin1")
cnv9 = cnv_struct.cnv(pos = "chr2:34-42", type="gain", source="father")
li = [cnv1, cnv2, cnv3, cnv4, cnv5, cnv6, cnv7, cnv8, cnv9]
connected_components = create_family_graph(li, 0)
connected_components = clean_ccs(connected_components)
# Build ccs manually
cc1 = ConnectedComponent(cnv1, 0)
cc1.add(cnv2)
cc2 = ConnectedComponent(cnv3, 0)
cc2.add(cnv4)
cc2.add(cnv5)
cc3 = ConnectedComponent(cnv6, 0)
cc3.add(cnv7)
cc3.add(cnv8)
for cc in connected_components: print cc
print
for cc in [cc1, cc2, cc3]: print cc
if __name__ == "__main__":
    # Command line interface for the CNV graph generator.
    parser = argparse.ArgumentParser(
        description = "Generates the cnv graph used to build the summary matrix."
    )
    parser = validation.add_pickle_args(parser)
    parser = validation.add_dir_args(parser)
    parser.add_argument("--threshold",
                        type = float,
                        default = 0.7,
                        help = "Overlap threshold.")
    parser.add_argument("--save",
                        type = str,
                        default = None,
                        help = "Output pickle if you want to keep the graph.")
    # One optional pileup path per family member.
    for member in ("twin1", "twin2", "mother", "father"):
        parser.add_argument("--{}_pileup".format(member),
                            type = str,
                            default = None,
                            help = "Path to pileup for {}".format(member))
    main(parser.parse_args())