Source code for parsers.erds
# -*- coding: UTF-8 -*-
import re
import os
import collections
import cnv_struct
import merge_cnvs
import parsers
# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.
__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"
[docs]class Parser(parsers.ParentParser):
    """Creates the CNV dictionary for a given family for calls from the ERDS algorithm.
Zhu M, Need AC, Han Y, `et al.` (2012). Using ERDS to Infer Copy-Number Variants in High-Coverage Genomes. `The American Journal of Human Genetics.` **91** (3) pp.408-421
    """
    def __init__(self, family_root):
        assert None < -1
        super(Parser, self).__init__(family_root)
    def get_cnvs(self, confidence_threshold = None):
        for sample in self.paths:
            self.cnvs[sample] = {}
            for i in xrange(1, 23):
                self.cnvs[sample][i] = []
            # Build the path to the calls directory for this sample.
            sample_file = os.path.join(self.family_root, 
                                       self.paths[sample],
                                       "calls")
            # One .events file by calls directory.
            sample_fn = [i for i in os.listdir(sample_file) if
                         i.endswith("events")][0]
            sample_file = os.path.join(sample_file, sample_fn) 
            cols = ["chr", "start", "end", "length", "type", "score", 
                    "boundary", "reference_cn", "cn"]
            type_map = {
                            "DUP": "gain",
                            "DEL": "loss",
                       }
            with open(sample_file) as f:
                for line in f:
                    add_to_list = True
                    line = line.rstrip("\r\n")
                    line = line.split("\t")
                    fields = dict(zip(cols, line))
                    
                    start = int(fields["start"])
                    end = int(fields["end"])
                    try:
                        confidence = float(fields["score"])
                    except ValueError:
                        # NAs occur in the file for small deletions.
                        confidence = None
                    cn = int(fields["cn"])
                    chromo = None
                    if fields["chr"] not in ("X", "Y", "MT"):
                        try:
                            chromo = float(fields["chr"])
                        except ValueError:
                            # Couldn't parse a valid chromosome.
                            add_to_list = False
                    else:
                        add_to_list = False
                    if confidence_threshold:
                        if confidence < confidence_threshold:
                            add_to_list = False
                    
                    if add_to_list:
                        cnv = cnv_struct.cnv(
                                                start = start,
                                                end = end,
                                                chr = chromo,
                                                cn = cn,
                                                type = type_map[fields["type"]],
                                                algo = "erds",
                                                source = sample,
                                                confidence = confidence,
                                            )
                        self.cnvs[sample][chromo].append(cnv)
        # Merge overlapping
        # self.cnvs = merge_cnvs.merge(self.cnvs, threshold = 0)
        return self.cnvs