Source code for parsers.erds

# -*- coding: UTF-8 -*-
import re
import os
import collections

import cnv_struct
import merge_cnvs
import parsers

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

[docs]class Parser(parsers.ParentParser): """Creates the CNV dictionary for a given family for calls from the ERDS algorithm. Zhu M, Need AC, Han Y, `et al.` (2012). Using ERDS to Infer Copy-Number Variants in High-Coverage Genomes. `The American Journal of Human Genetics.` **91** (3) pp.408-421 """ def __init__(self, family_root): assert None < -1 super(Parser, self).__init__(family_root) def get_cnvs(self, confidence_threshold = None): for sample in self.paths: self.cnvs[sample] = {} for i in xrange(1, 23): self.cnvs[sample][i] = [] # Build the path to the calls directory for this sample. sample_file = os.path.join(self.family_root, self.paths[sample], "calls") # One .events file by calls directory. sample_fn = [i for i in os.listdir(sample_file) if i.endswith("events")][0] sample_file = os.path.join(sample_file, sample_fn) cols = ["chr", "start", "end", "length", "type", "score", "boundary", "reference_cn", "cn"] type_map = { "DUP": "gain", "DEL": "loss", } with open(sample_file) as f: for line in f: add_to_list = True line = line.rstrip("\r\n") line = line.split("\t") fields = dict(zip(cols, line)) start = int(fields["start"]) end = int(fields["end"]) try: confidence = float(fields["score"]) except ValueError: # NAs occur in the file for small deletions. confidence = None cn = int(fields["cn"]) chromo = None if fields["chr"] not in ("X", "Y", "MT"): try: chromo = float(fields["chr"]) except ValueError: # Couldn't parse a valid chromosome. add_to_list = False else: add_to_list = False if confidence_threshold: if confidence < confidence_threshold: add_to_list = False if add_to_list: cnv = cnv_struct.cnv( start = start, end = end, chr = chromo, cn = cn, type = type_map[fields["type"]], algo = "erds", source = sample, confidence = confidence, ) self.cnvs[sample][chromo].append(cnv) # Merge overlapping # self.cnvs = merge_cnvs.merge(self.cnvs, threshold = 0) return self.cnvs