# Source code for parsers

# -*- coding: UTF-8 -*-

# Public names exported by a star-import of this module.
# NOTE(review): presumably each entry is a parser submodule of the ``parsers``
# package (get_parser_for_algorithm imports ``parsers.<name>``) -- confirm.
__all__ = ["cnver", "omni25", "pickle", "breakdancer", "erds", "cnvnator"]

import re
import os

# This file is part of CNVAnalysisToolkit.
# 
# CNVAnalysisToolkit is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# CNVAnalysisToolkit is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with CNVAnalysisToolkit.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Marc-Andre Legault (StatGen)"
__copyright__ = "Copyright (C) 2013 StatGen"
__license__ = "GNU General Public License v3 (GPL-3)"

__doc__ = """

This package contains the family-based parsers for every algorithm. Given a base
path, the parsers will walk through every sample and return a dict representing
the CNV calls for the family.
The dict will be of the form ``sample -> chromosome -> cnv list`` which means
that accessing all the CNVs from chromosome 3 on the father can be done using
``cnvs['father'][3]``.

This structure is inconvenient for analyses based on individual samples (no
family context). Support for such analyses is not yet implemented in this set
of tools.

The base directory structure required for the parsers is of the form: ::

    .
    ├── father
    │   └── calls
    │       └── <Method specific files>
    ├── mother
    │   └── calls
    │       └── <Method specific files>
    ├── twin1
    │   └── calls
    │       └── <Method specific files>
    └── twin2
        └── calls
            └── <Method specific files>

Therefore, a bash script can easily be written to convert the raw output from the
different algorithms into such a structure. Alternatively, this could be done
manually if the number of samples is low.

.. note::
    Most scripts from this set of tools can take pickle files as input. When 
    this is the case, the scripts expect either a simple list of CNVs, or a 
    dictionary structure like the one presented here. In other words, the
    hassle from conforming to this directory structure could be avoided by using
    the pickle interface implemented in most scripts from this toolkit.

"""

def get_parser_for_algorithm(s):
    """Gets the appropriate Parser class for a given algorithm.

    This is used internally to fetch the correct class when using the
    command-line tools. The ``--format`` argument is often used to identify
    the correct parser.

    :param s: The name of the parser to fetch (`e.g.` erds, cnvnator).
    :type s: str

    :returns: The ``Parser`` attribute of the ``parsers.<s>`` submodule.
    :rtype: type

    :raises ImportError: If the ``parsers.<s>`` submodule can't be imported.
    :raises AttributeError: If the submodule does not define ``Parser``.

    """
    # Name of the submodule to get the appropriate Parser class from.
    submod = "parsers.{}".format(s)

    # Dynamically import the module containing the Parser class. A non-empty
    # ``fromlist`` makes ``__import__`` return the submodule itself instead of
    # the top-level ``parsers`` package.
    mod = __import__(submod, fromlist=["Parser"])

    return getattr(mod, "Parser")
def family_to_list(li):
    """Converts a ``sample -> chromosome -> cnv list`` dictionary structure to
    a simple list of CNVs for the whole family.

    :param li: A dict of the samples to chromosomes to CNV lists.
    :type li: dict

    :returns: A one dimensional list of CNVs, in the iteration order of the
              samples, then of the chromosomes, then of each CNV list.
    :rtype: list

    This is useful when the analysis does not require familial information or
    when traversal is simpler in list form (rather than a double iteration
    over samples and chromosomes).

    """
    return [
        cnv
        for sample in li
        for chromo in li[sample]
        for cnv in li[sample][chromo]
    ]
class ParentParser(object):
    """Interface class for the parsers.

    The constructor locates the four sample directories of a family;
    subclasses are expected to override :py:meth:`get_cnvs`.

    """

    def __init__(self, family_root):
        """Initializes a parser from the family root as described in the
        package's documentation.

        If the root path has a name matching ``family_([0-9]+)``, the digits
        will be used for the ``family_id`` attribute. This means that naming
        your family directories with this pattern will allow the automatic
        detection of the numerical id. Alternatively, if another naming
        convention is used, this method can be overridden.

        :param family_root: Path to the family directory. It must contain
                            entries whose names start with ``twin1``,
                            ``twin2``, ``father`` and ``mother``.
        :type family_root: str

        :raises Exception: If one of the four sample entries is missing.

        After initialization, the following attributes are set:
        ``family_root`` (absolute path), ``family_id`` (digits as a str, or
        None), ``twin1_dir``, ``twin2_dir``, ``father_dir``, ``mother_dir``
        (entry names), ``cnvs`` (empty dict) and ``paths`` (sample name to
        absolute path).

        """
        self.family_root = os.path.abspath(family_root)

        # Try to parse a family id from the family_root path (as given by the
        # caller, not the absolute path).
        match = re.search(r"family_([0-9]+)", family_root)
        self.family_id = match.group(1) if match else None

        ls = os.listdir(family_root)

        def first_with_prefix(prefix):
            # Explicit scan instead of ``filter(...)[0]``: on Python 3,
            # ``filter`` returns an iterator which can't be indexed, so the
            # original lookup raised TypeError for every family directory.
            for entry in ls:
                if entry.startswith(prefix):
                    return entry
            raise Exception("Invalid family directory.")

        # Get sample specific directories.
        self.twin1_dir = first_with_prefix("twin1")
        self.twin2_dir = first_with_prefix("twin2")
        self.father_dir = first_with_prefix("father")
        self.mother_dir = first_with_prefix("mother")

        self.cnvs = {}

        sample_dirs = {
            "twin1": self.twin1_dir,
            "twin2": self.twin2_dir,
            "mother": self.mother_dir,
            "father": self.father_dir,
        }

        # Normalize paths: anchor each entry at the family root and make it
        # absolute.
        self.paths = {}
        for sample, dirname in sample_dirs.items():
            self.paths[sample] = os.path.abspath(
                os.path.join(family_root, dirname)
            )

    def get_cnvs(self):
        """Interface method which should return a dictionary of CNVs
        conforming with the previously established convention
        (``sample -> chromosome -> cnv list``).

        :raises NotImplementedError: Always; subclasses must override it.

        """
        raise NotImplementedError()